From 9312111295ef41c6739c201acd33b03b59e3b8e7 Mon Sep 17 00:00:00 2001
From: mastercoms
Date: Fri, 3 Mar 2023 06:39:38 -0500
Subject: [PATCH 01/42] perf: fix duplicate trace logic for COLLISION_MODE_PER_FRAME_PLANESET collision mode

COLLISION_MODE_PER_FRAME_PLANESET was running traces for the next mode due to a
missing break, which decreased performance. These traces are expensive, as seen
in profiling.
---
 src/particles/builtin_constraints.cpp | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/particles/builtin_constraints.cpp b/src/particles/builtin_constraints.cpp
index 5658972e1..d84afa3ba 100644
--- a/src/particles/builtin_constraints.cpp
+++ b/src/particles/builtin_constraints.cpp
@@ -503,17 +503,16 @@ void CWorldCollideContextData::CalculatePlanes( CParticleCollection *pParticles,
 			}
 			m_nNumFixedPlanes = nIndexOut;
 			m_nActivePlanes = nIndexOut;
+			// UNDONE: We're now introducing this fix.
+			// Long missing break. Added to Source2 in change 700053.
+			// It's a bug, but changing it now could cause regressions, so
+			// leaving it for now until someone decides it's worth fixing.
+			// This break is necessary when exceptions are enabled because otherwise
+			// m_bPlaneActive[21] is set even though that plane is filled with
+			// NaNs. We should perhaps put this break in, but we need to do
+			// careful particle testing.
+			break;
 		}
-		// Long missing break. Added to Source2 in change 700053.
-		// It's a bug, but changing it now could cause regressions, so
-		// leaving it for now until someone decides it's worth fixing.
-#ifdef FP_EXCEPTIONS_ENABLED
-		// This break is necessary when exceptions are enabled because otherwise
-		// m_bPlaneActive[21] is set even though that plane is filled with
-		// NaNs. We should perhaps put this break in, but we need to do
-		// careful particle testing.
-		break;
-#endif
 
 		case COLLISION_MODE_USE_NEAREST_TRACE:
 		{
@@ -529,6 +528,7 @@ void CWorldCollideContextData::CalculatePlanes( CParticleCollection *pParticles,
 			}
 			m_nNumFixedPlanes = nIndexOut;
 			m_nActivePlanes = nIndexOut;
+			break;
 		}
 	}
 }

From ad892e06185a0084c63d283e93a9ba709b9d6540 Mon Sep 17 00:00:00 2001
From: mastercoms
Date: Sat, 4 Mar 2023 17:16:07 -0500
Subject: [PATCH 02/42] perf: enable SSE2 in build

Allows for much more efficient math optimizations.
---
 src/vpc_scripts/source_dll_win32_base.vpc     | 2 +-
 src/vpc_scripts/source_exe_win_win32_base.vpc | 2 +-
 src/vpc_scripts/source_lib_win32_base.vpc     | 2 +-
 src/vphysics/vphysics.vpc                     | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/vpc_scripts/source_dll_win32_base.vpc b/src/vpc_scripts/source_dll_win32_base.vpc
index a61409584..40729b6c7 100644
--- a/src/vpc_scripts/source_dll_win32_base.vpc
+++ b/src/vpc_scripts/source_dll_win32_base.vpc
@@ -41,7 +41,7 @@ $Configuration
 
 	$Compiler [$WIN32]
 	{
-		$EnableEnhancedInstructionSet	"Streaming SIMD Extensions (/arch:SSE)"
+		$EnableEnhancedInstructionSet	"Streaming SIMD Extensions 2 (/arch:SSE2)"
 	}
 
 	$Linker
diff --git a/src/vpc_scripts/source_exe_win_win32_base.vpc b/src/vpc_scripts/source_exe_win_win32_base.vpc
index 16350b03b..f71cc479b 100644
--- a/src/vpc_scripts/source_exe_win_win32_base.vpc
+++ b/src/vpc_scripts/source_exe_win_win32_base.vpc
@@ -41,7 +41,7 @@ $Configuration
 
 	$Compiler [$WIN32]
 	{
-		$EnableEnhancedInstructionSet	"Streaming SIMD Extensions (/arch:SSE)"
+		$EnableEnhancedInstructionSet	"Streaming SIMD Extensions 2 (/arch:SSE2)"
 	}
 
 	$Linker
diff --git a/src/vpc_scripts/source_lib_win32_base.vpc b/src/vpc_scripts/source_lib_win32_base.vpc
index 1bc4f7fe1..94b9dd7d6 100644
--- a/src/vpc_scripts/source_lib_win32_base.vpc
+++ b/src/vpc_scripts/source_lib_win32_base.vpc
@@ -40,7 +40,7 @@ $Configuration
 
 	$Compiler [$WIN32]
 	{
-		$EnableEnhancedInstructionSet	"Streaming SIMD Extensions (/arch:SSE)"
+		$EnableEnhancedInstructionSet	"Streaming SIMD Extensions 2 (/arch:SSE2)"
 	}
 
 	$PreBuildEvent
diff --git a/src/vphysics/vphysics.vpc b/src/vphysics/vphysics.vpc
index 9b4ee3f1d..5d8f43213 100644
--- a/src/vphysics/vphysics.vpc
+++ b/src/vphysics/vphysics.vpc
@@ -23,7 +23,7 @@ $Configuration
 	}
 	$Compiler [$WIN32]
 	{
-		$EnableEnhancedInstructionSet	"Streaming SIMD Extensions (/arch:SSE)"
+		$EnableEnhancedInstructionSet	"Streaming SIMD Extensions 2 (/arch:SSE2)"
 	}
 
 	$Linker

From f8dc5b84898b664162c25004690a1c1e2bba872d Mon Sep 17 00:00:00 2001
From: mastercoms
Date: Sat, 4 Mar 2023 17:17:02 -0500
Subject: [PATCH 03/42] perf: add DirectXMath third party dep

Will be used for optimized SSE routines. I chose this library because it has a
lineage from what Source uses for Xbox 360 math (xboxmath -> xnamath ->
DirectXMath), so porting SIMD operations to PC is simplified and there are
similar guarantees.

It can be retrieved from https://github.com/microsoft/DirectXMath

A blank sal.h also needs to be retrieved from
https://github.com/microsoft/omi/blob/master/Unix/common/linux/sal.h
for POSIX (https://github.com/microsoft/DirectXMath/issues/89#issuecomment-530519242),
and Inc/DirectXMath.h needs to be edited to include the system sal.h on Windows
and the above blank sal.h on POSIX.
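For illustration, one possible shape of that edit (a sketch only, not the exact
change in this tree; the guard macro and the include path for the blank sal.h
are assumptions and depend on how the stub, added here as
src/thirdparty/dotnetrt/sal.h, is wired onto the include path):

    // Hypothetical edit near the top of Inc/DirectXMath.h (sketch only).
    #ifdef _WIN32
    #include <sal.h>    // SAL annotations ship with the Windows SDK / MSVC
    #else
    #include "sal.h"    // blank sal.h stub from microsoft/omi, placed on the include path
    #endif

With the stub in place, the SAL annotations used throughout the DirectXMath
headers expand to nothing on POSIX, so the headers compile unchanged.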
--- .../DirectXMath-dec2022/.gitattributes | 8 + src/thirdparty/DirectXMath-dec2022/.gitignore | 24 + .../.nuget/directxmath.nuspec | 33 + .../.nuget/directxmath.targets | 11 + .../DirectXMath-dec2022/.nuget/icon.jpg | Bin 0 -> 3479 bytes .../DirectXMath-dec2022/.nuget/signconfig.xml | 6 + .../DirectXMath-dec2022/CMakeLists.txt | 91 + .../DirectXMath-dec2022/CMakePresets.json | 175 + .../Extensions/DirectXMathAVX.h | 275 + .../Extensions/DirectXMathAVX2.h | 1037 ++ .../Extensions/DirectXMathBE.h | 95 + .../Extensions/DirectXMathF16C.h | 471 + .../Extensions/DirectXMathFMA3.h | 391 + .../Extensions/DirectXMathFMA4.h | 415 + .../Extensions/DirectXMathSSE3.h | 111 + .../Extensions/DirectXMathSSE4.h | 417 + src/thirdparty/DirectXMath-dec2022/HISTORY.md | 198 + .../Inc/DirectXCollision.h | 359 + .../Inc/DirectXCollision.inl | 4816 +++++ .../DirectXMath-dec2022/Inc/DirectXColors.h | 312 + .../DirectXMath-dec2022/Inc/DirectXMath.h | 2280 +++ .../Inc/DirectXMathConvert.inl | 2191 +++ .../Inc/DirectXMathMatrix.inl | 3550 ++++ .../Inc/DirectXMathMisc.inl | 2493 +++ .../Inc/DirectXMathVector.inl | 14819 ++++++++++++++++ .../Inc/DirectXPackedVector.h | 1224 ++ .../Inc/DirectXPackedVector.inl | 4459 +++++ src/thirdparty/DirectXMath-dec2022/LICENSE | 21 + .../MatrixStack/DirectXMatrixStack.h | 241 + src/thirdparty/DirectXMath-dec2022/README.md | 115 + .../DirectXMath-dec2022/SECURITY.md | 41 + .../DirectXMath-dec2022/SHMath/DirectXSH.cpp | 4908 +++++ .../DirectXMath-dec2022/SHMath/DirectXSH.h | 72 + .../SHMath/DirectXSHD3D11.cpp | 383 + .../SHMath/DirectXSHD3D12.cpp | 339 + .../Stereo3D/Stereo3DMatrixHelper.cpp | 257 + .../Stereo3D/Stereo3DMatrixHelper.h | 64 + .../DirectXMath-dec2022/XDSP/XDSP.h | 871 + .../build/DirectXMath-GitHub-CMake-Dev17.yml | 119 + .../build/DirectXMath-GitHub-CMake.yml | 103 + .../build/DirectXMath-GitHub-Dev17.yml | 296 + .../build/DirectXMath-GitHub-MinGW.yml | 170 + .../build/DirectXMath-GitHub-WSL-11.yml | 64 + .../build/DirectXMath-GitHub-WSL.yml | 64 + .../build/DirectXMath-GitHub.yml | 543 + .../build/DirectXMath-config.cmake.in | 5 + src/thirdparty/dotnetrt/sal.h | 2953 +++ 47 files changed, 51890 insertions(+) create mode 100644 src/thirdparty/DirectXMath-dec2022/.gitattributes create mode 100644 src/thirdparty/DirectXMath-dec2022/.gitignore create mode 100644 src/thirdparty/DirectXMath-dec2022/.nuget/directxmath.nuspec create mode 100644 src/thirdparty/DirectXMath-dec2022/.nuget/directxmath.targets create mode 100644 src/thirdparty/DirectXMath-dec2022/.nuget/icon.jpg create mode 100644 src/thirdparty/DirectXMath-dec2022/.nuget/signconfig.xml create mode 100644 src/thirdparty/DirectXMath-dec2022/CMakeLists.txt create mode 100644 src/thirdparty/DirectXMath-dec2022/CMakePresets.json create mode 100644 src/thirdparty/DirectXMath-dec2022/Extensions/DirectXMathAVX.h create mode 100644 src/thirdparty/DirectXMath-dec2022/Extensions/DirectXMathAVX2.h create mode 100644 src/thirdparty/DirectXMath-dec2022/Extensions/DirectXMathBE.h create mode 100644 src/thirdparty/DirectXMath-dec2022/Extensions/DirectXMathF16C.h create mode 100644 src/thirdparty/DirectXMath-dec2022/Extensions/DirectXMathFMA3.h create mode 100644 src/thirdparty/DirectXMath-dec2022/Extensions/DirectXMathFMA4.h create mode 100644 src/thirdparty/DirectXMath-dec2022/Extensions/DirectXMathSSE3.h create mode 100644 src/thirdparty/DirectXMath-dec2022/Extensions/DirectXMathSSE4.h create mode 100644 src/thirdparty/DirectXMath-dec2022/HISTORY.md create mode 100644 src/thirdparty/DirectXMath-dec2022/Inc/DirectXCollision.h 
create mode 100644 src/thirdparty/DirectXMath-dec2022/Inc/DirectXCollision.inl create mode 100644 src/thirdparty/DirectXMath-dec2022/Inc/DirectXColors.h create mode 100644 src/thirdparty/DirectXMath-dec2022/Inc/DirectXMath.h create mode 100644 src/thirdparty/DirectXMath-dec2022/Inc/DirectXMathConvert.inl create mode 100644 src/thirdparty/DirectXMath-dec2022/Inc/DirectXMathMatrix.inl create mode 100644 src/thirdparty/DirectXMath-dec2022/Inc/DirectXMathMisc.inl create mode 100644 src/thirdparty/DirectXMath-dec2022/Inc/DirectXMathVector.inl create mode 100644 src/thirdparty/DirectXMath-dec2022/Inc/DirectXPackedVector.h create mode 100644 src/thirdparty/DirectXMath-dec2022/Inc/DirectXPackedVector.inl create mode 100644 src/thirdparty/DirectXMath-dec2022/LICENSE create mode 100644 src/thirdparty/DirectXMath-dec2022/MatrixStack/DirectXMatrixStack.h create mode 100644 src/thirdparty/DirectXMath-dec2022/README.md create mode 100644 src/thirdparty/DirectXMath-dec2022/SECURITY.md create mode 100644 src/thirdparty/DirectXMath-dec2022/SHMath/DirectXSH.cpp create mode 100644 src/thirdparty/DirectXMath-dec2022/SHMath/DirectXSH.h create mode 100644 src/thirdparty/DirectXMath-dec2022/SHMath/DirectXSHD3D11.cpp create mode 100644 src/thirdparty/DirectXMath-dec2022/SHMath/DirectXSHD3D12.cpp create mode 100644 src/thirdparty/DirectXMath-dec2022/Stereo3D/Stereo3DMatrixHelper.cpp create mode 100644 src/thirdparty/DirectXMath-dec2022/Stereo3D/Stereo3DMatrixHelper.h create mode 100644 src/thirdparty/DirectXMath-dec2022/XDSP/XDSP.h create mode 100644 src/thirdparty/DirectXMath-dec2022/build/DirectXMath-GitHub-CMake-Dev17.yml create mode 100644 src/thirdparty/DirectXMath-dec2022/build/DirectXMath-GitHub-CMake.yml create mode 100644 src/thirdparty/DirectXMath-dec2022/build/DirectXMath-GitHub-Dev17.yml create mode 100644 src/thirdparty/DirectXMath-dec2022/build/DirectXMath-GitHub-MinGW.yml create mode 100644 src/thirdparty/DirectXMath-dec2022/build/DirectXMath-GitHub-WSL-11.yml create mode 100644 src/thirdparty/DirectXMath-dec2022/build/DirectXMath-GitHub-WSL.yml create mode 100644 src/thirdparty/DirectXMath-dec2022/build/DirectXMath-GitHub.yml create mode 100644 src/thirdparty/DirectXMath-dec2022/build/DirectXMath-config.cmake.in create mode 100644 src/thirdparty/dotnetrt/sal.h diff --git a/src/thirdparty/DirectXMath-dec2022/.gitattributes b/src/thirdparty/DirectXMath-dec2022/.gitattributes new file mode 100644 index 000000000..f416ccf92 --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/.gitattributes @@ -0,0 +1,8 @@ +# Auto detect text files and perform LF normalization +* text=auto + +# Explicitly declare code/VS files as CRLF +*.cpp eol=crlf +*.cmd eol=crlf +*.h eol=crlf +*.inl eol=crlf diff --git a/src/thirdparty/DirectXMath-dec2022/.gitignore b/src/thirdparty/DirectXMath-dec2022/.gitignore new file mode 100644 index 000000000..33834c644 --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/.gitignore @@ -0,0 +1,24 @@ +*.psess +*.vsp +*.log +*.err +*.wrn +*.suo +*.sdf +*.user +*.i +*.vspscc +*.opensdf +*.opendb +*.ipch +*.cache +*.tlog +*.lastbuildstate +*.ilk +*.VC.db +*.nupkg +.vs +/Tests +/wiki +/out +/CMakeUserPresets.json diff --git a/src/thirdparty/DirectXMath-dec2022/.nuget/directxmath.nuspec b/src/thirdparty/DirectXMath-dec2022/.nuget/directxmath.nuspec new file mode 100644 index 000000000..218c3bfe2 --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/.nuget/directxmath.nuspec @@ -0,0 +1,33 @@ + + + + directxmath + 0.0.0-SpecifyVersionOnCommandline + DirectXMath + Microsoft + microsoft,directxtk 
+ DirectXMath is an all inline SIMD C++ linear algebra library for use in games and graphics apps. + The DirectXMath API provides SIMD-friendly C++ types and functions for common linear algebra and graphics math operations common to DirectX applications. The library provides optimized versions for Windows 32-bit (x86), Windows 64-bit (x64), and Windows on ARM through SSE2 and ARM-NEON intrinsics support in the Visual Studio compiler. + Matches the December 2022 release. + http://go.microsoft.com/fwlink/?LinkID=615560 + + images\icon.jpg + docs\README.md + MIT + false + © Microsoft Corporation. All rights reserved. + C++ native DirectX math nativepackage + + + + + + + + + + + + + + \ No newline at end of file diff --git a/src/thirdparty/DirectXMath-dec2022/.nuget/directxmath.targets b/src/thirdparty/DirectXMath-dec2022/.nuget/directxmath.targets new file mode 100644 index 000000000..0a31f579b --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/.nuget/directxmath.targets @@ -0,0 +1,11 @@ + + + + + + HAS_DIRECTXMATH;%(PreprocessorDefinitions) + $(MSBuildThisFileDirectory)..\..\include;%(AdditionalIncludeDirectories) + + + + diff --git a/src/thirdparty/DirectXMath-dec2022/.nuget/icon.jpg b/src/thirdparty/DirectXMath-dec2022/.nuget/icon.jpg new file mode 100644 index 0000000000000000000000000000000000000000..08fe1faeb7f6e45d796cf1e67bf1cbb1347c514a GIT binary patch literal 3479 zcmbW$XHe5?mjLj82q7Rr2u1}(Y6PT8@1RJ((ximmf=CfjB1P&|i4>8JSGXbwVrWvN zBb`vCw;<9%dPy)y2mxMvcjxZR&VJb4^UV45oO$N_<~-+Q;$#kBGtkl30YD%S;BN{b_aR={-OXhp_WW z-DEgt>I~(7gphuk_?}Tfv#jO(ogpGp#>Mv;)0qnwIXEv}6%-P_CL${*ub`-;d`nAP zM^{hZz|7pj5@ls=gLb{^cF*0z6Z6>5KOpc)(DMlFi^!{3zLA% z9pZTx-=Q-Xkh1evNxy0Tmi_Nw&;DPse_;RRngn2A(CP5Na6kjtrwV8Pc}BQcs3RN^ zqB7U+Ui?6PBs5OkNy(>k_)is2(G@0jHLtCOS0Y}8aNO!gr(rF$N^Zmt{cf&BrNx?( z9Mf?vlnyKwWC`Vsw*mYRsHZdoN?XxtnDTKSk(O9tSs;+c364gpHQ^ATofK3Y<&VYf zDPvP_;AF|$jI{K^lUTY(K zn7ol&+vJ?@x!p3k5Qg%{L_nLq_y)VA%^lJCoSmpImpHmZ3b6aSpC&W#*ReWo%~tZ9 zYy>Um(b`C+!7j-vP3_udwd#}+zpdFfuGw~R-gRZvcQgh$?x=1|-w{OGYN4?N`84mG zKLJ{`=M;-@$$r}@+T}~T=+5o$y9eQOa=rF>b=A|#uf^ZoWue)HNAh4FHl3C8Q0Jyu zT?!pZDXVw(;zM^3T3($1;OckHl@4^((ho!BM8T+ZVi<0iTbcxOxIoN3{v--G#~z>$ z`-52f8_~oEZ|nLtW<0}XJY_g@eWhn^-{-)RC3!8NrDpa`o!()FUieD^&x>3+EZOb2 zh#cgBQcl6bz_1qY&8^}bY=pEyKDUk(8=%v4M?q;HWABISiK{FBpC* z@~Fi%UCbw5^|Y~>#cGG3m=#sN#n9k+^{P6fQe@p=?)^=u3P>YN^45C8QM??%&UL!y zh}LoJ$55F4o$^wvrs_*H$iMf_B7@)+EB3JGB)Junj!IFrc;z^$eCy5NPuQd4d}qjB z?3!B9Dl*1g35IL1D#-|Aq%^LyH6k;csr8vP={kqkg!q-}QlW>%O5Bv=z0mUxbpd1C zp997qA+Hu^Ig86l7L{zI%G}si6^!%+?a45bY4^(8fc8*~9WJJki-PVD?kn)l z@_gSnp2D_4RzjrLmC9i{@7L>lV4Qy1@jI0Y?oP)hoB+*DF*3)*&Ib36+lkS9Z#cT> zrR6Omeb818{1ncGcv$~rB(}uX1gpf%xx0-BLM!?-|6H@P$c&Mw_|h%DC+e{&ky<9& z!rhih^;JHfFY}f@mG~?Oc-gj$}>%ZBKOsxb_9h5?*OCWMaL?K1B0w6=h>g$sw?hFN*f~$kIm~x z{Db};GeV8{;yuo&-}O=T!;8N?)*z7cD(&xx{((`$4_Y_Z*lF|tn+#8UTI-ijfCZN2 zkI#*UWSeGt+V0Od9Q4byVmXdjd;$aJd|s*=O3zSYmexuYga*`Imx2W!HbwIo^mCl0 z3JA{yuf~bC2OVYXM0v|y;mmj-UX`h4=C)#%>ZH6Bjt#}m9~o9=o?V+vZY4I1)ftfL zQibt!I;`*eFf#~>?YuD~ebQHB+n4Ppz+YdA^aVq8@6v$c{su;EO%dUp2y>V0XECDf zA=adM?~>f%5SfpL0U^a`Nq#N{E;fX(p{Z`!^o*^QZDsp3{-I{h)g`Aa6gI?!k~nAR zHch5fMD2Z;#I9H0x8a51=xJ(c$|0)yu}Y731@B({_V zIS*1Qa*aQrGY9AzQwBAyqS)Qsm-L$972((IoF0(-l~HiWRMJ)zHYDngP&_7$bw8u7 z>CfPwPcy~44!b56RiBCKqW0rd z^lB|CwnlfuK{TyeikDK^XuT3xl&gM5FK+I1n|yerss8&VSu{x!PaC(mw!MalR;`*h zP6atox*856#LrOt$B)HJMYJztmfqBV4GX>a3OCJI%wtJ3Ekmo-m$2n!NqM zI48cN_?-y5e~{D&zSTx;piHEohpWt05jAUO4hc`0^ZuV5XU>p#c8 zh-3}%lUoYA`g+G^{L|xLbGN$s3|IFr2T4v^kw>GVWU;Z;8L82nsyIC)J|Tk;06iSc zzl=I&_<>rhD?g^A_&c%%_V2f&HX6jDh4BwZy5EIEx_|W$j2L1J2dKwQb5|5aEbIF* 
zrbEjVGm@0p$CR(Gt2zC5o0EjN-iCi}J^^Os*2H;ZOpI<^<&uswf1aRQ5}qfQPdaSP z{LT%Jx2=v>u(7>MRJvZxa=^v&(JPW@S>YECYjyi->=A`@9(4*SG9I}Sl;E+L-yov< z-fGgvAaT{lu6calw{^XMb%(BG8b}_Qd>2(Nq1yZzO|PKCy+ZTcTJEea{T}It{R}M0 zW8zH{yQqXT%1HZ5cKO6ym4@CXWN(8xw>->Gu=ZX=%6g-H+E+0#9BO)^*YIk{BNG4j zTf^_;RLm4!55Syn+`u3s!Gm03+?Br|mGzdd%WQ8v<-^juKhMtN0PmK_P-KYn^;dG# zaryX*){UmM#U1xYqp#6wx)RK4+CP3FRqt)9P^eYdFc&o2S^F)1a%5&s>K`MbhNdwzh(4!tY~Jy zq~gW;q;UhH>x%K5rx0VGjlohsJiQ{3go_TboLscKc|Q(alg7bb;~mo^7RVjjAGoep zQqag1Xcef@ObvU}1sQTwUZwU-RsU!`LRiu}f@uO?3tr-$Jt}{Pm){mV6iRCpbnZI* zCIE#NE0!4G7==Z7BM|2kS*D8=#xXW_yh%)p?n}3ZQesaRzN=@KCQaJ@^kD`XY+5JN zIqTSPd2v;D*=vJ3U++jPY&(`CeDbq5Ew-3{+2cOv^vvX}&SESnc?4k~V$kawlT@xA zSW9})w9rk^BJM?eOZ{BPC@up-BToRvPBSW@j3z;mf?_|sv?kCBKRJaAX{oz!#g`l! z?+p6ymFN!E%VbmBMmFo@|AxH1xAVm7r^aFL6CeC|CHz@jYpl8d8R6`_tn3(J;D1Q) H$+v$2rQD|6 literal 0 HcmV?d00001 diff --git a/src/thirdparty/DirectXMath-dec2022/.nuget/signconfig.xml b/src/thirdparty/DirectXMath-dec2022/.nuget/signconfig.xml new file mode 100644 index 000000000..f32a6a464 --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/.nuget/signconfig.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/src/thirdparty/DirectXMath-dec2022/CMakeLists.txt b/src/thirdparty/DirectXMath-dec2022/CMakeLists.txt new file mode 100644 index 000000000..90ca3e2d3 --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/CMakeLists.txt @@ -0,0 +1,91 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +cmake_minimum_required (VERSION 3.20) + +set(DIRECTXMATH_VERSION 3.1.8) + +project(DirectXMath + VERSION ${DIRECTXMATH_VERSION} + DESCRIPTION "DirectXMath SIMD C++ math library" + HOMEPAGE_URL "https://go.microsoft.com/fwlink/?LinkID=615560" + LANGUAGES CXX) + +include(GNUInstallDirs) + +#--- Library +set(LIBRARY_HEADERS + Inc/DirectXCollision.h + Inc/DirectXCollision.inl + Inc/DirectXColors.h + Inc/DirectXMath.h + Inc/DirectXMathConvert.inl + Inc/DirectXMathMatrix.inl + Inc/DirectXMathMisc.inl + Inc/DirectXMathVector.inl + Inc/DirectXPackedVector.h + Inc/DirectXPackedVector.inl) + +add_library(${PROJECT_NAME} INTERFACE) + +target_include_directories(${PROJECT_NAME} INTERFACE + $ + $) + +target_compile_features(${PROJECT_NAME} INTERFACE cxx_std_11) + +#--- Package +include(CMakePackageConfigHelpers) + +string(TOLOWER ${PROJECT_NAME} PACKAGE_NAME) + +write_basic_package_version_file( + ${PACKAGE_NAME}-config-version.cmake + VERSION ${DIRECTXMATH_VERSION} + COMPATIBILITY AnyNewerVersion + ARCH_INDEPENDENT) + +install(TARGETS ${PROJECT_NAME} + EXPORT ${PROJECT_NAME}-targets + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) + +configure_package_config_file(${CMAKE_CURRENT_SOURCE_DIR}/build/${PROJECT_NAME}-config.cmake.in + ${CMAKE_CURRENT_BINARY_DIR}/${PACKAGE_NAME}-config.cmake + INSTALL_DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/${PACKAGE_NAME}) + +install(EXPORT ${PROJECT_NAME}-targets + FILE ${PROJECT_NAME}-targets.cmake + NAMESPACE Microsoft:: + DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/${PACKAGE_NAME}) + +install(FILES ${LIBRARY_HEADERS} + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/directxmath) + +install(FILES + ${CMAKE_CURRENT_BINARY_DIR}/${PACKAGE_NAME}-config.cmake + ${CMAKE_CURRENT_BINARY_DIR}/${PACKAGE_NAME}-config-version.cmake + DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/${PACKAGE_NAME}) + +#--- Test suite +if (DEFINED VCPKG_TARGET_ARCHITECTURE) + set(DXMATH_ARCHITECTURE ${VCPKG_TARGET_ARCHITECTURE}) +elseif(CMAKE_GENERATOR_PLATFORM MATCHES 
"^[Ww][Ii][Nn]32$") + set(DXMATH_ARCHITECTURE x86) +elseif(CMAKE_GENERATOR_PLATFORM MATCHES "^[Xx]64$") + set(DXMATH_ARCHITECTURE x64) +elseif(CMAKE_GENERATOR_PLATFORM MATCHES "^[Aa][Rr][Mm]$") + set(DXMATH_ARCHITECTURE arm) +elseif(CMAKE_GENERATOR_PLATFORM MATCHES "^[Aa][Rr][Mm]64$") + set(DXMATH_ARCHITECTURE arm64) +elseif(NOT DXMATH_ARCHITECTURE) + set(DXMATH_ARCHITECTURE "x64") +endif() + +#--- Test suite +include(CTest) +if(BUILD_TESTING AND WIN32 AND (NOT WINDOWS_STORE) AND (EXISTS "${CMAKE_CURRENT_LIST_DIR}/Tests/CMakeLists.txt")) + enable_testing() + add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/Tests) +endif() diff --git a/src/thirdparty/DirectXMath-dec2022/CMakePresets.json b/src/thirdparty/DirectXMath-dec2022/CMakePresets.json new file mode 100644 index 000000000..90b680e7d --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/CMakePresets.json @@ -0,0 +1,175 @@ +{ + "version": 2, + "configurePresets": [ + { + "name": "base", + "displayName": "Basic Config", + "description": "Basic build using Ninja generator", + "generator": "Ninja", + "hidden": true, + "binaryDir": "${sourceDir}/out/build/${presetName}", + "cacheVariables": { "CMAKE_INSTALL_PREFIX": "${sourceDir}/out/install/${presetName}" } + }, + + { + "name": "x64", + "architecture": { + "value": "x64", + "strategy": "external" + }, + "cacheVariables": { "DXMATH_ARCHITECTURE": "x64" }, + "hidden": true + }, + { + "name": "x86", + "architecture": { + "value": "x86", + "strategy": "external" + }, + "cacheVariables": { "DXMATH_ARCHITECTURE": "x86" }, + "hidden": true + }, + { + "name": "ARM", + "architecture": { + "value": "arm", + "strategy": "external" + }, + "cacheVariables": { "DXMATH_ARCHITECTURE": "arm" }, + "hidden": true + }, + { + "name": "ARM64", + "architecture": { + "value": "arm64", + "strategy": "external" + }, + "cacheVariables": { "DXMATH_ARCHITECTURE": "arm64" }, + "hidden": true + }, + + { + "name": "Debug", + "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" }, + "hidden": true + }, + { + "name": "Release", + "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" }, + "hidden": true + }, + + { + "name": "OneCore", + "cacheVariables": { "BUILD_FOR_ONECORE": true }, + "hidden": true + }, + { + "name": "AVX", + "cacheVariables": { "BUILD_AVX_TEST": true }, + "hidden": true + }, + { + "name": "AVX2", + "cacheVariables": { "BUILD_AVX2_TEST": true }, + "hidden": true + }, + { + "name": "F16C", + "cacheVariables": { "BUILD_F16C_TEST": true }, + "hidden": true + }, + { + "name": "NI", + "cacheVariables": { "BUILD_NO_INTRINSICS": true }, + "hidden": true + }, + + { + "name": "MSVC", + "hidden": true, + "cacheVariables": { + "CMAKE_CXX_COMPILER": "cl.exe" + }, + "toolset": { + "value": "host=x64", + "strategy": "external" + } + }, + { + "name": "Clang", + "hidden": true, + "cacheVariables": { + "CMAKE_CXX_COMPILER": "clang-cl.exe" + }, + "toolset": { + "value": "host=x64", + "strategy": "external" + } + }, + { + "name": "GNUC", + "hidden": true, + "cacheVariables": { + "CMAKE_CXX_COMPILER": "g++.exe" + }, + "toolset": { + "value": "host=x64", + "strategy": "external" + } + }, + { + "name": "Intel", + "hidden": true, + "cacheVariables": { + "CMAKE_CXX_COMPILER": "icl.exe" + }, + "toolset": { + "value": "host=x64", + "strategy": "external" + } + }, + { + "name": "IntelLLVM", + "hidden": true, + "cacheVariables": { + "CMAKE_CXX_COMPILER": "icx.exe" + }, + "toolset": { + "value": "host=x64", + "strategy": "external" + } + }, + + { "name": "x64-Debug" , "description": "MSVC for x64 (Debug) - SSE/SSE2", "inherits": [ "base", 
"x64", "Debug", "MSVC" ] }, + { "name": "x64-Release" , "description": "MSVC for x64 (Release) - SSE/SSE2", "inherits": [ "base", "x64", "Release", "MSVC" ] }, + { "name": "x86-Debug" , "description": "MSVC for x86 (Debug) - SSE/SSE2", "inherits": [ "base", "x86", "Debug", "MSVC" ] }, + { "name": "x86-Release" , "description": "MSVC for x86 (Release) - SSE/SSE2", "inherits": [ "base", "x86", "Release", "MSVC" ] }, + { "name": "arm-Debug" , "description": "MSVC for ARM (Debug) - ARM-NEON", "inherits": [ "base", "ARM", "Debug", "MSVC" ] }, + { "name": "arm-Release" , "description": "MSVC for ARM (Release) - ARM-NEON", "inherits": [ "base", "ARM", "Release", "MSVC" ] }, + { "name": "arm64-Debug" , "description": "MSVC for ARM64 (Debug) - ARM-NEON", "inherits": [ "base", "ARM64", "Debug", "MSVC" ] }, + { "name": "arm64-Release", "description": "MSVC for ARM64 (Release) - ARM-NEON", "inherits": [ "base", "ARM64", "Release", "MSVC" ] }, + + { "name": "x64-Debug-Clang" , "description": "Clang/LLVM for x64 (Debug) - SSE/SSE2", "inherits": [ "base", "x64", "Debug", "Clang" ] }, + { "name": "x64-Release-Clang" , "description": "Clang/LLVM for x64 (Release) - SSE/SSE2", "inherits": [ "base", "x64", "Release", "Clang" ] }, + { "name": "x86-Debug-Clang" , "description": "Clang/LLVM for x86 (Debug) - SSE/SSE2", "inherits": [ "base", "x86", "Debug", "Clang" ], "environment": { "CXXFLAGS": "-m32" } }, + { "name": "x86-Release-Clang" , "description": "Clang/LLVM for x86 (Release) - SSE/SSE2", "inherits": [ "base", "x86", "Release", "Clang" ], "environment": { "CXXFLAGS": "-m32" } }, + { "name": "arm64-Debug-Clang" , "description": "Clang/LLVM for AArch64 (Debug) - ARM-NEON", "inherits": [ "base", "ARM64", "Debug", "Clang" ], "environment": { "CXXFLAGS": "--target=arm64-pc-windows-msvc" } }, + { "name": "arm64-Release-Clang", "description": "Clang/LLVM for AArch64 (Release) - ARM-NEON", "inherits": [ "base", "ARM64", "Release", "Clang" ], "environment": { "CXXFLAGS": "--target=arm64-pc-windows-msvc" } } + ], + "testPresets": [ + { "name": "x64-Debug" , "configurePreset": "x64-Debug" }, + { "name": "x64-Release" , "configurePreset": "x64-Release" }, + { "name": "x86-Debug" , "configurePreset": "x86-Debug" }, + { "name": "x86-Release" , "configurePreset": "x86-Release" }, + { "name": "arm64-Debug" , "configurePreset": "arm64-Debug" }, + { "name": "arm64-Release", "configurePreset": "arm64-Release" }, + + { "name": "x64-Debug-Clang" , "configurePreset": "x64-Debug-Clang" }, + { "name": "x64-Release-Clang" , "configurePreset": "x64-Release-Clang" }, + { "name": "x86-Debug-Clang" , "configurePreset": "x86-Debug-Clang" }, + { "name": "x86-Release-Clang" , "configurePreset": "x86-Release-Clang" }, + { "name": "arm64-Debug-Clang" , "configurePreset": "arm64-Debug-Clang" }, + { "name": "arm64-Release-Clang", "configurePreset": "arm64-Release-Clang" } + ] +} \ No newline at end of file diff --git a/src/thirdparty/DirectXMath-dec2022/Extensions/DirectXMathAVX.h b/src/thirdparty/DirectXMath-dec2022/Extensions/DirectXMathAVX.h new file mode 100644 index 000000000..901a1c9b3 --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/Extensions/DirectXMathAVX.h @@ -0,0 +1,275 @@ +//------------------------------------------------------------------------------------- +// DirectXMathAVX.h -- AVX (version 1) extensions for SIMD C++ Math library +// +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
+// +// http://go.microsoft.com/fwlink/?LinkID=615560 +//------------------------------------------------------------------------------------- + +#pragma once + +#if defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __arm__ || __aarch64__ +#error AVX not supported on ARM platform +#endif + +#include + +namespace DirectX +{ + +namespace AVX +{ + +inline bool XMVerifyAVXSupport() +{ + // Should return true for AMD Bulldozer, Intel "Sandy Bridge", and Intel "Ivy Bridge" or later processors + // with OS support for AVX (Windows 7 Service Pack 1, Windows Server 2008 R2 Service Pack 1, Windows 8, Windows Server 2012) + + // See http://msdn.microsoft.com/en-us/library/hskdteyh.aspx + int CPUInfo[4] = {-1}; +#if defined(__clang__) || defined(__GNUC__) + __cpuid(0, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]); +#else + __cpuid( CPUInfo, 0 ); +#endif + + if ( CPUInfo[0] < 1 ) + return false; + +#if defined(__clang__) || defined(__GNUC__) + __cpuid(1, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]); +#else + __cpuid(CPUInfo, 1 ); +#endif + + // We check for AVX, OSXSAVE, SSSE4.1, and SSE3 + return ( (CPUInfo[2] & 0x18080001) == 0x18080001 ); +} + + +//------------------------------------------------------------------------------------- +// Vector +//------------------------------------------------------------------------------------- + +inline XMVECTOR XM_CALLCONV XMVectorReplicatePtr( _In_ const float *pValue ) +{ + return _mm_broadcast_ss( pValue ); +} + +inline XMVECTOR XM_CALLCONV XMVectorSplatX( FXMVECTOR V ) +{ + return _mm_permute_ps( V, _MM_SHUFFLE(0, 0, 0, 0) ); +} + +inline XMVECTOR XM_CALLCONV XMVectorSplatY( FXMVECTOR V ) +{ + return _mm_permute_ps( V, _MM_SHUFFLE(1, 1, 1, 1) ); +} + +inline XMVECTOR XM_CALLCONV XMVectorSplatZ( FXMVECTOR V ) +{ + return _mm_permute_ps( V, _MM_SHUFFLE(2, 2, 2, 2) ); +} + +inline XMVECTOR XM_CALLCONV XMVectorSplatW( FXMVECTOR V ) +{ + return _mm_permute_ps( V, _MM_SHUFFLE(3, 3, 3, 3) ); +} + +inline XMVECTOR XM_CALLCONV XMVectorSwizzle( FXMVECTOR V, uint32_t E0, uint32_t E1, uint32_t E2, uint32_t E3 ) +{ + assert( (E0 < 4) && (E1 < 4) && (E2 < 4) && (E3 < 4) ); + _Analysis_assume_( (E0 < 4) && (E1 < 4) && (E2 < 4) && (E3 < 4) ); + + unsigned int elem[4] = { E0, E1, E2, E3 }; + __m128i vControl = _mm_loadu_si128( reinterpret_cast(&elem[0]) ); + return _mm_permutevar_ps( V, vControl ); +} + +inline XMVECTOR XM_CALLCONV XMVectorPermute( FXMVECTOR V1, FXMVECTOR V2, uint32_t PermuteX, uint32_t PermuteY, uint32_t PermuteZ, uint32_t PermuteW ) +{ + assert( PermuteX <= 7 && PermuteY <= 7 && PermuteZ <= 7 && PermuteW <= 7 ); + _Analysis_assume_( PermuteX <= 7 && PermuteY <= 7 && PermuteZ <= 7 && PermuteW <= 7 ); + + static const XMVECTORU32 three = { { { 3, 3, 3, 3 } } }; + + XM_ALIGNED_DATA(16) unsigned int elem[4] = { PermuteX, PermuteY, PermuteZ, PermuteW }; + __m128i vControl = _mm_load_si128( reinterpret_cast(&elem[0]) ); + + __m128i vSelect = _mm_cmpgt_epi32( vControl, three ); + vControl = _mm_castps_si128( _mm_and_ps( _mm_castsi128_ps( vControl ), three ) ); + + __m128 shuffled1 = _mm_permutevar_ps( V1, vControl ); + __m128 shuffled2 = _mm_permutevar_ps( V2, vControl ); + + __m128 masked1 = _mm_andnot_ps( _mm_castsi128_ps( vSelect ), shuffled1 ); + __m128 masked2 = _mm_and_ps( _mm_castsi128_ps( vSelect ), shuffled2 ); + + return _mm_or_ps( masked1, masked2 ); +} + +inline XMVECTOR XM_CALLCONV XMVectorShiftLeft(FXMVECTOR V1, FXMVECTOR V2, uint32_t Elements) +{ + assert( Elements < 4 ); + _Analysis_assume_( Elements < 4 ); + return 
AVX::XMVectorPermute(V1, V2, Elements, ((Elements) + 1), ((Elements) + 2), ((Elements) + 3)); +} + +inline XMVECTOR XM_CALLCONV XMVectorRotateLeft(FXMVECTOR V, uint32_t Elements) +{ + assert( Elements < 4 ); + _Analysis_assume_( Elements < 4 ); + return AVX::XMVectorSwizzle( V, Elements & 3, (Elements + 1) & 3, (Elements + 2) & 3, (Elements + 3) & 3 ); +} + +inline XMVECTOR XM_CALLCONV XMVectorRotateRight(FXMVECTOR V, uint32_t Elements) +{ + assert( Elements < 4 ); + _Analysis_assume_( Elements < 4 ); + return AVX::XMVectorSwizzle( V, (4 - (Elements)) & 3, (5 - (Elements)) & 3, (6 - (Elements)) & 3, (7 - (Elements)) & 3 ); +} + + +//------------------------------------------------------------------------------------- +// Permute Templates +//------------------------------------------------------------------------------------- + +namespace Internal +{ + // Slow path fallback for permutes that do not map to a single SSE opcode. + template struct PermuteHelper + { + static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) + { + static const XMVECTORU32 selectMask = + { + WhichX ? 0xFFFFFFFF : 0, + WhichY ? 0xFFFFFFFF : 0, + WhichZ ? 0xFFFFFFFF : 0, + WhichW ? 0xFFFFFFFF : 0, + }; + + XMVECTOR shuffled1 = _mm_permute_ps(v1, Shuffle); + XMVECTOR shuffled2 = _mm_permute_ps(v2, Shuffle); + + XMVECTOR masked1 = _mm_andnot_ps(selectMask, shuffled1); + XMVECTOR masked2 = _mm_and_ps(selectMask, shuffled2); + + return _mm_or_ps(masked1, masked2); + } + }; + + // Fast path for permutes that only read from the first vector. + template struct PermuteHelper + { + static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) { (v2); return _mm_permute_ps(v1, Shuffle); } + }; + + // Fast path for permutes that only read from the second vector. + template struct PermuteHelper + { + static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2){ (v1); return _mm_permute_ps(v2, Shuffle); } + }; + + // Fast path for permutes that read XY from the first vector, ZW from the second. + template struct PermuteHelper + { + static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) { return _mm_shuffle_ps(v1, v2, Shuffle); } + }; + + // Fast path for permutes that read XY from the second vector, ZW from the first. 
+ template struct PermuteHelper + { + static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) { return _mm_shuffle_ps(v2, v1, Shuffle); } + }; +}; + +// General permute template +template + inline XMVECTOR XM_CALLCONV XMVectorPermute(FXMVECTOR V1, FXMVECTOR V2) +{ + static_assert(PermuteX <= 7, "PermuteX template parameter out of range"); + static_assert(PermuteY <= 7, "PermuteY template parameter out of range"); + static_assert(PermuteZ <= 7, "PermuteZ template parameter out of range"); + static_assert(PermuteW <= 7, "PermuteW template parameter out of range"); + + const uint32_t Shuffle = _MM_SHUFFLE(PermuteW & 3, PermuteZ & 3, PermuteY & 3, PermuteX & 3); + + const bool WhichX = PermuteX > 3; + const bool WhichY = PermuteY > 3; + const bool WhichZ = PermuteZ > 3; + const bool WhichW = PermuteW > 3; + + return AVX::Internal::PermuteHelper::Permute(V1, V2); +} + +// Special-case permute templates +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,2,3>(FXMVECTOR V1, FXMVECTOR) { return V1; } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,6,7>(FXMVECTOR, FXMVECTOR V2) { return V2; } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x1); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x2); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x3); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x4); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x5); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x6); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x7); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x8); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x9); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xA); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xB); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xC); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xD); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xE); } + + +//------------------------------------------------------------------------------------- +// Swizzle Templates +//------------------------------------------------------------------------------------- + +// General swizzle template +template + inline XMVECTOR XM_CALLCONV XMVectorSwizzle(FXMVECTOR V) +{ + static_assert(SwizzleX <= 3, "SwizzleX template parameter out of range"); + static_assert(SwizzleY <= 3, "SwizzleY template parameter out of range"); + static_assert(SwizzleZ <= 3, "SwizzleZ template parameter out of range"); + static_assert(SwizzleW <= 3, "SwizzleW template parameter out of range"); + + 
return _mm_permute_ps( V, _MM_SHUFFLE( SwizzleW, SwizzleZ, SwizzleY, SwizzleX ) ); +} + +// Specialized swizzles +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,1,2,3>(FXMVECTOR V) { return V; } +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,0,2,2>(FXMVECTOR V) { return _mm_moveldup_ps(V); } +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1,1,3,3>(FXMVECTOR V) { return _mm_movehdup_ps(V); } + + +//------------------------------------------------------------------------------------- +// Other Templates +//------------------------------------------------------------------------------------- + +template + inline XMVECTOR XM_CALLCONV XMVectorShiftLeft(FXMVECTOR V1, FXMVECTOR V2) +{ + static_assert( Elements < 4, "Elements template parameter out of range" ); + return AVX::XMVectorPermute(V1, V2); +} + +template + inline XMVECTOR XM_CALLCONV XMVectorRotateLeft(FXMVECTOR V) +{ + static_assert( Elements < 4, "Elements template parameter out of range" ); + return AVX::XMVectorSwizzle(V); +} + +template + inline XMVECTOR XM_CALLCONV XMVectorRotateRight(FXMVECTOR V) +{ + static_assert( Elements < 4, "Elements template parameter out of range" ); + return AVX::XMVectorSwizzle<(4 - Elements) & 3, (5 - Elements) & 3, (6 - Elements) & 3, (7 - Elements) & 3>(V); +} + +} // namespace AVX + +} // namespace DirectX; diff --git a/src/thirdparty/DirectXMath-dec2022/Extensions/DirectXMathAVX2.h b/src/thirdparty/DirectXMath-dec2022/Extensions/DirectXMathAVX2.h new file mode 100644 index 000000000..9624dc954 --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/Extensions/DirectXMathAVX2.h @@ -0,0 +1,1037 @@ +//------------------------------------------------------------------------------------- +// DirectXMathAVX2.h -- AVX2 extensions for SIMD C++ Math library +// +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
+// +// http://go.microsoft.com/fwlink/?LinkID=615560 +//------------------------------------------------------------------------------------- + +#pragma once + +#if defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __arm__ || __aarch64__ +#error AVX2 not supported on ARM platform +#endif + +#include +#include + +namespace DirectX +{ + +namespace AVX2 +{ + +inline bool XMVerifyAVX2Support() +{ + // Should return true for AMD "Excavator", Intel "Haswell" or later processors + // with OS support for AVX (Windows 7 Service Pack 1, Windows Server 2008 R2 Service Pack 1, Windows 8, Windows Server 2012) + + // See http://msdn.microsoft.com/en-us/library/hskdteyh.aspx + int CPUInfo[4] = {-1}; +#if defined(__clang__) || defined(__GNUC__) + __cpuid(0, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]); +#else + __cpuid(CPUInfo, 0); +#endif + + if ( CPUInfo[0] < 7 ) + return false; + +#if defined(__clang__) || defined(__GNUC__) + __cpuid(1, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]); +#else + __cpuid(CPUInfo, 1); +#endif + + // We check for F16C, FMA3, AVX, OSXSAVE, SSSE4.1, and SSE3 + if ( (CPUInfo[2] & 0x38081001) != 0x38081001 ) + return false; + +#if defined(__clang__) || defined(__GNUC__) + __cpuid_count(7, 0, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]); +#else + __cpuidex(CPUInfo, 7, 0); +#endif + + return ( (CPUInfo[1] & 0x20 ) == 0x20 ); +} + + +//------------------------------------------------------------------------------------- +// Vector +//------------------------------------------------------------------------------------- + +inline XMVECTOR XM_CALLCONV XMVectorReplicatePtr( _In_ const float *pValue ) +{ + return _mm_broadcast_ss( pValue ); +} + +inline XMVECTOR XM_CALLCONV XMVectorSplatX( FXMVECTOR V ) +{ + return _mm_broadcastss_ps( V ); +} + +inline XMVECTOR XM_CALLCONV XMVectorSplatY( FXMVECTOR V ) +{ + return _mm_permute_ps( V, _MM_SHUFFLE(1, 1, 1, 1) ); +} + +inline XMVECTOR XM_CALLCONV XMVectorSplatZ( FXMVECTOR V ) +{ + return _mm_permute_ps( V, _MM_SHUFFLE(2, 2, 2, 2) ); +} + +inline XMVECTOR XM_CALLCONV XMVectorSplatW( FXMVECTOR V ) +{ + return _mm_permute_ps( V, _MM_SHUFFLE(3, 3, 3, 3) ); +} + +inline XMVECTOR XM_CALLCONV XMVectorMultiplyAdd +( + FXMVECTOR V1, + FXMVECTOR V2, + FXMVECTOR V3 +) +{ + return _mm_fmadd_ps( V1, V2, V3 ); +} + +inline XMVECTOR XM_CALLCONV XMVectorNegativeMultiplySubtract +( + FXMVECTOR V1, + FXMVECTOR V2, + FXMVECTOR V3 +) +{ + return _mm_fnmadd_ps( V1, V2, V3 ); +} + +inline XMVECTOR XM_CALLCONV XMVectorSwizzle( FXMVECTOR V, uint32_t E0, uint32_t E1, uint32_t E2, uint32_t E3 ) +{ + assert( (E0 < 4) && (E1 < 4) && (E2 < 4) && (E3 < 4) ); + _Analysis_assume_( (E0 < 4) && (E1 < 4) && (E2 < 4) && (E3 < 4) ); + + unsigned int elem[4] = { E0, E1, E2, E3 }; + __m128i vControl = _mm_loadu_si128( reinterpret_cast(&elem[0]) ); + return _mm_permutevar_ps( V, vControl ); +} + +inline XMVECTOR XM_CALLCONV XMVectorPermute( FXMVECTOR V1, FXMVECTOR V2, uint32_t PermuteX, uint32_t PermuteY, uint32_t PermuteZ, uint32_t PermuteW ) +{ + assert( PermuteX <= 7 && PermuteY <= 7 && PermuteZ <= 7 && PermuteW <= 7 ); + _Analysis_assume_( PermuteX <= 7 && PermuteY <= 7 && PermuteZ <= 7 && PermuteW <= 7 ); + + static const XMVECTORU32 three = { { { 3, 3, 3, 3 } } }; + + XM_ALIGNED_DATA(16) unsigned int elem[4] = { PermuteX, PermuteY, PermuteZ, PermuteW }; + __m128i vControl = _mm_load_si128( reinterpret_cast(&elem[0]) ); + + __m128i vSelect = _mm_cmpgt_epi32( vControl, three ); + vControl = _mm_castps_si128( _mm_and_ps( _mm_castsi128_ps( vControl 
), three ) ); + + __m128 shuffled1 = _mm_permutevar_ps( V1, vControl ); + __m128 shuffled2 = _mm_permutevar_ps( V2, vControl ); + + __m128 masked1 = _mm_andnot_ps( _mm_castsi128_ps( vSelect ), shuffled1 ); + __m128 masked2 = _mm_and_ps( _mm_castsi128_ps( vSelect ), shuffled2 ); + + return _mm_or_ps( masked1, masked2 ); +} + +inline XMVECTOR XM_CALLCONV XMVectorShiftLeft(FXMVECTOR V1, FXMVECTOR V2, uint32_t Elements) +{ + assert( Elements < 4 ); + _Analysis_assume_( Elements < 4 ); + return AVX2::XMVectorPermute(V1, V2, Elements, ((Elements) + 1), ((Elements) + 2), ((Elements) + 3)); +} + +inline XMVECTOR XM_CALLCONV XMVectorRotateLeft(FXMVECTOR V, uint32_t Elements) +{ + assert( Elements < 4 ); + _Analysis_assume_( Elements < 4 ); + return AVX2::XMVectorSwizzle( V, Elements & 3, (Elements + 1) & 3, (Elements + 2) & 3, (Elements + 3) & 3 ); +} + +inline XMVECTOR XM_CALLCONV XMVectorRotateRight(FXMVECTOR V, uint32_t Elements) +{ + assert( Elements < 4 ); + _Analysis_assume_( Elements < 4 ); + return AVX2::XMVectorSwizzle( V, (4 - (Elements)) & 3, (5 - (Elements)) & 3, (6 - (Elements)) & 3, (7 - (Elements)) & 3 ); +} + + +//------------------------------------------------------------------------------------- +// Vector2 +//------------------------------------------------------------------------------------- + +inline XMVECTOR XM_CALLCONV XMVector2Transform +( + FXMVECTOR V, + CXMMATRIX M +) +{ + XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y + vResult = _mm_fmadd_ps( vResult, M.r[1], M.r[3] ); + XMVECTOR vTemp = _mm_broadcastss_ps(V); // X + vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult ); + return vResult; +} + +inline XMVECTOR XM_CALLCONV XMVector2TransformCoord +( + FXMVECTOR V, + CXMMATRIX M +) +{ + XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y + vResult = _mm_fmadd_ps( vResult, M.r[1], M.r[3] ); + XMVECTOR vTemp = _mm_broadcastss_ps(V); // X + vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult ); + XMVECTOR W = _mm_permute_ps(vResult,_MM_SHUFFLE(3,3,3,3)); + vResult = _mm_div_ps( vResult, W ); + return vResult; +} + +inline XMVECTOR XM_CALLCONV XMVector2TransformNormal +( + FXMVECTOR V, + CXMMATRIX M +) +{ + XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y + vResult = _mm_mul_ps( vResult, M.r[1] ); + XMVECTOR vTemp = _mm_broadcastss_ps(V); // X + vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult ); + return vResult; +} + + +//------------------------------------------------------------------------------------- +// Vector3 +//------------------------------------------------------------------------------------- + +inline XMVECTOR XM_CALLCONV XMVector3Transform +( + FXMVECTOR V, + CXMMATRIX M +) +{ + XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(2,2,2,2)); // Z + vResult = _mm_fmadd_ps( vResult, M.r[2], M.r[3] ); + XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y + vResult = _mm_fmadd_ps( vTemp, M.r[1], vResult ); + vTemp = _mm_broadcastss_ps(V); // X + vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult ); + return vResult; +} + +inline XMVECTOR XM_CALLCONV XMVector3TransformCoord +( + FXMVECTOR V, + CXMMATRIX M +) +{ + XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(2,2,2,2)); // Z + vResult = _mm_fmadd_ps( vResult, M.r[2], M.r[3] ); + XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y + vResult = _mm_fmadd_ps( vTemp, M.r[1], vResult ); + vTemp = _mm_broadcastss_ps(V); // X + vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult ); + XMVECTOR W = _mm_permute_ps(vResult,_MM_SHUFFLE(3,3,3,3)); + vResult = _mm_div_ps( vResult, W ); + 
return vResult; +} + +inline XMVECTOR XM_CALLCONV XMVector3TransformNormal +( + FXMVECTOR V, + CXMMATRIX M +) +{ + XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(2,2,2,2)); // Z + vResult = _mm_mul_ps( vResult, M.r[2] ); + XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y + vResult = _mm_fmadd_ps( vTemp, M.r[1], vResult ); + vTemp = _mm_broadcastss_ps(V); // X + vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult ); + return vResult; +} + +XMMATRIX XM_CALLCONV XMMatrixMultiply(CXMMATRIX M1, CXMMATRIX M2); + +inline XMVECTOR XM_CALLCONV XMVector3Project +( + FXMVECTOR V, + float ViewportX, + float ViewportY, + float ViewportWidth, + float ViewportHeight, + float ViewportMinZ, + float ViewportMaxZ, + CXMMATRIX Projection, + CXMMATRIX View, + CXMMATRIX World +) +{ + const float HalfViewportWidth = ViewportWidth * 0.5f; + const float HalfViewportHeight = ViewportHeight * 0.5f; + + XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight, ViewportMaxZ - ViewportMinZ, 0.0f); + XMVECTOR Offset = XMVectorSet(ViewportX + HalfViewportWidth, ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f); + + XMMATRIX Transform = AVX2::XMMatrixMultiply(World, View); + Transform = AVX2::XMMatrixMultiply(Transform, Projection); + + XMVECTOR Result = AVX2::XMVector3TransformCoord(V, Transform); + + Result = AVX2::XMVectorMultiplyAdd(Result, Scale, Offset); + + return Result; +} + +inline XMVECTOR XM_CALLCONV XMVector3Unproject +( + FXMVECTOR V, + float ViewportX, + float ViewportY, + float ViewportWidth, + float ViewportHeight, + float ViewportMinZ, + float ViewportMaxZ, + CXMMATRIX Projection, + CXMMATRIX View, + CXMMATRIX World +) +{ + static const XMVECTORF32 D = { { { -1.0f, 1.0f, 0.0f, 0.0f } } }; + + XMVECTOR Scale = XMVectorSet(ViewportWidth * 0.5f, -ViewportHeight * 0.5f, ViewportMaxZ - ViewportMinZ, 1.0f); + Scale = XMVectorReciprocal(Scale); + + XMVECTOR Offset = XMVectorSet(-ViewportX, -ViewportY, -ViewportMinZ, 0.0f); + Offset = AVX2::XMVectorMultiplyAdd(Scale, Offset, D.v); + + XMMATRIX Transform = AVX2::XMMatrixMultiply(World, View); + Transform = AVX2::XMMatrixMultiply(Transform, Projection); + Transform = XMMatrixInverse(nullptr, Transform); + + XMVECTOR Result = AVX2::XMVectorMultiplyAdd(V, Scale, Offset); + + return AVX2::XMVector3TransformCoord(Result, Transform); +} + + +//------------------------------------------------------------------------------------- +// Vector4 +//------------------------------------------------------------------------------------- + +inline XMVECTOR XM_CALLCONV XMVector4Transform +( + FXMVECTOR V, + CXMMATRIX M +) +{ + XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(3,3,3,3)); // W + vResult = _mm_mul_ps( vResult, M.r[3] ); + XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(2,2,2,2)); // Z + vResult = _mm_fmadd_ps( vTemp, M.r[2], vResult ); + vTemp = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y + vResult = _mm_fmadd_ps( vTemp, M.r[1], vResult ); + vTemp = _mm_broadcastss_ps(V); // X + vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult ); + return vResult; +} + + +//------------------------------------------------------------------------------------- +// Matrix +//------------------------------------------------------------------------------------- + +inline XMMATRIX XM_CALLCONV XMMatrixMultiply +( + CXMMATRIX M1, + CXMMATRIX M2 +) +{ + XMMATRIX mResult; + // Use vW to hold the original row + XMVECTOR vW = M1.r[0]; + // Splat the component X,Y,Z then W + XMVECTOR vX = _mm_broadcastss_ps(vW); + XMVECTOR vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR 
vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + // Perform the operation on the first row + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_fmadd_ps(vY,M2.r[1],vX); + vX = _mm_fmadd_ps(vZ,M2.r[2],vX); + vX = _mm_fmadd_ps(vW,M2.r[3],vX); + mResult.r[0] = vX; + // Repeat for the other 3 rows + vW = M1.r[1]; + vX = _mm_broadcastss_ps(vW); + vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_fmadd_ps(vY,M2.r[1],vX); + vX = _mm_fmadd_ps(vZ,M2.r[2],vX); + vX = _mm_fmadd_ps(vW,M2.r[3],vX); + mResult.r[1] = vX; + vW = M1.r[2]; + vX = _mm_broadcastss_ps(vW); + vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_fmadd_ps(vY,M2.r[1],vX); + vX = _mm_fmadd_ps(vZ,M2.r[2],vX); + vX = _mm_fmadd_ps(vW,M2.r[3],vX); + mResult.r[2] = vX; + vW = M1.r[3]; + vX = _mm_broadcastss_ps(vW); + vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_fmadd_ps(vY,M2.r[1],vX); + vX = _mm_fmadd_ps(vZ,M2.r[2],vX); + vX = _mm_fmadd_ps(vW,M2.r[3],vX); + mResult.r[3] = vX; + return mResult; +} + +inline XMMATRIX XM_CALLCONV XMMatrixMultiplyTranspose +( + FXMMATRIX M1, + CXMMATRIX M2 +) +{ + // Use vW to hold the original row + XMVECTOR vW = M1.r[0]; + // Splat the component X,Y,Z then W + XMVECTOR vX = _mm_broadcastss_ps(vW); + XMVECTOR vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + // Perform the operation on the first row + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_fmadd_ps(vY,M2.r[1],vX); + vX = _mm_fmadd_ps(vZ,M2.r[2],vX); + vX = _mm_fmadd_ps(vW,M2.r[3],vX); + __m128 r0 = vX; + // Repeat for the other 3 rows + vW = M1.r[1]; + vX = _mm_broadcastss_ps(vW); + vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_fmadd_ps(vY,M2.r[1],vX); + vX = _mm_fmadd_ps(vZ,M2.r[2],vX); + vX = _mm_fmadd_ps(vW,M2.r[3],vX); + __m128 r1 = vX; + vW = M1.r[2]; + vX = _mm_broadcastss_ps(vW); + vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_fmadd_ps(vY,M2.r[1],vX); + vX = _mm_fmadd_ps(vZ,M2.r[2],vX); + vX = _mm_fmadd_ps(vW,M2.r[3],vX); + __m128 r2 = vX; + vW = M1.r[3]; + vX = _mm_broadcastss_ps(vW); + vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_fmadd_ps(vY,M2.r[1],vX); + vX = _mm_fmadd_ps(vZ,M2.r[2],vX); + vX = _mm_fmadd_ps(vW,M2.r[3],vX); + __m128 r3 = vX; + + // x.x,x.y,y.x,y.y + XMVECTOR vTemp1 = _mm_shuffle_ps(r0,r1,_MM_SHUFFLE(1,0,1,0)); + // x.z,x.w,y.z,y.w + XMVECTOR vTemp3 = _mm_shuffle_ps(r0,r1,_MM_SHUFFLE(3,2,3,2)); + // z.x,z.y,w.x,w.y + XMVECTOR vTemp2 = _mm_shuffle_ps(r2,r3,_MM_SHUFFLE(1,0,1,0)); + // z.z,z.w,w.z,w.w + XMVECTOR vTemp4 = _mm_shuffle_ps(r2,r3,_MM_SHUFFLE(3,2,3,2)); + + XMMATRIX mResult; + // x.x,y.x,z.x,w.x + mResult.r[0] = _mm_shuffle_ps(vTemp1, vTemp2,_MM_SHUFFLE(2,0,2,0)); + // 
x.y,y.y,z.y,w.y + mResult.r[1] = _mm_shuffle_ps(vTemp1, vTemp2,_MM_SHUFFLE(3,1,3,1)); + // x.z,y.z,z.z,w.z + mResult.r[2] = _mm_shuffle_ps(vTemp3, vTemp4,_MM_SHUFFLE(2,0,2,0)); + // x.w,y.w,z.w,w.w + mResult.r[3] = _mm_shuffle_ps(vTemp3, vTemp4,_MM_SHUFFLE(3,1,3,1)); + return mResult; +} + + +//------------------------------------------------------------------------------------- +// Permute Templates +//------------------------------------------------------------------------------------- + +namespace Internal +{ + // Slow path fallback for permutes that do not map to a single SSE opcode. + template struct PermuteHelper + { + static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) + { + static const XMVECTORU32 selectMask = + { + WhichX ? 0xFFFFFFFF : 0, + WhichY ? 0xFFFFFFFF : 0, + WhichZ ? 0xFFFFFFFF : 0, + WhichW ? 0xFFFFFFFF : 0, + }; + + XMVECTOR shuffled1 = _mm_permute_ps(v1, Shuffle); + XMVECTOR shuffled2 = _mm_permute_ps(v2, Shuffle); + + XMVECTOR masked1 = _mm_andnot_ps(selectMask, shuffled1); + XMVECTOR masked2 = _mm_and_ps(selectMask, shuffled2); + + return _mm_or_ps(masked1, masked2); + } + }; + + // Fast path for permutes that only read from the first vector. + template struct PermuteHelper + { + static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) { (v2); return _mm_permute_ps(v1, Shuffle); } + }; + + // Fast path for permutes that only read from the second vector. + template struct PermuteHelper + { + static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2){ (v1); return _mm_permute_ps(v2, Shuffle); } + }; + + // Fast path for permutes that read XY from the first vector, ZW from the second. + template struct PermuteHelper + { + static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) { return _mm_shuffle_ps(v1, v2, Shuffle); } + }; + + // Fast path for permutes that read XY from the second vector, ZW from the first. 
+ template struct PermuteHelper + { + static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) { return _mm_shuffle_ps(v2, v1, Shuffle); } + }; +}; + +// General permute template +template + inline XMVECTOR XM_CALLCONV XMVectorPermute(FXMVECTOR V1, FXMVECTOR V2) +{ + static_assert(PermuteX <= 7, "PermuteX template parameter out of range"); + static_assert(PermuteY <= 7, "PermuteY template parameter out of range"); + static_assert(PermuteZ <= 7, "PermuteZ template parameter out of range"); + static_assert(PermuteW <= 7, "PermuteW template parameter out of range"); + + const uint32_t Shuffle = _MM_SHUFFLE(PermuteW & 3, PermuteZ & 3, PermuteY & 3, PermuteX & 3); + + const bool WhichX = PermuteX > 3; + const bool WhichY = PermuteY > 3; + const bool WhichZ = PermuteZ > 3; + const bool WhichW = PermuteW > 3; + + return AVX2::Internal::PermuteHelper::Permute(V1, V2); +} + +// Special-case permute templates +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,2,3>(FXMVECTOR V1, FXMVECTOR) { return V1; } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,6,7>(FXMVECTOR, FXMVECTOR V2) { return V2; } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x1); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x2); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x3); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x4); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x5); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x6); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x7); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x8); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x9); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xA); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xB); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xC); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xD); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xE); } + + +//------------------------------------------------------------------------------------- +// Swizzle Templates +//------------------------------------------------------------------------------------- + +// General swizzle template +template + inline XMVECTOR XM_CALLCONV XMVectorSwizzle(FXMVECTOR V) +{ + static_assert(SwizzleX <= 3, "SwizzleX template parameter out of range"); + static_assert(SwizzleY <= 3, "SwizzleY template parameter out of range"); + static_assert(SwizzleZ <= 3, "SwizzleZ template parameter out of range"); + static_assert(SwizzleW <= 3, "SwizzleW template parameter out of range"); + + 
return _mm_permute_ps( V, _MM_SHUFFLE( SwizzleW, SwizzleZ, SwizzleY, SwizzleX ) ); +} + +// Specialized swizzles +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,1,2,3>(FXMVECTOR V) { return V; } +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,0,0,0>(FXMVECTOR V) { return _mm_broadcastss_ps(V); } +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,0,2,2>(FXMVECTOR V) { return _mm_moveldup_ps(V); } +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1,1,3,3>(FXMVECTOR V) { return _mm_movehdup_ps(V); } + + +//------------------------------------------------------------------------------------- +// Other Templates +//------------------------------------------------------------------------------------- + +template + inline XMVECTOR XM_CALLCONV XMVectorShiftLeft(FXMVECTOR V1, FXMVECTOR V2) +{ + static_assert( Elements < 4, "Elements template parameter out of range" ); + return AVX2::XMVectorPermute(V1, V2); +} + +template + inline XMVECTOR XM_CALLCONV XMVectorRotateLeft(FXMVECTOR V) +{ + static_assert( Elements < 4, "Elements template parameter out of range" ); + return AVX2::XMVectorSwizzle(V); +} + +template + inline XMVECTOR XM_CALLCONV XMVectorRotateRight(FXMVECTOR V) +{ + static_assert( Elements < 4, "Elements template parameter out of range" ); + return AVX2::XMVectorSwizzle<(4 - Elements) & 3, (5 - Elements) & 3, (6 - Elements) & 3, (7 - Elements) & 3>(V); +} + +//------------------------------------------------------------------------------------- +// Data conversion +//------------------------------------------------------------------------------------- + +inline float XMConvertHalfToFloat( PackedVector::HALF Value ) +{ + __m128i V1 = _mm_cvtsi32_si128( static_cast(Value) ); + __m128 V2 = _mm_cvtph_ps( V1 ); + return _mm_cvtss_f32( V2 ); +} + +inline PackedVector::HALF XMConvertFloatToHalf( float Value ) +{ + __m128 V1 = _mm_set_ss( Value ); + __m128i V2 = _mm_cvtps_ph( V1, 0 ); + return static_cast( _mm_cvtsi128_si32(V2) ); +} + +inline float* XMConvertHalfToFloatStream +( + _Out_writes_bytes_(sizeof(float)+OutputStride*(HalfCount-1)) float* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(2+InputStride*(HalfCount-1)) const PackedVector::HALF* pInputStream, + _In_ size_t InputStride, + _In_ size_t HalfCount +) +{ + using namespace PackedVector; + + assert(pOutputStream); + assert(pInputStream); + + assert(InputStride >= sizeof(HALF)); + assert(OutputStride >= sizeof(float)); + + auto pHalf = reinterpret_cast(pInputStream); + auto pFloat = reinterpret_cast(pOutputStream); + + size_t i = 0; + size_t four = HalfCount >> 2; + if (four > 0) + { + if (InputStride == sizeof(HALF)) + { + if (OutputStride == sizeof(float)) + { + if ((reinterpret_cast(pFloat) & 0xF) == 0) + { + // Packed input, aligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m128i HV = _mm_loadl_epi64(reinterpret_cast(pHalf)); + pHalf += InputStride * 4; + + __m128 FV = _mm_cvtph_ps(HV); + + _mm_stream_ps(reinterpret_cast(pFloat), FV); + pFloat += OutputStride * 4; + i += 4; + } + } + else + { + // Packed input, packed output + for (size_t j = 0; j < four; ++j) + { + __m128i HV = _mm_loadl_epi64(reinterpret_cast(pHalf)); + pHalf += InputStride * 4; + + __m128 FV = _mm_cvtph_ps(HV); + + _mm_storeu_ps(reinterpret_cast(pFloat), FV); + pFloat += OutputStride * 4; + i += 4; + } + } + } + else + { + // Packed input, scattered output + for (size_t j = 0; j < four; ++j) + { + __m128i HV = _mm_loadl_epi64(reinterpret_cast(pHalf)); + pHalf += InputStride * 4; + + __m128 FV = 
_mm_cvtph_ps(HV); + + _mm_store_ss(reinterpret_cast(pFloat), FV); + pFloat += OutputStride; + *reinterpret_cast(pFloat) = _mm_extract_ps(FV, 1); + pFloat += OutputStride; + *reinterpret_cast(pFloat) = _mm_extract_ps(FV, 2); + pFloat += OutputStride; + *reinterpret_cast(pFloat) = _mm_extract_ps(FV, 3); + pFloat += OutputStride; + i += 4; + } + } + } + else if (OutputStride == sizeof(float)) + { + if ((reinterpret_cast(pFloat) & 0xF) == 0) + { + // Scattered input, aligned & packed output + for (size_t j = 0; j < four; ++j) + { + uint16_t H1 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H2 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H3 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H4 = *reinterpret_cast(pHalf); + pHalf += InputStride; + + __m128i HV = _mm_setzero_si128(); + HV = _mm_insert_epi16(HV, H1, 0); + HV = _mm_insert_epi16(HV, H2, 1); + HV = _mm_insert_epi16(HV, H3, 2); + HV = _mm_insert_epi16(HV, H4, 3); + __m128 FV = _mm_cvtph_ps(HV); + + _mm_stream_ps(reinterpret_cast(pFloat), FV); + pFloat += OutputStride * 4; + i += 4; + } + } + else + { + // Scattered input, packed output + for (size_t j = 0; j < four; ++j) + { + uint16_t H1 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H2 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H3 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H4 = *reinterpret_cast(pHalf); + pHalf += InputStride; + + __m128i HV = _mm_setzero_si128(); + HV = _mm_insert_epi16(HV, H1, 0); + HV = _mm_insert_epi16(HV, H2, 1); + HV = _mm_insert_epi16(HV, H3, 2); + HV = _mm_insert_epi16(HV, H4, 3); + __m128 FV = _mm_cvtph_ps(HV); + + _mm_storeu_ps(reinterpret_cast(pFloat), FV); + pFloat += OutputStride * 4; + i += 4; + } + + } + } + else + { + // Scattered input, scattered output + for (size_t j = 0; j < four; ++j) + { + uint16_t H1 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H2 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H3 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H4 = *reinterpret_cast(pHalf); + pHalf += InputStride; + + __m128i HV = _mm_setzero_si128(); + HV = _mm_insert_epi16(HV, H1, 0); + HV = _mm_insert_epi16(HV, H2, 1); + HV = _mm_insert_epi16(HV, H3, 2); + HV = _mm_insert_epi16(HV, H4, 3); + __m128 FV = _mm_cvtph_ps(HV); + + _mm_store_ss(reinterpret_cast(pFloat), FV); + pFloat += OutputStride; + *reinterpret_cast(pFloat) = _mm_extract_ps(FV, 1); + pFloat += OutputStride; + *reinterpret_cast(pFloat) = _mm_extract_ps(FV, 2); + pFloat += OutputStride; + *reinterpret_cast(pFloat) = _mm_extract_ps(FV, 3); + pFloat += OutputStride; + i += 4; + } + } + } + + for (; i < HalfCount; ++i) + { + *reinterpret_cast(pFloat) = XMConvertHalfToFloat(reinterpret_cast(pHalf)[0]); + pHalf += InputStride; + pFloat += OutputStride; + } + + return pOutputStream; +} + + +inline PackedVector::HALF* XMConvertFloatToHalfStream +( + _Out_writes_bytes_(2+OutputStride*(FloatCount-1)) PackedVector::HALF* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(float)+InputStride*(FloatCount-1)) const float* pInputStream, + _In_ size_t InputStride, + _In_ size_t FloatCount +) +{ + using namespace PackedVector; + + assert(pOutputStream); + assert(pInputStream); + + assert(InputStride >= sizeof(float)); + assert(OutputStride >= sizeof(HALF)); + + auto pFloat = reinterpret_cast(pInputStream); + auto pHalf = reinterpret_cast(pOutputStream); + + size_t i = 0; + size_t four = FloatCount >> 2; + if (four > 0) + { + if (InputStride == 
sizeof(float)) + { + if (OutputStride == sizeof(HALF)) + { + if ((reinterpret_cast(pFloat) & 0xF) == 0) + { + // Aligned and packed input, packed output + for (size_t j = 0; j < four; ++j) + { + __m128 FV = _mm_load_ps(reinterpret_cast(pFloat)); + pFloat += InputStride * 4; + + __m128i HV = _mm_cvtps_ph(FV, 0); + + _mm_storel_epi64(reinterpret_cast<__m128i*>(pHalf), HV); + pHalf += OutputStride * 4; + i += 4; + } + } + else + { + // Packed input, packed output + for (size_t j = 0; j < four; ++j) + { + __m128 FV = _mm_loadu_ps(reinterpret_cast(pFloat)); + pFloat += InputStride * 4; + + __m128i HV = _mm_cvtps_ph(FV, 0); + + _mm_storel_epi64(reinterpret_cast<__m128i*>(pHalf), HV); + pHalf += OutputStride * 4; + i += 4; + } + } + } + else + { + if ((reinterpret_cast(pFloat) & 0xF) == 0) + { + // Aligned & packed input, scattered output + for (size_t j = 0; j < four; ++j) + { + __m128 FV = _mm_load_ps(reinterpret_cast(pFloat)); + pFloat += InputStride * 4; + + __m128i HV = _mm_cvtps_ph(FV, 0); + + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 0)); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 1)); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 2)); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 3)); + pHalf += OutputStride; + i += 4; + } + } + else + { + // Packed input, scattered output + for (size_t j = 0; j < four; ++j) + { + __m128 FV = _mm_loadu_ps(reinterpret_cast(pFloat)); + pFloat += InputStride * 4; + + __m128i HV = _mm_cvtps_ph(FV, 0); + + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 0)); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 1)); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 2)); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 3)); + pHalf += OutputStride; + i += 4; + } + } + } + } + else if (OutputStride == sizeof(HALF)) + { + // Scattered input, packed output + for (size_t j = 0; j < four; ++j) + { + __m128 FV1 = _mm_load_ss(reinterpret_cast(pFloat)); + pFloat += InputStride; + + __m128 FV2 = _mm_broadcast_ss(reinterpret_cast(pFloat)); + pFloat += InputStride; + + __m128 FV3 = _mm_broadcast_ss(reinterpret_cast(pFloat)); + pFloat += InputStride; + + __m128 FV4 = _mm_broadcast_ss(reinterpret_cast(pFloat)); + pFloat += InputStride; + + __m128 FV = _mm_blend_ps(FV1, FV2, 0x2); + __m128 FT = _mm_blend_ps(FV3, FV4, 0x8); + FV = _mm_blend_ps(FV, FT, 0xC); + + __m128i HV = _mm_cvtps_ph(FV, 0); + + _mm_storel_epi64(reinterpret_cast<__m128i*>(pHalf), HV); + pHalf += OutputStride * 4; + i += 4; + } + } + else + { + // Scattered input, scattered output + for (size_t j = 0; j < four; ++j) + { + __m128 FV1 = _mm_load_ss(reinterpret_cast(pFloat)); + pFloat += InputStride; + + __m128 FV2 = _mm_broadcast_ss(reinterpret_cast(pFloat)); + pFloat += InputStride; + + __m128 FV3 = _mm_broadcast_ss(reinterpret_cast(pFloat)); + pFloat += InputStride; + + __m128 FV4 = _mm_broadcast_ss(reinterpret_cast(pFloat)); + pFloat += InputStride; + + __m128 FV = _mm_blend_ps(FV1, FV2, 0x2); + __m128 FT = _mm_blend_ps(FV3, FV4, 0x8); + FV = _mm_blend_ps(FV, FT, 0xC); + + __m128i HV = _mm_cvtps_ph(FV, 0); + + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 0)); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 1)); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = 
static_cast(_mm_extract_epi16(HV, 2)); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 3)); + pHalf += OutputStride; + i += 4; + } + } + } + + for (; i < FloatCount; ++i) + { + *reinterpret_cast(pHalf) = XMConvertFloatToHalf(reinterpret_cast(pFloat)[0]); + pFloat += InputStride; + pHalf += OutputStride; + } + + return pOutputStream; +} + + +//------------------------------------------------------------------------------------- +// Half2 +//------------------------------------------------------------------------------------- + +inline XMVECTOR XM_CALLCONV XMLoadHalf2( _In_ const PackedVector::XMHALF2* pSource ) +{ + assert(pSource); + __m128 V = _mm_load_ss( reinterpret_cast(pSource) ); + return _mm_cvtph_ps( _mm_castps_si128( V ) ); +} + +inline void XM_CALLCONV XMStoreHalf2( _Out_ PackedVector::XMHALF2* pDestination, _In_ FXMVECTOR V ) +{ + assert(pDestination); + __m128i V1 = _mm_cvtps_ph( V, 0 ); + _mm_store_ss( reinterpret_cast(pDestination), _mm_castsi128_ps(V1) ); +} + + +//------------------------------------------------------------------------------------- +// Half4 +//------------------------------------------------------------------------------------- + +inline XMVECTOR XM_CALLCONV XMLoadHalf4( _In_ const PackedVector::XMHALF4* pSource ) +{ + assert(pSource); + __m128i V = _mm_loadl_epi64( reinterpret_cast(pSource) ); + return _mm_cvtph_ps( V ); +} + +inline void XM_CALLCONV XMStoreHalf4( _Out_ PackedVector::XMHALF4* pDestination, _In_ FXMVECTOR V ) +{ + assert(pDestination); + __m128i V1 = _mm_cvtps_ph( V, 0 ); + _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), V1 ); +} + +} // namespace AVX2 + +} // namespace DirectX; diff --git a/src/thirdparty/DirectXMath-dec2022/Extensions/DirectXMathBE.h b/src/thirdparty/DirectXMath-dec2022/Extensions/DirectXMathBE.h new file mode 100644 index 000000000..e5a0f85f7 --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/Extensions/DirectXMathBE.h @@ -0,0 +1,95 @@ +//------------------------------------------------------------------------------------- +// DirectXMathBE.h -- Big-endian swap extensions for SIMD C++ Math library +// +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
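+//
+// NOTE: XMVectorEndian byte-swaps each 32-bit lane of a vector. The SSSE3
+// specialization further below should only be used after XMVerifySSSE3Support()
+// has returned true on the current CPU; otherwise use the portable version.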
+// +// http://go.microsoft.com/fwlink/?LinkID=615560 +//------------------------------------------------------------------------------------- + +#pragma once + +#if (defined(_M_IX86) || defined(_M_X64) || __i386__ || __x86_64__) && !defined(_M_HYBRID_X86_ARM64) +#include +#endif + +#include + +namespace DirectX +{ + +inline XMVECTOR XM_CALLCONV XMVectorEndian +( + FXMVECTOR V +) +{ +#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + static const XMVECTORU32 idx = { { { 0x00010203u, 0x04050607u, 0x08090A0Bu, 0x0C0D0E0Fu } } }; + + uint8x8x2_t tbl; + tbl.val[0] = vreinterpret_u8_f32(vget_low_f32(V)); + tbl.val[1] = vreinterpret_u8_f32(vget_high_f32(V)); + + const uint8x8_t rL = vtbl2_u8(tbl, vget_low_u32(idx)); + const uint8x8_t rH = vtbl2_u8(tbl, vget_high_u32(idx)); + return vcombine_f32(vreinterpret_f32_u8(rL), vreinterpret_f32_u8(rH)); +#else + XMVECTORU32 E; + E.v = V; + uint32_t value = E.u[0]; + E.u[0] = ( (value << 24) | ((value & 0xFF00) << 8) | ((value & 0xFF0000) >> 8) | (value >> 24) ); + value = E.u[1]; + E.u[1] = ( (value << 24) | ((value & 0xFF00) << 8) | ((value & 0xFF0000) >> 8) | (value >> 24) ); + value = E.u[2]; + E.u[2] = ( (value << 24) | ((value & 0xFF00) << 8) | ((value & 0xFF0000) >> 8) | (value >> 24) ); + value = E.u[3]; + E.u[3] = ( (value << 24) | ((value & 0xFF00) << 8) | ((value & 0xFF0000) >> 8) | (value >> 24) ); + return E.v; +#endif +} + + +#if (defined(_M_IX86) || defined(_M_X64) || __i386__ || __x86_64__) && !defined(_M_HYBRID_X86_ARM64) +namespace SSSE3 +{ + +inline bool XMVerifySSSE3Support() +{ + // Should return true on AMD Bulldozer, Intel Core i7/i5/i3, Intel Atom, or later processors + + // See http://msdn.microsoft.com/en-us/library/hskdteyh.aspx + int CPUInfo[4] = { -1 }; +#if defined(__clang__) || defined(__GNUC__) + __cpuid(0, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]); +#else + __cpuid(CPUInfo, 0); +#endif + + if ( CPUInfo[0] < 1 ) + return false; + +#if defined(__clang__) || defined(__GNUC__) + __cpuid(1, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]); +#else + __cpuid(CPUInfo, 1); +#endif + + // Check for SSSE3 instruction set. + return ( (CPUInfo[2] & 0x200) != 0 ); +} + +inline XMVECTOR XM_CALLCONV XMVectorEndian +( + FXMVECTOR V +) +{ + static const XMVECTORU32 idx = { { { 0x00010203u, 0x04050607u, 0x08090A0Bu, 0x0C0D0E0Fu } } }; + + __m128i Result = _mm_shuffle_epi8( _mm_castps_si128(V), idx ); + return _mm_castsi128_ps( Result ); +} + +} // namespace SSSE3 +#endif // X86 || X64 + +} // namespace DirectX diff --git a/src/thirdparty/DirectXMath-dec2022/Extensions/DirectXMathF16C.h b/src/thirdparty/DirectXMath-dec2022/Extensions/DirectXMathF16C.h new file mode 100644 index 000000000..5802be68e --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/Extensions/DirectXMathF16C.h @@ -0,0 +1,471 @@ +//------------------------------------------------------------------------------------- +// DirectXMathF16C.h -- F16C/CVT16 extensions for SIMD C++ Math library +// +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
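+//
+// NOTE: every routine in this header requires F16C support. One possible usage
+// pattern (illustrative only, not part of the library) is a one-time CPU check
+// with a scalar fallback:
+//
+//   static const bool s_hasF16C = DirectX::F16C::XMVerifyF16CSupport();
+//   inline float HalfToFloat(DirectX::PackedVector::HALF h)
+//   {
+//       return s_hasF16C ? DirectX::F16C::XMConvertHalfToFloat(h)
+//                        : DirectX::PackedVector::XMConvertHalfToFloat(h);
+//   }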
+// +// http://go.microsoft.com/fwlink/?LinkID=615560 +//------------------------------------------------------------------------------------- + +#pragma once + +#if defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __arm__ || __aarch64__ +#error F16C not supported on ARM platform +#endif + +#include +#include + +namespace DirectX +{ + +namespace F16C +{ + +inline bool XMVerifyF16CSupport() +{ + // Should return true for AMD "Piledriver" and Intel "Ivy Bridge" processors + // with OS support for AVX (Windows 7 Service Pack 1, Windows Server 2008 R2 Service Pack 1, Windows 8, Windows Server 2012) + + // See http://msdn.microsoft.com/en-us/library/hskdteyh.aspx + int CPUInfo[4] = { -1 }; +#if defined(__clang__) || defined(__GNUC__) + __cpuid(0, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]); +#else + __cpuid(CPUInfo, 0); +#endif + + if ( CPUInfo[0] < 1 ) + return false; + +#if defined(__clang__) || defined(__GNUC__) + __cpuid(1, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]); +#else + __cpuid(CPUInfo, 1); +#endif + + // We check for F16C, AVX, OSXSAVE, and SSE4.1 + return ( (CPUInfo[2] & 0x38080000 ) == 0x38080000 ); +} + + +//------------------------------------------------------------------------------------- +// Data conversion +//------------------------------------------------------------------------------------- + +inline float XMConvertHalfToFloat( PackedVector::HALF Value ) +{ + __m128i V1 = _mm_cvtsi32_si128( static_cast(Value) ); + __m128 V2 = _mm_cvtph_ps( V1 ); + return _mm_cvtss_f32( V2 ); +} + +inline PackedVector::HALF XMConvertFloatToHalf( float Value ) +{ + __m128 V1 = _mm_set_ss( Value ); + __m128i V2 = _mm_cvtps_ph( V1, 0 ); + return static_cast( _mm_cvtsi128_si32(V2) ); +} + +inline float* XMConvertHalfToFloatStream +( + _Out_writes_bytes_(sizeof(float) + OutputStride * (HalfCount - 1)) float* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(2 + InputStride * (HalfCount - 1)) const PackedVector::HALF* pInputStream, + _In_ size_t InputStride, + _In_ size_t HalfCount +) +{ + using namespace PackedVector; + + assert(pOutputStream); + assert(pInputStream); + + assert(InputStride >= sizeof(HALF)); + assert(OutputStride >= sizeof(float)); + + auto pHalf = reinterpret_cast(pInputStream); + auto pFloat = reinterpret_cast(pOutputStream); + + size_t i = 0; + size_t four = HalfCount >> 2; + if (four > 0) + { + if (InputStride == sizeof(HALF)) + { + if (OutputStride == sizeof(float)) + { + if ((reinterpret_cast(pFloat) & 0xF) == 0) + { + // Packed input, aligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m128i HV = _mm_loadl_epi64(reinterpret_cast(pHalf)); + pHalf += InputStride * 4; + + __m128 FV = _mm_cvtph_ps(HV); + + _mm_stream_ps(reinterpret_cast(pFloat), FV); + pFloat += OutputStride * 4; + i += 4; + } + } + else + { + // Packed input, packed output + for (size_t j = 0; j < four; ++j) + { + __m128i HV = _mm_loadl_epi64(reinterpret_cast(pHalf)); + pHalf += InputStride * 4; + + __m128 FV = _mm_cvtph_ps(HV); + + _mm_storeu_ps(reinterpret_cast(pFloat), FV); + pFloat += OutputStride * 4; + i += 4; + } + } + } + else + { + // Packed input, scattered output + for (size_t j = 0; j < four; ++j) + { + __m128i HV = _mm_loadl_epi64(reinterpret_cast(pHalf)); + pHalf += InputStride * 4; + + __m128 FV = _mm_cvtph_ps(HV); + + _mm_store_ss(reinterpret_cast(pFloat), FV); + pFloat += OutputStride; + *reinterpret_cast(pFloat) = _mm_extract_ps(FV, 1); + pFloat += OutputStride; + *reinterpret_cast(pFloat) = _mm_extract_ps(FV, 2); + pFloat += 
OutputStride; + *reinterpret_cast(pFloat) = _mm_extract_ps(FV, 3); + pFloat += OutputStride; + i += 4; + } + } + } + else if (OutputStride == sizeof(float)) + { + if ((reinterpret_cast(pFloat) & 0xF) == 0) + { + // Scattered input, aligned & packed output + for (size_t j = 0; j < four; ++j) + { + uint16_t H1 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H2 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H3 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H4 = *reinterpret_cast(pHalf); + pHalf += InputStride; + + __m128i HV = _mm_setzero_si128(); + HV = _mm_insert_epi16(HV, H1, 0); + HV = _mm_insert_epi16(HV, H2, 1); + HV = _mm_insert_epi16(HV, H3, 2); + HV = _mm_insert_epi16(HV, H4, 3); + __m128 FV = _mm_cvtph_ps(HV); + + _mm_stream_ps(reinterpret_cast(pFloat), FV); + pFloat += OutputStride * 4; + i += 4; + } + } + else + { + // Scattered input, packed output + for (size_t j = 0; j < four; ++j) + { + uint16_t H1 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H2 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H3 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H4 = *reinterpret_cast(pHalf); + pHalf += InputStride; + + __m128i HV = _mm_setzero_si128(); + HV = _mm_insert_epi16(HV, H1, 0); + HV = _mm_insert_epi16(HV, H2, 1); + HV = _mm_insert_epi16(HV, H3, 2); + HV = _mm_insert_epi16(HV, H4, 3); + __m128 FV = _mm_cvtph_ps(HV); + + _mm_storeu_ps(reinterpret_cast(pFloat), FV); + pFloat += OutputStride * 4; + i += 4; + } + + } + } + else + { + // Scattered input, scattered output + for (size_t j = 0; j < four; ++j) + { + uint16_t H1 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H2 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H3 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H4 = *reinterpret_cast(pHalf); + pHalf += InputStride; + + __m128i HV = _mm_setzero_si128(); + HV = _mm_insert_epi16(HV, H1, 0); + HV = _mm_insert_epi16(HV, H2, 1); + HV = _mm_insert_epi16(HV, H3, 2); + HV = _mm_insert_epi16(HV, H4, 3); + __m128 FV = _mm_cvtph_ps(HV); + + _mm_store_ss(reinterpret_cast(pFloat), FV); + pFloat += OutputStride; + *reinterpret_cast(pFloat) = _mm_extract_ps(FV, 1); + pFloat += OutputStride; + *reinterpret_cast(pFloat) = _mm_extract_ps(FV, 2); + pFloat += OutputStride; + *reinterpret_cast(pFloat) = _mm_extract_ps(FV, 3); + pFloat += OutputStride; + i += 4; + } + } + } + + for (; i < HalfCount; ++i) + { + *reinterpret_cast(pFloat) = XMConvertHalfToFloat(reinterpret_cast(pHalf)[0]); + pHalf += InputStride; + pFloat += OutputStride; + } + + return pOutputStream; +} + + +inline PackedVector::HALF* XMConvertFloatToHalfStream +( + _Out_writes_bytes_(2 + OutputStride * (FloatCount - 1)) PackedVector::HALF* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(float) + InputStride * (FloatCount - 1)) const float* pInputStream, + _In_ size_t InputStride, + _In_ size_t FloatCount +) +{ + using namespace PackedVector; + + assert(pOutputStream); + assert(pInputStream); + + assert(InputStride >= sizeof(float)); + assert(OutputStride >= sizeof(HALF)); + + auto pFloat = reinterpret_cast(pInputStream); + auto pHalf = reinterpret_cast(pOutputStream); + + size_t i = 0; + size_t four = FloatCount >> 2; + if (four > 0) + { + if (InputStride == sizeof(float)) + { + if (OutputStride == sizeof(HALF)) + { + if ((reinterpret_cast(pFloat) & 0xF) == 0) + { + // Aligned and packed input, packed output + for (size_t j = 0; j < four; ++j) + { + __m128 FV = 
_mm_load_ps(reinterpret_cast(pFloat)); + pFloat += InputStride * 4; + + __m128i HV = _mm_cvtps_ph(FV, 0); + + _mm_storel_epi64(reinterpret_cast<__m128i*>(pHalf), HV); + pHalf += OutputStride * 4; + i += 4; + } + } + else + { + // Packed input, packed output + for (size_t j = 0; j < four; ++j) + { + __m128 FV = _mm_loadu_ps(reinterpret_cast(pFloat)); + pFloat += InputStride * 4; + + __m128i HV = _mm_cvtps_ph(FV, 0); + + _mm_storel_epi64(reinterpret_cast<__m128i*>(pHalf), HV); + pHalf += OutputStride * 4; + i += 4; + } + } + } + else + { + if ((reinterpret_cast(pFloat) & 0xF) == 0) + { + // Aligned & packed input, scattered output + for (size_t j = 0; j < four; ++j) + { + __m128 FV = _mm_load_ps(reinterpret_cast(pFloat)); + pFloat += InputStride * 4; + + __m128i HV = _mm_cvtps_ph(FV, 0); + + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 0)); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 1)); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 2)); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 3)); + pHalf += OutputStride; + i += 4; + } + } + else + { + // Packed input, scattered output + for (size_t j = 0; j < four; ++j) + { + __m128 FV = _mm_loadu_ps(reinterpret_cast(pFloat)); + pFloat += InputStride * 4; + + __m128i HV = _mm_cvtps_ph(FV, 0); + + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 0)); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 1)); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 2)); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 3)); + pHalf += OutputStride; + i += 4; + } + } + } + } + else if (OutputStride == sizeof(HALF)) + { + // Scattered input, packed output + for (size_t j = 0; j < four; ++j) + { + __m128 FV1 = _mm_load_ss(reinterpret_cast(pFloat)); + pFloat += InputStride; + + __m128 FV2 = _mm_broadcast_ss(reinterpret_cast(pFloat)); + pFloat += InputStride; + + __m128 FV3 = _mm_broadcast_ss(reinterpret_cast(pFloat)); + pFloat += InputStride; + + __m128 FV4 = _mm_broadcast_ss(reinterpret_cast(pFloat)); + pFloat += InputStride; + + __m128 FV = _mm_blend_ps(FV1, FV2, 0x2); + __m128 FT = _mm_blend_ps(FV3, FV4, 0x8); + FV = _mm_blend_ps(FV, FT, 0xC); + + __m128i HV = _mm_cvtps_ph(FV, 0); + + _mm_storel_epi64(reinterpret_cast<__m128i*>(pHalf), HV); + pHalf += OutputStride * 4; + i += 4; + } + } + else + { + // Scattered input, scattered output + for (size_t j = 0; j < four; ++j) + { + __m128 FV1 = _mm_load_ss(reinterpret_cast(pFloat)); + pFloat += InputStride; + + __m128 FV2 = _mm_broadcast_ss(reinterpret_cast(pFloat)); + pFloat += InputStride; + + __m128 FV3 = _mm_broadcast_ss(reinterpret_cast(pFloat)); + pFloat += InputStride; + + __m128 FV4 = _mm_broadcast_ss(reinterpret_cast(pFloat)); + pFloat += InputStride; + + __m128 FV = _mm_blend_ps(FV1, FV2, 0x2); + __m128 FT = _mm_blend_ps(FV3, FV4, 0x8); + FV = _mm_blend_ps(FV, FT, 0xC); + + __m128i HV = _mm_cvtps_ph(FV, 0); + + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 0)); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 1)); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 2)); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 3)); + pHalf += OutputStride; + i += 4; + } + } + } + + for (; i < FloatCount; ++i) + { + 
*reinterpret_cast(pHalf) = XMConvertFloatToHalf(reinterpret_cast(pFloat)[0]); + pFloat += InputStride; + pHalf += OutputStride; + } + + return pOutputStream; +} + + +//------------------------------------------------------------------------------------- +// Half2 +//------------------------------------------------------------------------------------- + +inline XMVECTOR XM_CALLCONV XMLoadHalf2( _In_ const PackedVector::XMHALF2* pSource ) +{ + assert(pSource); + __m128 V = _mm_load_ss( reinterpret_cast(pSource) ); + return _mm_cvtph_ps( _mm_castps_si128( V ) ); +} + +inline void XM_CALLCONV XMStoreHalf2( _Out_ PackedVector::XMHALF2* pDestination, _In_ FXMVECTOR V ) +{ + assert(pDestination); + __m128i V1 = _mm_cvtps_ph( V, 0 ); + _mm_store_ss( reinterpret_cast(pDestination), _mm_castsi128_ps(V1) ); +} + + +//------------------------------------------------------------------------------------- +// Half4 +//------------------------------------------------------------------------------------- + +inline XMVECTOR XM_CALLCONV XMLoadHalf4( _In_ const PackedVector::XMHALF4* pSource ) +{ + assert(pSource); + __m128i V = _mm_loadl_epi64( reinterpret_cast(pSource) ); + return _mm_cvtph_ps( V ); +} + +inline void XM_CALLCONV XMStoreHalf4( _Out_ PackedVector::XMHALF4* pDestination, _In_ FXMVECTOR V ) +{ + assert(pDestination); + __m128i V1 = _mm_cvtps_ph( V, 0 ); + _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), V1 ); +} + +} // namespace F16C + +} // namespace DirectX diff --git a/src/thirdparty/DirectXMath-dec2022/Extensions/DirectXMathFMA3.h b/src/thirdparty/DirectXMath-dec2022/Extensions/DirectXMathFMA3.h new file mode 100644 index 000000000..8fae18e0b --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/Extensions/DirectXMathFMA3.h @@ -0,0 +1,391 @@ +//------------------------------------------------------------------------------------- +// DirectXMathFMA3.h -- FMA3 extensions for SIMD C++ Math library +// +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
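+//
+// NOTE: the overloads below assume FMA3 and AVX are available; call
+// XMVerifyFMA3Support() once at startup and only route math through the
+// DirectX::FMA3 namespace when it returns true.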
+// +// http://go.microsoft.com/fwlink/?LinkID=615560 +//------------------------------------------------------------------------------------- + +#pragma once + +#if defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __arm__ || __aarch64__ +#error FMA3 not supported on ARM platform +#endif + +#include + +namespace DirectX +{ + +namespace FMA3 +{ + +inline bool XMVerifyFMA3Support() +{ + // Should return true for AMD "Pildriver" and Intel "Haswell" processors + // with OS support for AVX (Windows 7 Service Pack 1, Windows Server 2008 R2 Service Pack 1, Windows 8, Windows Server 2012) + + // See http://msdn.microsoft.com/en-us/library/hskdteyh.aspx + int CPUInfo[4] = {-1}; +#if defined(__clang__) || defined(__GNUC__) + __cpuid(0, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]); +#else + __cpuid(CPUInfo, 0); +#endif + + if ( CPUInfo[0] < 1 ) + return false; + +#if defined(__clang__) || defined(__GNUC__) + __cpuid(1, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]); +#else + __cpuid(CPUInfo, 1); +#endif + + // We check for FMA3, AVX, OSXSAVE + return ( (CPUInfo[2] & 0x18001000) == 0x18001000 ); +} + + +//------------------------------------------------------------------------------------- +// Vector +//------------------------------------------------------------------------------------- + +inline XMVECTOR XM_CALLCONV XMVectorMultiplyAdd +( + FXMVECTOR V1, + FXMVECTOR V2, + FXMVECTOR V3 +) +{ + return _mm_fmadd_ps( V1, V2, V3 ); +} + +inline XMVECTOR XM_CALLCONV XMVectorNegativeMultiplySubtract +( + FXMVECTOR V1, + FXMVECTOR V2, + FXMVECTOR V3 +) +{ + return _mm_fnmadd_ps( V1, V2, V3 ); +} + + +//------------------------------------------------------------------------------------- +// Vector2 +//------------------------------------------------------------------------------------- + +inline XMVECTOR XM_CALLCONV XMVector2Transform +( + FXMVECTOR V, + CXMMATRIX M +) +{ + XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y + vResult = _mm_fmadd_ps( vResult, M.r[1], M.r[3] ); + XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X + vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult ); + return vResult; +} + +inline XMVECTOR XM_CALLCONV XMVector2TransformCoord +( + FXMVECTOR V, + CXMMATRIX M +) +{ + XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y + vResult = _mm_fmadd_ps( vResult, M.r[1], M.r[3] ); + XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X + vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult ); + XMVECTOR W = _mm_permute_ps(vResult,_MM_SHUFFLE(3,3,3,3)); + vResult = _mm_div_ps( vResult, W ); + return vResult; +} + +inline XMVECTOR XM_CALLCONV XMVector2TransformNormal +( + FXMVECTOR V, + CXMMATRIX M +) +{ + XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y + vResult = _mm_mul_ps( vResult, M.r[1] ); + XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X + vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult ); + return vResult; +} + + +//------------------------------------------------------------------------------------- +// Vector3 +//------------------------------------------------------------------------------------- + +inline XMVECTOR XM_CALLCONV XMVector3Transform +( + FXMVECTOR V, + CXMMATRIX M +) +{ + XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(2,2,2,2)); // Z + vResult = _mm_fmadd_ps( vResult, M.r[2], M.r[3] ); + XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y + vResult = _mm_fmadd_ps( vTemp, M.r[1], vResult ); + vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X + vResult = 
_mm_fmadd_ps( vTemp, M.r[0], vResult ); + return vResult; +} + +inline XMVECTOR XM_CALLCONV XMVector3TransformCoord +( + FXMVECTOR V, + CXMMATRIX M +) +{ + XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(2,2,2,2)); // Z + vResult = _mm_fmadd_ps( vResult, M.r[2], M.r[3] ); + XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y + vResult = _mm_fmadd_ps( vTemp, M.r[1], vResult ); + vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X + vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult ); + XMVECTOR W = _mm_permute_ps(vResult,_MM_SHUFFLE(3,3,3,3)); + vResult = _mm_div_ps( vResult, W ); + return vResult; +} + +inline XMVECTOR XM_CALLCONV XMVector3TransformNormal +( + FXMVECTOR V, + CXMMATRIX M +) +{ + XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(2,2,2,2)); // Z + vResult = _mm_mul_ps( vResult, M.r[2] ); + XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y + vResult = _mm_fmadd_ps( vTemp, M.r[1], vResult ); + vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X + vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult ); + return vResult; +} + +XMMATRIX XM_CALLCONV XMMatrixMultiply(CXMMATRIX M1, CXMMATRIX M2); + +inline XMVECTOR XM_CALLCONV XMVector3Project +( + FXMVECTOR V, + float ViewportX, + float ViewportY, + float ViewportWidth, + float ViewportHeight, + float ViewportMinZ, + float ViewportMaxZ, + CXMMATRIX Projection, + CXMMATRIX View, + CXMMATRIX World +) +{ + const float HalfViewportWidth = ViewportWidth * 0.5f; + const float HalfViewportHeight = ViewportHeight * 0.5f; + + XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight, ViewportMaxZ - ViewportMinZ, 0.0f); + XMVECTOR Offset = XMVectorSet(ViewportX + HalfViewportWidth, ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f); + + XMMATRIX Transform = FMA3::XMMatrixMultiply(World, View); + Transform = FMA3::XMMatrixMultiply(Transform, Projection); + + XMVECTOR Result = FMA3::XMVector3TransformCoord(V, Transform); + + Result = FMA3::XMVectorMultiplyAdd(Result, Scale, Offset); + + return Result; +} + +inline XMVECTOR XM_CALLCONV XMVector3Unproject +( + FXMVECTOR V, + float ViewportX, + float ViewportY, + float ViewportWidth, + float ViewportHeight, + float ViewportMinZ, + float ViewportMaxZ, + CXMMATRIX Projection, + CXMMATRIX View, + CXMMATRIX World +) +{ + static const XMVECTORF32 D = { { { -1.0f, 1.0f, 0.0f, 0.0f } } }; + + XMVECTOR Scale = XMVectorSet(ViewportWidth * 0.5f, -ViewportHeight * 0.5f, ViewportMaxZ - ViewportMinZ, 1.0f); + Scale = XMVectorReciprocal(Scale); + + XMVECTOR Offset = XMVectorSet(-ViewportX, -ViewportY, -ViewportMinZ, 0.0f); + Offset = FMA3::XMVectorMultiplyAdd(Scale, Offset, D.v); + + XMMATRIX Transform = FMA3::XMMatrixMultiply(World, View); + Transform = FMA3::XMMatrixMultiply(Transform, Projection); + Transform = XMMatrixInverse(nullptr, Transform); + + XMVECTOR Result = FMA3::XMVectorMultiplyAdd(V, Scale, Offset); + + return FMA3::XMVector3TransformCoord(Result, Transform); +} + + +//------------------------------------------------------------------------------------- +// Vector4 +//------------------------------------------------------------------------------------- + +inline XMVECTOR XM_CALLCONV XMVector4Transform +( + FXMVECTOR V, + CXMMATRIX M +) +{ + XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(3,3,3,3)); // W + vResult = _mm_mul_ps( vResult, M.r[3] ); + XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(2,2,2,2)); // Z + vResult = _mm_fmadd_ps( vTemp, M.r[2], vResult ); + vTemp = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y + vResult = _mm_fmadd_ps( vTemp, M.r[1], vResult ); + 
vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X + vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult ); + return vResult; +} + + +//------------------------------------------------------------------------------------- +// Matrix +//------------------------------------------------------------------------------------- + +inline XMMATRIX XM_CALLCONV XMMatrixMultiply +( + CXMMATRIX M1, + CXMMATRIX M2 +) +{ + XMMATRIX mResult; + // Use vW to hold the original row + XMVECTOR vW = M1.r[0]; + // Splat the component X,Y,Z then W + XMVECTOR vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); + XMVECTOR vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + // Perform the operation on the first row + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_fmadd_ps(vY,M2.r[1],vX); + vX = _mm_fmadd_ps(vZ,M2.r[2],vX); + vX = _mm_fmadd_ps(vW,M2.r[3],vX); + mResult.r[0] = vX; + // Repeat for the other 3 rows + vW = M1.r[1]; + vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); + vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_fmadd_ps(vY,M2.r[1],vX); + vX = _mm_fmadd_ps(vZ,M2.r[2],vX); + vX = _mm_fmadd_ps(vW,M2.r[3],vX); + mResult.r[1] = vX; + vW = M1.r[2]; + vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); + vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_fmadd_ps(vY,M2.r[1],vX); + vX = _mm_fmadd_ps(vZ,M2.r[2],vX); + vX = _mm_fmadd_ps(vW,M2.r[3],vX); + mResult.r[2] = vX; + vW = M1.r[3]; + vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); + vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_fmadd_ps(vY,M2.r[1],vX); + vX = _mm_fmadd_ps(vZ,M2.r[2],vX); + vX = _mm_fmadd_ps(vW,M2.r[3],vX); + mResult.r[3] = vX; + return mResult; +} + +inline XMMATRIX XM_CALLCONV XMMatrixMultiplyTranspose +( + FXMMATRIX M1, + CXMMATRIX M2 +) +{ + // Use vW to hold the original row + XMVECTOR vW = M1.r[0]; + // Splat the component X,Y,Z then W + XMVECTOR vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); + XMVECTOR vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + // Perform the operation on the first row + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_fmadd_ps(vY,M2.r[1],vX); + vX = _mm_fmadd_ps(vZ,M2.r[2],vX); + vX = _mm_fmadd_ps(vW,M2.r[3],vX); + __m128 r0 = vX; + // Repeat for the other 3 rows + vW = M1.r[1]; + vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); + vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_fmadd_ps(vY,M2.r[1],vX); + vX = _mm_fmadd_ps(vZ,M2.r[2],vX); + vX = _mm_fmadd_ps(vW,M2.r[3],vX); + __m128 r1 = vX; + vW = M1.r[2]; + vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); + vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_fmadd_ps(vY,M2.r[1],vX); + vX = _mm_fmadd_ps(vZ,M2.r[2],vX); + vX = _mm_fmadd_ps(vW,M2.r[3],vX); + __m128 r2 = vX; + vW = M1.r[3]; + vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); + 
vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_fmadd_ps(vY,M2.r[1],vX); + vX = _mm_fmadd_ps(vZ,M2.r[2],vX); + vX = _mm_fmadd_ps(vW,M2.r[3],vX); + __m128 r3 = vX; + + // x.x,x.y,y.x,y.y + XMVECTOR vTemp1 = _mm_shuffle_ps(r0,r1,_MM_SHUFFLE(1,0,1,0)); + // x.z,x.w,y.z,y.w + XMVECTOR vTemp3 = _mm_shuffle_ps(r0,r1,_MM_SHUFFLE(3,2,3,2)); + // z.x,z.y,w.x,w.y + XMVECTOR vTemp2 = _mm_shuffle_ps(r2,r3,_MM_SHUFFLE(1,0,1,0)); + // z.z,z.w,w.z,w.w + XMVECTOR vTemp4 = _mm_shuffle_ps(r2,r3,_MM_SHUFFLE(3,2,3,2)); + + XMMATRIX mResult; + // x.x,y.x,z.x,w.x + mResult.r[0] = _mm_shuffle_ps(vTemp1, vTemp2,_MM_SHUFFLE(2,0,2,0)); + // x.y,y.y,z.y,w.y + mResult.r[1] = _mm_shuffle_ps(vTemp1, vTemp2,_MM_SHUFFLE(3,1,3,1)); + // x.z,y.z,z.z,w.z + mResult.r[2] = _mm_shuffle_ps(vTemp3, vTemp4,_MM_SHUFFLE(2,0,2,0)); + // x.w,y.w,z.w,w.w + mResult.r[3] = _mm_shuffle_ps(vTemp3, vTemp4,_MM_SHUFFLE(3,1,3,1)); + return mResult; +} + +} // namespace FMA3 + +} // namespace DirectX; diff --git a/src/thirdparty/DirectXMath-dec2022/Extensions/DirectXMathFMA4.h b/src/thirdparty/DirectXMath-dec2022/Extensions/DirectXMathFMA4.h new file mode 100644 index 000000000..2cec13e38 --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/Extensions/DirectXMathFMA4.h @@ -0,0 +1,415 @@ +//------------------------------------------------------------------------------------- +// DirectXMathFMA4.h -- FMA4 extensions for SIMD C++ Math library +// +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. +// +// http://go.microsoft.com/fwlink/?LinkID=615560 +//------------------------------------------------------------------------------------- + +#pragma once + +#if defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __arm__ || __aarch64__ +#error FMA4 not supported on ARM platform +#endif + +#include +#include + +#ifdef __GNUC__ +#include +#endif + +namespace DirectX +{ + +namespace FMA4 +{ + +inline bool XMVerifyFMA4Support() +{ + // Should return true for AMD Bulldozer processors + // with OS support for AVX (Windows 7 Service Pack 1, Windows Server 2008 R2 Service Pack 1, Windows 8, Windows Server 2012) + + // See http://msdn.microsoft.com/en-us/library/hskdteyh.aspx + int CPUInfo[4] = {-1}; +#if defined(__clang__) || defined(__GNUC__) + __cpuid(0, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]); +#else + __cpuid(CPUInfo, 0); +#endif + + if ( CPUInfo[0] < 1 ) + return false; + +#if defined(__clang__) || defined(__GNUC__) + __cpuid(1, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]); +#else + __cpuid(CPUInfo, 1); +#endif + + // We check for AVX, OSXSAVE (required to access FMA4) + if ( (CPUInfo[2] & 0x18000000) != 0x18000000 ) + return false; + +#if defined(__clang__) || defined(__GNUC__) + __cpuid(0x80000000, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]); +#else + __cpuid(CPUInfo, 0x80000000); +#endif + + if ( uint32_t(CPUInfo[0]) < 0x80000001u ) + return false; + + // We check for FMA4 +#if defined(__clang__) || defined(__GNUC__) + __cpuid(0x80000001, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]); +#else + __cpuid(CPUInfo, 0x80000001); +#endif + + return ( CPUInfo[2] & 0x10000 ); +} + + +//------------------------------------------------------------------------------------- +// Vector +//------------------------------------------------------------------------------------- + +inline XMVECTOR XM_CALLCONV XMVectorMultiplyAdd +( + FXMVECTOR V1, + FXMVECTOR V2, + 
FXMVECTOR V3 +) +{ + return _mm_macc_ps( V1, V2, V3 ); +} + +inline XMVECTOR XM_CALLCONV XMVectorNegativeMultiplySubtract +( + FXMVECTOR V1, + FXMVECTOR V2, + FXMVECTOR V3 +) +{ + return _mm_nmacc_ps( V1, V2, V3 ); +} + + +//------------------------------------------------------------------------------------- +// Vector2 +//------------------------------------------------------------------------------------- + +inline XMVECTOR XM_CALLCONV XMVector2Transform +( + FXMVECTOR V, + CXMMATRIX M +) +{ + XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y + vResult = _mm_macc_ps( vResult, M.r[1], M.r[3] ); + XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X + vResult = _mm_macc_ps( vTemp, M.r[0], vResult ); + return vResult; +} + +inline XMVECTOR XM_CALLCONV XMVector2TransformCoord +( + FXMVECTOR V, + CXMMATRIX M +) +{ + XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y + vResult = _mm_macc_ps( vResult, M.r[1], M.r[3] ); + XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X + vResult = _mm_macc_ps( vTemp, M.r[0], vResult ); + XMVECTOR W = _mm_permute_ps(vResult,_MM_SHUFFLE(3,3,3,3)); + vResult = _mm_div_ps( vResult, W ); + return vResult; +} + +inline XMVECTOR XM_CALLCONV XMVector2TransformNormal +( + FXMVECTOR V, + CXMMATRIX M +) +{ + XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y + vResult = _mm_mul_ps( vResult, M.r[1] ); + XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X + vResult = _mm_macc_ps( vTemp, M.r[0], vResult ); + return vResult; +} + + +//------------------------------------------------------------------------------------- +// Vector3 +//------------------------------------------------------------------------------------- + +inline XMVECTOR XM_CALLCONV XMVector3Transform +( + FXMVECTOR V, + CXMMATRIX M +) +{ + XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(2,2,2,2)); // Z + vResult = _mm_macc_ps( vResult, M.r[2], M.r[3] ); + XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y + vResult = _mm_macc_ps( vTemp, M.r[1], vResult ); + vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X + vResult = _mm_macc_ps( vTemp, M.r[0], vResult ); + return vResult; +} + +inline XMVECTOR XM_CALLCONV XMVector3TransformCoord +( + FXMVECTOR V, + CXMMATRIX M +) +{ + XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(2,2,2,2)); // Z + vResult = _mm_macc_ps( vResult, M.r[2], M.r[3] ); + XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y + vResult = _mm_macc_ps( vTemp, M.r[1], vResult ); + vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X + vResult = _mm_macc_ps( vTemp, M.r[0], vResult ); + XMVECTOR W = _mm_permute_ps(vResult,_MM_SHUFFLE(3,3,3,3)); + vResult = _mm_div_ps( vResult, W ); + return vResult; +} + +inline XMVECTOR XM_CALLCONV XMVector3TransformNormal +( + FXMVECTOR V, + CXMMATRIX M +) +{ + XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(2,2,2,2)); // Z + vResult = _mm_mul_ps( vResult, M.r[2] ); + XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y + vResult = _mm_macc_ps( vTemp, M.r[1], vResult ); + vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X + vResult = _mm_macc_ps( vTemp, M.r[0], vResult ); + return vResult; +} + +XMMATRIX XM_CALLCONV XMMatrixMultiply(CXMMATRIX M1, CXMMATRIX M2); + +inline XMVECTOR XM_CALLCONV XMVector3Project +( + FXMVECTOR V, + float ViewportX, + float ViewportY, + float ViewportWidth, + float ViewportHeight, + float ViewportMinZ, + float ViewportMaxZ, + CXMMATRIX Projection, + CXMMATRIX View, + CXMMATRIX World +) +{ + const float HalfViewportWidth = 
ViewportWidth * 0.5f; + const float HalfViewportHeight = ViewportHeight * 0.5f; + + XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight, ViewportMaxZ - ViewportMinZ, 0.0f); + XMVECTOR Offset = XMVectorSet(ViewportX + HalfViewportWidth, ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f); + + XMMATRIX Transform = FMA4::XMMatrixMultiply(World, View); + Transform = FMA4::XMMatrixMultiply(Transform, Projection); + + XMVECTOR Result = FMA4::XMVector3TransformCoord(V, Transform); + + Result = FMA4::XMVectorMultiplyAdd(Result, Scale, Offset); + + return Result; +} + +inline XMVECTOR XM_CALLCONV XMVector3Unproject +( + FXMVECTOR V, + float ViewportX, + float ViewportY, + float ViewportWidth, + float ViewportHeight, + float ViewportMinZ, + float ViewportMaxZ, + CXMMATRIX Projection, + CXMMATRIX View, + CXMMATRIX World +) +{ + static const XMVECTORF32 D = { { { -1.0f, 1.0f, 0.0f, 0.0f } } }; + + XMVECTOR Scale = XMVectorSet(ViewportWidth * 0.5f, -ViewportHeight * 0.5f, ViewportMaxZ - ViewportMinZ, 1.0f); + Scale = XMVectorReciprocal(Scale); + + XMVECTOR Offset = XMVectorSet(-ViewportX, -ViewportY, -ViewportMinZ, 0.0f); + Offset = FMA4::XMVectorMultiplyAdd(Scale, Offset, D.v); + + XMMATRIX Transform = FMA4::XMMatrixMultiply(World, View); + Transform = FMA4::XMMatrixMultiply(Transform, Projection); + Transform = XMMatrixInverse(nullptr, Transform); + + XMVECTOR Result = FMA4::XMVectorMultiplyAdd(V, Scale, Offset); + + return FMA4::XMVector3TransformCoord(Result, Transform); +} + + +//------------------------------------------------------------------------------------- +// Vector4 +//------------------------------------------------------------------------------------- + +inline XMVECTOR XM_CALLCONV XMVector4Transform +( + FXMVECTOR V, + CXMMATRIX M +) +{ + XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(3,3,3,3)); // W + vResult = _mm_mul_ps( vResult, M.r[3] ); + XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(2,2,2,2)); // Z + vResult = _mm_macc_ps( vTemp, M.r[2], vResult ); + vTemp = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y + vResult = _mm_macc_ps( vTemp, M.r[1], vResult ); + vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X + vResult = _mm_macc_ps( vTemp, M.r[0], vResult ); + return vResult; +} + + +//------------------------------------------------------------------------------------- +// Matrix +//------------------------------------------------------------------------------------- + +inline XMMATRIX XM_CALLCONV XMMatrixMultiply +( + CXMMATRIX M1, + CXMMATRIX M2 +) +{ + XMMATRIX mResult; + // Use vW to hold the original row + XMVECTOR vW = M1.r[0]; + // Splat the component X,Y,Z then W + XMVECTOR vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); + XMVECTOR vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + // Perform the operation on the first row + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_macc_ps(vY,M2.r[1],vX); + vX = _mm_macc_ps(vZ,M2.r[2],vX); + vX = _mm_macc_ps(vW,M2.r[3],vX); + mResult.r[0] = vX; + // Repeat for the other 3 rows + vW = M1.r[1]; + vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); + vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_macc_ps(vY,M2.r[1],vX); + vX = _mm_macc_ps(vZ,M2.r[2],vX); + vX = _mm_macc_ps(vW,M2.r[3],vX); + mResult.r[1] = vX; + vW = M1.r[2]; + vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); + vY = 
_mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_macc_ps(vY,M2.r[1],vX); + vX = _mm_macc_ps(vZ,M2.r[2],vX); + vX = _mm_macc_ps(vW,M2.r[3],vX); + mResult.r[2] = vX; + vW = M1.r[3]; + vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); + vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_macc_ps(vY,M2.r[1],vX); + vX = _mm_macc_ps(vZ,M2.r[2],vX); + vX = _mm_macc_ps(vW,M2.r[3],vX); + mResult.r[3] = vX; + return mResult; +} + +inline XMMATRIX XM_CALLCONV XMMatrixMultiplyTranspose +( + FXMMATRIX M1, + CXMMATRIX M2 +) +{ + // Use vW to hold the original row + XMVECTOR vW = M1.r[0]; + // Splat the component X,Y,Z then W + XMVECTOR vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); + XMVECTOR vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + // Perform the operation on the first row + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_macc_ps(vY,M2.r[1],vX); + vX = _mm_macc_ps(vZ,M2.r[2],vX); + vX = _mm_macc_ps(vW,M2.r[3],vX); + __m128 r0 = vX; + // Repeat for the other 3 rows + vW = M1.r[1]; + vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); + vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_macc_ps(vY,M2.r[1],vX); + vX = _mm_macc_ps(vZ,M2.r[2],vX); + vX = _mm_macc_ps(vW,M2.r[3],vX); + __m128 r1 = vX; + vW = M1.r[2]; + vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); + vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_macc_ps(vY,M2.r[1],vX); + vX = _mm_macc_ps(vZ,M2.r[2],vX); + vX = _mm_macc_ps(vW,M2.r[3],vX); + __m128 r2 = vX; + vW = M1.r[3]; + vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); + vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_macc_ps(vY,M2.r[1],vX); + vX = _mm_macc_ps(vZ,M2.r[2],vX); + vX = _mm_macc_ps(vW,M2.r[3],vX); + __m128 r3 = vX; + + // x.x,x.y,y.x,y.y + XMVECTOR vTemp1 = _mm_shuffle_ps(r0,r1,_MM_SHUFFLE(1,0,1,0)); + // x.z,x.w,y.z,y.w + XMVECTOR vTemp3 = _mm_shuffle_ps(r0,r1,_MM_SHUFFLE(3,2,3,2)); + // z.x,z.y,w.x,w.y + XMVECTOR vTemp2 = _mm_shuffle_ps(r2,r3,_MM_SHUFFLE(1,0,1,0)); + // z.z,z.w,w.z,w.w + XMVECTOR vTemp4 = _mm_shuffle_ps(r2,r3,_MM_SHUFFLE(3,2,3,2)); + + XMMATRIX mResult; + // x.x,y.x,z.x,w.x + mResult.r[0] = _mm_shuffle_ps(vTemp1, vTemp2,_MM_SHUFFLE(2,0,2,0)); + // x.y,y.y,z.y,w.y + mResult.r[1] = _mm_shuffle_ps(vTemp1, vTemp2,_MM_SHUFFLE(3,1,3,1)); + // x.z,y.z,z.z,w.z + mResult.r[2] = _mm_shuffle_ps(vTemp3, vTemp4,_MM_SHUFFLE(2,0,2,0)); + // x.w,y.w,z.w,w.w + mResult.r[3] = _mm_shuffle_ps(vTemp3, vTemp4,_MM_SHUFFLE(3,1,3,1)); + return mResult; +} + +} // namespace FMA4 + +} // namespace DirectX; diff --git a/src/thirdparty/DirectXMath-dec2022/Extensions/DirectXMathSSE3.h b/src/thirdparty/DirectXMath-dec2022/Extensions/DirectXMathSSE3.h new file mode 100644 index 000000000..926de4a9b --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/Extensions/DirectXMathSSE3.h @@ -0,0 +1,111 @@ 
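+// NOTE: this header relies only on SSE3 horizontal adds (haddps) for its dot
+// product and length helpers; verify availability with XMVerifySSE3Support()
+// before preferring these over the baseline implementations.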
+//------------------------------------------------------------------------------------- +// DirectXMathSSE3.h -- SSE3 extensions for SIMD C++ Math library +// +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. +// +// http://go.microsoft.com/fwlink/?LinkID=615560 +//------------------------------------------------------------------------------------- + +#pragma once + +#if defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __arm__ || __aarch64__ +#error SSE3 not supported on ARM platform +#endif + +#include + +#include + +namespace DirectX +{ + +namespace SSE3 +{ + +inline bool XMVerifySSE3Support() +{ + // Should return true on AMD Athlon 64, AMD Phenom, and Intel Pentium 4 or later processors + + // See http://msdn.microsoft.com/en-us/library/hskdteyh.aspx + int CPUInfo[4] = { -1 }; +#if defined(__clang__) || defined(__GNUC__) + __cpuid(0, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]); +#else + __cpuid(CPUInfo, 0); +#endif + if ( CPUInfo[0] < 1 ) + return false; + +#if defined(__clang__) || defined(__GNUC__) + __cpuid(1, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]); +#else + __cpuid(CPUInfo, 1); +#endif + + // We only check for SSE3 instruction set. SSSE3 instructions are not used. + return ( (CPUInfo[2] & 0x1) != 0 ); +} + +inline XMVECTOR XM_CALLCONV XMVector2Dot +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ + XMVECTOR vTemp = _mm_mul_ps(V1,V2); + vTemp = _mm_hadd_ps(vTemp,vTemp); + return _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(0,0,0,0)); +} + +inline XMVECTOR XM_CALLCONV XMVector2LengthSq( FXMVECTOR V ) +{ + return SSE3::XMVector2Dot(V, V); +} + +inline XMVECTOR XM_CALLCONV XMVector3Dot +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ + XMVECTOR vTemp = _mm_mul_ps(V1,V2); + vTemp = _mm_and_ps( vTemp, g_XMMask3 ); + vTemp = _mm_hadd_ps(vTemp,vTemp); + return _mm_hadd_ps(vTemp,vTemp); +} + +inline XMVECTOR XM_CALLCONV XMVector3LengthSq( FXMVECTOR V ) +{ + return SSE3::XMVector3Dot(V, V); +} + +inline XMVECTOR XM_CALLCONV XMVector4Dot +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ + XMVECTOR vTemp = _mm_mul_ps(V1,V2); + vTemp = _mm_hadd_ps( vTemp, vTemp ); + return _mm_hadd_ps( vTemp, vTemp ); +} + +inline XMVECTOR XM_CALLCONV XMVector4LengthSq( FXMVECTOR V ) +{ + return SSE3::XMVector4Dot(V, V); +} + +inline XMVECTOR XM_CALLCONV XMVectorSwizzle_0022( FXMVECTOR V ) +{ + return _mm_moveldup_ps(V); +} + +inline XMVECTOR XM_CALLCONV XMVectorSwizzle_1133( FXMVECTOR V ) +{ + return _mm_movehdup_ps(V); +} + +} // namespace SSE3 + +} // namespace DirectX diff --git a/src/thirdparty/DirectXMath-dec2022/Extensions/DirectXMathSSE4.h b/src/thirdparty/DirectXMath-dec2022/Extensions/DirectXMathSSE4.h new file mode 100644 index 000000000..4e432a986 --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/Extensions/DirectXMathSSE4.h @@ -0,0 +1,417 @@ +//------------------------------------------------------------------------------------- +// DirectXMathSSE4.h -- SSE4.1 extensions for SIMD C++ Math library +// +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
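+//
+// NOTE: only SSE4.1 instructions are used here (dpps dot products, rounding,
+// lane insert/extract); SSE4.2 is not required. Gate callers on
+// XMVerifySSE4Support().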
+//
+// http://go.microsoft.com/fwlink/?LinkID=615560
+//-------------------------------------------------------------------------------------
+
+#pragma once
+
+#if defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __arm__ || __aarch64__
+#error SSE4 not supported on ARM platform
+#endif
+
+#include <DirectXMath.h>
+
+#include <smmintrin.h>
+
+namespace DirectX
+{
+
+namespace SSE4
+{
+
+inline bool XMVerifySSE4Support()
+{
+    // Should return true on AMD Bulldozer, Intel Core 2 ("Penryn"), and Intel Core i7 ("Nehalem") or later processors
+
+    // See http://msdn.microsoft.com/en-us/library/hskdteyh.aspx
+    int CPUInfo[4] = { -1 };
+#if defined(__clang__) || defined(__GNUC__)
+    __cpuid(0, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]);
+#else
+    __cpuid(CPUInfo, 0);
+#endif
+    if ( CPUInfo[0] < 1 )
+        return false;
+
+#if defined(__clang__) || defined(__GNUC__)
+    __cpuid(1, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]);
+#else
+    __cpuid(CPUInfo, 1);
+#endif
+
+    // We only check for SSE4.1 instruction set. SSE4.2 instructions are not used.
+    return ( (CPUInfo[2] & 0x80000) == 0x80000 );
+}
+
+
+//-------------------------------------------------------------------------------------
+// Vector
+//-------------------------------------------------------------------------------------
+
+#ifdef __clang__
+#pragma clang diagnostic ignored "-Wundefined-reinterpret-cast"
+#endif
+
+inline void XM_CALLCONV XMVectorGetYPtr(_Out_ float *y, _In_ FXMVECTOR V)
+{
+    assert( y != nullptr );
+    *reinterpret_cast<int*>(y) = _mm_extract_ps( V, 1 );
+}
+
+inline void XM_CALLCONV XMVectorGetZPtr(_Out_ float *z, _In_ FXMVECTOR V)
+{
+    assert( z != nullptr );
+    *reinterpret_cast<int*>(z) = _mm_extract_ps( V, 2 );
+}
+
+inline void XM_CALLCONV XMVectorGetWPtr(_Out_ float *w, _In_ FXMVECTOR V)
+{
+    assert( w != nullptr );
+    *reinterpret_cast<int*>(w) = _mm_extract_ps( V, 3 );
+}
+
+inline uint32_t XM_CALLCONV XMVectorGetIntY(FXMVECTOR V)
+{
+    __m128i V1 = _mm_castps_si128( V );
+    return static_cast<uint32_t>( _mm_extract_epi32( V1, 1 ) );
+}
+
+inline uint32_t XM_CALLCONV XMVectorGetIntZ(FXMVECTOR V)
+{
+    __m128i V1 = _mm_castps_si128( V );
+    return static_cast<uint32_t>( _mm_extract_epi32( V1, 2 ) );
+}
+
+inline uint32_t XM_CALLCONV XMVectorGetIntW(FXMVECTOR V)
+{
+    __m128i V1 = _mm_castps_si128( V );
+    return static_cast<uint32_t>( _mm_extract_epi32( V1, 3 ) );
+}
+
+inline void XM_CALLCONV XMVectorGetIntYPtr(_Out_ uint32_t *y, _In_ FXMVECTOR V)
+{
+    assert( y != nullptr );
+    __m128i V1 = _mm_castps_si128( V );
+    *y = static_cast<uint32_t>( _mm_extract_epi32( V1, 1 ) );
+}
+
+inline void XM_CALLCONV XMVectorGetIntZPtr(_Out_ uint32_t *z, _In_ FXMVECTOR V)
+{
+    assert( z != nullptr );
+    __m128i V1 = _mm_castps_si128( V );
+    *z = static_cast<uint32_t>( _mm_extract_epi32( V1, 2 ) );
+}
+
+inline void XM_CALLCONV XMVectorGetIntWPtr(_Out_ uint32_t *w, _In_ FXMVECTOR V)
+{
+    assert( w != nullptr );
+    __m128i V1 = _mm_castps_si128( V );
+    *w = static_cast<uint32_t>( _mm_extract_epi32( V1, 3 ) );
+}
+
+inline XMVECTOR XM_CALLCONV XMVectorSetY(FXMVECTOR V, float y)
+{
+    XMVECTOR vResult = _mm_set_ss(y);
+    vResult = _mm_insert_ps( V, vResult, 0x10 );
+    return vResult;
+}
+
+inline XMVECTOR XM_CALLCONV XMVectorSetZ(FXMVECTOR V, float z)
+{
+    XMVECTOR vResult = _mm_set_ss(z);
+    vResult = _mm_insert_ps( V, vResult, 0x20 );
+    return vResult;
+}
+
+inline XMVECTOR XM_CALLCONV XMVectorSetW(FXMVECTOR V, float w)
+{
+    XMVECTOR vResult = _mm_set_ss(w);
+    vResult = _mm_insert_ps( V, vResult, 0x30 );
+    return vResult;
+}
+
+inline XMVECTOR XM_CALLCONV XMVectorSetIntY(FXMVECTOR V, uint32_t y)
+{
+    __m128i vResult = _mm_castps_si128( V );
+    vResult = _mm_insert_epi32( vResult, static_cast<int>(y), 1 );
+    return _mm_castsi128_ps( vResult );
+}
+
+inline XMVECTOR XM_CALLCONV XMVectorSetIntZ(FXMVECTOR V, uint32_t z)
+{
+    __m128i vResult = _mm_castps_si128( V );
+    vResult = _mm_insert_epi32( vResult, static_cast<int>(z), 2 );
+    return _mm_castsi128_ps( vResult );
+}
+
+inline XMVECTOR XM_CALLCONV XMVectorSetIntW(FXMVECTOR V, uint32_t w)
+{
+    __m128i vResult = _mm_castps_si128( V );
+    vResult = _mm_insert_epi32( vResult, static_cast<int>(w), 3 );
+    return _mm_castsi128_ps( vResult );
+}
+
+inline XMVECTOR XM_CALLCONV XMVectorRound( FXMVECTOR V )
+{
+    return _mm_round_ps( V, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC );
+}
+
+inline XMVECTOR XM_CALLCONV XMVectorTruncate( FXMVECTOR V )
+{
+    return _mm_round_ps( V, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC );
+}
+
+inline XMVECTOR XM_CALLCONV XMVectorFloor( FXMVECTOR V )
+{
+    return _mm_floor_ps( V );
+}
+
+inline XMVECTOR XM_CALLCONV XMVectorCeiling( FXMVECTOR V )
+{
+    return _mm_ceil_ps( V );
+}
+
+
+//-------------------------------------------------------------------------------------
+// Vector2
+//-------------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVector2Dot( FXMVECTOR V1, FXMVECTOR V2 )
+{
+    return _mm_dp_ps( V1, V2, 0x3f );
+}
+
+inline XMVECTOR XM_CALLCONV XMVector2LengthSq( FXMVECTOR V )
+{
+    return SSE4::XMVector2Dot(V, V);
+}
+
+inline XMVECTOR XM_CALLCONV XMVector2ReciprocalLengthEst( FXMVECTOR V )
+{
+    XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f );
+    return _mm_rsqrt_ps( vTemp );
+}
+
+inline XMVECTOR XM_CALLCONV XMVector2ReciprocalLength( FXMVECTOR V )
+{
+    XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f );
+    XMVECTOR vLengthSq = _mm_sqrt_ps( vTemp );
+    return _mm_div_ps( g_XMOne, vLengthSq );
+}
+
+inline XMVECTOR XM_CALLCONV XMVector2LengthEst( FXMVECTOR V )
+{
+    XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f );
+    return _mm_sqrt_ps( vTemp );
+}
+
+inline XMVECTOR XM_CALLCONV XMVector2Length( FXMVECTOR V )
+{
+    XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f );
+    return _mm_sqrt_ps( vTemp );
+}
+
+inline XMVECTOR XM_CALLCONV XMVector2NormalizeEst( FXMVECTOR V )
+{
+    XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f );
+    XMVECTOR vResult = _mm_rsqrt_ps( vTemp );
+    return _mm_mul_ps(vResult, V);
+}
+
+inline XMVECTOR XM_CALLCONV XMVector2Normalize( FXMVECTOR V )
+{
+    XMVECTOR vLengthSq = _mm_dp_ps( V, V, 0x3f );
+    // Prepare for the division
+    XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
+    // Create zero with a single instruction
+    XMVECTOR vZeroMask = _mm_setzero_ps();
+    // Test for a divide by zero (Must be FP to detect -0.0)
+    vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult);
+    // Failsafe on zero (Or epsilon) length planes
+    // If the length is infinity, set the elements to zero
+    vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity);
+    // Reciprocal mul to perform the normalization
+    vResult = _mm_div_ps(V,vResult);
+    // Any that are infinity, set to zero
+    vResult = _mm_and_ps(vResult,vZeroMask);
+    // Select qnan or result based on infinite length
+    XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN);
+    XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq);
+    vResult = _mm_or_ps(vTemp1,vTemp2);
+    return vResult;
+}
+
+
+//-------------------------------------------------------------------------------------
+// Vector3
+//-------------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVector3Dot( FXMVECTOR V1, FXMVECTOR V2 )
+{
+    return _mm_dp_ps( V1, V2, 0x7f );
+}
+
+inline XMVECTOR
XM_CALLCONV XMVector3LengthSq( FXMVECTOR V ) +{ + return SSE4::XMVector3Dot(V, V); +} + +inline XMVECTOR XM_CALLCONV XMVector3ReciprocalLengthEst( FXMVECTOR V ) +{ + XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f ); + return _mm_rsqrt_ps( vTemp ); +} + +inline XMVECTOR XM_CALLCONV XMVector3ReciprocalLength( FXMVECTOR V ) +{ + XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f ); + XMVECTOR vLengthSq = _mm_sqrt_ps( vTemp ); + return _mm_div_ps( g_XMOne, vLengthSq ); +} + +inline XMVECTOR XM_CALLCONV XMVector3LengthEst( FXMVECTOR V ) +{ + XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f ); + return _mm_sqrt_ps( vTemp ); +} + +inline XMVECTOR XM_CALLCONV XMVector3Length( FXMVECTOR V ) +{ + XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f ); + return _mm_sqrt_ps( vTemp ); +} + +inline XMVECTOR XM_CALLCONV XMVector3NormalizeEst( FXMVECTOR V ) +{ + XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f ); + XMVECTOR vResult = _mm_rsqrt_ps( vTemp ); + return _mm_mul_ps(vResult, V); +} + +inline XMVECTOR XM_CALLCONV XMVector3Normalize( FXMVECTOR V ) +{ + XMVECTOR vLengthSq = _mm_dp_ps( V, V, 0x7f ); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Create zero with a single instruction + XMVECTOR vZeroMask = _mm_setzero_ps(); + // Test for a divide by zero (Must be FP to detect -0.0) + vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); + // Divide to perform the normalization + vResult = _mm_div_ps(V,vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult,vZeroMask); + // Select qnan or result based on infinite length + XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN); + XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq); + vResult = _mm_or_ps(vTemp1,vTemp2); + return vResult; +} + + +//------------------------------------------------------------------------------------- +// Vector4 +//------------------------------------------------------------------------------------- + +inline XMVECTOR XM_CALLCONV XMVector4Dot( FXMVECTOR V1, FXMVECTOR V2 ) +{ + return _mm_dp_ps( V1, V2, 0xff ); +} + +inline XMVECTOR XM_CALLCONV XMVector4LengthSq( FXMVECTOR V ) +{ + return SSE4::XMVector4Dot(V, V); +} + +inline XMVECTOR XM_CALLCONV XMVector4ReciprocalLengthEst( FXMVECTOR V ) +{ + XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff ); + return _mm_rsqrt_ps( vTemp ); +} + +inline XMVECTOR XM_CALLCONV XMVector4ReciprocalLength( FXMVECTOR V ) +{ + XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff ); + XMVECTOR vLengthSq = _mm_sqrt_ps( vTemp ); + return _mm_div_ps( g_XMOne, vLengthSq ); +} + +inline XMVECTOR XM_CALLCONV XMVector4LengthEst( FXMVECTOR V ) +{ + XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff ); + return _mm_sqrt_ps( vTemp ); +} + +inline XMVECTOR XM_CALLCONV XMVector4Length( FXMVECTOR V ) +{ + XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff ); + return _mm_sqrt_ps( vTemp ); +} + +inline XMVECTOR XM_CALLCONV XMVector4NormalizeEst( FXMVECTOR V ) +{ + XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff ); + XMVECTOR vResult = _mm_rsqrt_ps( vTemp ); + return _mm_mul_ps(vResult, V); +} + +inline XMVECTOR XM_CALLCONV XMVector4Normalize( FXMVECTOR V ) +{ + XMVECTOR vLengthSq = _mm_dp_ps( V, V, 0xff ); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Create zero with a single instruction + XMVECTOR vZeroMask = _mm_setzero_ps(); + // Test for a divide by zero (Must be FP to detect -0.0) + vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult); + // Failsafe on zero (Or epsilon) length planes + 
// If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); + // Divide to perform the normalization + vResult = _mm_div_ps(V,vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult,vZeroMask); + // Select qnan or result based on infinite length + XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN); + XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq); + vResult = _mm_or_ps(vTemp1,vTemp2); + return vResult; +} + + +//------------------------------------------------------------------------------------- +// Plane +//------------------------------------------------------------------------------------- + +inline XMVECTOR XM_CALLCONV XMPlaneNormalizeEst( FXMVECTOR P ) +{ + XMVECTOR vTemp = _mm_dp_ps( P, P, 0x7f ); + XMVECTOR vResult = _mm_rsqrt_ps( vTemp ); + return _mm_mul_ps(vResult, P); +} + +inline XMVECTOR XM_CALLCONV XMPlaneNormalize( FXMVECTOR P ) +{ + XMVECTOR vLengthSq = _mm_dp_ps( P, P, 0x7f ); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); + // Reciprocal mul to perform the normalization + vResult = _mm_div_ps(P,vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult,vLengthSq); + return vResult; +} + +} // namespace SSE4 + +} // namespace DirectX diff --git a/src/thirdparty/DirectXMath-dec2022/HISTORY.md b/src/thirdparty/DirectXMath-dec2022/HISTORY.md new file mode 100644 index 000000000..776291ab7 --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/HISTORY.md @@ -0,0 +1,198 @@ +# DirectXMath + +https://github.com/Microsoft/DirectXMath + +Release available for download on [GitHub](https://github.com/microsoft/DirectXMath/releases) + +## Release History + +### December 2022 (3.18) +* C++20 spaceship operators for XMFLOAT2, XMFLOAT3, etc. when building with ``/std:c++20 /Zc:_cplusplus`` +* Improved conformance for ARM64 when using `/Zc:arm64-aliased-neon-types-` +* Minor code review +* CMake project updated to require 3.20 or later +* Added Azure Dev Ops Pipeline YAML files + +### May 2022 (3.17b) +* Hot-fix to address ``-Wreserved-identifier`` warnings with clang v13 +* C++20 spaceship operators for XMFLOAT2, XMFLOAT3, etc. when building with ``/std:c++20 /Zc:_cplusplus`` +* Minor CMake project update + +### January 2022 (3.17) +* Added ColorsLinear namespace to DirectXColors.h with linear versions of .NET colors +* Optimized the ``XMMatrixRotationRollPitchYaw(FromVector)`` functions +* Fixed overread problem for 16bpp GPU types Load functions: + * ``XMUNIBBLE4``, ``XMU555``, ``XMU565``, ``XMBYTEN2``, ``XMBYTE2``, ``XMUBYTEN2``, ``XMUBYTE2`` +* ``XM_CACHE_LINE_SIZE`` updated for ARM/ARM64 targets to 128 bytes +* A few comments added to improve IntelliSense experience +* Conformance improvements for GNU compiler +* Minor code cleanup + +### January 2021 (3.16b) +* Hot-fixes to resolve build breaks for clang/LLVM and GCC on ARM64 +* ``XM_ALIGNED_DATA`` and ``XM_ALIGNED_STRUCT`` macros updated to use C++17 ``alignas`` when available + +### December 2020 (3.16) +* Added ``XMVectorLog10`` / ``XMVectorExp10`` +* Added ``XMColorRGBToYUV_UHD`` / ``XMColorYUVToRGB_UHD`` for Rec. 
2020 YUV +* Added optional ``rhcoords`` parameter for BoundingFrustum ``CreateFromMatrix`` +* Added use of Intel® Short Vector Matrix Library (SVML) supported by VS 2019 + * Opt-in with ``_XM_SVML_INTRINSICS_``; opt-out with ``_XM_DISABLE_INTEL_SVML_`` +* Fixed denorm handling for ``XMConvertFloatToHalf`` +* Fixed flush (too small for denorm) handling for ``XMStoreFloat3PK`` +* Fixed clamping bug in ``XMStoreByteN4`` +* Cleaned up ARM-NEON intrinsics type issues for improved portability on GNUC +* Fixed ``GXMVECTOR`` for x86 ``__vectorcall`` +* Code review + +### April 2020 (3.15) +* Added ``XMMatrixVectorTensorProduct`` for creating a matrix from two vectors +* Use of m256 registers and FMA3 with ``/arch:AVX2`` for stream and some matrix functions +* Optimized load/stores for SSE2 float2 & float3 functions +* Optimized some instruction choices for better AMD CPU support +* Improved conformance for clang/LLVM, GCC, and MinGW compilers +* Code review (``constexpr`` / ``noexcept`` usage) +* Retired VS 2015 support + +### August 2019 (3.14) +* Added float control around IsNan functions to resolve issue with VS 2019 with ``/fp:fast`` +* XMVerifyCPUSupport updated for clang/LLVM cpuid implementation on x86/x64 +* Added support for clang/LLVM built-in platform defines as well as the MSVC ones +* Cleaned up ARM-NEON intrinsics type issues for improved portability +* Removed unneeded malloc.h include in DirectXMath.h +* Whitespace cleanup + +### July 2018 (3.13) +* ``XMFLOAT3X4``, ``XMFLOAT3X4A``, and associated Load/Store functions +* Move/copy constructors and assignment operators for C++ types +* Minor fix for XMVectorClamp behavior with NaN +* Fixed compilation warnings with VS 2017 (15.7 update), Intel C++ 18.0 compiler, and clang 6 +* Retired VS 2013 support +* Minor code cleanup + +### February 2018 (3.12) +* ARM64 use of fused multiply-accumulate intriniscs +* Conformance fix for XMConvertFloatToHalf +* Minor code cleanup + +### June 2017 (3.11) +* AVX optimization of XMMatrixMultiply and XMMatrixMultiplyTranspose +* AVX2 optimization for XMVectorSplatX +* FMA3 optimization of XMVectorMultiplyAdd and XMVectorNegativeMultiplySubtract (implied by /arch:AVX2) +* Conformance fixes to support compilation with Clang 3.7 + +### January 2017 (3.10) +* Added XMVectorSum for horizontal adds +* ARMv8 intrinsics use for ARM64 platform (division, rounding, half-precision conversion) +* Added SSE3 codepaths using opt-in ``_XM_SSE3_INTRINSICS_`` +* XMVectorRound fix for no-intrinsics to match round to nearest (even) +* XMStoreFloat3SE fix when max channel isn't a perfect power of 2 +* constexpr conformance fix and workaround for compiler bug in VS 2015 RTM +* Remove support for VS 2012 compilers +* Remove ``__vector4i`` deprecated type + +### June 2016 (3.09) +* Includes support for additional optimizations when built with /arch:AVX or /arch:AVX2 +* Added use of constexpr for type constructors, XMConvertToRadians, and XMConvertToDegrees +* Marked ``__vector4i``, ``XMXDEC4``, ``XMDECN4``, ``XMDEC4``, and associated Load & Store functions as deprecated. 
+ + These are vestiges of Xbox 360 support and will be removed in a future release +* Renamed parameter in XMMatrixPerspectiveFov* to reduce user confusion when relying on IntelliSense +* XMU565, XMUNIBBLE4 constructors take uint8_t instead of int8_t + +### May 2016 +* DirectXMath 3.08 released under the MIT license + +### November 2015 (3.08) +* Added use of ``_mm_sfence`` for Stream methods +* Fixed bug with non-uniform scaling transforms for BoundingOrientedBox +* Added asserts for Near/FarZ in XMMatrix* methods +* Added use of ``=default`` for PODs with VS 2013/2015 +* Additional SSE and ARM-NEON optimizations for PackedVector functions + +### April 2015 (3.07) +* Fix customer reported bugs in BoundingBox methods +* Fix customer reported bug in XMStoreFloat3SE +* Fix customer reported bug in XMVectorATan2, XMVectorATan2Est +* Fix customer reported bug in XMVectorRound + +### October 2013 (3.06) +* Fixed load/store of XMFLOAT3SE to properly match the ``DXGI_FORMAT_R9G9B9E5_SHAREDEXP`` +* Added ``XMLoadUDecN4_XR`` and ``XMStoreUDecN4_XR`` to match ``DXGI_FORMAT_R10G10B10_XR_BIAS_A2_UNORM`` +* Added ``XMColorRGBToSRGB`` and ``XMColorSRGBToRGB`` to convert linear RGB <-> sRGB + +### July 2013 (3.05) +* Use x86/x64 ``__vectorcall`` calling-convention when available (``XM_CALLCONV``, ``HXMVECTOR``, ``FXMMATRIX`` introduced) +* Fixed bug with XMVectorFloor and XMVectorCeiling when given whole odd numbers (i.e. 105.0) +* Improved XMVectorRound algorithm +* ARM-NEON optimizations for XMVectorExp2, XMVectorLog2, XMVectorExpE, and XMVectorLogE +* ARM-NEON code paths use multiply-by-scalar intrinsics when supported +* Additional optimizations for ARM-NEON Stream functions +* Fixed potential warning C4723 using ``operator/`` or ``operator/=`` + +### March 2013 (3.04) +* ``XMVectorExp2``, ``XMVectorLog2``, ``XMVectorExpE``, and ``XMVectorLogE`` functions added to provide base-e support in addition to the existing base-2 support +* ``XMVectorExp`` and ``XMVectorLog`` are now aliases for XMVectorExp2 and XMVectorLog2 +* Additional optimizations for Stream functions +* XMVector3Cross now ensures w component is zero on ARM +* XMConvertHalfToFloat and XMConvertFloatToHalf now use IEEE 754 standard float16 behavior for INF/QNAN +* Updated matrix version Transform for BoundingOrientedBox and BoundingFrustum to handle scaling + +### March 2012 (3.03) +* *breaking change* Removed union members from XMMATRIX type to make it a fully 'opaque' type +* Marked single-parameter C++ constructors for XMFLOAT2, XMFLOAT2A, XMFLOAT3, XMFLOAT3A, XMFLOAT4, and XMFLOAT4A explicit + +### February 2012 (3.02) +* ARM-NEON intrinsics (selected by default for the ARM platform) +* Reworked XMVectorPermute, change of ``XM_PERMUTE_`` defines, removal of XMVectorPermuteControl +* Addition of ``XM_SWIZZLE_`` defines +* Optimizations for transcendental functions +* Template forms for permute, swizzle, shift-left, rotate-left, rotation-right, and insert +* Removal of deprecated types and functions + + ``XM_CACHE_LINE_SIZE`` define, XMVectorExpEst, XMVectorLogEst, XMVectorPowEst, XMVectorSinHEs, XMVectorCosHEst, XMVectorTanHEst, XMVector2InBoundsR, XMVector3InBoundsR, XMVector4InBoundsR +* Removed ``XM_STRICT_VECTOR4``; XMVECTOR in NO-INTRINSICS always defined without .x, .y, .z, .w, .v, or .u +* Additional bounding types +* SAL fixes and improvements + +### September 2011 (3.00) +* Renamed and reorganized the headers +* Introduced C++ namespaces +* Removed the Xbox 360-specific GPU types + + HENDN3, XMHEND3, XMUHENDN3, XMUHEND3, 
XMDHENN3, XMDHEN3, XMUDHENN3, XMUDHEN3, XMXICON4, XMXICO4, XMICON4, XMICO4, XMUICON4, XMUICO4 + +### July 2012 (XNAMath 2.05) +* Template forms have been added for `XMVectorPermute`, `XMVectorSwizzle`, `XMVectorShiftLeft`, `XMVectorRotateLeft`, `XMVectorRotateRight`, and `XMVectorInsert` +* The `XM_STRICT_XMMATRIX` compilation define has been added for opaque `XMMATRIX`. +* Stream stride and count arguments have been changed to `size_t` +* The ``pDeterminant`` parameter of `XMMatrixInverse` is now optional +* Additional operator= overloads for `XMBYTEN4`, `XMBYTE4`, `XMUBYTEN4`, and `XMUBYTE4` types are now available + +### February 2011 (XNAMath 2.04) +* Addition of new data types and associated load-store functions: + + `XMBYTEN2, XMBYTE2, XMUBYTEN2, XMUBYTE2` + + `XMLoadByteN2, XMLoadByte2, XMLoadUByteN2, XMLoadUByte2` + + `XMStoreByteN2, XMStoreByte2, XMStoreUByteN2, XMStoreUByte2` + + `XMINT2, XMUINT2, XMINT3, XMUINT3, XMINT4, XMUINT4` + + `XMLoadSInt2, XMLoadUInt2, XMLoadSInt3, XMLoadUInt3, XMLoadSInt4, XMLoadUInt4` + + `XMStoreSInt2, XMStoreUInt2, XMStoreSInt3, XMStoreUInt3, XMStoreSInt4, XMStoreUInt4` +* Marked most single-parameter C++ constructors with `explicit` keyword +* Corrected range issues with SSE implementations of `XMVectorFloor` and `XMVectorCeiling` + + +### June 2010 (XNAMath 2.03) +* Addition of ``XMVectorDivide`` to optimize SSE2 vector division operations +* Unified handling of floating-point specials between the Windows SSE2 and no-intrinsics implementations +* Use of Visual Studio style SAL annotations +* Modifications to the C++ declarations for `XMFLOAT2A/3A/4A/4X3A/4X4A` to better support these types in C++ templates + +### February 2010 (XNAMath 2.02) +* Fixes to `XMStoreColor`, `XMQuaternionRotationMatrix`, `XMVectorATan2`, and `XMVectorATan2Est` + +### August 2009 (XNAMath 2.01) +* Adds ``XM_STRICT_VECTOR4``. This opt-in directive disallows the usage of XboxMath-like member accessors such as .x, .y, and .z. This makes it easier to write portable XNA Math code. +* Added conversion support for the following Windows graphics formats: + + 16-bit color formats (565, 555X, 5551) + + 4-bits per channel color formats (4444) + + Unique Direct3D 10/11 formats (``DXGI_FORMAT_R9G9B9E5_SHAREDEXP`` and ``DXGI_FORMAT_R11G11B10_FLOAT``) + +### March 2009 (XNAMath 2.00) +* Initial release (based on the Xbox 360 Xbox math library) diff --git a/src/thirdparty/DirectXMath-dec2022/Inc/DirectXCollision.h b/src/thirdparty/DirectXMath-dec2022/Inc/DirectXCollision.h new file mode 100644 index 000000000..989c469f3 --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/Inc/DirectXCollision.h @@ -0,0 +1,359 @@ +//------------------------------------------------------------------------------------- +// DirectXCollision.h -- C++ Collision Math library +// +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
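A brief, hypothetical usage sketch for the collision types declared below (illustrative only, not part of the imported header; it assumes the DirectXMath Inc directory is on the include path and that the helper names are placeholders):

    #include "DirectXCollision.h"

    using namespace DirectX;

    // Classify a sphere against an axis-aligned box: DISJOINT, INTERSECTS, or CONTAINS.
    inline ContainmentType ClassifySphere(const BoundingBox& box, const BoundingSphere& sphere)
    {
        return box.Contains(sphere);
    }

    // Boolean overlap test between two spheres.
    inline bool SpheresOverlap(const BoundingSphere& a, const BoundingSphere& b)
    {
        return a.Intersects(b);
    }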
+// +// http://go.microsoft.com/fwlink/?LinkID=615560 +//------------------------------------------------------------------------------------- + +#pragma once + +#include "DirectXMath.h" + +namespace DirectX +{ + + enum ContainmentType + { + DISJOINT = 0, + INTERSECTS = 1, + CONTAINS = 2 + }; + + enum PlaneIntersectionType + { + FRONT = 0, + INTERSECTING = 1, + BACK = 2 + }; + + struct BoundingBox; + struct BoundingOrientedBox; + struct BoundingFrustum; + +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable:4324 4820) + // C4324: alignment padding warnings + // C4820: Off by default noise +#endif + + //------------------------------------------------------------------------------------- + // Bounding sphere + //------------------------------------------------------------------------------------- + struct BoundingSphere + { + XMFLOAT3 Center; // Center of the sphere. + float Radius; // Radius of the sphere. + + // Creators + BoundingSphere() noexcept : Center(0, 0, 0), Radius(1.f) {} + + BoundingSphere(const BoundingSphere&) = default; + BoundingSphere& operator=(const BoundingSphere&) = default; + + BoundingSphere(BoundingSphere&&) = default; + BoundingSphere& operator=(BoundingSphere&&) = default; + + constexpr BoundingSphere(_In_ const XMFLOAT3& center, _In_ float radius) noexcept + : Center(center), Radius(radius) {} + + // Methods + void XM_CALLCONV Transform(_Out_ BoundingSphere& Out, _In_ FXMMATRIX M) const noexcept; + void XM_CALLCONV Transform(_Out_ BoundingSphere& Out, _In_ float Scale, _In_ FXMVECTOR Rotation, _In_ FXMVECTOR Translation) const noexcept; + // Transform the sphere + + ContainmentType XM_CALLCONV Contains(_In_ FXMVECTOR Point) const noexcept; + ContainmentType XM_CALLCONV Contains(_In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2) const noexcept; + ContainmentType Contains(_In_ const BoundingSphere& sh) const noexcept; + ContainmentType Contains(_In_ const BoundingBox& box) const noexcept; + ContainmentType Contains(_In_ const BoundingOrientedBox& box) const noexcept; + ContainmentType Contains(_In_ const BoundingFrustum& fr) const noexcept; + + bool Intersects(_In_ const BoundingSphere& sh) const noexcept; + bool Intersects(_In_ const BoundingBox& box) const noexcept; + bool Intersects(_In_ const BoundingOrientedBox& box) const noexcept; + bool Intersects(_In_ const BoundingFrustum& fr) const noexcept; + + bool XM_CALLCONV Intersects(_In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2) const noexcept; + // Triangle-sphere test + + PlaneIntersectionType XM_CALLCONV Intersects(_In_ FXMVECTOR Plane) const noexcept; + // Plane-sphere test + + bool XM_CALLCONV Intersects(_In_ FXMVECTOR Origin, _In_ FXMVECTOR Direction, _Out_ float& Dist) const noexcept; + // Ray-sphere test + + ContainmentType XM_CALLCONV ContainedBy(_In_ FXMVECTOR Plane0, _In_ FXMVECTOR Plane1, _In_ FXMVECTOR Plane2, + _In_ GXMVECTOR Plane3, _In_ HXMVECTOR Plane4, _In_ HXMVECTOR Plane5) const noexcept; + // Test sphere against six planes (see BoundingFrustum::GetPlanes) + + // Static methods + static void CreateMerged(_Out_ BoundingSphere& Out, _In_ const BoundingSphere& S1, _In_ const BoundingSphere& S2) noexcept; + + static void CreateFromBoundingBox(_Out_ BoundingSphere& Out, _In_ const BoundingBox& box) noexcept; + static void CreateFromBoundingBox(_Out_ BoundingSphere& Out, _In_ const BoundingOrientedBox& box) noexcept; + + static void CreateFromPoints(_Out_ BoundingSphere& Out, _In_ size_t Count, + _In_reads_bytes_(sizeof(XMFLOAT3) + Stride * (Count - 1)) const XMFLOAT3* 
pPoints, _In_ size_t Stride) noexcept; + + static void CreateFromFrustum(_Out_ BoundingSphere& Out, _In_ const BoundingFrustum& fr) noexcept; + }; + + //------------------------------------------------------------------------------------- + // Axis-aligned bounding box + //------------------------------------------------------------------------------------- + struct BoundingBox + { + static constexpr size_t CORNER_COUNT = 8; + + XMFLOAT3 Center; // Center of the box. + XMFLOAT3 Extents; // Distance from the center to each side. + + // Creators + BoundingBox() noexcept : Center(0, 0, 0), Extents(1.f, 1.f, 1.f) {} + + BoundingBox(const BoundingBox&) = default; + BoundingBox& operator=(const BoundingBox&) = default; + + BoundingBox(BoundingBox&&) = default; + BoundingBox& operator=(BoundingBox&&) = default; + + constexpr BoundingBox(_In_ const XMFLOAT3& center, _In_ const XMFLOAT3& extents) noexcept + : Center(center), Extents(extents) {} + + // Methods + void XM_CALLCONV Transform(_Out_ BoundingBox& Out, _In_ FXMMATRIX M) const noexcept; + void XM_CALLCONV Transform(_Out_ BoundingBox& Out, _In_ float Scale, _In_ FXMVECTOR Rotation, _In_ FXMVECTOR Translation) const noexcept; + + void GetCorners(_Out_writes_(8) XMFLOAT3* Corners) const noexcept; + // Gets the 8 corners of the box + + ContainmentType XM_CALLCONV Contains(_In_ FXMVECTOR Point) const noexcept; + ContainmentType XM_CALLCONV Contains(_In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2) const noexcept; + ContainmentType Contains(_In_ const BoundingSphere& sh) const noexcept; + ContainmentType Contains(_In_ const BoundingBox& box) const noexcept; + ContainmentType Contains(_In_ const BoundingOrientedBox& box) const noexcept; + ContainmentType Contains(_In_ const BoundingFrustum& fr) const noexcept; + + bool Intersects(_In_ const BoundingSphere& sh) const noexcept; + bool Intersects(_In_ const BoundingBox& box) const noexcept; + bool Intersects(_In_ const BoundingOrientedBox& box) const noexcept; + bool Intersects(_In_ const BoundingFrustum& fr) const noexcept; + + bool XM_CALLCONV Intersects(_In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2) const noexcept; + // Triangle-Box test + + PlaneIntersectionType XM_CALLCONV Intersects(_In_ FXMVECTOR Plane) const noexcept; + // Plane-box test + + bool XM_CALLCONV Intersects(_In_ FXMVECTOR Origin, _In_ FXMVECTOR Direction, _Out_ float& Dist) const noexcept; + // Ray-Box test + + ContainmentType XM_CALLCONV ContainedBy(_In_ FXMVECTOR Plane0, _In_ FXMVECTOR Plane1, _In_ FXMVECTOR Plane2, + _In_ GXMVECTOR Plane3, _In_ HXMVECTOR Plane4, _In_ HXMVECTOR Plane5) const noexcept; + // Test box against six planes (see BoundingFrustum::GetPlanes) + + // Static methods + static void CreateMerged(_Out_ BoundingBox& Out, _In_ const BoundingBox& b1, _In_ const BoundingBox& b2) noexcept; + + static void CreateFromSphere(_Out_ BoundingBox& Out, _In_ const BoundingSphere& sh) noexcept; + + static void XM_CALLCONV CreateFromPoints(_Out_ BoundingBox& Out, _In_ FXMVECTOR pt1, _In_ FXMVECTOR pt2) noexcept; + static void CreateFromPoints(_Out_ BoundingBox& Out, _In_ size_t Count, + _In_reads_bytes_(sizeof(XMFLOAT3) + Stride * (Count - 1)) const XMFLOAT3* pPoints, _In_ size_t Stride) noexcept; + }; + + //------------------------------------------------------------------------------------- + // Oriented bounding box + //------------------------------------------------------------------------------------- + struct BoundingOrientedBox + { + static constexpr size_t CORNER_COUNT = 8; + + XMFLOAT3 Center; // 
Center of the box. + XMFLOAT3 Extents; // Distance from the center to each side. + XMFLOAT4 Orientation; // Unit quaternion representing rotation (box -> world). + + // Creators + BoundingOrientedBox() noexcept : Center(0, 0, 0), Extents(1.f, 1.f, 1.f), Orientation(0, 0, 0, 1.f) {} + + BoundingOrientedBox(const BoundingOrientedBox&) = default; + BoundingOrientedBox& operator=(const BoundingOrientedBox&) = default; + + BoundingOrientedBox(BoundingOrientedBox&&) = default; + BoundingOrientedBox& operator=(BoundingOrientedBox&&) = default; + + constexpr BoundingOrientedBox(_In_ const XMFLOAT3& center, _In_ const XMFLOAT3& extents, _In_ const XMFLOAT4& orientation) noexcept + : Center(center), Extents(extents), Orientation(orientation) {} + + // Methods + void XM_CALLCONV Transform(_Out_ BoundingOrientedBox& Out, _In_ FXMMATRIX M) const noexcept; + void XM_CALLCONV Transform(_Out_ BoundingOrientedBox& Out, _In_ float Scale, _In_ FXMVECTOR Rotation, _In_ FXMVECTOR Translation) const noexcept; + + void GetCorners(_Out_writes_(8) XMFLOAT3* Corners) const noexcept; + // Gets the 8 corners of the box + + ContainmentType XM_CALLCONV Contains(_In_ FXMVECTOR Point) const noexcept; + ContainmentType XM_CALLCONV Contains(_In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2) const noexcept; + ContainmentType Contains(_In_ const BoundingSphere& sh) const noexcept; + ContainmentType Contains(_In_ const BoundingBox& box) const noexcept; + ContainmentType Contains(_In_ const BoundingOrientedBox& box) const noexcept; + ContainmentType Contains(_In_ const BoundingFrustum& fr) const noexcept; + + bool Intersects(_In_ const BoundingSphere& sh) const noexcept; + bool Intersects(_In_ const BoundingBox& box) const noexcept; + bool Intersects(_In_ const BoundingOrientedBox& box) const noexcept; + bool Intersects(_In_ const BoundingFrustum& fr) const noexcept; + + bool XM_CALLCONV Intersects(_In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2) const noexcept; + // Triangle-OrientedBox test + + PlaneIntersectionType XM_CALLCONV Intersects(_In_ FXMVECTOR Plane) const noexcept; + // Plane-OrientedBox test + + bool XM_CALLCONV Intersects(_In_ FXMVECTOR Origin, _In_ FXMVECTOR Direction, _Out_ float& Dist) const noexcept; + // Ray-OrientedBox test + + ContainmentType XM_CALLCONV ContainedBy(_In_ FXMVECTOR Plane0, _In_ FXMVECTOR Plane1, _In_ FXMVECTOR Plane2, + _In_ GXMVECTOR Plane3, _In_ HXMVECTOR Plane4, _In_ HXMVECTOR Plane5) const noexcept; + // Test OrientedBox against six planes (see BoundingFrustum::GetPlanes) + + // Static methods + static void CreateFromBoundingBox(_Out_ BoundingOrientedBox& Out, _In_ const BoundingBox& box) noexcept; + + static void CreateFromPoints(_Out_ BoundingOrientedBox& Out, _In_ size_t Count, + _In_reads_bytes_(sizeof(XMFLOAT3) + Stride * (Count - 1)) const XMFLOAT3* pPoints, _In_ size_t Stride) noexcept; + }; + + //------------------------------------------------------------------------------------- + // Bounding frustum + //------------------------------------------------------------------------------------- + struct BoundingFrustum + { + static constexpr size_t CORNER_COUNT = 8; + + XMFLOAT3 Origin; // Origin of the frustum (and projection). + XMFLOAT4 Orientation; // Quaternion representing rotation. + + float RightSlope; // Positive X (X/Z) + float LeftSlope; // Negative X + float TopSlope; // Positive Y (Y/Z) + float BottomSlope; // Negative Y + float Near, Far; // Z of the near plane and far plane. 
+ + // Creators + BoundingFrustum() noexcept : + Origin(0, 0, 0), Orientation(0, 0, 0, 1.f), RightSlope(1.f), LeftSlope(-1.f), + TopSlope(1.f), BottomSlope(-1.f), Near(0), Far(1.f) {} + + BoundingFrustum(const BoundingFrustum&) = default; + BoundingFrustum& operator=(const BoundingFrustum&) = default; + + BoundingFrustum(BoundingFrustum&&) = default; + BoundingFrustum& operator=(BoundingFrustum&&) = default; + + constexpr BoundingFrustum(_In_ const XMFLOAT3& origin, _In_ const XMFLOAT4& orientation, + _In_ float rightSlope, _In_ float leftSlope, _In_ float topSlope, _In_ float bottomSlope, + _In_ float nearPlane, _In_ float farPlane) noexcept + : Origin(origin), Orientation(orientation), + RightSlope(rightSlope), LeftSlope(leftSlope), TopSlope(topSlope), BottomSlope(bottomSlope), + Near(nearPlane), Far(farPlane) {} + BoundingFrustum(_In_ CXMMATRIX Projection, bool rhcoords = false) noexcept; + + // Methods + void XM_CALLCONV Transform(_Out_ BoundingFrustum& Out, _In_ FXMMATRIX M) const noexcept; + void XM_CALLCONV Transform(_Out_ BoundingFrustum& Out, _In_ float Scale, _In_ FXMVECTOR Rotation, _In_ FXMVECTOR Translation) const noexcept; + + void GetCorners(_Out_writes_(8) XMFLOAT3* Corners) const noexcept; + // Gets the 8 corners of the frustum + + ContainmentType XM_CALLCONV Contains(_In_ FXMVECTOR Point) const noexcept; + ContainmentType XM_CALLCONV Contains(_In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2) const noexcept; + ContainmentType Contains(_In_ const BoundingSphere& sp) const noexcept; + ContainmentType Contains(_In_ const BoundingBox& box) const noexcept; + ContainmentType Contains(_In_ const BoundingOrientedBox& box) const noexcept; + ContainmentType Contains(_In_ const BoundingFrustum& fr) const noexcept; + // Frustum-Frustum test + + bool Intersects(_In_ const BoundingSphere& sh) const noexcept; + bool Intersects(_In_ const BoundingBox& box) const noexcept; + bool Intersects(_In_ const BoundingOrientedBox& box) const noexcept; + bool Intersects(_In_ const BoundingFrustum& fr) const noexcept; + + bool XM_CALLCONV Intersects(_In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2) const noexcept; + // Triangle-Frustum test + + PlaneIntersectionType XM_CALLCONV Intersects(_In_ FXMVECTOR Plane) const noexcept; + // Plane-Frustum test + + bool XM_CALLCONV Intersects(_In_ FXMVECTOR rayOrigin, _In_ FXMVECTOR Direction, _Out_ float& Dist) const noexcept; + // Ray-Frustum test + + ContainmentType XM_CALLCONV ContainedBy(_In_ FXMVECTOR Plane0, _In_ FXMVECTOR Plane1, _In_ FXMVECTOR Plane2, + _In_ GXMVECTOR Plane3, _In_ HXMVECTOR Plane4, _In_ HXMVECTOR Plane5) const noexcept; + // Test frustum against six planes (see BoundingFrustum::GetPlanes) + + void GetPlanes(_Out_opt_ XMVECTOR* NearPlane, _Out_opt_ XMVECTOR* FarPlane, _Out_opt_ XMVECTOR* RightPlane, + _Out_opt_ XMVECTOR* LeftPlane, _Out_opt_ XMVECTOR* TopPlane, _Out_opt_ XMVECTOR* BottomPlane) const noexcept; + // Create 6 Planes representation of Frustum + + // Static methods + static void XM_CALLCONV CreateFromMatrix(_Out_ BoundingFrustum& Out, _In_ FXMMATRIX Projection, bool rhcoords = false) noexcept; + }; + + //----------------------------------------------------------------------------- + // Triangle intersection testing routines. 
+ //----------------------------------------------------------------------------- + namespace TriangleTests + { + bool XM_CALLCONV Intersects(_In_ FXMVECTOR Origin, _In_ FXMVECTOR Direction, _In_ FXMVECTOR V0, _In_ GXMVECTOR V1, _In_ HXMVECTOR V2, _Out_ float& Dist) noexcept; + // Ray-Triangle + + bool XM_CALLCONV Intersects(_In_ FXMVECTOR A0, _In_ FXMVECTOR A1, _In_ FXMVECTOR A2, _In_ GXMVECTOR B0, _In_ HXMVECTOR B1, _In_ HXMVECTOR B2) noexcept; + // Triangle-Triangle + + PlaneIntersectionType XM_CALLCONV Intersects(_In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2, _In_ GXMVECTOR Plane) noexcept; + // Plane-Triangle + + ContainmentType XM_CALLCONV ContainedBy(_In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2, + _In_ GXMVECTOR Plane0, _In_ HXMVECTOR Plane1, _In_ HXMVECTOR Plane2, + _In_ CXMVECTOR Plane3, _In_ CXMVECTOR Plane4, _In_ CXMVECTOR Plane5) noexcept; + // Test a triangle against six planes at once (see BoundingFrustum::GetPlanes) + } + +#ifdef _MSC_VER +#pragma warning(pop) +#endif + + /**************************************************************************** + * + * Implementation + * + ****************************************************************************/ + +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4068 4365 4616 6001) + // C4068/4616: ignore unknown pragmas + // C4365: Off by default noise + // C6001: False positives +#endif +#ifdef _PREFAST_ +#pragma prefast(push) +#pragma prefast(disable : 25000, "FXMVECTOR is 16 bytes") +#pragma prefast(disable : 26495, "Union initialization confuses /analyze") +#endif + +#include "DirectXCollision.inl" + +#ifdef _PREFAST_ +#pragma prefast(pop) +#endif +#ifdef _MSC_VER +#pragma warning(pop) +#endif + +} // namespace DirectX + diff --git a/src/thirdparty/DirectXMath-dec2022/Inc/DirectXCollision.inl b/src/thirdparty/DirectXMath-dec2022/Inc/DirectXCollision.inl new file mode 100644 index 000000000..51d37926a --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/Inc/DirectXCollision.inl @@ -0,0 +1,4816 @@ +//------------------------------------------------------------------------------------- +// DirectXCollision.inl -- C++ Collision Math library +// +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. +// +// http://go.microsoft.com/fwlink/?LinkID=615560 +//------------------------------------------------------------------------------------- + +#pragma once + +XMGLOBALCONST XMVECTORF32 g_BoxOffset[8] = +{ + { { { -1.0f, -1.0f, 1.0f, 0.0f } } }, + { { { 1.0f, -1.0f, 1.0f, 0.0f } } }, + { { { 1.0f, 1.0f, 1.0f, 0.0f } } }, + { { { -1.0f, 1.0f, 1.0f, 0.0f } } }, + { { { -1.0f, -1.0f, -1.0f, 0.0f } } }, + { { { 1.0f, -1.0f, -1.0f, 0.0f } } }, + { { { 1.0f, 1.0f, -1.0f, 0.0f } } }, + { { { -1.0f, 1.0f, -1.0f, 0.0f } } }, +}; + +XMGLOBALCONST XMVECTORF32 g_RayEpsilon = { { { 1e-20f, 1e-20f, 1e-20f, 1e-20f } } }; +XMGLOBALCONST XMVECTORF32 g_RayNegEpsilon = { { { -1e-20f, -1e-20f, -1e-20f, -1e-20f } } }; +XMGLOBALCONST XMVECTORF32 g_FltMin = { { { -FLT_MAX, -FLT_MAX, -FLT_MAX, -FLT_MAX } } }; +XMGLOBALCONST XMVECTORF32 g_FltMax = { { { FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX } } }; + +namespace Internal +{ + + //----------------------------------------------------------------------------- + // Return true if any of the elements of a 3 vector are equal to 0xffffffff. + // Slightly more efficient than using XMVector3EqualInt. 
+    //-----------------------------------------------------------------------------
+    inline bool XMVector3AnyTrue(_In_ FXMVECTOR V) noexcept
+    {
+        // Duplicate the fourth element from the first element.
+        XMVECTOR C = XMVectorSwizzle<XM_SWIZZLE_X, XM_SWIZZLE_Y, XM_SWIZZLE_Z, XM_SWIZZLE_X>(V);
+
+        return XMComparisonAnyTrue(XMVector4EqualIntR(C, XMVectorTrueInt()));
+    }
+
+
+    //-----------------------------------------------------------------------------
+    // Return true if all of the elements of a 3 vector are equal to 0xffffffff.
+    // Slightly more efficient than using XMVector3EqualInt.
+    //-----------------------------------------------------------------------------
+    inline bool XMVector3AllTrue(_In_ FXMVECTOR V) noexcept
+    {
+        // Duplicate the fourth element from the first element.
+        XMVECTOR C = XMVectorSwizzle<XM_SWIZZLE_X, XM_SWIZZLE_Y, XM_SWIZZLE_Z, XM_SWIZZLE_X>(V);
+
+        return XMComparisonAllTrue(XMVector4EqualIntR(C, XMVectorTrueInt()));
+    }
+
+#if defined(_PREFAST_) || !defined(NDEBUG)
+
+    XMGLOBALCONST XMVECTORF32 g_UnitVectorEpsilon = { { { 1.0e-4f, 1.0e-4f, 1.0e-4f, 1.0e-4f } } };
+    XMGLOBALCONST XMVECTORF32 g_UnitQuaternionEpsilon = { { { 1.0e-4f, 1.0e-4f, 1.0e-4f, 1.0e-4f } } };
+    XMGLOBALCONST XMVECTORF32 g_UnitPlaneEpsilon = { { { 1.0e-4f, 1.0e-4f, 1.0e-4f, 1.0e-4f } } };
+
+    //-----------------------------------------------------------------------------
+    // Return true if the vector is a unit vector (length == 1).
+    //-----------------------------------------------------------------------------
+    inline bool XMVector3IsUnit(_In_ FXMVECTOR V) noexcept
+    {
+        XMVECTOR Difference = XMVectorSubtract(XMVector3Length(V), XMVectorSplatOne());
+        return XMVector4Less(XMVectorAbs(Difference), g_UnitVectorEpsilon);
+    }
+
+    //-----------------------------------------------------------------------------
+    // Return true if the quaterion is a unit quaternion.
+    //-----------------------------------------------------------------------------
+    inline bool XMQuaternionIsUnit(_In_ FXMVECTOR Q) noexcept
+    {
+        XMVECTOR Difference = XMVectorSubtract(XMVector4Length(Q), XMVectorSplatOne());
+        return XMVector4Less(XMVectorAbs(Difference), g_UnitQuaternionEpsilon);
+    }
+
+    //-----------------------------------------------------------------------------
+    // Return true if the plane is a unit plane.
+    //-----------------------------------------------------------------------------
+    inline bool XMPlaneIsUnit(_In_ FXMVECTOR Plane) noexcept
+    {
+        XMVECTOR Difference = XMVectorSubtract(XMVector3Length(Plane), XMVectorSplatOne());
+        return XMVector4Less(XMVectorAbs(Difference), g_UnitPlaneEpsilon);
+    }
+
+#endif // _PREFAST_ || !NDEBUG
+
+    //-----------------------------------------------------------------------------
+    inline XMVECTOR XMPlaneTransform(_In_ FXMVECTOR Plane, _In_ FXMVECTOR Rotation, _In_ FXMVECTOR Translation) noexcept
+    {
+        XMVECTOR vNormal = XMVector3Rotate(Plane, Rotation);
+        XMVECTOR vD = XMVectorSubtract(XMVectorSplatW(Plane), XMVector3Dot(vNormal, Translation));
+
+        return XMVectorInsert<0, 0, 0, 0, 1>(vNormal, vD);
+    }
+
+    //-----------------------------------------------------------------------------
+    // Return the point on the line segement (S1, S2) nearest the point P.
+ //----------------------------------------------------------------------------- + inline XMVECTOR PointOnLineSegmentNearestPoint(_In_ FXMVECTOR S1, _In_ FXMVECTOR S2, _In_ FXMVECTOR P) noexcept + { + XMVECTOR Dir = XMVectorSubtract(S2, S1); + XMVECTOR Projection = XMVectorSubtract(XMVector3Dot(P, Dir), XMVector3Dot(S1, Dir)); + XMVECTOR LengthSq = XMVector3Dot(Dir, Dir); + + XMVECTOR t = XMVectorMultiply(Projection, XMVectorReciprocal(LengthSq)); + XMVECTOR Point = XMVectorMultiplyAdd(t, Dir, S1); + + // t < 0 + XMVECTOR SelectS1 = XMVectorLess(Projection, XMVectorZero()); + Point = XMVectorSelect(Point, S1, SelectS1); + + // t > 1 + XMVECTOR SelectS2 = XMVectorGreater(Projection, LengthSq); + Point = XMVectorSelect(Point, S2, SelectS2); + + return Point; + } + + //----------------------------------------------------------------------------- + // Test if the point (P) on the plane of the triangle is inside the triangle + // (V0, V1, V2). + //----------------------------------------------------------------------------- + inline XMVECTOR XM_CALLCONV PointOnPlaneInsideTriangle(_In_ FXMVECTOR P, _In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ GXMVECTOR V2) noexcept + { + // Compute the triangle normal. + XMVECTOR N = XMVector3Cross(XMVectorSubtract(V2, V0), XMVectorSubtract(V1, V0)); + + // Compute the cross products of the vector from the base of each edge to + // the point with each edge vector. + XMVECTOR C0 = XMVector3Cross(XMVectorSubtract(P, V0), XMVectorSubtract(V1, V0)); + XMVECTOR C1 = XMVector3Cross(XMVectorSubtract(P, V1), XMVectorSubtract(V2, V1)); + XMVECTOR C2 = XMVector3Cross(XMVectorSubtract(P, V2), XMVectorSubtract(V0, V2)); + + // If the cross product points in the same direction as the normal the the + // point is inside the edge (it is zero if is on the edge). + XMVECTOR Zero = XMVectorZero(); + XMVECTOR Inside0 = XMVectorGreaterOrEqual(XMVector3Dot(C0, N), Zero); + XMVECTOR Inside1 = XMVectorGreaterOrEqual(XMVector3Dot(C1, N), Zero); + XMVECTOR Inside2 = XMVectorGreaterOrEqual(XMVector3Dot(C2, N), Zero); + + // If the point inside all of the edges it is inside. 
+        return XMVectorAndInt(XMVectorAndInt(Inside0, Inside1), Inside2);
+    }
+
+    //-----------------------------------------------------------------------------
+    inline bool SolveCubic(_In_ float e, _In_ float f, _In_ float g, _Out_ float* t, _Out_ float* u, _Out_ float* v) noexcept
+    {
+        float p, q, h, rc, d, theta, costh3, sinth3;
+
+        p = f - e * e / 3.0f;
+        q = g - e * f / 3.0f + e * e * e * 2.0f / 27.0f;
+        h = q * q / 4.0f + p * p * p / 27.0f;
+
+        if (h > 0)
+        {
+            *t = *u = *v = 0.f;
+            return false; // only one real root
+        }
+
+        if ((h == 0) && (q == 0)) // all the same root
+        {
+            *t = -e / 3;
+            *u = -e / 3;
+            *v = -e / 3;
+
+            return true;
+        }
+
+        d = sqrtf(q * q / 4.0f - h);
+        if (d < 0)
+            rc = -powf(-d, 1.0f / 3.0f);
+        else
+            rc = powf(d, 1.0f / 3.0f);
+
+        theta = XMScalarACos(-q / (2.0f * d));
+        costh3 = XMScalarCos(theta / 3.0f);
+        sinth3 = sqrtf(3.0f) * XMScalarSin(theta / 3.0f);
+        *t = 2.0f * rc * costh3 - e / 3.0f;
+        *u = -rc * (costh3 + sinth3) - e / 3.0f;
+        *v = -rc * (costh3 - sinth3) - e / 3.0f;
+
+        return true;
+    }
+
+    //-----------------------------------------------------------------------------
+    inline XMVECTOR CalculateEigenVector(_In_ float m11, _In_ float m12, _In_ float m13,
+        _In_ float m22, _In_ float m23, _In_ float m33, _In_ float e) noexcept
+    {
+        float fTmp[3];
+        fTmp[0] = m12 * m23 - m13 * (m22 - e);
+        fTmp[1] = m13 * m12 - m23 * (m11 - e);
+        fTmp[2] = (m11 - e) * (m22 - e) - m12 * m12;
+
+        XMVECTOR vTmp = XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(fTmp));
+
+        if (XMVector3Equal(vTmp, XMVectorZero())) // planar or linear
+        {
+            float f1, f2, f3;
+
+            // we only have one equation - find a valid one
+            if ((m11 - e != 0) || (m12 != 0) || (m13 != 0))
+            {
+                f1 = m11 - e; f2 = m12; f3 = m13;
+            }
+            else if ((m12 != 0) || (m22 - e != 0) || (m23 != 0))
+            {
+                f1 = m12; f2 = m22 - e; f3 = m23;
+            }
+            else if ((m13 != 0) || (m23 != 0) || (m33 - e != 0))
+            {
+                f1 = m13; f2 = m23; f3 = m33 - e;
+            }
+            else
+            {
+                // error, we'll just make something up - we have NO context
+                f1 = 1.0f; f2 = 0.0f; f3 = 0.0f;
+            }
+
+            if (f1 == 0)
+                vTmp = XMVectorSetX(vTmp, 0.0f);
+            else
+                vTmp = XMVectorSetX(vTmp, 1.0f);
+
+            if (f2 == 0)
+                vTmp = XMVectorSetY(vTmp, 0.0f);
+            else
+                vTmp = XMVectorSetY(vTmp, 1.0f);
+
+            if (f3 == 0)
+            {
+                vTmp = XMVectorSetZ(vTmp, 0.0f);
+                // recalculate y to make equation work
+                if (m12 != 0)
+                    vTmp = XMVectorSetY(vTmp, -f1 / f2);
+            }
+            else
+            {
+                vTmp = XMVectorSetZ(vTmp, (f2 - f1) / f3);
+            }
+        }
+
+        if (XMVectorGetX(XMVector3LengthSq(vTmp)) > 1e-5f)
+        {
+            return XMVector3Normalize(vTmp);
+        }
+        else
+        {
+            // Multiply by a value large enough to make the vector non-zero.
+ vTmp = XMVectorScale(vTmp, 1e5f); + return XMVector3Normalize(vTmp); + } + } + + //----------------------------------------------------------------------------- + inline bool CalculateEigenVectors(_In_ float m11, _In_ float m12, _In_ float m13, + _In_ float m22, _In_ float m23, _In_ float m33, + _In_ float e1, _In_ float e2, _In_ float e3, + _Out_ XMVECTOR* pV1, _Out_ XMVECTOR* pV2, _Out_ XMVECTOR* pV3) noexcept + { + *pV1 = DirectX::Internal::CalculateEigenVector(m11, m12, m13, m22, m23, m33, e1); + *pV2 = DirectX::Internal::CalculateEigenVector(m11, m12, m13, m22, m23, m33, e2); + *pV3 = DirectX::Internal::CalculateEigenVector(m11, m12, m13, m22, m23, m33, e3); + + bool v1z = false; + bool v2z = false; + bool v3z = false; + + XMVECTOR Zero = XMVectorZero(); + + if (XMVector3Equal(*pV1, Zero)) + v1z = true; + + if (XMVector3Equal(*pV2, Zero)) + v2z = true; + + if (XMVector3Equal(*pV3, Zero)) + v3z = true; + + bool e12 = (fabsf(XMVectorGetX(XMVector3Dot(*pV1, *pV2))) > 0.1f); // check for non-orthogonal vectors + bool e13 = (fabsf(XMVectorGetX(XMVector3Dot(*pV1, *pV3))) > 0.1f); + bool e23 = (fabsf(XMVectorGetX(XMVector3Dot(*pV2, *pV3))) > 0.1f); + + if ((v1z && v2z && v3z) || (e12 && e13 && e23) || + (e12 && v3z) || (e13 && v2z) || (e23 && v1z)) // all eigenvectors are 0- any basis set + { + *pV1 = g_XMIdentityR0.v; + *pV2 = g_XMIdentityR1.v; + *pV3 = g_XMIdentityR2.v; + return true; + } + + if (v1z && v2z) + { + XMVECTOR vTmp = XMVector3Cross(g_XMIdentityR1, *pV3); + if (XMVectorGetX(XMVector3LengthSq(vTmp)) < 1e-5f) + { + vTmp = XMVector3Cross(g_XMIdentityR0, *pV3); + } + *pV1 = XMVector3Normalize(vTmp); + *pV2 = XMVector3Cross(*pV3, *pV1); + return true; + } + + if (v3z && v1z) + { + XMVECTOR vTmp = XMVector3Cross(g_XMIdentityR1, *pV2); + if (XMVectorGetX(XMVector3LengthSq(vTmp)) < 1e-5f) + { + vTmp = XMVector3Cross(g_XMIdentityR0, *pV2); + } + *pV3 = XMVector3Normalize(vTmp); + *pV1 = XMVector3Cross(*pV2, *pV3); + return true; + } + + if (v2z && v3z) + { + XMVECTOR vTmp = XMVector3Cross(g_XMIdentityR1, *pV1); + if (XMVectorGetX(XMVector3LengthSq(vTmp)) < 1e-5f) + { + vTmp = XMVector3Cross(g_XMIdentityR0, *pV1); + } + *pV2 = XMVector3Normalize(vTmp); + *pV3 = XMVector3Cross(*pV1, *pV2); + return true; + } + + if ((v1z) || e12) + { + *pV1 = XMVector3Cross(*pV2, *pV3); + return true; + } + + if ((v2z) || e23) + { + *pV2 = XMVector3Cross(*pV3, *pV1); + return true; + } + + if ((v3z) || e13) + { + *pV3 = XMVector3Cross(*pV1, *pV2); + return true; + } + + return true; + } + + //----------------------------------------------------------------------------- + inline bool CalculateEigenVectorsFromCovarianceMatrix(_In_ float Cxx, _In_ float Cyy, _In_ float Czz, + _In_ float Cxy, _In_ float Cxz, _In_ float Cyz, + _Out_ XMVECTOR* pV1, _Out_ XMVECTOR* pV2, _Out_ XMVECTOR* pV3) noexcept + { + // Calculate the eigenvalues by solving a cubic equation. 
+ float e = -(Cxx + Cyy + Czz); + float f = Cxx * Cyy + Cyy * Czz + Czz * Cxx - Cxy * Cxy - Cxz * Cxz - Cyz * Cyz; + float g = Cxy * Cxy * Czz + Cxz * Cxz * Cyy + Cyz * Cyz * Cxx - Cxy * Cyz * Cxz * 2.0f - Cxx * Cyy * Czz; + + float ev1, ev2, ev3; + if (!DirectX::Internal::SolveCubic(e, f, g, &ev1, &ev2, &ev3)) + { + // set them to arbitrary orthonormal basis set + *pV1 = g_XMIdentityR0.v; + *pV2 = g_XMIdentityR1.v; + *pV3 = g_XMIdentityR2.v; + return false; + } + + return DirectX::Internal::CalculateEigenVectors(Cxx, Cxy, Cxz, Cyy, Cyz, Czz, ev1, ev2, ev3, pV1, pV2, pV3); + } + + //----------------------------------------------------------------------------- + inline void XM_CALLCONV FastIntersectTrianglePlane( + FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2, + GXMVECTOR Plane, + XMVECTOR& Outside, XMVECTOR& Inside) noexcept + { + // Plane0 + XMVECTOR Dist0 = XMVector4Dot(V0, Plane); + XMVECTOR Dist1 = XMVector4Dot(V1, Plane); + XMVECTOR Dist2 = XMVector4Dot(V2, Plane); + + XMVECTOR MinDist = XMVectorMin(Dist0, Dist1); + MinDist = XMVectorMin(MinDist, Dist2); + + XMVECTOR MaxDist = XMVectorMax(Dist0, Dist1); + MaxDist = XMVectorMax(MaxDist, Dist2); + + XMVECTOR Zero = XMVectorZero(); + + // Outside the plane? + Outside = XMVectorGreater(MinDist, Zero); + + // Fully inside the plane? + Inside = XMVectorLess(MaxDist, Zero); + } + + //----------------------------------------------------------------------------- + inline void FastIntersectSpherePlane(_In_ FXMVECTOR Center, _In_ FXMVECTOR Radius, _In_ FXMVECTOR Plane, + _Out_ XMVECTOR& Outside, _Out_ XMVECTOR& Inside) noexcept + { + XMVECTOR Dist = XMVector4Dot(Center, Plane); + + // Outside the plane? + Outside = XMVectorGreater(Dist, Radius); + + // Fully inside the plane? + Inside = XMVectorLess(Dist, XMVectorNegate(Radius)); + } + + //----------------------------------------------------------------------------- + inline void FastIntersectAxisAlignedBoxPlane(_In_ FXMVECTOR Center, _In_ FXMVECTOR Extents, _In_ FXMVECTOR Plane, + _Out_ XMVECTOR& Outside, _Out_ XMVECTOR& Inside) noexcept + { + // Compute the distance to the center of the box. + XMVECTOR Dist = XMVector4Dot(Center, Plane); + + // Project the axes of the box onto the normal of the plane. Half the + // length of the projection (sometime called the "radius") is equal to + // h(u) * abs(n dot b(u))) + h(v) * abs(n dot b(v)) + h(w) * abs(n dot b(w)) + // where h(i) are extents of the box, n is the plane normal, and b(i) are the + // axes of the box. In this case b(i) = [(1,0,0), (0,1,0), (0,0,1)]. + XMVECTOR Radius = XMVector3Dot(Extents, XMVectorAbs(Plane)); + + // Outside the plane? + Outside = XMVectorGreater(Dist, Radius); + + // Fully inside the plane? + Inside = XMVectorLess(Dist, XMVectorNegate(Radius)); + } + + //----------------------------------------------------------------------------- + inline void XM_CALLCONV FastIntersectOrientedBoxPlane( + _In_ FXMVECTOR Center, _In_ FXMVECTOR Extents, _In_ FXMVECTOR Axis0, + _In_ GXMVECTOR Axis1, + _In_ HXMVECTOR Axis2, _In_ HXMVECTOR Plane, + _Out_ XMVECTOR& Outside, _Out_ XMVECTOR& Inside) noexcept + { + // Compute the distance to the center of the box. + XMVECTOR Dist = XMVector4Dot(Center, Plane); + + // Project the axes of the box onto the normal of the plane. Half the + // length of the projection (sometime called the "radius") is equal to + // h(u) * abs(n dot b(u))) + h(v) * abs(n dot b(v)) + h(w) * abs(n dot b(w)) + // where h(i) are extents of the box, n is the plane normal, and b(i) are the + // axes of the box. 
+ XMVECTOR Radius = XMVector3Dot(Plane, Axis0); + Radius = XMVectorInsert<0, 0, 1, 0, 0>(Radius, XMVector3Dot(Plane, Axis1)); + Radius = XMVectorInsert<0, 0, 0, 1, 0>(Radius, XMVector3Dot(Plane, Axis2)); + Radius = XMVector3Dot(Extents, XMVectorAbs(Radius)); + + // Outside the plane? + Outside = XMVectorGreater(Dist, Radius); + + // Fully inside the plane? + Inside = XMVectorLess(Dist, XMVectorNegate(Radius)); + } + + //----------------------------------------------------------------------------- + inline void XM_CALLCONV FastIntersectFrustumPlane( + _In_ FXMVECTOR Point0, _In_ FXMVECTOR Point1, _In_ FXMVECTOR Point2, + _In_ GXMVECTOR Point3, + _In_ HXMVECTOR Point4, _In_ HXMVECTOR Point5, + _In_ CXMVECTOR Point6, _In_ CXMVECTOR Point7, _In_ CXMVECTOR Plane, + _Out_ XMVECTOR& Outside, _Out_ XMVECTOR& Inside) noexcept + { + // Find the min/max projection of the frustum onto the plane normal. + XMVECTOR Min, Max, Dist; + + Min = Max = XMVector3Dot(Plane, Point0); + + Dist = XMVector3Dot(Plane, Point1); + Min = XMVectorMin(Min, Dist); + Max = XMVectorMax(Max, Dist); + + Dist = XMVector3Dot(Plane, Point2); + Min = XMVectorMin(Min, Dist); + Max = XMVectorMax(Max, Dist); + + Dist = XMVector3Dot(Plane, Point3); + Min = XMVectorMin(Min, Dist); + Max = XMVectorMax(Max, Dist); + + Dist = XMVector3Dot(Plane, Point4); + Min = XMVectorMin(Min, Dist); + Max = XMVectorMax(Max, Dist); + + Dist = XMVector3Dot(Plane, Point5); + Min = XMVectorMin(Min, Dist); + Max = XMVectorMax(Max, Dist); + + Dist = XMVector3Dot(Plane, Point6); + Min = XMVectorMin(Min, Dist); + Max = XMVectorMax(Max, Dist); + + Dist = XMVector3Dot(Plane, Point7); + Min = XMVectorMin(Min, Dist); + Max = XMVectorMax(Max, Dist); + + XMVECTOR PlaneDist = XMVectorNegate(XMVectorSplatW(Plane)); + + // Outside the plane? + Outside = XMVectorGreater(Min, PlaneDist); + + // Fully inside the plane? + Inside = XMVectorLess(Max, PlaneDist); + } + +} // namespace Internal + + +/**************************************************************************** + * + * BoundingSphere + * + ****************************************************************************/ + + //----------------------------------------------------------------------------- + // Transform a sphere by an angle preserving transform. + //----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline void XM_CALLCONV BoundingSphere::Transform(BoundingSphere& Out, FXMMATRIX M) const noexcept +{ + // Load the center of the sphere. + XMVECTOR vCenter = XMLoadFloat3(&Center); + + // Transform the center of the sphere. + XMVECTOR C = XMVector3Transform(vCenter, M); + + XMVECTOR dX = XMVector3Dot(M.r[0], M.r[0]); + XMVECTOR dY = XMVector3Dot(M.r[1], M.r[1]); + XMVECTOR dZ = XMVector3Dot(M.r[2], M.r[2]); + + XMVECTOR d = XMVectorMax(dX, XMVectorMax(dY, dZ)); + + // Store the center sphere. + XMStoreFloat3(&Out.Center, C); + + // Scale the radius of the pshere. + float Scale = sqrtf(XMVectorGetX(d)); + Out.Radius = Radius * Scale; +} + +_Use_decl_annotations_ +inline void XM_CALLCONV BoundingSphere::Transform(BoundingSphere& Out, float Scale, FXMVECTOR Rotation, FXMVECTOR Translation) const noexcept +{ + // Load the center of the sphere. + XMVECTOR vCenter = XMLoadFloat3(&Center); + + // Transform the center of the sphere. + vCenter = XMVectorAdd(XMVector3Rotate(XMVectorScale(vCenter, Scale), Rotation), Translation); + + // Store the center sphere. + XMStoreFloat3(&Out.Center, vCenter); + + // Scale the radius of the pshere. 
+ Out.Radius = Radius * Scale; +} + + +//----------------------------------------------------------------------------- +// Point in sphere test. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType XM_CALLCONV BoundingSphere::Contains(FXMVECTOR Point) const noexcept +{ + XMVECTOR vCenter = XMLoadFloat3(&Center); + XMVECTOR vRadius = XMVectorReplicatePtr(&Radius); + + XMVECTOR DistanceSquared = XMVector3LengthSq(XMVectorSubtract(Point, vCenter)); + XMVECTOR RadiusSquared = XMVectorMultiply(vRadius, vRadius); + + return XMVector3LessOrEqual(DistanceSquared, RadiusSquared) ? CONTAINS : DISJOINT; +} + + +//----------------------------------------------------------------------------- +// Triangle in sphere test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType XM_CALLCONV BoundingSphere::Contains(FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2) const noexcept +{ + if (!Intersects(V0, V1, V2)) + return DISJOINT; + + XMVECTOR vCenter = XMLoadFloat3(&Center); + XMVECTOR vRadius = XMVectorReplicatePtr(&Radius); + XMVECTOR RadiusSquared = XMVectorMultiply(vRadius, vRadius); + + XMVECTOR DistanceSquared = XMVector3LengthSq(XMVectorSubtract(V0, vCenter)); + XMVECTOR Inside = XMVectorLessOrEqual(DistanceSquared, RadiusSquared); + + DistanceSquared = XMVector3LengthSq(XMVectorSubtract(V1, vCenter)); + Inside = XMVectorAndInt(Inside, XMVectorLessOrEqual(DistanceSquared, RadiusSquared)); + + DistanceSquared = XMVector3LengthSq(XMVectorSubtract(V2, vCenter)); + Inside = XMVectorAndInt(Inside, XMVectorLessOrEqual(DistanceSquared, RadiusSquared)); + + return (XMVector3EqualInt(Inside, XMVectorTrueInt())) ? CONTAINS : INTERSECTS; +} + + +//----------------------------------------------------------------------------- +// Sphere in sphere test. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingSphere::Contains(const BoundingSphere& sh) const noexcept +{ + XMVECTOR Center1 = XMLoadFloat3(&Center); + float r1 = Radius; + + XMVECTOR Center2 = XMLoadFloat3(&sh.Center); + float r2 = sh.Radius; + + XMVECTOR V = XMVectorSubtract(Center2, Center1); + + XMVECTOR Dist = XMVector3Length(V); + + float d = XMVectorGetX(Dist); + + return (r1 + r2 >= d) ? ((r1 - r2 >= d) ? CONTAINS : INTERSECTS) : DISJOINT; +} + + +//----------------------------------------------------------------------------- +// Axis-aligned box in sphere test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingSphere::Contains(const BoundingBox& box) const noexcept +{ + if (!box.Intersects(*this)) + return DISJOINT; + + XMVECTOR vCenter = XMLoadFloat3(&Center); + XMVECTOR vRadius = XMVectorReplicatePtr(&Radius); + XMVECTOR RadiusSq = XMVectorMultiply(vRadius, vRadius); + + XMVECTOR boxCenter = XMLoadFloat3(&box.Center); + XMVECTOR boxExtents = XMLoadFloat3(&box.Extents); + + XMVECTOR InsideAll = XMVectorTrueInt(); + + XMVECTOR offset = XMVectorSubtract(boxCenter, vCenter); + + for (size_t i = 0; i < BoundingBox::CORNER_COUNT; ++i) + { + XMVECTOR C = XMVectorMultiplyAdd(boxExtents, g_BoxOffset[i], offset); + XMVECTOR d = XMVector3LengthSq(C); + InsideAll = XMVectorAndInt(InsideAll, XMVectorLessOrEqual(d, RadiusSq)); + } + + return (XMVector3EqualInt(InsideAll, XMVectorTrueInt())) ? 
CONTAINS : INTERSECTS; +} + + +//----------------------------------------------------------------------------- +// Oriented box in sphere test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingSphere::Contains(const BoundingOrientedBox& box) const noexcept +{ + if (!box.Intersects(*this)) + return DISJOINT; + + XMVECTOR vCenter = XMLoadFloat3(&Center); + XMVECTOR vRadius = XMVectorReplicatePtr(&Radius); + XMVECTOR RadiusSq = XMVectorMultiply(vRadius, vRadius); + + XMVECTOR boxCenter = XMLoadFloat3(&box.Center); + XMVECTOR boxExtents = XMLoadFloat3(&box.Extents); + XMVECTOR boxOrientation = XMLoadFloat4(&box.Orientation); + + assert(DirectX::Internal::XMQuaternionIsUnit(boxOrientation)); + + XMVECTOR InsideAll = XMVectorTrueInt(); + + for (size_t i = 0; i < BoundingOrientedBox::CORNER_COUNT; ++i) + { + XMVECTOR C = XMVectorAdd(XMVector3Rotate(XMVectorMultiply(boxExtents, g_BoxOffset[i]), boxOrientation), boxCenter); + XMVECTOR d = XMVector3LengthSq(XMVectorSubtract(vCenter, C)); + InsideAll = XMVectorAndInt(InsideAll, XMVectorLessOrEqual(d, RadiusSq)); + } + + return (XMVector3EqualInt(InsideAll, XMVectorTrueInt())) ? CONTAINS : INTERSECTS; + +} + + +//----------------------------------------------------------------------------- +// Frustum in sphere test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingSphere::Contains(const BoundingFrustum& fr) const noexcept +{ + if (!fr.Intersects(*this)) + return DISJOINT; + + XMVECTOR vCenter = XMLoadFloat3(&Center); + XMVECTOR vRadius = XMVectorReplicatePtr(&Radius); + XMVECTOR RadiusSq = XMVectorMultiply(vRadius, vRadius); + + XMVECTOR vOrigin = XMLoadFloat3(&fr.Origin); + XMVECTOR vOrientation = XMLoadFloat4(&fr.Orientation); + + assert(DirectX::Internal::XMQuaternionIsUnit(vOrientation)); + + // Build the corners of the frustum. + XMVECTOR vRightTop = XMVectorSet(fr.RightSlope, fr.TopSlope, 1.0f, 0.0f); + XMVECTOR vRightBottom = XMVectorSet(fr.RightSlope, fr.BottomSlope, 1.0f, 0.0f); + XMVECTOR vLeftTop = XMVectorSet(fr.LeftSlope, fr.TopSlope, 1.0f, 0.0f); + XMVECTOR vLeftBottom = XMVectorSet(fr.LeftSlope, fr.BottomSlope, 1.0f, 0.0f); + XMVECTOR vNear = XMVectorReplicatePtr(&fr.Near); + XMVECTOR vFar = XMVectorReplicatePtr(&fr.Far); + + XMVECTOR Corners[BoundingFrustum::CORNER_COUNT]; + Corners[0] = XMVectorMultiply(vRightTop, vNear); + Corners[1] = XMVectorMultiply(vRightBottom, vNear); + Corners[2] = XMVectorMultiply(vLeftTop, vNear); + Corners[3] = XMVectorMultiply(vLeftBottom, vNear); + Corners[4] = XMVectorMultiply(vRightTop, vFar); + Corners[5] = XMVectorMultiply(vRightBottom, vFar); + Corners[6] = XMVectorMultiply(vLeftTop, vFar); + Corners[7] = XMVectorMultiply(vLeftBottom, vFar); + + XMVECTOR InsideAll = XMVectorTrueInt(); + for (size_t i = 0; i < BoundingFrustum::CORNER_COUNT; ++i) + { + XMVECTOR C = XMVectorAdd(XMVector3Rotate(Corners[i], vOrientation), vOrigin); + XMVECTOR d = XMVector3LengthSq(XMVectorSubtract(vCenter, C)); + InsideAll = XMVectorAndInt(InsideAll, XMVectorLessOrEqual(d, RadiusSq)); + } + + return (XMVector3EqualInt(InsideAll, XMVectorTrueInt())) ? CONTAINS : INTERSECTS; +} + + +//----------------------------------------------------------------------------- +// Sphere vs. sphere test. 
+//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingSphere::Intersects(const BoundingSphere& sh) const noexcept +{ + // Load A. + XMVECTOR vCenterA = XMLoadFloat3(&Center); + XMVECTOR vRadiusA = XMVectorReplicatePtr(&Radius); + + // Load B. + XMVECTOR vCenterB = XMLoadFloat3(&sh.Center); + XMVECTOR vRadiusB = XMVectorReplicatePtr(&sh.Radius); + + // Distance squared between centers. + XMVECTOR Delta = XMVectorSubtract(vCenterB, vCenterA); + XMVECTOR DistanceSquared = XMVector3LengthSq(Delta); + + // Sum of the radii squared. + XMVECTOR RadiusSquared = XMVectorAdd(vRadiusA, vRadiusB); + RadiusSquared = XMVectorMultiply(RadiusSquared, RadiusSquared); + + return XMVector3LessOrEqual(DistanceSquared, RadiusSquared); +} + + +//----------------------------------------------------------------------------- +// Box vs. sphere test. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingSphere::Intersects(const BoundingBox& box) const noexcept +{ + return box.Intersects(*this); +} + +_Use_decl_annotations_ +inline bool BoundingSphere::Intersects(const BoundingOrientedBox& box) const noexcept +{ + return box.Intersects(*this); +} + + +//----------------------------------------------------------------------------- +// Frustum vs. sphere test. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingSphere::Intersects(const BoundingFrustum& fr) const noexcept +{ + return fr.Intersects(*this); +} + + +//----------------------------------------------------------------------------- +// Triangle vs sphere test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool XM_CALLCONV BoundingSphere::Intersects(FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2) const noexcept +{ + // Load the sphere. + XMVECTOR vCenter = XMLoadFloat3(&Center); + XMVECTOR vRadius = XMVectorReplicatePtr(&Radius); + + // Compute the plane of the triangle (has to be normalized). + XMVECTOR N = XMVector3Normalize(XMVector3Cross(XMVectorSubtract(V1, V0), XMVectorSubtract(V2, V0))); + + // Assert that the triangle is not degenerate. + assert(!XMVector3Equal(N, XMVectorZero())); + + // Find the nearest feature on the triangle to the sphere. + XMVECTOR Dist = XMVector3Dot(XMVectorSubtract(vCenter, V0), N); + + // If the center of the sphere is farther from the plane of the triangle than + // the radius of the sphere, then there cannot be an intersection. + XMVECTOR NoIntersection = XMVectorLess(Dist, XMVectorNegate(vRadius)); + NoIntersection = XMVectorOrInt(NoIntersection, XMVectorGreater(Dist, vRadius)); + + // Project the center of the sphere onto the plane of the triangle. + XMVECTOR Point = XMVectorNegativeMultiplySubtract(N, Dist, vCenter); + + // Is it inside all the edges? If so we intersect because the distance + // to the plane is less than the radius. + XMVECTOR Intersection = DirectX::Internal::PointOnPlaneInsideTriangle(Point, V0, V1, V2); + + // Find the nearest point on each edge. + XMVECTOR RadiusSq = XMVectorMultiply(vRadius, vRadius); + + // Edge 0,1 + Point = DirectX::Internal::PointOnLineSegmentNearestPoint(V0, V1, vCenter); + + // If the distance to the center of the sphere to the point is less than + // the radius of the sphere then it must intersect. 
+ Intersection = XMVectorOrInt(Intersection, XMVectorLessOrEqual(XMVector3LengthSq(XMVectorSubtract(vCenter, Point)), RadiusSq)); + + // Edge 1,2 + Point = DirectX::Internal::PointOnLineSegmentNearestPoint(V1, V2, vCenter); + + // If the distance to the center of the sphere to the point is less than + // the radius of the sphere then it must intersect. + Intersection = XMVectorOrInt(Intersection, XMVectorLessOrEqual(XMVector3LengthSq(XMVectorSubtract(vCenter, Point)), RadiusSq)); + + // Edge 2,0 + Point = DirectX::Internal::PointOnLineSegmentNearestPoint(V2, V0, vCenter); + + // If the distance to the center of the sphere to the point is less than + // the radius of the sphere then it must intersect. + Intersection = XMVectorOrInt(Intersection, XMVectorLessOrEqual(XMVector3LengthSq(XMVectorSubtract(vCenter, Point)), RadiusSq)); + + return XMVector4EqualInt(XMVectorAndCInt(Intersection, NoIntersection), XMVectorTrueInt()); +} + + +//----------------------------------------------------------------------------- +// Sphere-plane intersection +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline PlaneIntersectionType XM_CALLCONV BoundingSphere::Intersects(FXMVECTOR Plane) const noexcept +{ + assert(DirectX::Internal::XMPlaneIsUnit(Plane)); + + // Load the sphere. + XMVECTOR vCenter = XMLoadFloat3(&Center); + XMVECTOR vRadius = XMVectorReplicatePtr(&Radius); + + // Set w of the center to one so we can dot4 with a plane. + vCenter = XMVectorInsert<0, 0, 0, 0, 1>(vCenter, XMVectorSplatOne()); + + XMVECTOR Outside, Inside; + DirectX::Internal::FastIntersectSpherePlane(vCenter, vRadius, Plane, Outside, Inside); + + // If the sphere is outside any plane it is outside. + if (XMVector4EqualInt(Outside, XMVectorTrueInt())) + return FRONT; + + // If the sphere is inside all planes it is inside. + if (XMVector4EqualInt(Inside, XMVectorTrueInt())) + return BACK; + + // The sphere is not inside all planes or outside a plane it intersects. + return INTERSECTING; +} + + +//----------------------------------------------------------------------------- +// Compute the intersection of a ray (Origin, Direction) with a sphere. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool XM_CALLCONV BoundingSphere::Intersects(FXMVECTOR Origin, FXMVECTOR Direction, float& Dist) const noexcept +{ + assert(DirectX::Internal::XMVector3IsUnit(Direction)); + + XMVECTOR vCenter = XMLoadFloat3(&Center); + XMVECTOR vRadius = XMVectorReplicatePtr(&Radius); + + // l is the vector from the ray origin to the center of the sphere. + XMVECTOR l = XMVectorSubtract(vCenter, Origin); + + // s is the projection of the l onto the ray direction. + XMVECTOR s = XMVector3Dot(l, Direction); + + XMVECTOR l2 = XMVector3Dot(l, l); + + XMVECTOR r2 = XMVectorMultiply(vRadius, vRadius); + + // m2 is squared distance from the center of the sphere to the projection. + XMVECTOR m2 = XMVectorNegativeMultiplySubtract(s, s, l2); + + XMVECTOR NoIntersection; + + // If the ray origin is outside the sphere and the center of the sphere is + // behind the ray origin there is no intersection. + NoIntersection = XMVectorAndInt(XMVectorLess(s, XMVectorZero()), XMVectorGreater(l2, r2)); + + // If the squared distance from the center of the sphere to the projection + // is greater than the radius squared the ray will miss the sphere. 
+ NoIntersection = XMVectorOrInt(NoIntersection, XMVectorGreater(m2, r2)); + + // The ray hits the sphere, compute the nearest intersection point. + XMVECTOR q = XMVectorSqrt(XMVectorSubtract(r2, m2)); + XMVECTOR t1 = XMVectorSubtract(s, q); + XMVECTOR t2 = XMVectorAdd(s, q); + + XMVECTOR OriginInside = XMVectorLessOrEqual(l2, r2); + XMVECTOR t = XMVectorSelect(t1, t2, OriginInside); + + if (XMVector4NotEqualInt(NoIntersection, XMVectorTrueInt())) + { + // Store the x-component to *pDist. + XMStoreFloat(&Dist, t); + return true; + } + + Dist = 0.f; + return false; +} + + +//----------------------------------------------------------------------------- +// Test a sphere vs 6 planes (typically forming a frustum). +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType XM_CALLCONV BoundingSphere::ContainedBy( + FXMVECTOR Plane0, FXMVECTOR Plane1, FXMVECTOR Plane2, + GXMVECTOR Plane3, + HXMVECTOR Plane4, HXMVECTOR Plane5) const noexcept +{ + // Load the sphere. + XMVECTOR vCenter = XMLoadFloat3(&Center); + XMVECTOR vRadius = XMVectorReplicatePtr(&Radius); + + // Set w of the center to one so we can dot4 with a plane. + vCenter = XMVectorInsert<0, 0, 0, 0, 1>(vCenter, XMVectorSplatOne()); + + XMVECTOR Outside, Inside; + + // Test against each plane. + DirectX::Internal::FastIntersectSpherePlane(vCenter, vRadius, Plane0, Outside, Inside); + + XMVECTOR AnyOutside = Outside; + XMVECTOR AllInside = Inside; + + DirectX::Internal::FastIntersectSpherePlane(vCenter, vRadius, Plane1, Outside, Inside); + AnyOutside = XMVectorOrInt(AnyOutside, Outside); + AllInside = XMVectorAndInt(AllInside, Inside); + + DirectX::Internal::FastIntersectSpherePlane(vCenter, vRadius, Plane2, Outside, Inside); + AnyOutside = XMVectorOrInt(AnyOutside, Outside); + AllInside = XMVectorAndInt(AllInside, Inside); + + DirectX::Internal::FastIntersectSpherePlane(vCenter, vRadius, Plane3, Outside, Inside); + AnyOutside = XMVectorOrInt(AnyOutside, Outside); + AllInside = XMVectorAndInt(AllInside, Inside); + + DirectX::Internal::FastIntersectSpherePlane(vCenter, vRadius, Plane4, Outside, Inside); + AnyOutside = XMVectorOrInt(AnyOutside, Outside); + AllInside = XMVectorAndInt(AllInside, Inside); + + DirectX::Internal::FastIntersectSpherePlane(vCenter, vRadius, Plane5, Outside, Inside); + AnyOutside = XMVectorOrInt(AnyOutside, Outside); + AllInside = XMVectorAndInt(AllInside, Inside); + + // If the sphere is outside any plane it is outside. + if (XMVector4EqualInt(AnyOutside, XMVectorTrueInt())) + return DISJOINT; + + // If the sphere is inside all planes it is inside. + if (XMVector4EqualInt(AllInside, XMVectorTrueInt())) + return CONTAINS; + + // The sphere is not inside all planes or outside a plane, it may intersect. 
+ return INTERSECTS;
+}
+
+
+//-----------------------------------------------------------------------------
+// Creates a bounding sphere that contains two other bounding spheres
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void BoundingSphere::CreateMerged(BoundingSphere& Out, const BoundingSphere& S1, const BoundingSphere& S2) noexcept
+{
+ XMVECTOR Center1 = XMLoadFloat3(&S1.Center);
+ float r1 = S1.Radius;
+
+ XMVECTOR Center2 = XMLoadFloat3(&S2.Center);
+ float r2 = S2.Radius;
+
+ XMVECTOR V = XMVectorSubtract(Center2, Center1);
+
+ XMVECTOR Dist = XMVector3Length(V);
+
+ float d = XMVectorGetX(Dist);
+
+ if (r1 + r2 >= d)
+ {
+ if (r1 - r2 >= d)
+ {
+ Out = S1;
+ return;
+ }
+ else if (r2 - r1 >= d)
+ {
+ Out = S2;
+ return;
+ }
+ }
+
+ XMVECTOR N = XMVectorDivide(V, Dist);
+
+ float t1 = XMMin(-r1, d - r2);
+ float t2 = XMMax(r1, d + r2);
+ float t_5 = (t2 - t1) * 0.5f;
+
+ XMVECTOR NCenter = XMVectorAdd(Center1, XMVectorMultiply(N, XMVectorReplicate(t_5 + t1)));
+
+ XMStoreFloat3(&Out.Center, NCenter);
+ Out.Radius = t_5;
+}
+
+
+//-----------------------------------------------------------------------------
+// Create sphere enscribing bounding box
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void BoundingSphere::CreateFromBoundingBox(BoundingSphere& Out, const BoundingBox& box) noexcept
+{
+ Out.Center = box.Center;
+ XMVECTOR vExtents = XMLoadFloat3(&box.Extents);
+ Out.Radius = XMVectorGetX(XMVector3Length(vExtents));
+}
+
+_Use_decl_annotations_
+inline void BoundingSphere::CreateFromBoundingBox(BoundingSphere& Out, const BoundingOrientedBox& box) noexcept
+{
+ // Bounding box orientation is irrelevant because a sphere is rotationally invariant
+ Out.Center = box.Center;
+ XMVECTOR vExtents = XMLoadFloat3(&box.Extents);
+ Out.Radius = XMVectorGetX(XMVector3Length(vExtents));
+}
+
+
+//-----------------------------------------------------------------------------
+// Find the approximate smallest enclosing bounding sphere for a set of
+// points. Exact computation of the smallest enclosing bounding sphere is
+// possible but is slower and requires a more complex algorithm.
+// The algorithm is based on Jack Ritter, "An Efficient Bounding Sphere",
+// Graphics Gems.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void BoundingSphere::CreateFromPoints(BoundingSphere& Out, size_t Count, const XMFLOAT3* pPoints, size_t Stride) noexcept
+{
+ assert(Count > 0);
+ assert(pPoints);
+
+ // Find the points with minimum and maximum x, y, and z
+ XMVECTOR MinX, MaxX, MinY, MaxY, MinZ, MaxZ;
+
+ MinX = MaxX = MinY = MaxY = MinZ = MaxZ = XMLoadFloat3(pPoints);
+
+ for (size_t i = 1; i < Count; ++i)
+ {
+ XMVECTOR Point = XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(reinterpret_cast<const uint8_t*>(pPoints) + i * Stride));
+
+ float px = XMVectorGetX(Point);
+ float py = XMVectorGetY(Point);
+ float pz = XMVectorGetZ(Point);
+
+ if (px < XMVectorGetX(MinX))
+ MinX = Point;
+
+ if (px > XMVectorGetX(MaxX))
+ MaxX = Point;
+
+ if (py < XMVectorGetY(MinY))
+ MinY = Point;
+
+ if (py > XMVectorGetY(MaxY))
+ MaxY = Point;
+
+ if (pz < XMVectorGetZ(MinZ))
+ MinZ = Point;
+
+ if (pz > XMVectorGetZ(MaxZ))
+ MaxZ = Point;
+ }
+
+ // Use the min/max pair that are farthest apart to form the initial sphere.
+ XMVECTOR DeltaX = XMVectorSubtract(MaxX, MinX);
+ XMVECTOR DistX = XMVector3Length(DeltaX);
+
+ XMVECTOR DeltaY = XMVectorSubtract(MaxY, MinY);
+ XMVECTOR DistY = XMVector3Length(DeltaY);
+
+ XMVECTOR DeltaZ = XMVectorSubtract(MaxZ, MinZ);
+ XMVECTOR DistZ = XMVector3Length(DeltaZ);
+
+ XMVECTOR vCenter;
+ XMVECTOR vRadius;
+
+ if (XMVector3Greater(DistX, DistY))
+ {
+ if (XMVector3Greater(DistX, DistZ))
+ {
+ // Use min/max x.
+ vCenter = XMVectorLerp(MaxX, MinX, 0.5f);
+ vRadius = XMVectorScale(DistX, 0.5f);
+ }
+ else
+ {
+ // Use min/max z.
+ vCenter = XMVectorLerp(MaxZ, MinZ, 0.5f);
+ vRadius = XMVectorScale(DistZ, 0.5f);
+ }
+ }
+ else // Y >= X
+ {
+ if (XMVector3Greater(DistY, DistZ))
+ {
+ // Use min/max y.
+ vCenter = XMVectorLerp(MaxY, MinY, 0.5f);
+ vRadius = XMVectorScale(DistY, 0.5f);
+ }
+ else
+ {
+ // Use min/max z.
+ vCenter = XMVectorLerp(MaxZ, MinZ, 0.5f);
+ vRadius = XMVectorScale(DistZ, 0.5f);
+ }
+ }
+
+ // Add any points not inside the sphere.
+ for (size_t i = 0; i < Count; ++i)
+ {
+ XMVECTOR Point = XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(reinterpret_cast<const uint8_t*>(pPoints) + i * Stride));
+
+ XMVECTOR Delta = XMVectorSubtract(Point, vCenter);
+
+ XMVECTOR Dist = XMVector3Length(Delta);
+
+ if (XMVector3Greater(Dist, vRadius))
+ {
+ // Adjust sphere to include the new point.
+ vRadius = XMVectorScale(XMVectorAdd(vRadius, Dist), 0.5f);
+ vCenter = XMVectorAdd(vCenter, XMVectorMultiply(XMVectorSubtract(XMVectorReplicate(1.0f), XMVectorDivide(vRadius, Dist)), Delta));
+ }
+ }
+
+ XMStoreFloat3(&Out.Center, vCenter);
+ XMStoreFloat(&Out.Radius, vRadius);
+}
+
+
+//-----------------------------------------------------------------------------
+// Create sphere containing frustum
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void BoundingSphere::CreateFromFrustum(BoundingSphere& Out, const BoundingFrustum& fr) noexcept
+{
+ XMFLOAT3 Corners[BoundingFrustum::CORNER_COUNT];
+ fr.GetCorners(Corners);
+ CreateFromPoints(Out, BoundingFrustum::CORNER_COUNT, Corners, sizeof(XMFLOAT3));
+}
+
+
+/****************************************************************************
+ *
+ * BoundingBox
+ *
+ ****************************************************************************/
+
+ //-----------------------------------------------------------------------------
+ // Transform an axis aligned box by an angle preserving transform.
+ //-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV BoundingBox::Transform(BoundingBox& Out, FXMMATRIX M) const noexcept
+{
+ // Load center and extents.
+ XMVECTOR vCenter = XMLoadFloat3(&Center);
+ XMVECTOR vExtents = XMLoadFloat3(&Extents);
+
+ // Compute and transform the corners and find new min/max bounds.
+ XMVECTOR Corner = XMVectorMultiplyAdd(vExtents, g_BoxOffset[0], vCenter);
+ Corner = XMVector3Transform(Corner, M);
+
+ XMVECTOR Min, Max;
+ Min = Max = Corner;
+
+ for (size_t i = 1; i < CORNER_COUNT; ++i)
+ {
+ Corner = XMVectorMultiplyAdd(vExtents, g_BoxOffset[i], vCenter);
+ Corner = XMVector3Transform(Corner, M);
+
+ Min = XMVectorMin(Min, Corner);
+ Max = XMVectorMax(Max, Corner);
+ }
+
+ // Store center and extents.
+ XMStoreFloat3(&Out.Center, XMVectorScale(XMVectorAdd(Min, Max), 0.5f)); + XMStoreFloat3(&Out.Extents, XMVectorScale(XMVectorSubtract(Max, Min), 0.5f)); +} + +_Use_decl_annotations_ +inline void XM_CALLCONV BoundingBox::Transform(BoundingBox& Out, float Scale, FXMVECTOR Rotation, FXMVECTOR Translation) const noexcept +{ + assert(DirectX::Internal::XMQuaternionIsUnit(Rotation)); + + // Load center and extents. + XMVECTOR vCenter = XMLoadFloat3(&Center); + XMVECTOR vExtents = XMLoadFloat3(&Extents); + + XMVECTOR VectorScale = XMVectorReplicate(Scale); + + // Compute and transform the corners and find new min/max bounds. + XMVECTOR Corner = XMVectorMultiplyAdd(vExtents, g_BoxOffset[0], vCenter); + Corner = XMVectorAdd(XMVector3Rotate(XMVectorMultiply(Corner, VectorScale), Rotation), Translation); + + XMVECTOR Min, Max; + Min = Max = Corner; + + for (size_t i = 1; i < CORNER_COUNT; ++i) + { + Corner = XMVectorMultiplyAdd(vExtents, g_BoxOffset[i], vCenter); + Corner = XMVectorAdd(XMVector3Rotate(XMVectorMultiply(Corner, VectorScale), Rotation), Translation); + + Min = XMVectorMin(Min, Corner); + Max = XMVectorMax(Max, Corner); + } + + // Store center and extents. + XMStoreFloat3(&Out.Center, XMVectorScale(XMVectorAdd(Min, Max), 0.5f)); + XMStoreFloat3(&Out.Extents, XMVectorScale(XMVectorSubtract(Max, Min), 0.5f)); +} + + +//----------------------------------------------------------------------------- +// Get the corner points of the box +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline void BoundingBox::GetCorners(XMFLOAT3* Corners) const noexcept +{ + assert(Corners != nullptr); + + // Load the box + XMVECTOR vCenter = XMLoadFloat3(&Center); + XMVECTOR vExtents = XMLoadFloat3(&Extents); + + for (size_t i = 0; i < CORNER_COUNT; ++i) + { + XMVECTOR C = XMVectorMultiplyAdd(vExtents, g_BoxOffset[i], vCenter); + XMStoreFloat3(&Corners[i], C); + } +} + + +//----------------------------------------------------------------------------- +// Point in axis-aligned box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType XM_CALLCONV BoundingBox::Contains(FXMVECTOR Point) const noexcept +{ + XMVECTOR vCenter = XMLoadFloat3(&Center); + XMVECTOR vExtents = XMLoadFloat3(&Extents); + + return XMVector3InBounds(XMVectorSubtract(Point, vCenter), vExtents) ? CONTAINS : DISJOINT; +} + + +//----------------------------------------------------------------------------- +// Triangle in axis-aligned box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType XM_CALLCONV BoundingBox::Contains(FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2) const noexcept +{ + if (!Intersects(V0, V1, V2)) + return DISJOINT; + + XMVECTOR vCenter = XMLoadFloat3(&Center); + XMVECTOR vExtents = XMLoadFloat3(&Extents); + + XMVECTOR d = XMVectorAbs(XMVectorSubtract(V0, vCenter)); + XMVECTOR Inside = XMVectorLessOrEqual(d, vExtents); + + d = XMVectorAbs(XMVectorSubtract(V1, vCenter)); + Inside = XMVectorAndInt(Inside, XMVectorLessOrEqual(d, vExtents)); + + d = XMVectorAbs(XMVectorSubtract(V2, vCenter)); + Inside = XMVectorAndInt(Inside, XMVectorLessOrEqual(d, vExtents)); + + return (XMVector3EqualInt(Inside, XMVectorTrueInt())) ? 
CONTAINS : INTERSECTS; +} + + +//----------------------------------------------------------------------------- +// Sphere in axis-aligned box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingBox::Contains(const BoundingSphere& sh) const noexcept +{ + XMVECTOR SphereCenter = XMLoadFloat3(&sh.Center); + XMVECTOR SphereRadius = XMVectorReplicatePtr(&sh.Radius); + + XMVECTOR BoxCenter = XMLoadFloat3(&Center); + XMVECTOR BoxExtents = XMLoadFloat3(&Extents); + + XMVECTOR BoxMin = XMVectorSubtract(BoxCenter, BoxExtents); + XMVECTOR BoxMax = XMVectorAdd(BoxCenter, BoxExtents); + + // Find the distance to the nearest point on the box. + // for each i in (x, y, z) + // if (SphereCenter(i) < BoxMin(i)) d2 += (SphereCenter(i) - BoxMin(i)) ^ 2 + // else if (SphereCenter(i) > BoxMax(i)) d2 += (SphereCenter(i) - BoxMax(i)) ^ 2 + + XMVECTOR d = XMVectorZero(); + + // Compute d for each dimension. + XMVECTOR LessThanMin = XMVectorLess(SphereCenter, BoxMin); + XMVECTOR GreaterThanMax = XMVectorGreater(SphereCenter, BoxMax); + + XMVECTOR MinDelta = XMVectorSubtract(SphereCenter, BoxMin); + XMVECTOR MaxDelta = XMVectorSubtract(SphereCenter, BoxMax); + + // Choose value for each dimension based on the comparison. + d = XMVectorSelect(d, MinDelta, LessThanMin); + d = XMVectorSelect(d, MaxDelta, GreaterThanMax); + + // Use a dot-product to square them and sum them together. + XMVECTOR d2 = XMVector3Dot(d, d); + + if (XMVector3Greater(d2, XMVectorMultiply(SphereRadius, SphereRadius))) + return DISJOINT; + + XMVECTOR InsideAll = XMVectorLessOrEqual(XMVectorAdd(BoxMin, SphereRadius), SphereCenter); + InsideAll = XMVectorAndInt(InsideAll, XMVectorLessOrEqual(SphereCenter, XMVectorSubtract(BoxMax, SphereRadius))); + InsideAll = XMVectorAndInt(InsideAll, XMVectorGreater(XMVectorSubtract(BoxMax, BoxMin), SphereRadius)); + + return (XMVector3EqualInt(InsideAll, XMVectorTrueInt())) ? CONTAINS : INTERSECTS; +} + + +//----------------------------------------------------------------------------- +// Axis-aligned box in axis-aligned box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingBox::Contains(const BoundingBox& box) const noexcept +{ + XMVECTOR CenterA = XMLoadFloat3(&Center); + XMVECTOR ExtentsA = XMLoadFloat3(&Extents); + + XMVECTOR CenterB = XMLoadFloat3(&box.Center); + XMVECTOR ExtentsB = XMLoadFloat3(&box.Extents); + + XMVECTOR MinA = XMVectorSubtract(CenterA, ExtentsA); + XMVECTOR MaxA = XMVectorAdd(CenterA, ExtentsA); + + XMVECTOR MinB = XMVectorSubtract(CenterB, ExtentsB); + XMVECTOR MaxB = XMVectorAdd(CenterB, ExtentsB); + + // for each i in (x, y, z) if a_min(i) > b_max(i) or b_min(i) > a_max(i) then return false + XMVECTOR Disjoint = XMVectorOrInt(XMVectorGreater(MinA, MaxB), XMVectorGreater(MinB, MaxA)); + + if (DirectX::Internal::XMVector3AnyTrue(Disjoint)) + return DISJOINT; + + // for each i in (x, y, z) if a_min(i) <= b_min(i) and b_max(i) <= a_max(i) then A contains B + XMVECTOR Inside = XMVectorAndInt(XMVectorLessOrEqual(MinA, MinB), XMVectorLessOrEqual(MaxB, MaxA)); + + return DirectX::Internal::XMVector3AllTrue(Inside) ? 
CONTAINS : INTERSECTS; +} + + +//----------------------------------------------------------------------------- +// Oriented box in axis-aligned box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingBox::Contains(const BoundingOrientedBox& box) const noexcept +{ + if (!box.Intersects(*this)) + return DISJOINT; + + XMVECTOR vCenter = XMLoadFloat3(&Center); + XMVECTOR vExtents = XMLoadFloat3(&Extents); + + // Subtract off the AABB center to remove a subtract below + XMVECTOR oCenter = XMVectorSubtract(XMLoadFloat3(&box.Center), vCenter); + + XMVECTOR oExtents = XMLoadFloat3(&box.Extents); + XMVECTOR oOrientation = XMLoadFloat4(&box.Orientation); + + assert(DirectX::Internal::XMQuaternionIsUnit(oOrientation)); + + XMVECTOR Inside = XMVectorTrueInt(); + + for (size_t i = 0; i < BoundingOrientedBox::CORNER_COUNT; ++i) + { + XMVECTOR C = XMVectorAdd(XMVector3Rotate(XMVectorMultiply(oExtents, g_BoxOffset[i]), oOrientation), oCenter); + XMVECTOR d = XMVectorAbs(C); + Inside = XMVectorAndInt(Inside, XMVectorLessOrEqual(d, vExtents)); + } + + return (XMVector3EqualInt(Inside, XMVectorTrueInt())) ? CONTAINS : INTERSECTS; +} + + +//----------------------------------------------------------------------------- +// Frustum in axis-aligned box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingBox::Contains(const BoundingFrustum& fr) const noexcept +{ + if (!fr.Intersects(*this)) + return DISJOINT; + + XMFLOAT3 Corners[BoundingFrustum::CORNER_COUNT]; + fr.GetCorners(Corners); + + XMVECTOR vCenter = XMLoadFloat3(&Center); + XMVECTOR vExtents = XMLoadFloat3(&Extents); + + XMVECTOR Inside = XMVectorTrueInt(); + + for (size_t i = 0; i < BoundingFrustum::CORNER_COUNT; ++i) + { + XMVECTOR Point = XMLoadFloat3(&Corners[i]); + XMVECTOR d = XMVectorAbs(XMVectorSubtract(Point, vCenter)); + Inside = XMVectorAndInt(Inside, XMVectorLessOrEqual(d, vExtents)); + } + + return (XMVector3EqualInt(Inside, XMVectorTrueInt())) ? CONTAINS : INTERSECTS; +} + + +//----------------------------------------------------------------------------- +// Sphere vs axis-aligned box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingBox::Intersects(const BoundingSphere& sh) const noexcept +{ + XMVECTOR SphereCenter = XMLoadFloat3(&sh.Center); + XMVECTOR SphereRadius = XMVectorReplicatePtr(&sh.Radius); + + XMVECTOR BoxCenter = XMLoadFloat3(&Center); + XMVECTOR BoxExtents = XMLoadFloat3(&Extents); + + XMVECTOR BoxMin = XMVectorSubtract(BoxCenter, BoxExtents); + XMVECTOR BoxMax = XMVectorAdd(BoxCenter, BoxExtents); + + // Find the distance to the nearest point on the box. + // for each i in (x, y, z) + // if (SphereCenter(i) < BoxMin(i)) d2 += (SphereCenter(i) - BoxMin(i)) ^ 2 + // else if (SphereCenter(i) > BoxMax(i)) d2 += (SphereCenter(i) - BoxMax(i)) ^ 2 + + XMVECTOR d = XMVectorZero(); + + // Compute d for each dimension. + XMVECTOR LessThanMin = XMVectorLess(SphereCenter, BoxMin); + XMVECTOR GreaterThanMax = XMVectorGreater(SphereCenter, BoxMax); + + XMVECTOR MinDelta = XMVectorSubtract(SphereCenter, BoxMin); + XMVECTOR MaxDelta = XMVectorSubtract(SphereCenter, BoxMax); + + // Choose value for each dimension based on the comparison. 
+ d = XMVectorSelect(d, MinDelta, LessThanMin); + d = XMVectorSelect(d, MaxDelta, GreaterThanMax); + + // Use a dot-product to square them and sum them together. + XMVECTOR d2 = XMVector3Dot(d, d); + + return XMVector3LessOrEqual(d2, XMVectorMultiply(SphereRadius, SphereRadius)); +} + + +//----------------------------------------------------------------------------- +// Axis-aligned box vs. axis-aligned box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingBox::Intersects(const BoundingBox& box) const noexcept +{ + XMVECTOR CenterA = XMLoadFloat3(&Center); + XMVECTOR ExtentsA = XMLoadFloat3(&Extents); + + XMVECTOR CenterB = XMLoadFloat3(&box.Center); + XMVECTOR ExtentsB = XMLoadFloat3(&box.Extents); + + XMVECTOR MinA = XMVectorSubtract(CenterA, ExtentsA); + XMVECTOR MaxA = XMVectorAdd(CenterA, ExtentsA); + + XMVECTOR MinB = XMVectorSubtract(CenterB, ExtentsB); + XMVECTOR MaxB = XMVectorAdd(CenterB, ExtentsB); + + // for each i in (x, y, z) if a_min(i) > b_max(i) or b_min(i) > a_max(i) then return false + XMVECTOR Disjoint = XMVectorOrInt(XMVectorGreater(MinA, MaxB), XMVectorGreater(MinB, MaxA)); + + return !DirectX::Internal::XMVector3AnyTrue(Disjoint); +} + + +//----------------------------------------------------------------------------- +// Oriented box vs. axis-aligned box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingBox::Intersects(const BoundingOrientedBox& box) const noexcept +{ + return box.Intersects(*this); +} + + +//----------------------------------------------------------------------------- +// Frustum vs. axis-aligned box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingBox::Intersects(const BoundingFrustum& fr) const noexcept +{ + return fr.Intersects(*this); +} + + +//----------------------------------------------------------------------------- +// Triangle vs. axis aligned box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool XM_CALLCONV BoundingBox::Intersects(FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2) const noexcept +{ + XMVECTOR Zero = XMVectorZero(); + + // Load the box. + XMVECTOR vCenter = XMLoadFloat3(&Center); + XMVECTOR vExtents = XMLoadFloat3(&Extents); + + XMVECTOR BoxMin = XMVectorSubtract(vCenter, vExtents); + XMVECTOR BoxMax = XMVectorAdd(vCenter, vExtents); + + // Test the axes of the box (in effect test the AAB against the minimal AAB + // around the triangle). + XMVECTOR TriMin = XMVectorMin(XMVectorMin(V0, V1), V2); + XMVECTOR TriMax = XMVectorMax(XMVectorMax(V0, V1), V2); + + // for each i in (x, y, z) if a_min(i) > b_max(i) or b_min(i) > a_max(i) then disjoint + XMVECTOR Disjoint = XMVectorOrInt(XMVectorGreater(TriMin, BoxMax), XMVectorGreater(BoxMin, TriMax)); + if (DirectX::Internal::XMVector3AnyTrue(Disjoint)) + return false; + + // Test the plane of the triangle. + XMVECTOR Normal = XMVector3Cross(XMVectorSubtract(V1, V0), XMVectorSubtract(V2, V0)); + XMVECTOR Dist = XMVector3Dot(Normal, V0); + + // Assert that the triangle is not degenerate. 
+ assert(!XMVector3Equal(Normal, Zero));
+
+ // for each i in (x, y, z) if n(i) >= 0 then v_min(i)=b_min(i), v_max(i)=b_max(i)
+ // else v_min(i)=b_max(i), v_max(i)=b_min(i)
+ XMVECTOR NormalSelect = XMVectorGreater(Normal, Zero);
+ XMVECTOR V_Min = XMVectorSelect(BoxMax, BoxMin, NormalSelect);
+ XMVECTOR V_Max = XMVectorSelect(BoxMin, BoxMax, NormalSelect);
+
+ // if n dot v_min + d > 0 || n dot v_max + d < 0 then disjoint
+ XMVECTOR MinDist = XMVector3Dot(V_Min, Normal);
+ XMVECTOR MaxDist = XMVector3Dot(V_Max, Normal);
+
+ XMVECTOR NoIntersection = XMVectorGreater(MinDist, Dist);
+ NoIntersection = XMVectorOrInt(NoIntersection, XMVectorLess(MaxDist, Dist));
+
+ // Move the box center to zero to simplify the following tests.
+ XMVECTOR TV0 = XMVectorSubtract(V0, vCenter);
+ XMVECTOR TV1 = XMVectorSubtract(V1, vCenter);
+ XMVECTOR TV2 = XMVectorSubtract(V2, vCenter);
+
+ // Test the edge/edge axes (3*3).
+ XMVECTOR e0 = XMVectorSubtract(TV1, TV0);
+ XMVECTOR e1 = XMVectorSubtract(TV2, TV1);
+ XMVECTOR e2 = XMVectorSubtract(TV0, TV2);
+
+ // Make w zero.
+ e0 = XMVectorInsert<0, 0, 0, 0, 1>(e0, Zero);
+ e1 = XMVectorInsert<0, 0, 0, 0, 1>(e1, Zero);
+ e2 = XMVectorInsert<0, 0, 0, 0, 1>(e2, Zero);
+
+ XMVECTOR Axis;
+ XMVECTOR p0, p1, p2;
+ XMVECTOR Min, Max;
+ XMVECTOR Radius;
+
+ // Axis == (1,0,0) x e0 = (0, -e0.z, e0.y)
+ Axis = XMVectorPermute<XM_PERMUTE_0W, XM_PERMUTE_1Z, XM_PERMUTE_0Y, XM_PERMUTE_0X>(e0, XMVectorNegate(e0));
+ p0 = XMVector3Dot(TV0, Axis);
+ // p1 = XMVector3Dot( V1, Axis ); // p1 = p0;
+ p2 = XMVector3Dot(TV2, Axis);
+ Min = XMVectorMin(p0, p2);
+ Max = XMVectorMax(p0, p2);
+ Radius = XMVector3Dot(vExtents, XMVectorAbs(Axis));
+ NoIntersection = XMVectorOrInt(NoIntersection, XMVectorGreater(Min, Radius));
+ NoIntersection = XMVectorOrInt(NoIntersection, XMVectorLess(Max, XMVectorNegate(Radius)));
+
+ // Axis == (1,0,0) x e1 = (0, -e1.z, e1.y)
+ Axis = XMVectorPermute<XM_PERMUTE_0W, XM_PERMUTE_1Z, XM_PERMUTE_0Y, XM_PERMUTE_0X>(e1, XMVectorNegate(e1));
+ p0 = XMVector3Dot(TV0, Axis);
+ p1 = XMVector3Dot(TV1, Axis);
+ // p2 = XMVector3Dot( V2, Axis ); // p2 = p1;
+ Min = XMVectorMin(p0, p1);
+ Max = XMVectorMax(p0, p1);
+ Radius = XMVector3Dot(vExtents, XMVectorAbs(Axis));
+ NoIntersection = XMVectorOrInt(NoIntersection, XMVectorGreater(Min, Radius));
+ NoIntersection = XMVectorOrInt(NoIntersection, XMVectorLess(Max, XMVectorNegate(Radius)));
+
+ // Axis == (1,0,0) x e2 = (0, -e2.z, e2.y)
+ Axis = XMVectorPermute<XM_PERMUTE_0W, XM_PERMUTE_1Z, XM_PERMUTE_0Y, XM_PERMUTE_0X>(e2, XMVectorNegate(e2));
+ p0 = XMVector3Dot(TV0, Axis);
+ p1 = XMVector3Dot(TV1, Axis);
+ // p2 = XMVector3Dot( V2, Axis ); // p2 = p0;
+ Min = XMVectorMin(p0, p1);
+ Max = XMVectorMax(p0, p1);
+ Radius = XMVector3Dot(vExtents, XMVectorAbs(Axis));
+ NoIntersection = XMVectorOrInt(NoIntersection, XMVectorGreater(Min, Radius));
+ NoIntersection = XMVectorOrInt(NoIntersection, XMVectorLess(Max, XMVectorNegate(Radius)));
+
+ // Axis == (0,1,0) x e0 = (e0.z, 0, -e0.x)
+ Axis = XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_0W, XM_PERMUTE_1X, XM_PERMUTE_0Y>(e0, XMVectorNegate(e0));
+ p0 = XMVector3Dot(TV0, Axis);
+ // p1 = XMVector3Dot( V1, Axis ); // p1 = p0;
+ p2 = XMVector3Dot(TV2, Axis);
+ Min = XMVectorMin(p0, p2);
+ Max = XMVectorMax(p0, p2);
+ Radius = XMVector3Dot(vExtents, XMVectorAbs(Axis));
+ NoIntersection = XMVectorOrInt(NoIntersection, XMVectorGreater(Min, Radius));
+ NoIntersection = XMVectorOrInt(NoIntersection, XMVectorLess(Max, XMVectorNegate(Radius)));
+
+ // Axis == (0,1,0) x e1 = (e1.z, 0, -e1.x)
+ Axis = XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_0W, XM_PERMUTE_1X, XM_PERMUTE_0Y>(e1, XMVectorNegate(e1));
+ p0 = XMVector3Dot(TV0, Axis);
+ p1 = XMVector3Dot(TV1, Axis);
+ // p2 = XMVector3Dot( V2, Axis ); // p2 = p1;
+ Min = XMVectorMin(p0, p1);
+ Max = XMVectorMax(p0, p1);
+ Radius =
XMVector3Dot(vExtents, XMVectorAbs(Axis));
+ NoIntersection = XMVectorOrInt(NoIntersection, XMVectorGreater(Min, Radius));
+ NoIntersection = XMVectorOrInt(NoIntersection, XMVectorLess(Max, XMVectorNegate(Radius)));
+
+ // Axis == (0,1,0) x e2 = (e2.z, 0, -e2.x)
+ Axis = XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_0W, XM_PERMUTE_1X, XM_PERMUTE_0Y>(e2, XMVectorNegate(e2));
+ p0 = XMVector3Dot(TV0, Axis);
+ p1 = XMVector3Dot(TV1, Axis);
+ // p2 = XMVector3Dot( V2, Axis ); // p2 = p0;
+ Min = XMVectorMin(p0, p1);
+ Max = XMVectorMax(p0, p1);
+ Radius = XMVector3Dot(vExtents, XMVectorAbs(Axis));
+ NoIntersection = XMVectorOrInt(NoIntersection, XMVectorGreater(Min, Radius));
+ NoIntersection = XMVectorOrInt(NoIntersection, XMVectorLess(Max, XMVectorNegate(Radius)));
+
+ // Axis == (0,0,1) x e0 = (-e0.y, e0.x, 0)
+ Axis = XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0X, XM_PERMUTE_0W, XM_PERMUTE_0Z>(e0, XMVectorNegate(e0));
+ p0 = XMVector3Dot(TV0, Axis);
+ // p1 = XMVector3Dot( V1, Axis ); // p1 = p0;
+ p2 = XMVector3Dot(TV2, Axis);
+ Min = XMVectorMin(p0, p2);
+ Max = XMVectorMax(p0, p2);
+ Radius = XMVector3Dot(vExtents, XMVectorAbs(Axis));
+ NoIntersection = XMVectorOrInt(NoIntersection, XMVectorGreater(Min, Radius));
+ NoIntersection = XMVectorOrInt(NoIntersection, XMVectorLess(Max, XMVectorNegate(Radius)));
+
+ // Axis == (0,0,1) x e1 = (-e1.y, e1.x, 0)
+ Axis = XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0X, XM_PERMUTE_0W, XM_PERMUTE_0Z>(e1, XMVectorNegate(e1));
+ p0 = XMVector3Dot(TV0, Axis);
+ p1 = XMVector3Dot(TV1, Axis);
+ // p2 = XMVector3Dot( V2, Axis ); // p2 = p1;
+ Min = XMVectorMin(p0, p1);
+ Max = XMVectorMax(p0, p1);
+ Radius = XMVector3Dot(vExtents, XMVectorAbs(Axis));
+ NoIntersection = XMVectorOrInt(NoIntersection, XMVectorGreater(Min, Radius));
+ NoIntersection = XMVectorOrInt(NoIntersection, XMVectorLess(Max, XMVectorNegate(Radius)));
+
+ // Axis == (0,0,1) x e2 = (-e2.y, e2.x, 0)
+ Axis = XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0X, XM_PERMUTE_0W, XM_PERMUTE_0Z>(e2, XMVectorNegate(e2));
+ p0 = XMVector3Dot(TV0, Axis);
+ p1 = XMVector3Dot(TV1, Axis);
+ // p2 = XMVector3Dot( V2, Axis ); // p2 = p0;
+ Min = XMVectorMin(p0, p1);
+ Max = XMVectorMax(p0, p1);
+ Radius = XMVector3Dot(vExtents, XMVectorAbs(Axis));
+ NoIntersection = XMVectorOrInt(NoIntersection, XMVectorGreater(Min, Radius));
+ NoIntersection = XMVectorOrInt(NoIntersection, XMVectorLess(Max, XMVectorNegate(Radius)));
+
+ return XMVector4NotEqualInt(NoIntersection, XMVectorTrueInt());
+}
+
+
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline PlaneIntersectionType XM_CALLCONV BoundingBox::Intersects(FXMVECTOR Plane) const noexcept
+{
+ assert(DirectX::Internal::XMPlaneIsUnit(Plane));
+
+ // Load the box.
+ XMVECTOR vCenter = XMLoadFloat3(&Center);
+ XMVECTOR vExtents = XMLoadFloat3(&Extents);
+
+ // Set w of the center to one so we can dot4 with a plane.
+ vCenter = XMVectorInsert<0, 0, 0, 0, 1>(vCenter, XMVectorSplatOne());
+
+ XMVECTOR Outside, Inside;
+ DirectX::Internal::FastIntersectAxisAlignedBoxPlane(vCenter, vExtents, Plane, Outside, Inside);
+
+ // If the box is outside any plane it is outside.
+ if (XMVector4EqualInt(Outside, XMVectorTrueInt()))
+ return FRONT;
+
+ // If the box is inside all planes it is inside.
+ if (XMVector4EqualInt(Inside, XMVectorTrueInt()))
+ return BACK;
+
+ // The box is not inside all planes or outside a plane it intersects.
+ return INTERSECTING;
+}
+
+
+//-----------------------------------------------------------------------------
+// Compute the intersection of a ray (Origin, Direction) with an axis aligned
+// box using the slabs method.
+//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool XM_CALLCONV BoundingBox::Intersects(FXMVECTOR Origin, FXMVECTOR Direction, float& Dist) const noexcept +{ + assert(DirectX::Internal::XMVector3IsUnit(Direction)); + + // Load the box. + XMVECTOR vCenter = XMLoadFloat3(&Center); + XMVECTOR vExtents = XMLoadFloat3(&Extents); + + // Adjust ray origin to be relative to center of the box. + XMVECTOR TOrigin = XMVectorSubtract(vCenter, Origin); + + // Compute the dot product againt each axis of the box. + // Since the axii are (1,0,0), (0,1,0), (0,0,1) no computation is necessary. + XMVECTOR AxisDotOrigin = TOrigin; + XMVECTOR AxisDotDirection = Direction; + + // if (fabs(AxisDotDirection) <= Epsilon) the ray is nearly parallel to the slab. + XMVECTOR IsParallel = XMVectorLessOrEqual(XMVectorAbs(AxisDotDirection), g_RayEpsilon); + + // Test against all three axii simultaneously. + XMVECTOR InverseAxisDotDirection = XMVectorReciprocal(AxisDotDirection); + XMVECTOR t1 = XMVectorMultiply(XMVectorSubtract(AxisDotOrigin, vExtents), InverseAxisDotDirection); + XMVECTOR t2 = XMVectorMultiply(XMVectorAdd(AxisDotOrigin, vExtents), InverseAxisDotDirection); + + // Compute the max of min(t1,t2) and the min of max(t1,t2) ensuring we don't + // use the results from any directions parallel to the slab. + XMVECTOR t_min = XMVectorSelect(XMVectorMin(t1, t2), g_FltMin, IsParallel); + XMVECTOR t_max = XMVectorSelect(XMVectorMax(t1, t2), g_FltMax, IsParallel); + + // t_min.x = maximum( t_min.x, t_min.y, t_min.z ); + // t_max.x = minimum( t_max.x, t_max.y, t_max.z ); + t_min = XMVectorMax(t_min, XMVectorSplatY(t_min)); // x = max(x,y) + t_min = XMVectorMax(t_min, XMVectorSplatZ(t_min)); // x = max(max(x,y),z) + t_max = XMVectorMin(t_max, XMVectorSplatY(t_max)); // x = min(x,y) + t_max = XMVectorMin(t_max, XMVectorSplatZ(t_max)); // x = min(min(x,y),z) + + // if ( t_min > t_max ) return false; + XMVECTOR NoIntersection = XMVectorGreater(XMVectorSplatX(t_min), XMVectorSplatX(t_max)); + + // if ( t_max < 0.0f ) return false; + NoIntersection = XMVectorOrInt(NoIntersection, XMVectorLess(XMVectorSplatX(t_max), XMVectorZero())); + + // if (IsParallel && (-Extents > AxisDotOrigin || Extents < AxisDotOrigin)) return false; + XMVECTOR ParallelOverlap = XMVectorInBounds(AxisDotOrigin, vExtents); + NoIntersection = XMVectorOrInt(NoIntersection, XMVectorAndCInt(IsParallel, ParallelOverlap)); + + if (!DirectX::Internal::XMVector3AnyTrue(NoIntersection)) + { + // Store the x-component to *pDist + XMStoreFloat(&Dist, t_min); + return true; + } + + Dist = 0.f; + return false; +} + + +//----------------------------------------------------------------------------- +// Test an axis alinged box vs 6 planes (typically forming a frustum). +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType XM_CALLCONV BoundingBox::ContainedBy( + FXMVECTOR Plane0, FXMVECTOR Plane1, FXMVECTOR Plane2, + GXMVECTOR Plane3, + HXMVECTOR Plane4, HXMVECTOR Plane5) const noexcept +{ + // Load the box. + XMVECTOR vCenter = XMLoadFloat3(&Center); + XMVECTOR vExtents = XMLoadFloat3(&Extents); + + // Set w of the center to one so we can dot4 with a plane. + vCenter = XMVectorInsert<0, 0, 0, 0, 1>(vCenter, XMVectorSplatOne()); + + XMVECTOR Outside, Inside; + + // Test against each plane. 
+ DirectX::Internal::FastIntersectAxisAlignedBoxPlane(vCenter, vExtents, Plane0, Outside, Inside); + + XMVECTOR AnyOutside = Outside; + XMVECTOR AllInside = Inside; + + DirectX::Internal::FastIntersectAxisAlignedBoxPlane(vCenter, vExtents, Plane1, Outside, Inside); + AnyOutside = XMVectorOrInt(AnyOutside, Outside); + AllInside = XMVectorAndInt(AllInside, Inside); + + DirectX::Internal::FastIntersectAxisAlignedBoxPlane(vCenter, vExtents, Plane2, Outside, Inside); + AnyOutside = XMVectorOrInt(AnyOutside, Outside); + AllInside = XMVectorAndInt(AllInside, Inside); + + DirectX::Internal::FastIntersectAxisAlignedBoxPlane(vCenter, vExtents, Plane3, Outside, Inside); + AnyOutside = XMVectorOrInt(AnyOutside, Outside); + AllInside = XMVectorAndInt(AllInside, Inside); + + DirectX::Internal::FastIntersectAxisAlignedBoxPlane(vCenter, vExtents, Plane4, Outside, Inside); + AnyOutside = XMVectorOrInt(AnyOutside, Outside); + AllInside = XMVectorAndInt(AllInside, Inside); + + DirectX::Internal::FastIntersectAxisAlignedBoxPlane(vCenter, vExtents, Plane5, Outside, Inside); + AnyOutside = XMVectorOrInt(AnyOutside, Outside); + AllInside = XMVectorAndInt(AllInside, Inside); + + // If the box is outside any plane it is outside. + if (XMVector4EqualInt(AnyOutside, XMVectorTrueInt())) + return DISJOINT; + + // If the box is inside all planes it is inside. + if (XMVector4EqualInt(AllInside, XMVectorTrueInt())) + return CONTAINS; + + // The box is not inside all planes or outside a plane, it may intersect. + return INTERSECTS; +} + + +//----------------------------------------------------------------------------- +// Create axis-aligned box that contains two other bounding boxes +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline void BoundingBox::CreateMerged(BoundingBox& Out, const BoundingBox& b1, const BoundingBox& b2) noexcept +{ + XMVECTOR b1Center = XMLoadFloat3(&b1.Center); + XMVECTOR b1Extents = XMLoadFloat3(&b1.Extents); + + XMVECTOR b2Center = XMLoadFloat3(&b2.Center); + XMVECTOR b2Extents = XMLoadFloat3(&b2.Extents); + + XMVECTOR Min = XMVectorSubtract(b1Center, b1Extents); + Min = XMVectorMin(Min, XMVectorSubtract(b2Center, b2Extents)); + + XMVECTOR Max = XMVectorAdd(b1Center, b1Extents); + Max = XMVectorMax(Max, XMVectorAdd(b2Center, b2Extents)); + + assert(XMVector3LessOrEqual(Min, Max)); + + XMStoreFloat3(&Out.Center, XMVectorScale(XMVectorAdd(Min, Max), 0.5f)); + XMStoreFloat3(&Out.Extents, XMVectorScale(XMVectorSubtract(Max, Min), 0.5f)); +} + + +//----------------------------------------------------------------------------- +// Create axis-aligned box that contains a bounding sphere +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline void BoundingBox::CreateFromSphere(BoundingBox& Out, const BoundingSphere& sh) noexcept +{ + XMVECTOR spCenter = XMLoadFloat3(&sh.Center); + XMVECTOR shRadius = XMVectorReplicatePtr(&sh.Radius); + + XMVECTOR Min = XMVectorSubtract(spCenter, shRadius); + XMVECTOR Max = XMVectorAdd(spCenter, shRadius); + + assert(XMVector3LessOrEqual(Min, Max)); + + XMStoreFloat3(&Out.Center, XMVectorScale(XMVectorAdd(Min, Max), 0.5f)); + XMStoreFloat3(&Out.Extents, XMVectorScale(XMVectorSubtract(Max, Min), 0.5f)); +} + + +//----------------------------------------------------------------------------- +// Create axis-aligned box from min/max points +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline void 
XM_CALLCONV BoundingBox::CreateFromPoints(BoundingBox& Out, FXMVECTOR pt1, FXMVECTOR pt2) noexcept
+{
+ XMVECTOR Min = XMVectorMin(pt1, pt2);
+ XMVECTOR Max = XMVectorMax(pt1, pt2);
+
+ // Store center and extents.
+ XMStoreFloat3(&Out.Center, XMVectorScale(XMVectorAdd(Min, Max), 0.5f));
+ XMStoreFloat3(&Out.Extents, XMVectorScale(XMVectorSubtract(Max, Min), 0.5f));
+}
+
+
+//-----------------------------------------------------------------------------
+// Find the minimum axis aligned bounding box containing a set of points.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void BoundingBox::CreateFromPoints(BoundingBox& Out, size_t Count, const XMFLOAT3* pPoints, size_t Stride) noexcept
+{
+ assert(Count > 0);
+ assert(pPoints);
+
+ // Find the minimum and maximum x, y, and z
+ XMVECTOR vMin, vMax;
+
+ vMin = vMax = XMLoadFloat3(pPoints);
+
+ for (size_t i = 1; i < Count; ++i)
+ {
+ XMVECTOR Point = XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(reinterpret_cast<const uint8_t*>(pPoints) + i * Stride));
+
+ vMin = XMVectorMin(vMin, Point);
+ vMax = XMVectorMax(vMax, Point);
+ }
+
+ // Store center and extents.
+ XMStoreFloat3(&Out.Center, XMVectorScale(XMVectorAdd(vMin, vMax), 0.5f));
+ XMStoreFloat3(&Out.Extents, XMVectorScale(XMVectorSubtract(vMax, vMin), 0.5f));
+}
+
+
+/****************************************************************************
+ *
+ * BoundingOrientedBox
+ *
+ ****************************************************************************/
+
+ //-----------------------------------------------------------------------------
+ // Transform an oriented box by an angle preserving transform.
+ //-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV BoundingOrientedBox::Transform(BoundingOrientedBox& Out, FXMMATRIX M) const noexcept
+{
+ // Load the box.
+ XMVECTOR vCenter = XMLoadFloat3(&Center);
+ XMVECTOR vExtents = XMLoadFloat3(&Extents);
+ XMVECTOR vOrientation = XMLoadFloat4(&Orientation);
+
+ assert(DirectX::Internal::XMQuaternionIsUnit(vOrientation));
+
+ // Composite the box rotation and the transform rotation.
+ XMMATRIX nM;
+ nM.r[0] = XMVector3Normalize(M.r[0]);
+ nM.r[1] = XMVector3Normalize(M.r[1]);
+ nM.r[2] = XMVector3Normalize(M.r[2]);
+ nM.r[3] = g_XMIdentityR3;
+ XMVECTOR Rotation = XMQuaternionRotationMatrix(nM);
+ vOrientation = XMQuaternionMultiply(vOrientation, Rotation);
+
+ // Transform the center.
+ vCenter = XMVector3Transform(vCenter, M);
+
+ // Scale the box extents.
+ XMVECTOR dX = XMVector3Length(M.r[0]);
+ XMVECTOR dY = XMVector3Length(M.r[1]);
+ XMVECTOR dZ = XMVector3Length(M.r[2]);
+
+ XMVECTOR VectorScale = XMVectorSelect(dY, dX, g_XMSelect1000);
+ VectorScale = XMVectorSelect(dZ, VectorScale, g_XMSelect1100);
+ vExtents = XMVectorMultiply(vExtents, VectorScale);
+
+ // Store the box.
+ XMStoreFloat3(&Out.Center, vCenter);
+ XMStoreFloat3(&Out.Extents, vExtents);
+ XMStoreFloat4(&Out.Orientation, vOrientation);
+}
+
+_Use_decl_annotations_
+inline void XM_CALLCONV BoundingOrientedBox::Transform(BoundingOrientedBox& Out, float Scale, FXMVECTOR Rotation, FXMVECTOR Translation) const noexcept
+{
+ assert(DirectX::Internal::XMQuaternionIsUnit(Rotation));
+
+ // Load the box.
+ XMVECTOR vCenter = XMLoadFloat3(&Center);
+ XMVECTOR vExtents = XMLoadFloat3(&Extents);
+ XMVECTOR vOrientation = XMLoadFloat4(&Orientation);
+
+ assert(DirectX::Internal::XMQuaternionIsUnit(vOrientation));
+
+ // Composite the box rotation and the transform rotation.
+ vOrientation = XMQuaternionMultiply(vOrientation, Rotation); + + // Transform the center. + XMVECTOR VectorScale = XMVectorReplicate(Scale); + vCenter = XMVectorAdd(XMVector3Rotate(XMVectorMultiply(vCenter, VectorScale), Rotation), Translation); + + // Scale the box extents. + vExtents = XMVectorMultiply(vExtents, VectorScale); + + // Store the box. + XMStoreFloat3(&Out.Center, vCenter); + XMStoreFloat3(&Out.Extents, vExtents); + XMStoreFloat4(&Out.Orientation, vOrientation); +} + + +//----------------------------------------------------------------------------- +// Get the corner points of the box +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline void BoundingOrientedBox::GetCorners(XMFLOAT3* Corners) const noexcept +{ + assert(Corners != nullptr); + + // Load the box + XMVECTOR vCenter = XMLoadFloat3(&Center); + XMVECTOR vExtents = XMLoadFloat3(&Extents); + XMVECTOR vOrientation = XMLoadFloat4(&Orientation); + + assert(DirectX::Internal::XMQuaternionIsUnit(vOrientation)); + + for (size_t i = 0; i < CORNER_COUNT; ++i) + { + XMVECTOR C = XMVectorAdd(XMVector3Rotate(XMVectorMultiply(vExtents, g_BoxOffset[i]), vOrientation), vCenter); + XMStoreFloat3(&Corners[i], C); + } +} + + +//----------------------------------------------------------------------------- +// Point in oriented box test. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType XM_CALLCONV BoundingOrientedBox::Contains(FXMVECTOR Point) const noexcept +{ + XMVECTOR vCenter = XMLoadFloat3(&Center); + XMVECTOR vExtents = XMLoadFloat3(&Extents); + XMVECTOR vOrientation = XMLoadFloat4(&Orientation); + + // Transform the point to be local to the box. + XMVECTOR TPoint = XMVector3InverseRotate(XMVectorSubtract(Point, vCenter), vOrientation); + + return XMVector3InBounds(TPoint, vExtents) ? CONTAINS : DISJOINT; +} + + +//----------------------------------------------------------------------------- +// Triangle in oriented bounding box +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType XM_CALLCONV BoundingOrientedBox::Contains(FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2) const noexcept +{ + // Load the box center & orientation. + XMVECTOR vCenter = XMLoadFloat3(&Center); + XMVECTOR vOrientation = XMLoadFloat4(&Orientation); + + // Transform the triangle vertices into the space of the box. + XMVECTOR TV0 = XMVector3InverseRotate(XMVectorSubtract(V0, vCenter), vOrientation); + XMVECTOR TV1 = XMVector3InverseRotate(XMVectorSubtract(V1, vCenter), vOrientation); + XMVECTOR TV2 = XMVector3InverseRotate(XMVectorSubtract(V2, vCenter), vOrientation); + + BoundingBox box; + box.Center = XMFLOAT3(0.0f, 0.0f, 0.0f); + box.Extents = Extents; + + // Use the triangle vs axis aligned box intersection routine. 
+ return box.Contains(TV0, TV1, TV2); +} + + +//----------------------------------------------------------------------------- +// Sphere in oriented bounding box +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingOrientedBox::Contains(const BoundingSphere& sh) const noexcept +{ + XMVECTOR SphereCenter = XMLoadFloat3(&sh.Center); + XMVECTOR SphereRadius = XMVectorReplicatePtr(&sh.Radius); + + XMVECTOR BoxCenter = XMLoadFloat3(&Center); + XMVECTOR BoxExtents = XMLoadFloat3(&Extents); + XMVECTOR BoxOrientation = XMLoadFloat4(&Orientation); + + assert(DirectX::Internal::XMQuaternionIsUnit(BoxOrientation)); + + // Transform the center of the sphere to be local to the box. + // BoxMin = -BoxExtents + // BoxMax = +BoxExtents + SphereCenter = XMVector3InverseRotate(XMVectorSubtract(SphereCenter, BoxCenter), BoxOrientation); + + // Find the distance to the nearest point on the box. + // for each i in (x, y, z) + // if (SphereCenter(i) < BoxMin(i)) d2 += (SphereCenter(i) - BoxMin(i)) ^ 2 + // else if (SphereCenter(i) > BoxMax(i)) d2 += (SphereCenter(i) - BoxMax(i)) ^ 2 + + XMVECTOR d = XMVectorZero(); + + // Compute d for each dimension. + XMVECTOR LessThanMin = XMVectorLess(SphereCenter, XMVectorNegate(BoxExtents)); + XMVECTOR GreaterThanMax = XMVectorGreater(SphereCenter, BoxExtents); + + XMVECTOR MinDelta = XMVectorAdd(SphereCenter, BoxExtents); + XMVECTOR MaxDelta = XMVectorSubtract(SphereCenter, BoxExtents); + + // Choose value for each dimension based on the comparison. + d = XMVectorSelect(d, MinDelta, LessThanMin); + d = XMVectorSelect(d, MaxDelta, GreaterThanMax); + + // Use a dot-product to square them and sum them together. + XMVECTOR d2 = XMVector3Dot(d, d); + XMVECTOR SphereRadiusSq = XMVectorMultiply(SphereRadius, SphereRadius); + + if (XMVector4Greater(d2, SphereRadiusSq)) + return DISJOINT; + + // See if we are completely inside the box + XMVECTOR SMin = XMVectorSubtract(SphereCenter, SphereRadius); + XMVECTOR SMax = XMVectorAdd(SphereCenter, SphereRadius); + + return (XMVector3InBounds(SMin, BoxExtents) && XMVector3InBounds(SMax, BoxExtents)) ? CONTAINS : INTERSECTS; +} + + +//----------------------------------------------------------------------------- +// Axis aligned box vs. oriented box. Constructs an oriented box and uses +// the oriented box vs. oriented box test. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingOrientedBox::Contains(const BoundingBox& box) const noexcept +{ + // Make the axis aligned box oriented and do an OBB vs OBB test. 
+ BoundingOrientedBox obox(box.Center, box.Extents, XMFLOAT4(0.f, 0.f, 0.f, 1.f)); + return Contains(obox); +} + + +//----------------------------------------------------------------------------- +// Oriented bounding box in oriented bounding box +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingOrientedBox::Contains(const BoundingOrientedBox& box) const noexcept +{ + if (!Intersects(box)) + return DISJOINT; + + // Load the boxes + XMVECTOR aCenter = XMLoadFloat3(&Center); + XMVECTOR aExtents = XMLoadFloat3(&Extents); + XMVECTOR aOrientation = XMLoadFloat4(&Orientation); + + assert(DirectX::Internal::XMQuaternionIsUnit(aOrientation)); + + XMVECTOR bCenter = XMLoadFloat3(&box.Center); + XMVECTOR bExtents = XMLoadFloat3(&box.Extents); + XMVECTOR bOrientation = XMLoadFloat4(&box.Orientation); + + assert(DirectX::Internal::XMQuaternionIsUnit(bOrientation)); + + XMVECTOR offset = XMVectorSubtract(bCenter, aCenter); + + for (size_t i = 0; i < CORNER_COUNT; ++i) + { + // Cb = rotate( bExtents * corneroffset[i], bOrientation ) + bcenter + // Ca = invrotate( Cb - aCenter, aOrientation ) + + XMVECTOR C = XMVectorAdd(XMVector3Rotate(XMVectorMultiply(bExtents, g_BoxOffset[i]), bOrientation), offset); + C = XMVector3InverseRotate(C, aOrientation); + + if (!XMVector3InBounds(C, aExtents)) + return INTERSECTS; + } + + return CONTAINS; +} + + +//----------------------------------------------------------------------------- +// Frustum in oriented bounding box +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingOrientedBox::Contains(const BoundingFrustum& fr) const noexcept +{ + if (!fr.Intersects(*this)) + return DISJOINT; + + XMFLOAT3 Corners[BoundingFrustum::CORNER_COUNT]; + fr.GetCorners(Corners); + + // Load the box + XMVECTOR vCenter = XMLoadFloat3(&Center); + XMVECTOR vExtents = XMLoadFloat3(&Extents); + XMVECTOR vOrientation = XMLoadFloat4(&Orientation); + + assert(DirectX::Internal::XMQuaternionIsUnit(vOrientation)); + + for (size_t i = 0; i < BoundingFrustum::CORNER_COUNT; ++i) + { + XMVECTOR C = XMVector3InverseRotate(XMVectorSubtract(XMLoadFloat3(&Corners[i]), vCenter), vOrientation); + + if (!XMVector3InBounds(C, vExtents)) + return INTERSECTS; + } + + return CONTAINS; +} + + +//----------------------------------------------------------------------------- +// Sphere vs. oriented box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingOrientedBox::Intersects(const BoundingSphere& sh) const noexcept +{ + XMVECTOR SphereCenter = XMLoadFloat3(&sh.Center); + XMVECTOR SphereRadius = XMVectorReplicatePtr(&sh.Radius); + + XMVECTOR BoxCenter = XMLoadFloat3(&Center); + XMVECTOR BoxExtents = XMLoadFloat3(&Extents); + XMVECTOR BoxOrientation = XMLoadFloat4(&Orientation); + + assert(DirectX::Internal::XMQuaternionIsUnit(BoxOrientation)); + + // Transform the center of the sphere to be local to the box. + // BoxMin = -BoxExtents + // BoxMax = +BoxExtents + SphereCenter = XMVector3InverseRotate(XMVectorSubtract(SphereCenter, BoxCenter), BoxOrientation); + + // Find the distance to the nearest point on the box. 
+ // for each i in (x, y, z) + // if (SphereCenter(i) < BoxMin(i)) d2 += (SphereCenter(i) - BoxMin(i)) ^ 2 + // else if (SphereCenter(i) > BoxMax(i)) d2 += (SphereCenter(i) - BoxMax(i)) ^ 2 + + XMVECTOR d = XMVectorZero(); + + // Compute d for each dimension. + XMVECTOR LessThanMin = XMVectorLess(SphereCenter, XMVectorNegate(BoxExtents)); + XMVECTOR GreaterThanMax = XMVectorGreater(SphereCenter, BoxExtents); + + XMVECTOR MinDelta = XMVectorAdd(SphereCenter, BoxExtents); + XMVECTOR MaxDelta = XMVectorSubtract(SphereCenter, BoxExtents); + + // Choose value for each dimension based on the comparison. + d = XMVectorSelect(d, MinDelta, LessThanMin); + d = XMVectorSelect(d, MaxDelta, GreaterThanMax); + + // Use a dot-product to square them and sum them together. + XMVECTOR d2 = XMVector3Dot(d, d); + + return XMVector4LessOrEqual(d2, XMVectorMultiply(SphereRadius, SphereRadius)) ? true : false; +} + + +//----------------------------------------------------------------------------- +// Axis aligned box vs. oriented box. Constructs an oriented box and uses +// the oriented box vs. oriented box test. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingOrientedBox::Intersects(const BoundingBox& box) const noexcept +{ + // Make the axis aligned box oriented and do an OBB vs OBB test. + BoundingOrientedBox obox(box.Center, box.Extents, XMFLOAT4(0.f, 0.f, 0.f, 1.f)); + return Intersects(obox); +} + + +//----------------------------------------------------------------------------- +// Fast oriented box / oriented box intersection test using the separating axis +// theorem. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingOrientedBox::Intersects(const BoundingOrientedBox& box) const noexcept +{ + // Build the 3x3 rotation matrix that defines the orientation of B relative to A. + XMVECTOR A_quat = XMLoadFloat4(&Orientation); + XMVECTOR B_quat = XMLoadFloat4(&box.Orientation); + + assert(DirectX::Internal::XMQuaternionIsUnit(A_quat)); + assert(DirectX::Internal::XMQuaternionIsUnit(B_quat)); + + XMVECTOR Q = XMQuaternionMultiply(A_quat, XMQuaternionConjugate(B_quat)); + XMMATRIX R = XMMatrixRotationQuaternion(Q); + + // Compute the translation of B relative to A. + XMVECTOR A_cent = XMLoadFloat3(&Center); + XMVECTOR B_cent = XMLoadFloat3(&box.Center); + XMVECTOR t = XMVector3InverseRotate(XMVectorSubtract(B_cent, A_cent), A_quat); + + // + // h(A) = extents of A. + // h(B) = extents of B. + // + // a(u) = axes of A = (1,0,0), (0,1,0), (0,0,1) + // b(u) = axes of B relative to A = (r00,r10,r20), (r01,r11,r21), (r02,r12,r22) + // + // For each possible separating axis l: + // d(A) = sum (for i = u,v,w) h(A)(i) * abs( a(i) dot l ) + // d(B) = sum (for i = u,v,w) h(B)(i) * abs( b(i) dot l ) + // if abs( t dot l ) > d(A) + d(B) then disjoint + // + + // Load extents of A and B. + XMVECTOR h_A = XMLoadFloat3(&Extents); + XMVECTOR h_B = XMLoadFloat3(&box.Extents); + + // Rows. Note R[0,1,2]X.w = 0. + XMVECTOR R0X = R.r[0]; + XMVECTOR R1X = R.r[1]; + XMVECTOR R2X = R.r[2]; + + R = XMMatrixTranspose(R); + + // Columns. Note RX[0,1,2].w = 0. + XMVECTOR RX0 = R.r[0]; + XMVECTOR RX1 = R.r[1]; + XMVECTOR RX2 = R.r[2]; + + // Absolute value of rows. + XMVECTOR AR0X = XMVectorAbs(R0X); + XMVECTOR AR1X = XMVectorAbs(R1X); + XMVECTOR AR2X = XMVectorAbs(R2X); + + // Absolute value of columns. 
+ XMVECTOR ARX0 = XMVectorAbs(RX0);
+ XMVECTOR ARX1 = XMVectorAbs(RX1);
+ XMVECTOR ARX2 = XMVectorAbs(RX2);
+
+ // Test each of the 15 possible separating axes.
+ XMVECTOR d, d_A, d_B;
+
+ // l = a(u) = (1, 0, 0)
+ // t dot l = t.x
+ // d(A) = h(A).x
+ // d(B) = h(B) dot abs(r00, r01, r02)
+ d = XMVectorSplatX(t);
+ d_A = XMVectorSplatX(h_A);
+ d_B = XMVector3Dot(h_B, AR0X);
+ XMVECTOR NoIntersection = XMVectorGreater(XMVectorAbs(d), XMVectorAdd(d_A, d_B));
+
+ // l = a(v) = (0, 1, 0)
+ // t dot l = t.y
+ // d(A) = h(A).y
+ // d(B) = h(B) dot abs(r10, r11, r12)
+ d = XMVectorSplatY(t);
+ d_A = XMVectorSplatY(h_A);
+ d_B = XMVector3Dot(h_B, AR1X);
+ NoIntersection = XMVectorOrInt(NoIntersection,
+ XMVectorGreater(XMVectorAbs(d), XMVectorAdd(d_A, d_B)));
+
+ // l = a(w) = (0, 0, 1)
+ // t dot l = t.z
+ // d(A) = h(A).z
+ // d(B) = h(B) dot abs(r20, r21, r22)
+ d = XMVectorSplatZ(t);
+ d_A = XMVectorSplatZ(h_A);
+ d_B = XMVector3Dot(h_B, AR2X);
+ NoIntersection = XMVectorOrInt(NoIntersection,
+ XMVectorGreater(XMVectorAbs(d), XMVectorAdd(d_A, d_B)));
+
+ // l = b(u) = (r00, r10, r20)
+ // d(A) = h(A) dot abs(r00, r10, r20)
+ // d(B) = h(B).x
+ d = XMVector3Dot(t, RX0);
+ d_A = XMVector3Dot(h_A, ARX0);
+ d_B = XMVectorSplatX(h_B);
+ NoIntersection = XMVectorOrInt(NoIntersection,
+ XMVectorGreater(XMVectorAbs(d), XMVectorAdd(d_A, d_B)));
+
+ // l = b(v) = (r01, r11, r21)
+ // d(A) = h(A) dot abs(r01, r11, r21)
+ // d(B) = h(B).y
+ d = XMVector3Dot(t, RX1);
+ d_A = XMVector3Dot(h_A, ARX1);
+ d_B = XMVectorSplatY(h_B);
+ NoIntersection = XMVectorOrInt(NoIntersection,
+ XMVectorGreater(XMVectorAbs(d), XMVectorAdd(d_A, d_B)));
+
+ // l = b(w) = (r02, r12, r22)
+ // d(A) = h(A) dot abs(r02, r12, r22)
+ // d(B) = h(B).z
+ d = XMVector3Dot(t, RX2);
+ d_A = XMVector3Dot(h_A, ARX2);
+ d_B = XMVectorSplatZ(h_B);
+ NoIntersection = XMVectorOrInt(NoIntersection,
+ XMVectorGreater(XMVectorAbs(d), XMVectorAdd(d_A, d_B)));
+
+ // l = a(u) x b(u) = (0, -r20, r10)
+ // d(A) = h(A) dot abs(0, r20, r10)
+ // d(B) = h(B) dot abs(0, r02, r01)
+ d = XMVector3Dot(t, XMVectorPermute<XM_PERMUTE_0W, XM_PERMUTE_1Z, XM_PERMUTE_0Y, XM_PERMUTE_0X>(RX0, XMVectorNegate(RX0)));
+ d_A = XMVector3Dot(h_A, XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_X>(ARX0));
+ d_B = XMVector3Dot(h_B, XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_X>(AR0X));
+ NoIntersection = XMVectorOrInt(NoIntersection,
+ XMVectorGreater(XMVectorAbs(d), XMVectorAdd(d_A, d_B)));
+
+ // l = a(u) x b(v) = (0, -r21, r11)
+ // d(A) = h(A) dot abs(0, r21, r11)
+ // d(B) = h(B) dot abs(r02, 0, r00)
+ d = XMVector3Dot(t, XMVectorPermute<XM_PERMUTE_0W, XM_PERMUTE_1Z, XM_PERMUTE_0Y, XM_PERMUTE_0X>(RX1, XMVectorNegate(RX1)));
+ d_A = XMVector3Dot(h_A, XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_X>(ARX1));
+ d_B = XMVector3Dot(h_B, XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_Y>(AR0X));
+ NoIntersection = XMVectorOrInt(NoIntersection,
+ XMVectorGreater(XMVectorAbs(d), XMVectorAdd(d_A, d_B)));
+
+ // l = a(u) x b(w) = (0, -r22, r12)
+ // d(A) = h(A) dot abs(0, r22, r12)
+ // d(B) = h(B) dot abs(r01, r00, 0)
+ d = XMVector3Dot(t, XMVectorPermute<XM_PERMUTE_0W, XM_PERMUTE_1Z, XM_PERMUTE_0Y, XM_PERMUTE_0X>(RX2, XMVectorNegate(RX2)));
+ d_A = XMVector3Dot(h_A, XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_X>(ARX2));
+ d_B = XMVector3Dot(h_B, XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_X, XM_SWIZZLE_W, XM_SWIZZLE_Z>(AR0X));
+ NoIntersection = XMVectorOrInt(NoIntersection,
+ XMVectorGreater(XMVectorAbs(d), XMVectorAdd(d_A, d_B)));
+
+ // l = a(v) x b(u) = (r20, 0, -r00)
+ // d(A) = h(A) dot abs(r20, 0, r00)
+ // d(B) = h(B) dot abs(0, r12, r11)
+ d = XMVector3Dot(t, XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_0W, XM_PERMUTE_1X, XM_PERMUTE_0Y>(RX0, XMVectorNegate(RX0)));
+ d_A = XMVector3Dot(h_A, XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_Y>(ARX0));
+ d_B = XMVector3Dot(h_B, XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_X>(AR1X));
+ NoIntersection = XMVectorOrInt(NoIntersection,
+ XMVectorGreater(XMVectorAbs(d), XMVectorAdd(d_A, d_B)));
+
+ // l = a(v) x b(v) = (r21, 0, -r01)
+ // d(A) = h(A) dot abs(r21, 0, r01)
+ // d(B) = h(B) dot abs(r12, 0, r10)
+ d = XMVector3Dot(t, XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_0W, XM_PERMUTE_1X, XM_PERMUTE_0Y>(RX1, XMVectorNegate(RX1)));
+ d_A = XMVector3Dot(h_A, XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_Y>(ARX1));
+ d_B = XMVector3Dot(h_B, XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_Y>(AR1X));
+ NoIntersection = XMVectorOrInt(NoIntersection,
+ XMVectorGreater(XMVectorAbs(d), XMVectorAdd(d_A, d_B)));
+
+ // l = a(v) x b(w) = (r22, 0, -r02)
+ // d(A) = h(A) dot abs(r22, 0, r02)
+ // d(B) = h(B) dot abs(r11, r10, 0)
+ d = XMVector3Dot(t, XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_0W, XM_PERMUTE_1X, XM_PERMUTE_0Y>(RX2, XMVectorNegate(RX2)));
+ d_A = XMVector3Dot(h_A, XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_Y>(ARX2));
+ d_B = XMVector3Dot(h_B, XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_X, XM_SWIZZLE_W, XM_SWIZZLE_Z>(AR1X));
+ NoIntersection = XMVectorOrInt(NoIntersection,
+ XMVectorGreater(XMVectorAbs(d), XMVectorAdd(d_A, d_B)));
+
+ // l = a(w) x b(u) = (-r10, r00, 0)
+ // d(A) = h(A) dot abs(r10, r00, 0)
+ // d(B) = h(B) dot abs(0, r22, r21)
+ d = XMVector3Dot(t, XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0X, XM_PERMUTE_0W, XM_PERMUTE_0Z>(RX0, XMVectorNegate(RX0)));
+ d_A = XMVector3Dot(h_A, XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_X, XM_SWIZZLE_W, XM_SWIZZLE_Z>(ARX0));
+ d_B = XMVector3Dot(h_B, XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_X>(AR2X));
+ NoIntersection = XMVectorOrInt(NoIntersection,
+ XMVectorGreater(XMVectorAbs(d), XMVectorAdd(d_A, d_B)));
+
+ // l = a(w) x b(v) = (-r11, r01, 0)
+ // d(A) = h(A) dot abs(r11, r01, 0)
+ // d(B) = h(B) dot abs(r22, 0, r20)
+ d = XMVector3Dot(t, XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0X, XM_PERMUTE_0W, XM_PERMUTE_0Z>(RX1, XMVectorNegate(RX1)));
+ d_A = XMVector3Dot(h_A, XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_X, XM_SWIZZLE_W, XM_SWIZZLE_Z>(ARX1));
+ d_B = XMVector3Dot(h_B, XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_Y>(AR2X));
+ NoIntersection = XMVectorOrInt(NoIntersection,
+ XMVectorGreater(XMVectorAbs(d), XMVectorAdd(d_A, d_B)));
+
+ // l = a(w) x b(w) = (-r12, r02, 0)
+ // d(A) = h(A) dot abs(r12, r02, 0)
+ // d(B) = h(B) dot abs(r21, r20, 0)
+ d = XMVector3Dot(t, XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0X, XM_PERMUTE_0W, XM_PERMUTE_0Z>(RX2, XMVectorNegate(RX2)));
+ d_A = XMVector3Dot(h_A, XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_X, XM_SWIZZLE_W, XM_SWIZZLE_Z>(ARX2));
+ d_B = XMVector3Dot(h_B, XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_X, XM_SWIZZLE_W, XM_SWIZZLE_Z>(AR2X));
+ NoIntersection = XMVectorOrInt(NoIntersection,
+ XMVectorGreater(XMVectorAbs(d), XMVectorAdd(d_A, d_B)));
+
+ // No separating axis found, boxes must intersect.
+ return XMVector4NotEqualInt(NoIntersection, XMVectorTrueInt()) ? true : false;
+}
+
+
+//-----------------------------------------------------------------------------
+// Frustum vs. oriented box test
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline bool BoundingOrientedBox::Intersects(const BoundingFrustum& fr) const noexcept
+{
+ return fr.Intersects(*this);
+}
+
+
+//-----------------------------------------------------------------------------
+// Triangle vs. oriented box test.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline bool XM_CALLCONV BoundingOrientedBox::Intersects(FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2) const noexcept
+{
+ // Load the box center & orientation.
+ XMVECTOR vCenter = XMLoadFloat3(&Center);
+ XMVECTOR vOrientation = XMLoadFloat4(&Orientation);
+
+ // Transform the triangle vertices into the space of the box.
+ XMVECTOR TV0 = XMVector3InverseRotate(XMVectorSubtract(V0, vCenter), vOrientation);
+ XMVECTOR TV1 = XMVector3InverseRotate(XMVectorSubtract(V1, vCenter), vOrientation);
+ XMVECTOR TV2 = XMVector3InverseRotate(XMVectorSubtract(V2, vCenter), vOrientation);
+
+ BoundingBox box;
+ box.Center = XMFLOAT3(0.0f, 0.0f, 0.0f);
+ box.Extents = Extents;
+
+ // Use the triangle vs axis aligned box intersection routine.
+ return box.Intersects(TV0, TV1, TV2); +} + + +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline PlaneIntersectionType XM_CALLCONV BoundingOrientedBox::Intersects(FXMVECTOR Plane) const noexcept +{ + assert(DirectX::Internal::XMPlaneIsUnit(Plane)); + + // Load the box. + XMVECTOR vCenter = XMLoadFloat3(&Center); + XMVECTOR vExtents = XMLoadFloat3(&Extents); + XMVECTOR BoxOrientation = XMLoadFloat4(&Orientation); + + assert(DirectX::Internal::XMQuaternionIsUnit(BoxOrientation)); + + // Set w of the center to one so we can dot4 with a plane. + vCenter = XMVectorInsert<0, 0, 0, 0, 1>(vCenter, XMVectorSplatOne()); + + // Build the 3x3 rotation matrix that defines the box axes. + XMMATRIX R = XMMatrixRotationQuaternion(BoxOrientation); + + XMVECTOR Outside, Inside; + DirectX::Internal::FastIntersectOrientedBoxPlane(vCenter, vExtents, R.r[0], R.r[1], R.r[2], Plane, Outside, Inside); + + // If the box is outside any plane it is outside. + if (XMVector4EqualInt(Outside, XMVectorTrueInt())) + return FRONT; + + // If the box is inside all planes it is inside. + if (XMVector4EqualInt(Inside, XMVectorTrueInt())) + return BACK; + + // The box is not inside all planes or outside a plane it intersects. + return INTERSECTING; +} + + +//----------------------------------------------------------------------------- +// Compute the intersection of a ray (Origin, Direction) with an oriented box +// using the slabs method. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool XM_CALLCONV BoundingOrientedBox::Intersects(FXMVECTOR Origin, FXMVECTOR Direction, float& Dist) const noexcept +{ + assert(DirectX::Internal::XMVector3IsUnit(Direction)); + + static const XMVECTORU32 SelectY = { { { XM_SELECT_0, XM_SELECT_1, XM_SELECT_0, XM_SELECT_0 } } }; + static const XMVECTORU32 SelectZ = { { { XM_SELECT_0, XM_SELECT_0, XM_SELECT_1, XM_SELECT_0 } } }; + + // Load the box. + XMVECTOR vCenter = XMLoadFloat3(&Center); + XMVECTOR vExtents = XMLoadFloat3(&Extents); + XMVECTOR vOrientation = XMLoadFloat4(&Orientation); + + assert(DirectX::Internal::XMQuaternionIsUnit(vOrientation)); + + // Get the boxes normalized side directions. + XMMATRIX R = XMMatrixRotationQuaternion(vOrientation); + + // Adjust ray origin to be relative to center of the box. + XMVECTOR TOrigin = XMVectorSubtract(vCenter, Origin); + + // Compute the dot product againt each axis of the box. + XMVECTOR AxisDotOrigin = XMVector3Dot(R.r[0], TOrigin); + AxisDotOrigin = XMVectorSelect(AxisDotOrigin, XMVector3Dot(R.r[1], TOrigin), SelectY); + AxisDotOrigin = XMVectorSelect(AxisDotOrigin, XMVector3Dot(R.r[2], TOrigin), SelectZ); + + XMVECTOR AxisDotDirection = XMVector3Dot(R.r[0], Direction); + AxisDotDirection = XMVectorSelect(AxisDotDirection, XMVector3Dot(R.r[1], Direction), SelectY); + AxisDotDirection = XMVectorSelect(AxisDotDirection, XMVector3Dot(R.r[2], Direction), SelectZ); + + // if (fabs(AxisDotDirection) <= Epsilon) the ray is nearly parallel to the slab. + XMVECTOR IsParallel = XMVectorLessOrEqual(XMVectorAbs(AxisDotDirection), g_RayEpsilon); + + // Test against all three axes simultaneously. 
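+ // For slab i, AxisDotOrigin holds the box center relative to the ray origin
+ // projected onto box axis i, so the two slab faces are crossed at the ray
+ // parameters t1 = (AxisDotOrigin - Extents) / AxisDotDirection and
+ // t2 = (AxisDotOrigin + Extents) / AxisDotDirection. The ray hits the box only
+ // if the largest per-axis entry parameter does not exceed the smallest exit
+ // parameter and that exit parameter is non-negative; near-parallel axes are
+ // excluded here and handled by the in-bounds check further below.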
+ XMVECTOR InverseAxisDotDirection = XMVectorReciprocal(AxisDotDirection); + XMVECTOR t1 = XMVectorMultiply(XMVectorSubtract(AxisDotOrigin, vExtents), InverseAxisDotDirection); + XMVECTOR t2 = XMVectorMultiply(XMVectorAdd(AxisDotOrigin, vExtents), InverseAxisDotDirection); + + // Compute the max of min(t1,t2) and the min of max(t1,t2) ensuring we don't + // use the results from any directions parallel to the slab. + XMVECTOR t_min = XMVectorSelect(XMVectorMin(t1, t2), g_FltMin, IsParallel); + XMVECTOR t_max = XMVectorSelect(XMVectorMax(t1, t2), g_FltMax, IsParallel); + + // t_min.x = maximum( t_min.x, t_min.y, t_min.z ); + // t_max.x = minimum( t_max.x, t_max.y, t_max.z ); + t_min = XMVectorMax(t_min, XMVectorSplatY(t_min)); // x = max(x,y) + t_min = XMVectorMax(t_min, XMVectorSplatZ(t_min)); // x = max(max(x,y),z) + t_max = XMVectorMin(t_max, XMVectorSplatY(t_max)); // x = min(x,y) + t_max = XMVectorMin(t_max, XMVectorSplatZ(t_max)); // x = min(min(x,y),z) + + // if ( t_min > t_max ) return false; + XMVECTOR NoIntersection = XMVectorGreater(XMVectorSplatX(t_min), XMVectorSplatX(t_max)); + + // if ( t_max < 0.0f ) return false; + NoIntersection = XMVectorOrInt(NoIntersection, XMVectorLess(XMVectorSplatX(t_max), XMVectorZero())); + + // if (IsParallel && (-Extents > AxisDotOrigin || Extents < AxisDotOrigin)) return false; + XMVECTOR ParallelOverlap = XMVectorInBounds(AxisDotOrigin, vExtents); + NoIntersection = XMVectorOrInt(NoIntersection, XMVectorAndCInt(IsParallel, ParallelOverlap)); + + if (!DirectX::Internal::XMVector3AnyTrue(NoIntersection)) + { + // Store the x-component to *pDist + XMStoreFloat(&Dist, t_min); + return true; + } + + Dist = 0.f; + return false; +} + + +//----------------------------------------------------------------------------- +// Test an oriented box vs 6 planes (typically forming a frustum). +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType XM_CALLCONV BoundingOrientedBox::ContainedBy( + FXMVECTOR Plane0, FXMVECTOR Plane1, FXMVECTOR Plane2, + GXMVECTOR Plane3, + HXMVECTOR Plane4, HXMVECTOR Plane5) const noexcept +{ + // Load the box. + XMVECTOR vCenter = XMLoadFloat3(&Center); + XMVECTOR vExtents = XMLoadFloat3(&Extents); + XMVECTOR BoxOrientation = XMLoadFloat4(&Orientation); + + assert(DirectX::Internal::XMQuaternionIsUnit(BoxOrientation)); + + // Set w of the center to one so we can dot4 with a plane. + vCenter = XMVectorInsert<0, 0, 0, 0, 1>(vCenter, XMVectorSplatOne()); + + // Build the 3x3 rotation matrix that defines the box axes. + XMMATRIX R = XMMatrixRotationQuaternion(BoxOrientation); + + XMVECTOR Outside, Inside; + + // Test against each plane. 
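+ // The six planes are expected to have outward-facing normals: the helper
+ // reports Outside when the box lies entirely on the positive side of a plane
+ // and Inside when it lies entirely on the negative side. AnyOutside ORs and
+ // AllInside ANDs those results across all six planes.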
+ DirectX::Internal::FastIntersectOrientedBoxPlane(vCenter, vExtents, R.r[0], R.r[1], R.r[2], Plane0, Outside, Inside);
+
+ XMVECTOR AnyOutside = Outside;
+ XMVECTOR AllInside = Inside;
+
+ DirectX::Internal::FastIntersectOrientedBoxPlane(vCenter, vExtents, R.r[0], R.r[1], R.r[2], Plane1, Outside, Inside);
+ AnyOutside = XMVectorOrInt(AnyOutside, Outside);
+ AllInside = XMVectorAndInt(AllInside, Inside);
+
+ DirectX::Internal::FastIntersectOrientedBoxPlane(vCenter, vExtents, R.r[0], R.r[1], R.r[2], Plane2, Outside, Inside);
+ AnyOutside = XMVectorOrInt(AnyOutside, Outside);
+ AllInside = XMVectorAndInt(AllInside, Inside);
+
+ DirectX::Internal::FastIntersectOrientedBoxPlane(vCenter, vExtents, R.r[0], R.r[1], R.r[2], Plane3, Outside, Inside);
+ AnyOutside = XMVectorOrInt(AnyOutside, Outside);
+ AllInside = XMVectorAndInt(AllInside, Inside);
+
+ DirectX::Internal::FastIntersectOrientedBoxPlane(vCenter, vExtents, R.r[0], R.r[1], R.r[2], Plane4, Outside, Inside);
+ AnyOutside = XMVectorOrInt(AnyOutside, Outside);
+ AllInside = XMVectorAndInt(AllInside, Inside);
+
+ DirectX::Internal::FastIntersectOrientedBoxPlane(vCenter, vExtents, R.r[0], R.r[1], R.r[2], Plane5, Outside, Inside);
+ AnyOutside = XMVectorOrInt(AnyOutside, Outside);
+ AllInside = XMVectorAndInt(AllInside, Inside);
+
+ // If the box is outside any plane it is outside.
+ if (XMVector4EqualInt(AnyOutside, XMVectorTrueInt()))
+ return DISJOINT;
+
+ // If the box is inside all planes it is inside.
+ if (XMVector4EqualInt(AllInside, XMVectorTrueInt()))
+ return CONTAINS;
+
+ // The box is not inside all planes or outside a plane, it may intersect.
+ return INTERSECTS;
+}
+
+
+//-----------------------------------------------------------------------------
+// Create oriented bounding box from axis-aligned bounding box
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void BoundingOrientedBox::CreateFromBoundingBox(BoundingOrientedBox& Out, const BoundingBox& box) noexcept
+{
+ Out.Center = box.Center;
+ Out.Extents = box.Extents;
+ Out.Orientation = XMFLOAT4(0.f, 0.f, 0.f, 1.f);
+}
+
+
+//-----------------------------------------------------------------------------
+// Find the approximate minimum oriented bounding box containing a set of
+// points. Exact computation of the minimum oriented bounding box is possible
+// but is slower and requires a more complex algorithm.
+// The algorithm works by computing the inertia tensor of the points and then
+// using the eigenvectors of the inertia tensor as the axes of the box.
+// Computing the inertia tensor of the convex hull of the points will usually
+// result in a better bounding box but the computation is more complex.
+// Exact computation of the minimum oriented bounding box is possible but the
+// best known algorithm is O(N^3) and is significantly more complex to implement.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void BoundingOrientedBox::CreateFromPoints(BoundingOrientedBox& Out, size_t Count, const XMFLOAT3* pPoints, size_t Stride) noexcept
+{
+ assert(Count > 0);
+ assert(pPoints != nullptr);
+
+ XMVECTOR CenterOfMass = XMVectorZero();
+
+ // Compute the center of mass and inertia tensor of the points.
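+ // The two loops below accumulate the six unique entries of the covariance
+ // matrix of the points about their mean: XX_YY_ZZ gathers (sum dx*dx,
+ // sum dy*dy, sum dz*dz) and XY_XZ_YZ gathers (sum dx*dy, sum dx*dz, sum dy*dz),
+ // where (dx, dy, dz) is a point minus the center of mass. The eigenvectors of
+ // that symmetric 3x3 matrix become the candidate box axes.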
+ for (size_t i = 0; i < Count; ++i)
+ {
+ XMVECTOR Point = XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(reinterpret_cast<const uint8_t*>(pPoints) + i * Stride));
+
+ CenterOfMass = XMVectorAdd(CenterOfMass, Point);
+ }
+
+ CenterOfMass = XMVectorMultiply(CenterOfMass, XMVectorReciprocal(XMVectorReplicate(float(Count))));
+
+ // Compute the inertia tensor of the points around the center of mass.
+ // Using the center of mass is not strictly necessary, but will hopefully
+ // improve the stability of finding the eigenvectors.
+ XMVECTOR XX_YY_ZZ = XMVectorZero();
+ XMVECTOR XY_XZ_YZ = XMVectorZero();
+
+ for (size_t i = 0; i < Count; ++i)
+ {
+ XMVECTOR Point = XMVectorSubtract(XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(reinterpret_cast<const uint8_t*>(pPoints) + i * Stride)), CenterOfMass);
+
+ XX_YY_ZZ = XMVectorAdd(XX_YY_ZZ, XMVectorMultiply(Point, Point));
+
+ XMVECTOR XXY = XMVectorSwizzle<XM_SWIZZLE_X, XM_SWIZZLE_X, XM_SWIZZLE_Y, XM_SWIZZLE_W>(Point);
+ XMVECTOR YZZ = XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_Z, XM_SWIZZLE_Z, XM_SWIZZLE_W>(Point);
+
+ XY_XZ_YZ = XMVectorAdd(XY_XZ_YZ, XMVectorMultiply(XXY, YZZ));
+ }
+
+ XMVECTOR v1, v2, v3;
+
+ // Compute the eigenvectors of the inertia tensor.
+ DirectX::Internal::CalculateEigenVectorsFromCovarianceMatrix(XMVectorGetX(XX_YY_ZZ), XMVectorGetY(XX_YY_ZZ),
+ XMVectorGetZ(XX_YY_ZZ),
+ XMVectorGetX(XY_XZ_YZ), XMVectorGetY(XY_XZ_YZ),
+ XMVectorGetZ(XY_XZ_YZ),
+ &v1, &v2, &v3);
+
+ // Put them in a matrix.
+ XMMATRIX R;
+
+ R.r[0] = XMVectorSetW(v1, 0.f);
+ R.r[1] = XMVectorSetW(v2, 0.f);
+ R.r[2] = XMVectorSetW(v3, 0.f);
+ R.r[3] = g_XMIdentityR3.v;
+
+ // Multiply by -1 to convert the matrix into a right handed coordinate
+ // system (Det ~= 1) in case the eigenvectors form a left handed
+ // coordinate system (Det ~= -1) because XMQuaternionRotationMatrix only
+ // works on right handed matrices.
+ XMVECTOR Det = XMMatrixDeterminant(R);
+
+ if (XMVector4Less(Det, XMVectorZero()))
+ {
+ R.r[0] = XMVectorMultiply(R.r[0], g_XMNegativeOne.v);
+ R.r[1] = XMVectorMultiply(R.r[1], g_XMNegativeOne.v);
+ R.r[2] = XMVectorMultiply(R.r[2], g_XMNegativeOne.v);
+ }
+
+ // Get the rotation quaternion from the matrix.
+ XMVECTOR vOrientation = XMQuaternionRotationMatrix(R);
+
+ // Make sure it is normal (in case the vectors are slightly non-orthogonal).
+ vOrientation = XMQuaternionNormalize(vOrientation);
+
+ // Rebuild the rotation matrix from the quaternion.
+ R = XMMatrixRotationQuaternion(vOrientation);
+
+ // Build the rotation into the rotated space.
+ XMMATRIX InverseR = XMMatrixTranspose(R);
+
+ // Find the minimum OBB using the eigenvectors as the axes.
+ XMVECTOR vMin, vMax;
+
+ vMin = vMax = XMVector3TransformNormal(XMLoadFloat3(pPoints), InverseR);
+
+ for (size_t i = 1; i < Count; ++i)
+ {
+ XMVECTOR Point = XMVector3TransformNormal(XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(reinterpret_cast<const uint8_t*>(pPoints) + i * Stride)),
+ InverseR);
+
+ vMin = XMVectorMin(vMin, Point);
+ vMax = XMVectorMax(vMax, Point);
+ }
+
+ // Rotate the center into world space.
+ XMVECTOR vCenter = XMVectorScale(XMVectorAdd(vMin, vMax), 0.5f);
+ vCenter = XMVector3TransformNormal(vCenter, R);
+
+ // Store center, extents, and orientation.
+ XMStoreFloat3(&Out.Center, vCenter); + XMStoreFloat3(&Out.Extents, XMVectorScale(XMVectorSubtract(vMax, vMin), 0.5f)); + XMStoreFloat4(&Out.Orientation, vOrientation); +} + + +/**************************************************************************** + * + * BoundingFrustum + * + ****************************************************************************/ + +_Use_decl_annotations_ +inline BoundingFrustum::BoundingFrustum(CXMMATRIX Projection, bool rhcoords) noexcept +{ + CreateFromMatrix(*this, Projection, rhcoords); +} + + +//----------------------------------------------------------------------------- +// Transform a frustum by an angle preserving transform. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline void XM_CALLCONV BoundingFrustum::Transform(BoundingFrustum& Out, FXMMATRIX M) const noexcept +{ + // Load the frustum. + XMVECTOR vOrigin = XMLoadFloat3(&Origin); + XMVECTOR vOrientation = XMLoadFloat4(&Orientation); + + assert(DirectX::Internal::XMQuaternionIsUnit(vOrientation)); + + // Composite the frustum rotation and the transform rotation + XMMATRIX nM; + nM.r[0] = XMVector3Normalize(M.r[0]); + nM.r[1] = XMVector3Normalize(M.r[1]); + nM.r[2] = XMVector3Normalize(M.r[2]); + nM.r[3] = g_XMIdentityR3; + XMVECTOR Rotation = XMQuaternionRotationMatrix(nM); + vOrientation = XMQuaternionMultiply(vOrientation, Rotation); + + // Transform the center. + vOrigin = XMVector3Transform(vOrigin, M); + + // Store the frustum. + XMStoreFloat3(&Out.Origin, vOrigin); + XMStoreFloat4(&Out.Orientation, vOrientation); + + // Scale the near and far distances (the slopes remain the same). + XMVECTOR dX = XMVector3Dot(M.r[0], M.r[0]); + XMVECTOR dY = XMVector3Dot(M.r[1], M.r[1]); + XMVECTOR dZ = XMVector3Dot(M.r[2], M.r[2]); + + XMVECTOR d = XMVectorMax(dX, XMVectorMax(dY, dZ)); + float Scale = sqrtf(XMVectorGetX(d)); + + Out.Near = Near * Scale; + Out.Far = Far * Scale; + + // Copy the slopes. + Out.RightSlope = RightSlope; + Out.LeftSlope = LeftSlope; + Out.TopSlope = TopSlope; + Out.BottomSlope = BottomSlope; +} + +_Use_decl_annotations_ +inline void XM_CALLCONV BoundingFrustum::Transform(BoundingFrustum& Out, float Scale, FXMVECTOR Rotation, FXMVECTOR Translation) const noexcept +{ + assert(DirectX::Internal::XMQuaternionIsUnit(Rotation)); + + // Load the frustum. + XMVECTOR vOrigin = XMLoadFloat3(&Origin); + XMVECTOR vOrientation = XMLoadFloat4(&Orientation); + + assert(DirectX::Internal::XMQuaternionIsUnit(vOrientation)); + + // Composite the frustum rotation and the transform rotation. + vOrientation = XMQuaternionMultiply(vOrientation, Rotation); + + // Transform the origin. + vOrigin = XMVectorAdd(XMVector3Rotate(XMVectorScale(vOrigin, Scale), Rotation), Translation); + + // Store the frustum. + XMStoreFloat3(&Out.Origin, vOrigin); + XMStoreFloat4(&Out.Orientation, vOrientation); + + // Scale the near and far distances (the slopes remain the same). + Out.Near = Near * Scale; + Out.Far = Far * Scale; + + // Copy the slopes. + Out.RightSlope = RightSlope; + Out.LeftSlope = LeftSlope; + Out.TopSlope = TopSlope; + Out.BottomSlope = BottomSlope; +} + + +//----------------------------------------------------------------------------- +// Get the corner points of the frustum +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline void BoundingFrustum::GetCorners(XMFLOAT3* Corners) const noexcept +{ + assert(Corners != nullptr); + + // Load origin and orientation of the frustum. 
+ XMVECTOR vOrigin = XMLoadFloat3(&Origin); + XMVECTOR vOrientation = XMLoadFloat4(&Orientation); + + assert(DirectX::Internal::XMQuaternionIsUnit(vOrientation)); + + // Build the corners of the frustum. + XMVECTOR vRightTop = XMVectorSet(RightSlope, TopSlope, 1.0f, 0.0f); + XMVECTOR vRightBottom = XMVectorSet(RightSlope, BottomSlope, 1.0f, 0.0f); + XMVECTOR vLeftTop = XMVectorSet(LeftSlope, TopSlope, 1.0f, 0.0f); + XMVECTOR vLeftBottom = XMVectorSet(LeftSlope, BottomSlope, 1.0f, 0.0f); + XMVECTOR vNear = XMVectorReplicatePtr(&Near); + XMVECTOR vFar = XMVectorReplicatePtr(&Far); + + // Returns 8 corners position of bounding frustum. + // Near Far + // 0----1 4----5 + // | | | | + // | | | | + // 3----2 7----6 + + XMVECTOR vCorners[CORNER_COUNT]; + vCorners[0] = XMVectorMultiply(vLeftTop, vNear); + vCorners[1] = XMVectorMultiply(vRightTop, vNear); + vCorners[2] = XMVectorMultiply(vRightBottom, vNear); + vCorners[3] = XMVectorMultiply(vLeftBottom, vNear); + vCorners[4] = XMVectorMultiply(vLeftTop, vFar); + vCorners[5] = XMVectorMultiply(vRightTop, vFar); + vCorners[6] = XMVectorMultiply(vRightBottom, vFar); + vCorners[7] = XMVectorMultiply(vLeftBottom, vFar); + + for (size_t i = 0; i < CORNER_COUNT; ++i) + { + XMVECTOR C = XMVectorAdd(XMVector3Rotate(vCorners[i], vOrientation), vOrigin); + XMStoreFloat3(&Corners[i], C); + } +} + + +//----------------------------------------------------------------------------- +// Point in frustum test. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType XM_CALLCONV BoundingFrustum::Contains(FXMVECTOR Point) const noexcept +{ + // Build frustum planes. + XMVECTOR Planes[6]; + Planes[0] = XMVectorSet(0.0f, 0.0f, -1.0f, Near); + Planes[1] = XMVectorSet(0.0f, 0.0f, 1.0f, -Far); + Planes[2] = XMVectorSet(1.0f, 0.0f, -RightSlope, 0.0f); + Planes[3] = XMVectorSet(-1.0f, 0.0f, LeftSlope, 0.0f); + Planes[4] = XMVectorSet(0.0f, 1.0f, -TopSlope, 0.0f); + Planes[5] = XMVectorSet(0.0f, -1.0f, BottomSlope, 0.0f); + + // Load origin and orientation. + XMVECTOR vOrigin = XMLoadFloat3(&Origin); + XMVECTOR vOrientation = XMLoadFloat4(&Orientation); + + assert(DirectX::Internal::XMQuaternionIsUnit(vOrientation)); + + // Transform point into local space of frustum. + XMVECTOR TPoint = XMVector3InverseRotate(XMVectorSubtract(Point, vOrigin), vOrientation); + + // Set w to one. + TPoint = XMVectorInsert<0, 0, 0, 0, 1>(TPoint, XMVectorSplatOne()); + + XMVECTOR Zero = XMVectorZero(); + XMVECTOR Outside = Zero; + + // Test point against each plane of the frustum. + for (size_t i = 0; i < 6; ++i) + { + XMVECTOR Dot = XMVector4Dot(TPoint, Planes[i]); + Outside = XMVectorOrInt(Outside, XMVectorGreater(Dot, Zero)); + } + + return XMVector4NotEqualInt(Outside, XMVectorTrueInt()) ? CONTAINS : DISJOINT; +} + + +//----------------------------------------------------------------------------- +// Triangle vs frustum test. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType XM_CALLCONV BoundingFrustum::Contains(FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2) const noexcept +{ + // Load origin and orientation of the frustum. 
+ XMVECTOR vOrigin = XMLoadFloat3(&Origin); + XMVECTOR vOrientation = XMLoadFloat4(&Orientation); + + // Create 6 planes (do it inline to encourage use of registers) + XMVECTOR NearPlane = XMVectorSet(0.0f, 0.0f, -1.0f, Near); + NearPlane = DirectX::Internal::XMPlaneTransform(NearPlane, vOrientation, vOrigin); + NearPlane = XMPlaneNormalize(NearPlane); + + XMVECTOR FarPlane = XMVectorSet(0.0f, 0.0f, 1.0f, -Far); + FarPlane = DirectX::Internal::XMPlaneTransform(FarPlane, vOrientation, vOrigin); + FarPlane = XMPlaneNormalize(FarPlane); + + XMVECTOR RightPlane = XMVectorSet(1.0f, 0.0f, -RightSlope, 0.0f); + RightPlane = DirectX::Internal::XMPlaneTransform(RightPlane, vOrientation, vOrigin); + RightPlane = XMPlaneNormalize(RightPlane); + + XMVECTOR LeftPlane = XMVectorSet(-1.0f, 0.0f, LeftSlope, 0.0f); + LeftPlane = DirectX::Internal::XMPlaneTransform(LeftPlane, vOrientation, vOrigin); + LeftPlane = XMPlaneNormalize(LeftPlane); + + XMVECTOR TopPlane = XMVectorSet(0.0f, 1.0f, -TopSlope, 0.0f); + TopPlane = DirectX::Internal::XMPlaneTransform(TopPlane, vOrientation, vOrigin); + TopPlane = XMPlaneNormalize(TopPlane); + + XMVECTOR BottomPlane = XMVectorSet(0.0f, -1.0f, BottomSlope, 0.0f); + BottomPlane = DirectX::Internal::XMPlaneTransform(BottomPlane, vOrientation, vOrigin); + BottomPlane = XMPlaneNormalize(BottomPlane); + + return TriangleTests::ContainedBy(V0, V1, V2, NearPlane, FarPlane, RightPlane, LeftPlane, TopPlane, BottomPlane); +} + + +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingFrustum::Contains(const BoundingSphere& sh) const noexcept +{ + // Load origin and orientation of the frustum. + XMVECTOR vOrigin = XMLoadFloat3(&Origin); + XMVECTOR vOrientation = XMLoadFloat4(&Orientation); + + // Create 6 planes (do it inline to encourage use of registers) + XMVECTOR NearPlane = XMVectorSet(0.0f, 0.0f, -1.0f, Near); + NearPlane = DirectX::Internal::XMPlaneTransform(NearPlane, vOrientation, vOrigin); + NearPlane = XMPlaneNormalize(NearPlane); + + XMVECTOR FarPlane = XMVectorSet(0.0f, 0.0f, 1.0f, -Far); + FarPlane = DirectX::Internal::XMPlaneTransform(FarPlane, vOrientation, vOrigin); + FarPlane = XMPlaneNormalize(FarPlane); + + XMVECTOR RightPlane = XMVectorSet(1.0f, 0.0f, -RightSlope, 0.0f); + RightPlane = DirectX::Internal::XMPlaneTransform(RightPlane, vOrientation, vOrigin); + RightPlane = XMPlaneNormalize(RightPlane); + + XMVECTOR LeftPlane = XMVectorSet(-1.0f, 0.0f, LeftSlope, 0.0f); + LeftPlane = DirectX::Internal::XMPlaneTransform(LeftPlane, vOrientation, vOrigin); + LeftPlane = XMPlaneNormalize(LeftPlane); + + XMVECTOR TopPlane = XMVectorSet(0.0f, 1.0f, -TopSlope, 0.0f); + TopPlane = DirectX::Internal::XMPlaneTransform(TopPlane, vOrientation, vOrigin); + TopPlane = XMPlaneNormalize(TopPlane); + + XMVECTOR BottomPlane = XMVectorSet(0.0f, -1.0f, BottomSlope, 0.0f); + BottomPlane = DirectX::Internal::XMPlaneTransform(BottomPlane, vOrientation, vOrigin); + BottomPlane = XMPlaneNormalize(BottomPlane); + + return sh.ContainedBy(NearPlane, FarPlane, RightPlane, LeftPlane, TopPlane, BottomPlane); +} + + +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingFrustum::Contains(const BoundingBox& box) const noexcept +{ + // Load origin and orientation of the frustum. 
+ XMVECTOR vOrigin = XMLoadFloat3(&Origin); + XMVECTOR vOrientation = XMLoadFloat4(&Orientation); + + // Create 6 planes (do it inline to encourage use of registers) + XMVECTOR NearPlane = XMVectorSet(0.0f, 0.0f, -1.0f, Near); + NearPlane = DirectX::Internal::XMPlaneTransform(NearPlane, vOrientation, vOrigin); + NearPlane = XMPlaneNormalize(NearPlane); + + XMVECTOR FarPlane = XMVectorSet(0.0f, 0.0f, 1.0f, -Far); + FarPlane = DirectX::Internal::XMPlaneTransform(FarPlane, vOrientation, vOrigin); + FarPlane = XMPlaneNormalize(FarPlane); + + XMVECTOR RightPlane = XMVectorSet(1.0f, 0.0f, -RightSlope, 0.0f); + RightPlane = DirectX::Internal::XMPlaneTransform(RightPlane, vOrientation, vOrigin); + RightPlane = XMPlaneNormalize(RightPlane); + + XMVECTOR LeftPlane = XMVectorSet(-1.0f, 0.0f, LeftSlope, 0.0f); + LeftPlane = DirectX::Internal::XMPlaneTransform(LeftPlane, vOrientation, vOrigin); + LeftPlane = XMPlaneNormalize(LeftPlane); + + XMVECTOR TopPlane = XMVectorSet(0.0f, 1.0f, -TopSlope, 0.0f); + TopPlane = DirectX::Internal::XMPlaneTransform(TopPlane, vOrientation, vOrigin); + TopPlane = XMPlaneNormalize(TopPlane); + + XMVECTOR BottomPlane = XMVectorSet(0.0f, -1.0f, BottomSlope, 0.0f); + BottomPlane = DirectX::Internal::XMPlaneTransform(BottomPlane, vOrientation, vOrigin); + BottomPlane = XMPlaneNormalize(BottomPlane); + + return box.ContainedBy(NearPlane, FarPlane, RightPlane, LeftPlane, TopPlane, BottomPlane); +} + + +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingFrustum::Contains(const BoundingOrientedBox& box) const noexcept +{ + // Load origin and orientation of the frustum. + XMVECTOR vOrigin = XMLoadFloat3(&Origin); + XMVECTOR vOrientation = XMLoadFloat4(&Orientation); + + // Create 6 planes (do it inline to encourage use of registers) + XMVECTOR NearPlane = XMVectorSet(0.0f, 0.0f, -1.0f, Near); + NearPlane = DirectX::Internal::XMPlaneTransform(NearPlane, vOrientation, vOrigin); + NearPlane = XMPlaneNormalize(NearPlane); + + XMVECTOR FarPlane = XMVectorSet(0.0f, 0.0f, 1.0f, -Far); + FarPlane = DirectX::Internal::XMPlaneTransform(FarPlane, vOrientation, vOrigin); + FarPlane = XMPlaneNormalize(FarPlane); + + XMVECTOR RightPlane = XMVectorSet(1.0f, 0.0f, -RightSlope, 0.0f); + RightPlane = DirectX::Internal::XMPlaneTransform(RightPlane, vOrientation, vOrigin); + RightPlane = XMPlaneNormalize(RightPlane); + + XMVECTOR LeftPlane = XMVectorSet(-1.0f, 0.0f, LeftSlope, 0.0f); + LeftPlane = DirectX::Internal::XMPlaneTransform(LeftPlane, vOrientation, vOrigin); + LeftPlane = XMPlaneNormalize(LeftPlane); + + XMVECTOR TopPlane = XMVectorSet(0.0f, 1.0f, -TopSlope, 0.0f); + TopPlane = DirectX::Internal::XMPlaneTransform(TopPlane, vOrientation, vOrigin); + TopPlane = XMPlaneNormalize(TopPlane); + + XMVECTOR BottomPlane = XMVectorSet(0.0f, -1.0f, BottomSlope, 0.0f); + BottomPlane = DirectX::Internal::XMPlaneTransform(BottomPlane, vOrientation, vOrigin); + BottomPlane = XMPlaneNormalize(BottomPlane); + + return box.ContainedBy(NearPlane, FarPlane, RightPlane, LeftPlane, TopPlane, BottomPlane); +} + + +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingFrustum::Contains(const BoundingFrustum& fr) const noexcept +{ + // Load origin and orientation of the frustum. 
+ XMVECTOR vOrigin = XMLoadFloat3(&Origin); + XMVECTOR vOrientation = XMLoadFloat4(&Orientation); + + // Create 6 planes (do it inline to encourage use of registers) + XMVECTOR NearPlane = XMVectorSet(0.0f, 0.0f, -1.0f, Near); + NearPlane = DirectX::Internal::XMPlaneTransform(NearPlane, vOrientation, vOrigin); + NearPlane = XMPlaneNormalize(NearPlane); + + XMVECTOR FarPlane = XMVectorSet(0.0f, 0.0f, 1.0f, -Far); + FarPlane = DirectX::Internal::XMPlaneTransform(FarPlane, vOrientation, vOrigin); + FarPlane = XMPlaneNormalize(FarPlane); + + XMVECTOR RightPlane = XMVectorSet(1.0f, 0.0f, -RightSlope, 0.0f); + RightPlane = DirectX::Internal::XMPlaneTransform(RightPlane, vOrientation, vOrigin); + RightPlane = XMPlaneNormalize(RightPlane); + + XMVECTOR LeftPlane = XMVectorSet(-1.0f, 0.0f, LeftSlope, 0.0f); + LeftPlane = DirectX::Internal::XMPlaneTransform(LeftPlane, vOrientation, vOrigin); + LeftPlane = XMPlaneNormalize(LeftPlane); + + XMVECTOR TopPlane = XMVectorSet(0.0f, 1.0f, -TopSlope, 0.0f); + TopPlane = DirectX::Internal::XMPlaneTransform(TopPlane, vOrientation, vOrigin); + TopPlane = XMPlaneNormalize(TopPlane); + + XMVECTOR BottomPlane = XMVectorSet(0.0f, -1.0f, BottomSlope, 0.0f); + BottomPlane = DirectX::Internal::XMPlaneTransform(BottomPlane, vOrientation, vOrigin); + BottomPlane = XMPlaneNormalize(BottomPlane); + + return fr.ContainedBy(NearPlane, FarPlane, RightPlane, LeftPlane, TopPlane, BottomPlane); +} + + +//----------------------------------------------------------------------------- +// Exact sphere vs frustum test. The algorithm first checks the sphere against +// the planes of the frustum, then if the plane checks were indeterminate finds +// the nearest feature (plane, line, point) on the frustum to the center of the +// sphere and compares the distance to the nearest feature to the radius of the +// sphere +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingFrustum::Intersects(const BoundingSphere& sh) const noexcept +{ + XMVECTOR Zero = XMVectorZero(); + + // Build the frustum planes. + XMVECTOR Planes[6]; + Planes[0] = XMVectorSet(0.0f, 0.0f, -1.0f, Near); + Planes[1] = XMVectorSet(0.0f, 0.0f, 1.0f, -Far); + Planes[2] = XMVectorSet(1.0f, 0.0f, -RightSlope, 0.0f); + Planes[3] = XMVectorSet(-1.0f, 0.0f, LeftSlope, 0.0f); + Planes[4] = XMVectorSet(0.0f, 1.0f, -TopSlope, 0.0f); + Planes[5] = XMVectorSet(0.0f, -1.0f, BottomSlope, 0.0f); + + // Normalize the planes so we can compare to the sphere radius. + Planes[2] = XMVector3Normalize(Planes[2]); + Planes[3] = XMVector3Normalize(Planes[3]); + Planes[4] = XMVector3Normalize(Planes[4]); + Planes[5] = XMVector3Normalize(Planes[5]); + + // Load origin and orientation of the frustum. + XMVECTOR vOrigin = XMLoadFloat3(&Origin); + XMVECTOR vOrientation = XMLoadFloat4(&Orientation); + + assert(DirectX::Internal::XMQuaternionIsUnit(vOrientation)); + + // Load the sphere. + XMVECTOR vCenter = XMLoadFloat3(&sh.Center); + XMVECTOR vRadius = XMVectorReplicatePtr(&sh.Radius); + + // Transform the center of the sphere into the local space of frustum. + vCenter = XMVector3InverseRotate(XMVectorSubtract(vCenter, vOrigin), vOrientation); + + // Set w of the center to one so we can dot4 with the plane. + vCenter = XMVectorInsert<0, 0, 0, 0, 1>(vCenter, XMVectorSplatOne()); + + // Check against each plane of the frustum. 
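+ // With unit-length plane normals (the side planes were normalized above and
+ // the near/far normals are already unit length), each Dist[i] below is the
+ // signed distance from the sphere center to plane i, positive on the outside
+ // of the frustum: any Dist greater than the radius rejects, and Dist <= -radius
+ // on every plane means the sphere is completely inside.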
+ XMVECTOR Outside = XMVectorFalseInt(); + XMVECTOR InsideAll = XMVectorTrueInt(); + XMVECTOR CenterInsideAll = XMVectorTrueInt(); + + XMVECTOR Dist[6]; + + for (size_t i = 0; i < 6; ++i) + { + Dist[i] = XMVector4Dot(vCenter, Planes[i]); + + // Outside the plane? + Outside = XMVectorOrInt(Outside, XMVectorGreater(Dist[i], vRadius)); + + // Fully inside the plane? + InsideAll = XMVectorAndInt(InsideAll, XMVectorLessOrEqual(Dist[i], XMVectorNegate(vRadius))); + + // Check if the center is inside the plane. + CenterInsideAll = XMVectorAndInt(CenterInsideAll, XMVectorLessOrEqual(Dist[i], Zero)); + } + + // If the sphere is outside any of the planes it is outside. + if (XMVector4EqualInt(Outside, XMVectorTrueInt())) + return false; + + // If the sphere is inside all planes it is fully inside. + if (XMVector4EqualInt(InsideAll, XMVectorTrueInt())) + return true; + + // If the center of the sphere is inside all planes and the sphere intersects + // one or more planes then it must intersect. + if (XMVector4EqualInt(CenterInsideAll, XMVectorTrueInt())) + return true; + + // The sphere may be outside the frustum or intersecting the frustum. + // Find the nearest feature (face, edge, or corner) on the frustum + // to the sphere. + + // The faces adjacent to each face are: + static const size_t adjacent_faces[6][4] = + { + { 2, 3, 4, 5 }, // 0 + { 2, 3, 4, 5 }, // 1 + { 0, 1, 4, 5 }, // 2 + { 0, 1, 4, 5 }, // 3 + { 0, 1, 2, 3 }, // 4 + { 0, 1, 2, 3 } + }; // 5 + + XMVECTOR Intersects = XMVectorFalseInt(); + + // Check to see if the nearest feature is one of the planes. + for (size_t i = 0; i < 6; ++i) + { + // Find the nearest point on the plane to the center of the sphere. + XMVECTOR Point = XMVectorNegativeMultiplySubtract(Planes[i], Dist[i], vCenter); + + // Set w of the point to one. + Point = XMVectorInsert<0, 0, 0, 0, 1>(Point, XMVectorSplatOne()); + + // If the point is inside the face (inside the adjacent planes) then + // this plane is the nearest feature. + XMVECTOR InsideFace = XMVectorTrueInt(); + + for (size_t j = 0; j < 4; j++) + { + size_t plane_index = adjacent_faces[i][j]; + + InsideFace = XMVectorAndInt(InsideFace, + XMVectorLessOrEqual(XMVector4Dot(Point, Planes[plane_index]), Zero)); + } + + // Since we have already checked distance from the plane we know that the + // sphere must intersect if this plane is the nearest feature. + Intersects = XMVectorOrInt(Intersects, + XMVectorAndInt(XMVectorGreater(Dist[i], Zero), InsideFace)); + } + + if (XMVector4EqualInt(Intersects, XMVectorTrueInt())) + return true; + + // Build the corners of the frustum. 
+ XMVECTOR vRightTop = XMVectorSet(RightSlope, TopSlope, 1.0f, 0.0f); + XMVECTOR vRightBottom = XMVectorSet(RightSlope, BottomSlope, 1.0f, 0.0f); + XMVECTOR vLeftTop = XMVectorSet(LeftSlope, TopSlope, 1.0f, 0.0f); + XMVECTOR vLeftBottom = XMVectorSet(LeftSlope, BottomSlope, 1.0f, 0.0f); + XMVECTOR vNear = XMVectorReplicatePtr(&Near); + XMVECTOR vFar = XMVectorReplicatePtr(&Far); + + XMVECTOR Corners[CORNER_COUNT]; + Corners[0] = XMVectorMultiply(vRightTop, vNear); + Corners[1] = XMVectorMultiply(vRightBottom, vNear); + Corners[2] = XMVectorMultiply(vLeftTop, vNear); + Corners[3] = XMVectorMultiply(vLeftBottom, vNear); + Corners[4] = XMVectorMultiply(vRightTop, vFar); + Corners[5] = XMVectorMultiply(vRightBottom, vFar); + Corners[6] = XMVectorMultiply(vLeftTop, vFar); + Corners[7] = XMVectorMultiply(vLeftBottom, vFar); + + // The Edges are: + static const size_t edges[12][2] = + { + { 0, 1 }, { 2, 3 }, { 0, 2 }, { 1, 3 }, // Near plane + { 4, 5 }, { 6, 7 }, { 4, 6 }, { 5, 7 }, // Far plane + { 0, 4 }, { 1, 5 }, { 2, 6 }, { 3, 7 }, + }; // Near to far + + XMVECTOR RadiusSq = XMVectorMultiply(vRadius, vRadius); + + // Check to see if the nearest feature is one of the edges (or corners). + for (size_t i = 0; i < 12; ++i) + { + size_t ei0 = edges[i][0]; + size_t ei1 = edges[i][1]; + + // Find the nearest point on the edge to the center of the sphere. + // The corners of the frustum are included as the endpoints of the edges. + XMVECTOR Point = DirectX::Internal::PointOnLineSegmentNearestPoint(Corners[ei0], Corners[ei1], vCenter); + + XMVECTOR Delta = XMVectorSubtract(vCenter, Point); + + XMVECTOR DistSq = XMVector3Dot(Delta, Delta); + + // If the distance to the center of the sphere to the point is less than + // the radius of the sphere then it must intersect. + Intersects = XMVectorOrInt(Intersects, XMVectorLessOrEqual(DistSq, RadiusSq)); + } + + if (XMVector4EqualInt(Intersects, XMVectorTrueInt())) + return true; + + // The sphere must be outside the frustum. + return false; +} + + +//----------------------------------------------------------------------------- +// Exact axis aligned box vs frustum test. Constructs an oriented box and uses +// the oriented box vs frustum test. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingFrustum::Intersects(const BoundingBox& box) const noexcept +{ + // Make the axis aligned box oriented and do an OBB vs frustum test. + BoundingOrientedBox obox(box.Center, box.Extents, XMFLOAT4(0.f, 0.f, 0.f, 1.f)); + return Intersects(obox); +} + + +//----------------------------------------------------------------------------- +// Exact oriented box vs frustum test. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingFrustum::Intersects(const BoundingOrientedBox& box) const noexcept +{ + static const XMVECTORU32 SelectY = { { { XM_SELECT_0, XM_SELECT_1, XM_SELECT_0, XM_SELECT_0 } } }; + static const XMVECTORU32 SelectZ = { { { XM_SELECT_0, XM_SELECT_0, XM_SELECT_1, XM_SELECT_0 } } }; + + XMVECTOR Zero = XMVectorZero(); + + // Build the frustum planes. 
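+ // Unlike the sphere test above, these planes are deliberately left
+ // unnormalized: the center distance and the projected box "radius" computed
+ // below both use the same plane normal, so the comparisons are unaffected by
+ // the normal's length.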
+ XMVECTOR Planes[6]; + Planes[0] = XMVectorSet(0.0f, 0.0f, -1.0f, Near); + Planes[1] = XMVectorSet(0.0f, 0.0f, 1.0f, -Far); + Planes[2] = XMVectorSet(1.0f, 0.0f, -RightSlope, 0.0f); + Planes[3] = XMVectorSet(-1.0f, 0.0f, LeftSlope, 0.0f); + Planes[4] = XMVectorSet(0.0f, 1.0f, -TopSlope, 0.0f); + Planes[5] = XMVectorSet(0.0f, -1.0f, BottomSlope, 0.0f); + + // Load origin and orientation of the frustum. + XMVECTOR vOrigin = XMLoadFloat3(&Origin); + XMVECTOR FrustumOrientation = XMLoadFloat4(&Orientation); + + assert(DirectX::Internal::XMQuaternionIsUnit(FrustumOrientation)); + + // Load the box. + XMVECTOR Center = XMLoadFloat3(&box.Center); + XMVECTOR Extents = XMLoadFloat3(&box.Extents); + XMVECTOR BoxOrientation = XMLoadFloat4(&box.Orientation); + + assert(DirectX::Internal::XMQuaternionIsUnit(BoxOrientation)); + + // Transform the oriented box into the space of the frustum in order to + // minimize the number of transforms we have to do. + Center = XMVector3InverseRotate(XMVectorSubtract(Center, vOrigin), FrustumOrientation); + BoxOrientation = XMQuaternionMultiply(BoxOrientation, XMQuaternionConjugate(FrustumOrientation)); + + // Set w of the center to one so we can dot4 with the plane. + Center = XMVectorInsert<0, 0, 0, 0, 1>(Center, XMVectorSplatOne()); + + // Build the 3x3 rotation matrix that defines the box axes. + XMMATRIX R = XMMatrixRotationQuaternion(BoxOrientation); + + // Check against each plane of the frustum. + XMVECTOR Outside = XMVectorFalseInt(); + XMVECTOR InsideAll = XMVectorTrueInt(); + XMVECTOR CenterInsideAll = XMVectorTrueInt(); + + for (size_t i = 0; i < 6; ++i) + { + // Compute the distance to the center of the box. + XMVECTOR Dist = XMVector4Dot(Center, Planes[i]); + + // Project the axes of the box onto the normal of the plane. Half the + // length of the projection (sometime called the "radius") is equal to + // h(u) * abs(n dot b(u))) + h(v) * abs(n dot b(v)) + h(w) * abs(n dot b(w)) + // where h(i) are extents of the box, n is the plane normal, and b(i) are the + // axes of the box. + XMVECTOR Radius = XMVector3Dot(Planes[i], R.r[0]); + Radius = XMVectorSelect(Radius, XMVector3Dot(Planes[i], R.r[1]), SelectY); + Radius = XMVectorSelect(Radius, XMVector3Dot(Planes[i], R.r[2]), SelectZ); + Radius = XMVector3Dot(Extents, XMVectorAbs(Radius)); + + // Outside the plane? + Outside = XMVectorOrInt(Outside, XMVectorGreater(Dist, Radius)); + + // Fully inside the plane? + InsideAll = XMVectorAndInt(InsideAll, XMVectorLessOrEqual(Dist, XMVectorNegate(Radius))); + + // Check if the center is inside the plane. + CenterInsideAll = XMVectorAndInt(CenterInsideAll, XMVectorLessOrEqual(Dist, Zero)); + } + + // If the box is outside any of the planes it is outside. + if (XMVector4EqualInt(Outside, XMVectorTrueInt())) + return false; + + // If the box is inside all planes it is fully inside. + if (XMVector4EqualInt(InsideAll, XMVectorTrueInt())) + return true; + + // If the center of the box is inside all planes and the box intersects + // one or more planes then it must intersect. + if (XMVector4EqualInt(CenterInsideAll, XMVectorTrueInt())) + return true; + + // Build the corners of the frustum. 
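+ // Reaching this point means the plane tests were inconclusive, so the routine
+ // falls back to separating-axis style tests: first the box's three axes, then
+ // the cross products of the box axes with the frustum edge directions, in each
+ // case projecting the frustum corners built below onto the candidate axis.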
+ XMVECTOR vRightTop = XMVectorSet(RightSlope, TopSlope, 1.0f, 0.0f); + XMVECTOR vRightBottom = XMVectorSet(RightSlope, BottomSlope, 1.0f, 0.0f); + XMVECTOR vLeftTop = XMVectorSet(LeftSlope, TopSlope, 1.0f, 0.0f); + XMVECTOR vLeftBottom = XMVectorSet(LeftSlope, BottomSlope, 1.0f, 0.0f); + XMVECTOR vNear = XMVectorReplicatePtr(&Near); + XMVECTOR vFar = XMVectorReplicatePtr(&Far); + + XMVECTOR Corners[CORNER_COUNT]; + Corners[0] = XMVectorMultiply(vRightTop, vNear); + Corners[1] = XMVectorMultiply(vRightBottom, vNear); + Corners[2] = XMVectorMultiply(vLeftTop, vNear); + Corners[3] = XMVectorMultiply(vLeftBottom, vNear); + Corners[4] = XMVectorMultiply(vRightTop, vFar); + Corners[5] = XMVectorMultiply(vRightBottom, vFar); + Corners[6] = XMVectorMultiply(vLeftTop, vFar); + Corners[7] = XMVectorMultiply(vLeftBottom, vFar); + + // Test against box axes (3) + { + // Find the min/max values of the projection of the frustum onto each axis. + XMVECTOR FrustumMin, FrustumMax; + + FrustumMin = XMVector3Dot(Corners[0], R.r[0]); + FrustumMin = XMVectorSelect(FrustumMin, XMVector3Dot(Corners[0], R.r[1]), SelectY); + FrustumMin = XMVectorSelect(FrustumMin, XMVector3Dot(Corners[0], R.r[2]), SelectZ); + FrustumMax = FrustumMin; + + for (size_t i = 1; i < BoundingOrientedBox::CORNER_COUNT; ++i) + { + XMVECTOR Temp = XMVector3Dot(Corners[i], R.r[0]); + Temp = XMVectorSelect(Temp, XMVector3Dot(Corners[i], R.r[1]), SelectY); + Temp = XMVectorSelect(Temp, XMVector3Dot(Corners[i], R.r[2]), SelectZ); + + FrustumMin = XMVectorMin(FrustumMin, Temp); + FrustumMax = XMVectorMax(FrustumMax, Temp); + } + + // Project the center of the box onto the axes. + XMVECTOR BoxDist = XMVector3Dot(Center, R.r[0]); + BoxDist = XMVectorSelect(BoxDist, XMVector3Dot(Center, R.r[1]), SelectY); + BoxDist = XMVectorSelect(BoxDist, XMVector3Dot(Center, R.r[2]), SelectZ); + + // The projection of the box onto the axis is just its Center and Extents. + // if (min > box_max || max < box_min) reject; + XMVECTOR Result = XMVectorOrInt(XMVectorGreater(FrustumMin, XMVectorAdd(BoxDist, Extents)), + XMVectorLess(FrustumMax, XMVectorSubtract(BoxDist, Extents))); + + if (DirectX::Internal::XMVector3AnyTrue(Result)) + return false; + } + + // Test against edge/edge axes (3*6). + XMVECTOR FrustumEdgeAxis[6]; + + FrustumEdgeAxis[0] = vRightTop; + FrustumEdgeAxis[1] = vRightBottom; + FrustumEdgeAxis[2] = vLeftTop; + FrustumEdgeAxis[3] = vLeftBottom; + FrustumEdgeAxis[4] = XMVectorSubtract(vRightTop, vLeftTop); + FrustumEdgeAxis[5] = XMVectorSubtract(vLeftBottom, vLeftTop); + + for (size_t i = 0; i < 3; ++i) + { + for (size_t j = 0; j < 6; j++) + { + // Compute the axis we are going to test. + XMVECTOR Axis = XMVector3Cross(R.r[i], FrustumEdgeAxis[j]); + + // Find the min/max values of the projection of the frustum onto the axis. + XMVECTOR FrustumMin, FrustumMax; + + FrustumMin = FrustumMax = XMVector3Dot(Axis, Corners[0]); + + for (size_t k = 1; k < CORNER_COUNT; k++) + { + XMVECTOR Temp = XMVector3Dot(Axis, Corners[k]); + FrustumMin = XMVectorMin(FrustumMin, Temp); + FrustumMax = XMVectorMax(FrustumMax, Temp); + } + + // Project the center of the box onto the axis. + XMVECTOR Dist = XMVector3Dot(Center, Axis); + + // Project the axes of the box onto the axis to find the "radius" of the box. 
+ XMVECTOR Radius = XMVector3Dot(Axis, R.r[0]); + Radius = XMVectorSelect(Radius, XMVector3Dot(Axis, R.r[1]), SelectY); + Radius = XMVectorSelect(Radius, XMVector3Dot(Axis, R.r[2]), SelectZ); + Radius = XMVector3Dot(Extents, XMVectorAbs(Radius)); + + // if (center > max + radius || center < min - radius) reject; + Outside = XMVectorOrInt(Outside, XMVectorGreater(Dist, XMVectorAdd(FrustumMax, Radius))); + Outside = XMVectorOrInt(Outside, XMVectorLess(Dist, XMVectorSubtract(FrustumMin, Radius))); + } + } + + if (XMVector4EqualInt(Outside, XMVectorTrueInt())) + return false; + + // If we did not find a separating plane then the box must intersect the frustum. + return true; +} + + +//----------------------------------------------------------------------------- +// Exact frustum vs frustum test. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingFrustum::Intersects(const BoundingFrustum& fr) const noexcept +{ + // Load origin and orientation of frustum B. + XMVECTOR OriginB = XMLoadFloat3(&Origin); + XMVECTOR OrientationB = XMLoadFloat4(&Orientation); + + assert(DirectX::Internal::XMQuaternionIsUnit(OrientationB)); + + // Build the planes of frustum B. + XMVECTOR AxisB[6]; + AxisB[0] = XMVectorSet(0.0f, 0.0f, -1.0f, 0.0f); + AxisB[1] = XMVectorSet(0.0f, 0.0f, 1.0f, 0.0f); + AxisB[2] = XMVectorSet(1.0f, 0.0f, -RightSlope, 0.0f); + AxisB[3] = XMVectorSet(-1.0f, 0.0f, LeftSlope, 0.0f); + AxisB[4] = XMVectorSet(0.0f, 1.0f, -TopSlope, 0.0f); + AxisB[5] = XMVectorSet(0.0f, -1.0f, BottomSlope, 0.0f); + + XMVECTOR PlaneDistB[6]; + PlaneDistB[0] = XMVectorNegate(XMVectorReplicatePtr(&Near)); + PlaneDistB[1] = XMVectorReplicatePtr(&Far); + PlaneDistB[2] = XMVectorZero(); + PlaneDistB[3] = XMVectorZero(); + PlaneDistB[4] = XMVectorZero(); + PlaneDistB[5] = XMVectorZero(); + + // Load origin and orientation of frustum A. + XMVECTOR OriginA = XMLoadFloat3(&fr.Origin); + XMVECTOR OrientationA = XMLoadFloat4(&fr.Orientation); + + assert(DirectX::Internal::XMQuaternionIsUnit(OrientationA)); + + // Transform frustum A into the space of the frustum B in order to + // minimize the number of transforms we have to do. + OriginA = XMVector3InverseRotate(XMVectorSubtract(OriginA, OriginB), OrientationB); + OrientationA = XMQuaternionMultiply(OrientationA, XMQuaternionConjugate(OrientationB)); + + // Build the corners of frustum A (in the local space of B). 
+ XMVECTOR RightTopA = XMVectorSet(fr.RightSlope, fr.TopSlope, 1.0f, 0.0f); + XMVECTOR RightBottomA = XMVectorSet(fr.RightSlope, fr.BottomSlope, 1.0f, 0.0f); + XMVECTOR LeftTopA = XMVectorSet(fr.LeftSlope, fr.TopSlope, 1.0f, 0.0f); + XMVECTOR LeftBottomA = XMVectorSet(fr.LeftSlope, fr.BottomSlope, 1.0f, 0.0f); + XMVECTOR NearA = XMVectorReplicatePtr(&fr.Near); + XMVECTOR FarA = XMVectorReplicatePtr(&fr.Far); + + RightTopA = XMVector3Rotate(RightTopA, OrientationA); + RightBottomA = XMVector3Rotate(RightBottomA, OrientationA); + LeftTopA = XMVector3Rotate(LeftTopA, OrientationA); + LeftBottomA = XMVector3Rotate(LeftBottomA, OrientationA); + + XMVECTOR CornersA[CORNER_COUNT]; + CornersA[0] = XMVectorMultiplyAdd(RightTopA, NearA, OriginA); + CornersA[1] = XMVectorMultiplyAdd(RightBottomA, NearA, OriginA); + CornersA[2] = XMVectorMultiplyAdd(LeftTopA, NearA, OriginA); + CornersA[3] = XMVectorMultiplyAdd(LeftBottomA, NearA, OriginA); + CornersA[4] = XMVectorMultiplyAdd(RightTopA, FarA, OriginA); + CornersA[5] = XMVectorMultiplyAdd(RightBottomA, FarA, OriginA); + CornersA[6] = XMVectorMultiplyAdd(LeftTopA, FarA, OriginA); + CornersA[7] = XMVectorMultiplyAdd(LeftBottomA, FarA, OriginA); + + // Check frustum A against each plane of frustum B. + XMVECTOR Outside = XMVectorFalseInt(); + XMVECTOR InsideAll = XMVectorTrueInt(); + + for (size_t i = 0; i < 6; ++i) + { + // Find the min/max projection of the frustum onto the plane normal. + XMVECTOR Min, Max; + + Min = Max = XMVector3Dot(AxisB[i], CornersA[0]); + + for (size_t j = 1; j < CORNER_COUNT; j++) + { + XMVECTOR Temp = XMVector3Dot(AxisB[i], CornersA[j]); + Min = XMVectorMin(Min, Temp); + Max = XMVectorMax(Max, Temp); + } + + // Outside the plane? + Outside = XMVectorOrInt(Outside, XMVectorGreater(Min, PlaneDistB[i])); + + // Fully inside the plane? + InsideAll = XMVectorAndInt(InsideAll, XMVectorLessOrEqual(Max, PlaneDistB[i])); + } + + // If the frustum A is outside any of the planes of frustum B it is outside. + if (XMVector4EqualInt(Outside, XMVectorTrueInt())) + return false; + + // If frustum A is inside all planes of frustum B it is fully inside. + if (XMVector4EqualInt(InsideAll, XMVectorTrueInt())) + return true; + + // Build the corners of frustum B. + XMVECTOR RightTopB = XMVectorSet(RightSlope, TopSlope, 1.0f, 0.0f); + XMVECTOR RightBottomB = XMVectorSet(RightSlope, BottomSlope, 1.0f, 0.0f); + XMVECTOR LeftTopB = XMVectorSet(LeftSlope, TopSlope, 1.0f, 0.0f); + XMVECTOR LeftBottomB = XMVectorSet(LeftSlope, BottomSlope, 1.0f, 0.0f); + XMVECTOR NearB = XMVectorReplicatePtr(&Near); + XMVECTOR FarB = XMVectorReplicatePtr(&Far); + + XMVECTOR CornersB[BoundingFrustum::CORNER_COUNT]; + CornersB[0] = XMVectorMultiply(RightTopB, NearB); + CornersB[1] = XMVectorMultiply(RightBottomB, NearB); + CornersB[2] = XMVectorMultiply(LeftTopB, NearB); + CornersB[3] = XMVectorMultiply(LeftBottomB, NearB); + CornersB[4] = XMVectorMultiply(RightTopB, FarB); + CornersB[5] = XMVectorMultiply(RightBottomB, FarB); + CornersB[6] = XMVectorMultiply(LeftTopB, FarB); + CornersB[7] = XMVectorMultiply(LeftBottomB, FarB); + + // Build the planes of frustum A (in the local space of B). 
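+ // The normals are set up in frustum A's local frame and rotated into B's
+ // frame; the far-plane normal is simply the negated near-plane normal. Each
+ // plane offset is then recovered by dotting the normal with a point known to
+ // lie on that plane: a near corner, a far corner, or the apex OriginA, which
+ // lies on all four side planes.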
+ XMVECTOR AxisA[6]; + XMVECTOR PlaneDistA[6]; + + AxisA[0] = XMVectorSet(0.0f, 0.0f, -1.0f, 0.0f); + AxisA[1] = XMVectorSet(0.0f, 0.0f, 1.0f, 0.0f); + AxisA[2] = XMVectorSet(1.0f, 0.0f, -fr.RightSlope, 0.0f); + AxisA[3] = XMVectorSet(-1.0f, 0.0f, fr.LeftSlope, 0.0f); + AxisA[4] = XMVectorSet(0.0f, 1.0f, -fr.TopSlope, 0.0f); + AxisA[5] = XMVectorSet(0.0f, -1.0f, fr.BottomSlope, 0.0f); + + AxisA[0] = XMVector3Rotate(AxisA[0], OrientationA); + AxisA[1] = XMVectorNegate(AxisA[0]); + AxisA[2] = XMVector3Rotate(AxisA[2], OrientationA); + AxisA[3] = XMVector3Rotate(AxisA[3], OrientationA); + AxisA[4] = XMVector3Rotate(AxisA[4], OrientationA); + AxisA[5] = XMVector3Rotate(AxisA[5], OrientationA); + + PlaneDistA[0] = XMVector3Dot(AxisA[0], CornersA[0]); // Re-use corner on near plane. + PlaneDistA[1] = XMVector3Dot(AxisA[1], CornersA[4]); // Re-use corner on far plane. + PlaneDistA[2] = XMVector3Dot(AxisA[2], OriginA); + PlaneDistA[3] = XMVector3Dot(AxisA[3], OriginA); + PlaneDistA[4] = XMVector3Dot(AxisA[4], OriginA); + PlaneDistA[5] = XMVector3Dot(AxisA[5], OriginA); + + // Check each axis of frustum A for a seperating plane (5). + for (size_t i = 0; i < 6; ++i) + { + // Find the minimum projection of the frustum onto the plane normal. + XMVECTOR Min; + + Min = XMVector3Dot(AxisA[i], CornersB[0]); + + for (size_t j = 1; j < CORNER_COUNT; j++) + { + XMVECTOR Temp = XMVector3Dot(AxisA[i], CornersB[j]); + Min = XMVectorMin(Min, Temp); + } + + // Outside the plane? + Outside = XMVectorOrInt(Outside, XMVectorGreater(Min, PlaneDistA[i])); + } + + // If the frustum B is outside any of the planes of frustum A it is outside. + if (XMVector4EqualInt(Outside, XMVectorTrueInt())) + return false; + + // Check edge/edge axes (6 * 6). + XMVECTOR FrustumEdgeAxisA[6]; + FrustumEdgeAxisA[0] = RightTopA; + FrustumEdgeAxisA[1] = RightBottomA; + FrustumEdgeAxisA[2] = LeftTopA; + FrustumEdgeAxisA[3] = LeftBottomA; + FrustumEdgeAxisA[4] = XMVectorSubtract(RightTopA, LeftTopA); + FrustumEdgeAxisA[5] = XMVectorSubtract(LeftBottomA, LeftTopA); + + XMVECTOR FrustumEdgeAxisB[6]; + FrustumEdgeAxisB[0] = RightTopB; + FrustumEdgeAxisB[1] = RightBottomB; + FrustumEdgeAxisB[2] = LeftTopB; + FrustumEdgeAxisB[3] = LeftBottomB; + FrustumEdgeAxisB[4] = XMVectorSubtract(RightTopB, LeftTopB); + FrustumEdgeAxisB[5] = XMVectorSubtract(LeftBottomB, LeftTopB); + + for (size_t i = 0; i < 6; ++i) + { + for (size_t j = 0; j < 6; j++) + { + // Compute the axis we are going to test. + XMVECTOR Axis = XMVector3Cross(FrustumEdgeAxisA[i], FrustumEdgeAxisB[j]); + + // Find the min/max values of the projection of both frustums onto the axis. + XMVECTOR MinA, MaxA; + XMVECTOR MinB, MaxB; + + MinA = MaxA = XMVector3Dot(Axis, CornersA[0]); + MinB = MaxB = XMVector3Dot(Axis, CornersB[0]); + + for (size_t k = 1; k < CORNER_COUNT; k++) + { + XMVECTOR TempA = XMVector3Dot(Axis, CornersA[k]); + MinA = XMVectorMin(MinA, TempA); + MaxA = XMVectorMax(MaxA, TempA); + + XMVECTOR TempB = XMVector3Dot(Axis, CornersB[k]); + MinB = XMVectorMin(MinB, TempB); + MaxB = XMVectorMax(MaxB, TempB); + } + + // if (MinA > MaxB || MinB > MaxA) reject + Outside = XMVectorOrInt(Outside, XMVectorGreater(MinA, MaxB)); + Outside = XMVectorOrInt(Outside, XMVectorGreater(MinB, MaxA)); + } + } + + // If there is a seperating plane, then the frustums do not intersect. + if (XMVector4EqualInt(Outside, XMVectorTrueInt())) + return false; + + // If we did not find a separating plane then the frustums intersect. 
+ return true; +} + + +//----------------------------------------------------------------------------- +// Triangle vs frustum test. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool XM_CALLCONV BoundingFrustum::Intersects(FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2) const noexcept +{ + // Build the frustum planes (NOTE: D is negated from the usual). + XMVECTOR Planes[6]; + Planes[0] = XMVectorSet(0.0f, 0.0f, -1.0f, -Near); + Planes[1] = XMVectorSet(0.0f, 0.0f, 1.0f, Far); + Planes[2] = XMVectorSet(1.0f, 0.0f, -RightSlope, 0.0f); + Planes[3] = XMVectorSet(-1.0f, 0.0f, LeftSlope, 0.0f); + Planes[4] = XMVectorSet(0.0f, 1.0f, -TopSlope, 0.0f); + Planes[5] = XMVectorSet(0.0f, -1.0f, BottomSlope, 0.0f); + + // Load origin and orientation of the frustum. + XMVECTOR vOrigin = XMLoadFloat3(&Origin); + XMVECTOR vOrientation = XMLoadFloat4(&Orientation); + + assert(DirectX::Internal::XMQuaternionIsUnit(vOrientation)); + + // Transform triangle into the local space of frustum. + XMVECTOR TV0 = XMVector3InverseRotate(XMVectorSubtract(V0, vOrigin), vOrientation); + XMVECTOR TV1 = XMVector3InverseRotate(XMVectorSubtract(V1, vOrigin), vOrientation); + XMVECTOR TV2 = XMVector3InverseRotate(XMVectorSubtract(V2, vOrigin), vOrientation); + + // Test each vertex of the triangle against the frustum planes. + XMVECTOR Outside = XMVectorFalseInt(); + XMVECTOR InsideAll = XMVectorTrueInt(); + + for (size_t i = 0; i < 6; ++i) + { + XMVECTOR Dist0 = XMVector3Dot(TV0, Planes[i]); + XMVECTOR Dist1 = XMVector3Dot(TV1, Planes[i]); + XMVECTOR Dist2 = XMVector3Dot(TV2, Planes[i]); + + XMVECTOR MinDist = XMVectorMin(Dist0, Dist1); + MinDist = XMVectorMin(MinDist, Dist2); + XMVECTOR MaxDist = XMVectorMax(Dist0, Dist1); + MaxDist = XMVectorMax(MaxDist, Dist2); + + XMVECTOR PlaneDist = XMVectorSplatW(Planes[i]); + + // Outside the plane? + Outside = XMVectorOrInt(Outside, XMVectorGreater(MinDist, PlaneDist)); + + // Fully inside the plane? + InsideAll = XMVectorAndInt(InsideAll, XMVectorLessOrEqual(MaxDist, PlaneDist)); + } + + // If the triangle is outside any of the planes it is outside. + if (XMVector4EqualInt(Outside, XMVectorTrueInt())) + return false; + + // If the triangle is inside all planes it is fully inside. + if (XMVector4EqualInt(InsideAll, XMVectorTrueInt())) + return true; + + // Build the corners of the frustum. + XMVECTOR vRightTop = XMVectorSet(RightSlope, TopSlope, 1.0f, 0.0f); + XMVECTOR vRightBottom = XMVectorSet(RightSlope, BottomSlope, 1.0f, 0.0f); + XMVECTOR vLeftTop = XMVectorSet(LeftSlope, TopSlope, 1.0f, 0.0f); + XMVECTOR vLeftBottom = XMVectorSet(LeftSlope, BottomSlope, 1.0f, 0.0f); + XMVECTOR vNear = XMVectorReplicatePtr(&Near); + XMVECTOR vFar = XMVectorReplicatePtr(&Far); + + XMVECTOR Corners[CORNER_COUNT]; + Corners[0] = XMVectorMultiply(vRightTop, vNear); + Corners[1] = XMVectorMultiply(vRightBottom, vNear); + Corners[2] = XMVectorMultiply(vLeftTop, vNear); + Corners[3] = XMVectorMultiply(vLeftBottom, vNear); + Corners[4] = XMVectorMultiply(vRightTop, vFar); + Corners[5] = XMVectorMultiply(vRightBottom, vFar); + Corners[6] = XMVectorMultiply(vLeftTop, vFar); + Corners[7] = XMVectorMultiply(vLeftBottom, vFar); + + // Test the plane of the triangle. 
+ XMVECTOR Normal = XMVector3Cross(XMVectorSubtract(V1, V0), XMVectorSubtract(V2, V0)); + XMVECTOR Dist = XMVector3Dot(Normal, V0); + + XMVECTOR MinDist, MaxDist; + MinDist = MaxDist = XMVector3Dot(Corners[0], Normal); + for (size_t i = 1; i < CORNER_COUNT; ++i) + { + XMVECTOR Temp = XMVector3Dot(Corners[i], Normal); + MinDist = XMVectorMin(MinDist, Temp); + MaxDist = XMVectorMax(MaxDist, Temp); + } + + Outside = XMVectorOrInt(XMVectorGreater(MinDist, Dist), XMVectorLess(MaxDist, Dist)); + if (XMVector4EqualInt(Outside, XMVectorTrueInt())) + return false; + + // Check the edge/edge axes (3*6). + XMVECTOR TriangleEdgeAxis[3]; + TriangleEdgeAxis[0] = XMVectorSubtract(V1, V0); + TriangleEdgeAxis[1] = XMVectorSubtract(V2, V1); + TriangleEdgeAxis[2] = XMVectorSubtract(V0, V2); + + XMVECTOR FrustumEdgeAxis[6]; + FrustumEdgeAxis[0] = vRightTop; + FrustumEdgeAxis[1] = vRightBottom; + FrustumEdgeAxis[2] = vLeftTop; + FrustumEdgeAxis[3] = vLeftBottom; + FrustumEdgeAxis[4] = XMVectorSubtract(vRightTop, vLeftTop); + FrustumEdgeAxis[5] = XMVectorSubtract(vLeftBottom, vLeftTop); + + for (size_t i = 0; i < 3; ++i) + { + for (size_t j = 0; j < 6; j++) + { + // Compute the axis we are going to test. + XMVECTOR Axis = XMVector3Cross(TriangleEdgeAxis[i], FrustumEdgeAxis[j]); + + // Find the min/max of the projection of the triangle onto the axis. + XMVECTOR MinA, MaxA; + + XMVECTOR Dist0 = XMVector3Dot(V0, Axis); + XMVECTOR Dist1 = XMVector3Dot(V1, Axis); + XMVECTOR Dist2 = XMVector3Dot(V2, Axis); + + MinA = XMVectorMin(Dist0, Dist1); + MinA = XMVectorMin(MinA, Dist2); + MaxA = XMVectorMax(Dist0, Dist1); + MaxA = XMVectorMax(MaxA, Dist2); + + // Find the min/max of the projection of the frustum onto the axis. + XMVECTOR MinB, MaxB; + + MinB = MaxB = XMVector3Dot(Axis, Corners[0]); + + for (size_t k = 1; k < CORNER_COUNT; k++) + { + XMVECTOR Temp = XMVector3Dot(Axis, Corners[k]); + MinB = XMVectorMin(MinB, Temp); + MaxB = XMVectorMax(MaxB, Temp); + } + + // if (MinA > MaxB || MinB > MaxA) reject; + Outside = XMVectorOrInt(Outside, XMVectorGreater(MinA, MaxB)); + Outside = XMVectorOrInt(Outside, XMVectorGreater(MinB, MaxA)); + } + } + + if (XMVector4EqualInt(Outside, XMVectorTrueInt())) + return false; + + // If we did not find a separating plane then the triangle must intersect the frustum. + return true; +} + + +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline PlaneIntersectionType XM_CALLCONV BoundingFrustum::Intersects(FXMVECTOR Plane) const noexcept +{ + assert(DirectX::Internal::XMPlaneIsUnit(Plane)); + + // Load origin and orientation of the frustum. + XMVECTOR vOrigin = XMLoadFloat3(&Origin); + XMVECTOR vOrientation = XMLoadFloat4(&Orientation); + + assert(DirectX::Internal::XMQuaternionIsUnit(vOrientation)); + + // Set w of the origin to one so we can dot4 with a plane. + vOrigin = XMVectorInsert<0, 0, 0, 0, 1>(vOrigin, XMVectorSplatOne()); + + // Build the corners of the frustum (in world space). 
+ XMVECTOR RightTop = XMVectorSet(RightSlope, TopSlope, 1.0f, 0.0f); + XMVECTOR RightBottom = XMVectorSet(RightSlope, BottomSlope, 1.0f, 0.0f); + XMVECTOR LeftTop = XMVectorSet(LeftSlope, TopSlope, 1.0f, 0.0f); + XMVECTOR LeftBottom = XMVectorSet(LeftSlope, BottomSlope, 1.0f, 0.0f); + XMVECTOR vNear = XMVectorReplicatePtr(&Near); + XMVECTOR vFar = XMVectorReplicatePtr(&Far); + + RightTop = XMVector3Rotate(RightTop, vOrientation); + RightBottom = XMVector3Rotate(RightBottom, vOrientation); + LeftTop = XMVector3Rotate(LeftTop, vOrientation); + LeftBottom = XMVector3Rotate(LeftBottom, vOrientation); + + XMVECTOR Corners0 = XMVectorMultiplyAdd(RightTop, vNear, vOrigin); + XMVECTOR Corners1 = XMVectorMultiplyAdd(RightBottom, vNear, vOrigin); + XMVECTOR Corners2 = XMVectorMultiplyAdd(LeftTop, vNear, vOrigin); + XMVECTOR Corners3 = XMVectorMultiplyAdd(LeftBottom, vNear, vOrigin); + XMVECTOR Corners4 = XMVectorMultiplyAdd(RightTop, vFar, vOrigin); + XMVECTOR Corners5 = XMVectorMultiplyAdd(RightBottom, vFar, vOrigin); + XMVECTOR Corners6 = XMVectorMultiplyAdd(LeftTop, vFar, vOrigin); + XMVECTOR Corners7 = XMVectorMultiplyAdd(LeftBottom, vFar, vOrigin); + + XMVECTOR Outside, Inside; + DirectX::Internal::FastIntersectFrustumPlane(Corners0, Corners1, Corners2, Corners3, + Corners4, Corners5, Corners6, Corners7, + Plane, Outside, Inside); + + // If the frustum is outside any plane it is outside. + if (XMVector4EqualInt(Outside, XMVectorTrueInt())) + return FRONT; + + // If the frustum is inside all planes it is inside. + if (XMVector4EqualInt(Inside, XMVectorTrueInt())) + return BACK; + + // The frustum is not inside all planes or outside a plane it intersects. + return INTERSECTING; +} + + +//----------------------------------------------------------------------------- +// Ray vs. frustum test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool XM_CALLCONV BoundingFrustum::Intersects(FXMVECTOR rayOrigin, FXMVECTOR Direction, float& Dist) const noexcept +{ + // If ray starts inside the frustum, return a distance of 0 for the hit + if (Contains(rayOrigin) == CONTAINS) + { + Dist = 0.0f; + return true; + } + + // Build the frustum planes. + XMVECTOR Planes[6]; + Planes[0] = XMVectorSet(0.0f, 0.0f, -1.0f, Near); + Planes[1] = XMVectorSet(0.0f, 0.0f, 1.0f, -Far); + Planes[2] = XMVectorSet(1.0f, 0.0f, -RightSlope, 0.0f); + Planes[3] = XMVectorSet(-1.0f, 0.0f, LeftSlope, 0.0f); + Planes[4] = XMVectorSet(0.0f, 1.0f, -TopSlope, 0.0f); + Planes[5] = XMVectorSet(0.0f, -1.0f, BottomSlope, 0.0f); + + // Load origin and orientation of the frustum. + XMVECTOR frOrigin = XMLoadFloat3(&Origin); + XMVECTOR frOrientation = XMLoadFloat4(&Orientation); + + // This algorithm based on "Fast Ray-Convex Polyhedron Intersectin," in James Arvo, ed., Graphics Gems II pp. 247-250 + float tnear = -FLT_MAX; + float tfar = FLT_MAX; + + for (size_t i = 0; i < 6; ++i) + { + XMVECTOR Plane = DirectX::Internal::XMPlaneTransform(Planes[i], frOrientation, frOrigin); + Plane = XMPlaneNormalize(Plane); + + XMVECTOR AxisDotOrigin = XMPlaneDotCoord(Plane, rayOrigin); + XMVECTOR AxisDotDirection = XMVector3Dot(Plane, Direction); + + if (XMVector3LessOrEqual(XMVectorAbs(AxisDotDirection), g_RayEpsilon)) + { + // Ray is parallel to plane - check if ray origin is inside plane's + if (XMVector3Greater(AxisDotOrigin, g_XMZero)) + { + // Ray origin is outside half-space. + Dist = 0.f; + return false; + } + } + else + { + // Ray not parallel - get distance to plane. 
+ float vd = XMVectorGetX(AxisDotDirection); + float vn = XMVectorGetX(AxisDotOrigin); + float t = -vn / vd; + if (vd < 0.0f) + { + // Front face - T is a near point. + if (t > tfar) + { + Dist = 0.f; + return false; + } + if (t > tnear) + { + // Hit near face. + tnear = t; + } + } + else + { + // back face - T is far point. + if (t < tnear) + { + Dist = 0.f; + return false; + } + if (t < tfar) + { + // Hit far face. + tfar = t; + } + } + } + } + + // Survived all tests. + // Note: if ray originates on polyhedron, may want to change 0.0f to some + // epsilon to avoid intersecting the originating face. + float distance = (tnear >= 0.0f) ? tnear : tfar; + if (distance >= 0.0f) + { + Dist = distance; + return true; + } + + Dist = 0.f; + return false; +} + + +//----------------------------------------------------------------------------- +// Test a frustum vs 6 planes (typically forming another frustum). +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType XM_CALLCONV BoundingFrustum::ContainedBy( + FXMVECTOR Plane0, FXMVECTOR Plane1, FXMVECTOR Plane2, + GXMVECTOR Plane3, + HXMVECTOR Plane4, HXMVECTOR Plane5) const noexcept +{ + // Load origin and orientation of the frustum. + XMVECTOR vOrigin = XMLoadFloat3(&Origin); + XMVECTOR vOrientation = XMLoadFloat4(&Orientation); + + assert(DirectX::Internal::XMQuaternionIsUnit(vOrientation)); + + // Set w of the origin to one so we can dot4 with a plane. + vOrigin = XMVectorInsert<0, 0, 0, 0, 1>(vOrigin, XMVectorSplatOne()); + + // Build the corners of the frustum (in world space). + XMVECTOR RightTop = XMVectorSet(RightSlope, TopSlope, 1.0f, 0.0f); + XMVECTOR RightBottom = XMVectorSet(RightSlope, BottomSlope, 1.0f, 0.0f); + XMVECTOR LeftTop = XMVectorSet(LeftSlope, TopSlope, 1.0f, 0.0f); + XMVECTOR LeftBottom = XMVectorSet(LeftSlope, BottomSlope, 1.0f, 0.0f); + XMVECTOR vNear = XMVectorReplicatePtr(&Near); + XMVECTOR vFar = XMVectorReplicatePtr(&Far); + + RightTop = XMVector3Rotate(RightTop, vOrientation); + RightBottom = XMVector3Rotate(RightBottom, vOrientation); + LeftTop = XMVector3Rotate(LeftTop, vOrientation); + LeftBottom = XMVector3Rotate(LeftBottom, vOrientation); + + XMVECTOR Corners0 = XMVectorMultiplyAdd(RightTop, vNear, vOrigin); + XMVECTOR Corners1 = XMVectorMultiplyAdd(RightBottom, vNear, vOrigin); + XMVECTOR Corners2 = XMVectorMultiplyAdd(LeftTop, vNear, vOrigin); + XMVECTOR Corners3 = XMVectorMultiplyAdd(LeftBottom, vNear, vOrigin); + XMVECTOR Corners4 = XMVectorMultiplyAdd(RightTop, vFar, vOrigin); + XMVECTOR Corners5 = XMVectorMultiplyAdd(RightBottom, vFar, vOrigin); + XMVECTOR Corners6 = XMVectorMultiplyAdd(LeftTop, vFar, vOrigin); + XMVECTOR Corners7 = XMVectorMultiplyAdd(LeftBottom, vFar, vOrigin); + + XMVECTOR Outside, Inside; + + // Test against each plane. 
+ DirectX::Internal::FastIntersectFrustumPlane(Corners0, Corners1, Corners2, Corners3, + Corners4, Corners5, Corners6, Corners7, + Plane0, Outside, Inside); + + XMVECTOR AnyOutside = Outside; + XMVECTOR AllInside = Inside; + + DirectX::Internal::FastIntersectFrustumPlane(Corners0, Corners1, Corners2, Corners3, + Corners4, Corners5, Corners6, Corners7, + Plane1, Outside, Inside); + + AnyOutside = XMVectorOrInt(AnyOutside, Outside); + AllInside = XMVectorAndInt(AllInside, Inside); + + DirectX::Internal::FastIntersectFrustumPlane(Corners0, Corners1, Corners2, Corners3, + Corners4, Corners5, Corners6, Corners7, + Plane2, Outside, Inside); + + AnyOutside = XMVectorOrInt(AnyOutside, Outside); + AllInside = XMVectorAndInt(AllInside, Inside); + + DirectX::Internal::FastIntersectFrustumPlane(Corners0, Corners1, Corners2, Corners3, + Corners4, Corners5, Corners6, Corners7, + Plane3, Outside, Inside); + + AnyOutside = XMVectorOrInt(AnyOutside, Outside); + AllInside = XMVectorAndInt(AllInside, Inside); + + DirectX::Internal::FastIntersectFrustumPlane(Corners0, Corners1, Corners2, Corners3, + Corners4, Corners5, Corners6, Corners7, + Plane4, Outside, Inside); + + AnyOutside = XMVectorOrInt(AnyOutside, Outside); + AllInside = XMVectorAndInt(AllInside, Inside); + + DirectX::Internal::FastIntersectFrustumPlane(Corners0, Corners1, Corners2, Corners3, + Corners4, Corners5, Corners6, Corners7, + Plane5, Outside, Inside); + + AnyOutside = XMVectorOrInt(AnyOutside, Outside); + AllInside = XMVectorAndInt(AllInside, Inside); + + // If the frustum is outside any plane it is outside. + if (XMVector4EqualInt(AnyOutside, XMVectorTrueInt())) + return DISJOINT; + + // If the frustum is inside all planes it is inside. + if (XMVector4EqualInt(AllInside, XMVectorTrueInt())) + return CONTAINS; + + // The frustum is not inside all planes or outside a plane, it may intersect. + return INTERSECTS; +} + + +//----------------------------------------------------------------------------- +// Build the 6 frustum planes from a frustum. +// +// The intended use for these routines is for fast culling to a view frustum. +// When the volume being tested against a view frustum is small relative to the +// view frustum it is usually either inside all six planes of the frustum +// (CONTAINS) or outside one of the planes of the frustum (DISJOINT). If neither +// of these cases is true then it may or may not be intersecting the frustum +// (INTERSECTS) +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline void BoundingFrustum::GetPlanes(XMVECTOR* NearPlane, XMVECTOR* FarPlane, XMVECTOR* RightPlane, + XMVECTOR* LeftPlane, XMVECTOR* TopPlane, XMVECTOR* BottomPlane) const noexcept +{ + // Load origin and orientation of the frustum. 
+ XMVECTOR vOrigin = XMLoadFloat3(&Origin); + XMVECTOR vOrientation = XMLoadFloat4(&Orientation); + + if (NearPlane) + { + XMVECTOR vNearPlane = XMVectorSet(0.0f, 0.0f, -1.0f, Near); + vNearPlane = DirectX::Internal::XMPlaneTransform(vNearPlane, vOrientation, vOrigin); + *NearPlane = XMPlaneNormalize(vNearPlane); + } + + if (FarPlane) + { + XMVECTOR vFarPlane = XMVectorSet(0.0f, 0.0f, 1.0f, -Far); + vFarPlane = DirectX::Internal::XMPlaneTransform(vFarPlane, vOrientation, vOrigin); + *FarPlane = XMPlaneNormalize(vFarPlane); + } + + if (RightPlane) + { + XMVECTOR vRightPlane = XMVectorSet(1.0f, 0.0f, -RightSlope, 0.0f); + vRightPlane = DirectX::Internal::XMPlaneTransform(vRightPlane, vOrientation, vOrigin); + *RightPlane = XMPlaneNormalize(vRightPlane); + } + + if (LeftPlane) + { + XMVECTOR vLeftPlane = XMVectorSet(-1.0f, 0.0f, LeftSlope, 0.0f); + vLeftPlane = DirectX::Internal::XMPlaneTransform(vLeftPlane, vOrientation, vOrigin); + *LeftPlane = XMPlaneNormalize(vLeftPlane); + } + + if (TopPlane) + { + XMVECTOR vTopPlane = XMVectorSet(0.0f, 1.0f, -TopSlope, 0.0f); + vTopPlane = DirectX::Internal::XMPlaneTransform(vTopPlane, vOrientation, vOrigin); + *TopPlane = XMPlaneNormalize(vTopPlane); + } + + if (BottomPlane) + { + XMVECTOR vBottomPlane = XMVectorSet(0.0f, -1.0f, BottomSlope, 0.0f); + vBottomPlane = DirectX::Internal::XMPlaneTransform(vBottomPlane, vOrientation, vOrigin); + *BottomPlane = XMPlaneNormalize(vBottomPlane); + } +} + + +//----------------------------------------------------------------------------- +// Build a frustum from a persepective projection matrix. The matrix may only +// contain a projection; any rotation, translation or scale will cause the +// constructed frustum to be incorrect. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline void XM_CALLCONV BoundingFrustum::CreateFromMatrix(BoundingFrustum& Out, FXMMATRIX Projection, bool rhcoords) noexcept +{ + // Corners of the projection frustum in homogenous space. + static XMVECTORF32 HomogenousPoints[6] = + { + { { { 1.0f, 0.0f, 1.0f, 1.0f } } }, // right (at far plane) + { { { -1.0f, 0.0f, 1.0f, 1.0f } } }, // left + { { { 0.0f, 1.0f, 1.0f, 1.0f } } }, // top + { { { 0.0f, -1.0f, 1.0f, 1.0f } } }, // bottom + + { { { 0.0f, 0.0f, 0.0f, 1.0f } } }, // near + { { { 0.0f, 0.0f, 1.0f, 1.0f } } } // far + }; + + XMVECTOR Determinant; + XMMATRIX matInverse = XMMatrixInverse(&Determinant, Projection); + + // Compute the frustum corners in world space. + XMVECTOR Points[6]; + + for (size_t i = 0; i < 6; ++i) + { + // Transform point. + Points[i] = XMVector4Transform(HomogenousPoints[i], matInverse); + } + + Out.Origin = XMFLOAT3(0.0f, 0.0f, 0.0f); + Out.Orientation = XMFLOAT4(0.0f, 0.0f, 0.0f, 1.0f); + + // Compute the slopes. + Points[0] = XMVectorMultiply(Points[0], XMVectorReciprocal(XMVectorSplatZ(Points[0]))); + Points[1] = XMVectorMultiply(Points[1], XMVectorReciprocal(XMVectorSplatZ(Points[1]))); + Points[2] = XMVectorMultiply(Points[2], XMVectorReciprocal(XMVectorSplatZ(Points[2]))); + Points[3] = XMVectorMultiply(Points[3], XMVectorReciprocal(XMVectorSplatZ(Points[3]))); + + Out.RightSlope = XMVectorGetX(Points[0]); + Out.LeftSlope = XMVectorGetX(Points[1]); + Out.TopSlope = XMVectorGetY(Points[2]); + Out.BottomSlope = XMVectorGetY(Points[3]); + + // Compute near and far. 
+ Points[4] = XMVectorMultiply(Points[4], XMVectorReciprocal(XMVectorSplatW(Points[4]))); + Points[5] = XMVectorMultiply(Points[5], XMVectorReciprocal(XMVectorSplatW(Points[5]))); + + if (rhcoords) + { + Out.Near = XMVectorGetZ(Points[5]); + Out.Far = XMVectorGetZ(Points[4]); + } + else + { + Out.Near = XMVectorGetZ(Points[4]); + Out.Far = XMVectorGetZ(Points[5]); + } +} + + +/**************************************************************************** + * + * TriangleTests + * + ****************************************************************************/ + +namespace TriangleTests +{ + + //----------------------------------------------------------------------------- + // Compute the intersection of a ray (Origin, Direction) with a triangle + // (V0, V1, V2). Return true if there is an intersection and also set *pDist + // to the distance along the ray to the intersection. + // + // The algorithm is based on Moller, Tomas and Trumbore, "Fast, Minimum Storage + // Ray-Triangle Intersection", Journal of Graphics Tools, vol. 2, no. 1, + // pp 21-28, 1997. + //----------------------------------------------------------------------------- + _Use_decl_annotations_ + inline bool XM_CALLCONV Intersects( + FXMVECTOR Origin, FXMVECTOR Direction, FXMVECTOR V0, + GXMVECTOR V1, + HXMVECTOR V2, float& Dist) noexcept + { + assert(DirectX::Internal::XMVector3IsUnit(Direction)); + + XMVECTOR Zero = XMVectorZero(); + + XMVECTOR e1 = XMVectorSubtract(V1, V0); + XMVECTOR e2 = XMVectorSubtract(V2, V0); + + // p = Direction ^ e2; + XMVECTOR p = XMVector3Cross(Direction, e2); + + // det = e1 * p; + XMVECTOR det = XMVector3Dot(e1, p); + + XMVECTOR u, v, t; + + if (XMVector3GreaterOrEqual(det, g_RayEpsilon)) + { + // Determinate is positive (front side of the triangle). + XMVECTOR s = XMVectorSubtract(Origin, V0); + + // u = s * p; + u = XMVector3Dot(s, p); + + XMVECTOR NoIntersection = XMVectorLess(u, Zero); + NoIntersection = XMVectorOrInt(NoIntersection, XMVectorGreater(u, det)); + + // q = s ^ e1; + XMVECTOR q = XMVector3Cross(s, e1); + + // v = Direction * q; + v = XMVector3Dot(Direction, q); + + NoIntersection = XMVectorOrInt(NoIntersection, XMVectorLess(v, Zero)); + NoIntersection = XMVectorOrInt(NoIntersection, XMVectorGreater(XMVectorAdd(u, v), det)); + + // t = e2 * q; + t = XMVector3Dot(e2, q); + + NoIntersection = XMVectorOrInt(NoIntersection, XMVectorLess(t, Zero)); + + if (XMVector4EqualInt(NoIntersection, XMVectorTrueInt())) + { + Dist = 0.f; + return false; + } + } + else if (XMVector3LessOrEqual(det, g_RayNegEpsilon)) + { + // Determinate is negative (back side of the triangle). + XMVECTOR s = XMVectorSubtract(Origin, V0); + + // u = s * p; + u = XMVector3Dot(s, p); + + XMVECTOR NoIntersection = XMVectorGreater(u, Zero); + NoIntersection = XMVectorOrInt(NoIntersection, XMVectorLess(u, det)); + + // q = s ^ e1; + XMVECTOR q = XMVector3Cross(s, e1); + + // v = Direction * q; + v = XMVector3Dot(Direction, q); + + NoIntersection = XMVectorOrInt(NoIntersection, XMVectorGreater(v, Zero)); + NoIntersection = XMVectorOrInt(NoIntersection, XMVectorLess(XMVectorAdd(u, v), det)); + + // t = e2 * q; + t = XMVector3Dot(e2, q); + + NoIntersection = XMVectorOrInt(NoIntersection, XMVectorGreater(t, Zero)); + + if (XMVector4EqualInt(NoIntersection, XMVectorTrueInt())) + { + Dist = 0.f; + return false; + } + } + else + { + // Parallel ray. + Dist = 0.f; + return false; + } + + t = XMVectorDivide(t, det); + + // (u / det) and (v / dev) are the barycentric cooridinates of the intersection. 
+ + // Store the x-component to *pDist + XMStoreFloat(&Dist, t); + + return true; + } + + + //----------------------------------------------------------------------------- + // Test if two triangles intersect. + // + // The final test of algorithm is based on Shen, Heng, and Tang, "A Fast + // Triangle-Triangle Overlap Test Using Signed Distances", Journal of Graphics + // Tools, vol. 8, no. 1, pp 17-23, 2003 and Guigue and Devillers, "Fast and + // Robust Triangle-Triangle Overlap Test Using Orientation Predicates", Journal + // of Graphics Tools, vol. 8, no. 1, pp 25-32, 2003. + // + // The final test could be considered an edge-edge separating plane test with + // the 9 possible cases narrowed down to the only two pairs of edges that can + // actaully result in a seperation. + //----------------------------------------------------------------------------- + _Use_decl_annotations_ + inline bool XM_CALLCONV Intersects(FXMVECTOR A0, FXMVECTOR A1, FXMVECTOR A2, GXMVECTOR B0, HXMVECTOR B1, HXMVECTOR B2) noexcept + { + static const XMVECTORU32 SelectY = { { { XM_SELECT_0, XM_SELECT_1, XM_SELECT_0, XM_SELECT_0 } } }; + static const XMVECTORU32 SelectZ = { { { XM_SELECT_0, XM_SELECT_0, XM_SELECT_1, XM_SELECT_0 } } }; + static const XMVECTORU32 Select0111 = { { { XM_SELECT_0, XM_SELECT_1, XM_SELECT_1, XM_SELECT_1 } } }; + static const XMVECTORU32 Select1011 = { { { XM_SELECT_1, XM_SELECT_0, XM_SELECT_1, XM_SELECT_1 } } }; + static const XMVECTORU32 Select1101 = { { { XM_SELECT_1, XM_SELECT_1, XM_SELECT_0, XM_SELECT_1 } } }; + + XMVECTOR Zero = XMVectorZero(); + + // Compute the normal of triangle A. + XMVECTOR N1 = XMVector3Cross(XMVectorSubtract(A1, A0), XMVectorSubtract(A2, A0)); + + // Assert that the triangle is not degenerate. + assert(!XMVector3Equal(N1, Zero)); + + // Test points of B against the plane of A. + XMVECTOR BDist = XMVector3Dot(N1, XMVectorSubtract(B0, A0)); + BDist = XMVectorSelect(BDist, XMVector3Dot(N1, XMVectorSubtract(B1, A0)), SelectY); + BDist = XMVectorSelect(BDist, XMVector3Dot(N1, XMVectorSubtract(B2, A0)), SelectZ); + + // Ensure robustness with co-planar triangles by zeroing small distances. + uint32_t BDistIsZeroCR; + XMVECTOR BDistIsZero = XMVectorGreaterR(&BDistIsZeroCR, g_RayEpsilon, XMVectorAbs(BDist)); + BDist = XMVectorSelect(BDist, Zero, BDistIsZero); + + uint32_t BDistIsLessCR; + XMVECTOR BDistIsLess = XMVectorGreaterR(&BDistIsLessCR, Zero, BDist); + + uint32_t BDistIsGreaterCR; + XMVECTOR BDistIsGreater = XMVectorGreaterR(&BDistIsGreaterCR, BDist, Zero); + + // If all the points are on the same side we don't intersect. + if (XMComparisonAllTrue(BDistIsLessCR) || XMComparisonAllTrue(BDistIsGreaterCR)) + return false; + + // Compute the normal of triangle B. + XMVECTOR N2 = XMVector3Cross(XMVectorSubtract(B1, B0), XMVectorSubtract(B2, B0)); + + // Assert that the triangle is not degenerate. + assert(!XMVector3Equal(N2, Zero)); + + // Test points of A against the plane of B. + XMVECTOR ADist = XMVector3Dot(N2, XMVectorSubtract(A0, B0)); + ADist = XMVectorSelect(ADist, XMVector3Dot(N2, XMVectorSubtract(A1, B0)), SelectY); + ADist = XMVectorSelect(ADist, XMVector3Dot(N2, XMVectorSubtract(A2, B0)), SelectZ); + + // Ensure robustness with co-planar triangles by zeroing small distances. 
+ uint32_t ADistIsZeroCR; + XMVECTOR ADistIsZero = XMVectorGreaterR(&ADistIsZeroCR, g_RayEpsilon, XMVectorAbs(ADist)); + ADist = XMVectorSelect(ADist, Zero, ADistIsZero); + + uint32_t ADistIsLessCR; + XMVECTOR ADistIsLess = XMVectorGreaterR(&ADistIsLessCR, Zero, ADist); + + uint32_t ADistIsGreaterCR; + XMVECTOR ADistIsGreater = XMVectorGreaterR(&ADistIsGreaterCR, ADist, Zero); + + // If all the points are on the same side we don't intersect. + if (XMComparisonAllTrue(ADistIsLessCR) || XMComparisonAllTrue(ADistIsGreaterCR)) + return false; + + // Special case for co-planar triangles. + if (XMComparisonAllTrue(ADistIsZeroCR) || XMComparisonAllTrue(BDistIsZeroCR)) + { + XMVECTOR Axis, Dist, MinDist; + + // Compute an axis perpendicular to the edge (points out). + Axis = XMVector3Cross(N1, XMVectorSubtract(A1, A0)); + Dist = XMVector3Dot(Axis, A0); + + // Test points of B against the axis. + MinDist = XMVector3Dot(B0, Axis); + MinDist = XMVectorMin(MinDist, XMVector3Dot(B1, Axis)); + MinDist = XMVectorMin(MinDist, XMVector3Dot(B2, Axis)); + if (XMVector4GreaterOrEqual(MinDist, Dist)) + return false; + + // Edge (A1, A2) + Axis = XMVector3Cross(N1, XMVectorSubtract(A2, A1)); + Dist = XMVector3Dot(Axis, A1); + + MinDist = XMVector3Dot(B0, Axis); + MinDist = XMVectorMin(MinDist, XMVector3Dot(B1, Axis)); + MinDist = XMVectorMin(MinDist, XMVector3Dot(B2, Axis)); + if (XMVector4GreaterOrEqual(MinDist, Dist)) + return false; + + // Edge (A2, A0) + Axis = XMVector3Cross(N1, XMVectorSubtract(A0, A2)); + Dist = XMVector3Dot(Axis, A2); + + MinDist = XMVector3Dot(B0, Axis); + MinDist = XMVectorMin(MinDist, XMVector3Dot(B1, Axis)); + MinDist = XMVectorMin(MinDist, XMVector3Dot(B2, Axis)); + if (XMVector4GreaterOrEqual(MinDist, Dist)) + return false; + + // Edge (B0, B1) + Axis = XMVector3Cross(N2, XMVectorSubtract(B1, B0)); + Dist = XMVector3Dot(Axis, B0); + + MinDist = XMVector3Dot(A0, Axis); + MinDist = XMVectorMin(MinDist, XMVector3Dot(A1, Axis)); + MinDist = XMVectorMin(MinDist, XMVector3Dot(A2, Axis)); + if (XMVector4GreaterOrEqual(MinDist, Dist)) + return false; + + // Edge (B1, B2) + Axis = XMVector3Cross(N2, XMVectorSubtract(B2, B1)); + Dist = XMVector3Dot(Axis, B1); + + MinDist = XMVector3Dot(A0, Axis); + MinDist = XMVectorMin(MinDist, XMVector3Dot(A1, Axis)); + MinDist = XMVectorMin(MinDist, XMVector3Dot(A2, Axis)); + if (XMVector4GreaterOrEqual(MinDist, Dist)) + return false; + + // Edge (B2,B0) + Axis = XMVector3Cross(N2, XMVectorSubtract(B0, B2)); + Dist = XMVector3Dot(Axis, B2); + + MinDist = XMVector3Dot(A0, Axis); + MinDist = XMVectorMin(MinDist, XMVector3Dot(A1, Axis)); + MinDist = XMVectorMin(MinDist, XMVector3Dot(A2, Axis)); + if (XMVector4GreaterOrEqual(MinDist, Dist)) + return false; + + return true; + } + + // + // Find the single vertex of A and B (ie the vertex on the opposite side + // of the plane from the other two) and reorder the edges so we can compute + // the signed edge/edge distances. + // + // if ( (V0 >= 0 && V1 < 0 && V2 < 0) || + // (V0 > 0 && V1 <= 0 && V2 <= 0) || + // (V0 <= 0 && V1 > 0 && V2 > 0) || + // (V0 < 0 && V1 >= 0 && V2 >= 0) ) then V0 is singular; + // + // If our singular vertex is not on the positive side of the plane we reverse + // the triangle winding so that the overlap comparisons will compare the + // correct edges with the correct signs. 
+ // + XMVECTOR ADistIsLessEqual = XMVectorOrInt(ADistIsLess, ADistIsZero); + XMVECTOR ADistIsGreaterEqual = XMVectorOrInt(ADistIsGreater, ADistIsZero); + + XMVECTOR AA0, AA1, AA2; + bool bPositiveA; + + if (DirectX::Internal::XMVector3AllTrue(XMVectorSelect(ADistIsGreaterEqual, ADistIsLess, Select0111)) || + DirectX::Internal::XMVector3AllTrue(XMVectorSelect(ADistIsGreater, ADistIsLessEqual, Select0111))) + { + // A0 is singular, crossing from positive to negative. + AA0 = A0; AA1 = A1; AA2 = A2; + bPositiveA = true; + } + else if (DirectX::Internal::XMVector3AllTrue(XMVectorSelect(ADistIsLessEqual, ADistIsGreater, Select0111)) || + DirectX::Internal::XMVector3AllTrue(XMVectorSelect(ADistIsLess, ADistIsGreaterEqual, Select0111))) + { + // A0 is singular, crossing from negative to positive. + AA0 = A0; AA1 = A2; AA2 = A1; + bPositiveA = false; + } + else if (DirectX::Internal::XMVector3AllTrue(XMVectorSelect(ADistIsGreaterEqual, ADistIsLess, Select1011)) || + DirectX::Internal::XMVector3AllTrue(XMVectorSelect(ADistIsGreater, ADistIsLessEqual, Select1011))) + { + // A1 is singular, crossing from positive to negative. + AA0 = A1; AA1 = A2; AA2 = A0; + bPositiveA = true; + } + else if (DirectX::Internal::XMVector3AllTrue(XMVectorSelect(ADistIsLessEqual, ADistIsGreater, Select1011)) || + DirectX::Internal::XMVector3AllTrue(XMVectorSelect(ADistIsLess, ADistIsGreaterEqual, Select1011))) + { + // A1 is singular, crossing from negative to positive. + AA0 = A1; AA1 = A0; AA2 = A2; + bPositiveA = false; + } + else if (DirectX::Internal::XMVector3AllTrue(XMVectorSelect(ADistIsGreaterEqual, ADistIsLess, Select1101)) || + DirectX::Internal::XMVector3AllTrue(XMVectorSelect(ADistIsGreater, ADistIsLessEqual, Select1101))) + { + // A2 is singular, crossing from positive to negative. + AA0 = A2; AA1 = A0; AA2 = A1; + bPositiveA = true; + } + else if (DirectX::Internal::XMVector3AllTrue(XMVectorSelect(ADistIsLessEqual, ADistIsGreater, Select1101)) || + DirectX::Internal::XMVector3AllTrue(XMVectorSelect(ADistIsLess, ADistIsGreaterEqual, Select1101))) + { + // A2 is singular, crossing from negative to positive. + AA0 = A2; AA1 = A1; AA2 = A0; + bPositiveA = false; + } + else + { + assert(false); + return false; + } + + XMVECTOR BDistIsLessEqual = XMVectorOrInt(BDistIsLess, BDistIsZero); + XMVECTOR BDistIsGreaterEqual = XMVectorOrInt(BDistIsGreater, BDistIsZero); + + XMVECTOR BB0, BB1, BB2; + bool bPositiveB; + + if (DirectX::Internal::XMVector3AllTrue(XMVectorSelect(BDistIsGreaterEqual, BDistIsLess, Select0111)) || + DirectX::Internal::XMVector3AllTrue(XMVectorSelect(BDistIsGreater, BDistIsLessEqual, Select0111))) + { + // B0 is singular, crossing from positive to negative. + BB0 = B0; BB1 = B1; BB2 = B2; + bPositiveB = true; + } + else if (DirectX::Internal::XMVector3AllTrue(XMVectorSelect(BDistIsLessEqual, BDistIsGreater, Select0111)) || + DirectX::Internal::XMVector3AllTrue(XMVectorSelect(BDistIsLess, BDistIsGreaterEqual, Select0111))) + { + // B0 is singular, crossing from negative to positive. + BB0 = B0; BB1 = B2; BB2 = B1; + bPositiveB = false; + } + else if (DirectX::Internal::XMVector3AllTrue(XMVectorSelect(BDistIsGreaterEqual, BDistIsLess, Select1011)) || + DirectX::Internal::XMVector3AllTrue(XMVectorSelect(BDistIsGreater, BDistIsLessEqual, Select1011))) + { + // B1 is singular, crossing from positive to negative. 
+ BB0 = B1; BB1 = B2; BB2 = B0; + bPositiveB = true; + } + else if (DirectX::Internal::XMVector3AllTrue(XMVectorSelect(BDistIsLessEqual, BDistIsGreater, Select1011)) || + DirectX::Internal::XMVector3AllTrue(XMVectorSelect(BDistIsLess, BDistIsGreaterEqual, Select1011))) + { + // B1 is singular, crossing from negative to positive. + BB0 = B1; BB1 = B0; BB2 = B2; + bPositiveB = false; + } + else if (DirectX::Internal::XMVector3AllTrue(XMVectorSelect(BDistIsGreaterEqual, BDistIsLess, Select1101)) || + DirectX::Internal::XMVector3AllTrue(XMVectorSelect(BDistIsGreater, BDistIsLessEqual, Select1101))) + { + // B2 is singular, crossing from positive to negative. + BB0 = B2; BB1 = B0; BB2 = B1; + bPositiveB = true; + } + else if (DirectX::Internal::XMVector3AllTrue(XMVectorSelect(BDistIsLessEqual, BDistIsGreater, Select1101)) || + DirectX::Internal::XMVector3AllTrue(XMVectorSelect(BDistIsLess, BDistIsGreaterEqual, Select1101))) + { + // B2 is singular, crossing from negative to positive. + BB0 = B2; BB1 = B1; BB2 = B0; + bPositiveB = false; + } + else + { + assert(false); + return false; + } + + XMVECTOR Delta0, Delta1; + + // Reverse the direction of the test depending on whether the singular vertices are + // the same sign or different signs. + if (bPositiveA ^ bPositiveB) + { + Delta0 = XMVectorSubtract(BB0, AA0); + Delta1 = XMVectorSubtract(AA0, BB0); + } + else + { + Delta0 = XMVectorSubtract(AA0, BB0); + Delta1 = XMVectorSubtract(BB0, AA0); + } + + // Check if the triangles overlap on the line of intersection between the + // planes of the two triangles by finding the signed line distances. + XMVECTOR Dist0 = XMVector3Dot(Delta0, XMVector3Cross(XMVectorSubtract(BB2, BB0), XMVectorSubtract(AA2, AA0))); + if (XMVector4Greater(Dist0, Zero)) + return false; + + XMVECTOR Dist1 = XMVector3Dot(Delta1, XMVector3Cross(XMVectorSubtract(BB1, BB0), XMVectorSubtract(AA1, AA0))); + if (XMVector4Greater(Dist1, Zero)) + return false; + + return true; + } + + + //----------------------------------------------------------------------------- + // Ray-triangle test + //----------------------------------------------------------------------------- + _Use_decl_annotations_ + inline PlaneIntersectionType XM_CALLCONV Intersects(FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2, GXMVECTOR Plane) noexcept + { + XMVECTOR One = XMVectorSplatOne(); + + assert(DirectX::Internal::XMPlaneIsUnit(Plane)); + + // Set w of the points to one so we can dot4 with a plane. + XMVECTOR TV0 = XMVectorInsert<0, 0, 0, 0, 1>(V0, One); + XMVECTOR TV1 = XMVectorInsert<0, 0, 0, 0, 1>(V1, One); + XMVECTOR TV2 = XMVectorInsert<0, 0, 0, 0, 1>(V2, One); + + XMVECTOR Outside, Inside; + DirectX::Internal::FastIntersectTrianglePlane(TV0, TV1, TV2, Plane, Outside, Inside); + + // If the triangle is outside any plane it is outside. + if (XMVector4EqualInt(Outside, XMVectorTrueInt())) + return FRONT; + + // If the triangle is inside all planes it is inside. + if (XMVector4EqualInt(Inside, XMVectorTrueInt())) + return BACK; + + // The triangle is not inside all planes or outside a plane it intersects. + return INTERSECTING; + } + + + //----------------------------------------------------------------------------- + // Test a triangle vs 6 planes (typically forming a frustum). 
+ //----------------------------------------------------------------------------- + _Use_decl_annotations_ + inline ContainmentType XM_CALLCONV ContainedBy( + FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2, + GXMVECTOR Plane0, + HXMVECTOR Plane1, HXMVECTOR Plane2, + CXMVECTOR Plane3, CXMVECTOR Plane4, CXMVECTOR Plane5) noexcept + { + XMVECTOR One = XMVectorSplatOne(); + + // Set w of the points to one so we can dot4 with a plane. + XMVECTOR TV0 = XMVectorInsert<0, 0, 0, 0, 1>(V0, One); + XMVECTOR TV1 = XMVectorInsert<0, 0, 0, 0, 1>(V1, One); + XMVECTOR TV2 = XMVectorInsert<0, 0, 0, 0, 1>(V2, One); + + XMVECTOR Outside, Inside; + + // Test against each plane. + DirectX::Internal::FastIntersectTrianglePlane(TV0, TV1, TV2, Plane0, Outside, Inside); + + XMVECTOR AnyOutside = Outside; + XMVECTOR AllInside = Inside; + + DirectX::Internal::FastIntersectTrianglePlane(TV0, TV1, TV2, Plane1, Outside, Inside); + AnyOutside = XMVectorOrInt(AnyOutside, Outside); + AllInside = XMVectorAndInt(AllInside, Inside); + + DirectX::Internal::FastIntersectTrianglePlane(TV0, TV1, TV2, Plane2, Outside, Inside); + AnyOutside = XMVectorOrInt(AnyOutside, Outside); + AllInside = XMVectorAndInt(AllInside, Inside); + + DirectX::Internal::FastIntersectTrianglePlane(TV0, TV1, TV2, Plane3, Outside, Inside); + AnyOutside = XMVectorOrInt(AnyOutside, Outside); + AllInside = XMVectorAndInt(AllInside, Inside); + + DirectX::Internal::FastIntersectTrianglePlane(TV0, TV1, TV2, Plane4, Outside, Inside); + AnyOutside = XMVectorOrInt(AnyOutside, Outside); + AllInside = XMVectorAndInt(AllInside, Inside); + + DirectX::Internal::FastIntersectTrianglePlane(TV0, TV1, TV2, Plane5, Outside, Inside); + AnyOutside = XMVectorOrInt(AnyOutside, Outside); + AllInside = XMVectorAndInt(AllInside, Inside); + + // If the triangle is outside any plane it is outside. + if (XMVector4EqualInt(AnyOutside, XMVectorTrueInt())) + return DISJOINT; + + // If the triangle is inside all planes it is inside. + if (XMVector4EqualInt(AllInside, XMVectorTrueInt())) + return CONTAINS; + + // The triangle is not inside all planes or outside a plane, it may intersect. + return INTERSECTS; + } + +} // namespace TriangleTests + diff --git a/src/thirdparty/DirectXMath-dec2022/Inc/DirectXColors.h b/src/thirdparty/DirectXMath-dec2022/Inc/DirectXColors.h new file mode 100644 index 000000000..83fa21093 --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/Inc/DirectXColors.h @@ -0,0 +1,312 @@ +//------------------------------------------------------------------------------------- +// DirectXColors.h -- C++ Color Math library +// +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
+// +// http://go.microsoft.com/fwlink/?LinkID=615560 +//------------------------------------------------------------------------------------- + +#pragma once + +#include "DirectXMath.h" + +namespace DirectX +{ + + namespace Colors + { + // Standard colors (Red/Green/Blue/Alpha) in sRGB colorspace + XMGLOBALCONST XMVECTORF32 AliceBlue = { { { 0.941176534f, 0.972549081f, 1.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 AntiqueWhite = { { { 0.980392218f, 0.921568692f, 0.843137324f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Aqua = { { { 0.f, 1.f, 1.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Aquamarine = { { { 0.498039246f, 1.f, 0.831372619f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Azure = { { { 0.941176534f, 1.f, 1.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Beige = { { { 0.960784376f, 0.960784376f, 0.862745166f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Bisque = { { { 1.f, 0.894117713f, 0.768627524f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Black = { { { 0.f, 0.f, 0.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 BlanchedAlmond = { { { 1.f, 0.921568692f, 0.803921640f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Blue = { { { 0.f, 0.f, 1.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 BlueViolet = { { { 0.541176498f, 0.168627456f, 0.886274576f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Brown = { { { 0.647058845f, 0.164705887f, 0.164705887f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 BurlyWood = { { { 0.870588303f, 0.721568644f, 0.529411793f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 CadetBlue = { { { 0.372549027f, 0.619607866f, 0.627451003f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Chartreuse = { { { 0.498039246f, 1.f, 0.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Chocolate = { { { 0.823529482f, 0.411764741f, 0.117647067f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Coral = { { { 1.f, 0.498039246f, 0.313725501f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 CornflowerBlue = { { { 0.392156899f, 0.584313750f, 0.929411829f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Cornsilk = { { { 1.f, 0.972549081f, 0.862745166f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Crimson = { { { 0.862745166f, 0.078431375f, 0.235294133f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Cyan = { { { 0.f, 1.f, 1.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DarkBlue = { { { 0.f, 0.f, 0.545098066f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DarkCyan = { { { 0.f, 0.545098066f, 0.545098066f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DarkGoldenrod = { { { 0.721568644f, 0.525490224f, 0.043137256f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DarkGray = { { { 0.662745118f, 0.662745118f, 0.662745118f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DarkGreen = { { { 0.f, 0.392156899f, 0.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DarkKhaki = { { { 0.741176486f, 0.717647076f, 0.419607878f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DarkMagenta = { { { 0.545098066f, 0.f, 0.545098066f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DarkOliveGreen = { { { 0.333333343f, 0.419607878f, 0.184313729f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DarkOrange = { { { 1.f, 0.549019635f, 0.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DarkOrchid = { { { 0.600000024f, 0.196078449f, 0.800000072f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DarkRed = { { { 0.545098066f, 0.f, 0.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DarkSalmon = { { { 0.913725555f, 0.588235319f, 0.478431404f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DarkSeaGreen = { { { 0.560784340f, 0.737254918f, 0.545098066f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DarkSlateBlue = { { { 0.282352954f, 0.239215702f, 0.545098066f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DarkSlateGray = { { { 0.184313729f, 0.309803933f, 
0.309803933f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DarkTurquoise = { { { 0.f, 0.807843208f, 0.819607913f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DarkViolet = { { { 0.580392182f, 0.f, 0.827451050f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DeepPink = { { { 1.f, 0.078431375f, 0.576470613f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DeepSkyBlue = { { { 0.f, 0.749019623f, 1.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DimGray = { { { 0.411764741f, 0.411764741f, 0.411764741f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DodgerBlue = { { { 0.117647067f, 0.564705908f, 1.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Firebrick = { { { 0.698039234f, 0.133333340f, 0.133333340f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 FloralWhite = { { { 1.f, 0.980392218f, 0.941176534f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 ForestGreen = { { { 0.133333340f, 0.545098066f, 0.133333340f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Fuchsia = { { { 1.f, 0.f, 1.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Gainsboro = { { { 0.862745166f, 0.862745166f, 0.862745166f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 GhostWhite = { { { 0.972549081f, 0.972549081f, 1.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Gold = { { { 1.f, 0.843137324f, 0.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Goldenrod = { { { 0.854902029f, 0.647058845f, 0.125490203f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Gray = { { { 0.501960814f, 0.501960814f, 0.501960814f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Green = { { { 0.f, 0.501960814f, 0.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 GreenYellow = { { { 0.678431392f, 1.f, 0.184313729f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Honeydew = { { { 0.941176534f, 1.f, 0.941176534f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 HotPink = { { { 1.f, 0.411764741f, 0.705882370f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 IndianRed = { { { 0.803921640f, 0.360784322f, 0.360784322f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Indigo = { { { 0.294117659f, 0.f, 0.509803951f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Ivory = { { { 1.f, 1.f, 0.941176534f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Khaki = { { { 0.941176534f, 0.901960850f, 0.549019635f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Lavender = { { { 0.901960850f, 0.901960850f, 0.980392218f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 LavenderBlush = { { { 1.f, 0.941176534f, 0.960784376f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 LawnGreen = { { { 0.486274540f, 0.988235354f, 0.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 LemonChiffon = { { { 1.f, 0.980392218f, 0.803921640f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 LightBlue = { { { 0.678431392f, 0.847058892f, 0.901960850f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 LightCoral = { { { 0.941176534f, 0.501960814f, 0.501960814f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 LightCyan = { { { 0.878431439f, 1.f, 1.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 LightGoldenrodYellow = { { { 0.980392218f, 0.980392218f, 0.823529482f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 LightGray = { { { 0.827451050f, 0.827451050f, 0.827451050f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 LightGreen = { { { 0.564705908f, 0.933333397f, 0.564705908f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 LightPink = { { { 1.f, 0.713725507f, 0.756862819f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 LightSalmon = { { { 1.f, 0.627451003f, 0.478431404f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 LightSeaGreen = { { { 0.125490203f, 0.698039234f, 0.666666687f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 LightSkyBlue = { { { 0.529411793f, 0.807843208f, 0.980392218f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 LightSlateGray = { { { 0.466666698f, 0.533333361f, 0.600000024f, 1.f } } }; + 
XMGLOBALCONST XMVECTORF32 LightSteelBlue = { { { 0.690196097f, 0.768627524f, 0.870588303f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 LightYellow = { { { 1.f, 1.f, 0.878431439f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Lime = { { { 0.f, 1.f, 0.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 LimeGreen = { { { 0.196078449f, 0.803921640f, 0.196078449f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Linen = { { { 0.980392218f, 0.941176534f, 0.901960850f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Magenta = { { { 1.f, 0.f, 1.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Maroon = { { { 0.501960814f, 0.f, 0.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 MediumAquamarine = { { { 0.400000036f, 0.803921640f, 0.666666687f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 MediumBlue = { { { 0.f, 0.f, 0.803921640f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 MediumOrchid = { { { 0.729411781f, 0.333333343f, 0.827451050f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 MediumPurple = { { { 0.576470613f, 0.439215720f, 0.858823597f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 MediumSeaGreen = { { { 0.235294133f, 0.701960802f, 0.443137288f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 MediumSlateBlue = { { { 0.482352972f, 0.407843173f, 0.933333397f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 MediumSpringGreen = { { { 0.f, 0.980392218f, 0.603921592f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 MediumTurquoise = { { { 0.282352954f, 0.819607913f, 0.800000072f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 MediumVioletRed = { { { 0.780392230f, 0.082352944f, 0.521568656f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 MidnightBlue = { { { 0.098039225f, 0.098039225f, 0.439215720f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 MintCream = { { { 0.960784376f, 1.f, 0.980392218f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 MistyRose = { { { 1.f, 0.894117713f, 0.882353008f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Moccasin = { { { 1.f, 0.894117713f, 0.709803939f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 NavajoWhite = { { { 1.f, 0.870588303f, 0.678431392f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Navy = { { { 0.f, 0.f, 0.501960814f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 OldLace = { { { 0.992156923f, 0.960784376f, 0.901960850f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Olive = { { { 0.501960814f, 0.501960814f, 0.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 OliveDrab = { { { 0.419607878f, 0.556862772f, 0.137254909f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Orange = { { { 1.f, 0.647058845f, 0.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 OrangeRed = { { { 1.f, 0.270588249f, 0.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Orchid = { { { 0.854902029f, 0.439215720f, 0.839215755f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 PaleGoldenrod = { { { 0.933333397f, 0.909803987f, 0.666666687f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 PaleGreen = { { { 0.596078455f, 0.984313786f, 0.596078455f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 PaleTurquoise = { { { 0.686274529f, 0.933333397f, 0.933333397f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 PaleVioletRed = { { { 0.858823597f, 0.439215720f, 0.576470613f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 PapayaWhip = { { { 1.f, 0.937254965f, 0.835294187f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 PeachPuff = { { { 1.f, 0.854902029f, 0.725490212f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Peru = { { { 0.803921640f, 0.521568656f, 0.247058839f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Pink = { { { 1.f, 0.752941251f, 0.796078503f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Plum = { { { 0.866666734f, 0.627451003f, 0.866666734f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 PowderBlue = { { { 0.690196097f, 0.878431439f, 0.901960850f, 1.f } } }; + 
XMGLOBALCONST XMVECTORF32 Purple = { { { 0.501960814f, 0.f, 0.501960814f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Red = { { { 1.f, 0.f, 0.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 RosyBrown = { { { 0.737254918f, 0.560784340f, 0.560784340f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 RoyalBlue = { { { 0.254901975f, 0.411764741f, 0.882353008f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 SaddleBrown = { { { 0.545098066f, 0.270588249f, 0.074509807f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Salmon = { { { 0.980392218f, 0.501960814f, 0.447058856f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 SandyBrown = { { { 0.956862807f, 0.643137276f, 0.376470625f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 SeaGreen = { { { 0.180392161f, 0.545098066f, 0.341176480f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 SeaShell = { { { 1.f, 0.960784376f, 0.933333397f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Sienna = { { { 0.627451003f, 0.321568638f, 0.176470593f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Silver = { { { 0.752941251f, 0.752941251f, 0.752941251f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 SkyBlue = { { { 0.529411793f, 0.807843208f, 0.921568692f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 SlateBlue = { { { 0.415686309f, 0.352941185f, 0.803921640f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 SlateGray = { { { 0.439215720f, 0.501960814f, 0.564705908f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Snow = { { { 1.f, 0.980392218f, 0.980392218f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 SpringGreen = { { { 0.f, 1.f, 0.498039246f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 SteelBlue = { { { 0.274509817f, 0.509803951f, 0.705882370f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Tan = { { { 0.823529482f, 0.705882370f, 0.549019635f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Teal = { { { 0.f, 0.501960814f, 0.501960814f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Thistle = { { { 0.847058892f, 0.749019623f, 0.847058892f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Tomato = { { { 1.f, 0.388235331f, 0.278431386f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Transparent = { { { 0.f, 0.f, 0.f, 0.f } } }; + XMGLOBALCONST XMVECTORF32 Turquoise = { { { 0.250980407f, 0.878431439f, 0.815686345f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Violet = { { { 0.933333397f, 0.509803951f, 0.933333397f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Wheat = { { { 0.960784376f, 0.870588303f, 0.701960802f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 White = { { { 1.f, 1.f, 1.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 WhiteSmoke = { { { 0.960784376f, 0.960784376f, 0.960784376f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Yellow = { { { 1.f, 1.f, 0.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 YellowGreen = { { { 0.603921592f, 0.803921640f, 0.196078449f, 1.f } } }; + + } // namespace Colors + + namespace ColorsLinear + { + // Standard colors (Red/Green/Blue/Alpha) in linear colorspace + XMGLOBALCONST XMVECTORF32 AliceBlue = { { { 0.871367335f, 0.938685894f, 1.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 AntiqueWhite = { { { 0.955973506f, 0.830770075f, 0.679542601f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Aqua = { { { 0.f, 1.f, 1.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Aquamarine = { { { 0.212230787f, 1.f, 0.658374965f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Azure = { { { 0.871367335f, 1.f, 1.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Beige = { { { 0.913098991f, 0.913098991f, 0.715693772f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Bisque = { { { 1.f, 0.775822461f, 0.552011609f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Black = { { { 0.f, 0.f, 0.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 BlanchedAlmond = { { { 1.f, 0.830770075f, 0.610495746f, 1.f } } }; + 
XMGLOBALCONST XMVECTORF32 Blue = { { { 0.f, 0.f, 1.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 BlueViolet = { { { 0.254152179f, 0.024157630f, 0.760524750f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Brown = { { { 0.376262218f, 0.023153365f, 0.023153365f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 BurlyWood = { { { 0.730461001f, 0.479320228f, 0.242281199f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 CadetBlue = { { { 0.114435382f, 0.341914445f, 0.351532698f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Chartreuse = { { { 0.212230787f, 1.f, 0.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Chocolate = { { { 0.644479871f, 0.141263321f, 0.012983031f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Coral = { { { 1.f, 0.212230787f, 0.080219828f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 CornflowerBlue = { { { 0.127437726f, 0.300543845f, 0.846873462f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Cornsilk = { { { 1.f, 0.938685894f, 0.715693772f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Crimson = { { { 0.715693772f, 0.006995410f, 0.045186214f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Cyan = { { { 0.f, 1.f, 1.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DarkBlue = { { { 0.f, 0.f, 0.258182913f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DarkCyan = { { { 0.f, 0.258182913f, 0.258182913f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DarkGoldenrod = { { { 0.479320228f, 0.238397658f, 0.003346536f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DarkGray = { { { 0.396755308f, 0.396755308f, 0.396755308f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DarkGreen = { { { 0.f, 0.127437726f, 0.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DarkKhaki = { { { 0.508881450f, 0.473531544f, 0.147027299f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DarkMagenta = { { { 0.258182913f, 0.f, 0.258182913f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DarkOliveGreen = { { { 0.090841733f, 0.147027299f, 0.028426038f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DarkOrange = { { { 1.f, 0.262250721f, 0.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DarkOrchid = { { { 0.318546832f, 0.031896040f, 0.603827536f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DarkRed = { { { 0.258182913f, 0.f, 0.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DarkSalmon = { { { 0.814846814f, 0.304987371f, 0.194617867f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DarkSeaGreen = { { { 0.274677366f, 0.502886593f, 0.258182913f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DarkSlateBlue = { { { 0.064803280f, 0.046665095f, 0.258182913f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DarkSlateGray = { { { 0.028426038f, 0.078187428f, 0.078187428f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DarkTurquoise = { { { 0.f, 0.617206752f, 0.637597024f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DarkViolet = { { { 0.296138316f, 0.f, 0.651405811f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DeepPink = { { { 1.f, 0.006995410f, 0.291770697f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DeepSkyBlue = { { { 0.f, 0.520995677f, 1.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DimGray = { { { 0.141263321f, 0.141263321f, 0.141263321f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DodgerBlue = { { { 0.012983031f, 0.278894335f, 1.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Firebrick = { { { 0.445201248f, 0.015996292f, 0.015996292f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 FloralWhite = { { { 1.f, 0.955973506f, 0.871367335f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 ForestGreen = { { { 0.015996292f, 0.258182913f, 0.015996292f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Fuchsia = { { { 1.f, 0.f, 1.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Gainsboro = { { { 0.715693772f, 0.715693772f, 0.715693772f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 GhostWhite = { { { 
0.938685894f, 0.938685894f, 1.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Gold = { { { 1.f, 0.679542601f, 0.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Goldenrod = { { { 0.701102138f, 0.376262218f, 0.014443844f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Gray = { { { 0.215860531f, 0.215860531f, 0.215860531f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Green = { { { 0.f, 0.215860531f, 0.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 GreenYellow = { { { 0.417885154f, 1.f, 0.028426038f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Honeydew = { { { 0.871367335f, 1.f, 0.871367335f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 HotPink = { { { 1.f, 0.141263321f, 0.456411064f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 IndianRed = { { { 0.610495746f, 0.107023112f, 0.107023112f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Indigo = { { { 0.070360109f, 0.f, 0.223227978f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Ivory = { { { 1.f, 1.f, 0.871367335f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Khaki = { { { 0.871367335f, 0.791298151f, 0.262250721f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Lavender = { { { 0.791298151f, 0.791298151f, 0.955973506f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 LavenderBlush = { { { 1.f, 0.871367335f, 0.913098991f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 LawnGreen = { { { 0.201556295f, 0.973445475f, 0.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 LemonChiffon = { { { 1.f, 0.955973506f, 0.610495746f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 LightBlue = { { { 0.417885154f, 0.686685443f, 0.791298151f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 LightCoral = { { { 0.871367335f, 0.215860531f, 0.215860531f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 LightCyan = { { { 0.745404482f, 1.f, 1.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 LightGoldenrodYellow = { { { 0.955973506f, 0.955973506f, 0.644479871f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 LightGray = { { { 0.651405811f, 0.651405811f, 0.651405811f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 LightGreen = { { { 0.278894335f, 0.854992807f, 0.278894335f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 LightPink = { { { 1.f, 0.467783839f, 0.533276618f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 LightSalmon = { { { 1.f, 0.351532698f, 0.194617867f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 LightSeaGreen = { { { 0.014443844f, 0.445201248f, 0.401977867f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 LightSkyBlue = { { { 0.242281199f, 0.617206752f, 0.955973506f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 LightSlateGray = { { { 0.184475034f, 0.246201396f, 0.318546832f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 LightSteelBlue = { { { 0.434153706f, 0.552011609f, 0.730461001f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 LightYellow = { { { 1.f, 1.f, 0.745404482f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Lime = { { { 0.f, 1.f, 0.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 LimeGreen = { { { 0.031896040f, 0.610495746f, 0.031896040f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Linen = { { { 0.955973506f, 0.871367335f, 0.791298151f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Magenta = { { { 1.f, 0.f, 1.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Maroon = { { { 0.215860531f, 0.f, 0.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 MediumAquamarine = { { { 0.132868364f, 0.610495746f, 0.401977867f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 MediumBlue = { { { 0.f, 0.f, 0.610495746f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 MediumOrchid = { { { 0.491020888f, 0.090841733f, 0.651405811f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 MediumPurple = { { { 0.291770697f, 0.162029430f, 0.708376050f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 MediumSeaGreen = { { { 0.045186214f, 0.450785846f, 0.165132239f, 
1.f } } }; + XMGLOBALCONST XMVECTORF32 MediumSlateBlue = { { { 0.198069349f, 0.138431653f, 0.854992807f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 MediumSpringGreen = { { { 0.f, 0.955973506f, 0.323143244f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 MediumTurquoise = { { { 0.064803280f, 0.637597024f, 0.603827536f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 MediumVioletRed = { { { 0.571125031f, 0.007499032f, 0.234550655f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 MidnightBlue = { { { 0.009721218f, 0.009721218f, 0.162029430f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 MintCream = { { { 0.913098991f, 1.f, 0.955973506f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 MistyRose = { { { 1.f, 0.775822461f, 0.752942443f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Moccasin = { { { 1.f, 0.775822461f, 0.462077051f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 NavajoWhite = { { { 1.f, 0.730461001f, 0.417885154f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Navy = { { { 0.f, 0.f, 0.215860531f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 OldLace = { { { 0.982250869f, 0.913098991f, 0.791298151f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Olive = { { { 0.215860531f, 0.215860531f, 0.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 OliveDrab = { { { 0.147027299f, 0.270497859f, 0.016807375f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Orange = { { { 1.f, 0.376262218f, 0.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 OrangeRed = { { { 1.f, 0.059511241f, 0.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Orchid = { { { 0.701102138f, 0.162029430f, 0.672443330f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 PaleGoldenrod = { { { 0.854992807f, 0.806952477f, 0.401977867f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 PaleGreen = { { { 0.313988745f, 0.964686573f, 0.313988745f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 PaleTurquoise = { { { 0.428690553f, 0.854992807f, 0.854992807f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 PaleVioletRed = { { { 0.708376050f, 0.162029430f, 0.291770697f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 PapayaWhip = { { { 1.f, 0.863157392f, 0.665387452f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 PeachPuff = { { { 1.f, 0.701102138f, 0.485149980f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Peru = { { { 0.610495746f, 0.234550655f, 0.049706575f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Pink = { { { 1.f, 0.527115345f, 0.597202003f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Plum = { { { 0.723055363f, 0.351532698f, 0.723055363f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 PowderBlue = { { { 0.434153706f, 0.745404482f, 0.791298151f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Purple = { { { 0.215860531f, 0.f, 0.215860531f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Red = { { { 1.f, 0.f, 0.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 RosyBrown = { { { 0.502886593f, 0.274677366f, 0.274677366f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 RoyalBlue = { { { 0.052860655f, 0.141263321f, 0.752942443f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 SaddleBrown = { { { 0.258182913f, 0.059511241f, 0.006512091f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Salmon = { { { 0.955973506f, 0.215860531f, 0.168269455f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 SandyBrown = { { { 0.904661357f, 0.371237785f, 0.116970696f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 SeaGreen = { { { 0.027320892f, 0.258182913f, 0.095307484f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 SeaShell = { { { 1.f, 0.913098991f, 0.854992807f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Sienna = { { { 0.351532698f, 0.084376216f, 0.026241222f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Silver = { { { 0.527115345f, 0.527115345f, 0.527115345f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 SkyBlue = { { { 0.242281199f, 
0.617206752f, 0.830770075f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 SlateBlue = { { { 0.144128501f, 0.102241747f, 0.610495746f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 SlateGray = { { { 0.162029430f, 0.215860531f, 0.278894335f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Snow = { { { 1.f, 0.955973506f, 0.955973506f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 SpringGreen = { { { 0.f, 1.f, 0.212230787f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 SteelBlue = { { { 0.061246071f, 0.223227978f, 0.456411064f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Tan = { { { 0.644479871f, 0.456411064f, 0.262250721f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Teal = { { { 0.f, 0.215860531f, 0.215860531f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Thistle = { { { 0.686685443f, 0.520995677f, 0.686685443f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Tomato = { { { 1.f, 0.124771863f, 0.063010029f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Transparent = { { { 0.f, 0.f, 0.f, 0.f } } }; + XMGLOBALCONST XMVECTORF32 Turquoise = { { { 0.051269468f, 0.745404482f, 0.630757332f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Violet = { { { 0.854992807f, 0.223227978f, 0.854992807f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Wheat = { { { 0.913098991f, 0.730461001f, 0.450785846f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 White = { { { 1.f, 1.f, 1.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 WhiteSmoke = { { { 0.913098991f, 0.913098991f, 0.913098991f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Yellow = { { { 1.f, 1.f, 0.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 YellowGreen = { { { 0.323143244f, 0.610495746f, 0.031896040f, 1.f } } }; + + } // namespace ColorsLinear + +} // namespace DirectX + diff --git a/src/thirdparty/DirectXMath-dec2022/Inc/DirectXMath.h b/src/thirdparty/DirectXMath-dec2022/Inc/DirectXMath.h new file mode 100644 index 000000000..593aead5b --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/Inc/DirectXMath.h @@ -0,0 +1,2280 @@ +//------------------------------------------------------------------------------------- +// DirectXMath.h -- SIMD C++ Math library +// +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. +// +// http://go.microsoft.com/fwlink/?LinkID=615560 +//------------------------------------------------------------------------------------- + +#pragma once + +#ifndef __cplusplus +#error DirectX Math requires C++ +#endif + +#define DIRECTX_MATH_VERSION 318 + +#if defined(_MSC_VER) && (_MSC_VER < 1910) +#error DirectX Math requires Visual C++ 2017 or later. 
+#endif + +#if defined(_MSC_VER) && !defined(_M_ARM) && !defined(_M_ARM64) && !defined(_M_HYBRID_X86_ARM64) && !defined(_M_ARM64EC) && (!_MANAGED) && (!_M_CEE) && (!defined(_M_IX86_FP) || (_M_IX86_FP > 1)) && !defined(_XM_NO_INTRINSICS_) && !defined(_XM_VECTORCALL_) +#define _XM_VECTORCALL_ 1 +#endif + +#if _XM_VECTORCALL_ +#define XM_CALLCONV __vectorcall +#elif defined(__GNUC__) +#define XM_CALLCONV +#else +#define XM_CALLCONV __fastcall +#endif + +#ifndef XM_DEPRECATED +#ifdef __GNUC__ +#define XM_DEPRECATED __attribute__ ((deprecated)) +#else +#define XM_DEPRECATED __declspec(deprecated("This is deprecated and will be removed in a future version.")) +#endif +#endif + +#if !defined(_XM_AVX2_INTRINSICS_) && defined(__AVX2__) && !defined(_XM_NO_INTRINSICS_) +#define _XM_AVX2_INTRINSICS_ +#endif + +#if !defined(_XM_FMA3_INTRINSICS_) && defined(_XM_AVX2_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) +#define _XM_FMA3_INTRINSICS_ +#endif + +#if !defined(_XM_F16C_INTRINSICS_) && defined(_XM_AVX2_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) +#define _XM_F16C_INTRINSICS_ +#endif + +#if !defined(_XM_F16C_INTRINSICS_) && defined(__F16C__) && !defined(_XM_NO_INTRINSICS_) +#define _XM_F16C_INTRINSICS_ +#endif + +#if defined(_XM_FMA3_INTRINSICS_) && !defined(_XM_AVX_INTRINSICS_) +#define _XM_AVX_INTRINSICS_ +#endif + +#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_AVX_INTRINSICS_) +#define _XM_AVX_INTRINSICS_ +#endif + +#if !defined(_XM_AVX_INTRINSICS_) && defined(__AVX__) && !defined(_XM_NO_INTRINSICS_) +#define _XM_AVX_INTRINSICS_ +#endif + +#if defined(_XM_AVX_INTRINSICS_) && !defined(_XM_SSE4_INTRINSICS_) +#define _XM_SSE4_INTRINSICS_ +#endif + +#if defined(_XM_SSE4_INTRINSICS_) && !defined(_XM_SSE3_INTRINSICS_) +#define _XM_SSE3_INTRINSICS_ +#endif + +#if defined(_XM_SSE3_INTRINSICS_) && !defined(_XM_SSE_INTRINSICS_) +#define _XM_SSE_INTRINSICS_ +#endif + +#if !defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) +#if (defined(_M_IX86) || defined(_M_X64) || __i386__ || __x86_64__) && !defined(_M_HYBRID_X86_ARM64) && !defined(_M_ARM64EC) +#define _XM_SSE_INTRINSICS_ +#elif defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __arm__ || __aarch64__ +#define _XM_ARM_NEON_INTRINSICS_ +#elif !defined(_XM_NO_INTRINSICS_) +#error DirectX Math does not support this target +#endif +#endif // !_XM_ARM_NEON_INTRINSICS_ && !_XM_SSE_INTRINSICS_ && !_XM_NO_INTRINSICS_ + +#if defined(_XM_SSE_INTRINSICS_) && defined(_MSC_VER) && (_MSC_VER >= 1920) && !defined(__clang__) && !defined(_XM_SVML_INTRINSICS_) && !defined(_XM_DISABLE_INTEL_SVML_) +#define _XM_SVML_INTRINSICS_ +#endif + +#if !defined(_XM_NO_XMVECTOR_OVERLOADS_) && (defined(__clang__) || defined(__GNUC__)) && !defined(_XM_NO_INTRINSICS_) +#define _XM_NO_XMVECTOR_OVERLOADS_ +#endif + +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable:4514 4820) +// C4514/4820: Off by default noise +#endif +#include <math.h> +#include <float.h> +#ifdef _MSC_VER +#pragma warning(pop) +#endif + +#ifndef _XM_NO_INTRINSICS_ + +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4987) +// C4987: Off by default noise +#include <intrin.h> +#pragma warning(pop) +#endif + +#if (defined(__clang__) || defined(__GNUC__)) && (__x86_64__ || __i386__) +#include <cpuid.h> +#endif + +#ifdef _XM_SSE_INTRINSICS_ +#include <xmmintrin.h> +#include <emmintrin.h> + +#ifdef _XM_SSE3_INTRINSICS_ +#include <pmmintrin.h> +#endif + +#ifdef _XM_SSE4_INTRINSICS_ +#include <smmintrin.h> +#endif + +#ifdef _XM_AVX_INTRINSICS_ +#include <immintrin.h> +#endif + +#elif 
defined(_XM_ARM_NEON_INTRINSICS_) +#if defined(_MSC_VER) && !defined(__clang__) && (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC)) +#include <arm64_neon.h> +#else +#include <arm_neon.h> +#endif +#endif +#endif // !_XM_NO_INTRINSICS_ + +#include "sal.h" +#include <assert.h> + +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4005 4668) +// C4005/4668: Old header issue +#endif +#include <stdint.h> +#ifdef _MSC_VER +#pragma warning(pop) +#endif + +#if __cplusplus >= 201703L +#define XM_ALIGNED_DATA(x) alignas(x) +#define XM_ALIGNED_STRUCT(x) struct alignas(x) +#elif defined(__GNUC__) +#define XM_ALIGNED_DATA(x) __attribute__ ((aligned(x))) +#define XM_ALIGNED_STRUCT(x) struct __attribute__ ((aligned(x))) +#else +#define XM_ALIGNED_DATA(x) __declspec(align(x)) +#define XM_ALIGNED_STRUCT(x) __declspec(align(x)) struct +#endif + +#if (__cplusplus >= 202002L) +#include <compare> +#endif + +/**************************************************************************** + * + * Conditional intrinsics + * + ****************************************************************************/ + +#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + +#if defined(_XM_NO_MOVNT_) +#define XM_STREAM_PS( p, a ) _mm_store_ps((p), (a)) +#define XM256_STREAM_PS( p, a ) _mm256_store_ps((p), (a)) +#define XM_SFENCE() +#else +#define XM_STREAM_PS( p, a ) _mm_stream_ps((p), (a)) +#define XM256_STREAM_PS( p, a ) _mm256_stream_ps((p), (a)) +#define XM_SFENCE() _mm_sfence() +#endif + +#if defined(_XM_FMA3_INTRINSICS_) +#define XM_FMADD_PS( a, b, c ) _mm_fmadd_ps((a), (b), (c)) +#define XM_FNMADD_PS( a, b, c ) _mm_fnmadd_ps((a), (b), (c)) +#else +#define XM_FMADD_PS( a, b, c ) _mm_add_ps(_mm_mul_ps((a), (b)), (c)) +#define XM_FNMADD_PS( a, b, c ) _mm_sub_ps((c), _mm_mul_ps((a), (b))) +#endif + +#if defined(_XM_AVX_INTRINSICS_) && defined(_XM_FAVOR_INTEL_) +#define XM_PERMUTE_PS( v, c ) _mm_permute_ps((v), c ) +#else +#define XM_PERMUTE_PS( v, c ) _mm_shuffle_ps((v), (v), c ) +#endif + +#if defined(__GNUC__) && !defined(__clang__) && (__GNUC__ < 11) +#define XM_LOADU_SI16( p ) _mm_cvtsi32_si128(*reinterpret_cast<unsigned short const*>(p)) +#else +#define XM_LOADU_SI16( p ) _mm_loadu_si16(p) +#endif + +#endif // _XM_SSE_INTRINSICS_ && !_XM_NO_INTRINSICS_ + +#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + +#if defined(__clang__) || defined(__GNUC__) +#define XM_PREFETCH( a ) __builtin_prefetch(a) +#elif defined(_MSC_VER) +#define XM_PREFETCH( a ) __prefetch(a) +#else +#define XM_PREFETCH( a ) +#endif + +#endif // _XM_ARM_NEON_INTRINSICS_ && !_XM_NO_INTRINSICS_ + +namespace DirectX +{ + + /**************************************************************************** + * + * Constant definitions + * + ****************************************************************************/ + +#if defined(__XNAMATH_H__) && defined(XM_PI) +#undef XM_PI +#undef XM_2PI +#undef XM_1DIVPI +#undef XM_1DIV2PI +#undef XM_PIDIV2 +#undef XM_PIDIV4 +#undef XM_SELECT_0 +#undef XM_SELECT_1 +#undef XM_PERMUTE_0X +#undef XM_PERMUTE_0Y +#undef XM_PERMUTE_0Z +#undef XM_PERMUTE_0W +#undef XM_PERMUTE_1X +#undef XM_PERMUTE_1Y +#undef XM_PERMUTE_1Z +#undef XM_PERMUTE_1W +#undef XM_CRMASK_CR6 +#undef XM_CRMASK_CR6TRUE +#undef XM_CRMASK_CR6FALSE +#undef XM_CRMASK_CR6BOUNDS +#undef XM_CACHE_LINE_SIZE +#endif + + constexpr float XM_PI = 3.141592654f; + constexpr float XM_2PI = 6.283185307f; + constexpr float XM_1DIVPI = 0.318309886f; + constexpr float XM_1DIV2PI = 0.159154943f; + constexpr float XM_PIDIV2 = 1.570796327f; + constexpr float XM_PIDIV4 = 0.785398163f; + + 
constexpr uint32_t XM_SELECT_0 = 0x00000000; + constexpr uint32_t XM_SELECT_1 = 0xFFFFFFFF; + + constexpr uint32_t XM_PERMUTE_0X = 0; + constexpr uint32_t XM_PERMUTE_0Y = 1; + constexpr uint32_t XM_PERMUTE_0Z = 2; + constexpr uint32_t XM_PERMUTE_0W = 3; + constexpr uint32_t XM_PERMUTE_1X = 4; + constexpr uint32_t XM_PERMUTE_1Y = 5; + constexpr uint32_t XM_PERMUTE_1Z = 6; + constexpr uint32_t XM_PERMUTE_1W = 7; + + constexpr uint32_t XM_SWIZZLE_X = 0; + constexpr uint32_t XM_SWIZZLE_Y = 1; + constexpr uint32_t XM_SWIZZLE_Z = 2; + constexpr uint32_t XM_SWIZZLE_W = 3; + + constexpr uint32_t XM_CRMASK_CR6 = 0x000000F0; + constexpr uint32_t XM_CRMASK_CR6TRUE = 0x00000080; + constexpr uint32_t XM_CRMASK_CR6FALSE = 0x00000020; + constexpr uint32_t XM_CRMASK_CR6BOUNDS = XM_CRMASK_CR6FALSE; + +#if defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __arm__ || __aarch64__ + constexpr size_t XM_CACHE_LINE_SIZE = 128; +#else + constexpr size_t XM_CACHE_LINE_SIZE = 64; +#endif + + + /**************************************************************************** + * + * Macros + * + ****************************************************************************/ + +#if defined(__XNAMATH_H__) && defined(XMComparisonAllTrue) +#undef XMComparisonAllTrue +#undef XMComparisonAnyTrue +#undef XMComparisonAllFalse +#undef XMComparisonAnyFalse +#undef XMComparisonMixed +#undef XMComparisonAllInBounds +#undef XMComparisonAnyOutOfBounds +#endif + + // Unit conversion + + constexpr float XMConvertToRadians(float fDegrees) noexcept { return fDegrees * (XM_PI / 180.0f); } + constexpr float XMConvertToDegrees(float fRadians) noexcept { return fRadians * (180.0f / XM_PI); } + + // Condition register evaluation proceeding a recording (R) comparison + + constexpr bool XMComparisonAllTrue(uint32_t CR) noexcept { return (CR & XM_CRMASK_CR6TRUE) == XM_CRMASK_CR6TRUE; } + constexpr bool XMComparisonAnyTrue(uint32_t CR) noexcept { return (CR & XM_CRMASK_CR6FALSE) != XM_CRMASK_CR6FALSE; } + constexpr bool XMComparisonAllFalse(uint32_t CR) noexcept { return (CR & XM_CRMASK_CR6FALSE) == XM_CRMASK_CR6FALSE; } + constexpr bool XMComparisonAnyFalse(uint32_t CR) noexcept { return (CR & XM_CRMASK_CR6TRUE) != XM_CRMASK_CR6TRUE; } + constexpr bool XMComparisonMixed(uint32_t CR) noexcept { return (CR & XM_CRMASK_CR6) == 0; } + constexpr bool XMComparisonAllInBounds(uint32_t CR) noexcept { return (CR & XM_CRMASK_CR6BOUNDS) == XM_CRMASK_CR6BOUNDS; } + constexpr bool XMComparisonAnyOutOfBounds(uint32_t CR) noexcept { return (CR & XM_CRMASK_CR6BOUNDS) != XM_CRMASK_CR6BOUNDS; } + + + /**************************************************************************** + * + * Data types + * + ****************************************************************************/ + +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable:4068 4201 4365 4324 4820) + // C4068: ignore unknown pragmas + // C4201: nonstandard extension used : nameless struct/union + // C4365: Off by default noise + // C4324/4820: padding warnings +#endif + +#ifdef _PREFAST_ +#pragma prefast(push) +#pragma prefast(disable : 25000, "FXMVECTOR is 16 bytes") +#endif + +//------------------------------------------------------------------------------ +#if defined(_XM_NO_INTRINSICS_) + struct __vector4 + { + union + { + float vector4_f32[4]; + uint32_t vector4_u32[4]; + }; + }; +#endif // _XM_NO_INTRINSICS_ + + //------------------------------------------------------------------------------ + // Vector intrinsic: Four 32 bit floating point 
components aligned on a 16 byte + // boundary and mapped to hardware vector registers +#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + using XMVECTOR = __m128; +#elif defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + using XMVECTOR = float32x4_t; +#else + using XMVECTOR = __vector4; +#endif + + // Fix-up for (1st-3rd) XMVECTOR parameters that are pass-in-register for x86, ARM, ARM64, and vector call; by reference otherwise +#if ( defined(_M_IX86) || defined(_M_ARM) || defined(_M_ARM64) || _XM_VECTORCALL_ || __i386__ || __arm__ || __aarch64__ ) && !defined(_XM_NO_INTRINSICS_) + typedef const XMVECTOR FXMVECTOR; +#else + typedef const XMVECTOR& FXMVECTOR; +#endif + + // Fix-up for (4th) XMVECTOR parameter to pass in-register for ARM, ARM64, and vector call; by reference otherwise +#if ( defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || _XM_VECTORCALL_ || __arm__ || __aarch64__ ) && !defined(_XM_NO_INTRINSICS_) + typedef const XMVECTOR GXMVECTOR; +#else + typedef const XMVECTOR& GXMVECTOR; +#endif + + // Fix-up for (5th & 6th) XMVECTOR parameter to pass in-register for ARM64 and vector call; by reference otherwise +#if ( defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || _XM_VECTORCALL_ || __aarch64__ ) && !defined(_XM_NO_INTRINSICS_) + typedef const XMVECTOR HXMVECTOR; +#else + typedef const XMVECTOR& HXMVECTOR; +#endif + + // Fix-up for (7th+) XMVECTOR parameters to pass by reference + typedef const XMVECTOR& CXMVECTOR; + + //------------------------------------------------------------------------------ + // Conversion types for constants + XM_ALIGNED_STRUCT(16) XMVECTORF32 + { + union + { + float f[4]; + XMVECTOR v; + }; + + inline operator XMVECTOR() const noexcept { return v; } + inline operator const float* () const noexcept { return f; } +#ifdef _XM_NO_INTRINSICS_ +#elif defined(_XM_SSE_INTRINSICS_) + inline operator __m128i() const noexcept { return _mm_castps_si128(v); } + inline operator __m128d() const noexcept { return _mm_castps_pd(v); } +#elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(__GNUC__) || defined(_ARM64_DISTINCT_NEON_TYPES)) + inline operator int32x4_t() const noexcept { return vreinterpretq_s32_f32(v); } + inline operator uint32x4_t() const noexcept { return vreinterpretq_u32_f32(v); } +#endif + }; + + XM_ALIGNED_STRUCT(16) XMVECTORI32 + { + union + { + int32_t i[4]; + XMVECTOR v; + }; + + inline operator XMVECTOR() const noexcept { return v; } +#ifdef _XM_NO_INTRINSICS_ +#elif defined(_XM_SSE_INTRINSICS_) + inline operator __m128i() const noexcept { return _mm_castps_si128(v); } + inline operator __m128d() const noexcept { return _mm_castps_pd(v); } +#elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(__GNUC__) || defined(_ARM64_DISTINCT_NEON_TYPES)) + inline operator int32x4_t() const noexcept { return vreinterpretq_s32_f32(v); } + inline operator uint32x4_t() const noexcept { return vreinterpretq_u32_f32(v); } +#endif + }; + + XM_ALIGNED_STRUCT(16) XMVECTORU8 + { + union + { + uint8_t u[16]; + XMVECTOR v; + }; + + inline operator XMVECTOR() const noexcept { return v; } +#ifdef _XM_NO_INTRINSICS_ +#elif defined(_XM_SSE_INTRINSICS_) + inline operator __m128i() const noexcept { return _mm_castps_si128(v); } + inline operator __m128d() const noexcept { return _mm_castps_pd(v); } +#elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(__GNUC__) || defined(_ARM64_DISTINCT_NEON_TYPES)) + inline operator int32x4_t() const noexcept { return vreinterpretq_s32_f32(v); } 
+ inline operator uint32x4_t() const noexcept { return vreinterpretq_u32_f32(v); } +#endif + }; + + XM_ALIGNED_STRUCT(16) XMVECTORU32 + { + union + { + uint32_t u[4]; + XMVECTOR v; + }; + + inline operator XMVECTOR() const noexcept { return v; } +#ifdef _XM_NO_INTRINSICS_ +#elif defined(_XM_SSE_INTRINSICS_) + inline operator __m128i() const noexcept { return _mm_castps_si128(v); } + inline operator __m128d() const noexcept { return _mm_castps_pd(v); } +#elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(__GNUC__) || defined(_ARM64_DISTINCT_NEON_TYPES)) + inline operator int32x4_t() const noexcept { return vreinterpretq_s32_f32(v); } + inline operator uint32x4_t() const noexcept { return vreinterpretq_u32_f32(v); } +#endif + }; + + //------------------------------------------------------------------------------ + // Vector operators + +#ifndef _XM_NO_XMVECTOR_OVERLOADS_ + XMVECTOR XM_CALLCONV operator+ (FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV operator- (FXMVECTOR V) noexcept; + + XMVECTOR& XM_CALLCONV operator+= (XMVECTOR& V1, FXMVECTOR V2) noexcept; + XMVECTOR& XM_CALLCONV operator-= (XMVECTOR& V1, FXMVECTOR V2) noexcept; + XMVECTOR& XM_CALLCONV operator*= (XMVECTOR& V1, FXMVECTOR V2) noexcept; + XMVECTOR& XM_CALLCONV operator/= (XMVECTOR& V1, FXMVECTOR V2) noexcept; + + XMVECTOR& operator*= (XMVECTOR& V, float S) noexcept; + XMVECTOR& operator/= (XMVECTOR& V, float S) noexcept; + + XMVECTOR XM_CALLCONV operator+ (FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV operator- (FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV operator* (FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV operator/ (FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV operator* (FXMVECTOR V, float S) noexcept; + XMVECTOR XM_CALLCONV operator* (float S, FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV operator/ (FXMVECTOR V, float S) noexcept; +#endif /* !_XM_NO_XMVECTOR_OVERLOADS_ */ + + //------------------------------------------------------------------------------ + // Matrix type: Sixteen 32 bit floating point components aligned on a + // 16 byte boundary and mapped to four hardware vector registers + + struct XMMATRIX; + + // Fix-up for (1st) XMMATRIX parameter to pass in-register for ARM64 and vector call; by reference otherwise +#if ( defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || _XM_VECTORCALL_ || __aarch64__ ) && !defined(_XM_NO_INTRINSICS_) + typedef const XMMATRIX FXMMATRIX; +#else + typedef const XMMATRIX& FXMMATRIX; +#endif + + // Fix-up for (2nd+) XMMATRIX parameters to pass by reference + typedef const XMMATRIX& CXMMATRIX; + +#ifdef _XM_NO_INTRINSICS_ + struct XMMATRIX +#else + XM_ALIGNED_STRUCT(16) XMMATRIX +#endif + { +#ifdef _XM_NO_INTRINSICS_ + union + { + XMVECTOR r[4]; + struct + { + float _11, _12, _13, _14; + float _21, _22, _23, _24; + float _31, _32, _33, _34; + float _41, _42, _43, _44; + }; + float m[4][4]; + }; +#else + XMVECTOR r[4]; +#endif + + XMMATRIX() = default; + + XMMATRIX(const XMMATRIX&) = default; + +#if defined(_MSC_VER) && (_MSC_FULL_VER < 191426431) + XMMATRIX& operator= (const XMMATRIX& M) noexcept { r[0] = M.r[0]; r[1] = M.r[1]; r[2] = M.r[2]; r[3] = M.r[3]; return *this; } +#else + XMMATRIX& operator=(const XMMATRIX&) = default; + + XMMATRIX(XMMATRIX&&) = default; + XMMATRIX& operator=(XMMATRIX&&) = default; +#endif + + constexpr XMMATRIX(FXMVECTOR R0, FXMVECTOR R1, FXMVECTOR R2, CXMVECTOR R3) noexcept : r{ R0,R1,R2,R3 } {} + XMMATRIX(float m00, float m01, float m02, float m03, + float m10, 
float m11, float m12, float m13, + float m20, float m21, float m22, float m23, + float m30, float m31, float m32, float m33) noexcept; + explicit XMMATRIX(_In_reads_(16) const float* pArray) noexcept; + +#ifdef _XM_NO_INTRINSICS_ + float operator() (size_t Row, size_t Column) const noexcept { return m[Row][Column]; } + float& operator() (size_t Row, size_t Column) noexcept { return m[Row][Column]; } +#endif + + XMMATRIX operator+ () const noexcept { return *this; } + XMMATRIX operator- () const noexcept; + + XMMATRIX& XM_CALLCONV operator+= (FXMMATRIX M) noexcept; + XMMATRIX& XM_CALLCONV operator-= (FXMMATRIX M) noexcept; + XMMATRIX& XM_CALLCONV operator*= (FXMMATRIX M) noexcept; + XMMATRIX& operator*= (float S) noexcept; + XMMATRIX& operator/= (float S) noexcept; + + XMMATRIX XM_CALLCONV operator+ (FXMMATRIX M) const noexcept; + XMMATRIX XM_CALLCONV operator- (FXMMATRIX M) const noexcept; + XMMATRIX XM_CALLCONV operator* (FXMMATRIX M) const noexcept; + XMMATRIX operator* (float S) const noexcept; + XMMATRIX operator/ (float S) const noexcept; + + friend XMMATRIX XM_CALLCONV operator* (float S, FXMMATRIX M) noexcept; + }; + + //------------------------------------------------------------------------------ + // 2D Vector; 32 bit floating point components + struct XMFLOAT2 + { + float x; + float y; + + XMFLOAT2() = default; + + XMFLOAT2(const XMFLOAT2&) = default; + XMFLOAT2& operator=(const XMFLOAT2&) = default; + + XMFLOAT2(XMFLOAT2&&) = default; + XMFLOAT2& operator=(XMFLOAT2&&) = default; + + constexpr XMFLOAT2(float _x, float _y) noexcept : x(_x), y(_y) {} + explicit XMFLOAT2(_In_reads_(2) const float* pArray) noexcept : x(pArray[0]), y(pArray[1]) {} + +#if (__cplusplus >= 202002L) + bool operator == (const XMFLOAT2&) const = default; + auto operator <=> (const XMFLOAT2&) const = default; +#endif + }; + + // 2D Vector; 32 bit floating point components aligned on a 16 byte boundary + XM_ALIGNED_STRUCT(16) XMFLOAT2A : public XMFLOAT2 + { + using XMFLOAT2::XMFLOAT2; + }; + + //------------------------------------------------------------------------------ + // 2D Vector; 32 bit signed integer components + struct XMINT2 + { + int32_t x; + int32_t y; + + XMINT2() = default; + + XMINT2(const XMINT2&) = default; + XMINT2& operator=(const XMINT2&) = default; + + XMINT2(XMINT2&&) = default; + XMINT2& operator=(XMINT2&&) = default; + + constexpr XMINT2(int32_t _x, int32_t _y) noexcept : x(_x), y(_y) {} + explicit XMINT2(_In_reads_(2) const int32_t* pArray) noexcept : x(pArray[0]), y(pArray[1]) {} + +#if (__cplusplus >= 202002L) + bool operator == (const XMINT2&) const = default; + auto operator <=> (const XMINT2&) const = default; +#endif + }; + + // 2D Vector; 32 bit unsigned integer components + struct XMUINT2 + { + uint32_t x; + uint32_t y; + + XMUINT2() = default; + + XMUINT2(const XMUINT2&) = default; + XMUINT2& operator=(const XMUINT2&) = default; + + XMUINT2(XMUINT2&&) = default; + XMUINT2& operator=(XMUINT2&&) = default; + + constexpr XMUINT2(uint32_t _x, uint32_t _y) noexcept : x(_x), y(_y) {} + explicit XMUINT2(_In_reads_(2) const uint32_t* pArray) noexcept : x(pArray[0]), y(pArray[1]) {} + +#if (__cplusplus >= 202002L) + bool operator == (const XMUINT2&) const = default; + auto operator <=> (const XMUINT2&) const = default; +#endif + }; + + //------------------------------------------------------------------------------ + // 3D Vector; 32 bit floating point components + struct XMFLOAT3 + { + float x; + float y; + float z; + + XMFLOAT3() = default; + + XMFLOAT3(const XMFLOAT3&) = 
default; + XMFLOAT3& operator=(const XMFLOAT3&) = default; + + XMFLOAT3(XMFLOAT3&&) = default; + XMFLOAT3& operator=(XMFLOAT3&&) = default; + + constexpr XMFLOAT3(float _x, float _y, float _z) noexcept : x(_x), y(_y), z(_z) {} + explicit XMFLOAT3(_In_reads_(3) const float* pArray) noexcept : x(pArray[0]), y(pArray[1]), z(pArray[2]) {} + }; + + // 3D Vector; 32 bit floating point components aligned on a 16 byte boundary + XM_ALIGNED_STRUCT(16) XMFLOAT3A : public XMFLOAT3 + { + using XMFLOAT3::XMFLOAT3; + }; + + //------------------------------------------------------------------------------ + // 3D Vector; 32 bit signed integer components + struct XMINT3 + { + int32_t x; + int32_t y; + int32_t z; + + XMINT3() = default; + + XMINT3(const XMINT3&) = default; + XMINT3& operator=(const XMINT3&) = default; + + XMINT3(XMINT3&&) = default; + XMINT3& operator=(XMINT3&&) = default; + + constexpr XMINT3(int32_t _x, int32_t _y, int32_t _z) noexcept : x(_x), y(_y), z(_z) {} + explicit XMINT3(_In_reads_(3) const int32_t* pArray) noexcept : x(pArray[0]), y(pArray[1]), z(pArray[2]) {} + +#if (__cplusplus >= 202002L) + bool operator == (const XMINT3&) const = default; + auto operator <=> (const XMINT3&) const = default; +#endif + }; + + // 3D Vector; 32 bit unsigned integer components + struct XMUINT3 + { + uint32_t x; + uint32_t y; + uint32_t z; + + XMUINT3() = default; + + XMUINT3(const XMUINT3&) = default; + XMUINT3& operator=(const XMUINT3&) = default; + + XMUINT3(XMUINT3&&) = default; + XMUINT3& operator=(XMUINT3&&) = default; + + constexpr XMUINT3(uint32_t _x, uint32_t _y, uint32_t _z) noexcept : x(_x), y(_y), z(_z) {} + explicit XMUINT3(_In_reads_(3) const uint32_t* pArray) noexcept : x(pArray[0]), y(pArray[1]), z(pArray[2]) {} + +#if (__cplusplus >= 202002L) + bool operator == (const XMUINT3&) const = default; + auto operator <=> (const XMUINT3&) const = default; +#endif + }; + + //------------------------------------------------------------------------------ + // 4D Vector; 32 bit floating point components + struct XMFLOAT4 + { + float x; + float y; + float z; + float w; + + XMFLOAT4() = default; + + XMFLOAT4(const XMFLOAT4&) = default; + XMFLOAT4& operator=(const XMFLOAT4&) = default; + + XMFLOAT4(XMFLOAT4&&) = default; + XMFLOAT4& operator=(XMFLOAT4&&) = default; + + constexpr XMFLOAT4(float _x, float _y, float _z, float _w) noexcept : x(_x), y(_y), z(_z), w(_w) {} + explicit XMFLOAT4(_In_reads_(4) const float* pArray) noexcept : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + +#if (__cplusplus >= 202002L) + bool operator == (const XMFLOAT4&) const = default; + auto operator <=> (const XMFLOAT4&) const = default; +#endif + }; + + // 4D Vector; 32 bit floating point components aligned on a 16 byte boundary + XM_ALIGNED_STRUCT(16) XMFLOAT4A : public XMFLOAT4 + { + using XMFLOAT4::XMFLOAT4; + }; + + //------------------------------------------------------------------------------ + // 4D Vector; 32 bit signed integer components + struct XMINT4 + { + int32_t x; + int32_t y; + int32_t z; + int32_t w; + + XMINT4() = default; + + XMINT4(const XMINT4&) = default; + XMINT4& operator=(const XMINT4&) = default; + + XMINT4(XMINT4&&) = default; + XMINT4& operator=(XMINT4&&) = default; + + constexpr XMINT4(int32_t _x, int32_t _y, int32_t _z, int32_t _w) noexcept : x(_x), y(_y), z(_z), w(_w) {} + explicit XMINT4(_In_reads_(4) const int32_t* pArray) noexcept : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + +#if (__cplusplus >= 202002L) + bool operator == (const XMINT4&) const = default; 
+ auto operator <=> (const XMINT4&) const = default; +#endif + }; + + // 4D Vector; 32 bit unsigned integer components + struct XMUINT4 + { + uint32_t x; + uint32_t y; + uint32_t z; + uint32_t w; + + XMUINT4() = default; + + XMUINT4(const XMUINT4&) = default; + XMUINT4& operator=(const XMUINT4&) = default; + + XMUINT4(XMUINT4&&) = default; + XMUINT4& operator=(XMUINT4&&) = default; + + constexpr XMUINT4(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w) noexcept : x(_x), y(_y), z(_z), w(_w) {} + explicit XMUINT4(_In_reads_(4) const uint32_t* pArray) noexcept : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + +#if (__cplusplus >= 202002L) + bool operator == (const XMUINT4&) const = default; + auto operator <=> (const XMUINT4&) const = default; +#endif + }; + +#ifdef __clang__ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wgnu-anonymous-struct" +#pragma clang diagnostic ignored "-Wnested-anon-types" +#endif + + //------------------------------------------------------------------------------ + // 3x3 Matrix: 32 bit floating point components + struct XMFLOAT3X3 + { + union + { + struct + { + float _11, _12, _13; + float _21, _22, _23; + float _31, _32, _33; + }; + float m[3][3]; + }; + + XMFLOAT3X3() = default; + + XMFLOAT3X3(const XMFLOAT3X3&) = default; + XMFLOAT3X3& operator=(const XMFLOAT3X3&) = default; + + XMFLOAT3X3(XMFLOAT3X3&&) = default; + XMFLOAT3X3& operator=(XMFLOAT3X3&&) = default; + + constexpr XMFLOAT3X3(float m00, float m01, float m02, + float m10, float m11, float m12, + float m20, float m21, float m22) noexcept + : _11(m00), _12(m01), _13(m02), + _21(m10), _22(m11), _23(m12), + _31(m20), _32(m21), _33(m22) {} + explicit XMFLOAT3X3(_In_reads_(9) const float* pArray) noexcept; + + float operator() (size_t Row, size_t Column) const noexcept { return m[Row][Column]; } + float& operator() (size_t Row, size_t Column) noexcept { return m[Row][Column]; } + +#if (__cplusplus >= 202002L) + bool operator == (const XMFLOAT3X3&) const = default; + auto operator <=> (const XMFLOAT3X3&) const = default; +#endif + }; + + //------------------------------------------------------------------------------ + // 4x3 Row-major Matrix: 32 bit floating point components + struct XMFLOAT4X3 + { + union + { + struct + { + float _11, _12, _13; + float _21, _22, _23; + float _31, _32, _33; + float _41, _42, _43; + }; + float m[4][3]; + float f[12]; + }; + + XMFLOAT4X3() = default; + + XMFLOAT4X3(const XMFLOAT4X3&) = default; + XMFLOAT4X3& operator=(const XMFLOAT4X3&) = default; + + XMFLOAT4X3(XMFLOAT4X3&&) = default; + XMFLOAT4X3& operator=(XMFLOAT4X3&&) = default; + + constexpr XMFLOAT4X3(float m00, float m01, float m02, + float m10, float m11, float m12, + float m20, float m21, float m22, + float m30, float m31, float m32) noexcept + : _11(m00), _12(m01), _13(m02), + _21(m10), _22(m11), _23(m12), + _31(m20), _32(m21), _33(m22), + _41(m30), _42(m31), _43(m32) {} + explicit XMFLOAT4X3(_In_reads_(12) const float* pArray) noexcept; + + float operator() (size_t Row, size_t Column) const noexcept { return m[Row][Column]; } + float& operator() (size_t Row, size_t Column) noexcept { return m[Row][Column]; } + +#if (__cplusplus >= 202002L) + bool operator == (const XMFLOAT4X3&) const = default; + auto operator <=> (const XMFLOAT4X3&) const = default; +#endif + }; + + // 4x3 Row-major Matrix: 32 bit floating point components aligned on a 16 byte boundary + XM_ALIGNED_STRUCT(16) XMFLOAT4X3A : public XMFLOAT4X3 + { + using XMFLOAT4X3::XMFLOAT4X3; + }; + + 
//------------------------------------------------------------------------------ + // 3x4 Column-major Matrix: 32 bit floating point components + struct XMFLOAT3X4 + { + union + { + struct + { + float _11, _12, _13, _14; + float _21, _22, _23, _24; + float _31, _32, _33, _34; + }; + float m[3][4]; + float f[12]; + }; + + XMFLOAT3X4() = default; + + XMFLOAT3X4(const XMFLOAT3X4&) = default; + XMFLOAT3X4& operator=(const XMFLOAT3X4&) = default; + + XMFLOAT3X4(XMFLOAT3X4&&) = default; + XMFLOAT3X4& operator=(XMFLOAT3X4&&) = default; + + constexpr XMFLOAT3X4(float m00, float m01, float m02, float m03, + float m10, float m11, float m12, float m13, + float m20, float m21, float m22, float m23) noexcept + : _11(m00), _12(m01), _13(m02), _14(m03), + _21(m10), _22(m11), _23(m12), _24(m13), + _31(m20), _32(m21), _33(m22), _34(m23) {} + explicit XMFLOAT3X4(_In_reads_(12) const float* pArray) noexcept; + + float operator() (size_t Row, size_t Column) const noexcept { return m[Row][Column]; } + float& operator() (size_t Row, size_t Column) noexcept { return m[Row][Column]; } + +#if (__cplusplus >= 202002L) + bool operator == (const XMFLOAT3X4&) const = default; + auto operator <=> (const XMFLOAT3X4&) const = default; +#endif + }; + + // 3x4 Column-major Matrix: 32 bit floating point components aligned on a 16 byte boundary + XM_ALIGNED_STRUCT(16) XMFLOAT3X4A : public XMFLOAT3X4 + { + using XMFLOAT3X4::XMFLOAT3X4; + }; + + //------------------------------------------------------------------------------ + // 4x4 Matrix: 32 bit floating point components + struct XMFLOAT4X4 + { + union + { + struct + { + float _11, _12, _13, _14; + float _21, _22, _23, _24; + float _31, _32, _33, _34; + float _41, _42, _43, _44; + }; + float m[4][4]; + }; + + XMFLOAT4X4() = default; + + XMFLOAT4X4(const XMFLOAT4X4&) = default; + XMFLOAT4X4& operator=(const XMFLOAT4X4&) = default; + + XMFLOAT4X4(XMFLOAT4X4&&) = default; + XMFLOAT4X4& operator=(XMFLOAT4X4&&) = default; + + constexpr XMFLOAT4X4(float m00, float m01, float m02, float m03, + float m10, float m11, float m12, float m13, + float m20, float m21, float m22, float m23, + float m30, float m31, float m32, float m33) noexcept + : _11(m00), _12(m01), _13(m02), _14(m03), + _21(m10), _22(m11), _23(m12), _24(m13), + _31(m20), _32(m21), _33(m22), _34(m23), + _41(m30), _42(m31), _43(m32), _44(m33) {} + explicit XMFLOAT4X4(_In_reads_(16) const float* pArray) noexcept; + + float operator() (size_t Row, size_t Column) const noexcept { return m[Row][Column]; } + float& operator() (size_t Row, size_t Column) noexcept { return m[Row][Column]; } + +#if (__cplusplus >= 202002L) + bool operator == (const XMFLOAT4X4&) const = default; + auto operator <=> (const XMFLOAT4X4&) const = default; +#endif + }; + + // 4x4 Matrix: 32 bit floating point components aligned on a 16 byte boundary + XM_ALIGNED_STRUCT(16) XMFLOAT4X4A : public XMFLOAT4X4 + { + using XMFLOAT4X4::XMFLOAT4X4; + }; + + //////////////////////////////////////////////////////////////////////////////// + +#ifdef __clang__ +#pragma clang diagnostic pop +#endif +#ifdef _PREFAST_ +#pragma prefast(pop) +#endif +#ifdef _MSC_VER +#pragma warning(pop) +#endif + +/**************************************************************************** + * + * Data conversion operations + * + ****************************************************************************/ + + XMVECTOR XM_CALLCONV XMConvertVectorIntToFloat(FXMVECTOR VInt, uint32_t DivExponent) noexcept; + XMVECTOR XM_CALLCONV XMConvertVectorFloatToInt(FXMVECTOR VFloat, uint32_t 
MulExponent) noexcept; + XMVECTOR XM_CALLCONV XMConvertVectorUIntToFloat(FXMVECTOR VUInt, uint32_t DivExponent) noexcept; + XMVECTOR XM_CALLCONV XMConvertVectorFloatToUInt(FXMVECTOR VFloat, uint32_t MulExponent) noexcept; + +#if defined(__XNAMATH_H__) && defined(XMVectorSetBinaryConstant) +#undef XMVectorSetBinaryConstant +#undef XMVectorSplatConstant +#undef XMVectorSplatConstantInt +#endif + + XMVECTOR XM_CALLCONV XMVectorSetBinaryConstant(uint32_t C0, uint32_t C1, uint32_t C2, uint32_t C3) noexcept; + XMVECTOR XM_CALLCONV XMVectorSplatConstant(int32_t IntConstant, uint32_t DivExponent) noexcept; + XMVECTOR XM_CALLCONV XMVectorSplatConstantInt(int32_t IntConstant) noexcept; + + /**************************************************************************** + * + * Load operations + * + ****************************************************************************/ + + XMVECTOR XM_CALLCONV XMLoadInt(_In_ const uint32_t* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadFloat(_In_ const float* pSource) noexcept; + + XMVECTOR XM_CALLCONV XMLoadInt2(_In_reads_(2) const uint32_t* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadInt2A(_In_reads_(2) const uint32_t* PSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadFloat2(_In_ const XMFLOAT2* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadFloat2A(_In_ const XMFLOAT2A* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadSInt2(_In_ const XMINT2* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadUInt2(_In_ const XMUINT2* pSource) noexcept; + + XMVECTOR XM_CALLCONV XMLoadInt3(_In_reads_(3) const uint32_t* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadInt3A(_In_reads_(3) const uint32_t* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadFloat3(_In_ const XMFLOAT3* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadFloat3A(_In_ const XMFLOAT3A* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadSInt3(_In_ const XMINT3* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadUInt3(_In_ const XMUINT3* pSource) noexcept; + + XMVECTOR XM_CALLCONV XMLoadInt4(_In_reads_(4) const uint32_t* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadInt4A(_In_reads_(4) const uint32_t* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadFloat4(_In_ const XMFLOAT4* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadFloat4A(_In_ const XMFLOAT4A* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadSInt4(_In_ const XMINT4* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadUInt4(_In_ const XMUINT4* pSource) noexcept; + + XMMATRIX XM_CALLCONV XMLoadFloat3x3(_In_ const XMFLOAT3X3* pSource) noexcept; + XMMATRIX XM_CALLCONV XMLoadFloat4x3(_In_ const XMFLOAT4X3* pSource) noexcept; + XMMATRIX XM_CALLCONV XMLoadFloat4x3A(_In_ const XMFLOAT4X3A* pSource) noexcept; + XMMATRIX XM_CALLCONV XMLoadFloat3x4(_In_ const XMFLOAT3X4* pSource) noexcept; + XMMATRIX XM_CALLCONV XMLoadFloat3x4A(_In_ const XMFLOAT3X4A* pSource) noexcept; + XMMATRIX XM_CALLCONV XMLoadFloat4x4(_In_ const XMFLOAT4X4* pSource) noexcept; + XMMATRIX XM_CALLCONV XMLoadFloat4x4A(_In_ const XMFLOAT4X4A* pSource) noexcept; + + /**************************************************************************** + * + * Store operations + * + ****************************************************************************/ + + void XM_CALLCONV XMStoreInt(_Out_ uint32_t* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreFloat(_Out_ float* pDestination, _In_ FXMVECTOR V) noexcept; + + void XM_CALLCONV XMStoreInt2(_Out_writes_(2) uint32_t* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreInt2A(_Out_writes_(2) uint32_t* pDestination, _In_ 
FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreFloat2(_Out_ XMFLOAT2* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreFloat2A(_Out_ XMFLOAT2A* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreSInt2(_Out_ XMINT2* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreUInt2(_Out_ XMUINT2* pDestination, _In_ FXMVECTOR V) noexcept; + + void XM_CALLCONV XMStoreInt3(_Out_writes_(3) uint32_t* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreInt3A(_Out_writes_(3) uint32_t* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreFloat3(_Out_ XMFLOAT3* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreFloat3A(_Out_ XMFLOAT3A* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreSInt3(_Out_ XMINT3* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreUInt3(_Out_ XMUINT3* pDestination, _In_ FXMVECTOR V) noexcept; + + void XM_CALLCONV XMStoreInt4(_Out_writes_(4) uint32_t* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreInt4A(_Out_writes_(4) uint32_t* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreFloat4(_Out_ XMFLOAT4* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreFloat4A(_Out_ XMFLOAT4A* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreSInt4(_Out_ XMINT4* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreUInt4(_Out_ XMUINT4* pDestination, _In_ FXMVECTOR V) noexcept; + + void XM_CALLCONV XMStoreFloat3x3(_Out_ XMFLOAT3X3* pDestination, _In_ FXMMATRIX M) noexcept; + void XM_CALLCONV XMStoreFloat4x3(_Out_ XMFLOAT4X3* pDestination, _In_ FXMMATRIX M) noexcept; + void XM_CALLCONV XMStoreFloat4x3A(_Out_ XMFLOAT4X3A* pDestination, _In_ FXMMATRIX M) noexcept; + void XM_CALLCONV XMStoreFloat3x4(_Out_ XMFLOAT3X4* pDestination, _In_ FXMMATRIX M) noexcept; + void XM_CALLCONV XMStoreFloat3x4A(_Out_ XMFLOAT3X4A* pDestination, _In_ FXMMATRIX M) noexcept; + void XM_CALLCONV XMStoreFloat4x4(_Out_ XMFLOAT4X4* pDestination, _In_ FXMMATRIX M) noexcept; + void XM_CALLCONV XMStoreFloat4x4A(_Out_ XMFLOAT4X4A* pDestination, _In_ FXMMATRIX M) noexcept; + + /**************************************************************************** + * + * General vector operations + * + ****************************************************************************/ + + XMVECTOR XM_CALLCONV XMVectorZero() noexcept; + XMVECTOR XM_CALLCONV XMVectorSet(float x, float y, float z, float w) noexcept; + XMVECTOR XM_CALLCONV XMVectorSetInt(uint32_t x, uint32_t y, uint32_t z, uint32_t w) noexcept; + XMVECTOR XM_CALLCONV XMVectorReplicate(float Value) noexcept; + XMVECTOR XM_CALLCONV XMVectorReplicatePtr(_In_ const float* pValue) noexcept; + XMVECTOR XM_CALLCONV XMVectorReplicateInt(uint32_t Value) noexcept; + XMVECTOR XM_CALLCONV XMVectorReplicateIntPtr(_In_ const uint32_t* pValue) noexcept; + XMVECTOR XM_CALLCONV XMVectorTrueInt() noexcept; + XMVECTOR XM_CALLCONV XMVectorFalseInt() noexcept; + XMVECTOR XM_CALLCONV XMVectorSplatX(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorSplatY(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorSplatZ(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorSplatW(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorSplatOne() noexcept; + XMVECTOR XM_CALLCONV XMVectorSplatInfinity() noexcept; + XMVECTOR XM_CALLCONV XMVectorSplatQNaN() noexcept; + XMVECTOR XM_CALLCONV XMVectorSplatEpsilon() noexcept; + XMVECTOR XM_CALLCONV XMVectorSplatSignMask() noexcept; + + float XM_CALLCONV 
XMVectorGetByIndex(FXMVECTOR V, size_t i) noexcept; + float XM_CALLCONV XMVectorGetX(FXMVECTOR V) noexcept; + float XM_CALLCONV XMVectorGetY(FXMVECTOR V) noexcept; + float XM_CALLCONV XMVectorGetZ(FXMVECTOR V) noexcept; + float XM_CALLCONV XMVectorGetW(FXMVECTOR V) noexcept; + + void XM_CALLCONV XMVectorGetByIndexPtr(_Out_ float* f, _In_ FXMVECTOR V, _In_ size_t i) noexcept; + void XM_CALLCONV XMVectorGetXPtr(_Out_ float* x, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMVectorGetYPtr(_Out_ float* y, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMVectorGetZPtr(_Out_ float* z, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMVectorGetWPtr(_Out_ float* w, _In_ FXMVECTOR V) noexcept; + + uint32_t XM_CALLCONV XMVectorGetIntByIndex(FXMVECTOR V, size_t i) noexcept; + uint32_t XM_CALLCONV XMVectorGetIntX(FXMVECTOR V) noexcept; + uint32_t XM_CALLCONV XMVectorGetIntY(FXMVECTOR V) noexcept; + uint32_t XM_CALLCONV XMVectorGetIntZ(FXMVECTOR V) noexcept; + uint32_t XM_CALLCONV XMVectorGetIntW(FXMVECTOR V) noexcept; + + void XM_CALLCONV XMVectorGetIntByIndexPtr(_Out_ uint32_t* x, _In_ FXMVECTOR V, _In_ size_t i) noexcept; + void XM_CALLCONV XMVectorGetIntXPtr(_Out_ uint32_t* x, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMVectorGetIntYPtr(_Out_ uint32_t* y, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMVectorGetIntZPtr(_Out_ uint32_t* z, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMVectorGetIntWPtr(_Out_ uint32_t* w, _In_ FXMVECTOR V) noexcept; + + XMVECTOR XM_CALLCONV XMVectorSetByIndex(FXMVECTOR V, float f, size_t i) noexcept; + XMVECTOR XM_CALLCONV XMVectorSetX(FXMVECTOR V, float x) noexcept; + XMVECTOR XM_CALLCONV XMVectorSetY(FXMVECTOR V, float y) noexcept; + XMVECTOR XM_CALLCONV XMVectorSetZ(FXMVECTOR V, float z) noexcept; + XMVECTOR XM_CALLCONV XMVectorSetW(FXMVECTOR V, float w) noexcept; + + XMVECTOR XM_CALLCONV XMVectorSetByIndexPtr(_In_ FXMVECTOR V, _In_ const float* f, _In_ size_t i) noexcept; + XMVECTOR XM_CALLCONV XMVectorSetXPtr(_In_ FXMVECTOR V, _In_ const float* x) noexcept; + XMVECTOR XM_CALLCONV XMVectorSetYPtr(_In_ FXMVECTOR V, _In_ const float* y) noexcept; + XMVECTOR XM_CALLCONV XMVectorSetZPtr(_In_ FXMVECTOR V, _In_ const float* z) noexcept; + XMVECTOR XM_CALLCONV XMVectorSetWPtr(_In_ FXMVECTOR V, _In_ const float* w) noexcept; + + XMVECTOR XM_CALLCONV XMVectorSetIntByIndex(FXMVECTOR V, uint32_t x, size_t i) noexcept; + XMVECTOR XM_CALLCONV XMVectorSetIntX(FXMVECTOR V, uint32_t x) noexcept; + XMVECTOR XM_CALLCONV XMVectorSetIntY(FXMVECTOR V, uint32_t y) noexcept; + XMVECTOR XM_CALLCONV XMVectorSetIntZ(FXMVECTOR V, uint32_t z) noexcept; + XMVECTOR XM_CALLCONV XMVectorSetIntW(FXMVECTOR V, uint32_t w) noexcept; + + XMVECTOR XM_CALLCONV XMVectorSetIntByIndexPtr(_In_ FXMVECTOR V, _In_ const uint32_t* x, _In_ size_t i) noexcept; + XMVECTOR XM_CALLCONV XMVectorSetIntXPtr(_In_ FXMVECTOR V, _In_ const uint32_t* x) noexcept; + XMVECTOR XM_CALLCONV XMVectorSetIntYPtr(_In_ FXMVECTOR V, _In_ const uint32_t* y) noexcept; + XMVECTOR XM_CALLCONV XMVectorSetIntZPtr(_In_ FXMVECTOR V, _In_ const uint32_t* z) noexcept; + XMVECTOR XM_CALLCONV XMVectorSetIntWPtr(_In_ FXMVECTOR V, _In_ const uint32_t* w) noexcept; + +#if defined(__XNAMATH_H__) && defined(XMVectorSwizzle) +#undef XMVectorSwizzle +#endif + + XMVECTOR XM_CALLCONV XMVectorSwizzle(FXMVECTOR V, uint32_t E0, uint32_t E1, uint32_t E2, uint32_t E3) noexcept; + XMVECTOR XM_CALLCONV XMVectorPermute(FXMVECTOR V1, FXMVECTOR V2, uint32_t PermuteX, uint32_t PermuteY, uint32_t PermuteZ, uint32_t PermuteW) noexcept; + XMVECTOR XM_CALLCONV 
XMVectorSelectControl(uint32_t VectorIndex0, uint32_t VectorIndex1, uint32_t VectorIndex2, uint32_t VectorIndex3) noexcept; + XMVECTOR XM_CALLCONV XMVectorSelect(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR Control) noexcept; + XMVECTOR XM_CALLCONV XMVectorMergeXY(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorMergeZW(FXMVECTOR V1, FXMVECTOR V2) noexcept; + +#if defined(__XNAMATH_H__) && defined(XMVectorShiftLeft) +#undef XMVectorShiftLeft +#undef XMVectorRotateLeft +#undef XMVectorRotateRight +#undef XMVectorInsert +#endif + + XMVECTOR XM_CALLCONV XMVectorShiftLeft(FXMVECTOR V1, FXMVECTOR V2, uint32_t Elements) noexcept; + XMVECTOR XM_CALLCONV XMVectorRotateLeft(FXMVECTOR V, uint32_t Elements) noexcept; + XMVECTOR XM_CALLCONV XMVectorRotateRight(FXMVECTOR V, uint32_t Elements) noexcept; + XMVECTOR XM_CALLCONV XMVectorInsert(FXMVECTOR VD, FXMVECTOR VS, uint32_t VSLeftRotateElements, + uint32_t Select0, uint32_t Select1, uint32_t Select2, uint32_t Select3) noexcept; + + XMVECTOR XM_CALLCONV XMVectorEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorEqualR(_Out_ uint32_t* pCR, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorEqualInt(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorEqualIntR(_Out_ uint32_t* pCR, _In_ FXMVECTOR V, _In_ FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorNearEqual(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR Epsilon) noexcept; + XMVECTOR XM_CALLCONV XMVectorNotEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorNotEqualInt(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorGreater(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorGreaterR(_Out_ uint32_t* pCR, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorGreaterOrEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorGreaterOrEqualR(_Out_ uint32_t* pCR, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorLess(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorLessOrEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorInBounds(FXMVECTOR V, FXMVECTOR Bounds) noexcept; + XMVECTOR XM_CALLCONV XMVectorInBoundsR(_Out_ uint32_t* pCR, _In_ FXMVECTOR V, _In_ FXMVECTOR Bounds) noexcept; + + XMVECTOR XM_CALLCONV XMVectorIsNaN(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorIsInfinite(FXMVECTOR V) noexcept; + + XMVECTOR XM_CALLCONV XMVectorMin(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorMax(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorRound(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorTruncate(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorFloor(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorCeiling(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorClamp(FXMVECTOR V, FXMVECTOR Min, FXMVECTOR Max) noexcept; + XMVECTOR XM_CALLCONV XMVectorSaturate(FXMVECTOR V) noexcept; + + XMVECTOR XM_CALLCONV XMVectorAndInt(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorAndCInt(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorOrInt(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorNorInt(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorXorInt(FXMVECTOR V1, FXMVECTOR V2) noexcept; + + XMVECTOR XM_CALLCONV XMVectorNegate(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorAdd(FXMVECTOR V1, FXMVECTOR V2) noexcept; + 
XMVECTOR XM_CALLCONV XMVectorSum(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorAddAngles(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorSubtract(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorSubtractAngles(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorMultiply(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorMultiplyAdd(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR V3) noexcept; + XMVECTOR XM_CALLCONV XMVectorDivide(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorNegativeMultiplySubtract(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR V3) noexcept; + XMVECTOR XM_CALLCONV XMVectorScale(FXMVECTOR V, float ScaleFactor) noexcept; + XMVECTOR XM_CALLCONV XMVectorReciprocalEst(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorReciprocal(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorSqrtEst(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorSqrt(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorReciprocalSqrtEst(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorReciprocalSqrt(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorExp2(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorExp10(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorExpE(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorExp(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorLog2(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorLog10(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorLogE(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorLog(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorPow(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorAbs(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorMod(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorModAngles(FXMVECTOR Angles) noexcept; + XMVECTOR XM_CALLCONV XMVectorSin(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorSinEst(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorCos(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorCosEst(FXMVECTOR V) noexcept; + void XM_CALLCONV XMVectorSinCos(_Out_ XMVECTOR* pSin, _Out_ XMVECTOR* pCos, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMVectorSinCosEst(_Out_ XMVECTOR* pSin, _Out_ XMVECTOR* pCos, _In_ FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorTan(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorTanEst(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorSinH(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorCosH(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorTanH(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorASin(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorASinEst(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorACos(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorACosEst(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorATan(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorATanEst(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorATan2(FXMVECTOR Y, FXMVECTOR X) noexcept; + XMVECTOR XM_CALLCONV XMVectorATan2Est(FXMVECTOR Y, FXMVECTOR X) noexcept; + XMVECTOR XM_CALLCONV XMVectorLerp(FXMVECTOR V0, FXMVECTOR V1, float t) noexcept; + XMVECTOR XM_CALLCONV XMVectorLerpV(FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR T) noexcept; + XMVECTOR XM_CALLCONV XMVectorHermite(FXMVECTOR Position0, FXMVECTOR Tangent0, FXMVECTOR Position1, GXMVECTOR Tangent1, float t) noexcept; + XMVECTOR XM_CALLCONV XMVectorHermiteV(FXMVECTOR Position0, FXMVECTOR Tangent0, FXMVECTOR Position1, 
GXMVECTOR Tangent1, HXMVECTOR T) noexcept; + XMVECTOR XM_CALLCONV XMVectorCatmullRom(FXMVECTOR Position0, FXMVECTOR Position1, FXMVECTOR Position2, GXMVECTOR Position3, float t) noexcept; + XMVECTOR XM_CALLCONV XMVectorCatmullRomV(FXMVECTOR Position0, FXMVECTOR Position1, FXMVECTOR Position2, GXMVECTOR Position3, HXMVECTOR T) noexcept; + XMVECTOR XM_CALLCONV XMVectorBaryCentric(FXMVECTOR Position0, FXMVECTOR Position1, FXMVECTOR Position2, float f, float g) noexcept; + XMVECTOR XM_CALLCONV XMVectorBaryCentricV(FXMVECTOR Position0, FXMVECTOR Position1, FXMVECTOR Position2, GXMVECTOR F, HXMVECTOR G) noexcept; + + /**************************************************************************** + * + * 2D vector operations + * + ****************************************************************************/ + + bool XM_CALLCONV XMVector2Equal(FXMVECTOR V1, FXMVECTOR V2) noexcept; + uint32_t XM_CALLCONV XMVector2EqualR(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector2EqualInt(FXMVECTOR V1, FXMVECTOR V2) noexcept; + uint32_t XM_CALLCONV XMVector2EqualIntR(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector2NearEqual(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR Epsilon) noexcept; + bool XM_CALLCONV XMVector2NotEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector2NotEqualInt(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector2Greater(FXMVECTOR V1, FXMVECTOR V2) noexcept; + uint32_t XM_CALLCONV XMVector2GreaterR(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector2GreaterOrEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept; + uint32_t XM_CALLCONV XMVector2GreaterOrEqualR(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector2Less(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector2LessOrEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector2InBounds(FXMVECTOR V, FXMVECTOR Bounds) noexcept; + + bool XM_CALLCONV XMVector2IsNaN(FXMVECTOR V) noexcept; + bool XM_CALLCONV XMVector2IsInfinite(FXMVECTOR V) noexcept; + + XMVECTOR XM_CALLCONV XMVector2Dot(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVector2Cross(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVector2LengthSq(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector2ReciprocalLengthEst(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector2ReciprocalLength(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector2LengthEst(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector2Length(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector2NormalizeEst(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector2Normalize(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector2ClampLength(FXMVECTOR V, float LengthMin, float LengthMax) noexcept; + XMVECTOR XM_CALLCONV XMVector2ClampLengthV(FXMVECTOR V, FXMVECTOR LengthMin, FXMVECTOR LengthMax) noexcept; + XMVECTOR XM_CALLCONV XMVector2Reflect(FXMVECTOR Incident, FXMVECTOR Normal) noexcept; + XMVECTOR XM_CALLCONV XMVector2Refract(FXMVECTOR Incident, FXMVECTOR Normal, float RefractionIndex) noexcept; + XMVECTOR XM_CALLCONV XMVector2RefractV(FXMVECTOR Incident, FXMVECTOR Normal, FXMVECTOR RefractionIndex) noexcept; + XMVECTOR XM_CALLCONV XMVector2Orthogonal(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector2AngleBetweenNormalsEst(FXMVECTOR N1, FXMVECTOR N2) noexcept; + XMVECTOR XM_CALLCONV XMVector2AngleBetweenNormals(FXMVECTOR N1, FXMVECTOR N2) noexcept; + XMVECTOR XM_CALLCONV XMVector2AngleBetweenVectors(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV 
XMVector2LinePointDistance(FXMVECTOR LinePoint1, FXMVECTOR LinePoint2, FXMVECTOR Point) noexcept; + XMVECTOR XM_CALLCONV XMVector2IntersectLine(FXMVECTOR Line1Point1, FXMVECTOR Line1Point2, FXMVECTOR Line2Point1, GXMVECTOR Line2Point2) noexcept; + XMVECTOR XM_CALLCONV XMVector2Transform(FXMVECTOR V, FXMMATRIX M) noexcept; + XMFLOAT4* XM_CALLCONV XMVector2TransformStream(_Out_writes_bytes_(sizeof(XMFLOAT4) + OutputStride * (VectorCount - 1)) XMFLOAT4* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(XMFLOAT2) + InputStride * (VectorCount - 1)) const XMFLOAT2* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, _In_ FXMMATRIX M) noexcept; + XMVECTOR XM_CALLCONV XMVector2TransformCoord(FXMVECTOR V, FXMMATRIX M) noexcept; + XMFLOAT2* XM_CALLCONV XMVector2TransformCoordStream(_Out_writes_bytes_(sizeof(XMFLOAT2) + OutputStride * (VectorCount - 1)) XMFLOAT2* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(XMFLOAT2) + InputStride * (VectorCount - 1)) const XMFLOAT2* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, _In_ FXMMATRIX M) noexcept; + XMVECTOR XM_CALLCONV XMVector2TransformNormal(FXMVECTOR V, FXMMATRIX M) noexcept; + XMFLOAT2* XM_CALLCONV XMVector2TransformNormalStream(_Out_writes_bytes_(sizeof(XMFLOAT2) + OutputStride * (VectorCount - 1)) XMFLOAT2* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(XMFLOAT2) + InputStride * (VectorCount - 1)) const XMFLOAT2* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, _In_ FXMMATRIX M) noexcept; + + /**************************************************************************** + * + * 3D vector operations + * + ****************************************************************************/ + + bool XM_CALLCONV XMVector3Equal(FXMVECTOR V1, FXMVECTOR V2) noexcept; + uint32_t XM_CALLCONV XMVector3EqualR(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector3EqualInt(FXMVECTOR V1, FXMVECTOR V2) noexcept; + uint32_t XM_CALLCONV XMVector3EqualIntR(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector3NearEqual(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR Epsilon) noexcept; + bool XM_CALLCONV XMVector3NotEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector3NotEqualInt(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector3Greater(FXMVECTOR V1, FXMVECTOR V2) noexcept; + uint32_t XM_CALLCONV XMVector3GreaterR(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector3GreaterOrEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept; + uint32_t XM_CALLCONV XMVector3GreaterOrEqualR(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector3Less(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector3LessOrEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector3InBounds(FXMVECTOR V, FXMVECTOR Bounds) noexcept; + + bool XM_CALLCONV XMVector3IsNaN(FXMVECTOR V) noexcept; + bool XM_CALLCONV XMVector3IsInfinite(FXMVECTOR V) noexcept; + + XMVECTOR XM_CALLCONV XMVector3Dot(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVector3Cross(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVector3LengthSq(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector3ReciprocalLengthEst(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector3ReciprocalLength(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector3LengthEst(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector3Length(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector3NormalizeEst(FXMVECTOR V) noexcept; + XMVECTOR 
XM_CALLCONV XMVector3Normalize(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector3ClampLength(FXMVECTOR V, float LengthMin, float LengthMax) noexcept; + XMVECTOR XM_CALLCONV XMVector3ClampLengthV(FXMVECTOR V, FXMVECTOR LengthMin, FXMVECTOR LengthMax) noexcept; + XMVECTOR XM_CALLCONV XMVector3Reflect(FXMVECTOR Incident, FXMVECTOR Normal) noexcept; + XMVECTOR XM_CALLCONV XMVector3Refract(FXMVECTOR Incident, FXMVECTOR Normal, float RefractionIndex) noexcept; + XMVECTOR XM_CALLCONV XMVector3RefractV(FXMVECTOR Incident, FXMVECTOR Normal, FXMVECTOR RefractionIndex) noexcept; + XMVECTOR XM_CALLCONV XMVector3Orthogonal(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector3AngleBetweenNormalsEst(FXMVECTOR N1, FXMVECTOR N2) noexcept; + XMVECTOR XM_CALLCONV XMVector3AngleBetweenNormals(FXMVECTOR N1, FXMVECTOR N2) noexcept; + XMVECTOR XM_CALLCONV XMVector3AngleBetweenVectors(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVector3LinePointDistance(FXMVECTOR LinePoint1, FXMVECTOR LinePoint2, FXMVECTOR Point) noexcept; + void XM_CALLCONV XMVector3ComponentsFromNormal(_Out_ XMVECTOR* pParallel, _Out_ XMVECTOR* pPerpendicular, _In_ FXMVECTOR V, _In_ FXMVECTOR Normal) noexcept; + XMVECTOR XM_CALLCONV XMVector3Rotate(FXMVECTOR V, FXMVECTOR RotationQuaternion) noexcept; + XMVECTOR XM_CALLCONV XMVector3InverseRotate(FXMVECTOR V, FXMVECTOR RotationQuaternion) noexcept; + XMVECTOR XM_CALLCONV XMVector3Transform(FXMVECTOR V, FXMMATRIX M) noexcept; + XMFLOAT4* XM_CALLCONV XMVector3TransformStream(_Out_writes_bytes_(sizeof(XMFLOAT4) + OutputStride * (VectorCount - 1)) XMFLOAT4* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(XMFLOAT3) + InputStride * (VectorCount - 1)) const XMFLOAT3* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, _In_ FXMMATRIX M) noexcept; + XMVECTOR XM_CALLCONV XMVector3TransformCoord(FXMVECTOR V, FXMMATRIX M) noexcept; + XMFLOAT3* XM_CALLCONV XMVector3TransformCoordStream(_Out_writes_bytes_(sizeof(XMFLOAT3) + OutputStride * (VectorCount - 1)) XMFLOAT3* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(XMFLOAT3) + InputStride * (VectorCount - 1)) const XMFLOAT3* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, _In_ FXMMATRIX M) noexcept; + XMVECTOR XM_CALLCONV XMVector3TransformNormal(FXMVECTOR V, FXMMATRIX M) noexcept; + XMFLOAT3* XM_CALLCONV XMVector3TransformNormalStream(_Out_writes_bytes_(sizeof(XMFLOAT3) + OutputStride * (VectorCount - 1)) XMFLOAT3* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(XMFLOAT3) + InputStride * (VectorCount - 1)) const XMFLOAT3* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, _In_ FXMMATRIX M) noexcept; + XMVECTOR XM_CALLCONV XMVector3Project(FXMVECTOR V, float ViewportX, float ViewportY, float ViewportWidth, float ViewportHeight, float ViewportMinZ, float ViewportMaxZ, + FXMMATRIX Projection, CXMMATRIX View, CXMMATRIX World) noexcept; + XMFLOAT3* XM_CALLCONV XMVector3ProjectStream(_Out_writes_bytes_(sizeof(XMFLOAT3) + OutputStride * (VectorCount - 1)) XMFLOAT3* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(XMFLOAT3) + InputStride * (VectorCount - 1)) const XMFLOAT3* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, + _In_ float ViewportX, _In_ float ViewportY, _In_ float ViewportWidth, _In_ float ViewportHeight, _In_ float ViewportMinZ, _In_ float ViewportMaxZ, + _In_ FXMMATRIX Projection, _In_ CXMMATRIX View, _In_ CXMMATRIX World) noexcept; + XMVECTOR XM_CALLCONV 
XMVector3Unproject(FXMVECTOR V, float ViewportX, float ViewportY, float ViewportWidth, float ViewportHeight, float ViewportMinZ, float ViewportMaxZ, + FXMMATRIX Projection, CXMMATRIX View, CXMMATRIX World) noexcept; + XMFLOAT3* XM_CALLCONV XMVector3UnprojectStream(_Out_writes_bytes_(sizeof(XMFLOAT3) + OutputStride * (VectorCount - 1)) XMFLOAT3* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(XMFLOAT3) + InputStride * (VectorCount - 1)) const XMFLOAT3* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, + _In_ float ViewportX, _In_ float ViewportY, _In_ float ViewportWidth, _In_ float ViewportHeight, _In_ float ViewportMinZ, _In_ float ViewportMaxZ, + _In_ FXMMATRIX Projection, _In_ CXMMATRIX View, _In_ CXMMATRIX World) noexcept; + + /**************************************************************************** + * + * 4D vector operations + * + ****************************************************************************/ + + bool XM_CALLCONV XMVector4Equal(FXMVECTOR V1, FXMVECTOR V2) noexcept; + uint32_t XM_CALLCONV XMVector4EqualR(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector4EqualInt(FXMVECTOR V1, FXMVECTOR V2) noexcept; + uint32_t XM_CALLCONV XMVector4EqualIntR(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector4NearEqual(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR Epsilon) noexcept; + bool XM_CALLCONV XMVector4NotEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector4NotEqualInt(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector4Greater(FXMVECTOR V1, FXMVECTOR V2) noexcept; + uint32_t XM_CALLCONV XMVector4GreaterR(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector4GreaterOrEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept; + uint32_t XM_CALLCONV XMVector4GreaterOrEqualR(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector4Less(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector4LessOrEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector4InBounds(FXMVECTOR V, FXMVECTOR Bounds) noexcept; + + bool XM_CALLCONV XMVector4IsNaN(FXMVECTOR V) noexcept; + bool XM_CALLCONV XMVector4IsInfinite(FXMVECTOR V) noexcept; + + XMVECTOR XM_CALLCONV XMVector4Dot(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVector4Cross(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR V3) noexcept; + XMVECTOR XM_CALLCONV XMVector4LengthSq(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector4ReciprocalLengthEst(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector4ReciprocalLength(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector4LengthEst(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector4Length(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector4NormalizeEst(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector4Normalize(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector4ClampLength(FXMVECTOR V, float LengthMin, float LengthMax) noexcept; + XMVECTOR XM_CALLCONV XMVector4ClampLengthV(FXMVECTOR V, FXMVECTOR LengthMin, FXMVECTOR LengthMax) noexcept; + XMVECTOR XM_CALLCONV XMVector4Reflect(FXMVECTOR Incident, FXMVECTOR Normal) noexcept; + XMVECTOR XM_CALLCONV XMVector4Refract(FXMVECTOR Incident, FXMVECTOR Normal, float RefractionIndex) noexcept; + XMVECTOR XM_CALLCONV XMVector4RefractV(FXMVECTOR Incident, FXMVECTOR Normal, FXMVECTOR RefractionIndex) noexcept; + XMVECTOR XM_CALLCONV XMVector4Orthogonal(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector4AngleBetweenNormalsEst(FXMVECTOR N1, FXMVECTOR N2) noexcept; + XMVECTOR XM_CALLCONV 
XMVector4AngleBetweenNormals(FXMVECTOR N1, FXMVECTOR N2) noexcept; + XMVECTOR XM_CALLCONV XMVector4AngleBetweenVectors(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVector4Transform(FXMVECTOR V, FXMMATRIX M) noexcept; + XMFLOAT4* XM_CALLCONV XMVector4TransformStream(_Out_writes_bytes_(sizeof(XMFLOAT4) + OutputStride * (VectorCount - 1)) XMFLOAT4* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(XMFLOAT4) + InputStride * (VectorCount - 1)) const XMFLOAT4* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, _In_ FXMMATRIX M) noexcept; + + /**************************************************************************** + * + * Matrix operations + * + ****************************************************************************/ + + bool XM_CALLCONV XMMatrixIsNaN(FXMMATRIX M) noexcept; + bool XM_CALLCONV XMMatrixIsInfinite(FXMMATRIX M) noexcept; + bool XM_CALLCONV XMMatrixIsIdentity(FXMMATRIX M) noexcept; + + XMMATRIX XM_CALLCONV XMMatrixMultiply(FXMMATRIX M1, CXMMATRIX M2) noexcept; + XMMATRIX XM_CALLCONV XMMatrixMultiplyTranspose(FXMMATRIX M1, CXMMATRIX M2) noexcept; + XMMATRIX XM_CALLCONV XMMatrixTranspose(FXMMATRIX M) noexcept; + XMMATRIX XM_CALLCONV XMMatrixInverse(_Out_opt_ XMVECTOR* pDeterminant, _In_ FXMMATRIX M) noexcept; + XMMATRIX XM_CALLCONV XMMatrixVectorTensorProduct(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMMatrixDeterminant(FXMMATRIX M) noexcept; + + _Success_(return) + bool XM_CALLCONV XMMatrixDecompose(_Out_ XMVECTOR* outScale, _Out_ XMVECTOR* outRotQuat, _Out_ XMVECTOR* outTrans, _In_ FXMMATRIX M) noexcept; + + XMMATRIX XM_CALLCONV XMMatrixIdentity() noexcept; + XMMATRIX XM_CALLCONV XMMatrixSet(float m00, float m01, float m02, float m03, + float m10, float m11, float m12, float m13, + float m20, float m21, float m22, float m23, + float m30, float m31, float m32, float m33) noexcept; + XMMATRIX XM_CALLCONV XMMatrixTranslation(float OffsetX, float OffsetY, float OffsetZ) noexcept; + XMMATRIX XM_CALLCONV XMMatrixTranslationFromVector(FXMVECTOR Offset) noexcept; + XMMATRIX XM_CALLCONV XMMatrixScaling(float ScaleX, float ScaleY, float ScaleZ) noexcept; + XMMATRIX XM_CALLCONV XMMatrixScalingFromVector(FXMVECTOR Scale) noexcept; + XMMATRIX XM_CALLCONV XMMatrixRotationX(float Angle) noexcept; + XMMATRIX XM_CALLCONV XMMatrixRotationY(float Angle) noexcept; + XMMATRIX XM_CALLCONV XMMatrixRotationZ(float Angle) noexcept; + + // Rotates about y-axis (Yaw), then x-axis (Pitch), then z-axis (Roll) + XMMATRIX XM_CALLCONV XMMatrixRotationRollPitchYaw(float Pitch, float Yaw, float Roll) noexcept; + + // Rotates about y-axis (Angles.y), then x-axis (Angles.x), then z-axis (Angles.z) + XMMATRIX XM_CALLCONV XMMatrixRotationRollPitchYawFromVector(FXMVECTOR Angles) noexcept; + + XMMATRIX XM_CALLCONV XMMatrixRotationNormal(FXMVECTOR NormalAxis, float Angle) noexcept; + XMMATRIX XM_CALLCONV XMMatrixRotationAxis(FXMVECTOR Axis, float Angle) noexcept; + XMMATRIX XM_CALLCONV XMMatrixRotationQuaternion(FXMVECTOR Quaternion) noexcept; + XMMATRIX XM_CALLCONV XMMatrixTransformation2D(FXMVECTOR ScalingOrigin, float ScalingOrientation, FXMVECTOR Scaling, + FXMVECTOR RotationOrigin, float Rotation, GXMVECTOR Translation) noexcept; + XMMATRIX XM_CALLCONV XMMatrixTransformation(FXMVECTOR ScalingOrigin, FXMVECTOR ScalingOrientationQuaternion, FXMVECTOR Scaling, + GXMVECTOR RotationOrigin, HXMVECTOR RotationQuaternion, HXMVECTOR Translation) noexcept; + XMMATRIX XM_CALLCONV XMMatrixAffineTransformation2D(FXMVECTOR Scaling, FXMVECTOR 
RotationOrigin, float Rotation, FXMVECTOR Translation) noexcept; + XMMATRIX XM_CALLCONV XMMatrixAffineTransformation(FXMVECTOR Scaling, FXMVECTOR RotationOrigin, FXMVECTOR RotationQuaternion, GXMVECTOR Translation) noexcept; + XMMATRIX XM_CALLCONV XMMatrixReflect(FXMVECTOR ReflectionPlane) noexcept; + XMMATRIX XM_CALLCONV XMMatrixShadow(FXMVECTOR ShadowPlane, FXMVECTOR LightPosition) noexcept; + + XMMATRIX XM_CALLCONV XMMatrixLookAtLH(FXMVECTOR EyePosition, FXMVECTOR FocusPosition, FXMVECTOR UpDirection) noexcept; + XMMATRIX XM_CALLCONV XMMatrixLookAtRH(FXMVECTOR EyePosition, FXMVECTOR FocusPosition, FXMVECTOR UpDirection) noexcept; + XMMATRIX XM_CALLCONV XMMatrixLookToLH(FXMVECTOR EyePosition, FXMVECTOR EyeDirection, FXMVECTOR UpDirection) noexcept; + XMMATRIX XM_CALLCONV XMMatrixLookToRH(FXMVECTOR EyePosition, FXMVECTOR EyeDirection, FXMVECTOR UpDirection) noexcept; + XMMATRIX XM_CALLCONV XMMatrixPerspectiveLH(float ViewWidth, float ViewHeight, float NearZ, float FarZ) noexcept; + XMMATRIX XM_CALLCONV XMMatrixPerspectiveRH(float ViewWidth, float ViewHeight, float NearZ, float FarZ) noexcept; + XMMATRIX XM_CALLCONV XMMatrixPerspectiveFovLH(float FovAngleY, float AspectRatio, float NearZ, float FarZ) noexcept; + XMMATRIX XM_CALLCONV XMMatrixPerspectiveFovRH(float FovAngleY, float AspectRatio, float NearZ, float FarZ) noexcept; + XMMATRIX XM_CALLCONV XMMatrixPerspectiveOffCenterLH(float ViewLeft, float ViewRight, float ViewBottom, float ViewTop, float NearZ, float FarZ) noexcept; + XMMATRIX XM_CALLCONV XMMatrixPerspectiveOffCenterRH(float ViewLeft, float ViewRight, float ViewBottom, float ViewTop, float NearZ, float FarZ) noexcept; + XMMATRIX XM_CALLCONV XMMatrixOrthographicLH(float ViewWidth, float ViewHeight, float NearZ, float FarZ) noexcept; + XMMATRIX XM_CALLCONV XMMatrixOrthographicRH(float ViewWidth, float ViewHeight, float NearZ, float FarZ) noexcept; + XMMATRIX XM_CALLCONV XMMatrixOrthographicOffCenterLH(float ViewLeft, float ViewRight, float ViewBottom, float ViewTop, float NearZ, float FarZ) noexcept; + XMMATRIX XM_CALLCONV XMMatrixOrthographicOffCenterRH(float ViewLeft, float ViewRight, float ViewBottom, float ViewTop, float NearZ, float FarZ) noexcept; + + + /**************************************************************************** + * + * Quaternion operations + * + ****************************************************************************/ + + bool XM_CALLCONV XMQuaternionEqual(FXMVECTOR Q1, FXMVECTOR Q2) noexcept; + bool XM_CALLCONV XMQuaternionNotEqual(FXMVECTOR Q1, FXMVECTOR Q2) noexcept; + + bool XM_CALLCONV XMQuaternionIsNaN(FXMVECTOR Q) noexcept; + bool XM_CALLCONV XMQuaternionIsInfinite(FXMVECTOR Q) noexcept; + bool XM_CALLCONV XMQuaternionIsIdentity(FXMVECTOR Q) noexcept; + + XMVECTOR XM_CALLCONV XMQuaternionDot(FXMVECTOR Q1, FXMVECTOR Q2) noexcept; + XMVECTOR XM_CALLCONV XMQuaternionMultiply(FXMVECTOR Q1, FXMVECTOR Q2) noexcept; + XMVECTOR XM_CALLCONV XMQuaternionLengthSq(FXMVECTOR Q) noexcept; + XMVECTOR XM_CALLCONV XMQuaternionReciprocalLength(FXMVECTOR Q) noexcept; + XMVECTOR XM_CALLCONV XMQuaternionLength(FXMVECTOR Q) noexcept; + XMVECTOR XM_CALLCONV XMQuaternionNormalizeEst(FXMVECTOR Q) noexcept; + XMVECTOR XM_CALLCONV XMQuaternionNormalize(FXMVECTOR Q) noexcept; + XMVECTOR XM_CALLCONV XMQuaternionConjugate(FXMVECTOR Q) noexcept; + XMVECTOR XM_CALLCONV XMQuaternionInverse(FXMVECTOR Q) noexcept; + XMVECTOR XM_CALLCONV XMQuaternionLn(FXMVECTOR Q) noexcept; + XMVECTOR XM_CALLCONV XMQuaternionExp(FXMVECTOR Q) noexcept; + XMVECTOR XM_CALLCONV 
XMQuaternionSlerp(FXMVECTOR Q0, FXMVECTOR Q1, float t) noexcept; + XMVECTOR XM_CALLCONV XMQuaternionSlerpV(FXMVECTOR Q0, FXMVECTOR Q1, FXMVECTOR T) noexcept; + XMVECTOR XM_CALLCONV XMQuaternionSquad(FXMVECTOR Q0, FXMVECTOR Q1, FXMVECTOR Q2, GXMVECTOR Q3, float t) noexcept; + XMVECTOR XM_CALLCONV XMQuaternionSquadV(FXMVECTOR Q0, FXMVECTOR Q1, FXMVECTOR Q2, GXMVECTOR Q3, HXMVECTOR T) noexcept; + void XM_CALLCONV XMQuaternionSquadSetup(_Out_ XMVECTOR* pA, _Out_ XMVECTOR* pB, _Out_ XMVECTOR* pC, _In_ FXMVECTOR Q0, _In_ FXMVECTOR Q1, _In_ FXMVECTOR Q2, _In_ GXMVECTOR Q3) noexcept; + XMVECTOR XM_CALLCONV XMQuaternionBaryCentric(FXMVECTOR Q0, FXMVECTOR Q1, FXMVECTOR Q2, float f, float g) noexcept; + XMVECTOR XM_CALLCONV XMQuaternionBaryCentricV(FXMVECTOR Q0, FXMVECTOR Q1, FXMVECTOR Q2, GXMVECTOR F, HXMVECTOR G) noexcept; + + XMVECTOR XM_CALLCONV XMQuaternionIdentity() noexcept; + + // Rotates about y-axis (Yaw), then x-axis (Pitch), then z-axis (Roll) + XMVECTOR XM_CALLCONV XMQuaternionRotationRollPitchYaw(float Pitch, float Yaw, float Roll) noexcept; + + // Rotates about y-axis (Angles.y), then x-axis (Angles.x), then z-axis (Angles.z) + XMVECTOR XM_CALLCONV XMQuaternionRotationRollPitchYawFromVector(FXMVECTOR Angles) noexcept; + + XMVECTOR XM_CALLCONV XMQuaternionRotationNormal(FXMVECTOR NormalAxis, float Angle) noexcept; + XMVECTOR XM_CALLCONV XMQuaternionRotationAxis(FXMVECTOR Axis, float Angle) noexcept; + XMVECTOR XM_CALLCONV XMQuaternionRotationMatrix(FXMMATRIX M) noexcept; + + void XM_CALLCONV XMQuaternionToAxisAngle(_Out_ XMVECTOR* pAxis, _Out_ float* pAngle, _In_ FXMVECTOR Q) noexcept; + + /**************************************************************************** + * + * Plane operations + * + ****************************************************************************/ + + bool XM_CALLCONV XMPlaneEqual(FXMVECTOR P1, FXMVECTOR P2) noexcept; + bool XM_CALLCONV XMPlaneNearEqual(FXMVECTOR P1, FXMVECTOR P2, FXMVECTOR Epsilon) noexcept; + bool XM_CALLCONV XMPlaneNotEqual(FXMVECTOR P1, FXMVECTOR P2) noexcept; + + bool XM_CALLCONV XMPlaneIsNaN(FXMVECTOR P) noexcept; + bool XM_CALLCONV XMPlaneIsInfinite(FXMVECTOR P) noexcept; + + XMVECTOR XM_CALLCONV XMPlaneDot(FXMVECTOR P, FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMPlaneDotCoord(FXMVECTOR P, FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMPlaneDotNormal(FXMVECTOR P, FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMPlaneNormalizeEst(FXMVECTOR P) noexcept; + XMVECTOR XM_CALLCONV XMPlaneNormalize(FXMVECTOR P) noexcept; + XMVECTOR XM_CALLCONV XMPlaneIntersectLine(FXMVECTOR P, FXMVECTOR LinePoint1, FXMVECTOR LinePoint2) noexcept; + void XM_CALLCONV XMPlaneIntersectPlane(_Out_ XMVECTOR* pLinePoint1, _Out_ XMVECTOR* pLinePoint2, _In_ FXMVECTOR P1, _In_ FXMVECTOR P2) noexcept; + + // Transforms a plane given an inverse transpose matrix + XMVECTOR XM_CALLCONV XMPlaneTransform(FXMVECTOR P, FXMMATRIX ITM) noexcept; + + // Transforms an array of planes given an inverse transpose matrix + XMFLOAT4* XM_CALLCONV XMPlaneTransformStream(_Out_writes_bytes_(sizeof(XMFLOAT4) + OutputStride * (PlaneCount - 1)) XMFLOAT4* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(XMFLOAT4) + InputStride * (PlaneCount - 1)) const XMFLOAT4* pInputStream, + _In_ size_t InputStride, _In_ size_t PlaneCount, _In_ FXMMATRIX ITM) noexcept; + + XMVECTOR XM_CALLCONV XMPlaneFromPointNormal(FXMVECTOR Point, FXMVECTOR Normal) noexcept; + XMVECTOR XM_CALLCONV XMPlaneFromPoints(FXMVECTOR Point1, FXMVECTOR Point2, FXMVECTOR Point3) noexcept; + + 
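(Reviewer note, not part of the vendored header: a minimal usage sketch of the plane API declared above, assuming only the functions in this hunk plus XMVectorSet/XMVectorGetX declared earlier in the same header; the helper name and calling pattern are illustrative, not part of DirectXMath.)

    #include <DirectXMath.h>
    using namespace DirectX;

    // Signed distance from 'point' to the plane through p0, p1, p2.
    // XMPlaneFromPoints builds the plane, XMPlaneNormalize guarantees a unit
    // normal, and XMPlaneDotCoord then returns the signed distance in every lane.
    inline float XM_CALLCONV PlaneSignedDistance(FXMVECTOR p0, FXMVECTOR p1, FXMVECTOR p2, GXMVECTOR point)
    {
        XMVECTOR plane = XMPlaneNormalize(XMPlaneFromPoints(p0, p1, p2));
        return XMVectorGetX(XMPlaneDotCoord(plane, point));
    }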
/**************************************************************************** + * + * Color operations + * + ****************************************************************************/ + + bool XM_CALLCONV XMColorEqual(FXMVECTOR C1, FXMVECTOR C2) noexcept; + bool XM_CALLCONV XMColorNotEqual(FXMVECTOR C1, FXMVECTOR C2) noexcept; + bool XM_CALLCONV XMColorGreater(FXMVECTOR C1, FXMVECTOR C2) noexcept; + bool XM_CALLCONV XMColorGreaterOrEqual(FXMVECTOR C1, FXMVECTOR C2) noexcept; + bool XM_CALLCONV XMColorLess(FXMVECTOR C1, FXMVECTOR C2) noexcept; + bool XM_CALLCONV XMColorLessOrEqual(FXMVECTOR C1, FXMVECTOR C2) noexcept; + + bool XM_CALLCONV XMColorIsNaN(FXMVECTOR C) noexcept; + bool XM_CALLCONV XMColorIsInfinite(FXMVECTOR C) noexcept; + + XMVECTOR XM_CALLCONV XMColorNegative(FXMVECTOR C) noexcept; + XMVECTOR XM_CALLCONV XMColorModulate(FXMVECTOR C1, FXMVECTOR C2) noexcept; + XMVECTOR XM_CALLCONV XMColorAdjustSaturation(FXMVECTOR C, float Saturation) noexcept; + XMVECTOR XM_CALLCONV XMColorAdjustContrast(FXMVECTOR C, float Contrast) noexcept; + + XMVECTOR XM_CALLCONV XMColorRGBToHSL(FXMVECTOR rgb) noexcept; + XMVECTOR XM_CALLCONV XMColorHSLToRGB(FXMVECTOR hsl) noexcept; + + XMVECTOR XM_CALLCONV XMColorRGBToHSV(FXMVECTOR rgb) noexcept; + XMVECTOR XM_CALLCONV XMColorHSVToRGB(FXMVECTOR hsv) noexcept; + + XMVECTOR XM_CALLCONV XMColorRGBToYUV(FXMVECTOR rgb) noexcept; + XMVECTOR XM_CALLCONV XMColorYUVToRGB(FXMVECTOR yuv) noexcept; + + XMVECTOR XM_CALLCONV XMColorRGBToYUV_HD(FXMVECTOR rgb) noexcept; + XMVECTOR XM_CALLCONV XMColorYUVToRGB_HD(FXMVECTOR yuv) noexcept; + + XMVECTOR XM_CALLCONV XMColorRGBToYUV_UHD(FXMVECTOR rgb) noexcept; + XMVECTOR XM_CALLCONV XMColorYUVToRGB_UHD(FXMVECTOR yuv) noexcept; + + XMVECTOR XM_CALLCONV XMColorRGBToXYZ(FXMVECTOR rgb) noexcept; + XMVECTOR XM_CALLCONV XMColorXYZToRGB(FXMVECTOR xyz) noexcept; + + XMVECTOR XM_CALLCONV XMColorXYZToSRGB(FXMVECTOR xyz) noexcept; + XMVECTOR XM_CALLCONV XMColorSRGBToXYZ(FXMVECTOR srgb) noexcept; + + XMVECTOR XM_CALLCONV XMColorRGBToSRGB(FXMVECTOR rgb) noexcept; + XMVECTOR XM_CALLCONV XMColorSRGBToRGB(FXMVECTOR srgb) noexcept; + + + /**************************************************************************** + * + * Miscellaneous operations + * + ****************************************************************************/ + + bool XMVerifyCPUSupport() noexcept; + + XMVECTOR XM_CALLCONV XMFresnelTerm(FXMVECTOR CosIncidentAngle, FXMVECTOR RefractionIndex) noexcept; + + bool XMScalarNearEqual(float S1, float S2, float Epsilon) noexcept; + float XMScalarModAngle(float Value) noexcept; + + float XMScalarSin(float Value) noexcept; + float XMScalarSinEst(float Value) noexcept; + + float XMScalarCos(float Value) noexcept; + float XMScalarCosEst(float Value) noexcept; + + void XMScalarSinCos(_Out_ float* pSin, _Out_ float* pCos, float Value) noexcept; + void XMScalarSinCosEst(_Out_ float* pSin, _Out_ float* pCos, float Value) noexcept; + + float XMScalarASin(float Value) noexcept; + float XMScalarASinEst(float Value) noexcept; + + float XMScalarACos(float Value) noexcept; + float XMScalarACosEst(float Value) noexcept; + + /**************************************************************************** + * + * Templates + * + ****************************************************************************/ + +#if defined(__XNAMATH_H__) && defined(XMMin) +#undef XMMin +#undef XMMax +#endif + + template<class T> inline T XMMin(T a, T b) noexcept { return (a < b) ? a : b; } + template<class T> inline T XMMax(T a, T b) noexcept { return (a > b) ?
a : b; } + + //------------------------------------------------------------------------------ + +#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + +// PermuteHelper internal template (SSE only) + namespace Internal + { + // Slow path fallback for permutes that do not map to a single SSE shuffle opcode. + template<uint32_t Shuffle, bool WhichX, bool WhichY, bool WhichZ, bool WhichW> struct PermuteHelper + { + static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) noexcept + { + static const XMVECTORU32 selectMask = + { { { + WhichX ? 0xFFFFFFFF : 0, + WhichY ? 0xFFFFFFFF : 0, + WhichZ ? 0xFFFFFFFF : 0, + WhichW ? 0xFFFFFFFF : 0, + } } }; + + XMVECTOR shuffled1 = XM_PERMUTE_PS(v1, Shuffle); + XMVECTOR shuffled2 = XM_PERMUTE_PS(v2, Shuffle); + + XMVECTOR masked1 = _mm_andnot_ps(selectMask, shuffled1); + XMVECTOR masked2 = _mm_and_ps(selectMask, shuffled2); + + return _mm_or_ps(masked1, masked2); + } + }; + + // Fast path for permutes that only read from the first vector. + template<uint32_t Shuffle> struct PermuteHelper<Shuffle, false, false, false, false> + { + static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR) noexcept { return XM_PERMUTE_PS(v1, Shuffle); } + }; + + // Fast path for permutes that only read from the second vector. + template<uint32_t Shuffle> struct PermuteHelper<Shuffle, true, true, true, true> + { + static XMVECTOR XM_CALLCONV Permute(FXMVECTOR, FXMVECTOR v2) noexcept { return XM_PERMUTE_PS(v2, Shuffle); } + }; + + // Fast path for permutes that read XY from the first vector, ZW from the second. + template<uint32_t Shuffle> struct PermuteHelper<Shuffle, false, false, true, true> + { + static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) noexcept { return _mm_shuffle_ps(v1, v2, Shuffle); } + }; + + // Fast path for permutes that read XY from the second vector, ZW from the first. + template<uint32_t Shuffle> struct PermuteHelper<Shuffle, true, true, false, false> + { + static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) noexcept { return _mm_shuffle_ps(v2, v1, Shuffle); } + }; + } + +#endif // _XM_SSE_INTRINSICS_ && !_XM_NO_INTRINSICS_ + + // General permute template + template<uint32_t PermuteX, uint32_t PermuteY, uint32_t PermuteZ, uint32_t PermuteW> + inline XMVECTOR XM_CALLCONV XMVectorPermute(FXMVECTOR V1, FXMVECTOR V2) noexcept + { + static_assert(PermuteX <= 7, "PermuteX template parameter out of range"); + static_assert(PermuteY <= 7, "PermuteY template parameter out of range"); + static_assert(PermuteZ <= 7, "PermuteZ template parameter out of range"); + static_assert(PermuteW <= 7, "PermuteW template parameter out of range"); + +#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + constexpr uint32_t Shuffle = _MM_SHUFFLE(PermuteW & 3, PermuteZ & 3, PermuteY & 3, PermuteX & 3); + + constexpr bool WhichX = PermuteX > 3; + constexpr bool WhichY = PermuteY > 3; + constexpr bool WhichZ = PermuteZ > 3; + constexpr bool WhichW = PermuteW > 3; + + return Internal::PermuteHelper<Shuffle, WhichX, WhichY, WhichZ, WhichW>::Permute(V1, V2); +#else + + return XMVectorPermute(V1, V2, PermuteX, PermuteY, PermuteZ, PermuteW); + +#endif + } + + // Special-case permute templates + template<> constexpr XMVECTOR XM_CALLCONV XMVectorPermute<0, 1, 2, 3>(FXMVECTOR V1, FXMVECTOR) noexcept { return V1; } + template<> constexpr XMVECTOR XM_CALLCONV XMVectorPermute<4, 5, 6, 7>(FXMVECTOR, FXMVECTOR V2) noexcept { return V2; } + +#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 1, 4, 5>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_movelh_ps(V1, V2); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<6, 7, 2, 3>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_movehl_ps(V1, V2); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 4, 1, 5>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_unpacklo_ps(V1, V2); } +
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<2, 6, 3, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_unpackhi_ps(V1, V2); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<2, 3, 6, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(V1), _mm_castps_pd(V2))); } +#endif + +#if defined(_XM_SSE4_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4, 1, 2, 3>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_blend_ps(V1, V2, 0x1); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 5, 2, 3>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_blend_ps(V1, V2, 0x2); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4, 5, 2, 3>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_blend_ps(V1, V2, 0x3); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 1, 6, 3>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_blend_ps(V1, V2, 0x4); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4, 1, 6, 3>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_blend_ps(V1, V2, 0x5); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 5, 6, 3>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_blend_ps(V1, V2, 0x6); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4, 5, 6, 3>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_blend_ps(V1, V2, 0x7); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 1, 2, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_blend_ps(V1, V2, 0x8); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4, 1, 2, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_blend_ps(V1, V2, 0x9); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 5, 2, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_blend_ps(V1, V2, 0xA); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4, 5, 2, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_blend_ps(V1, V2, 0xB); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 1, 6, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_blend_ps(V1, V2, 0xC); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4, 1, 6, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_blend_ps(V1, V2, 0xD); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 5, 6, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_blend_ps(V1, V2, 0xE); } +#endif + +#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + + // If the indices are all in the range 0-3 or 4-7, then use XMVectorSwizzle instead + // The mirror cases are not spelled out here as the programmer can always swap the arguments + // (i.e. 
prefer permutes where the X element comes from the V1 vector instead of the V2 vector) + + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 1, 4, 5>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vcombine_f32(vget_low_f32(V1), vget_low_f32(V2)); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<1, 0, 4, 5>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vcombine_f32(vrev64_f32(vget_low_f32(V1)), vget_low_f32(V2)); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 1, 5, 4>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vcombine_f32(vget_low_f32(V1), vrev64_f32(vget_low_f32(V2))); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<1, 0, 5, 4>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vcombine_f32(vrev64_f32(vget_low_f32(V1)), vrev64_f32(vget_low_f32(V2))); } + + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<2, 3, 6, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vcombine_f32(vget_high_f32(V1), vget_high_f32(V2)); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<3, 2, 6, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vcombine_f32(vrev64_f32(vget_high_f32(V1)), vget_high_f32(V2)); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<2, 3, 7, 6>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vcombine_f32(vget_high_f32(V1), vrev64_f32(vget_high_f32(V2))); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<3, 2, 7, 6>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vcombine_f32(vrev64_f32(vget_high_f32(V1)), vrev64_f32(vget_high_f32(V2))); } + + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 1, 6, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vcombine_f32(vget_low_f32(V1), vget_high_f32(V2)); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<1, 0, 6, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vcombine_f32(vrev64_f32(vget_low_f32(V1)), vget_high_f32(V2)); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 1, 7, 6>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vcombine_f32(vget_low_f32(V1), vrev64_f32(vget_high_f32(V2))); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<1, 0, 7, 6>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vcombine_f32(vrev64_f32(vget_low_f32(V1)), vrev64_f32(vget_high_f32(V2))); } + + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<3, 2, 4, 5>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vcombine_f32(vrev64_f32(vget_high_f32(V1)), vget_low_f32(V2)); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<2, 3, 5, 4>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vcombine_f32(vget_high_f32(V1), vrev64_f32(vget_low_f32(V2))); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<3, 2, 5, 4>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vcombine_f32(vrev64_f32(vget_high_f32(V1)), vrev64_f32(vget_low_f32(V2))); } + + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 4, 2, 6>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vtrnq_f32(V1, V2).val[0]; } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<1, 5, 3, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vtrnq_f32(V1, V2).val[1]; } + + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 4, 1, 5>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vzipq_f32(V1, V2).val[0]; } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<2, 6, 3, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vzipq_f32(V1, V2).val[1]; } + + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 2, 4, 6>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vuzpq_f32(V1, V2).val[0]; } + template<> inline 
XMVECTOR XM_CALLCONV XMVectorPermute<1, 3, 5, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vuzpq_f32(V1, V2).val[1]; } + + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<1, 2, 3, 4>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vextq_f32(V1, V2, 1); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<2, 3, 4, 5>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vextq_f32(V1, V2, 2); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<3, 4, 5, 6>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vextq_f32(V1, V2, 3); } + +#endif // _XM_ARM_NEON_INTRINSICS_ && !_XM_NO_INTRINSICS_ + + //------------------------------------------------------------------------------ + + // General swizzle template + template<uint32_t SwizzleX, uint32_t SwizzleY, uint32_t SwizzleZ, uint32_t SwizzleW> + inline XMVECTOR XM_CALLCONV XMVectorSwizzle(FXMVECTOR V) noexcept + { + static_assert(SwizzleX <= 3, "SwizzleX template parameter out of range"); + static_assert(SwizzleY <= 3, "SwizzleY template parameter out of range"); + static_assert(SwizzleZ <= 3, "SwizzleZ template parameter out of range"); + static_assert(SwizzleW <= 3, "SwizzleW template parameter out of range"); + +#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + return XM_PERMUTE_PS(V, _MM_SHUFFLE(SwizzleW, SwizzleZ, SwizzleY, SwizzleX)); +#else + + return XMVectorSwizzle(V, SwizzleX, SwizzleY, SwizzleZ, SwizzleW); + +#endif + } + + // Specialized swizzles + template<> constexpr XMVECTOR XM_CALLCONV XMVectorSwizzle<0, 1, 2, 3>(FXMVECTOR V) noexcept { return V; } + +#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0, 1, 0, 1>(FXMVECTOR V) noexcept { return _mm_movelh_ps(V, V); } + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2, 3, 2, 3>(FXMVECTOR V) noexcept { return _mm_movehl_ps(V, V); } + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0, 0, 1, 1>(FXMVECTOR V) noexcept { return _mm_unpacklo_ps(V, V); } + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2, 2, 3, 3>(FXMVECTOR V) noexcept { return _mm_unpackhi_ps(V, V); } +#endif + +#if defined(_XM_SSE3_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0, 0, 2, 2>(FXMVECTOR V) noexcept { return _mm_moveldup_ps(V); } + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1, 1, 3, 3>(FXMVECTOR V) noexcept { return _mm_movehdup_ps(V); } +#endif + +#if defined(_XM_AVX2_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) && defined(_XM_FAVOR_INTEL_) + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0, 0, 0, 0>(FXMVECTOR V) noexcept { return _mm_broadcastss_ps(V); } +#endif + +#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0, 0, 0, 0>(FXMVECTOR V) noexcept { return vdupq_lane_f32(vget_low_f32(V), 0); } + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1, 1, 1, 1>(FXMVECTOR V) noexcept { return vdupq_lane_f32(vget_low_f32(V), 1); } + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2, 2, 2, 2>(FXMVECTOR V) noexcept { return vdupq_lane_f32(vget_high_f32(V), 0); } + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<3, 3, 3, 3>(FXMVECTOR V) noexcept { return vdupq_lane_f32(vget_high_f32(V), 1); } + + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1, 0, 3, 2>(FXMVECTOR V) noexcept { return vrev64q_f32(V); } + + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0, 1, 0, 1>(FXMVECTOR V) noexcept { float32x2_t vt = vget_low_f32(V); return vcombine_f32(vt, vt); } + template<> inline XMVECTOR
XM_CALLCONV XMVectorSwizzle<2, 3, 2, 3>(FXMVECTOR V) noexcept { float32x2_t vt = vget_high_f32(V); return vcombine_f32(vt, vt); } + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1, 0, 1, 0>(FXMVECTOR V) noexcept { float32x2_t vt = vrev64_f32(vget_low_f32(V)); return vcombine_f32(vt, vt); } + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<3, 2, 3, 2>(FXMVECTOR V) noexcept { float32x2_t vt = vrev64_f32(vget_high_f32(V)); return vcombine_f32(vt, vt); } + + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0, 1, 3, 2>(FXMVECTOR V) noexcept { return vcombine_f32(vget_low_f32(V), vrev64_f32(vget_high_f32(V))); } + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1, 0, 2, 3>(FXMVECTOR V) noexcept { return vcombine_f32(vrev64_f32(vget_low_f32(V)), vget_high_f32(V)); } + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2, 3, 1, 0>(FXMVECTOR V) noexcept { return vcombine_f32(vget_high_f32(V), vrev64_f32(vget_low_f32(V))); } + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<3, 2, 0, 1>(FXMVECTOR V) noexcept { return vcombine_f32(vrev64_f32(vget_high_f32(V)), vget_low_f32(V)); } + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<3, 2, 1, 0>(FXMVECTOR V) noexcept { return vcombine_f32(vrev64_f32(vget_high_f32(V)), vrev64_f32(vget_low_f32(V))); } + + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0, 0, 2, 2>(FXMVECTOR V) noexcept { return vtrnq_f32(V, V).val[0]; } + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1, 1, 3, 3>(FXMVECTOR V) noexcept { return vtrnq_f32(V, V).val[1]; } + + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0, 0, 1, 1>(FXMVECTOR V) noexcept { return vzipq_f32(V, V).val[0]; } + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2, 2, 3, 3>(FXMVECTOR V) noexcept { return vzipq_f32(V, V).val[1]; } + + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0, 2, 0, 2>(FXMVECTOR V) noexcept { return vuzpq_f32(V, V).val[0]; } + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1, 3, 1, 3>(FXMVECTOR V) noexcept { return vuzpq_f32(V, V).val[1]; } + + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1, 2, 3, 0>(FXMVECTOR V) noexcept { return vextq_f32(V, V, 1); } + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2, 3, 0, 1>(FXMVECTOR V) noexcept { return vextq_f32(V, V, 2); } + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<3, 0, 1, 2>(FXMVECTOR V) noexcept { return vextq_f32(V, V, 3); } + +#endif // _XM_ARM_NEON_INTRINSICS_ && !_XM_NO_INTRINSICS_ + + //------------------------------------------------------------------------------ + + template<uint32_t Elements> + inline XMVECTOR XM_CALLCONV XMVectorShiftLeft(FXMVECTOR V1, FXMVECTOR V2) noexcept + { + static_assert(Elements < 4, "Elements template parameter out of range"); + return XMVectorPermute<Elements, (Elements + 1), (Elements + 2), (Elements + 3)>(V1, V2); + } + + template<uint32_t Elements> + inline XMVECTOR XM_CALLCONV XMVectorRotateLeft(FXMVECTOR V) noexcept + { + static_assert(Elements < 4, "Elements template parameter out of range"); + return XMVectorSwizzle<Elements & 3, (Elements + 1) & 3, (Elements + 2) & 3, (Elements + 3) & 3>(V); + } + + template<uint32_t Elements> + inline XMVECTOR XM_CALLCONV XMVectorRotateRight(FXMVECTOR V) noexcept + { + static_assert(Elements < 4, "Elements template parameter out of range"); + return XMVectorSwizzle<(4 - Elements) & 3, (5 - Elements) & 3, (6 - Elements) & 3, (7 - Elements) & 3>(V); + } + + template<uint32_t VSLeftRotateElements, uint32_t Select0, uint32_t Select1, uint32_t Select2, uint32_t Select3> + inline XMVECTOR XM_CALLCONV XMVectorInsert(FXMVECTOR VD, FXMVECTOR VS) noexcept + { + XMVECTOR Control = XMVectorSelectControl(Select0 & 1, Select1 & 1, Select2 & 1, Select3 & 1); + return XMVectorSelect(VD, XMVectorRotateLeft<VSLeftRotateElements>(VS), Control); + } + +
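(Reviewer note, not part of the vendored header: a short sketch of how the compile-time permute/swizzle templates above are typically used; the values and the function name are illustrative only. Because the element indices are template parameters, most combinations collapse to a single shuffle instruction via the specializations listed above, e.g. XMVectorPermute<0, 1, 4, 5> maps to _mm_movelh_ps on SSE.)

    #include <DirectXMath.h>
    using namespace DirectX;

    inline void PermuteSwizzleExample()
    {
        XMVECTOR a = XMVectorSet(1.0f, 2.0f, 3.0f, 4.0f);
        XMVECTOR b = XMVectorSet(5.0f, 6.0f, 7.0f, 8.0f);

        XMVECTOR reversed  = XMVectorSwizzle<3, 2, 1, 0>(a);    // (4, 3, 2, 1)
        XMVECTOR lowHalves = XMVectorPermute<0, 1, 4, 5>(a, b); // (1, 2, 5, 6): indices 0-3 address a, 4-7 address b
        XMVECTOR rotated   = XMVectorRotateLeft<1>(a);          // (2, 3, 4, 1)

        (void)reversed; (void)lowHalves; (void)rotated;
    }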
/**************************************************************************** + * + * Globals + * + ****************************************************************************/ + + // The purpose of the following global constants is to prevent redundant + // reloading of the constants when they are referenced by more than one + // separate inline math routine called within the same function. Declaring + // a constant locally within a routine is sufficient to prevent redundant + // reloads of that constant when that single routine is called multiple + // times in a function, but if the constant is used (and declared) in a + // separate math routine it would be reloaded. + +#ifndef XMGLOBALCONST +#if defined(__GNUC__) && !defined(__MINGW32__) +#define XMGLOBALCONST extern const __attribute__((weak)) +#else +#define XMGLOBALCONST extern const __declspec(selectany) +#endif +#endif + + XMGLOBALCONST XMVECTORF32 g_XMSinCoefficients0 = { { { -0.16666667f, +0.0083333310f, -0.00019840874f, +2.7525562e-06f } } }; + XMGLOBALCONST XMVECTORF32 g_XMSinCoefficients1 = { { { -2.3889859e-08f, -0.16665852f /*Est1*/, +0.0083139502f /*Est2*/, -0.00018524670f /*Est3*/ } } }; + XMGLOBALCONST XMVECTORF32 g_XMCosCoefficients0 = { { { -0.5f, +0.041666638f, -0.0013888378f, +2.4760495e-05f } } }; + XMGLOBALCONST XMVECTORF32 g_XMCosCoefficients1 = { { { -2.6051615e-07f, -0.49992746f /*Est1*/, +0.041493919f /*Est2*/, -0.0012712436f /*Est3*/ } } }; + XMGLOBALCONST XMVECTORF32 g_XMTanCoefficients0 = { { { 1.0f, 0.333333333f, 0.133333333f, 5.396825397e-2f } } }; + XMGLOBALCONST XMVECTORF32 g_XMTanCoefficients1 = { { { 2.186948854e-2f, 8.863235530e-3f, 3.592128167e-3f, 1.455834485e-3f } } }; + XMGLOBALCONST XMVECTORF32 g_XMTanCoefficients2 = { { { 5.900274264e-4f, 2.391290764e-4f, 9.691537707e-5f, 3.927832950e-5f } } }; + XMGLOBALCONST XMVECTORF32 g_XMArcCoefficients0 = { { { +1.5707963050f, -0.2145988016f, +0.0889789874f, -0.0501743046f } } }; + XMGLOBALCONST XMVECTORF32 g_XMArcCoefficients1 = { { { +0.0308918810f, -0.0170881256f, +0.0066700901f, -0.0012624911f } } }; + XMGLOBALCONST XMVECTORF32 g_XMATanCoefficients0 = { { { -0.3333314528f, +0.1999355085f, -0.1420889944f, +0.1065626393f } } }; + XMGLOBALCONST XMVECTORF32 g_XMATanCoefficients1 = { { { -0.0752896400f, +0.0429096138f, -0.0161657367f, +0.0028662257f } } }; + XMGLOBALCONST XMVECTORF32 g_XMATanEstCoefficients0 = { { { +0.999866f, +0.999866f, +0.999866f, +0.999866f } } }; + XMGLOBALCONST XMVECTORF32 g_XMATanEstCoefficients1 = { { { -0.3302995f, +0.180141f, -0.085133f, +0.0208351f } } }; + XMGLOBALCONST XMVECTORF32 g_XMTanEstCoefficients = { { { 2.484f, -1.954923183e-1f, 2.467401101f, XM_1DIVPI } } }; + XMGLOBALCONST XMVECTORF32 g_XMArcEstCoefficients = { { { +1.5707288f, -0.2121144f, +0.0742610f, -0.0187293f } } }; + XMGLOBALCONST XMVECTORF32 g_XMPiConstants0 = { { { XM_PI, XM_2PI, XM_1DIVPI, XM_1DIV2PI } } }; + XMGLOBALCONST XMVECTORF32 g_XMIdentityR0 = { { { 1.0f, 0.0f, 0.0f, 0.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMIdentityR1 = { { { 0.0f, 1.0f, 0.0f, 0.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMIdentityR2 = { { { 0.0f, 0.0f, 1.0f, 0.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMIdentityR3 = { { { 0.0f, 0.0f, 0.0f, 1.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMNegIdentityR0 = { { { -1.0f, 0.0f, 0.0f, 0.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMNegIdentityR1 = { { { 0.0f, -1.0f, 0.0f, 0.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMNegIdentityR2 = { { { 0.0f, 0.0f, -1.0f, 0.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMNegIdentityR3 = { { { 0.0f, 0.0f, 0.0f, -1.0f } } 
}; + XMGLOBALCONST XMVECTORU32 g_XMNegativeZero = { { { 0x80000000, 0x80000000, 0x80000000, 0x80000000 } } }; + XMGLOBALCONST XMVECTORU32 g_XMNegate3 = { { { 0x80000000, 0x80000000, 0x80000000, 0x00000000 } } }; + XMGLOBALCONST XMVECTORU32 g_XMMaskXY = { { { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 } } }; + XMGLOBALCONST XMVECTORU32 g_XMMask3 = { { { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 } } }; + XMGLOBALCONST XMVECTORU32 g_XMMaskX = { { { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 } } }; + XMGLOBALCONST XMVECTORU32 g_XMMaskY = { { { 0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000 } } }; + XMGLOBALCONST XMVECTORU32 g_XMMaskZ = { { { 0x00000000, 0x00000000, 0xFFFFFFFF, 0x00000000 } } }; + XMGLOBALCONST XMVECTORU32 g_XMMaskW = { { { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF } } }; + XMGLOBALCONST XMVECTORF32 g_XMOne = { { { 1.0f, 1.0f, 1.0f, 1.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMOne3 = { { { 1.0f, 1.0f, 1.0f, 0.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMZero = { { { 0.0f, 0.0f, 0.0f, 0.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMTwo = { { { 2.f, 2.f, 2.f, 2.f } } }; + XMGLOBALCONST XMVECTORF32 g_XMFour = { { { 4.f, 4.f, 4.f, 4.f } } }; + XMGLOBALCONST XMVECTORF32 g_XMSix = { { { 6.f, 6.f, 6.f, 6.f } } }; + XMGLOBALCONST XMVECTORF32 g_XMNegativeOne = { { { -1.0f, -1.0f, -1.0f, -1.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMOneHalf = { { { 0.5f, 0.5f, 0.5f, 0.5f } } }; + XMGLOBALCONST XMVECTORF32 g_XMNegativeOneHalf = { { { -0.5f, -0.5f, -0.5f, -0.5f } } }; + XMGLOBALCONST XMVECTORF32 g_XMNegativeTwoPi = { { { -XM_2PI, -XM_2PI, -XM_2PI, -XM_2PI } } }; + XMGLOBALCONST XMVECTORF32 g_XMNegativePi = { { { -XM_PI, -XM_PI, -XM_PI, -XM_PI } } }; + XMGLOBALCONST XMVECTORF32 g_XMHalfPi = { { { XM_PIDIV2, XM_PIDIV2, XM_PIDIV2, XM_PIDIV2 } } }; + XMGLOBALCONST XMVECTORF32 g_XMPi = { { { XM_PI, XM_PI, XM_PI, XM_PI } } }; + XMGLOBALCONST XMVECTORF32 g_XMReciprocalPi = { { { XM_1DIVPI, XM_1DIVPI, XM_1DIVPI, XM_1DIVPI } } }; + XMGLOBALCONST XMVECTORF32 g_XMTwoPi = { { { XM_2PI, XM_2PI, XM_2PI, XM_2PI } } }; + XMGLOBALCONST XMVECTORF32 g_XMReciprocalTwoPi = { { { XM_1DIV2PI, XM_1DIV2PI, XM_1DIV2PI, XM_1DIV2PI } } }; + XMGLOBALCONST XMVECTORF32 g_XMEpsilon = { { { 1.192092896e-7f, 1.192092896e-7f, 1.192092896e-7f, 1.192092896e-7f } } }; + XMGLOBALCONST XMVECTORI32 g_XMInfinity = { { { 0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000 } } }; + XMGLOBALCONST XMVECTORI32 g_XMQNaN = { { { 0x7FC00000, 0x7FC00000, 0x7FC00000, 0x7FC00000 } } }; + XMGLOBALCONST XMVECTORI32 g_XMQNaNTest = { { { 0x007FFFFF, 0x007FFFFF, 0x007FFFFF, 0x007FFFFF } } }; + XMGLOBALCONST XMVECTORI32 g_XMAbsMask = { { { 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF } } }; + XMGLOBALCONST XMVECTORI32 g_XMFltMin = { { { 0x00800000, 0x00800000, 0x00800000, 0x00800000 } } }; + XMGLOBALCONST XMVECTORI32 g_XMFltMax = { { { 0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF } } }; + XMGLOBALCONST XMVECTORU32 g_XMNegOneMask = { { { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF } } }; + XMGLOBALCONST XMVECTORU32 g_XMMaskA8R8G8B8 = { { { 0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000 } } }; + XMGLOBALCONST XMVECTORU32 g_XMFlipA8R8G8B8 = { { { 0x00000000, 0x00000000, 0x00000000, 0x80000000 } } }; + XMGLOBALCONST XMVECTORF32 g_XMFixAA8R8G8B8 = { { { 0.0f, 0.0f, 0.0f, float(0x80000000U) } } }; + XMGLOBALCONST XMVECTORF32 g_XMNormalizeA8R8G8B8 = { { { 1.0f / (255.0f * float(0x10000)), 1.0f / (255.0f * float(0x100)), 1.0f / 255.0f, 1.0f / (255.0f * float(0x1000000)) } } }; + XMGLOBALCONST XMVECTORU32 g_XMMaskA2B10G10R10 = { { { 0x000003FF, 
0x000FFC00, 0x3FF00000, 0xC0000000 } } }; + XMGLOBALCONST XMVECTORU32 g_XMFlipA2B10G10R10 = { { { 0x00000200, 0x00080000, 0x20000000, 0x80000000 } } }; + XMGLOBALCONST XMVECTORF32 g_XMFixAA2B10G10R10 = { { { -512.0f, -512.0f * float(0x400), -512.0f * float(0x100000), float(0x80000000U) } } }; + XMGLOBALCONST XMVECTORF32 g_XMNormalizeA2B10G10R10 = { { { 1.0f / 511.0f, 1.0f / (511.0f * float(0x400)), 1.0f / (511.0f * float(0x100000)), 1.0f / (3.0f * float(0x40000000)) } } }; + XMGLOBALCONST XMVECTORU32 g_XMMaskX16Y16 = { { { 0x0000FFFF, 0xFFFF0000, 0x00000000, 0x00000000 } } }; + XMGLOBALCONST XMVECTORI32 g_XMFlipX16Y16 = { { { 0x00008000, 0x00000000, 0x00000000, 0x00000000 } } }; + XMGLOBALCONST XMVECTORF32 g_XMFixX16Y16 = { { { -32768.0f, 0.0f, 0.0f, 0.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMNormalizeX16Y16 = { { { 1.0f / 32767.0f, 1.0f / (32767.0f * 65536.0f), 0.0f, 0.0f } } }; + XMGLOBALCONST XMVECTORU32 g_XMMaskX16Y16Z16W16 = { { { 0x0000FFFF, 0x0000FFFF, 0xFFFF0000, 0xFFFF0000 } } }; + XMGLOBALCONST XMVECTORI32 g_XMFlipX16Y16Z16W16 = { { { 0x00008000, 0x00008000, 0x00000000, 0x00000000 } } }; + XMGLOBALCONST XMVECTORF32 g_XMFixX16Y16Z16W16 = { { { -32768.0f, -32768.0f, 0.0f, 0.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMNormalizeX16Y16Z16W16 = { { { 1.0f / 32767.0f, 1.0f / 32767.0f, 1.0f / (32767.0f * 65536.0f), 1.0f / (32767.0f * 65536.0f) } } }; + XMGLOBALCONST XMVECTORF32 g_XMNoFraction = { { { 8388608.0f, 8388608.0f, 8388608.0f, 8388608.0f } } }; + XMGLOBALCONST XMVECTORI32 g_XMMaskByte = { { { 0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF } } }; + XMGLOBALCONST XMVECTORF32 g_XMNegateX = { { { -1.0f, 1.0f, 1.0f, 1.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMNegateY = { { { 1.0f, -1.0f, 1.0f, 1.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMNegateZ = { { { 1.0f, 1.0f, -1.0f, 1.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMNegateW = { { { 1.0f, 1.0f, 1.0f, -1.0f } } }; + XMGLOBALCONST XMVECTORU32 g_XMSelect0101 = { { { XM_SELECT_0, XM_SELECT_1, XM_SELECT_0, XM_SELECT_1 } } }; + XMGLOBALCONST XMVECTORU32 g_XMSelect1010 = { { { XM_SELECT_1, XM_SELECT_0, XM_SELECT_1, XM_SELECT_0 } } }; + XMGLOBALCONST XMVECTORI32 g_XMOneHalfMinusEpsilon = { { { 0x3EFFFFFD, 0x3EFFFFFD, 0x3EFFFFFD, 0x3EFFFFFD } } }; + XMGLOBALCONST XMVECTORU32 g_XMSelect1000 = { { { XM_SELECT_1, XM_SELECT_0, XM_SELECT_0, XM_SELECT_0 } } }; + XMGLOBALCONST XMVECTORU32 g_XMSelect1100 = { { { XM_SELECT_1, XM_SELECT_1, XM_SELECT_0, XM_SELECT_0 } } }; + XMGLOBALCONST XMVECTORU32 g_XMSelect1110 = { { { XM_SELECT_1, XM_SELECT_1, XM_SELECT_1, XM_SELECT_0 } } }; + XMGLOBALCONST XMVECTORU32 g_XMSelect1011 = { { { XM_SELECT_1, XM_SELECT_0, XM_SELECT_1, XM_SELECT_1 } } }; + XMGLOBALCONST XMVECTORF32 g_XMFixupY16 = { { { 1.0f, 1.0f / 65536.0f, 0.0f, 0.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMFixupY16W16 = { { { 1.0f, 1.0f, 1.0f / 65536.0f, 1.0f / 65536.0f } } }; + XMGLOBALCONST XMVECTORU32 g_XMFlipY = { { { 0, 0x80000000, 0, 0 } } }; + XMGLOBALCONST XMVECTORU32 g_XMFlipZ = { { { 0, 0, 0x80000000, 0 } } }; + XMGLOBALCONST XMVECTORU32 g_XMFlipW = { { { 0, 0, 0, 0x80000000 } } }; + XMGLOBALCONST XMVECTORU32 g_XMFlipYZ = { { { 0, 0x80000000, 0x80000000, 0 } } }; + XMGLOBALCONST XMVECTORU32 g_XMFlipZW = { { { 0, 0, 0x80000000, 0x80000000 } } }; + XMGLOBALCONST XMVECTORU32 g_XMFlipYW = { { { 0, 0x80000000, 0, 0x80000000 } } }; + XMGLOBALCONST XMVECTORI32 g_XMMaskDec4 = { { { 0x3FF, 0x3FF << 10, 0x3FF << 20, static_cast(0xC0000000) } } }; + XMGLOBALCONST XMVECTORI32 g_XMXorDec4 = { { { 0x200, 0x200 << 10, 0x200 << 20, 0 } } }; + XMGLOBALCONST 
XMVECTORF32 g_XMAddUDec4 = { { { 0, 0, 0, 32768.0f * 65536.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMAddDec4 = { { { -512.0f, -512.0f * 1024.0f, -512.0f * 1024.0f * 1024.0f, 0 } } }; + XMGLOBALCONST XMVECTORF32 g_XMMulDec4 = { { { 1.0f, 1.0f / 1024.0f, 1.0f / (1024.0f * 1024.0f), 1.0f / (1024.0f * 1024.0f * 1024.0f) } } }; + XMGLOBALCONST XMVECTORU32 g_XMMaskByte4 = { { { 0xFF, 0xFF00, 0xFF0000, 0xFF000000 } } }; + XMGLOBALCONST XMVECTORI32 g_XMXorByte4 = { { { 0x80, 0x8000, 0x800000, 0x00000000 } } }; + XMGLOBALCONST XMVECTORF32 g_XMAddByte4 = { { { -128.0f, -128.0f * 256.0f, -128.0f * 65536.0f, 0 } } }; + XMGLOBALCONST XMVECTORF32 g_XMFixUnsigned = { { { 32768.0f * 65536.0f, 32768.0f * 65536.0f, 32768.0f * 65536.0f, 32768.0f * 65536.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMMaxInt = { { { 65536.0f * 32768.0f - 128.0f, 65536.0f * 32768.0f - 128.0f, 65536.0f * 32768.0f - 128.0f, 65536.0f * 32768.0f - 128.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMMaxUInt = { { { 65536.0f * 65536.0f - 256.0f, 65536.0f * 65536.0f - 256.0f, 65536.0f * 65536.0f - 256.0f, 65536.0f * 65536.0f - 256.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMUnsignedFix = { { { 32768.0f * 65536.0f, 32768.0f * 65536.0f, 32768.0f * 65536.0f, 32768.0f * 65536.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMsrgbScale = { { { 12.92f, 12.92f, 12.92f, 1.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMsrgbA = { { { 0.055f, 0.055f, 0.055f, 0.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMsrgbA1 = { { { 1.055f, 1.055f, 1.055f, 1.0f } } }; + XMGLOBALCONST XMVECTORI32 g_XMExponentBias = { { { 127, 127, 127, 127 } } }; + XMGLOBALCONST XMVECTORI32 g_XMSubnormalExponent = { { { -126, -126, -126, -126 } } }; + XMGLOBALCONST XMVECTORI32 g_XMNumTrailing = { { { 23, 23, 23, 23 } } }; + XMGLOBALCONST XMVECTORI32 g_XMMinNormal = { { { 0x00800000, 0x00800000, 0x00800000, 0x00800000 } } }; + XMGLOBALCONST XMVECTORU32 g_XMNegInfinity = { { { 0xFF800000, 0xFF800000, 0xFF800000, 0xFF800000 } } }; + XMGLOBALCONST XMVECTORU32 g_XMNegQNaN = { { { 0xFFC00000, 0xFFC00000, 0xFFC00000, 0xFFC00000 } } }; + XMGLOBALCONST XMVECTORI32 g_XMBin128 = { { { 0x43000000, 0x43000000, 0x43000000, 0x43000000 } } }; + XMGLOBALCONST XMVECTORU32 g_XMBinNeg150 = { { { 0xC3160000, 0xC3160000, 0xC3160000, 0xC3160000 } } }; + XMGLOBALCONST XMVECTORI32 g_XM253 = { { { 253, 253, 253, 253 } } }; + XMGLOBALCONST XMVECTORF32 g_XMExpEst1 = { { { -6.93147182e-1f, -6.93147182e-1f, -6.93147182e-1f, -6.93147182e-1f } } }; + XMGLOBALCONST XMVECTORF32 g_XMExpEst2 = { { { +2.40226462e-1f, +2.40226462e-1f, +2.40226462e-1f, +2.40226462e-1f } } }; + XMGLOBALCONST XMVECTORF32 g_XMExpEst3 = { { { -5.55036440e-2f, -5.55036440e-2f, -5.55036440e-2f, -5.55036440e-2f } } }; + XMGLOBALCONST XMVECTORF32 g_XMExpEst4 = { { { +9.61597636e-3f, +9.61597636e-3f, +9.61597636e-3f, +9.61597636e-3f } } }; + XMGLOBALCONST XMVECTORF32 g_XMExpEst5 = { { { -1.32823968e-3f, -1.32823968e-3f, -1.32823968e-3f, -1.32823968e-3f } } }; + XMGLOBALCONST XMVECTORF32 g_XMExpEst6 = { { { +1.47491097e-4f, +1.47491097e-4f, +1.47491097e-4f, +1.47491097e-4f } } }; + XMGLOBALCONST XMVECTORF32 g_XMExpEst7 = { { { -1.08635004e-5f, -1.08635004e-5f, -1.08635004e-5f, -1.08635004e-5f } } }; + XMGLOBALCONST XMVECTORF32 g_XMLogEst0 = { { { +1.442693f, +1.442693f, +1.442693f, +1.442693f } } }; + XMGLOBALCONST XMVECTORF32 g_XMLogEst1 = { { { -0.721242f, -0.721242f, -0.721242f, -0.721242f } } }; + XMGLOBALCONST XMVECTORF32 g_XMLogEst2 = { { { +0.479384f, +0.479384f, +0.479384f, +0.479384f } } }; + XMGLOBALCONST XMVECTORF32 g_XMLogEst3 = { { { -0.350295f, -0.350295f, 
-0.350295f, -0.350295f } } }; + XMGLOBALCONST XMVECTORF32 g_XMLogEst4 = { { { +0.248590f, +0.248590f, +0.248590f, +0.248590f } } }; + XMGLOBALCONST XMVECTORF32 g_XMLogEst5 = { { { -0.145700f, -0.145700f, -0.145700f, -0.145700f } } }; + XMGLOBALCONST XMVECTORF32 g_XMLogEst6 = { { { +0.057148f, +0.057148f, +0.057148f, +0.057148f } } }; + XMGLOBALCONST XMVECTORF32 g_XMLogEst7 = { { { -0.010578f, -0.010578f, -0.010578f, -0.010578f } } }; + XMGLOBALCONST XMVECTORF32 g_XMLgE = { { { +1.442695f, +1.442695f, +1.442695f, +1.442695f } } }; + XMGLOBALCONST XMVECTORF32 g_XMInvLgE = { { { +6.93147182e-1f, +6.93147182e-1f, +6.93147182e-1f, +6.93147182e-1f } } }; + XMGLOBALCONST XMVECTORF32 g_XMLg10 = { { { +3.321928f, +3.321928f, +3.321928f, +3.321928f } } }; + XMGLOBALCONST XMVECTORF32 g_XMInvLg10 = { { { +3.010299956e-1f, +3.010299956e-1f, +3.010299956e-1f, +3.010299956e-1f } } }; + XMGLOBALCONST XMVECTORF32 g_UByteMax = { { { 255.0f, 255.0f, 255.0f, 255.0f } } }; + XMGLOBALCONST XMVECTORF32 g_ByteMin = { { { -127.0f, -127.0f, -127.0f, -127.0f } } }; + XMGLOBALCONST XMVECTORF32 g_ByteMax = { { { 127.0f, 127.0f, 127.0f, 127.0f } } }; + XMGLOBALCONST XMVECTORF32 g_ShortMin = { { { -32767.0f, -32767.0f, -32767.0f, -32767.0f } } }; + XMGLOBALCONST XMVECTORF32 g_ShortMax = { { { 32767.0f, 32767.0f, 32767.0f, 32767.0f } } }; + XMGLOBALCONST XMVECTORF32 g_UShortMax = { { { 65535.0f, 65535.0f, 65535.0f, 65535.0f } } }; + + /**************************************************************************** + * + * Implementation + * + ****************************************************************************/ + +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable:4068 4214 4204 4365 4616 4640 6001 6101) + // C4068/4616: ignore unknown pragmas + // C4214/4204: nonstandard extension used + // C4365/4640: Off by default noise + // C6001/6101: False positives +#endif + +#ifdef _PREFAST_ +#pragma prefast(push) +#pragma prefast(disable : 25000, "FXMVECTOR is 16 bytes") +#pragma prefast(disable : 26495, "Union initialization confuses /analyze") +#endif + +#ifdef __clang__ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wundefined-reinterpret-cast" +#endif + +//------------------------------------------------------------------------------ + + inline XMVECTOR XM_CALLCONV XMVectorSetBinaryConstant(uint32_t C0, uint32_t C1, uint32_t C2, uint32_t C3) noexcept + { +#if defined(_XM_NO_INTRINSICS_) + XMVECTORU32 vResult; + vResult.u[0] = (0 - (C0 & 1)) & 0x3F800000; + vResult.u[1] = (0 - (C1 & 1)) & 0x3F800000; + vResult.u[2] = (0 - (C2 & 1)) & 0x3F800000; + vResult.u[3] = (0 - (C3 & 1)) & 0x3F800000; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTORU32 vResult; + vResult.u[0] = (0 - (C0 & 1)) & 0x3F800000; + vResult.u[1] = (0 - (C1 & 1)) & 0x3F800000; + vResult.u[2] = (0 - (C2 & 1)) & 0x3F800000; + vResult.u[3] = (0 - (C3 & 1)) & 0x3F800000; + return vResult.v; +#else // XM_SSE_INTRINSICS_ + static const XMVECTORU32 g_vMask1 = { { { 1, 1, 1, 1 } } }; + // Move the parms to a vector + __m128i vTemp = _mm_set_epi32(static_cast(C3), static_cast(C2), static_cast(C1), static_cast(C0)); + // Mask off the low bits + vTemp = _mm_and_si128(vTemp, g_vMask1); + // 0xFFFFFFFF on true bits + vTemp = _mm_cmpeq_epi32(vTemp, g_vMask1); + // 0xFFFFFFFF -> 1.0f, 0x00000000 -> 0.0f + vTemp = _mm_and_si128(vTemp, g_XMOne); + return _mm_castsi128_ps(vTemp); +#endif + } + + //------------------------------------------------------------------------------ + + inline XMVECTOR XM_CALLCONV 
XMVectorSplatConstant(int32_t IntConstant, uint32_t DivExponent) noexcept
+    {
+        assert(IntConstant >= -16 && IntConstant <= 15);
+        assert(DivExponent < 32);
+#if defined(_XM_NO_INTRINSICS_)
+
+        using DirectX::XMConvertVectorIntToFloat;
+
+        XMVECTORI32 V = { { { IntConstant, IntConstant, IntConstant, IntConstant } } };
+        return XMConvertVectorIntToFloat(V.v, DivExponent);
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+        // Splat the int
+        int32x4_t vScale = vdupq_n_s32(IntConstant);
+        // Convert to a float
+        XMVECTOR vResult = vcvtq_f32_s32(vScale);
+        // Convert DivExponent into 1.0f/(1<<DivExponent)
+        uint32_t uScale = 0x3F800000U - (DivExponent << 23);
+        // Splat the scalar value (It's really a float)
+        vScale = vreinterpretq_s32_u32(vdupq_n_u32(uScale));
+        // Multiply by the reciprocal (Perform a right shift by DivExponent)
+        vResult = vmulq_f32(vResult, reinterpret_cast<const float32x4_t*>(&vScale)[0]);
+        return vResult;
+#else // XM_SSE_INTRINSICS_
+        // Splat the int
+        __m128i vScale = _mm_set1_epi32(IntConstant);
+        // Convert to a float
+        XMVECTOR vResult = _mm_cvtepi32_ps(vScale);
+        // Convert DivExponent into 1.0f/(1<<DivExponent)
+        uint32_t uScale = 0x3F800000U - (DivExponent << 23);
+        // Splat the scalar value (It's really a float)
+        vScale = _mm_set1_epi32(static_cast<int>(uScale));
+        // Multiply by the reciprocal (Perform a right shift by DivExponent)
+        vResult = _mm_mul_ps(vResult, _mm_castsi128_ps(vScale));
+        return vResult;
+#endif
+    }
+
+    //------------------------------------------------------------------------------
+
+    inline XMVECTOR XM_CALLCONV XMVectorSplatConstantInt(int32_t IntConstant) noexcept
+    {
+        assert(IntConstant >= -16 && IntConstant <= 15);
+#if defined(_XM_NO_INTRINSICS_)
+
+        XMVECTORI32 V = { { { IntConstant, IntConstant, IntConstant, IntConstant } } };
+        return V.v;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+        int32x4_t V = vdupq_n_s32(IntConstant);
+        return reinterpret_cast<float32x4_t*>(&V)[0];
+#else // XM_SSE_INTRINSICS_
+        __m128i V = _mm_set1_epi32(IntConstant);
+        return _mm_castsi128_ps(V);
+#endif
+    }
+
+#include "DirectXMathConvert.inl"
+#include "DirectXMathVector.inl"
+#include "DirectXMathMatrix.inl"
+#include "DirectXMathMisc.inl"
+
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+#ifdef _PREFAST_
+#pragma prefast(pop)
+#endif
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+} // namespace DirectX
+
diff --git a/src/thirdparty/DirectXMath-dec2022/Inc/DirectXMathConvert.inl b/src/thirdparty/DirectXMath-dec2022/Inc/DirectXMathConvert.inl
new file mode 100644
index 000000000..3ca86d5ff
--- /dev/null
+++ b/src/thirdparty/DirectXMath-dec2022/Inc/DirectXMathConvert.inl
@@ -0,0 +1,2191 @@
+//-------------------------------------------------------------------------------------
+// DirectXMathConvert.inl -- SIMD C++ Math library
+//
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
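The SSE paths above avoid a divide when they need 1.0f/(1 << DivExponent): 0x3F800000 is the bit pattern of 1.0f, and subtracting (DivExponent << 23) lowers the biased exponent field by DivExponent, halving the value that many times. A minimal scalar sketch of that trick, standalone and not part of the vendored file (the helper name ReciprocalPow2 is illustrative only):

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <cstring>

// Scalar equivalent of the exponent-bias trick used by XMVectorSplatConstant /
// XMConvertVectorIntToFloat: build 1.0f / (1 << DivExponent) without a division.
static float ReciprocalPow2(uint32_t DivExponent)
{
    assert(DivExponent < 32);
    uint32_t uScale = 0x3F800000U - (DivExponent << 23); // 1.0f with its exponent lowered by DivExponent
    float fScale;
    std::memcpy(&fScale, &uScale, sizeof(fScale));        // reinterpret the bits as a float
    return fScale;
}

int main()
{
    // Mirrors one lane of XMVectorSplatConstant(8, 3): 8 / 2^3 == 1.0
    std::printf("%f\n", 8.0f * ReciprocalPow2(3)); // prints 1.000000
    std::printf("%f\n", ReciprocalPow2(4));        // prints 0.062500
    return 0;
}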
+//
+// http://go.microsoft.com/fwlink/?LinkID=615560
+//-------------------------------------------------------------------------------------
+
+#pragma once
+
+/****************************************************************************
+ *
+ * Data conversion
+ *
+ ****************************************************************************/
+
+ //------------------------------------------------------------------------------
+
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable:4701)
+// C4701: false positives
+#endif
+
+inline XMVECTOR XM_CALLCONV XMConvertVectorIntToFloat
+(
+    FXMVECTOR VInt,
+    uint32_t DivExponent
+) noexcept
+{
+    assert(DivExponent < 32);
+#if defined(_XM_NO_INTRINSICS_)
+    float fScale = 1.0f / static_cast<float>(1U << DivExponent);
+    uint32_t ElementIndex = 0;
+    XMVECTOR Result;
+    do {
+        auto iTemp = static_cast<int32_t>(VInt.vector4_u32[ElementIndex]);
+        Result.vector4_f32[ElementIndex] = static_cast<float>(iTemp)* fScale;
+    } while (++ElementIndex < 4);
+    return Result;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float fScale = 1.0f / static_cast<float>(1U << DivExponent);
+    float32x4_t vResult = vcvtq_f32_s32(vreinterpretq_s32_f32(VInt));
+    return vmulq_n_f32(vResult, fScale);
+#else // _XM_SSE_INTRINSICS_
+    // Convert to floats
+    XMVECTOR vResult = _mm_cvtepi32_ps(_mm_castps_si128(VInt));
+    // Convert DivExponent into 1.0f/(1<<DivExponent)
+    uint32_t uScale = 0x3F800000U - (DivExponent << 23);
+    // Splat the scalar value
+    __m128i vScale = _mm_set1_epi32(static_cast<int>(uScale));
+    vResult = _mm_mul_ps(vResult, _mm_castsi128_ps(vScale));
+    return vResult;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMConvertVectorFloatToInt
+(
+    FXMVECTOR VFloat,
+    uint32_t MulExponent
+) noexcept
+{
+    assert(MulExponent < 32);
+#if defined(_XM_NO_INTRINSICS_)
+    // Get the scalar factor.
+    auto fScale = static_cast<float>(1U << MulExponent);
+    uint32_t ElementIndex = 0;
+    XMVECTOR Result;
+    do {
+        int32_t iResult;
+        float fTemp = VFloat.vector4_f32[ElementIndex] * fScale;
+        if (fTemp <= -(65536.0f * 32768.0f))
+        {
+            iResult = (-0x7FFFFFFF) - 1;
+        }
+        else if (fTemp > (65536.0f * 32768.0f) - 128.0f)
+        {
+            iResult = 0x7FFFFFFF;
+        }
+        else {
+            iResult = static_cast<int32_t>(fTemp);
+        }
+        Result.vector4_u32[ElementIndex] = static_cast<uint32_t>(iResult);
+    } while (++ElementIndex < 4);
+    return Result;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t vResult = vmulq_n_f32(VFloat, static_cast<float>(1U << MulExponent));
+    // In case of positive overflow, detect it
+    uint32x4_t vOverflow = vcgtq_f32(vResult, g_XMMaxInt);
+    // Float to int conversion
+    int32x4_t vResulti = vcvtq_s32_f32(vResult);
+    // If there was positive overflow, set to 0x7FFFFFFF
+    vResult = vreinterpretq_f32_u32(vandq_u32(vOverflow, g_XMAbsMask));
+    vOverflow = vbicq_u32(vreinterpretq_u32_s32(vResulti), vOverflow);
+    vOverflow = vorrq_u32(vOverflow, vreinterpretq_u32_f32(vResult));
+    return vreinterpretq_f32_u32(vOverflow);
+#else // _XM_SSE_INTRINSICS_
+    XMVECTOR vResult = _mm_set_ps1(static_cast<float>(1U << MulExponent));
+    vResult = _mm_mul_ps(vResult, VFloat);
+    // In case of positive overflow, detect it
+    XMVECTOR vOverflow = _mm_cmpgt_ps(vResult, g_XMMaxInt);
+    // Float to int conversion
+    __m128i vResulti = _mm_cvttps_epi32(vResult);
+    // If there was positive overflow, set to 0x7FFFFFFF
+    vResult = _mm_and_ps(vOverflow, g_XMAbsMask);
+    vOverflow = _mm_andnot_ps(vOverflow, _mm_castsi128_ps(vResulti));
+    vOverflow = _mm_or_ps(vOverflow, vResult);
+    return vOverflow;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMConvertVectorUIntToFloat
+(
+    FXMVECTOR VUInt,
+    uint32_t DivExponent
+) noexcept
+{
+    assert(DivExponent < 32);
+#if defined(_XM_NO_INTRINSICS_)
+    float fScale = 1.0f / static_cast<float>(1U << DivExponent);
+    uint32_t ElementIndex = 0;
+    XMVECTOR Result;
+    do {
+        Result.vector4_f32[ElementIndex] = static_cast<float>(VUInt.vector4_u32[ElementIndex])* fScale;
+    } while (++ElementIndex < 4);
+    return Result;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float fScale = 1.0f / static_cast<float>(1U << DivExponent);
+    float32x4_t vResult = vcvtq_f32_u32(vreinterpretq_u32_f32(VUInt));
+    return vmulq_n_f32(vResult, fScale);
+#else // _XM_SSE_INTRINSICS_
+    // For the values that are higher than 0x7FFFFFFF, a fixup is needed
+    // Determine which ones need the fix.
+    XMVECTOR vMask = _mm_and_ps(VUInt, g_XMNegativeZero);
+    // Force all values positive
+    XMVECTOR vResult = _mm_xor_ps(VUInt, vMask);
+    // Convert to floats
+    vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult));
+    // Convert 0x80000000 -> 0xFFFFFFFF
+    __m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask), 31);
+    // For only the ones that are too big, add the fixup
+    vMask = _mm_and_ps(_mm_castsi128_ps(iMask), g_XMFixUnsigned);
+    vResult = _mm_add_ps(vResult, vMask);
+    // Convert DivExponent into 1.0f/(1<<DivExponent)
+    uint32_t uScale = 0x3F800000U - (DivExponent << 23);
+    // Splat the scalar value
+    iMask = _mm_set1_epi32(static_cast<int>(uScale));
+    vResult = _mm_mul_ps(vResult, _mm_castsi128_ps(iMask));
+    return vResult;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMConvertVectorFloatToUInt
+(
+    FXMVECTOR VFloat,
+    uint32_t MulExponent
+) noexcept
+{
+    assert(MulExponent < 32);
+#if defined(_XM_NO_INTRINSICS_)
+    // Get the scalar factor.
+    auto fScale = static_cast<float>(1U << MulExponent);
+    uint32_t ElementIndex = 0;
+    XMVECTOR Result;
+    do {
+        uint32_t uResult;
+        float fTemp = VFloat.vector4_f32[ElementIndex] * fScale;
+        if (fTemp <= 0.0f)
+        {
+            uResult = 0;
+        }
+        else if (fTemp >= (65536.0f * 65536.0f))
+        {
+            uResult = 0xFFFFFFFFU;
+        }
+        else {
+            uResult = static_cast<uint32_t>(fTemp);
+        }
+        Result.vector4_u32[ElementIndex] = uResult;
+    } while (++ElementIndex < 4);
+    return Result;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t vResult = vmulq_n_f32(VFloat, static_cast<float>(1U << MulExponent));
+    // In case of overflow, detect it
+    uint32x4_t vOverflow = vcgtq_f32(vResult, g_XMMaxUInt);
+    // Float to int conversion
+    uint32x4_t vResulti = vcvtq_u32_f32(vResult);
+    // If there was overflow, set to 0xFFFFFFFFU
+    vResult = vreinterpretq_f32_u32(vbicq_u32(vResulti, vOverflow));
+    vOverflow = vorrq_u32(vOverflow, vreinterpretq_u32_f32(vResult));
+    return vreinterpretq_f32_u32(vOverflow);
+#else // _XM_SSE_INTRINSICS_
+    XMVECTOR vResult = _mm_set_ps1(static_cast<float>(1U << MulExponent));
+    vResult = _mm_mul_ps(vResult, VFloat);
+    // Clamp to >=0
+    vResult = _mm_max_ps(vResult, g_XMZero);
+    // Any numbers that are too big, set to 0xFFFFFFFFU
+    XMVECTOR vOverflow = _mm_cmpgt_ps(vResult, g_XMMaxUInt);
+    XMVECTOR vValue = g_XMUnsignedFix;
+    // Too large for a signed integer?
+ XMVECTOR vMask = _mm_cmpge_ps(vResult, vValue); + // Zero for number's lower than 0x80000000, 32768.0f*65536.0f otherwise + vValue = _mm_and_ps(vValue, vMask); + // Perform fixup only on numbers too large (Keeps low bit precision) + vResult = _mm_sub_ps(vResult, vValue); + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Convert from signed to unsigned pnly if greater than 0x80000000 + vMask = _mm_and_ps(vMask, g_XMNegativeZero); + vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti), vMask); + // On those that are too large, set to 0xFFFFFFFF + vResult = _mm_or_ps(vResult, vOverflow); + return vResult; +#endif +} + +#ifdef _MSC_VER +#pragma warning(pop) +#endif + +/**************************************************************************** + * + * Vector and matrix load operations + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadInt(const uint32_t* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_u32[0] = *pSource; + V.vector4_u32[1] = 0; + V.vector4_u32[2] = 0; + V.vector4_u32[3] = 0; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t zero = vdupq_n_u32(0); + return vreinterpretq_f32_u32(vld1q_lane_u32(pSource, zero, 0)); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_load_ss(reinterpret_cast(pSource)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadFloat(const float* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_f32[0] = *pSource; + V.vector4_f32[1] = 0.f; + V.vector4_f32[2] = 0.f; + V.vector4_f32[3] = 0.f; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t zero = vdupq_n_f32(0); + return vld1q_lane_f32(pSource, zero, 0); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_load_ss(pSource); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadInt2(const uint32_t* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_u32[0] = pSource[0]; + V.vector4_u32[1] = pSource[1]; + V.vector4_u32[2] = 0; + V.vector4_u32[3] = 0; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t x = vld1_u32(pSource); + uint32x2_t zero = vdup_n_u32(0); + return vreinterpretq_f32_u32(vcombine_u32(x, zero)); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_castpd_ps(_mm_load_sd(reinterpret_cast(pSource))); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadInt2A(const uint32_t* pSource) noexcept +{ + assert(pSource); + assert((reinterpret_cast(pSource) & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_u32[0] = pSource[0]; + V.vector4_u32[1] = pSource[1]; + V.vector4_u32[2] = 0; + V.vector4_u32[3] = 0; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) +#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES) + uint32x2_t x = vld1_u32_ex(pSource, 64); +#else + uint32x2_t x = vld1_u32(pSource); +#endif + uint32x2_t zero = vdup_n_u32(0); + return vreinterpretq_f32_u32(vcombine_u32(x, zero)); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_castpd_ps(_mm_load_sd(reinterpret_cast(pSource))); +#endif 
+} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadFloat2(const XMFLOAT2* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_f32[0] = pSource->x; + V.vector4_f32[1] = pSource->y; + V.vector4_f32[2] = 0.f; + V.vector4_f32[3] = 0.f; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t x = vld1_f32(reinterpret_cast(pSource)); + float32x2_t zero = vdup_n_f32(0); + return vcombine_f32(x, zero); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_castpd_ps(_mm_load_sd(reinterpret_cast(pSource))); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadFloat2A(const XMFLOAT2A* pSource) noexcept +{ + assert(pSource); + assert((reinterpret_cast(pSource) & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_f32[0] = pSource->x; + V.vector4_f32[1] = pSource->y; + V.vector4_f32[2] = 0.f; + V.vector4_f32[3] = 0.f; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) +#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES) + float32x2_t x = vld1_f32_ex(reinterpret_cast(pSource), 64); +#else + float32x2_t x = vld1_f32(reinterpret_cast(pSource)); +#endif + float32x2_t zero = vdup_n_f32(0); + return vcombine_f32(x, zero); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_castpd_ps(_mm_load_sd(reinterpret_cast(pSource))); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadSInt2(const XMINT2* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_f32[0] = static_cast(pSource->x); + V.vector4_f32[1] = static_cast(pSource->y); + V.vector4_f32[2] = 0.f; + V.vector4_f32[3] = 0.f; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + int32x2_t x = vld1_s32(reinterpret_cast(pSource)); + float32x2_t v = vcvt_f32_s32(x); + float32x2_t zero = vdup_n_f32(0); + return vcombine_f32(v, zero); +#elif defined(_XM_SSE_INTRINSICS_) + __m128 V = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(pSource))); + return _mm_cvtepi32_ps(_mm_castps_si128(V)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadUInt2(const XMUINT2* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_f32[0] = static_cast(pSource->x); + V.vector4_f32[1] = static_cast(pSource->y); + V.vector4_f32[2] = 0.f; + V.vector4_f32[3] = 0.f; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t x = vld1_u32(reinterpret_cast(pSource)); + float32x2_t v = vcvt_f32_u32(x); + float32x2_t zero = vdup_n_f32(0); + return vcombine_f32(v, zero); +#elif defined(_XM_SSE_INTRINSICS_) + __m128 V = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(pSource))); + // For the values that are higher than 0x7FFFFFFF, a fixup is needed + // Determine which ones need the fix. 
+ XMVECTOR vMask = _mm_and_ps(V, g_XMNegativeZero); + // Force all values positive + XMVECTOR vResult = _mm_xor_ps(V, vMask); + // Convert to floats + vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult)); + // Convert 0x80000000 -> 0xFFFFFFFF + __m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask), 31); + // For only the ones that are too big, add the fixup + vMask = _mm_and_ps(_mm_castsi128_ps(iMask), g_XMFixUnsigned); + vResult = _mm_add_ps(vResult, vMask); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadInt3(const uint32_t* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_u32[0] = pSource[0]; + V.vector4_u32[1] = pSource[1]; + V.vector4_u32[2] = pSource[2]; + V.vector4_u32[3] = 0; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t x = vld1_u32(pSource); + uint32x2_t zero = vdup_n_u32(0); + uint32x2_t y = vld1_lane_u32(pSource + 2, zero, 0); + return vreinterpretq_f32_u32(vcombine_u32(x, y)); +#elif defined(_XM_SSE4_INTRINSICS_) + __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(pSource))); + __m128 z = _mm_load_ss(reinterpret_cast(pSource + 2)); + return _mm_insert_ps(xy, z, 0x20); +#elif defined(_XM_SSE_INTRINSICS_) + __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(pSource))); + __m128 z = _mm_load_ss(reinterpret_cast(pSource + 2)); + return _mm_movelh_ps(xy, z); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadInt3A(const uint32_t* pSource) noexcept +{ + assert(pSource); + assert((reinterpret_cast(pSource) & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_u32[0] = pSource[0]; + V.vector4_u32[1] = pSource[1]; + V.vector4_u32[2] = pSource[2]; + V.vector4_u32[3] = 0; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Reads an extra integer which is zero'd +#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES) + uint32x4_t V = vld1q_u32_ex(pSource, 128); +#else + uint32x4_t V = vld1q_u32(pSource); +#endif + return vreinterpretq_f32_u32(vsetq_lane_u32(0, V, 3)); +#elif defined(_XM_SSE4_INTRINSICS_) + __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(pSource))); + __m128 z = _mm_load_ss(reinterpret_cast(pSource + 2)); + return _mm_insert_ps(xy, z, 0x20); +#elif defined(_XM_SSE_INTRINSICS_) + __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(pSource))); + __m128 z = _mm_load_ss(reinterpret_cast(pSource + 2)); + return _mm_movelh_ps(xy, z); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadFloat3(const XMFLOAT3* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_f32[0] = pSource->x; + V.vector4_f32[1] = pSource->y; + V.vector4_f32[2] = pSource->z; + V.vector4_f32[3] = 0.f; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t x = vld1_f32(reinterpret_cast(pSource)); + float32x2_t zero = vdup_n_f32(0); + float32x2_t y = vld1_lane_f32(reinterpret_cast(pSource) + 2, zero, 0); + return vcombine_f32(x, y); +#elif defined(_XM_SSE4_INTRINSICS_) + __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(pSource))); + __m128 z = _mm_load_ss(&pSource->z); + return _mm_insert_ps(xy, z, 0x20); +#elif defined(_XM_SSE_INTRINSICS_) + __m128 xy = 
_mm_castpd_ps(_mm_load_sd(reinterpret_cast(pSource))); + __m128 z = _mm_load_ss(&pSource->z); + return _mm_movelh_ps(xy, z); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadFloat3A(const XMFLOAT3A* pSource) noexcept +{ + assert(pSource); + assert((reinterpret_cast(pSource) & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_f32[0] = pSource->x; + V.vector4_f32[1] = pSource->y; + V.vector4_f32[2] = pSource->z; + V.vector4_f32[3] = 0.f; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Reads an extra float which is zero'd +#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES) + float32x4_t V = vld1q_f32_ex(reinterpret_cast(pSource), 128); +#else + float32x4_t V = vld1q_f32(reinterpret_cast(pSource)); +#endif + return vsetq_lane_f32(0, V, 3); +#elif defined(_XM_SSE_INTRINSICS_) + // Reads an extra float which is zero'd + __m128 V = _mm_load_ps(&pSource->x); + return _mm_and_ps(V, g_XMMask3); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadSInt3(const XMINT3* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V; + V.vector4_f32[0] = static_cast(pSource->x); + V.vector4_f32[1] = static_cast(pSource->y); + V.vector4_f32[2] = static_cast(pSource->z); + V.vector4_f32[3] = 0.f; + return V; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + int32x2_t x = vld1_s32(reinterpret_cast(pSource)); + int32x2_t zero = vdup_n_s32(0); + int32x2_t y = vld1_lane_s32(reinterpret_cast(pSource) + 2, zero, 0); + int32x4_t v = vcombine_s32(x, y); + return vcvtq_f32_s32(v); +#elif defined(_XM_SSE_INTRINSICS_) + __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(pSource))); + __m128 z = _mm_load_ss(reinterpret_cast(&pSource->z)); + __m128 V = _mm_movelh_ps(xy, z); + return _mm_cvtepi32_ps(_mm_castps_si128(V)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadUInt3(const XMUINT3* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_f32[0] = static_cast(pSource->x); + V.vector4_f32[1] = static_cast(pSource->y); + V.vector4_f32[2] = static_cast(pSource->z); + V.vector4_f32[3] = 0.f; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t x = vld1_u32(reinterpret_cast(pSource)); + uint32x2_t zero = vdup_n_u32(0); + uint32x2_t y = vld1_lane_u32(reinterpret_cast(pSource) + 2, zero, 0); + uint32x4_t v = vcombine_u32(x, y); + return vcvtq_f32_u32(v); +#elif defined(_XM_SSE_INTRINSICS_) + __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(pSource))); + __m128 z = _mm_load_ss(reinterpret_cast(&pSource->z)); + __m128 V = _mm_movelh_ps(xy, z); + // For the values that are higher than 0x7FFFFFFF, a fixup is needed + // Determine which ones need the fix. 
+ XMVECTOR vMask = _mm_and_ps(V, g_XMNegativeZero); + // Force all values positive + XMVECTOR vResult = _mm_xor_ps(V, vMask); + // Convert to floats + vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult)); + // Convert 0x80000000 -> 0xFFFFFFFF + __m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask), 31); + // For only the ones that are too big, add the fixup + vMask = _mm_and_ps(_mm_castsi128_ps(iMask), g_XMFixUnsigned); + vResult = _mm_add_ps(vResult, vMask); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadInt4(const uint32_t* pSource) noexcept +{ + assert(pSource); + +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_u32[0] = pSource[0]; + V.vector4_u32[1] = pSource[1]; + V.vector4_u32[2] = pSource[2]; + V.vector4_u32[3] = pSource[3]; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vreinterpretq_f32_u32(vld1q_u32(pSource)); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_loadu_si128(reinterpret_cast(pSource)); + return _mm_castsi128_ps(V); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadInt4A(const uint32_t* pSource) noexcept +{ + assert(pSource); + assert((reinterpret_cast(pSource) & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_u32[0] = pSource[0]; + V.vector4_u32[1] = pSource[1]; + V.vector4_u32[2] = pSource[2]; + V.vector4_u32[3] = pSource[3]; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) +#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES) + return vld1q_u32_ex(pSource, 128); +#else + return vreinterpretq_f32_u32(vld1q_u32(pSource)); +#endif +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_load_si128(reinterpret_cast(pSource)); + return _mm_castsi128_ps(V); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadFloat4(const XMFLOAT4* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_f32[0] = pSource->x; + V.vector4_f32[1] = pSource->y; + V.vector4_f32[2] = pSource->z; + V.vector4_f32[3] = pSource->w; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vld1q_f32(reinterpret_cast(pSource)); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_loadu_ps(&pSource->x); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadFloat4A(const XMFLOAT4A* pSource) noexcept +{ + assert(pSource); + assert((reinterpret_cast(pSource) & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_f32[0] = pSource->x; + V.vector4_f32[1] = pSource->y; + V.vector4_f32[2] = pSource->z; + V.vector4_f32[3] = pSource->w; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) +#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES) + return vld1q_f32_ex(reinterpret_cast(pSource), 128); +#else + return vld1q_f32(reinterpret_cast(pSource)); +#endif +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_load_ps(&pSource->x); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadSInt4(const XMINT4* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V; + V.vector4_f32[0] = 
static_cast(pSource->x); + V.vector4_f32[1] = static_cast(pSource->y); + V.vector4_f32[2] = static_cast(pSource->z); + V.vector4_f32[3] = static_cast(pSource->w); + return V; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + int32x4_t v = vld1q_s32(reinterpret_cast(pSource)); + return vcvtq_f32_s32(v); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_loadu_si128(reinterpret_cast(pSource)); + return _mm_cvtepi32_ps(V); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadUInt4(const XMUINT4* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_f32[0] = static_cast(pSource->x); + V.vector4_f32[1] = static_cast(pSource->y); + V.vector4_f32[2] = static_cast(pSource->z); + V.vector4_f32[3] = static_cast(pSource->w); + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t v = vld1q_u32(reinterpret_cast(pSource)); + return vcvtq_f32_u32(v); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_loadu_si128(reinterpret_cast(pSource)); + // For the values that are higher than 0x7FFFFFFF, a fixup is needed + // Determine which ones need the fix. + XMVECTOR vMask = _mm_and_ps(_mm_castsi128_ps(V), g_XMNegativeZero); + // Force all values positive + XMVECTOR vResult = _mm_xor_ps(_mm_castsi128_ps(V), vMask); + // Convert to floats + vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult)); + // Convert 0x80000000 -> 0xFFFFFFFF + __m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask), 31); + // For only the ones that are too big, add the fixup + vMask = _mm_and_ps(_mm_castsi128_ps(iMask), g_XMFixUnsigned); + vResult = _mm_add_ps(vResult, vMask); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMMATRIX XM_CALLCONV XMLoadFloat3x3(const XMFLOAT3X3* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.r[0].vector4_f32[0] = pSource->m[0][0]; + M.r[0].vector4_f32[1] = pSource->m[0][1]; + M.r[0].vector4_f32[2] = pSource->m[0][2]; + M.r[0].vector4_f32[3] = 0.0f; + + M.r[1].vector4_f32[0] = pSource->m[1][0]; + M.r[1].vector4_f32[1] = pSource->m[1][1]; + M.r[1].vector4_f32[2] = pSource->m[1][2]; + M.r[1].vector4_f32[3] = 0.0f; + + M.r[2].vector4_f32[0] = pSource->m[2][0]; + M.r[2].vector4_f32[1] = pSource->m[2][1]; + M.r[2].vector4_f32[2] = pSource->m[2][2]; + M.r[2].vector4_f32[3] = 0.0f; + M.r[3].vector4_f32[0] = 0.0f; + M.r[3].vector4_f32[1] = 0.0f; + M.r[3].vector4_f32[2] = 0.0f; + M.r[3].vector4_f32[3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t v0 = vld1q_f32(&pSource->m[0][0]); + float32x4_t v1 = vld1q_f32(&pSource->m[1][1]); + float32x2_t v2 = vcreate_f32(static_cast(*reinterpret_cast(&pSource->m[2][2]))); + float32x4_t T = vextq_f32(v0, v1, 3); + + XMMATRIX M; + M.r[0] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(v0), g_XMMask3)); + M.r[1] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T), g_XMMask3)); + M.r[2] = vcombine_f32(vget_high_f32(v1), v2); + M.r[3] = g_XMIdentityR3; + return M; +#elif defined(_XM_SSE_INTRINSICS_) + __m128 Z = _mm_setzero_ps(); + + __m128 V1 = _mm_loadu_ps(&pSource->m[0][0]); + __m128 V2 = _mm_loadu_ps(&pSource->m[1][1]); + __m128 V3 = _mm_load_ss(&pSource->m[2][2]); + + __m128 T1 = _mm_unpackhi_ps(V1, Z); + __m128 T2 = _mm_unpacklo_ps(V2, Z); + __m128 T3 = _mm_shuffle_ps(V3, T2, _MM_SHUFFLE(0, 1, 0, 0)); + __m128 T4 = _mm_movehl_ps(T2, T3); + 
__m128 T5 = _mm_movehl_ps(Z, T1); + + XMMATRIX M; + M.r[0] = _mm_movelh_ps(V1, T1); + M.r[1] = _mm_add_ps(T4, T5); + M.r[2] = _mm_shuffle_ps(V2, V3, _MM_SHUFFLE(1, 0, 3, 2)); + M.r[3] = g_XMIdentityR3; + return M; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMMATRIX XM_CALLCONV XMLoadFloat4x3(const XMFLOAT4X3* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.r[0].vector4_f32[0] = pSource->m[0][0]; + M.r[0].vector4_f32[1] = pSource->m[0][1]; + M.r[0].vector4_f32[2] = pSource->m[0][2]; + M.r[0].vector4_f32[3] = 0.0f; + + M.r[1].vector4_f32[0] = pSource->m[1][0]; + M.r[1].vector4_f32[1] = pSource->m[1][1]; + M.r[1].vector4_f32[2] = pSource->m[1][2]; + M.r[1].vector4_f32[3] = 0.0f; + + M.r[2].vector4_f32[0] = pSource->m[2][0]; + M.r[2].vector4_f32[1] = pSource->m[2][1]; + M.r[2].vector4_f32[2] = pSource->m[2][2]; + M.r[2].vector4_f32[3] = 0.0f; + + M.r[3].vector4_f32[0] = pSource->m[3][0]; + M.r[3].vector4_f32[1] = pSource->m[3][1]; + M.r[3].vector4_f32[2] = pSource->m[3][2]; + M.r[3].vector4_f32[3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t v0 = vld1q_f32(&pSource->m[0][0]); + float32x4_t v1 = vld1q_f32(&pSource->m[1][1]); + float32x4_t v2 = vld1q_f32(&pSource->m[2][2]); + + float32x4_t T1 = vextq_f32(v0, v1, 3); + float32x4_t T2 = vcombine_f32(vget_high_f32(v1), vget_low_f32(v2)); + float32x4_t T3 = vextq_f32(v2, v2, 1); + + XMMATRIX M; + M.r[0] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(v0), g_XMMask3)); + M.r[1] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T1), g_XMMask3)); + M.r[2] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T2), g_XMMask3)); + M.r[3] = vsetq_lane_f32(1.f, T3, 3); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + // Use unaligned load instructions to + // load the 12 floats + // vTemp1 = x1,y1,z1,x2 + XMVECTOR vTemp1 = _mm_loadu_ps(&pSource->m[0][0]); + // vTemp2 = y2,z2,x3,y3 + XMVECTOR vTemp2 = _mm_loadu_ps(&pSource->m[1][1]); + // vTemp4 = z3,x4,y4,z4 + XMVECTOR vTemp4 = _mm_loadu_ps(&pSource->m[2][2]); + // vTemp3 = x3,y3,z3,z3 + XMVECTOR vTemp3 = _mm_shuffle_ps(vTemp2, vTemp4, _MM_SHUFFLE(0, 0, 3, 2)); + // vTemp2 = y2,z2,x2,x2 + vTemp2 = _mm_shuffle_ps(vTemp2, vTemp1, _MM_SHUFFLE(3, 3, 1, 0)); + // vTemp2 = x2,y2,z2,z2 + vTemp2 = XM_PERMUTE_PS(vTemp2, _MM_SHUFFLE(1, 1, 0, 2)); + // vTemp1 = x1,y1,z1,0 + vTemp1 = _mm_and_ps(vTemp1, g_XMMask3); + // vTemp2 = x2,y2,z2,0 + vTemp2 = _mm_and_ps(vTemp2, g_XMMask3); + // vTemp3 = x3,y3,z3,0 + vTemp3 = _mm_and_ps(vTemp3, g_XMMask3); + // vTemp4i = x4,y4,z4,0 + __m128i vTemp4i = _mm_srli_si128(_mm_castps_si128(vTemp4), 32 / 8); + // vTemp4i = x4,y4,z4,1.0f + vTemp4i = _mm_or_si128(vTemp4i, g_XMIdentityR3); + XMMATRIX M(vTemp1, + vTemp2, + vTemp3, + _mm_castsi128_ps(vTemp4i)); + return M; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMMATRIX XM_CALLCONV XMLoadFloat4x3A(const XMFLOAT4X3A* pSource) noexcept +{ + assert(pSource); + assert((reinterpret_cast(pSource) & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.r[0].vector4_f32[0] = pSource->m[0][0]; + M.r[0].vector4_f32[1] = pSource->m[0][1]; + M.r[0].vector4_f32[2] = pSource->m[0][2]; + M.r[0].vector4_f32[3] = 0.0f; + + M.r[1].vector4_f32[0] = pSource->m[1][0]; + M.r[1].vector4_f32[1] = pSource->m[1][1]; + M.r[1].vector4_f32[2] = pSource->m[1][2]; + M.r[1].vector4_f32[3] = 0.0f; + + 
M.r[2].vector4_f32[0] = pSource->m[2][0]; + M.r[2].vector4_f32[1] = pSource->m[2][1]; + M.r[2].vector4_f32[2] = pSource->m[2][2]; + M.r[2].vector4_f32[3] = 0.0f; + + M.r[3].vector4_f32[0] = pSource->m[3][0]; + M.r[3].vector4_f32[1] = pSource->m[3][1]; + M.r[3].vector4_f32[2] = pSource->m[3][2]; + M.r[3].vector4_f32[3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) +#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES) + float32x4_t v0 = vld1q_f32_ex(&pSource->m[0][0], 128); + float32x4_t v1 = vld1q_f32_ex(&pSource->m[1][1], 128); + float32x4_t v2 = vld1q_f32_ex(&pSource->m[2][2], 128); +#else + float32x4_t v0 = vld1q_f32(&pSource->m[0][0]); + float32x4_t v1 = vld1q_f32(&pSource->m[1][1]); + float32x4_t v2 = vld1q_f32(&pSource->m[2][2]); +#endif + + float32x4_t T1 = vextq_f32(v0, v1, 3); + float32x4_t T2 = vcombine_f32(vget_high_f32(v1), vget_low_f32(v2)); + float32x4_t T3 = vextq_f32(v2, v2, 1); + + XMMATRIX M; + M.r[0] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(v0), g_XMMask3)); + M.r[1] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T1), g_XMMask3)); + M.r[2] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T2), g_XMMask3)); + M.r[3] = vsetq_lane_f32(1.f, T3, 3); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + // Use aligned load instructions to + // load the 12 floats + // vTemp1 = x1,y1,z1,x2 + XMVECTOR vTemp1 = _mm_load_ps(&pSource->m[0][0]); + // vTemp2 = y2,z2,x3,y3 + XMVECTOR vTemp2 = _mm_load_ps(&pSource->m[1][1]); + // vTemp4 = z3,x4,y4,z4 + XMVECTOR vTemp4 = _mm_load_ps(&pSource->m[2][2]); + // vTemp3 = x3,y3,z3,z3 + XMVECTOR vTemp3 = _mm_shuffle_ps(vTemp2, vTemp4, _MM_SHUFFLE(0, 0, 3, 2)); + // vTemp2 = y2,z2,x2,x2 + vTemp2 = _mm_shuffle_ps(vTemp2, vTemp1, _MM_SHUFFLE(3, 3, 1, 0)); + // vTemp2 = x2,y2,z2,z2 + vTemp2 = XM_PERMUTE_PS(vTemp2, _MM_SHUFFLE(1, 1, 0, 2)); + // vTemp1 = x1,y1,z1,0 + vTemp1 = _mm_and_ps(vTemp1, g_XMMask3); + // vTemp2 = x2,y2,z2,0 + vTemp2 = _mm_and_ps(vTemp2, g_XMMask3); + // vTemp3 = x3,y3,z3,0 + vTemp3 = _mm_and_ps(vTemp3, g_XMMask3); + // vTemp4i = x4,y4,z4,0 + __m128i vTemp4i = _mm_srli_si128(_mm_castps_si128(vTemp4), 32 / 8); + // vTemp4i = x4,y4,z4,1.0f + vTemp4i = _mm_or_si128(vTemp4i, g_XMIdentityR3); + XMMATRIX M(vTemp1, + vTemp2, + vTemp3, + _mm_castsi128_ps(vTemp4i)); + return M; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMMATRIX XM_CALLCONV XMLoadFloat3x4(const XMFLOAT3X4* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.r[0].vector4_f32[0] = pSource->m[0][0]; + M.r[0].vector4_f32[1] = pSource->m[1][0]; + M.r[0].vector4_f32[2] = pSource->m[2][0]; + M.r[0].vector4_f32[3] = 0.0f; + + M.r[1].vector4_f32[0] = pSource->m[0][1]; + M.r[1].vector4_f32[1] = pSource->m[1][1]; + M.r[1].vector4_f32[2] = pSource->m[2][1]; + M.r[1].vector4_f32[3] = 0.0f; + + M.r[2].vector4_f32[0] = pSource->m[0][2]; + M.r[2].vector4_f32[1] = pSource->m[1][2]; + M.r[2].vector4_f32[2] = pSource->m[2][2]; + M.r[2].vector4_f32[3] = 0.0f; + + M.r[3].vector4_f32[0] = pSource->m[0][3]; + M.r[3].vector4_f32[1] = pSource->m[1][3]; + M.r[3].vector4_f32[2] = pSource->m[2][3]; + M.r[3].vector4_f32[3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2x4_t vTemp0 = vld4_f32(&pSource->_11); + float32x4_t vTemp1 = vld1q_f32(&pSource->_31); + + float32x2_t l = vget_low_f32(vTemp1); + float32x4_t T0 = vcombine_f32(vTemp0.val[0], l); + float32x2_t rl = 
vrev64_f32(l); + float32x4_t T1 = vcombine_f32(vTemp0.val[1], rl); + + float32x2_t h = vget_high_f32(vTemp1); + float32x4_t T2 = vcombine_f32(vTemp0.val[2], h); + float32x2_t rh = vrev64_f32(h); + float32x4_t T3 = vcombine_f32(vTemp0.val[3], rh); + + XMMATRIX M = {}; + M.r[0] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T0), g_XMMask3)); + M.r[1] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T1), g_XMMask3)); + M.r[2] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T2), g_XMMask3)); + M.r[3] = vsetq_lane_f32(1.f, T3, 3); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + M.r[0] = _mm_loadu_ps(&pSource->_11); + M.r[1] = _mm_loadu_ps(&pSource->_21); + M.r[2] = _mm_loadu_ps(&pSource->_31); + M.r[3] = g_XMIdentityR3; + + // x.x,x.y,y.x,y.y + XMVECTOR vTemp1 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(1, 0, 1, 0)); + // x.z,x.w,y.z,y.w + XMVECTOR vTemp3 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(3, 2, 3, 2)); + // z.x,z.y,w.x,w.y + XMVECTOR vTemp2 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(1, 0, 1, 0)); + // z.z,z.w,w.z,w.w + XMVECTOR vTemp4 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(3, 2, 3, 2)); + XMMATRIX mResult; + + // x.x,y.x,z.x,w.x + mResult.r[0] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(2, 0, 2, 0)); + // x.y,y.y,z.y,w.y + mResult.r[1] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(3, 1, 3, 1)); + // x.z,y.z,z.z,w.z + mResult.r[2] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(2, 0, 2, 0)); + // x.w,y.w,z.w,w.w + mResult.r[3] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(3, 1, 3, 1)); + return mResult; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMMATRIX XM_CALLCONV XMLoadFloat3x4A(const XMFLOAT3X4A* pSource) noexcept +{ + assert(pSource); + assert((reinterpret_cast(pSource) & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.r[0].vector4_f32[0] = pSource->m[0][0]; + M.r[0].vector4_f32[1] = pSource->m[1][0]; + M.r[0].vector4_f32[2] = pSource->m[2][0]; + M.r[0].vector4_f32[3] = 0.0f; + + M.r[1].vector4_f32[0] = pSource->m[0][1]; + M.r[1].vector4_f32[1] = pSource->m[1][1]; + M.r[1].vector4_f32[2] = pSource->m[2][1]; + M.r[1].vector4_f32[3] = 0.0f; + + M.r[2].vector4_f32[0] = pSource->m[0][2]; + M.r[2].vector4_f32[1] = pSource->m[1][2]; + M.r[2].vector4_f32[2] = pSource->m[2][2]; + M.r[2].vector4_f32[3] = 0.0f; + + M.r[3].vector4_f32[0] = pSource->m[0][3]; + M.r[3].vector4_f32[1] = pSource->m[1][3]; + M.r[3].vector4_f32[2] = pSource->m[2][3]; + M.r[3].vector4_f32[3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) +#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES) + float32x2x4_t vTemp0 = vld4_f32_ex(&pSource->_11, 128); + float32x4_t vTemp1 = vld1q_f32_ex(&pSource->_31, 128); +#else + float32x2x4_t vTemp0 = vld4_f32(&pSource->_11); + float32x4_t vTemp1 = vld1q_f32(&pSource->_31); +#endif + + float32x2_t l = vget_low_f32(vTemp1); + float32x4_t T0 = vcombine_f32(vTemp0.val[0], l); + float32x2_t rl = vrev64_f32(l); + float32x4_t T1 = vcombine_f32(vTemp0.val[1], rl); + + float32x2_t h = vget_high_f32(vTemp1); + float32x4_t T2 = vcombine_f32(vTemp0.val[2], h); + float32x2_t rh = vrev64_f32(h); + float32x4_t T3 = vcombine_f32(vTemp0.val[3], rh); + + XMMATRIX M = {}; + M.r[0] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T0), g_XMMask3)); + M.r[1] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T1), g_XMMask3)); + M.r[2] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T2), 
g_XMMask3)); + M.r[3] = vsetq_lane_f32(1.f, T3, 3); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + M.r[0] = _mm_load_ps(&pSource->_11); + M.r[1] = _mm_load_ps(&pSource->_21); + M.r[2] = _mm_load_ps(&pSource->_31); + M.r[3] = g_XMIdentityR3; + + // x.x,x.y,y.x,y.y + XMVECTOR vTemp1 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(1, 0, 1, 0)); + // x.z,x.w,y.z,y.w + XMVECTOR vTemp3 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(3, 2, 3, 2)); + // z.x,z.y,w.x,w.y + XMVECTOR vTemp2 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(1, 0, 1, 0)); + // z.z,z.w,w.z,w.w + XMVECTOR vTemp4 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(3, 2, 3, 2)); + XMMATRIX mResult; + + // x.x,y.x,z.x,w.x + mResult.r[0] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(2, 0, 2, 0)); + // x.y,y.y,z.y,w.y + mResult.r[1] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(3, 1, 3, 1)); + // x.z,y.z,z.z,w.z + mResult.r[2] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(2, 0, 2, 0)); + // x.w,y.w,z.w,w.w + mResult.r[3] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(3, 1, 3, 1)); + return mResult; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMMATRIX XM_CALLCONV XMLoadFloat4x4(const XMFLOAT4X4* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.r[0].vector4_f32[0] = pSource->m[0][0]; + M.r[0].vector4_f32[1] = pSource->m[0][1]; + M.r[0].vector4_f32[2] = pSource->m[0][2]; + M.r[0].vector4_f32[3] = pSource->m[0][3]; + + M.r[1].vector4_f32[0] = pSource->m[1][0]; + M.r[1].vector4_f32[1] = pSource->m[1][1]; + M.r[1].vector4_f32[2] = pSource->m[1][2]; + M.r[1].vector4_f32[3] = pSource->m[1][3]; + + M.r[2].vector4_f32[0] = pSource->m[2][0]; + M.r[2].vector4_f32[1] = pSource->m[2][1]; + M.r[2].vector4_f32[2] = pSource->m[2][2]; + M.r[2].vector4_f32[3] = pSource->m[2][3]; + + M.r[3].vector4_f32[0] = pSource->m[3][0]; + M.r[3].vector4_f32[1] = pSource->m[3][1]; + M.r[3].vector4_f32[2] = pSource->m[3][2]; + M.r[3].vector4_f32[3] = pSource->m[3][3]; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMMATRIX M; + M.r[0] = vld1q_f32(reinterpret_cast(&pSource->_11)); + M.r[1] = vld1q_f32(reinterpret_cast(&pSource->_21)); + M.r[2] = vld1q_f32(reinterpret_cast(&pSource->_31)); + M.r[3] = vld1q_f32(reinterpret_cast(&pSource->_41)); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + M.r[0] = _mm_loadu_ps(&pSource->_11); + M.r[1] = _mm_loadu_ps(&pSource->_21); + M.r[2] = _mm_loadu_ps(&pSource->_31); + M.r[3] = _mm_loadu_ps(&pSource->_41); + return M; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMMATRIX XM_CALLCONV XMLoadFloat4x4A(const XMFLOAT4X4A* pSource) noexcept +{ + assert(pSource); + assert((reinterpret_cast(pSource) & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.r[0].vector4_f32[0] = pSource->m[0][0]; + M.r[0].vector4_f32[1] = pSource->m[0][1]; + M.r[0].vector4_f32[2] = pSource->m[0][2]; + M.r[0].vector4_f32[3] = pSource->m[0][3]; + + M.r[1].vector4_f32[0] = pSource->m[1][0]; + M.r[1].vector4_f32[1] = pSource->m[1][1]; + M.r[1].vector4_f32[2] = pSource->m[1][2]; + M.r[1].vector4_f32[3] = pSource->m[1][3]; + + M.r[2].vector4_f32[0] = pSource->m[2][0]; + M.r[2].vector4_f32[1] = pSource->m[2][1]; + M.r[2].vector4_f32[2] = pSource->m[2][2]; + M.r[2].vector4_f32[3] = pSource->m[2][3]; + + M.r[3].vector4_f32[0] = pSource->m[3][0]; + M.r[3].vector4_f32[1] = pSource->m[3][1]; + M.r[3].vector4_f32[2] = 
pSource->m[3][2]; + M.r[3].vector4_f32[3] = pSource->m[3][3]; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMMATRIX M; +#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES) + M.r[0] = vld1q_f32_ex(reinterpret_cast(&pSource->_11), 128); + M.r[1] = vld1q_f32_ex(reinterpret_cast(&pSource->_21), 128); + M.r[2] = vld1q_f32_ex(reinterpret_cast(&pSource->_31), 128); + M.r[3] = vld1q_f32_ex(reinterpret_cast(&pSource->_41), 128); +#else + M.r[0] = vld1q_f32(reinterpret_cast(&pSource->_11)); + M.r[1] = vld1q_f32(reinterpret_cast(&pSource->_21)); + M.r[2] = vld1q_f32(reinterpret_cast(&pSource->_31)); + M.r[3] = vld1q_f32(reinterpret_cast(&pSource->_41)); +#endif + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + M.r[0] = _mm_load_ps(&pSource->_11); + M.r[1] = _mm_load_ps(&pSource->_21); + M.r[2] = _mm_load_ps(&pSource->_31); + M.r[3] = _mm_load_ps(&pSource->_41); + return M; +#endif +} + +/**************************************************************************** + * + * Vector and matrix store operations + * + ****************************************************************************/ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreInt +( + uint32_t* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + *pDestination = XMVectorGetIntX(V); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + vst1q_lane_u32(pDestination, *reinterpret_cast(&V), 0); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_store_ss(reinterpret_cast(pDestination), V); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat +( + float* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + *pDestination = XMVectorGetX(V); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + vst1q_lane_f32(pDestination, V, 0); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_store_ss(pDestination, V); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreInt2 +( + uint32_t* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination[0] = V.vector4_u32[0]; + pDestination[1] = V.vector4_u32[1]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t VL = vget_low_u32(vreinterpretq_u32_f32(V)); + vst1_u32(pDestination, VL); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_store_sd(reinterpret_cast(pDestination), _mm_castps_pd(V)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreInt2A +( + uint32_t* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); + assert((reinterpret_cast(pDestination) & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + pDestination[0] = V.vector4_u32[0]; + pDestination[1] = V.vector4_u32[1]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t VL = vget_low_u32(vreinterpretq_u32_f32(V)); +#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES) + vst1_u32_ex(pDestination, VL, 64); +#else + vst1_u32(pDestination, VL); +#endif +#elif defined(_XM_SSE_INTRINSICS_) + _mm_store_sd(reinterpret_cast(pDestination), _mm_castps_pd(V)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat2 +( + XMFLOAT2* pDestination, + FXMVECTOR V 
+) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = V.vector4_f32[0]; + pDestination->y = V.vector4_f32[1]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); + vst1_f32(reinterpret_cast(pDestination), VL); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_store_sd(reinterpret_cast(pDestination), _mm_castps_pd(V)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat2A +( + XMFLOAT2A* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); + assert((reinterpret_cast(pDestination) & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = V.vector4_f32[0]; + pDestination->y = V.vector4_f32[1]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); +#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES) + vst1_f32_ex(reinterpret_cast(pDestination), VL, 64); +#else + vst1_f32(reinterpret_cast(pDestination), VL); +#endif +#elif defined(_XM_SSE_INTRINSICS_) + _mm_store_sd(reinterpret_cast(pDestination), _mm_castps_pd(V)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreSInt2 +( + XMINT2* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = static_cast(V.vector4_f32[0]); + pDestination->y = static_cast(V.vector4_f32[1]); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t v = vget_low_f32(V); + int32x2_t iv = vcvt_s32_f32(v); + vst1_s32(reinterpret_cast(pDestination), iv); +#elif defined(_XM_SSE_INTRINSICS_) + // In case of positive overflow, detect it + XMVECTOR vOverflow = _mm_cmpgt_ps(V, g_XMMaxInt); + // Float to int conversion + __m128i vResulti = _mm_cvttps_epi32(V); + // If there was positive overflow, set to 0x7FFFFFFF + XMVECTOR vResult = _mm_and_ps(vOverflow, g_XMAbsMask); + vOverflow = _mm_andnot_ps(vOverflow, _mm_castsi128_ps(vResulti)); + vOverflow = _mm_or_ps(vOverflow, vResult); + // Write two ints + _mm_store_sd(reinterpret_cast(pDestination), _mm_castps_pd(vOverflow)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreUInt2 +( + XMUINT2* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = static_cast(V.vector4_f32[0]); + pDestination->y = static_cast(V.vector4_f32[1]); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t v = vget_low_f32(V); + uint32x2_t iv = vcvt_u32_f32(v); + vst1_u32(reinterpret_cast(pDestination), iv); +#elif defined(_XM_SSE_INTRINSICS_) + // Clamp to >=0 + XMVECTOR vResult = _mm_max_ps(V, g_XMZero); + // Any numbers that are too big, set to 0xFFFFFFFFU + XMVECTOR vOverflow = _mm_cmpgt_ps(vResult, g_XMMaxUInt); + XMVECTOR vValue = g_XMUnsignedFix; + // Too large for a signed integer? 
+ XMVECTOR vMask = _mm_cmpge_ps(vResult, vValue); + // Zero for number's lower than 0x80000000, 32768.0f*65536.0f otherwise + vValue = _mm_and_ps(vValue, vMask); + // Perform fixup only on numbers too large (Keeps low bit precision) + vResult = _mm_sub_ps(vResult, vValue); + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Convert from signed to unsigned pnly if greater than 0x80000000 + vMask = _mm_and_ps(vMask, g_XMNegativeZero); + vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti), vMask); + // On those that are too large, set to 0xFFFFFFFF + vResult = _mm_or_ps(vResult, vOverflow); + // Write two uints + _mm_store_sd(reinterpret_cast(pDestination), _mm_castps_pd(vResult)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreInt3 +( + uint32_t* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination[0] = V.vector4_u32[0]; + pDestination[1] = V.vector4_u32[1]; + pDestination[2] = V.vector4_u32[2]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t VL = vget_low_u32(vreinterpretq_u32_f32(V)); + vst1_u32(pDestination, VL); + vst1q_lane_u32(pDestination + 2, *reinterpret_cast(&V), 2); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_store_sd(reinterpret_cast(pDestination), _mm_castps_pd(V)); + __m128 z = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); + _mm_store_ss(reinterpret_cast(&pDestination[2]), z); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreInt3A +( + uint32_t* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); + assert((reinterpret_cast(pDestination) & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + pDestination[0] = V.vector4_u32[0]; + pDestination[1] = V.vector4_u32[1]; + pDestination[2] = V.vector4_u32[2]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t VL = vget_low_u32(vreinterpretq_u32_f32(V)); +#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES) + vst1_u32_ex(pDestination, VL, 64); +#else + vst1_u32(pDestination, VL); +#endif + vst1q_lane_u32(pDestination + 2, *reinterpret_cast(&V), 2); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_store_sd(reinterpret_cast(pDestination), _mm_castps_pd(V)); + __m128 z = _mm_movehl_ps(V, V); + _mm_store_ss(reinterpret_cast(&pDestination[2]), z); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat3 +( + XMFLOAT3* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = V.vector4_f32[0]; + pDestination->y = V.vector4_f32[1]; + pDestination->z = V.vector4_f32[2]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); + vst1_f32(reinterpret_cast(pDestination), VL); + vst1q_lane_f32(reinterpret_cast(pDestination) + 2, V, 2); +#elif defined(_XM_SSE4_INTRINSICS_) + * reinterpret_cast(&pDestination->x) = _mm_extract_ps(V, 0); + *reinterpret_cast(&pDestination->y) = _mm_extract_ps(V, 1); + *reinterpret_cast(&pDestination->z) = _mm_extract_ps(V, 2); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_store_sd(reinterpret_cast(pDestination), _mm_castps_pd(V)); + __m128 z = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); + _mm_store_ss(&pDestination->z, z); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline 
void XM_CALLCONV XMStoreFloat3A +( + XMFLOAT3A* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); + assert((reinterpret_cast(pDestination) & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = V.vector4_f32[0]; + pDestination->y = V.vector4_f32[1]; + pDestination->z = V.vector4_f32[2]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); +#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES) + vst1_f32_ex(reinterpret_cast(pDestination), VL, 64); +#else + vst1_f32(reinterpret_cast(pDestination), VL); +#endif + vst1q_lane_f32(reinterpret_cast(pDestination) + 2, V, 2); +#elif defined(_XM_SSE4_INTRINSICS_) + _mm_store_sd(reinterpret_cast(pDestination), _mm_castps_pd(V)); + *reinterpret_cast(&pDestination->z) = _mm_extract_ps(V, 2); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_store_sd(reinterpret_cast(pDestination), _mm_castps_pd(V)); + __m128 z = _mm_movehl_ps(V, V); + _mm_store_ss(&pDestination->z, z); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreSInt3 +( + XMINT3* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = static_cast(V.vector4_f32[0]); + pDestination->y = static_cast(V.vector4_f32[1]); + pDestination->z = static_cast(V.vector4_f32[2]); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + int32x4_t v = vcvtq_s32_f32(V); + int32x2_t vL = vget_low_s32(v); + vst1_s32(reinterpret_cast(pDestination), vL); + vst1q_lane_s32(reinterpret_cast(pDestination) + 2, v, 2); +#elif defined(_XM_SSE_INTRINSICS_) + // In case of positive overflow, detect it + XMVECTOR vOverflow = _mm_cmpgt_ps(V, g_XMMaxInt); + // Float to int conversion + __m128i vResulti = _mm_cvttps_epi32(V); + // If there was positive overflow, set to 0x7FFFFFFF + XMVECTOR vResult = _mm_and_ps(vOverflow, g_XMAbsMask); + vOverflow = _mm_andnot_ps(vOverflow, _mm_castsi128_ps(vResulti)); + vOverflow = _mm_or_ps(vOverflow, vResult); + // Write 3 uints + _mm_store_sd(reinterpret_cast(pDestination), _mm_castps_pd(vOverflow)); + __m128 z = XM_PERMUTE_PS(vOverflow, _MM_SHUFFLE(2, 2, 2, 2)); + _mm_store_ss(reinterpret_cast(&pDestination->z), z); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreUInt3 +( + XMUINT3* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = static_cast(V.vector4_f32[0]); + pDestination->y = static_cast(V.vector4_f32[1]); + pDestination->z = static_cast(V.vector4_f32[2]); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t v = vcvtq_u32_f32(V); + uint32x2_t vL = vget_low_u32(v); + vst1_u32(reinterpret_cast(pDestination), vL); + vst1q_lane_u32(reinterpret_cast(pDestination) + 2, v, 2); +#elif defined(_XM_SSE_INTRINSICS_) + // Clamp to >=0 + XMVECTOR vResult = _mm_max_ps(V, g_XMZero); + // Any numbers that are too big, set to 0xFFFFFFFFU + XMVECTOR vOverflow = _mm_cmpgt_ps(vResult, g_XMMaxUInt); + XMVECTOR vValue = g_XMUnsignedFix; + // Too large for a signed integer? 
+ XMVECTOR vMask = _mm_cmpge_ps(vResult, vValue); + // Zero for number's lower than 0x80000000, 32768.0f*65536.0f otherwise + vValue = _mm_and_ps(vValue, vMask); + // Perform fixup only on numbers too large (Keeps low bit precision) + vResult = _mm_sub_ps(vResult, vValue); + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Convert from signed to unsigned pnly if greater than 0x80000000 + vMask = _mm_and_ps(vMask, g_XMNegativeZero); + vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti), vMask); + // On those that are too large, set to 0xFFFFFFFF + vResult = _mm_or_ps(vResult, vOverflow); + // Write 3 uints + _mm_store_sd(reinterpret_cast(pDestination), _mm_castps_pd(vResult)); + __m128 z = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(2, 2, 2, 2)); + _mm_store_ss(reinterpret_cast(&pDestination->z), z); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreInt4 +( + uint32_t* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination[0] = V.vector4_u32[0]; + pDestination[1] = V.vector4_u32[1]; + pDestination[2] = V.vector4_u32[2]; + pDestination[3] = V.vector4_u32[3]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + vst1q_u32(pDestination, vreinterpretq_u32_f32(V)); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_storeu_si128(reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreInt4A +( + uint32_t* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); + assert((reinterpret_cast(pDestination) & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + pDestination[0] = V.vector4_u32[0]; + pDestination[1] = V.vector4_u32[1]; + pDestination[2] = V.vector4_u32[2]; + pDestination[3] = V.vector4_u32[3]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) +#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES) + vst1q_u32_ex(pDestination, V, 128); +#else + vst1q_u32(pDestination, vreinterpretq_u32_f32(V)); +#endif +#elif defined(_XM_SSE_INTRINSICS_) + _mm_store_si128(reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat4 +( + XMFLOAT4* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = V.vector4_f32[0]; + pDestination->y = V.vector4_f32[1]; + pDestination->z = V.vector4_f32[2]; + pDestination->w = V.vector4_f32[3]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + vst1q_f32(reinterpret_cast(pDestination), V); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_storeu_ps(&pDestination->x, V); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat4A +( + XMFLOAT4A* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); + assert((reinterpret_cast(pDestination) & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = V.vector4_f32[0]; + pDestination->y = V.vector4_f32[1]; + pDestination->z = V.vector4_f32[2]; + pDestination->w = V.vector4_f32[3]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) +#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES) + vst1q_f32_ex(reinterpret_cast(pDestination), V, 128); +#else + 
vst1q_f32(reinterpret_cast(pDestination), V); +#endif +#elif defined(_XM_SSE_INTRINSICS_) + _mm_store_ps(&pDestination->x, V); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreSInt4 +( + XMINT4* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = static_cast(V.vector4_f32[0]); + pDestination->y = static_cast(V.vector4_f32[1]); + pDestination->z = static_cast(V.vector4_f32[2]); + pDestination->w = static_cast(V.vector4_f32[3]); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + int32x4_t v = vcvtq_s32_f32(V); + vst1q_s32(reinterpret_cast(pDestination), v); +#elif defined(_XM_SSE_INTRINSICS_) + // In case of positive overflow, detect it + XMVECTOR vOverflow = _mm_cmpgt_ps(V, g_XMMaxInt); + // Float to int conversion + __m128i vResulti = _mm_cvttps_epi32(V); + // If there was positive overflow, set to 0x7FFFFFFF + XMVECTOR vResult = _mm_and_ps(vOverflow, g_XMAbsMask); + vOverflow = _mm_andnot_ps(vOverflow, _mm_castsi128_ps(vResulti)); + vOverflow = _mm_or_ps(vOverflow, vResult); + _mm_storeu_si128(reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(vOverflow)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreUInt4 +( + XMUINT4* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = static_cast(V.vector4_f32[0]); + pDestination->y = static_cast(V.vector4_f32[1]); + pDestination->z = static_cast(V.vector4_f32[2]); + pDestination->w = static_cast(V.vector4_f32[3]); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t v = vcvtq_u32_f32(V); + vst1q_u32(reinterpret_cast(pDestination), v); +#elif defined(_XM_SSE_INTRINSICS_) + // Clamp to >=0 + XMVECTOR vResult = _mm_max_ps(V, g_XMZero); + // Any numbers that are too big, set to 0xFFFFFFFFU + XMVECTOR vOverflow = _mm_cmpgt_ps(vResult, g_XMMaxUInt); + XMVECTOR vValue = g_XMUnsignedFix; + // Too large for a signed integer? 
+ XMVECTOR vMask = _mm_cmpge_ps(vResult, vValue); + // Zero for number's lower than 0x80000000, 32768.0f*65536.0f otherwise + vValue = _mm_and_ps(vValue, vMask); + // Perform fixup only on numbers too large (Keeps low bit precision) + vResult = _mm_sub_ps(vResult, vValue); + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Convert from signed to unsigned pnly if greater than 0x80000000 + vMask = _mm_and_ps(vMask, g_XMNegativeZero); + vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti), vMask); + // On those that are too large, set to 0xFFFFFFFF + vResult = _mm_or_ps(vResult, vOverflow); + _mm_storeu_si128(reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(vResult)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat3x3 +( + XMFLOAT3X3* pDestination, + FXMMATRIX M +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + pDestination->m[0][0] = M.r[0].vector4_f32[0]; + pDestination->m[0][1] = M.r[0].vector4_f32[1]; + pDestination->m[0][2] = M.r[0].vector4_f32[2]; + + pDestination->m[1][0] = M.r[1].vector4_f32[0]; + pDestination->m[1][1] = M.r[1].vector4_f32[1]; + pDestination->m[1][2] = M.r[1].vector4_f32[2]; + + pDestination->m[2][0] = M.r[2].vector4_f32[0]; + pDestination->m[2][1] = M.r[2].vector4_f32[1]; + pDestination->m[2][2] = M.r[2].vector4_f32[2]; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t T1 = vextq_f32(M.r[0], M.r[1], 1); + float32x4_t T2 = vbslq_f32(g_XMMask3, M.r[0], T1); + vst1q_f32(&pDestination->m[0][0], T2); + + T1 = vextq_f32(M.r[1], M.r[1], 1); + T2 = vcombine_f32(vget_low_f32(T1), vget_low_f32(M.r[2])); + vst1q_f32(&pDestination->m[1][1], T2); + + vst1q_lane_f32(&pDestination->m[2][2], M.r[2], 2); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp1 = M.r[0]; + XMVECTOR vTemp2 = M.r[1]; + XMVECTOR vTemp3 = M.r[2]; + XMVECTOR vWork = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(0, 0, 2, 2)); + vTemp1 = _mm_shuffle_ps(vTemp1, vWork, _MM_SHUFFLE(2, 0, 1, 0)); + _mm_storeu_ps(&pDestination->m[0][0], vTemp1); + vTemp2 = _mm_shuffle_ps(vTemp2, vTemp3, _MM_SHUFFLE(1, 0, 2, 1)); + _mm_storeu_ps(&pDestination->m[1][1], vTemp2); + vTemp3 = XM_PERMUTE_PS(vTemp3, _MM_SHUFFLE(2, 2, 2, 2)); + _mm_store_ss(&pDestination->m[2][2], vTemp3); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat4x3 +( + XMFLOAT4X3* pDestination, + FXMMATRIX M +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + pDestination->m[0][0] = M.r[0].vector4_f32[0]; + pDestination->m[0][1] = M.r[0].vector4_f32[1]; + pDestination->m[0][2] = M.r[0].vector4_f32[2]; + + pDestination->m[1][0] = M.r[1].vector4_f32[0]; + pDestination->m[1][1] = M.r[1].vector4_f32[1]; + pDestination->m[1][2] = M.r[1].vector4_f32[2]; + + pDestination->m[2][0] = M.r[2].vector4_f32[0]; + pDestination->m[2][1] = M.r[2].vector4_f32[1]; + pDestination->m[2][2] = M.r[2].vector4_f32[2]; + + pDestination->m[3][0] = M.r[3].vector4_f32[0]; + pDestination->m[3][1] = M.r[3].vector4_f32[1]; + pDestination->m[3][2] = M.r[3].vector4_f32[2]; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t T1 = vextq_f32(M.r[0], M.r[1], 1); + float32x4_t T2 = vbslq_f32(g_XMMask3, M.r[0], T1); + vst1q_f32(&pDestination->m[0][0], T2); + + T1 = vextq_f32(M.r[1], M.r[1], 1); + T2 = vcombine_f32(vget_low_f32(T1), vget_low_f32(M.r[2])); + vst1q_f32(&pDestination->m[1][1], T2); + + T1 = 
vdupq_lane_f32(vget_high_f32(M.r[2]), 0); + T2 = vextq_f32(T1, M.r[3], 3); + vst1q_f32(&pDestination->m[2][2], T2); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp1 = M.r[0]; + XMVECTOR vTemp2 = M.r[1]; + XMVECTOR vTemp3 = M.r[2]; + XMVECTOR vTemp4 = M.r[3]; + XMVECTOR vTemp2x = _mm_shuffle_ps(vTemp2, vTemp3, _MM_SHUFFLE(1, 0, 2, 1)); + vTemp2 = _mm_shuffle_ps(vTemp2, vTemp1, _MM_SHUFFLE(2, 2, 0, 0)); + vTemp1 = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(0, 2, 1, 0)); + vTemp3 = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(0, 0, 2, 2)); + vTemp3 = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(2, 1, 2, 0)); + _mm_storeu_ps(&pDestination->m[0][0], vTemp1); + _mm_storeu_ps(&pDestination->m[1][1], vTemp2x); + _mm_storeu_ps(&pDestination->m[2][2], vTemp3); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat4x3A +( + XMFLOAT4X3A* pDestination, + FXMMATRIX M +) noexcept +{ + assert(pDestination); + assert((reinterpret_cast(pDestination) & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + + pDestination->m[0][0] = M.r[0].vector4_f32[0]; + pDestination->m[0][1] = M.r[0].vector4_f32[1]; + pDestination->m[0][2] = M.r[0].vector4_f32[2]; + + pDestination->m[1][0] = M.r[1].vector4_f32[0]; + pDestination->m[1][1] = M.r[1].vector4_f32[1]; + pDestination->m[1][2] = M.r[1].vector4_f32[2]; + + pDestination->m[2][0] = M.r[2].vector4_f32[0]; + pDestination->m[2][1] = M.r[2].vector4_f32[1]; + pDestination->m[2][2] = M.r[2].vector4_f32[2]; + + pDestination->m[3][0] = M.r[3].vector4_f32[0]; + pDestination->m[3][1] = M.r[3].vector4_f32[1]; + pDestination->m[3][2] = M.r[3].vector4_f32[2]; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) +#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES) + float32x4_t T1 = vextq_f32(M.r[0], M.r[1], 1); + float32x4_t T2 = vbslq_f32(g_XMMask3, M.r[0], T1); + vst1q_f32_ex(&pDestination->m[0][0], T2, 128); + + T1 = vextq_f32(M.r[1], M.r[1], 1); + T2 = vcombine_f32(vget_low_f32(T1), vget_low_f32(M.r[2])); + vst1q_f32_ex(&pDestination->m[1][1], T2, 128); + + T1 = vdupq_lane_f32(vget_high_f32(M.r[2]), 0); + T2 = vextq_f32(T1, M.r[3], 3); + vst1q_f32_ex(&pDestination->m[2][2], T2, 128); +#else + float32x4_t T1 = vextq_f32(M.r[0], M.r[1], 1); + float32x4_t T2 = vbslq_f32(g_XMMask3, M.r[0], T1); + vst1q_f32(&pDestination->m[0][0], T2); + + T1 = vextq_f32(M.r[1], M.r[1], 1); + T2 = vcombine_f32(vget_low_f32(T1), vget_low_f32(M.r[2])); + vst1q_f32(&pDestination->m[1][1], T2); + + T1 = vdupq_lane_f32(vget_high_f32(M.r[2]), 0); + T2 = vextq_f32(T1, M.r[3], 3); + vst1q_f32(&pDestination->m[2][2], T2); +#endif +#elif defined(_XM_SSE_INTRINSICS_) + // x1,y1,z1,w1 + XMVECTOR vTemp1 = M.r[0]; + // x2,y2,z2,w2 + XMVECTOR vTemp2 = M.r[1]; + // x3,y3,z3,w3 + XMVECTOR vTemp3 = M.r[2]; + // x4,y4,z4,w4 + XMVECTOR vTemp4 = M.r[3]; + // z1,z1,x2,y2 + XMVECTOR vTemp = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(1, 0, 2, 2)); + // y2,z2,x3,y3 (Final) + vTemp2 = _mm_shuffle_ps(vTemp2, vTemp3, _MM_SHUFFLE(1, 0, 2, 1)); + // x1,y1,z1,x2 (Final) + vTemp1 = _mm_shuffle_ps(vTemp1, vTemp, _MM_SHUFFLE(2, 0, 1, 0)); + // z3,z3,x4,x4 + vTemp3 = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(0, 0, 2, 2)); + // z3,x4,y4,z4 (Final) + vTemp3 = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(2, 1, 2, 0)); + // Store in 3 operations + _mm_store_ps(&pDestination->m[0][0], vTemp1); + _mm_store_ps(&pDestination->m[1][1], vTemp2); + _mm_store_ps(&pDestination->m[2][2], vTemp3); +#endif +} + 
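+// Usage sketch for the 4x3 store path above (illustrative only, not part of the upstream
+// header; 'angle', 'tx', 'ty' and 'tz' are placeholder values):
+//
+//   XMFLOAT4X3 packed;                                        // 12 floats; the w column is implied
+//   XMMATRIX world = XMMatrixRotationY(angle) * XMMatrixTranslation(tx, ty, tz);
+//   XMStoreFloat4x3(&packed, world);                          // stores x,y,z of each row, drops w
+//   XMMATRIX restored = XMLoadFloat4x3(&packed);              // reload; the w column becomes (0,0,0,1)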
+//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat3x4 +( + XMFLOAT3X4* pDestination, + FXMMATRIX M +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + pDestination->m[0][0] = M.r[0].vector4_f32[0]; + pDestination->m[0][1] = M.r[1].vector4_f32[0]; + pDestination->m[0][2] = M.r[2].vector4_f32[0]; + pDestination->m[0][3] = M.r[3].vector4_f32[0]; + + pDestination->m[1][0] = M.r[0].vector4_f32[1]; + pDestination->m[1][1] = M.r[1].vector4_f32[1]; + pDestination->m[1][2] = M.r[2].vector4_f32[1]; + pDestination->m[1][3] = M.r[3].vector4_f32[1]; + + pDestination->m[2][0] = M.r[0].vector4_f32[2]; + pDestination->m[2][1] = M.r[1].vector4_f32[2]; + pDestination->m[2][2] = M.r[2].vector4_f32[2]; + pDestination->m[2][3] = M.r[3].vector4_f32[2]; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4x2_t P0 = vzipq_f32(M.r[0], M.r[2]); + float32x4x2_t P1 = vzipq_f32(M.r[1], M.r[3]); + + float32x4x2_t T0 = vzipq_f32(P0.val[0], P1.val[0]); + float32x4x2_t T1 = vzipq_f32(P0.val[1], P1.val[1]); + + vst1q_f32(&pDestination->m[0][0], T0.val[0]); + vst1q_f32(&pDestination->m[1][0], T0.val[1]); + vst1q_f32(&pDestination->m[2][0], T1.val[0]); +#elif defined(_XM_SSE_INTRINSICS_) + // x.x,x.y,y.x,y.y + XMVECTOR vTemp1 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(1, 0, 1, 0)); + // x.z,x.w,y.z,y.w + XMVECTOR vTemp3 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(3, 2, 3, 2)); + // z.x,z.y,w.x,w.y + XMVECTOR vTemp2 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(1, 0, 1, 0)); + // z.z,z.w,w.z,w.w + XMVECTOR vTemp4 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(3, 2, 3, 2)); + + // x.x,y.x,z.x,w.x + XMVECTOR r0 = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(2, 0, 2, 0)); + // x.y,y.y,z.y,w.y + XMVECTOR r1 = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(3, 1, 3, 1)); + // x.z,y.z,z.z,w.z + XMVECTOR r2 = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(2, 0, 2, 0)); + + _mm_storeu_ps(&pDestination->m[0][0], r0); + _mm_storeu_ps(&pDestination->m[1][0], r1); + _mm_storeu_ps(&pDestination->m[2][0], r2); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat3x4A +( + XMFLOAT3X4A* pDestination, + FXMMATRIX M +) noexcept +{ + assert(pDestination); + assert((reinterpret_cast(pDestination) & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + + pDestination->m[0][0] = M.r[0].vector4_f32[0]; + pDestination->m[0][1] = M.r[1].vector4_f32[0]; + pDestination->m[0][2] = M.r[2].vector4_f32[0]; + pDestination->m[0][3] = M.r[3].vector4_f32[0]; + + pDestination->m[1][0] = M.r[0].vector4_f32[1]; + pDestination->m[1][1] = M.r[1].vector4_f32[1]; + pDestination->m[1][2] = M.r[2].vector4_f32[1]; + pDestination->m[1][3] = M.r[3].vector4_f32[1]; + + pDestination->m[2][0] = M.r[0].vector4_f32[2]; + pDestination->m[2][1] = M.r[1].vector4_f32[2]; + pDestination->m[2][2] = M.r[2].vector4_f32[2]; + pDestination->m[2][3] = M.r[3].vector4_f32[2]; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4x2_t P0 = vzipq_f32(M.r[0], M.r[2]); + float32x4x2_t P1 = vzipq_f32(M.r[1], M.r[3]); + + float32x4x2_t T0 = vzipq_f32(P0.val[0], P1.val[0]); + float32x4x2_t T1 = vzipq_f32(P0.val[1], P1.val[1]); + +#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES) + vst1q_f32_ex(&pDestination->m[0][0], T0.val[0], 128); + vst1q_f32_ex(&pDestination->m[1][0], T0.val[1], 128); + vst1q_f32_ex(&pDestination->m[2][0], T1.val[0], 128); +#else + 
vst1q_f32(&pDestination->m[0][0], T0.val[0]); + vst1q_f32(&pDestination->m[1][0], T0.val[1]); + vst1q_f32(&pDestination->m[2][0], T1.val[0]); +#endif +#elif defined(_XM_SSE_INTRINSICS_) + // x.x,x.y,y.x,y.y + XMVECTOR vTemp1 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(1, 0, 1, 0)); + // x.z,x.w,y.z,y.w + XMVECTOR vTemp3 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(3, 2, 3, 2)); + // z.x,z.y,w.x,w.y + XMVECTOR vTemp2 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(1, 0, 1, 0)); + // z.z,z.w,w.z,w.w + XMVECTOR vTemp4 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(3, 2, 3, 2)); + + // x.x,y.x,z.x,w.x + XMVECTOR r0 = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(2, 0, 2, 0)); + // x.y,y.y,z.y,w.y + XMVECTOR r1 = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(3, 1, 3, 1)); + // x.z,y.z,z.z,w.z + XMVECTOR r2 = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(2, 0, 2, 0)); + + _mm_store_ps(&pDestination->m[0][0], r0); + _mm_store_ps(&pDestination->m[1][0], r1); + _mm_store_ps(&pDestination->m[2][0], r2); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat4x4 +( + XMFLOAT4X4* pDestination, + FXMMATRIX M +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + pDestination->m[0][0] = M.r[0].vector4_f32[0]; + pDestination->m[0][1] = M.r[0].vector4_f32[1]; + pDestination->m[0][2] = M.r[0].vector4_f32[2]; + pDestination->m[0][3] = M.r[0].vector4_f32[3]; + + pDestination->m[1][0] = M.r[1].vector4_f32[0]; + pDestination->m[1][1] = M.r[1].vector4_f32[1]; + pDestination->m[1][2] = M.r[1].vector4_f32[2]; + pDestination->m[1][3] = M.r[1].vector4_f32[3]; + + pDestination->m[2][0] = M.r[2].vector4_f32[0]; + pDestination->m[2][1] = M.r[2].vector4_f32[1]; + pDestination->m[2][2] = M.r[2].vector4_f32[2]; + pDestination->m[2][3] = M.r[2].vector4_f32[3]; + + pDestination->m[3][0] = M.r[3].vector4_f32[0]; + pDestination->m[3][1] = M.r[3].vector4_f32[1]; + pDestination->m[3][2] = M.r[3].vector4_f32[2]; + pDestination->m[3][3] = M.r[3].vector4_f32[3]; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + vst1q_f32(reinterpret_cast(&pDestination->_11), M.r[0]); + vst1q_f32(reinterpret_cast(&pDestination->_21), M.r[1]); + vst1q_f32(reinterpret_cast(&pDestination->_31), M.r[2]); + vst1q_f32(reinterpret_cast(&pDestination->_41), M.r[3]); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_storeu_ps(&pDestination->_11, M.r[0]); + _mm_storeu_ps(&pDestination->_21, M.r[1]); + _mm_storeu_ps(&pDestination->_31, M.r[2]); + _mm_storeu_ps(&pDestination->_41, M.r[3]); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat4x4A +( + XMFLOAT4X4A* pDestination, + FXMMATRIX M +) noexcept +{ + assert(pDestination); + assert((reinterpret_cast(pDestination) & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + + pDestination->m[0][0] = M.r[0].vector4_f32[0]; + pDestination->m[0][1] = M.r[0].vector4_f32[1]; + pDestination->m[0][2] = M.r[0].vector4_f32[2]; + pDestination->m[0][3] = M.r[0].vector4_f32[3]; + + pDestination->m[1][0] = M.r[1].vector4_f32[0]; + pDestination->m[1][1] = M.r[1].vector4_f32[1]; + pDestination->m[1][2] = M.r[1].vector4_f32[2]; + pDestination->m[1][3] = M.r[1].vector4_f32[3]; + + pDestination->m[2][0] = M.r[2].vector4_f32[0]; + pDestination->m[2][1] = M.r[2].vector4_f32[1]; + pDestination->m[2][2] = M.r[2].vector4_f32[2]; + pDestination->m[2][3] = M.r[2].vector4_f32[3]; + + pDestination->m[3][0] = M.r[3].vector4_f32[0]; + 
pDestination->m[3][1] = M.r[3].vector4_f32[1]; + pDestination->m[3][2] = M.r[3].vector4_f32[2]; + pDestination->m[3][3] = M.r[3].vector4_f32[3]; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) +#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES) + vst1q_f32_ex(reinterpret_cast(&pDestination->_11), M.r[0], 128); + vst1q_f32_ex(reinterpret_cast(&pDestination->_21), M.r[1], 128); + vst1q_f32_ex(reinterpret_cast(&pDestination->_31), M.r[2], 128); + vst1q_f32_ex(reinterpret_cast(&pDestination->_41), M.r[3], 128); +#else + vst1q_f32(reinterpret_cast(&pDestination->_11), M.r[0]); + vst1q_f32(reinterpret_cast(&pDestination->_21), M.r[1]); + vst1q_f32(reinterpret_cast(&pDestination->_31), M.r[2]); + vst1q_f32(reinterpret_cast(&pDestination->_41), M.r[3]); +#endif +#elif defined(_XM_SSE_INTRINSICS_) + _mm_store_ps(&pDestination->_11, M.r[0]); + _mm_store_ps(&pDestination->_21, M.r[1]); + _mm_store_ps(&pDestination->_31, M.r[2]); + _mm_store_ps(&pDestination->_41, M.r[3]); +#endif +} + diff --git a/src/thirdparty/DirectXMath-dec2022/Inc/DirectXMathMatrix.inl b/src/thirdparty/DirectXMath-dec2022/Inc/DirectXMathMatrix.inl new file mode 100644 index 000000000..1c579a1ec --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/Inc/DirectXMathMatrix.inl @@ -0,0 +1,3550 @@ +//------------------------------------------------------------------------------------- +// DirectXMathMatrix.inl -- SIMD C++ Math library +// +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. +// +// http://go.microsoft.com/fwlink/?LinkID=615560 +//------------------------------------------------------------------------------------- + +#pragma once + +/**************************************************************************** + * + * Matrix + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ + // Comparison operations + //------------------------------------------------------------------------------ + + //------------------------------------------------------------------------------ + +#if !defined(_XM_NO_INTRINSICS_) && defined(_MSC_VER) && !defined(__clang__) && !defined(__INTEL_COMPILER) +#pragma float_control(push) +#pragma float_control(precise, on) +#endif + +// Return true if any entry in the matrix is NaN +inline bool XM_CALLCONV XMMatrixIsNaN(FXMMATRIX M) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + size_t i = 16; + auto pWork = reinterpret_cast(&M.m[0][0]); + do { + // Fetch value into integer unit + uint32_t uTest = pWork[0]; + // Remove sign + uTest &= 0x7FFFFFFFU; + // NaN is 0x7F800001 through 0x7FFFFFFF inclusive + uTest -= 0x7F800001U; + if (uTest < 0x007FFFFFU) + { + break; // NaN found + } + ++pWork; // Next entry + } while (--i); + return (i != 0); // i == 0 if nothing matched +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Load in registers + float32x4_t vX = M.r[0]; + float32x4_t vY = M.r[1]; + float32x4_t vZ = M.r[2]; + float32x4_t vW = M.r[3]; + // Test themselves to check for NaN + uint32x4_t xmask = vmvnq_u32(vceqq_f32(vX, vX)); + uint32x4_t ymask = vmvnq_u32(vceqq_f32(vY, vY)); + uint32x4_t zmask = vmvnq_u32(vceqq_f32(vZ, vZ)); + uint32x4_t wmask = vmvnq_u32(vceqq_f32(vW, vW)); + // Or all the results + xmask = vorrq_u32(xmask, zmask); + ymask = vorrq_u32(ymask, wmask); + xmask = vorrq_u32(xmask, ymask); + // If any tested true, return true + uint8x8x2_t vTemp = vzip_u8( + vget_low_u8(vreinterpretq_u8_u32(xmask)), + 
vget_high_u8(vreinterpretq_u8_u32(xmask))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1); + return (r != 0); +#elif defined(_XM_SSE_INTRINSICS_) + // Load in registers + XMVECTOR vX = M.r[0]; + XMVECTOR vY = M.r[1]; + XMVECTOR vZ = M.r[2]; + XMVECTOR vW = M.r[3]; + // Test themselves to check for NaN + vX = _mm_cmpneq_ps(vX, vX); + vY = _mm_cmpneq_ps(vY, vY); + vZ = _mm_cmpneq_ps(vZ, vZ); + vW = _mm_cmpneq_ps(vW, vW); + // Or all the results + vX = _mm_or_ps(vX, vZ); + vY = _mm_or_ps(vY, vW); + vX = _mm_or_ps(vX, vY); + // If any tested true, return true + return (_mm_movemask_ps(vX) != 0); +#else +#endif +} + +#if !defined(_XM_NO_INTRINSICS_) && defined(_MSC_VER) && !defined(__clang__) && !defined(__INTEL_COMPILER) +#pragma float_control(pop) +#endif + +//------------------------------------------------------------------------------ + +// Return true if any entry in the matrix is +/-INF +inline bool XM_CALLCONV XMMatrixIsInfinite(FXMMATRIX M) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + size_t i = 16; + auto pWork = reinterpret_cast(&M.m[0][0]); + do { + // Fetch value into integer unit + uint32_t uTest = pWork[0]; + // Remove sign + uTest &= 0x7FFFFFFFU; + // INF is 0x7F800000 + if (uTest == 0x7F800000U) + { + break; // INF found + } + ++pWork; // Next entry + } while (--i); + return (i != 0); // i == 0 if nothing matched +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Load in registers + float32x4_t vX = M.r[0]; + float32x4_t vY = M.r[1]; + float32x4_t vZ = M.r[2]; + float32x4_t vW = M.r[3]; + // Mask off the sign bits + vX = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(vX), g_XMAbsMask)); + vY = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(vY), g_XMAbsMask)); + vZ = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(vZ), g_XMAbsMask)); + vW = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(vW), g_XMAbsMask)); + // Compare to infinity + uint32x4_t xmask = vceqq_f32(vX, g_XMInfinity); + uint32x4_t ymask = vceqq_f32(vY, g_XMInfinity); + uint32x4_t zmask = vceqq_f32(vZ, g_XMInfinity); + uint32x4_t wmask = vceqq_f32(vW, g_XMInfinity); + // Or the answers together + xmask = vorrq_u32(xmask, zmask); + ymask = vorrq_u32(ymask, wmask); + xmask = vorrq_u32(xmask, ymask); + // If any tested true, return true + uint8x8x2_t vTemp = vzip_u8( + vget_low_u8(vreinterpretq_u8_u32(xmask)), + vget_high_u8(vreinterpretq_u8_u32(xmask))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1); + return (r != 0); +#elif defined(_XM_SSE_INTRINSICS_) + // Mask off the sign bits + XMVECTOR vTemp1 = _mm_and_ps(M.r[0], g_XMAbsMask); + XMVECTOR vTemp2 = _mm_and_ps(M.r[1], g_XMAbsMask); + XMVECTOR vTemp3 = _mm_and_ps(M.r[2], g_XMAbsMask); + XMVECTOR vTemp4 = _mm_and_ps(M.r[3], g_XMAbsMask); + // Compare to infinity + vTemp1 = _mm_cmpeq_ps(vTemp1, g_XMInfinity); + vTemp2 = _mm_cmpeq_ps(vTemp2, g_XMInfinity); + vTemp3 = _mm_cmpeq_ps(vTemp3, g_XMInfinity); + vTemp4 = _mm_cmpeq_ps(vTemp4, g_XMInfinity); + // Or the answers together + vTemp1 = _mm_or_ps(vTemp1, vTemp2); + vTemp3 = _mm_or_ps(vTemp3, vTemp4); + vTemp1 = _mm_or_ps(vTemp1, vTemp3); + // If any are infinity, the signs are true. 
+ return (_mm_movemask_ps(vTemp1) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +// Return true if the XMMatrix is equal to identity +inline bool XM_CALLCONV XMMatrixIsIdentity(FXMMATRIX M) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + // Use the integer pipeline to reduce branching to a minimum + auto pWork = reinterpret_cast(&M.m[0][0]); + // Convert 1.0f to zero and or them together + uint32_t uOne = pWork[0] ^ 0x3F800000U; + // Or all the 0.0f entries together + uint32_t uZero = pWork[1]; + uZero |= pWork[2]; + uZero |= pWork[3]; + // 2nd row + uZero |= pWork[4]; + uOne |= pWork[5] ^ 0x3F800000U; + uZero |= pWork[6]; + uZero |= pWork[7]; + // 3rd row + uZero |= pWork[8]; + uZero |= pWork[9]; + uOne |= pWork[10] ^ 0x3F800000U; + uZero |= pWork[11]; + // 4th row + uZero |= pWork[12]; + uZero |= pWork[13]; + uZero |= pWork[14]; + uOne |= pWork[15] ^ 0x3F800000U; + // If all zero entries are zero, the uZero==0 + uZero &= 0x7FFFFFFF; // Allow -0.0f + // If all 1.0f entries are 1.0f, then uOne==0 + uOne |= uZero; + return (uOne == 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t xmask = vceqq_f32(M.r[0], g_XMIdentityR0); + uint32x4_t ymask = vceqq_f32(M.r[1], g_XMIdentityR1); + uint32x4_t zmask = vceqq_f32(M.r[2], g_XMIdentityR2); + uint32x4_t wmask = vceqq_f32(M.r[3], g_XMIdentityR3); + xmask = vandq_u32(xmask, zmask); + ymask = vandq_u32(ymask, wmask); + xmask = vandq_u32(xmask, ymask); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(xmask)), vget_high_u8(vreinterpretq_u8_u32(xmask))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1); + return (r == 0xFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp1 = _mm_cmpeq_ps(M.r[0], g_XMIdentityR0); + XMVECTOR vTemp2 = _mm_cmpeq_ps(M.r[1], g_XMIdentityR1); + XMVECTOR vTemp3 = _mm_cmpeq_ps(M.r[2], g_XMIdentityR2); + XMVECTOR vTemp4 = _mm_cmpeq_ps(M.r[3], g_XMIdentityR3); + vTemp1 = _mm_and_ps(vTemp1, vTemp2); + vTemp3 = _mm_and_ps(vTemp3, vTemp4); + vTemp1 = _mm_and_ps(vTemp1, vTemp3); + return (_mm_movemask_ps(vTemp1) == 0x0f); +#endif +} + +//------------------------------------------------------------------------------ +// Computation operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +// Perform a 4x4 matrix multiply by a 4x4 matrix +inline XMMATRIX XM_CALLCONV XMMatrixMultiply +( + FXMMATRIX M1, + CXMMATRIX M2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMMATRIX mResult; + // Cache the invariants in registers + float x = M1.m[0][0]; + float y = M1.m[0][1]; + float z = M1.m[0][2]; + float w = M1.m[0][3]; + // Perform the operation on the first row + mResult.m[0][0] = (M2.m[0][0] * x) + (M2.m[1][0] * y) + (M2.m[2][0] * z) + (M2.m[3][0] * w); + mResult.m[0][1] = (M2.m[0][1] * x) + (M2.m[1][1] * y) + (M2.m[2][1] * z) + (M2.m[3][1] * w); + mResult.m[0][2] = (M2.m[0][2] * x) + (M2.m[1][2] * y) + (M2.m[2][2] * z) + (M2.m[3][2] * w); + mResult.m[0][3] = (M2.m[0][3] * x) + (M2.m[1][3] * y) + (M2.m[2][3] * z) + (M2.m[3][3] * w); + // Repeat for all the other rows + x = M1.m[1][0]; + y = M1.m[1][1]; + z = M1.m[1][2]; + w = M1.m[1][3]; + mResult.m[1][0] = (M2.m[0][0] * x) + (M2.m[1][0] * y) + (M2.m[2][0] * z) + (M2.m[3][0] * w); + mResult.m[1][1] = (M2.m[0][1] * x) + (M2.m[1][1] * y) + (M2.m[2][1] * z) + 
(M2.m[3][1] * w); + mResult.m[1][2] = (M2.m[0][2] * x) + (M2.m[1][2] * y) + (M2.m[2][2] * z) + (M2.m[3][2] * w); + mResult.m[1][3] = (M2.m[0][3] * x) + (M2.m[1][3] * y) + (M2.m[2][3] * z) + (M2.m[3][3] * w); + x = M1.m[2][0]; + y = M1.m[2][1]; + z = M1.m[2][2]; + w = M1.m[2][3]; + mResult.m[2][0] = (M2.m[0][0] * x) + (M2.m[1][0] * y) + (M2.m[2][0] * z) + (M2.m[3][0] * w); + mResult.m[2][1] = (M2.m[0][1] * x) + (M2.m[1][1] * y) + (M2.m[2][1] * z) + (M2.m[3][1] * w); + mResult.m[2][2] = (M2.m[0][2] * x) + (M2.m[1][2] * y) + (M2.m[2][2] * z) + (M2.m[3][2] * w); + mResult.m[2][3] = (M2.m[0][3] * x) + (M2.m[1][3] * y) + (M2.m[2][3] * z) + (M2.m[3][3] * w); + x = M1.m[3][0]; + y = M1.m[3][1]; + z = M1.m[3][2]; + w = M1.m[3][3]; + mResult.m[3][0] = (M2.m[0][0] * x) + (M2.m[1][0] * y) + (M2.m[2][0] * z) + (M2.m[3][0] * w); + mResult.m[3][1] = (M2.m[0][1] * x) + (M2.m[1][1] * y) + (M2.m[2][1] * z) + (M2.m[3][1] * w); + mResult.m[3][2] = (M2.m[0][2] * x) + (M2.m[1][2] * y) + (M2.m[2][2] * z) + (M2.m[3][2] * w); + mResult.m[3][3] = (M2.m[0][3] * x) + (M2.m[1][3] * y) + (M2.m[2][3] * z) + (M2.m[3][3] * w); + return mResult; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMMATRIX mResult; + float32x2_t VL = vget_low_f32(M1.r[0]); + float32x2_t VH = vget_high_f32(M1.r[0]); + // Perform the operation on the first row + float32x4_t vX = vmulq_lane_f32(M2.r[0], VL, 0); + float32x4_t vY = vmulq_lane_f32(M2.r[1], VL, 1); + float32x4_t vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0); + float32x4_t vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1); + mResult.r[0] = vaddq_f32(vZ, vW); + // Repeat for the other 3 rows + VL = vget_low_f32(M1.r[1]); + VH = vget_high_f32(M1.r[1]); + vX = vmulq_lane_f32(M2.r[0], VL, 0); + vY = vmulq_lane_f32(M2.r[1], VL, 1); + vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0); + vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1); + mResult.r[1] = vaddq_f32(vZ, vW); + VL = vget_low_f32(M1.r[2]); + VH = vget_high_f32(M1.r[2]); + vX = vmulq_lane_f32(M2.r[0], VL, 0); + vY = vmulq_lane_f32(M2.r[1], VL, 1); + vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0); + vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1); + mResult.r[2] = vaddq_f32(vZ, vW); + VL = vget_low_f32(M1.r[3]); + VH = vget_high_f32(M1.r[3]); + vX = vmulq_lane_f32(M2.r[0], VL, 0); + vY = vmulq_lane_f32(M2.r[1], VL, 1); + vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0); + vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1); + mResult.r[3] = vaddq_f32(vZ, vW); + return mResult; +#elif defined(_XM_AVX2_INTRINSICS_) + __m256 t0 = _mm256_castps128_ps256(M1.r[0]); + t0 = _mm256_insertf128_ps(t0, M1.r[1], 1); + __m256 t1 = _mm256_castps128_ps256(M1.r[2]); + t1 = _mm256_insertf128_ps(t1, M1.r[3], 1); + + __m256 u0 = _mm256_castps128_ps256(M2.r[0]); + u0 = _mm256_insertf128_ps(u0, M2.r[1], 1); + __m256 u1 = _mm256_castps128_ps256(M2.r[2]); + u1 = _mm256_insertf128_ps(u1, M2.r[3], 1); + + __m256 a0 = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(0, 0, 0, 0)); + __m256 a1 = _mm256_shuffle_ps(t1, t1, _MM_SHUFFLE(0, 0, 0, 0)); + __m256 b0 = _mm256_permute2f128_ps(u0, u0, 0x00); + __m256 c0 = _mm256_mul_ps(a0, b0); + __m256 c1 = _mm256_mul_ps(a1, b0); + + a0 = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(1, 1, 1, 1)); + a1 = _mm256_shuffle_ps(t1, t1, _MM_SHUFFLE(1, 1, 1, 1)); + b0 = _mm256_permute2f128_ps(u0, u0, 0x11); + __m256 c2 = _mm256_fmadd_ps(a0, b0, c0); + __m256 c3 = _mm256_fmadd_ps(a1, b0, c1); + + a0 = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(2, 2, 2, 2)); + a1 = _mm256_shuffle_ps(t1, t1, _MM_SHUFFLE(2, 2, 2, 2)); + __m256 b1 = _mm256_permute2f128_ps(u1, u1, 0x00); + __m256 c4 = _mm256_mul_ps(a0, b1); + __m256 c5 = 
_mm256_mul_ps(a1, b1); + + a0 = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(3, 3, 3, 3)); + a1 = _mm256_shuffle_ps(t1, t1, _MM_SHUFFLE(3, 3, 3, 3)); + b1 = _mm256_permute2f128_ps(u1, u1, 0x11); + __m256 c6 = _mm256_fmadd_ps(a0, b1, c4); + __m256 c7 = _mm256_fmadd_ps(a1, b1, c5); + + t0 = _mm256_add_ps(c2, c6); + t1 = _mm256_add_ps(c3, c7); + + XMMATRIX mResult; + mResult.r[0] = _mm256_castps256_ps128(t0); + mResult.r[1] = _mm256_extractf128_ps(t0, 1); + mResult.r[2] = _mm256_castps256_ps128(t1); + mResult.r[3] = _mm256_extractf128_ps(t1, 1); + return mResult; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX mResult; + // Splat the component X,Y,Z then W +#if defined(_XM_AVX_INTRINSICS_) + XMVECTOR vX = _mm_broadcast_ss(reinterpret_cast(&M1.r[0]) + 0); + XMVECTOR vY = _mm_broadcast_ss(reinterpret_cast(&M1.r[0]) + 1); + XMVECTOR vZ = _mm_broadcast_ss(reinterpret_cast(&M1.r[0]) + 2); + XMVECTOR vW = _mm_broadcast_ss(reinterpret_cast(&M1.r[0]) + 3); +#else + // Use vW to hold the original row + XMVECTOR vW = M1.r[0]; + XMVECTOR vX = XM_PERMUTE_PS(vW, _MM_SHUFFLE(0, 0, 0, 0)); + XMVECTOR vY = XM_PERMUTE_PS(vW, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR vZ = XM_PERMUTE_PS(vW, _MM_SHUFFLE(2, 2, 2, 2)); + vW = XM_PERMUTE_PS(vW, _MM_SHUFFLE(3, 3, 3, 3)); +#endif + // Perform the operation on the first row + vX = _mm_mul_ps(vX, M2.r[0]); + vY = _mm_mul_ps(vY, M2.r[1]); + vZ = _mm_mul_ps(vZ, M2.r[2]); + vW = _mm_mul_ps(vW, M2.r[3]); + // Perform a binary add to reduce cumulative errors + vX = _mm_add_ps(vX, vZ); + vY = _mm_add_ps(vY, vW); + vX = _mm_add_ps(vX, vY); + mResult.r[0] = vX; + // Repeat for the other 3 rows +#if defined(_XM_AVX_INTRINSICS_) + vX = _mm_broadcast_ss(reinterpret_cast(&M1.r[1]) + 0); + vY = _mm_broadcast_ss(reinterpret_cast(&M1.r[1]) + 1); + vZ = _mm_broadcast_ss(reinterpret_cast(&M1.r[1]) + 2); + vW = _mm_broadcast_ss(reinterpret_cast(&M1.r[1]) + 3); +#else + vW = M1.r[1]; + vX = XM_PERMUTE_PS(vW, _MM_SHUFFLE(0, 0, 0, 0)); + vY = XM_PERMUTE_PS(vW, _MM_SHUFFLE(1, 1, 1, 1)); + vZ = XM_PERMUTE_PS(vW, _MM_SHUFFLE(2, 2, 2, 2)); + vW = XM_PERMUTE_PS(vW, _MM_SHUFFLE(3, 3, 3, 3)); +#endif + vX = _mm_mul_ps(vX, M2.r[0]); + vY = _mm_mul_ps(vY, M2.r[1]); + vZ = _mm_mul_ps(vZ, M2.r[2]); + vW = _mm_mul_ps(vW, M2.r[3]); + vX = _mm_add_ps(vX, vZ); + vY = _mm_add_ps(vY, vW); + vX = _mm_add_ps(vX, vY); + mResult.r[1] = vX; +#if defined(_XM_AVX_INTRINSICS_) + vX = _mm_broadcast_ss(reinterpret_cast(&M1.r[2]) + 0); + vY = _mm_broadcast_ss(reinterpret_cast(&M1.r[2]) + 1); + vZ = _mm_broadcast_ss(reinterpret_cast(&M1.r[2]) + 2); + vW = _mm_broadcast_ss(reinterpret_cast(&M1.r[2]) + 3); +#else + vW = M1.r[2]; + vX = XM_PERMUTE_PS(vW, _MM_SHUFFLE(0, 0, 0, 0)); + vY = XM_PERMUTE_PS(vW, _MM_SHUFFLE(1, 1, 1, 1)); + vZ = XM_PERMUTE_PS(vW, _MM_SHUFFLE(2, 2, 2, 2)); + vW = XM_PERMUTE_PS(vW, _MM_SHUFFLE(3, 3, 3, 3)); +#endif + vX = _mm_mul_ps(vX, M2.r[0]); + vY = _mm_mul_ps(vY, M2.r[1]); + vZ = _mm_mul_ps(vZ, M2.r[2]); + vW = _mm_mul_ps(vW, M2.r[3]); + vX = _mm_add_ps(vX, vZ); + vY = _mm_add_ps(vY, vW); + vX = _mm_add_ps(vX, vY); + mResult.r[2] = vX; +#if defined(_XM_AVX_INTRINSICS_) + vX = _mm_broadcast_ss(reinterpret_cast(&M1.r[3]) + 0); + vY = _mm_broadcast_ss(reinterpret_cast(&M1.r[3]) + 1); + vZ = _mm_broadcast_ss(reinterpret_cast(&M1.r[3]) + 2); + vW = _mm_broadcast_ss(reinterpret_cast(&M1.r[3]) + 3); +#else + vW = M1.r[3]; + vX = XM_PERMUTE_PS(vW, _MM_SHUFFLE(0, 0, 0, 0)); + vY = XM_PERMUTE_PS(vW, _MM_SHUFFLE(1, 1, 1, 1)); + vZ = XM_PERMUTE_PS(vW, _MM_SHUFFLE(2, 2, 2, 2)); + vW = XM_PERMUTE_PS(vW, _MM_SHUFFLE(3, 
3, 3, 3)); +#endif + vX = _mm_mul_ps(vX, M2.r[0]); + vY = _mm_mul_ps(vY, M2.r[1]); + vZ = _mm_mul_ps(vZ, M2.r[2]); + vW = _mm_mul_ps(vW, M2.r[3]); + vX = _mm_add_ps(vX, vZ); + vY = _mm_add_ps(vY, vW); + vX = _mm_add_ps(vX, vY); + mResult.r[3] = vX; + return mResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixMultiplyTranspose +( + FXMMATRIX M1, + CXMMATRIX M2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMMATRIX mResult; + // Cache the invariants in registers + float x = M2.m[0][0]; + float y = M2.m[1][0]; + float z = M2.m[2][0]; + float w = M2.m[3][0]; + // Perform the operation on the first row + mResult.m[0][0] = (M1.m[0][0] * x) + (M1.m[0][1] * y) + (M1.m[0][2] * z) + (M1.m[0][3] * w); + mResult.m[0][1] = (M1.m[1][0] * x) + (M1.m[1][1] * y) + (M1.m[1][2] * z) + (M1.m[1][3] * w); + mResult.m[0][2] = (M1.m[2][0] * x) + (M1.m[2][1] * y) + (M1.m[2][2] * z) + (M1.m[2][3] * w); + mResult.m[0][3] = (M1.m[3][0] * x) + (M1.m[3][1] * y) + (M1.m[3][2] * z) + (M1.m[3][3] * w); + // Repeat for all the other rows + x = M2.m[0][1]; + y = M2.m[1][1]; + z = M2.m[2][1]; + w = M2.m[3][1]; + mResult.m[1][0] = (M1.m[0][0] * x) + (M1.m[0][1] * y) + (M1.m[0][2] * z) + (M1.m[0][3] * w); + mResult.m[1][1] = (M1.m[1][0] * x) + (M1.m[1][1] * y) + (M1.m[1][2] * z) + (M1.m[1][3] * w); + mResult.m[1][2] = (M1.m[2][0] * x) + (M1.m[2][1] * y) + (M1.m[2][2] * z) + (M1.m[2][3] * w); + mResult.m[1][3] = (M1.m[3][0] * x) + (M1.m[3][1] * y) + (M1.m[3][2] * z) + (M1.m[3][3] * w); + x = M2.m[0][2]; + y = M2.m[1][2]; + z = M2.m[2][2]; + w = M2.m[3][2]; + mResult.m[2][0] = (M1.m[0][0] * x) + (M1.m[0][1] * y) + (M1.m[0][2] * z) + (M1.m[0][3] * w); + mResult.m[2][1] = (M1.m[1][0] * x) + (M1.m[1][1] * y) + (M1.m[1][2] * z) + (M1.m[1][3] * w); + mResult.m[2][2] = (M1.m[2][0] * x) + (M1.m[2][1] * y) + (M1.m[2][2] * z) + (M1.m[2][3] * w); + mResult.m[2][3] = (M1.m[3][0] * x) + (M1.m[3][1] * y) + (M1.m[3][2] * z) + (M1.m[3][3] * w); + x = M2.m[0][3]; + y = M2.m[1][3]; + z = M2.m[2][3]; + w = M2.m[3][3]; + mResult.m[3][0] = (M1.m[0][0] * x) + (M1.m[0][1] * y) + (M1.m[0][2] * z) + (M1.m[0][3] * w); + mResult.m[3][1] = (M1.m[1][0] * x) + (M1.m[1][1] * y) + (M1.m[1][2] * z) + (M1.m[1][3] * w); + mResult.m[3][2] = (M1.m[2][0] * x) + (M1.m[2][1] * y) + (M1.m[2][2] * z) + (M1.m[2][3] * w); + mResult.m[3][3] = (M1.m[3][0] * x) + (M1.m[3][1] * y) + (M1.m[3][2] * z) + (M1.m[3][3] * w); + return mResult; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(M1.r[0]); + float32x2_t VH = vget_high_f32(M1.r[0]); + // Perform the operation on the first row + float32x4_t vX = vmulq_lane_f32(M2.r[0], VL, 0); + float32x4_t vY = vmulq_lane_f32(M2.r[1], VL, 1); + float32x4_t vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0); + float32x4_t vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1); + float32x4_t r0 = vaddq_f32(vZ, vW); + // Repeat for the other 3 rows + VL = vget_low_f32(M1.r[1]); + VH = vget_high_f32(M1.r[1]); + vX = vmulq_lane_f32(M2.r[0], VL, 0); + vY = vmulq_lane_f32(M2.r[1], VL, 1); + vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0); + vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1); + float32x4_t r1 = vaddq_f32(vZ, vW); + VL = vget_low_f32(M1.r[2]); + VH = vget_high_f32(M1.r[2]); + vX = vmulq_lane_f32(M2.r[0], VL, 0); + vY = vmulq_lane_f32(M2.r[1], VL, 1); + vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0); + vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1); + float32x4_t r2 = vaddq_f32(vZ, vW); + VL = vget_low_f32(M1.r[3]); + VH = vget_high_f32(M1.r[3]); + vX = 
vmulq_lane_f32(M2.r[0], VL, 0); + vY = vmulq_lane_f32(M2.r[1], VL, 1); + vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0); + vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1); + float32x4_t r3 = vaddq_f32(vZ, vW); + + // Transpose result + float32x4x2_t P0 = vzipq_f32(r0, r2); + float32x4x2_t P1 = vzipq_f32(r1, r3); + + float32x4x2_t T0 = vzipq_f32(P0.val[0], P1.val[0]); + float32x4x2_t T1 = vzipq_f32(P0.val[1], P1.val[1]); + + XMMATRIX mResult; + mResult.r[0] = T0.val[0]; + mResult.r[1] = T0.val[1]; + mResult.r[2] = T1.val[0]; + mResult.r[3] = T1.val[1]; + return mResult; +#elif defined(_XM_AVX2_INTRINSICS_) + __m256 t0 = _mm256_castps128_ps256(M1.r[0]); + t0 = _mm256_insertf128_ps(t0, M1.r[1], 1); + __m256 t1 = _mm256_castps128_ps256(M1.r[2]); + t1 = _mm256_insertf128_ps(t1, M1.r[3], 1); + + __m256 u0 = _mm256_castps128_ps256(M2.r[0]); + u0 = _mm256_insertf128_ps(u0, M2.r[1], 1); + __m256 u1 = _mm256_castps128_ps256(M2.r[2]); + u1 = _mm256_insertf128_ps(u1, M2.r[3], 1); + + __m256 a0 = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(0, 0, 0, 0)); + __m256 a1 = _mm256_shuffle_ps(t1, t1, _MM_SHUFFLE(0, 0, 0, 0)); + __m256 b0 = _mm256_permute2f128_ps(u0, u0, 0x00); + __m256 c0 = _mm256_mul_ps(a0, b0); + __m256 c1 = _mm256_mul_ps(a1, b0); + + a0 = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(1, 1, 1, 1)); + a1 = _mm256_shuffle_ps(t1, t1, _MM_SHUFFLE(1, 1, 1, 1)); + b0 = _mm256_permute2f128_ps(u0, u0, 0x11); + __m256 c2 = _mm256_fmadd_ps(a0, b0, c0); + __m256 c3 = _mm256_fmadd_ps(a1, b0, c1); + + a0 = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(2, 2, 2, 2)); + a1 = _mm256_shuffle_ps(t1, t1, _MM_SHUFFLE(2, 2, 2, 2)); + __m256 b1 = _mm256_permute2f128_ps(u1, u1, 0x00); + __m256 c4 = _mm256_mul_ps(a0, b1); + __m256 c5 = _mm256_mul_ps(a1, b1); + + a0 = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(3, 3, 3, 3)); + a1 = _mm256_shuffle_ps(t1, t1, _MM_SHUFFLE(3, 3, 3, 3)); + b1 = _mm256_permute2f128_ps(u1, u1, 0x11); + __m256 c6 = _mm256_fmadd_ps(a0, b1, c4); + __m256 c7 = _mm256_fmadd_ps(a1, b1, c5); + + t0 = _mm256_add_ps(c2, c6); + t1 = _mm256_add_ps(c3, c7); + + // Transpose result + __m256 vTemp = _mm256_unpacklo_ps(t0, t1); + __m256 vTemp2 = _mm256_unpackhi_ps(t0, t1); + __m256 vTemp3 = _mm256_permute2f128_ps(vTemp, vTemp2, 0x20); + __m256 vTemp4 = _mm256_permute2f128_ps(vTemp, vTemp2, 0x31); + vTemp = _mm256_unpacklo_ps(vTemp3, vTemp4); + vTemp2 = _mm256_unpackhi_ps(vTemp3, vTemp4); + t0 = _mm256_permute2f128_ps(vTemp, vTemp2, 0x20); + t1 = _mm256_permute2f128_ps(vTemp, vTemp2, 0x31); + + XMMATRIX mResult; + mResult.r[0] = _mm256_castps256_ps128(t0); + mResult.r[1] = _mm256_extractf128_ps(t0, 1); + mResult.r[2] = _mm256_castps256_ps128(t1); + mResult.r[3] = _mm256_extractf128_ps(t1, 1); + return mResult; +#elif defined(_XM_SSE_INTRINSICS_) + // Splat the component X,Y,Z then W +#if defined(_XM_AVX_INTRINSICS_) + XMVECTOR vX = _mm_broadcast_ss(reinterpret_cast(&M1.r[0]) + 0); + XMVECTOR vY = _mm_broadcast_ss(reinterpret_cast(&M1.r[0]) + 1); + XMVECTOR vZ = _mm_broadcast_ss(reinterpret_cast(&M1.r[0]) + 2); + XMVECTOR vW = _mm_broadcast_ss(reinterpret_cast(&M1.r[0]) + 3); +#else + // Use vW to hold the original row + XMVECTOR vW = M1.r[0]; + XMVECTOR vX = XM_PERMUTE_PS(vW, _MM_SHUFFLE(0, 0, 0, 0)); + XMVECTOR vY = XM_PERMUTE_PS(vW, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR vZ = XM_PERMUTE_PS(vW, _MM_SHUFFLE(2, 2, 2, 2)); + vW = XM_PERMUTE_PS(vW, _MM_SHUFFLE(3, 3, 3, 3)); +#endif + // Perform the operation on the first row + vX = _mm_mul_ps(vX, M2.r[0]); + vY = _mm_mul_ps(vY, M2.r[1]); + vZ = _mm_mul_ps(vZ, M2.r[2]); + vW = _mm_mul_ps(vW, M2.r[3]); + 
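+    // Note on the summation order used below: the four partial products are combined as
+    // (vX + vZ) + (vY + vW), a balanced pair-wise tree rather than a single running sum,
+    // which shortens the dependency chain and limits how rounding error accumulates.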
// Perform a binary add to reduce cumulative errors + vX = _mm_add_ps(vX, vZ); + vY = _mm_add_ps(vY, vW); + vX = _mm_add_ps(vX, vY); + XMVECTOR r0 = vX; + // Repeat for the other 3 rows +#if defined(_XM_AVX_INTRINSICS_) + vX = _mm_broadcast_ss(reinterpret_cast(&M1.r[1]) + 0); + vY = _mm_broadcast_ss(reinterpret_cast(&M1.r[1]) + 1); + vZ = _mm_broadcast_ss(reinterpret_cast(&M1.r[1]) + 2); + vW = _mm_broadcast_ss(reinterpret_cast(&M1.r[1]) + 3); +#else + vW = M1.r[1]; + vX = XM_PERMUTE_PS(vW, _MM_SHUFFLE(0, 0, 0, 0)); + vY = XM_PERMUTE_PS(vW, _MM_SHUFFLE(1, 1, 1, 1)); + vZ = XM_PERMUTE_PS(vW, _MM_SHUFFLE(2, 2, 2, 2)); + vW = XM_PERMUTE_PS(vW, _MM_SHUFFLE(3, 3, 3, 3)); +#endif + vX = _mm_mul_ps(vX, M2.r[0]); + vY = _mm_mul_ps(vY, M2.r[1]); + vZ = _mm_mul_ps(vZ, M2.r[2]); + vW = _mm_mul_ps(vW, M2.r[3]); + vX = _mm_add_ps(vX, vZ); + vY = _mm_add_ps(vY, vW); + vX = _mm_add_ps(vX, vY); + XMVECTOR r1 = vX; +#if defined(_XM_AVX_INTRINSICS_) + vX = _mm_broadcast_ss(reinterpret_cast(&M1.r[2]) + 0); + vY = _mm_broadcast_ss(reinterpret_cast(&M1.r[2]) + 1); + vZ = _mm_broadcast_ss(reinterpret_cast(&M1.r[2]) + 2); + vW = _mm_broadcast_ss(reinterpret_cast(&M1.r[2]) + 3); +#else + vW = M1.r[2]; + vX = XM_PERMUTE_PS(vW, _MM_SHUFFLE(0, 0, 0, 0)); + vY = XM_PERMUTE_PS(vW, _MM_SHUFFLE(1, 1, 1, 1)); + vZ = XM_PERMUTE_PS(vW, _MM_SHUFFLE(2, 2, 2, 2)); + vW = XM_PERMUTE_PS(vW, _MM_SHUFFLE(3, 3, 3, 3)); +#endif + vX = _mm_mul_ps(vX, M2.r[0]); + vY = _mm_mul_ps(vY, M2.r[1]); + vZ = _mm_mul_ps(vZ, M2.r[2]); + vW = _mm_mul_ps(vW, M2.r[3]); + vX = _mm_add_ps(vX, vZ); + vY = _mm_add_ps(vY, vW); + vX = _mm_add_ps(vX, vY); + XMVECTOR r2 = vX; +#if defined(_XM_AVX_INTRINSICS_) + vX = _mm_broadcast_ss(reinterpret_cast(&M1.r[3]) + 0); + vY = _mm_broadcast_ss(reinterpret_cast(&M1.r[3]) + 1); + vZ = _mm_broadcast_ss(reinterpret_cast(&M1.r[3]) + 2); + vW = _mm_broadcast_ss(reinterpret_cast(&M1.r[3]) + 3); +#else + vW = M1.r[3]; + vX = XM_PERMUTE_PS(vW, _MM_SHUFFLE(0, 0, 0, 0)); + vY = XM_PERMUTE_PS(vW, _MM_SHUFFLE(1, 1, 1, 1)); + vZ = XM_PERMUTE_PS(vW, _MM_SHUFFLE(2, 2, 2, 2)); + vW = XM_PERMUTE_PS(vW, _MM_SHUFFLE(3, 3, 3, 3)); +#endif + vX = _mm_mul_ps(vX, M2.r[0]); + vY = _mm_mul_ps(vY, M2.r[1]); + vZ = _mm_mul_ps(vZ, M2.r[2]); + vW = _mm_mul_ps(vW, M2.r[3]); + vX = _mm_add_ps(vX, vZ); + vY = _mm_add_ps(vY, vW); + vX = _mm_add_ps(vX, vY); + XMVECTOR r3 = vX; + + // Transpose result + // x.x,x.y,y.x,y.y + XMVECTOR vTemp1 = _mm_shuffle_ps(r0, r1, _MM_SHUFFLE(1, 0, 1, 0)); + // x.z,x.w,y.z,y.w + XMVECTOR vTemp3 = _mm_shuffle_ps(r0, r1, _MM_SHUFFLE(3, 2, 3, 2)); + // z.x,z.y,w.x,w.y + XMVECTOR vTemp2 = _mm_shuffle_ps(r2, r3, _MM_SHUFFLE(1, 0, 1, 0)); + // z.z,z.w,w.z,w.w + XMVECTOR vTemp4 = _mm_shuffle_ps(r2, r3, _MM_SHUFFLE(3, 2, 3, 2)); + + XMMATRIX mResult; + // x.x,y.x,z.x,w.x + mResult.r[0] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(2, 0, 2, 0)); + // x.y,y.y,z.y,w.y + mResult.r[1] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(3, 1, 3, 1)); + // x.z,y.z,z.z,w.z + mResult.r[2] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(2, 0, 2, 0)); + // x.w,y.w,z.w,w.w + mResult.r[3] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(3, 1, 3, 1)); + return mResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixTranspose(FXMMATRIX M) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + // Original matrix: + // + // m00m01m02m03 + // m10m11m12m13 + // m20m21m22m23 + // m30m31m32m33 + + XMMATRIX P; + P.r[0] = XMVectorMergeXY(M.r[0], M.r[2]); // m00m20m01m21 + 
P.r[1] = XMVectorMergeXY(M.r[1], M.r[3]); // m10m30m11m31 + P.r[2] = XMVectorMergeZW(M.r[0], M.r[2]); // m02m22m03m23 + P.r[3] = XMVectorMergeZW(M.r[1], M.r[3]); // m12m32m13m33 + + XMMATRIX MT; + MT.r[0] = XMVectorMergeXY(P.r[0], P.r[1]); // m00m10m20m30 + MT.r[1] = XMVectorMergeZW(P.r[0], P.r[1]); // m01m11m21m31 + MT.r[2] = XMVectorMergeXY(P.r[2], P.r[3]); // m02m12m22m32 + MT.r[3] = XMVectorMergeZW(P.r[2], P.r[3]); // m03m13m23m33 + return MT; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4x2_t P0 = vzipq_f32(M.r[0], M.r[2]); + float32x4x2_t P1 = vzipq_f32(M.r[1], M.r[3]); + + float32x4x2_t T0 = vzipq_f32(P0.val[0], P1.val[0]); + float32x4x2_t T1 = vzipq_f32(P0.val[1], P1.val[1]); + + XMMATRIX mResult; + mResult.r[0] = T0.val[0]; + mResult.r[1] = T0.val[1]; + mResult.r[2] = T1.val[0]; + mResult.r[3] = T1.val[1]; + return mResult; +#elif defined(_XM_AVX2_INTRINSICS_) + __m256 t0 = _mm256_castps128_ps256(M.r[0]); + t0 = _mm256_insertf128_ps(t0, M.r[1], 1); + __m256 t1 = _mm256_castps128_ps256(M.r[2]); + t1 = _mm256_insertf128_ps(t1, M.r[3], 1); + + __m256 vTemp = _mm256_unpacklo_ps(t0, t1); + __m256 vTemp2 = _mm256_unpackhi_ps(t0, t1); + __m256 vTemp3 = _mm256_permute2f128_ps(vTemp, vTemp2, 0x20); + __m256 vTemp4 = _mm256_permute2f128_ps(vTemp, vTemp2, 0x31); + vTemp = _mm256_unpacklo_ps(vTemp3, vTemp4); + vTemp2 = _mm256_unpackhi_ps(vTemp3, vTemp4); + t0 = _mm256_permute2f128_ps(vTemp, vTemp2, 0x20); + t1 = _mm256_permute2f128_ps(vTemp, vTemp2, 0x31); + + XMMATRIX mResult; + mResult.r[0] = _mm256_castps256_ps128(t0); + mResult.r[1] = _mm256_extractf128_ps(t0, 1); + mResult.r[2] = _mm256_castps256_ps128(t1); + mResult.r[3] = _mm256_extractf128_ps(t1, 1); + return mResult; +#elif defined(_XM_SSE_INTRINSICS_) + // x.x,x.y,y.x,y.y + XMVECTOR vTemp1 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(1, 0, 1, 0)); + // x.z,x.w,y.z,y.w + XMVECTOR vTemp3 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(3, 2, 3, 2)); + // z.x,z.y,w.x,w.y + XMVECTOR vTemp2 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(1, 0, 1, 0)); + // z.z,z.w,w.z,w.w + XMVECTOR vTemp4 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(3, 2, 3, 2)); + + XMMATRIX mResult; + // x.x,y.x,z.x,w.x + mResult.r[0] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(2, 0, 2, 0)); + // x.y,y.y,z.y,w.y + mResult.r[1] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(3, 1, 3, 1)); + // x.z,y.z,z.z,w.z + mResult.r[2] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(2, 0, 2, 0)); + // x.w,y.w,z.w,w.w + mResult.r[3] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(3, 1, 3, 1)); + return mResult; +#endif +} + +//------------------------------------------------------------------------------ +// Return the inverse and the determinant of a 4x4 matrix +_Use_decl_annotations_ +inline XMMATRIX XM_CALLCONV XMMatrixInverse +( + XMVECTOR* pDeterminant, + FXMMATRIX M +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + + XMMATRIX MT = XMMatrixTranspose(M); + + XMVECTOR V0[4], V1[4]; + V0[0] = XMVectorSwizzle(MT.r[2]); + V1[0] = XMVectorSwizzle(MT.r[3]); + V0[1] = XMVectorSwizzle(MT.r[0]); + V1[1] = XMVectorSwizzle(MT.r[1]); + V0[2] = XMVectorPermute(MT.r[2], MT.r[0]); + V1[2] = XMVectorPermute(MT.r[3], MT.r[1]); + + XMVECTOR D0 = XMVectorMultiply(V0[0], V1[0]); + XMVECTOR D1 = XMVectorMultiply(V0[1], V1[1]); + XMVECTOR D2 = XMVectorMultiply(V0[2], V1[2]); + + V0[0] = XMVectorSwizzle(MT.r[2]); + V1[0] = XMVectorSwizzle(MT.r[3]); + V0[1] = XMVectorSwizzle(MT.r[0]); + V1[1] = XMVectorSwizzle(MT.r[1]); + V0[2] = XMVectorPermute(MT.r[2], MT.r[0]); + V1[2] 
= XMVectorPermute(MT.r[3], MT.r[1]); + + D0 = XMVectorNegativeMultiplySubtract(V0[0], V1[0], D0); + D1 = XMVectorNegativeMultiplySubtract(V0[1], V1[1], D1); + D2 = XMVectorNegativeMultiplySubtract(V0[2], V1[2], D2); + + V0[0] = XMVectorSwizzle(MT.r[1]); + V1[0] = XMVectorPermute(D0, D2); + V0[1] = XMVectorSwizzle(MT.r[0]); + V1[1] = XMVectorPermute(D0, D2); + V0[2] = XMVectorSwizzle(MT.r[3]); + V1[2] = XMVectorPermute(D1, D2); + V0[3] = XMVectorSwizzle(MT.r[2]); + V1[3] = XMVectorPermute(D1, D2); + + XMVECTOR C0 = XMVectorMultiply(V0[0], V1[0]); + XMVECTOR C2 = XMVectorMultiply(V0[1], V1[1]); + XMVECTOR C4 = XMVectorMultiply(V0[2], V1[2]); + XMVECTOR C6 = XMVectorMultiply(V0[3], V1[3]); + + V0[0] = XMVectorSwizzle(MT.r[1]); + V1[0] = XMVectorPermute(D0, D2); + V0[1] = XMVectorSwizzle(MT.r[0]); + V1[1] = XMVectorPermute(D0, D2); + V0[2] = XMVectorSwizzle(MT.r[3]); + V1[2] = XMVectorPermute(D1, D2); + V0[3] = XMVectorSwizzle(MT.r[2]); + V1[3] = XMVectorPermute(D1, D2); + + C0 = XMVectorNegativeMultiplySubtract(V0[0], V1[0], C0); + C2 = XMVectorNegativeMultiplySubtract(V0[1], V1[1], C2); + C4 = XMVectorNegativeMultiplySubtract(V0[2], V1[2], C4); + C6 = XMVectorNegativeMultiplySubtract(V0[3], V1[3], C6); + + V0[0] = XMVectorSwizzle(MT.r[1]); + V1[0] = XMVectorPermute(D0, D2); + V0[1] = XMVectorSwizzle(MT.r[0]); + V1[1] = XMVectorPermute(D0, D2); + V0[2] = XMVectorSwizzle(MT.r[3]); + V1[2] = XMVectorPermute(D1, D2); + V0[3] = XMVectorSwizzle(MT.r[2]); + V1[3] = XMVectorPermute(D1, D2); + + XMVECTOR C1 = XMVectorNegativeMultiplySubtract(V0[0], V1[0], C0); + C0 = XMVectorMultiplyAdd(V0[0], V1[0], C0); + XMVECTOR C3 = XMVectorMultiplyAdd(V0[1], V1[1], C2); + C2 = XMVectorNegativeMultiplySubtract(V0[1], V1[1], C2); + XMVECTOR C5 = XMVectorNegativeMultiplySubtract(V0[2], V1[2], C4); + C4 = XMVectorMultiplyAdd(V0[2], V1[2], C4); + XMVECTOR C7 = XMVectorMultiplyAdd(V0[3], V1[3], C6); + C6 = XMVectorNegativeMultiplySubtract(V0[3], V1[3], C6); + + XMMATRIX R; + R.r[0] = XMVectorSelect(C0, C1, g_XMSelect0101.v); + R.r[1] = XMVectorSelect(C2, C3, g_XMSelect0101.v); + R.r[2] = XMVectorSelect(C4, C5, g_XMSelect0101.v); + R.r[3] = XMVectorSelect(C6, C7, g_XMSelect0101.v); + + XMVECTOR Determinant = XMVector4Dot(R.r[0], MT.r[0]); + + if (pDeterminant != nullptr) + *pDeterminant = Determinant; + + XMVECTOR Reciprocal = XMVectorReciprocal(Determinant); + + XMMATRIX Result; + Result.r[0] = XMVectorMultiply(R.r[0], Reciprocal); + Result.r[1] = XMVectorMultiply(R.r[1], Reciprocal); + Result.r[2] = XMVectorMultiply(R.r[2], Reciprocal); + Result.r[3] = XMVectorMultiply(R.r[3], Reciprocal); + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + // Transpose matrix + XMVECTOR vTemp1 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(1, 0, 1, 0)); + XMVECTOR vTemp3 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(3, 2, 3, 2)); + XMVECTOR vTemp2 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(1, 0, 1, 0)); + XMVECTOR vTemp4 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(3, 2, 3, 2)); + + XMMATRIX MT; + MT.r[0] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(2, 0, 2, 0)); + MT.r[1] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(3, 1, 3, 1)); + MT.r[2] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(2, 0, 2, 0)); + MT.r[3] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(3, 1, 3, 1)); + + XMVECTOR V00 = XM_PERMUTE_PS(MT.r[2], _MM_SHUFFLE(1, 1, 0, 0)); + XMVECTOR V10 = XM_PERMUTE_PS(MT.r[3], _MM_SHUFFLE(3, 2, 3, 2)); + XMVECTOR V01 = XM_PERMUTE_PS(MT.r[0], _MM_SHUFFLE(1, 1, 0, 0)); + XMVECTOR V11 = XM_PERMUTE_PS(MT.r[1], _MM_SHUFFLE(3, 2, 3, 2)); 
+ XMVECTOR V02 = _mm_shuffle_ps(MT.r[2], MT.r[0], _MM_SHUFFLE(2, 0, 2, 0)); + XMVECTOR V12 = _mm_shuffle_ps(MT.r[3], MT.r[1], _MM_SHUFFLE(3, 1, 3, 1)); + + XMVECTOR D0 = _mm_mul_ps(V00, V10); + XMVECTOR D1 = _mm_mul_ps(V01, V11); + XMVECTOR D2 = _mm_mul_ps(V02, V12); + + V00 = XM_PERMUTE_PS(MT.r[2], _MM_SHUFFLE(3, 2, 3, 2)); + V10 = XM_PERMUTE_PS(MT.r[3], _MM_SHUFFLE(1, 1, 0, 0)); + V01 = XM_PERMUTE_PS(MT.r[0], _MM_SHUFFLE(3, 2, 3, 2)); + V11 = XM_PERMUTE_PS(MT.r[1], _MM_SHUFFLE(1, 1, 0, 0)); + V02 = _mm_shuffle_ps(MT.r[2], MT.r[0], _MM_SHUFFLE(3, 1, 3, 1)); + V12 = _mm_shuffle_ps(MT.r[3], MT.r[1], _MM_SHUFFLE(2, 0, 2, 0)); + + D0 = XM_FNMADD_PS(V00, V10, D0); + D1 = XM_FNMADD_PS(V01, V11, D1); + D2 = XM_FNMADD_PS(V02, V12, D2); + // V11 = D0Y,D0W,D2Y,D2Y + V11 = _mm_shuffle_ps(D0, D2, _MM_SHUFFLE(1, 1, 3, 1)); + V00 = XM_PERMUTE_PS(MT.r[1], _MM_SHUFFLE(1, 0, 2, 1)); + V10 = _mm_shuffle_ps(V11, D0, _MM_SHUFFLE(0, 3, 0, 2)); + V01 = XM_PERMUTE_PS(MT.r[0], _MM_SHUFFLE(0, 1, 0, 2)); + V11 = _mm_shuffle_ps(V11, D0, _MM_SHUFFLE(2, 1, 2, 1)); + // V13 = D1Y,D1W,D2W,D2W + XMVECTOR V13 = _mm_shuffle_ps(D1, D2, _MM_SHUFFLE(3, 3, 3, 1)); + V02 = XM_PERMUTE_PS(MT.r[3], _MM_SHUFFLE(1, 0, 2, 1)); + V12 = _mm_shuffle_ps(V13, D1, _MM_SHUFFLE(0, 3, 0, 2)); + XMVECTOR V03 = XM_PERMUTE_PS(MT.r[2], _MM_SHUFFLE(0, 1, 0, 2)); + V13 = _mm_shuffle_ps(V13, D1, _MM_SHUFFLE(2, 1, 2, 1)); + + XMVECTOR C0 = _mm_mul_ps(V00, V10); + XMVECTOR C2 = _mm_mul_ps(V01, V11); + XMVECTOR C4 = _mm_mul_ps(V02, V12); + XMVECTOR C6 = _mm_mul_ps(V03, V13); + + // V11 = D0X,D0Y,D2X,D2X + V11 = _mm_shuffle_ps(D0, D2, _MM_SHUFFLE(0, 0, 1, 0)); + V00 = XM_PERMUTE_PS(MT.r[1], _MM_SHUFFLE(2, 1, 3, 2)); + V10 = _mm_shuffle_ps(D0, V11, _MM_SHUFFLE(2, 1, 0, 3)); + V01 = XM_PERMUTE_PS(MT.r[0], _MM_SHUFFLE(1, 3, 2, 3)); + V11 = _mm_shuffle_ps(D0, V11, _MM_SHUFFLE(0, 2, 1, 2)); + // V13 = D1X,D1Y,D2Z,D2Z + V13 = _mm_shuffle_ps(D1, D2, _MM_SHUFFLE(2, 2, 1, 0)); + V02 = XM_PERMUTE_PS(MT.r[3], _MM_SHUFFLE(2, 1, 3, 2)); + V12 = _mm_shuffle_ps(D1, V13, _MM_SHUFFLE(2, 1, 0, 3)); + V03 = XM_PERMUTE_PS(MT.r[2], _MM_SHUFFLE(1, 3, 2, 3)); + V13 = _mm_shuffle_ps(D1, V13, _MM_SHUFFLE(0, 2, 1, 2)); + + C0 = XM_FNMADD_PS(V00, V10, C0); + C2 = XM_FNMADD_PS(V01, V11, C2); + C4 = XM_FNMADD_PS(V02, V12, C4); + C6 = XM_FNMADD_PS(V03, V13, C6); + + V00 = XM_PERMUTE_PS(MT.r[1], _MM_SHUFFLE(0, 3, 0, 3)); + // V10 = D0Z,D0Z,D2X,D2Y + V10 = _mm_shuffle_ps(D0, D2, _MM_SHUFFLE(1, 0, 2, 2)); + V10 = XM_PERMUTE_PS(V10, _MM_SHUFFLE(0, 2, 3, 0)); + V01 = XM_PERMUTE_PS(MT.r[0], _MM_SHUFFLE(2, 0, 3, 1)); + // V11 = D0X,D0W,D2X,D2Y + V11 = _mm_shuffle_ps(D0, D2, _MM_SHUFFLE(1, 0, 3, 0)); + V11 = XM_PERMUTE_PS(V11, _MM_SHUFFLE(2, 1, 0, 3)); + V02 = XM_PERMUTE_PS(MT.r[3], _MM_SHUFFLE(0, 3, 0, 3)); + // V12 = D1Z,D1Z,D2Z,D2W + V12 = _mm_shuffle_ps(D1, D2, _MM_SHUFFLE(3, 2, 2, 2)); + V12 = XM_PERMUTE_PS(V12, _MM_SHUFFLE(0, 2, 3, 0)); + V03 = XM_PERMUTE_PS(MT.r[2], _MM_SHUFFLE(2, 0, 3, 1)); + // V13 = D1X,D1W,D2Z,D2W + V13 = _mm_shuffle_ps(D1, D2, _MM_SHUFFLE(3, 2, 3, 0)); + V13 = XM_PERMUTE_PS(V13, _MM_SHUFFLE(2, 1, 0, 3)); + + V00 = _mm_mul_ps(V00, V10); + V01 = _mm_mul_ps(V01, V11); + V02 = _mm_mul_ps(V02, V12); + V03 = _mm_mul_ps(V03, V13); + XMVECTOR C1 = _mm_sub_ps(C0, V00); + C0 = _mm_add_ps(C0, V00); + XMVECTOR C3 = _mm_add_ps(C2, V01); + C2 = _mm_sub_ps(C2, V01); + XMVECTOR C5 = _mm_sub_ps(C4, V02); + C4 = _mm_add_ps(C4, V02); + XMVECTOR C7 = _mm_add_ps(C6, V03); + C6 = _mm_sub_ps(C6, V03); + + C0 = _mm_shuffle_ps(C0, C1, _MM_SHUFFLE(3, 1, 2, 0)); + C2 = _mm_shuffle_ps(C2, 
C3, _MM_SHUFFLE(3, 1, 2, 0)); + C4 = _mm_shuffle_ps(C4, C5, _MM_SHUFFLE(3, 1, 2, 0)); + C6 = _mm_shuffle_ps(C6, C7, _MM_SHUFFLE(3, 1, 2, 0)); + C0 = XM_PERMUTE_PS(C0, _MM_SHUFFLE(3, 1, 2, 0)); + C2 = XM_PERMUTE_PS(C2, _MM_SHUFFLE(3, 1, 2, 0)); + C4 = XM_PERMUTE_PS(C4, _MM_SHUFFLE(3, 1, 2, 0)); + C6 = XM_PERMUTE_PS(C6, _MM_SHUFFLE(3, 1, 2, 0)); + // Get the determinant + XMVECTOR vTemp = XMVector4Dot(C0, MT.r[0]); + if (pDeterminant != nullptr) + *pDeterminant = vTemp; + vTemp = _mm_div_ps(g_XMOne, vTemp); + XMMATRIX mResult; + mResult.r[0] = _mm_mul_ps(C0, vTemp); + mResult.r[1] = _mm_mul_ps(C2, vTemp); + mResult.r[2] = _mm_mul_ps(C4, vTemp); + mResult.r[3] = _mm_mul_ps(C6, vTemp); + return mResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixVectorTensorProduct +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ + XMMATRIX mResult; + mResult.r[0] = XMVectorMultiply(XMVectorSwizzle<0, 0, 0, 0>(V1), V2); + mResult.r[1] = XMVectorMultiply(XMVectorSwizzle<1, 1, 1, 1>(V1), V2); + mResult.r[2] = XMVectorMultiply(XMVectorSwizzle<2, 2, 2, 2>(V1), V2); + mResult.r[3] = XMVectorMultiply(XMVectorSwizzle<3, 3, 3, 3>(V1), V2); + return mResult; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMMatrixDeterminant(FXMMATRIX M) noexcept +{ + static const XMVECTORF32 Sign = { { { 1.0f, -1.0f, 1.0f, -1.0f } } }; + + XMVECTOR V0 = XMVectorSwizzle(M.r[2]); + XMVECTOR V1 = XMVectorSwizzle(M.r[3]); + XMVECTOR V2 = XMVectorSwizzle(M.r[2]); + XMVECTOR V3 = XMVectorSwizzle(M.r[3]); + XMVECTOR V4 = XMVectorSwizzle(M.r[2]); + XMVECTOR V5 = XMVectorSwizzle(M.r[3]); + + XMVECTOR P0 = XMVectorMultiply(V0, V1); + XMVECTOR P1 = XMVectorMultiply(V2, V3); + XMVECTOR P2 = XMVectorMultiply(V4, V5); + + V0 = XMVectorSwizzle(M.r[2]); + V1 = XMVectorSwizzle(M.r[3]); + V2 = XMVectorSwizzle(M.r[2]); + V3 = XMVectorSwizzle(M.r[3]); + V4 = XMVectorSwizzle(M.r[2]); + V5 = XMVectorSwizzle(M.r[3]); + + P0 = XMVectorNegativeMultiplySubtract(V0, V1, P0); + P1 = XMVectorNegativeMultiplySubtract(V2, V3, P1); + P2 = XMVectorNegativeMultiplySubtract(V4, V5, P2); + + V0 = XMVectorSwizzle(M.r[1]); + V1 = XMVectorSwizzle(M.r[1]); + V2 = XMVectorSwizzle(M.r[1]); + + XMVECTOR S = XMVectorMultiply(M.r[0], Sign.v); + XMVECTOR R = XMVectorMultiply(V0, P0); + R = XMVectorNegativeMultiplySubtract(V1, P1, R); + R = XMVectorMultiplyAdd(V2, P2, R); + + return XMVector4Dot(S, R); +} + +#define XM3RANKDECOMPOSE(a, b, c, x, y, z) \ + if((x) < (y)) \ + { \ + if((y) < (z)) \ + { \ + (a) = 2; \ + (b) = 1; \ + (c) = 0; \ + } \ + else \ + { \ + (a) = 1; \ + \ + if((x) < (z)) \ + { \ + (b) = 2; \ + (c) = 0; \ + } \ + else \ + { \ + (b) = 0; \ + (c) = 2; \ + } \ + } \ + } \ + else \ + { \ + if((x) < (z)) \ + { \ + (a) = 2; \ + (b) = 0; \ + (c) = 1; \ + } \ + else \ + { \ + (a) = 0; \ + \ + if((y) < (z)) \ + { \ + (b) = 2; \ + (c) = 1; \ + } \ + else \ + { \ + (b) = 1; \ + (c) = 2; \ + } \ + } \ + } + +#define XM3_DECOMP_EPSILON 0.0001f + +_Use_decl_annotations_ +inline bool XM_CALLCONV XMMatrixDecompose +( + XMVECTOR* outScale, + XMVECTOR* outRotQuat, + XMVECTOR* outTrans, + FXMMATRIX M +) noexcept +{ + static const XMVECTOR* pvCanonicalBasis[3] = { + &g_XMIdentityR0.v, + &g_XMIdentityR1.v, + &g_XMIdentityR2.v + }; + + assert(outScale != nullptr); + assert(outRotQuat != nullptr); + assert(outTrans != nullptr); + + // Get the translation + outTrans[0] = M.r[3]; + + XMVECTOR* ppvBasis[3]; + XMMATRIX matTemp; + 
ppvBasis[0] = &matTemp.r[0]; + ppvBasis[1] = &matTemp.r[1]; + ppvBasis[2] = &matTemp.r[2]; + + matTemp.r[0] = M.r[0]; + matTemp.r[1] = M.r[1]; + matTemp.r[2] = M.r[2]; + matTemp.r[3] = g_XMIdentityR3.v; + + auto pfScales = reinterpret_cast<float*>(outScale); + + size_t a, b, c; + XMVectorGetXPtr(&pfScales[0], XMVector3Length(ppvBasis[0][0])); + XMVectorGetXPtr(&pfScales[1], XMVector3Length(ppvBasis[1][0])); + XMVectorGetXPtr(&pfScales[2], XMVector3Length(ppvBasis[2][0])); + pfScales[3] = 0.f; + + XM3RANKDECOMPOSE(a, b, c, pfScales[0], pfScales[1], pfScales[2]) + + if (pfScales[a] < XM3_DECOMP_EPSILON) + { + ppvBasis[a][0] = pvCanonicalBasis[a][0]; + } + ppvBasis[a][0] = XMVector3Normalize(ppvBasis[a][0]); + + if (pfScales[b] < XM3_DECOMP_EPSILON) + { + size_t aa, bb, cc; + float fAbsX, fAbsY, fAbsZ; + + fAbsX = fabsf(XMVectorGetX(ppvBasis[a][0])); + fAbsY = fabsf(XMVectorGetY(ppvBasis[a][0])); + fAbsZ = fabsf(XMVectorGetZ(ppvBasis[a][0])); + + XM3RANKDECOMPOSE(aa, bb, cc, fAbsX, fAbsY, fAbsZ) + + ppvBasis[b][0] = XMVector3Cross(ppvBasis[a][0], pvCanonicalBasis[cc][0]); + } + + ppvBasis[b][0] = XMVector3Normalize(ppvBasis[b][0]); + + if (pfScales[c] < XM3_DECOMP_EPSILON) + { + ppvBasis[c][0] = XMVector3Cross(ppvBasis[a][0], ppvBasis[b][0]); + } + + ppvBasis[c][0] = XMVector3Normalize(ppvBasis[c][0]); + + float fDet = XMVectorGetX(XMMatrixDeterminant(matTemp)); + + // use Cramer's rule to check for handedness of coordinate system + if (fDet < 0.0f) + { + // switch coordinate system by negating the scale and inverting the basis vector on the x-axis + pfScales[a] = -pfScales[a]; + ppvBasis[a][0] = XMVectorNegate(ppvBasis[a][0]); + + fDet = -fDet; + } + + fDet -= 1.0f; + fDet *= fDet; + + if (XM3_DECOMP_EPSILON < fDet) + { + // Non-SRT matrix encountered + return false; + } + + // generate the quaternion from the matrix + outRotQuat[0] = XMQuaternionRotationMatrix(matTemp); + return true; +} + +#undef XM3_DECOMP_EPSILON +#undef XM3RANKDECOMPOSE + +//------------------------------------------------------------------------------ +// Transformation operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixIdentity() noexcept +{ + XMMATRIX M; + M.r[0] = g_XMIdentityR0.v; + M.r[1] = g_XMIdentityR1.v; + M.r[2] = g_XMIdentityR2.v; + M.r[3] = g_XMIdentityR3.v; + return M; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixSet +( + float m00, float m01, float m02, float m03, + float m10, float m11, float m12, float m13, + float m20, float m21, float m22, float m23, + float m30, float m31, float m32, float m33 +) noexcept +{ + XMMATRIX M; +#if defined(_XM_NO_INTRINSICS_) + M.m[0][0] = m00; M.m[0][1] = m01; M.m[0][2] = m02; M.m[0][3] = m03; + M.m[1][0] = m10; M.m[1][1] = m11; M.m[1][2] = m12; M.m[1][3] = m13; + M.m[2][0] = m20; M.m[2][1] = m21; M.m[2][2] = m22; M.m[2][3] = m23; + M.m[3][0] = m30; M.m[3][1] = m31; M.m[3][2] = m32; M.m[3][3] = m33; +#else + M.r[0] = XMVectorSet(m00, m01, m02, m03); + M.r[1] = XMVectorSet(m10, m11, m12, m13); + M.r[2] = XMVectorSet(m20, m21, m22, m23); + M.r[3] = XMVectorSet(m30, m31, m32, m33); +#endif + return M; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixTranslation +( + float OffsetX, + float OffsetY, + float OffsetZ +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + 
XMMATRIX M; + M.m[0][0] = 1.0f; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = 1.0f; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = 1.0f; + M.m[2][3] = 0.0f; + + M.m[3][0] = OffsetX; + M.m[3][1] = OffsetY; + M.m[3][2] = OffsetZ; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + XMMATRIX M; + M.r[0] = g_XMIdentityR0.v; + M.r[1] = g_XMIdentityR1.v; + M.r[2] = g_XMIdentityR2.v; + M.r[3] = XMVectorSet(OffsetX, OffsetY, OffsetZ, 1.f); + return M; +#endif +} + + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixTranslationFromVector(FXMVECTOR Offset) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.m[0][0] = 1.0f; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = 1.0f; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = 1.0f; + M.m[2][3] = 0.0f; + + M.m[3][0] = Offset.vector4_f32[0]; + M.m[3][1] = Offset.vector4_f32[1]; + M.m[3][2] = Offset.vector4_f32[2]; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + XMMATRIX M; + M.r[0] = g_XMIdentityR0.v; + M.r[1] = g_XMIdentityR1.v; + M.r[2] = g_XMIdentityR2.v; + M.r[3] = XMVectorSelect(g_XMIdentityR3.v, Offset, g_XMSelect1110.v); + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixScaling +( + float ScaleX, + float ScaleY, + float ScaleZ +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.m[0][0] = ScaleX; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = ScaleY; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = ScaleZ; + M.m[2][3] = 0.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = 0.0f; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + const XMVECTOR Zero = vdupq_n_f32(0); + XMMATRIX M; + M.r[0] = vsetq_lane_f32(ScaleX, Zero, 0); + M.r[1] = vsetq_lane_f32(ScaleY, Zero, 1); + M.r[2] = vsetq_lane_f32(ScaleZ, Zero, 2); + M.r[3] = g_XMIdentityR3.v; + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + M.r[0] = _mm_set_ps(0, 0, 0, ScaleX); + M.r[1] = _mm_set_ps(0, 0, ScaleY, 0); + M.r[2] = _mm_set_ps(0, ScaleZ, 0, 0); + M.r[3] = g_XMIdentityR3.v; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixScalingFromVector(FXMVECTOR Scale) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.m[0][0] = Scale.vector4_f32[0]; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = Scale.vector4_f32[1]; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = Scale.vector4_f32[2]; + M.m[2][3] = 0.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = 0.0f; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMMATRIX M; + M.r[0] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(Scale), g_XMMaskX)); + M.r[1] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(Scale), g_XMMaskY)); + M.r[2] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(Scale), g_XMMaskZ)); + M.r[3] = g_XMIdentityR3.v; + return M; +#elif 
defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + M.r[0] = _mm_and_ps(Scale, g_XMMaskX); + M.r[1] = _mm_and_ps(Scale, g_XMMaskY); + M.r[2] = _mm_and_ps(Scale, g_XMMaskZ); + M.r[3] = g_XMIdentityR3.v; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixRotationX(float Angle) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + float fSinAngle; + float fCosAngle; + XMScalarSinCos(&fSinAngle, &fCosAngle, Angle); + + XMMATRIX M; + M.m[0][0] = 1.0f; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = fCosAngle; + M.m[1][2] = fSinAngle; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = -fSinAngle; + M.m[2][2] = fCosAngle; + M.m[2][3] = 0.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = 0.0f; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float fSinAngle; + float fCosAngle; + XMScalarSinCos(&fSinAngle, &fCosAngle, Angle); + + const float32x4_t Zero = vdupq_n_f32(0); + + float32x4_t T1 = vsetq_lane_f32(fCosAngle, Zero, 1); + T1 = vsetq_lane_f32(fSinAngle, T1, 2); + + float32x4_t T2 = vsetq_lane_f32(-fSinAngle, Zero, 1); + T2 = vsetq_lane_f32(fCosAngle, T2, 2); + + XMMATRIX M; + M.r[0] = g_XMIdentityR0.v; + M.r[1] = T1; + M.r[2] = T2; + M.r[3] = g_XMIdentityR3.v; + return M; +#elif defined(_XM_SSE_INTRINSICS_) + float SinAngle; + float CosAngle; + XMScalarSinCos(&SinAngle, &CosAngle, Angle); + + XMVECTOR vSin = _mm_set_ss(SinAngle); + XMVECTOR vCos = _mm_set_ss(CosAngle); + // x = 0,y = cos,z = sin, w = 0 + vCos = _mm_shuffle_ps(vCos, vSin, _MM_SHUFFLE(3, 0, 0, 3)); + XMMATRIX M; + M.r[0] = g_XMIdentityR0; + M.r[1] = vCos; + // x = 0,y = sin,z = cos, w = 0 + vCos = XM_PERMUTE_PS(vCos, _MM_SHUFFLE(3, 1, 2, 0)); + // x = 0,y = -sin,z = cos, w = 0 + vCos = _mm_mul_ps(vCos, g_XMNegateY); + M.r[2] = vCos; + M.r[3] = g_XMIdentityR3; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixRotationY(float Angle) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + float fSinAngle; + float fCosAngle; + XMScalarSinCos(&fSinAngle, &fCosAngle, Angle); + + XMMATRIX M; + M.m[0][0] = fCosAngle; + M.m[0][1] = 0.0f; + M.m[0][2] = -fSinAngle; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = 1.0f; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = fSinAngle; + M.m[2][1] = 0.0f; + M.m[2][2] = fCosAngle; + M.m[2][3] = 0.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = 0.0f; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float fSinAngle; + float fCosAngle; + XMScalarSinCos(&fSinAngle, &fCosAngle, Angle); + + const float32x4_t Zero = vdupq_n_f32(0); + + float32x4_t T0 = vsetq_lane_f32(fCosAngle, Zero, 0); + T0 = vsetq_lane_f32(-fSinAngle, T0, 2); + + float32x4_t T2 = vsetq_lane_f32(fSinAngle, Zero, 0); + T2 = vsetq_lane_f32(fCosAngle, T2, 2); + + XMMATRIX M; + M.r[0] = T0; + M.r[1] = g_XMIdentityR1.v; + M.r[2] = T2; + M.r[3] = g_XMIdentityR3.v; + return M; +#elif defined(_XM_SSE_INTRINSICS_) + float SinAngle; + float CosAngle; + XMScalarSinCos(&SinAngle, &CosAngle, Angle); + + XMVECTOR vSin = _mm_set_ss(SinAngle); + XMVECTOR vCos = _mm_set_ss(CosAngle); + // x = sin,y = 0,z = cos, w = 0 + vSin = _mm_shuffle_ps(vSin, vCos, _MM_SHUFFLE(3, 0, 3, 0)); + XMMATRIX M; + M.r[2] = vSin; + M.r[1] = g_XMIdentityR1; + // x = cos,y = 0,z = sin, w = 0 + vSin = XM_PERMUTE_PS(vSin, _MM_SHUFFLE(3, 0, 1, 2)); + // x = cos,y = 0,z 
= -sin, w = 0 + vSin = _mm_mul_ps(vSin, g_XMNegateZ); + M.r[0] = vSin; + M.r[3] = g_XMIdentityR3; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixRotationZ(float Angle) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + float fSinAngle; + float fCosAngle; + XMScalarSinCos(&fSinAngle, &fCosAngle, Angle); + + XMMATRIX M; + M.m[0][0] = fCosAngle; + M.m[0][1] = fSinAngle; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = -fSinAngle; + M.m[1][1] = fCosAngle; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = 1.0f; + M.m[2][3] = 0.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = 0.0f; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float fSinAngle; + float fCosAngle; + XMScalarSinCos(&fSinAngle, &fCosAngle, Angle); + + const float32x4_t Zero = vdupq_n_f32(0); + + float32x4_t T0 = vsetq_lane_f32(fCosAngle, Zero, 0); + T0 = vsetq_lane_f32(fSinAngle, T0, 1); + + float32x4_t T1 = vsetq_lane_f32(-fSinAngle, Zero, 0); + T1 = vsetq_lane_f32(fCosAngle, T1, 1); + + XMMATRIX M; + M.r[0] = T0; + M.r[1] = T1; + M.r[2] = g_XMIdentityR2.v; + M.r[3] = g_XMIdentityR3.v; + return M; +#elif defined(_XM_SSE_INTRINSICS_) + float SinAngle; + float CosAngle; + XMScalarSinCos(&SinAngle, &CosAngle, Angle); + + XMVECTOR vSin = _mm_set_ss(SinAngle); + XMVECTOR vCos = _mm_set_ss(CosAngle); + // x = cos,y = sin,z = 0, w = 0 + vCos = _mm_unpacklo_ps(vCos, vSin); + XMMATRIX M; + M.r[0] = vCos; + // x = sin,y = cos,z = 0, w = 0 + vCos = XM_PERMUTE_PS(vCos, _MM_SHUFFLE(3, 2, 0, 1)); + // x = cos,y = -sin,z = 0, w = 0 + vCos = _mm_mul_ps(vCos, g_XMNegateX); + M.r[1] = vCos; + M.r[2] = g_XMIdentityR2; + M.r[3] = g_XMIdentityR3; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixRotationRollPitchYaw +( + float Pitch, + float Yaw, + float Roll +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + float cp = cosf(Pitch); + float sp = sinf(Pitch); + + float cy = cosf(Yaw); + float sy = sinf(Yaw); + + float cr = cosf(Roll); + float sr = sinf(Roll); + + XMMATRIX M; + M.m[0][0] = cr * cy + sr * sp * sy; + M.m[0][1] = sr * cp; + M.m[0][2] = sr * sp * cy - cr * sy; + M.m[0][3] = 0.0f; + + M.m[1][0] = cr * sp * sy - sr * cy; + M.m[1][1] = cr * cp; + M.m[1][2] = sr * sy + cr * sp * cy; + M.m[1][3] = 0.0f; + + M.m[2][0] = cp * sy; + M.m[2][1] = -sp; + M.m[2][2] = cp * cy; + M.m[2][3] = 0.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = 0.0f; + M.m[3][3] = 1.0f; + return M; +#else + XMVECTOR Angles = XMVectorSet(Pitch, Yaw, Roll, 0.0f); + return XMMatrixRotationRollPitchYawFromVector(Angles); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixRotationRollPitchYawFromVector +( + FXMVECTOR Angles // +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + float cp = cosf(Angles.vector4_f32[0]); + float sp = sinf(Angles.vector4_f32[0]); + + float cy = cosf(Angles.vector4_f32[1]); + float sy = sinf(Angles.vector4_f32[1]); + + float cr = cosf(Angles.vector4_f32[2]); + float sr = sinf(Angles.vector4_f32[2]); + + XMMATRIX M; + M.m[0][0] = cr * cy + sr * sp * sy; + M.m[0][1] = sr * cp; + M.m[0][2] = sr * sp * cy - cr * sy; + M.m[0][3] = 0.0f; + + M.m[1][0] = cr * sp * sy - sr * cy; + M.m[1][1] = cr * cp; + M.m[1][2] = sr * sy + cr * sp * cy; + M.m[1][3] = 0.0f; + + M.m[2][0] = cp * sy; + 
M.m[2][1] = -sp; + M.m[2][2] = cp * cy; + M.m[2][3] = 0.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = 0.0f; + M.m[3][3] = 1.0f; + return M; +#else + static const XMVECTORF32 Sign = { { { 1.0f, -1.0f, -1.0f, 1.0f } } }; + + XMVECTOR SinAngles, CosAngles; + XMVectorSinCos(&SinAngles, &CosAngles, Angles); + + XMVECTOR P0 = XMVectorPermute(SinAngles, CosAngles); + XMVECTOR Y0 = XMVectorPermute(SinAngles, CosAngles); + XMVECTOR P1 = XMVectorPermute(SinAngles, CosAngles); + XMVECTOR Y1 = XMVectorPermute(SinAngles, CosAngles); + XMVECTOR P2 = XMVectorPermute(SinAngles, CosAngles); + XMVECTOR P3 = XMVectorPermute(SinAngles, CosAngles); + XMVECTOR Y2 = XMVectorSplatX(SinAngles); + XMVECTOR NS = XMVectorNegate(SinAngles); + + XMVECTOR Q0 = XMVectorMultiply(P0, Y0); + XMVECTOR Q1 = XMVectorMultiply(P1, Sign.v); + Q1 = XMVectorMultiply(Q1, Y1); + XMVECTOR Q2 = XMVectorMultiply(P2, Y2); + Q2 = XMVectorMultiplyAdd(Q2, P3, Q1); + + XMVECTOR V0 = XMVectorPermute(Q0, Q2); + XMVECTOR V1 = XMVectorPermute(Q0, Q2); + XMVECTOR V2 = XMVectorPermute(Q0, NS); + + XMMATRIX M; + M.r[0] = XMVectorSelect(g_XMZero, V0, g_XMSelect1110.v); + M.r[1] = XMVectorSelect(g_XMZero, V1, g_XMSelect1110.v); + M.r[2] = XMVectorSelect(g_XMZero, V2, g_XMSelect1110.v); + M.r[3] = g_XMIdentityR3; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixRotationNormal +( + FXMVECTOR NormalAxis, + float Angle +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + + float fSinAngle; + float fCosAngle; + XMScalarSinCos(&fSinAngle, &fCosAngle, Angle); + + XMVECTOR A = XMVectorSet(fSinAngle, fCosAngle, 1.0f - fCosAngle, 0.0f); + + XMVECTOR C2 = XMVectorSplatZ(A); + XMVECTOR C1 = XMVectorSplatY(A); + XMVECTOR C0 = XMVectorSplatX(A); + + XMVECTOR N0 = XMVectorSwizzle(NormalAxis); + XMVECTOR N1 = XMVectorSwizzle(NormalAxis); + + XMVECTOR V0 = XMVectorMultiply(C2, N0); + V0 = XMVectorMultiply(V0, N1); + + XMVECTOR R0 = XMVectorMultiply(C2, NormalAxis); + R0 = XMVectorMultiplyAdd(R0, NormalAxis, C1); + + XMVECTOR R1 = XMVectorMultiplyAdd(C0, NormalAxis, V0); + XMVECTOR R2 = XMVectorNegativeMultiplySubtract(C0, NormalAxis, V0); + + V0 = XMVectorSelect(A, R0, g_XMSelect1110.v); + XMVECTOR V1 = XMVectorPermute(R1, R2); + XMVECTOR V2 = XMVectorPermute(R1, R2); + + XMMATRIX M; + M.r[0] = XMVectorPermute(V0, V1); + M.r[1] = XMVectorPermute(V0, V1); + M.r[2] = XMVectorPermute(V0, V2); + M.r[3] = g_XMIdentityR3.v; + return M; + +#elif defined(_XM_SSE_INTRINSICS_) + float fSinAngle; + float fCosAngle; + XMScalarSinCos(&fSinAngle, &fCosAngle, Angle); + + XMVECTOR C2 = _mm_set_ps1(1.0f - fCosAngle); + XMVECTOR C1 = _mm_set_ps1(fCosAngle); + XMVECTOR C0 = _mm_set_ps1(fSinAngle); + + XMVECTOR N0 = XM_PERMUTE_PS(NormalAxis, _MM_SHUFFLE(3, 0, 2, 1)); + XMVECTOR N1 = XM_PERMUTE_PS(NormalAxis, _MM_SHUFFLE(3, 1, 0, 2)); + + XMVECTOR V0 = _mm_mul_ps(C2, N0); + V0 = _mm_mul_ps(V0, N1); + + XMVECTOR R0 = _mm_mul_ps(C2, NormalAxis); + R0 = _mm_mul_ps(R0, NormalAxis); + R0 = _mm_add_ps(R0, C1); + + XMVECTOR R1 = _mm_mul_ps(C0, NormalAxis); + R1 = _mm_add_ps(R1, V0); + XMVECTOR R2 = _mm_mul_ps(C0, NormalAxis); + R2 = _mm_sub_ps(V0, R2); + + V0 = _mm_and_ps(R0, g_XMMask3); + XMVECTOR V1 = _mm_shuffle_ps(R1, R2, _MM_SHUFFLE(2, 1, 2, 0)); + V1 = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 3, 2, 1)); + XMVECTOR V2 = _mm_shuffle_ps(R1, R2, _MM_SHUFFLE(0, 0, 1, 1)); + V2 = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 0, 2, 0)); + + R2 = _mm_shuffle_ps(V0, V1, 
_MM_SHUFFLE(1, 0, 3, 0)); + R2 = XM_PERMUTE_PS(R2, _MM_SHUFFLE(1, 3, 2, 0)); + + XMMATRIX M; + M.r[0] = R2; + + R2 = _mm_shuffle_ps(V0, V1, _MM_SHUFFLE(3, 2, 3, 1)); + R2 = XM_PERMUTE_PS(R2, _MM_SHUFFLE(1, 3, 0, 2)); + M.r[1] = R2; + + V2 = _mm_shuffle_ps(V2, V0, _MM_SHUFFLE(3, 2, 1, 0)); + M.r[2] = V2; + M.r[3] = g_XMIdentityR3.v; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixRotationAxis +( + FXMVECTOR Axis, + float Angle +) noexcept +{ + assert(!XMVector3Equal(Axis, XMVectorZero())); + assert(!XMVector3IsInfinite(Axis)); + + XMVECTOR Normal = XMVector3Normalize(Axis); + return XMMatrixRotationNormal(Normal, Angle); +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixRotationQuaternion(FXMVECTOR Quaternion) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + float qx = Quaternion.vector4_f32[0]; + float qxx = qx * qx; + + float qy = Quaternion.vector4_f32[1]; + float qyy = qy * qy; + + float qz = Quaternion.vector4_f32[2]; + float qzz = qz * qz; + + float qw = Quaternion.vector4_f32[3]; + + XMMATRIX M; + M.m[0][0] = 1.f - 2.f * qyy - 2.f * qzz; + M.m[0][1] = 2.f * qx * qy + 2.f * qz * qw; + M.m[0][2] = 2.f * qx * qz - 2.f * qy * qw; + M.m[0][3] = 0.f; + + M.m[1][0] = 2.f * qx * qy - 2.f * qz * qw; + M.m[1][1] = 1.f - 2.f * qxx - 2.f * qzz; + M.m[1][2] = 2.f * qy * qz + 2.f * qx * qw; + M.m[1][3] = 0.f; + + M.m[2][0] = 2.f * qx * qz + 2.f * qy * qw; + M.m[2][1] = 2.f * qy * qz - 2.f * qx * qw; + M.m[2][2] = 1.f - 2.f * qxx - 2.f * qyy; + M.m[2][3] = 0.f; + + M.m[3][0] = 0.f; + M.m[3][1] = 0.f; + M.m[3][2] = 0.f; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 Constant1110 = { { { 1.0f, 1.0f, 1.0f, 0.0f } } }; + + XMVECTOR Q0 = XMVectorAdd(Quaternion, Quaternion); + XMVECTOR Q1 = XMVectorMultiply(Quaternion, Q0); + + XMVECTOR V0 = XMVectorPermute(Q1, Constant1110.v); + XMVECTOR V1 = XMVectorPermute(Q1, Constant1110.v); + XMVECTOR R0 = XMVectorSubtract(Constant1110, V0); + R0 = XMVectorSubtract(R0, V1); + + V0 = XMVectorSwizzle(Quaternion); + V1 = XMVectorSwizzle(Q0); + V0 = XMVectorMultiply(V0, V1); + + V1 = XMVectorSplatW(Quaternion); + XMVECTOR V2 = XMVectorSwizzle(Q0); + V1 = XMVectorMultiply(V1, V2); + + XMVECTOR R1 = XMVectorAdd(V0, V1); + XMVECTOR R2 = XMVectorSubtract(V0, V1); + + V0 = XMVectorPermute(R1, R2); + V1 = XMVectorPermute(R1, R2); + + XMMATRIX M; + M.r[0] = XMVectorPermute(R0, V0); + M.r[1] = XMVectorPermute(R0, V0); + M.r[2] = XMVectorPermute(R0, V1); + M.r[3] = g_XMIdentityR3.v; + return M; + +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Constant1110 = { { { 1.0f, 1.0f, 1.0f, 0.0f } } }; + + XMVECTOR Q0 = _mm_add_ps(Quaternion, Quaternion); + XMVECTOR Q1 = _mm_mul_ps(Quaternion, Q0); + + XMVECTOR V0 = XM_PERMUTE_PS(Q1, _MM_SHUFFLE(3, 0, 0, 1)); + V0 = _mm_and_ps(V0, g_XMMask3); + XMVECTOR V1 = XM_PERMUTE_PS(Q1, _MM_SHUFFLE(3, 1, 2, 2)); + V1 = _mm_and_ps(V1, g_XMMask3); + XMVECTOR R0 = _mm_sub_ps(Constant1110, V0); + R0 = _mm_sub_ps(R0, V1); + + V0 = XM_PERMUTE_PS(Quaternion, _MM_SHUFFLE(3, 1, 0, 0)); + V1 = XM_PERMUTE_PS(Q0, _MM_SHUFFLE(3, 2, 1, 2)); + V0 = _mm_mul_ps(V0, V1); + + V1 = XM_PERMUTE_PS(Quaternion, _MM_SHUFFLE(3, 3, 3, 3)); + XMVECTOR V2 = XM_PERMUTE_PS(Q0, _MM_SHUFFLE(3, 0, 2, 1)); + V1 = _mm_mul_ps(V1, V2); + + XMVECTOR R1 = _mm_add_ps(V0, V1); + XMVECTOR R2 = _mm_sub_ps(V0, V1); + + V0 = _mm_shuffle_ps(R1, R2, 
_MM_SHUFFLE(1, 0, 2, 1)); + V0 = XM_PERMUTE_PS(V0, _MM_SHUFFLE(1, 3, 2, 0)); + V1 = _mm_shuffle_ps(R1, R2, _MM_SHUFFLE(2, 2, 0, 0)); + V1 = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 0, 2, 0)); + + Q1 = _mm_shuffle_ps(R0, V0, _MM_SHUFFLE(1, 0, 3, 0)); + Q1 = XM_PERMUTE_PS(Q1, _MM_SHUFFLE(1, 3, 2, 0)); + + XMMATRIX M; + M.r[0] = Q1; + + Q1 = _mm_shuffle_ps(R0, V0, _MM_SHUFFLE(3, 2, 3, 1)); + Q1 = XM_PERMUTE_PS(Q1, _MM_SHUFFLE(1, 3, 0, 2)); + M.r[1] = Q1; + + Q1 = _mm_shuffle_ps(V1, R0, _MM_SHUFFLE(3, 2, 1, 0)); + M.r[2] = Q1; + M.r[3] = g_XMIdentityR3; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixTransformation2D +( + FXMVECTOR ScalingOrigin, + float ScalingOrientation, + FXMVECTOR Scaling, + FXMVECTOR RotationOrigin, + float Rotation, + GXMVECTOR Translation +) noexcept +{ + // M = Inverse(MScalingOrigin) * Transpose(MScalingOrientation) * MScaling * MScalingOrientation * + // MScalingOrigin * Inverse(MRotationOrigin) * MRotation * MRotationOrigin * MTranslation; + + XMVECTOR VScalingOrigin = XMVectorSelect(g_XMSelect1100.v, ScalingOrigin, g_XMSelect1100.v); + XMVECTOR NegScalingOrigin = XMVectorNegate(VScalingOrigin); + + XMMATRIX MScalingOriginI = XMMatrixTranslationFromVector(NegScalingOrigin); + XMMATRIX MScalingOrientation = XMMatrixRotationZ(ScalingOrientation); + XMMATRIX MScalingOrientationT = XMMatrixTranspose(MScalingOrientation); + XMVECTOR VScaling = XMVectorSelect(g_XMOne.v, Scaling, g_XMSelect1100.v); + XMMATRIX MScaling = XMMatrixScalingFromVector(VScaling); + XMVECTOR VRotationOrigin = XMVectorSelect(g_XMSelect1100.v, RotationOrigin, g_XMSelect1100.v); + XMMATRIX MRotation = XMMatrixRotationZ(Rotation); + XMVECTOR VTranslation = XMVectorSelect(g_XMSelect1100.v, Translation, g_XMSelect1100.v); + + XMMATRIX M = XMMatrixMultiply(MScalingOriginI, MScalingOrientationT); + M = XMMatrixMultiply(M, MScaling); + M = XMMatrixMultiply(M, MScalingOrientation); + M.r[3] = XMVectorAdd(M.r[3], VScalingOrigin); + M.r[3] = XMVectorSubtract(M.r[3], VRotationOrigin); + M = XMMatrixMultiply(M, MRotation); + M.r[3] = XMVectorAdd(M.r[3], VRotationOrigin); + M.r[3] = XMVectorAdd(M.r[3], VTranslation); + + return M; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixTransformation +( + FXMVECTOR ScalingOrigin, + FXMVECTOR ScalingOrientationQuaternion, + FXMVECTOR Scaling, + GXMVECTOR RotationOrigin, + HXMVECTOR RotationQuaternion, + HXMVECTOR Translation +) noexcept +{ + // M = Inverse(MScalingOrigin) * Transpose(MScalingOrientation) * MScaling * MScalingOrientation * + // MScalingOrigin * Inverse(MRotationOrigin) * MRotation * MRotationOrigin * MTranslation; + + XMVECTOR VScalingOrigin = XMVectorSelect(g_XMSelect1110.v, ScalingOrigin, g_XMSelect1110.v); + XMVECTOR NegScalingOrigin = XMVectorNegate(ScalingOrigin); + + XMMATRIX MScalingOriginI = XMMatrixTranslationFromVector(NegScalingOrigin); + XMMATRIX MScalingOrientation = XMMatrixRotationQuaternion(ScalingOrientationQuaternion); + XMMATRIX MScalingOrientationT = XMMatrixTranspose(MScalingOrientation); + XMMATRIX MScaling = XMMatrixScalingFromVector(Scaling); + XMVECTOR VRotationOrigin = XMVectorSelect(g_XMSelect1110.v, RotationOrigin, g_XMSelect1110.v); + XMMATRIX MRotation = XMMatrixRotationQuaternion(RotationQuaternion); + XMVECTOR VTranslation = XMVectorSelect(g_XMSelect1110.v, Translation, g_XMSelect1110.v); + + XMMATRIX M; + M = XMMatrixMultiply(MScalingOriginI, 
MScalingOrientationT); + M = XMMatrixMultiply(M, MScaling); + M = XMMatrixMultiply(M, MScalingOrientation); + M.r[3] = XMVectorAdd(M.r[3], VScalingOrigin); + M.r[3] = XMVectorSubtract(M.r[3], VRotationOrigin); + M = XMMatrixMultiply(M, MRotation); + M.r[3] = XMVectorAdd(M.r[3], VRotationOrigin); + M.r[3] = XMVectorAdd(M.r[3], VTranslation); + return M; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixAffineTransformation2D +( + FXMVECTOR Scaling, + FXMVECTOR RotationOrigin, + float Rotation, + FXMVECTOR Translation +) noexcept +{ + // M = MScaling * Inverse(MRotationOrigin) * MRotation * MRotationOrigin * MTranslation; + + XMVECTOR VScaling = XMVectorSelect(g_XMOne.v, Scaling, g_XMSelect1100.v); + XMMATRIX MScaling = XMMatrixScalingFromVector(VScaling); + XMVECTOR VRotationOrigin = XMVectorSelect(g_XMSelect1100.v, RotationOrigin, g_XMSelect1100.v); + XMMATRIX MRotation = XMMatrixRotationZ(Rotation); + XMVECTOR VTranslation = XMVectorSelect(g_XMSelect1100.v, Translation, g_XMSelect1100.v); + + XMMATRIX M; + M = MScaling; + M.r[3] = XMVectorSubtract(M.r[3], VRotationOrigin); + M = XMMatrixMultiply(M, MRotation); + M.r[3] = XMVectorAdd(M.r[3], VRotationOrigin); + M.r[3] = XMVectorAdd(M.r[3], VTranslation); + return M; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixAffineTransformation +( + FXMVECTOR Scaling, + FXMVECTOR RotationOrigin, + FXMVECTOR RotationQuaternion, + GXMVECTOR Translation +) noexcept +{ + // M = MScaling * Inverse(MRotationOrigin) * MRotation * MRotationOrigin * MTranslation; + + XMMATRIX MScaling = XMMatrixScalingFromVector(Scaling); + XMVECTOR VRotationOrigin = XMVectorSelect(g_XMSelect1110.v, RotationOrigin, g_XMSelect1110.v); + XMMATRIX MRotation = XMMatrixRotationQuaternion(RotationQuaternion); + XMVECTOR VTranslation = XMVectorSelect(g_XMSelect1110.v, Translation, g_XMSelect1110.v); + + XMMATRIX M; + M = MScaling; + M.r[3] = XMVectorSubtract(M.r[3], VRotationOrigin); + M = XMMatrixMultiply(M, MRotation); + M.r[3] = XMVectorAdd(M.r[3], VRotationOrigin); + M.r[3] = XMVectorAdd(M.r[3], VTranslation); + return M; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixReflect(FXMVECTOR ReflectionPlane) noexcept +{ + assert(!XMVector3Equal(ReflectionPlane, XMVectorZero())); + assert(!XMPlaneIsInfinite(ReflectionPlane)); + + static const XMVECTORF32 NegativeTwo = { { { -2.0f, -2.0f, -2.0f, 0.0f } } }; + + XMVECTOR P = XMPlaneNormalize(ReflectionPlane); + XMVECTOR S = XMVectorMultiply(P, NegativeTwo); + + XMVECTOR A = XMVectorSplatX(P); + XMVECTOR B = XMVectorSplatY(P); + XMVECTOR C = XMVectorSplatZ(P); + XMVECTOR D = XMVectorSplatW(P); + + XMMATRIX M; + M.r[0] = XMVectorMultiplyAdd(A, S, g_XMIdentityR0.v); + M.r[1] = XMVectorMultiplyAdd(B, S, g_XMIdentityR1.v); + M.r[2] = XMVectorMultiplyAdd(C, S, g_XMIdentityR2.v); + M.r[3] = XMVectorMultiplyAdd(D, S, g_XMIdentityR3.v); + return M; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixShadow +( + FXMVECTOR ShadowPlane, + FXMVECTOR LightPosition +) noexcept +{ + static const XMVECTORU32 Select0001 = { { { XM_SELECT_0, XM_SELECT_0, XM_SELECT_0, XM_SELECT_1 } } }; + + assert(!XMVector3Equal(ShadowPlane, XMVectorZero())); + assert(!XMPlaneIsInfinite(ShadowPlane)); + + XMVECTOR P = XMPlaneNormalize(ShadowPlane); + XMVECTOR Dot = 
XMPlaneDot(P, LightPosition); + P = XMVectorNegate(P); + XMVECTOR D = XMVectorSplatW(P); + XMVECTOR C = XMVectorSplatZ(P); + XMVECTOR B = XMVectorSplatY(P); + XMVECTOR A = XMVectorSplatX(P); + Dot = XMVectorSelect(Select0001.v, Dot, Select0001.v); + + XMMATRIX M; + M.r[3] = XMVectorMultiplyAdd(D, LightPosition, Dot); + Dot = XMVectorRotateLeft(Dot, 1); + M.r[2] = XMVectorMultiplyAdd(C, LightPosition, Dot); + Dot = XMVectorRotateLeft(Dot, 1); + M.r[1] = XMVectorMultiplyAdd(B, LightPosition, Dot); + Dot = XMVectorRotateLeft(Dot, 1); + M.r[0] = XMVectorMultiplyAdd(A, LightPosition, Dot); + return M; +} + +//------------------------------------------------------------------------------ +// View and projection initialization operations +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixLookAtLH +( + FXMVECTOR EyePosition, + FXMVECTOR FocusPosition, + FXMVECTOR UpDirection +) noexcept +{ + XMVECTOR EyeDirection = XMVectorSubtract(FocusPosition, EyePosition); + return XMMatrixLookToLH(EyePosition, EyeDirection, UpDirection); +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixLookAtRH +( + FXMVECTOR EyePosition, + FXMVECTOR FocusPosition, + FXMVECTOR UpDirection +) noexcept +{ + XMVECTOR NegEyeDirection = XMVectorSubtract(EyePosition, FocusPosition); + return XMMatrixLookToLH(EyePosition, NegEyeDirection, UpDirection); +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixLookToLH +( + FXMVECTOR EyePosition, + FXMVECTOR EyeDirection, + FXMVECTOR UpDirection +) noexcept +{ + assert(!XMVector3Equal(EyeDirection, XMVectorZero())); + assert(!XMVector3IsInfinite(EyeDirection)); + assert(!XMVector3Equal(UpDirection, XMVectorZero())); + assert(!XMVector3IsInfinite(UpDirection)); + + XMVECTOR R2 = XMVector3Normalize(EyeDirection); + + XMVECTOR R0 = XMVector3Cross(UpDirection, R2); + R0 = XMVector3Normalize(R0); + + XMVECTOR R1 = XMVector3Cross(R2, R0); + + XMVECTOR NegEyePosition = XMVectorNegate(EyePosition); + + XMVECTOR D0 = XMVector3Dot(R0, NegEyePosition); + XMVECTOR D1 = XMVector3Dot(R1, NegEyePosition); + XMVECTOR D2 = XMVector3Dot(R2, NegEyePosition); + + XMMATRIX M; + M.r[0] = XMVectorSelect(D0, R0, g_XMSelect1110.v); + M.r[1] = XMVectorSelect(D1, R1, g_XMSelect1110.v); + M.r[2] = XMVectorSelect(D2, R2, g_XMSelect1110.v); + M.r[3] = g_XMIdentityR3.v; + + M = XMMatrixTranspose(M); + + return M; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixLookToRH +( + FXMVECTOR EyePosition, + FXMVECTOR EyeDirection, + FXMVECTOR UpDirection +) noexcept +{ + XMVECTOR NegEyeDirection = XMVectorNegate(EyeDirection); + return XMMatrixLookToLH(EyePosition, NegEyeDirection, UpDirection); +} + +//------------------------------------------------------------------------------ + +#ifdef _PREFAST_ +#pragma prefast(push) +#pragma prefast(disable:28931, "PREfast noise: Esp:1266") +#endif + +inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveLH +( + float ViewWidth, + float ViewHeight, + float NearZ, + float FarZ +) noexcept +{ + assert(NearZ > 0.f && FarZ > 0.f); + assert(!XMScalarNearEqual(ViewWidth, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(ViewHeight, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float TwoNearZ = NearZ + NearZ; + float fRange = FarZ / (FarZ - NearZ); + 
+ XMMATRIX M; + M.m[0][0] = TwoNearZ / ViewWidth; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = TwoNearZ / ViewHeight; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = fRange; + M.m[2][3] = 1.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = -fRange * NearZ; + M.m[3][3] = 0.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float TwoNearZ = NearZ + NearZ; + float fRange = FarZ / (FarZ - NearZ); + const float32x4_t Zero = vdupq_n_f32(0); + XMMATRIX M; + M.r[0] = vsetq_lane_f32(TwoNearZ / ViewWidth, Zero, 0); + M.r[1] = vsetq_lane_f32(TwoNearZ / ViewHeight, Zero, 1); + M.r[2] = vsetq_lane_f32(fRange, g_XMIdentityR3.v, 2); + M.r[3] = vsetq_lane_f32(-fRange * NearZ, Zero, 2); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + float TwoNearZ = NearZ + NearZ; + float fRange = FarZ / (FarZ - NearZ); + // Note: This is recorded on the stack + XMVECTOR rMem = { + TwoNearZ / ViewWidth, + TwoNearZ / ViewHeight, + fRange, + -fRange * NearZ + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp, vValues); + // TwoNearZ / ViewWidth,0,0,0 + M.r[0] = vTemp; + // 0,TwoNearZ / ViewHeight,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp, g_XMMaskY); + M.r[1] = vTemp; + // x=fRange,y=-fRange * NearZ,0,1.0f + vValues = _mm_shuffle_ps(vValues, g_XMIdentityR3, _MM_SHUFFLE(3, 2, 3, 2)); + // 0,0,fRange,1.0f + vTemp = _mm_setzero_ps(); + vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(3, 0, 0, 0)); + M.r[2] = vTemp; + // 0,0,-fRange * NearZ,0 + vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(2, 1, 0, 0)); + M.r[3] = vTemp; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveRH +( + float ViewWidth, + float ViewHeight, + float NearZ, + float FarZ +) noexcept +{ + assert(NearZ > 0.f && FarZ > 0.f); + assert(!XMScalarNearEqual(ViewWidth, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(ViewHeight, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float TwoNearZ = NearZ + NearZ; + float fRange = FarZ / (NearZ - FarZ); + + XMMATRIX M; + M.m[0][0] = TwoNearZ / ViewWidth; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = TwoNearZ / ViewHeight; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = fRange; + M.m[2][3] = -1.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = fRange * NearZ; + M.m[3][3] = 0.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float TwoNearZ = NearZ + NearZ; + float fRange = FarZ / (NearZ - FarZ); + const float32x4_t Zero = vdupq_n_f32(0); + + XMMATRIX M; + M.r[0] = vsetq_lane_f32(TwoNearZ / ViewWidth, Zero, 0); + M.r[1] = vsetq_lane_f32(TwoNearZ / ViewHeight, Zero, 1); + M.r[2] = vsetq_lane_f32(fRange, g_XMNegIdentityR3.v, 2); + M.r[3] = vsetq_lane_f32(fRange * NearZ, Zero, 2); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + float TwoNearZ = NearZ + NearZ; + float fRange = FarZ / (NearZ - FarZ); + // Note: This is recorded on the stack + XMVECTOR rMem = { + TwoNearZ / ViewWidth, + TwoNearZ / ViewHeight, + fRange, + fRange * NearZ + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp, 
vValues); + // TwoNearZ / ViewWidth,0,0,0 + M.r[0] = vTemp; + // 0,TwoNearZ / ViewHeight,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp, g_XMMaskY); + M.r[1] = vTemp; + // x=fRange,y=-fRange * NearZ,0,-1.0f + vValues = _mm_shuffle_ps(vValues, g_XMNegIdentityR3, _MM_SHUFFLE(3, 2, 3, 2)); + // 0,0,fRange,-1.0f + vTemp = _mm_setzero_ps(); + vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(3, 0, 0, 0)); + M.r[2] = vTemp; + // 0,0,-fRange * NearZ,0 + vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(2, 1, 0, 0)); + M.r[3] = vTemp; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveFovLH +( + float FovAngleY, + float AspectRatio, + float NearZ, + float FarZ +) noexcept +{ + assert(NearZ > 0.f && FarZ > 0.f); + assert(!XMScalarNearEqual(FovAngleY, 0.0f, 0.00001f * 2.0f)); + assert(!XMScalarNearEqual(AspectRatio, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float SinFov; + float CosFov; + XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY); + + float Height = CosFov / SinFov; + float Width = Height / AspectRatio; + float fRange = FarZ / (FarZ - NearZ); + + XMMATRIX M; + M.m[0][0] = Width; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = Height; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = fRange; + M.m[2][3] = 1.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = -fRange * NearZ; + M.m[3][3] = 0.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float SinFov; + float CosFov; + XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY); + + float fRange = FarZ / (FarZ - NearZ); + float Height = CosFov / SinFov; + float Width = Height / AspectRatio; + const float32x4_t Zero = vdupq_n_f32(0); + + XMMATRIX M; + M.r[0] = vsetq_lane_f32(Width, Zero, 0); + M.r[1] = vsetq_lane_f32(Height, Zero, 1); + M.r[2] = vsetq_lane_f32(fRange, g_XMIdentityR3.v, 2); + M.r[3] = vsetq_lane_f32(-fRange * NearZ, Zero, 2); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + float SinFov; + float CosFov; + XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY); + + float fRange = FarZ / (FarZ - NearZ); + // Note: This is recorded on the stack + float Height = CosFov / SinFov; + XMVECTOR rMem = { + Height / AspectRatio, + Height, + fRange, + -fRange * NearZ + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp, vValues); + // Height / AspectRatio,0,0,0 + XMMATRIX M; + M.r[0] = vTemp; + // 0,Height,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp, g_XMMaskY); + M.r[1] = vTemp; + // x=fRange,y=-fRange * NearZ,0,1.0f + vTemp = _mm_setzero_ps(); + vValues = _mm_shuffle_ps(vValues, g_XMIdentityR3, _MM_SHUFFLE(3, 2, 3, 2)); + // 0,0,fRange,1.0f + vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(3, 0, 0, 0)); + M.r[2] = vTemp; + // 0,0,-fRange * NearZ,0.0f + vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(2, 1, 0, 0)); + M.r[3] = vTemp; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveFovRH +( + float FovAngleY, + float AspectRatio, + float NearZ, + float FarZ +) noexcept +{ + assert(NearZ > 0.f && FarZ > 0.f); + assert(!XMScalarNearEqual(FovAngleY, 0.0f, 0.00001f * 2.0f)); + assert(!XMScalarNearEqual(AspectRatio, 0.0f, 0.00001f)); + 
assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float SinFov; + float CosFov; + XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY); + + float Height = CosFov / SinFov; + float Width = Height / AspectRatio; + float fRange = FarZ / (NearZ - FarZ); + + XMMATRIX M; + M.m[0][0] = Width; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = Height; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = fRange; + M.m[2][3] = -1.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = fRange * NearZ; + M.m[3][3] = 0.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float SinFov; + float CosFov; + XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY); + float fRange = FarZ / (NearZ - FarZ); + float Height = CosFov / SinFov; + float Width = Height / AspectRatio; + const float32x4_t Zero = vdupq_n_f32(0); + + XMMATRIX M; + M.r[0] = vsetq_lane_f32(Width, Zero, 0); + M.r[1] = vsetq_lane_f32(Height, Zero, 1); + M.r[2] = vsetq_lane_f32(fRange, g_XMNegIdentityR3.v, 2); + M.r[3] = vsetq_lane_f32(fRange * NearZ, Zero, 2); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + float SinFov; + float CosFov; + XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY); + float fRange = FarZ / (NearZ - FarZ); + // Note: This is recorded on the stack + float Height = CosFov / SinFov; + XMVECTOR rMem = { + Height / AspectRatio, + Height, + fRange, + fRange * NearZ + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp, vValues); + // Height / AspectRatio,0,0,0 + XMMATRIX M; + M.r[0] = vTemp; + // 0,Height,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp, g_XMMaskY); + M.r[1] = vTemp; + // x=fRange,y=-fRange * NearZ,0,-1.0f + vTemp = _mm_setzero_ps(); + vValues = _mm_shuffle_ps(vValues, g_XMNegIdentityR3, _MM_SHUFFLE(3, 2, 3, 2)); + // 0,0,fRange,-1.0f + vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(3, 0, 0, 0)); + M.r[2] = vTemp; + // 0,0,fRange * NearZ,0.0f + vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(2, 1, 0, 0)); + M.r[3] = vTemp; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveOffCenterLH +( + float ViewLeft, + float ViewRight, + float ViewBottom, + float ViewTop, + float NearZ, + float FarZ +) noexcept +{ + assert(NearZ > 0.f && FarZ > 0.f); + assert(!XMScalarNearEqual(ViewRight, ViewLeft, 0.00001f)); + assert(!XMScalarNearEqual(ViewTop, ViewBottom, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float TwoNearZ = NearZ + NearZ; + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = FarZ / (FarZ - NearZ); + + XMMATRIX M; + M.m[0][0] = TwoNearZ * ReciprocalWidth; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = TwoNearZ * ReciprocalHeight; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = -(ViewLeft + ViewRight) * ReciprocalWidth; + M.m[2][1] = -(ViewTop + ViewBottom) * ReciprocalHeight; + M.m[2][2] = fRange; + M.m[2][3] = 1.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = -fRange * NearZ; + M.m[3][3] = 0.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float TwoNearZ = NearZ + NearZ; + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 
1.0f / (ViewTop - ViewBottom); + float fRange = FarZ / (FarZ - NearZ); + const float32x4_t Zero = vdupq_n_f32(0); + + XMMATRIX M; + M.r[0] = vsetq_lane_f32(TwoNearZ * ReciprocalWidth, Zero, 0); + M.r[1] = vsetq_lane_f32(TwoNearZ * ReciprocalHeight, Zero, 1); + M.r[2] = XMVectorSet(-(ViewLeft + ViewRight) * ReciprocalWidth, + -(ViewTop + ViewBottom) * ReciprocalHeight, + fRange, + 1.0f); + M.r[3] = vsetq_lane_f32(-fRange * NearZ, Zero, 2); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + float TwoNearZ = NearZ + NearZ; + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = FarZ / (FarZ - NearZ); + // Note: This is recorded on the stack + XMVECTOR rMem = { + TwoNearZ * ReciprocalWidth, + TwoNearZ * ReciprocalHeight, + -fRange * NearZ, + 0 + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp, vValues); + // TwoNearZ*ReciprocalWidth,0,0,0 + M.r[0] = vTemp; + // 0,TwoNearZ*ReciprocalHeight,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp, g_XMMaskY); + M.r[1] = vTemp; + // 0,0,fRange,1.0f + M.r[2] = XMVectorSet(-(ViewLeft + ViewRight) * ReciprocalWidth, + -(ViewTop + ViewBottom) * ReciprocalHeight, + fRange, + 1.0f); + // 0,0,-fRange * NearZ,0.0f + vValues = _mm_and_ps(vValues, g_XMMaskZ); + M.r[3] = vValues; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveOffCenterRH +( + float ViewLeft, + float ViewRight, + float ViewBottom, + float ViewTop, + float NearZ, + float FarZ +) noexcept +{ + assert(NearZ > 0.f && FarZ > 0.f); + assert(!XMScalarNearEqual(ViewRight, ViewLeft, 0.00001f)); + assert(!XMScalarNearEqual(ViewTop, ViewBottom, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float TwoNearZ = NearZ + NearZ; + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = FarZ / (NearZ - FarZ); + + XMMATRIX M; + M.m[0][0] = TwoNearZ * ReciprocalWidth; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = TwoNearZ * ReciprocalHeight; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = (ViewLeft + ViewRight) * ReciprocalWidth; + M.m[2][1] = (ViewTop + ViewBottom) * ReciprocalHeight; + M.m[2][2] = fRange; + M.m[2][3] = -1.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = fRange * NearZ; + M.m[3][3] = 0.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float TwoNearZ = NearZ + NearZ; + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = FarZ / (NearZ - FarZ); + const float32x4_t Zero = vdupq_n_f32(0); + + XMMATRIX M; + M.r[0] = vsetq_lane_f32(TwoNearZ * ReciprocalWidth, Zero, 0); + M.r[1] = vsetq_lane_f32(TwoNearZ * ReciprocalHeight, Zero, 1); + M.r[2] = XMVectorSet((ViewLeft + ViewRight) * ReciprocalWidth, + (ViewTop + ViewBottom) * ReciprocalHeight, + fRange, + -1.0f); + M.r[3] = vsetq_lane_f32(fRange * NearZ, Zero, 2); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + float TwoNearZ = NearZ + NearZ; + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = FarZ / (NearZ - FarZ); + // Note: This is recorded on the stack + XMVECTOR rMem = { + TwoNearZ * 
ReciprocalWidth, + TwoNearZ * ReciprocalHeight, + fRange * NearZ, + 0 + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp, vValues); + // TwoNearZ*ReciprocalWidth,0,0,0 + M.r[0] = vTemp; + // 0,TwoNearZ*ReciprocalHeight,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp, g_XMMaskY); + M.r[1] = vTemp; + // 0,0,fRange,1.0f + M.r[2] = XMVectorSet((ViewLeft + ViewRight) * ReciprocalWidth, + (ViewTop + ViewBottom) * ReciprocalHeight, + fRange, + -1.0f); + // 0,0,-fRange * NearZ,0.0f + vValues = _mm_and_ps(vValues, g_XMMaskZ); + M.r[3] = vValues; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixOrthographicLH +( + float ViewWidth, + float ViewHeight, + float NearZ, + float FarZ +) noexcept +{ + assert(!XMScalarNearEqual(ViewWidth, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(ViewHeight, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float fRange = 1.0f / (FarZ - NearZ); + + XMMATRIX M; + M.m[0][0] = 2.0f / ViewWidth; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = 2.0f / ViewHeight; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = fRange; + M.m[2][3] = 0.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = -fRange * NearZ; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float fRange = 1.0f / (FarZ - NearZ); + + const float32x4_t Zero = vdupq_n_f32(0); + XMMATRIX M; + M.r[0] = vsetq_lane_f32(2.0f / ViewWidth, Zero, 0); + M.r[1] = vsetq_lane_f32(2.0f / ViewHeight, Zero, 1); + M.r[2] = vsetq_lane_f32(fRange, Zero, 2); + M.r[3] = vsetq_lane_f32(-fRange * NearZ, g_XMIdentityR3.v, 2); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + float fRange = 1.0f / (FarZ - NearZ); + // Note: This is recorded on the stack + XMVECTOR rMem = { + 2.0f / ViewWidth, + 2.0f / ViewHeight, + fRange, + -fRange * NearZ + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp, vValues); + // 2.0f / ViewWidth,0,0,0 + M.r[0] = vTemp; + // 0,2.0f / ViewHeight,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp, g_XMMaskY); + M.r[1] = vTemp; + // x=fRange,y=-fRange * NearZ,0,1.0f + vTemp = _mm_setzero_ps(); + vValues = _mm_shuffle_ps(vValues, g_XMIdentityR3, _MM_SHUFFLE(3, 2, 3, 2)); + // 0,0,fRange,0.0f + vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(2, 0, 0, 0)); + M.r[2] = vTemp; + // 0,0,-fRange * NearZ,1.0f + vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(3, 1, 0, 0)); + M.r[3] = vTemp; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixOrthographicRH +( + float ViewWidth, + float ViewHeight, + float NearZ, + float FarZ +) noexcept +{ + assert(!XMScalarNearEqual(ViewWidth, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(ViewHeight, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float fRange = 1.0f / (NearZ - FarZ); + + XMMATRIX M; + M.m[0][0] = 2.0f / ViewWidth; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = 2.0f / ViewHeight; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = 
fRange; + M.m[2][3] = 0.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = fRange * NearZ; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float fRange = 1.0f / (NearZ - FarZ); + + const float32x4_t Zero = vdupq_n_f32(0); + XMMATRIX M; + M.r[0] = vsetq_lane_f32(2.0f / ViewWidth, Zero, 0); + M.r[1] = vsetq_lane_f32(2.0f / ViewHeight, Zero, 1); + M.r[2] = vsetq_lane_f32(fRange, Zero, 2); + M.r[3] = vsetq_lane_f32(fRange * NearZ, g_XMIdentityR3.v, 2); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + float fRange = 1.0f / (NearZ - FarZ); + // Note: This is recorded on the stack + XMVECTOR rMem = { + 2.0f / ViewWidth, + 2.0f / ViewHeight, + fRange, + fRange * NearZ + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp, vValues); + // 2.0f / ViewWidth,0,0,0 + M.r[0] = vTemp; + // 0,2.0f / ViewHeight,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp, g_XMMaskY); + M.r[1] = vTemp; + // x=fRange,y=fRange * NearZ,0,1.0f + vTemp = _mm_setzero_ps(); + vValues = _mm_shuffle_ps(vValues, g_XMIdentityR3, _MM_SHUFFLE(3, 2, 3, 2)); + // 0,0,fRange,0.0f + vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(2, 0, 0, 0)); + M.r[2] = vTemp; + // 0,0,fRange * NearZ,1.0f + vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(3, 1, 0, 0)); + M.r[3] = vTemp; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixOrthographicOffCenterLH +( + float ViewLeft, + float ViewRight, + float ViewBottom, + float ViewTop, + float NearZ, + float FarZ +) noexcept +{ + assert(!XMScalarNearEqual(ViewRight, ViewLeft, 0.00001f)); + assert(!XMScalarNearEqual(ViewTop, ViewBottom, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = 1.0f / (FarZ - NearZ); + + XMMATRIX M; + M.m[0][0] = ReciprocalWidth + ReciprocalWidth; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = ReciprocalHeight + ReciprocalHeight; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = fRange; + M.m[2][3] = 0.0f; + + M.m[3][0] = -(ViewLeft + ViewRight) * ReciprocalWidth; + M.m[3][1] = -(ViewTop + ViewBottom) * ReciprocalHeight; + M.m[3][2] = -fRange * NearZ; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = 1.0f / (FarZ - NearZ); + const float32x4_t Zero = vdupq_n_f32(0); + XMMATRIX M; + M.r[0] = vsetq_lane_f32(ReciprocalWidth + ReciprocalWidth, Zero, 0); + M.r[1] = vsetq_lane_f32(ReciprocalHeight + ReciprocalHeight, Zero, 1); + M.r[2] = vsetq_lane_f32(fRange, Zero, 2); + M.r[3] = XMVectorSet(-(ViewLeft + ViewRight) * ReciprocalWidth, + -(ViewTop + ViewBottom) * ReciprocalHeight, + -fRange * NearZ, + 1.0f); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + float fReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float fReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = 1.0f / (FarZ - NearZ); + // Note: This is recorded on the stack + XMVECTOR rMem = { + fReciprocalWidth, + fReciprocalHeight, + fRange, + 1.0f + }; + XMVECTOR rMem2 = { + -(ViewLeft + ViewRight), + -(ViewTop + 
ViewBottom), + -NearZ, + 1.0f + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp, vValues); + // fReciprocalWidth*2,0,0,0 + vTemp = _mm_add_ss(vTemp, vTemp); + M.r[0] = vTemp; + // 0,fReciprocalHeight*2,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp, g_XMMaskY); + vTemp = _mm_add_ps(vTemp, vTemp); + M.r[1] = vTemp; + // 0,0,fRange,0.0f + vTemp = vValues; + vTemp = _mm_and_ps(vTemp, g_XMMaskZ); + M.r[2] = vTemp; + // -(ViewLeft + ViewRight)*fReciprocalWidth,-(ViewTop + ViewBottom)*fReciprocalHeight,fRange*-NearZ,1.0f + vValues = _mm_mul_ps(vValues, rMem2); + M.r[3] = vValues; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixOrthographicOffCenterRH +( + float ViewLeft, + float ViewRight, + float ViewBottom, + float ViewTop, + float NearZ, + float FarZ +) noexcept +{ + assert(!XMScalarNearEqual(ViewRight, ViewLeft, 0.00001f)); + assert(!XMScalarNearEqual(ViewTop, ViewBottom, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = 1.0f / (NearZ - FarZ); + + XMMATRIX M; + M.m[0][0] = ReciprocalWidth + ReciprocalWidth; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = ReciprocalHeight + ReciprocalHeight; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = fRange; + M.m[2][3] = 0.0f; + + M.r[3] = XMVectorSet(-(ViewLeft + ViewRight) * ReciprocalWidth, + -(ViewTop + ViewBottom) * ReciprocalHeight, + fRange * NearZ, + 1.0f); + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = 1.0f / (NearZ - FarZ); + const float32x4_t Zero = vdupq_n_f32(0); + XMMATRIX M; + M.r[0] = vsetq_lane_f32(ReciprocalWidth + ReciprocalWidth, Zero, 0); + M.r[1] = vsetq_lane_f32(ReciprocalHeight + ReciprocalHeight, Zero, 1); + M.r[2] = vsetq_lane_f32(fRange, Zero, 2); + M.r[3] = XMVectorSet(-(ViewLeft + ViewRight) * ReciprocalWidth, + -(ViewTop + ViewBottom) * ReciprocalHeight, + fRange * NearZ, + 1.0f); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + float fReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float fReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = 1.0f / (NearZ - FarZ); + // Note: This is recorded on the stack + XMVECTOR rMem = { + fReciprocalWidth, + fReciprocalHeight, + fRange, + 1.0f + }; + XMVECTOR rMem2 = { + -(ViewLeft + ViewRight), + -(ViewTop + ViewBottom), + NearZ, + 1.0f + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp, vValues); + // fReciprocalWidth*2,0,0,0 + vTemp = _mm_add_ss(vTemp, vTemp); + M.r[0] = vTemp; + // 0,fReciprocalHeight*2,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp, g_XMMaskY); + vTemp = _mm_add_ps(vTemp, vTemp); + M.r[1] = vTemp; + // 0,0,fRange,0.0f + vTemp = vValues; + vTemp = _mm_and_ps(vTemp, g_XMMaskZ); + M.r[2] = vTemp; + // -(ViewLeft + ViewRight)*fReciprocalWidth,-(ViewTop + ViewBottom)*fReciprocalHeight,fRange*-NearZ,1.0f + vValues = _mm_mul_ps(vValues, rMem2); + M.r[3] = vValues; + return M; +#endif +} + +#ifdef _PREFAST_ +#pragma prefast(pop) 
+#endif
+
+/****************************************************************************
+ *
+ * XMMATRIX operators and methods
+ *
+ ****************************************************************************/
+
+ //------------------------------------------------------------------------------
+
+inline XMMATRIX::XMMATRIX
+(
+    float m00, float m01, float m02, float m03,
+    float m10, float m11, float m12, float m13,
+    float m20, float m21, float m22, float m23,
+    float m30, float m31, float m32, float m33
+) noexcept
+{
+    r[0] = XMVectorSet(m00, m01, m02, m03);
+    r[1] = XMVectorSet(m10, m11, m12, m13);
+    r[2] = XMVectorSet(m20, m21, m22, m23);
+    r[3] = XMVectorSet(m30, m31, m32, m33);
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMMATRIX::XMMATRIX(const float* pArray) noexcept
+{
+    assert(pArray != nullptr);
+    r[0] = XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray));
+    r[1] = XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray + 4));
+    r[2] = XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray + 8));
+    r[3] = XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray + 12));
+}
+
+//------------------------------------------------------------------------------
+
+inline XMMATRIX XMMATRIX::operator- () const noexcept
+{
+    XMMATRIX R;
+    R.r[0] = XMVectorNegate(r[0]);
+    R.r[1] = XMVectorNegate(r[1]);
+    R.r[2] = XMVectorNegate(r[2]);
+    R.r[3] = XMVectorNegate(r[3]);
+    return R;
+}
+
+//------------------------------------------------------------------------------
+
+inline XMMATRIX& XM_CALLCONV XMMATRIX::operator+= (FXMMATRIX M) noexcept
+{
+    r[0] = XMVectorAdd(r[0], M.r[0]);
+    r[1] = XMVectorAdd(r[1], M.r[1]);
+    r[2] = XMVectorAdd(r[2], M.r[2]);
+    r[3] = XMVectorAdd(r[3], M.r[3]);
+    return *this;
+}
+
+//------------------------------------------------------------------------------
+
+inline XMMATRIX& XM_CALLCONV XMMATRIX::operator-= (FXMMATRIX M) noexcept
+{
+    r[0] = XMVectorSubtract(r[0], M.r[0]);
+    r[1] = XMVectorSubtract(r[1], M.r[1]);
+    r[2] = XMVectorSubtract(r[2], M.r[2]);
+    r[3] = XMVectorSubtract(r[3], M.r[3]);
+    return *this;
+}
+
+//------------------------------------------------------------------------------
+
+inline XMMATRIX& XM_CALLCONV XMMATRIX::operator*=(FXMMATRIX M) noexcept
+{
+    *this = XMMatrixMultiply(*this, M);
+    return *this;
+}
+
+//------------------------------------------------------------------------------
+
+inline XMMATRIX& XMMATRIX::operator*= (float S) noexcept
+{
+    r[0] = XMVectorScale(r[0], S);
+    r[1] = XMVectorScale(r[1], S);
+    r[2] = XMVectorScale(r[2], S);
+    r[3] = XMVectorScale(r[3], S);
+    return *this;
+}
+
+//------------------------------------------------------------------------------
+
+inline XMMATRIX& XMMATRIX::operator/= (float S) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR vS = XMVectorReplicate(S);
+    r[0] = XMVectorDivide(r[0], vS);
+    r[1] = XMVectorDivide(r[1], vS);
+    r[2] = XMVectorDivide(r[2], vS);
+    r[3] = XMVectorDivide(r[3], vS);
+    return *this;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__
+    float32x4_t vS = vdupq_n_f32(S);
+    r[0] = vdivq_f32(r[0], vS);
+    r[1] = vdivq_f32(r[1], vS);
+    r[2] = vdivq_f32(r[2], vS);
+    r[3] = vdivq_f32(r[3], vS);
+#else
+    // 2 iterations of Newton-Raphson refinement of reciprocal
+    float32x2_t vS = vdup_n_f32(S);
+    float32x2_t R0 = vrecpe_f32(vS);
+    float32x2_t S0 = vrecps_f32(R0, vS);
+    R0 = vmul_f32(S0, R0);
+    S0 = vrecps_f32(R0, vS);
+    R0 = vmul_f32(S0, R0);
+    float32x4_t Reciprocal =
vcombine_f32(R0, R0); + r[0] = vmulq_f32(r[0], Reciprocal); + r[1] = vmulq_f32(r[1], Reciprocal); + r[2] = vmulq_f32(r[2], Reciprocal); + r[3] = vmulq_f32(r[3], Reciprocal); +#endif + return *this; +#elif defined(_XM_SSE_INTRINSICS_) + __m128 vS = _mm_set_ps1(S); + r[0] = _mm_div_ps(r[0], vS); + r[1] = _mm_div_ps(r[1], vS); + r[2] = _mm_div_ps(r[2], vS); + r[3] = _mm_div_ps(r[3], vS); + return *this; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMATRIX::operator+ (FXMMATRIX M) const noexcept +{ + XMMATRIX R; + R.r[0] = XMVectorAdd(r[0], M.r[0]); + R.r[1] = XMVectorAdd(r[1], M.r[1]); + R.r[2] = XMVectorAdd(r[2], M.r[2]); + R.r[3] = XMVectorAdd(r[3], M.r[3]); + return R; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMATRIX::operator- (FXMMATRIX M) const noexcept +{ + XMMATRIX R; + R.r[0] = XMVectorSubtract(r[0], M.r[0]); + R.r[1] = XMVectorSubtract(r[1], M.r[1]); + R.r[2] = XMVectorSubtract(r[2], M.r[2]); + R.r[3] = XMVectorSubtract(r[3], M.r[3]); + return R; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMATRIX::operator*(FXMMATRIX M) const noexcept +{ + return XMMatrixMultiply(*this, M); +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XMMATRIX::operator* (float S) const noexcept +{ + XMMATRIX R; + R.r[0] = XMVectorScale(r[0], S); + R.r[1] = XMVectorScale(r[1], S); + R.r[2] = XMVectorScale(r[2], S); + R.r[3] = XMVectorScale(r[3], S); + return R; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XMMATRIX::operator/ (float S) const noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR vS = XMVectorReplicate(S); + XMMATRIX R; + R.r[0] = XMVectorDivide(r[0], vS); + R.r[1] = XMVectorDivide(r[1], vS); + R.r[2] = XMVectorDivide(r[2], vS); + R.r[3] = XMVectorDivide(r[3], vS); + return R; +#elif defined(_XM_ARM_NEON_INTRINSICS_) +#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__ + float32x4_t vS = vdupq_n_f32(S); + XMMATRIX R; + R.r[0] = vdivq_f32(r[0], vS); + R.r[1] = vdivq_f32(r[1], vS); + R.r[2] = vdivq_f32(r[2], vS); + R.r[3] = vdivq_f32(r[3], vS); +#else + // 2 iterations of Newton-Raphson refinement of reciprocal + float32x2_t vS = vdup_n_f32(S); + float32x2_t R0 = vrecpe_f32(vS); + float32x2_t S0 = vrecps_f32(R0, vS); + R0 = vmul_f32(S0, R0); + S0 = vrecps_f32(R0, vS); + R0 = vmul_f32(S0, R0); + float32x4_t Reciprocal = vcombine_f32(R0, R0); + XMMATRIX R; + R.r[0] = vmulq_f32(r[0], Reciprocal); + R.r[1] = vmulq_f32(r[1], Reciprocal); + R.r[2] = vmulq_f32(r[2], Reciprocal); + R.r[3] = vmulq_f32(r[3], Reciprocal); +#endif + return R; +#elif defined(_XM_SSE_INTRINSICS_) + __m128 vS = _mm_set_ps1(S); + XMMATRIX R; + R.r[0] = _mm_div_ps(r[0], vS); + R.r[1] = _mm_div_ps(r[1], vS); + R.r[2] = _mm_div_ps(r[2], vS); + R.r[3] = _mm_div_ps(r[3], vS); + return R; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV operator* +( + float S, + FXMMATRIX M +) noexcept +{ + XMMATRIX R; + R.r[0] = XMVectorScale(M.r[0], S); + R.r[1] = XMVectorScale(M.r[1], S); + R.r[2] = XMVectorScale(M.r[2], S); + R.r[3] = XMVectorScale(M.r[3], S); + return R; +} + +/**************************************************************************** + * + * XMFLOAT3X3 operators + * 
+ ****************************************************************************/ + + //------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMFLOAT3X3::XMFLOAT3X3(const float* pArray) noexcept +{ + assert(pArray != nullptr); + for (size_t Row = 0; Row < 3; Row++) + { + for (size_t Column = 0; Column < 3; Column++) + { + m[Row][Column] = pArray[Row * 3 + Column]; + } + } +} + +/**************************************************************************** + * + * XMFLOAT4X3 operators + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMFLOAT4X3::XMFLOAT4X3(const float* pArray) noexcept +{ + assert(pArray != nullptr); + + m[0][0] = pArray[0]; + m[0][1] = pArray[1]; + m[0][2] = pArray[2]; + + m[1][0] = pArray[3]; + m[1][1] = pArray[4]; + m[1][2] = pArray[5]; + + m[2][0] = pArray[6]; + m[2][1] = pArray[7]; + m[2][2] = pArray[8]; + + m[3][0] = pArray[9]; + m[3][1] = pArray[10]; + m[3][2] = pArray[11]; +} + +/**************************************************************************** +* +* XMFLOAT3X4 operators +* +****************************************************************************/ + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMFLOAT3X4::XMFLOAT3X4(const float* pArray) noexcept +{ + assert(pArray != nullptr); + + m[0][0] = pArray[0]; + m[0][1] = pArray[1]; + m[0][2] = pArray[2]; + m[0][3] = pArray[3]; + + m[1][0] = pArray[4]; + m[1][1] = pArray[5]; + m[1][2] = pArray[6]; + m[1][3] = pArray[7]; + + m[2][0] = pArray[8]; + m[2][1] = pArray[9]; + m[2][2] = pArray[10]; + m[2][3] = pArray[11]; +} + +/**************************************************************************** + * + * XMFLOAT4X4 operators + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMFLOAT4X4::XMFLOAT4X4(const float* pArray) noexcept +{ + assert(pArray != nullptr); + + m[0][0] = pArray[0]; + m[0][1] = pArray[1]; + m[0][2] = pArray[2]; + m[0][3] = pArray[3]; + + m[1][0] = pArray[4]; + m[1][1] = pArray[5]; + m[1][2] = pArray[6]; + m[1][3] = pArray[7]; + + m[2][0] = pArray[8]; + m[2][1] = pArray[9]; + m[2][2] = pArray[10]; + m[2][3] = pArray[11]; + + m[3][0] = pArray[12]; + m[3][1] = pArray[13]; + m[3][2] = pArray[14]; + m[3][3] = pArray[15]; +} + diff --git a/src/thirdparty/DirectXMath-dec2022/Inc/DirectXMathMisc.inl b/src/thirdparty/DirectXMath-dec2022/Inc/DirectXMathMisc.inl new file mode 100644 index 000000000..5f88da642 --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/Inc/DirectXMathMisc.inl @@ -0,0 +1,2493 @@ +//------------------------------------------------------------------------------------- +// DirectXMathMisc.inl -- SIMD C++ Math library +// +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
+// +// http://go.microsoft.com/fwlink/?LinkID=615560 +//------------------------------------------------------------------------------------- + +#pragma once + +/**************************************************************************** + * + * Quaternion + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ + // Comparison operations + //------------------------------------------------------------------------------ + + //------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMQuaternionEqual +( + FXMVECTOR Q1, + FXMVECTOR Q2 +) noexcept +{ + return XMVector4Equal(Q1, Q2); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMQuaternionNotEqual +( + FXMVECTOR Q1, + FXMVECTOR Q2 +) noexcept +{ + return XMVector4NotEqual(Q1, Q2); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMQuaternionIsNaN(FXMVECTOR Q) noexcept +{ + return XMVector4IsNaN(Q); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMQuaternionIsInfinite(FXMVECTOR Q) noexcept +{ + return XMVector4IsInfinite(Q); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMQuaternionIsIdentity(FXMVECTOR Q) noexcept +{ + return XMVector4Equal(Q, g_XMIdentityR3.v); +} + +//------------------------------------------------------------------------------ +// Computation operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionDot +( + FXMVECTOR Q1, + FXMVECTOR Q2 +) noexcept +{ + return XMVector4Dot(Q1, Q2); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionMultiply +( + FXMVECTOR Q1, + FXMVECTOR Q2 +) noexcept +{ + // Returns the product Q2*Q1 (which is the concatenation of a rotation Q1 followed by the rotation Q2) + + // [ (Q2.w * Q1.x) + (Q2.x * Q1.w) + (Q2.y * Q1.z) - (Q2.z * Q1.y), + // (Q2.w * Q1.y) - (Q2.x * Q1.z) + (Q2.y * Q1.w) + (Q2.z * Q1.x), + // (Q2.w * Q1.z) + (Q2.x * Q1.y) - (Q2.y * Q1.x) + (Q2.z * Q1.w), + // (Q2.w * Q1.w) - (Q2.x * Q1.x) - (Q2.y * Q1.y) - (Q2.z * Q1.z) ] + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + (Q2.vector4_f32[3] * Q1.vector4_f32[0]) + (Q2.vector4_f32[0] * Q1.vector4_f32[3]) + (Q2.vector4_f32[1] * Q1.vector4_f32[2]) - (Q2.vector4_f32[2] * Q1.vector4_f32[1]), + (Q2.vector4_f32[3] * Q1.vector4_f32[1]) - (Q2.vector4_f32[0] * Q1.vector4_f32[2]) + (Q2.vector4_f32[1] * Q1.vector4_f32[3]) + (Q2.vector4_f32[2] * Q1.vector4_f32[0]), + (Q2.vector4_f32[3] * Q1.vector4_f32[2]) + (Q2.vector4_f32[0] * Q1.vector4_f32[1]) - (Q2.vector4_f32[1] * Q1.vector4_f32[0]) + (Q2.vector4_f32[2] * Q1.vector4_f32[3]), + (Q2.vector4_f32[3] * Q1.vector4_f32[3]) - (Q2.vector4_f32[0] * Q1.vector4_f32[0]) - (Q2.vector4_f32[1] * Q1.vector4_f32[1]) - (Q2.vector4_f32[2] * Q1.vector4_f32[2]) + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 ControlWZYX = { { { 1.0f, -1.0f, 1.0f, -1.0f } } }; + static const XMVECTORF32 ControlZWXY = { { { 1.0f, 1.0f, -1.0f, -1.0f } } }; + static const XMVECTORF32 ControlYXWZ = { { { -1.0f, 
1.0f, 1.0f, -1.0f } } }; + + float32x2_t Q2L = vget_low_f32(Q2); + float32x2_t Q2H = vget_high_f32(Q2); + + float32x4_t Q2X = vdupq_lane_f32(Q2L, 0); + float32x4_t Q2Y = vdupq_lane_f32(Q2L, 1); + float32x4_t Q2Z = vdupq_lane_f32(Q2H, 0); + XMVECTOR vResult = vmulq_lane_f32(Q1, Q2H, 1); + + // Mul by Q1WZYX + float32x4_t vTemp = vrev64q_f32(Q1); + vTemp = vcombine_f32(vget_high_f32(vTemp), vget_low_f32(vTemp)); + Q2X = vmulq_f32(Q2X, vTemp); + vResult = vmlaq_f32(vResult, Q2X, ControlWZYX); + + // Mul by Q1ZWXY + vTemp = vreinterpretq_f32_u32(vrev64q_u32(vreinterpretq_u32_f32(vTemp))); + Q2Y = vmulq_f32(Q2Y, vTemp); + vResult = vmlaq_f32(vResult, Q2Y, ControlZWXY); + + // Mul by Q1YXWZ + vTemp = vreinterpretq_f32_u32(vrev64q_u32(vreinterpretq_u32_f32(vTemp))); + vTemp = vcombine_f32(vget_high_f32(vTemp), vget_low_f32(vTemp)); + Q2Z = vmulq_f32(Q2Z, vTemp); + vResult = vmlaq_f32(vResult, Q2Z, ControlYXWZ); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 ControlWZYX = { { { 1.0f, -1.0f, 1.0f, -1.0f } } }; + static const XMVECTORF32 ControlZWXY = { { { 1.0f, 1.0f, -1.0f, -1.0f } } }; + static const XMVECTORF32 ControlYXWZ = { { { -1.0f, 1.0f, 1.0f, -1.0f } } }; + // Copy to SSE registers and use as few as possible for x86 + XMVECTOR Q2X = Q2; + XMVECTOR Q2Y = Q2; + XMVECTOR Q2Z = Q2; + XMVECTOR vResult = Q2; + // Splat with one instruction + vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(3, 3, 3, 3)); + Q2X = XM_PERMUTE_PS(Q2X, _MM_SHUFFLE(0, 0, 0, 0)); + Q2Y = XM_PERMUTE_PS(Q2Y, _MM_SHUFFLE(1, 1, 1, 1)); + Q2Z = XM_PERMUTE_PS(Q2Z, _MM_SHUFFLE(2, 2, 2, 2)); + // Retire Q1 and perform Q1*Q2W + vResult = _mm_mul_ps(vResult, Q1); + XMVECTOR Q1Shuffle = Q1; + // Shuffle the copies of Q1 + Q1Shuffle = XM_PERMUTE_PS(Q1Shuffle, _MM_SHUFFLE(0, 1, 2, 3)); + // Mul by Q1WZYX + Q2X = _mm_mul_ps(Q2X, Q1Shuffle); + Q1Shuffle = XM_PERMUTE_PS(Q1Shuffle, _MM_SHUFFLE(2, 3, 0, 1)); + // Flip the signs on y and z + vResult = XM_FMADD_PS(Q2X, ControlWZYX, vResult); + // Mul by Q1ZWXY + Q2Y = _mm_mul_ps(Q2Y, Q1Shuffle); + Q1Shuffle = XM_PERMUTE_PS(Q1Shuffle, _MM_SHUFFLE(0, 1, 2, 3)); + // Flip the signs on z and w + Q2Y = _mm_mul_ps(Q2Y, ControlZWXY); + // Mul by Q1YXWZ + Q2Z = _mm_mul_ps(Q2Z, Q1Shuffle); + // Flip the signs on x and w + Q2Y = XM_FMADD_PS(Q2Z, ControlYXWZ, Q2Y); + vResult = _mm_add_ps(vResult, Q2Y); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionLengthSq(FXMVECTOR Q) noexcept +{ + return XMVector4LengthSq(Q); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionReciprocalLength(FXMVECTOR Q) noexcept +{ + return XMVector4ReciprocalLength(Q); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionLength(FXMVECTOR Q) noexcept +{ + return XMVector4Length(Q); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionNormalizeEst(FXMVECTOR Q) noexcept +{ + return XMVector4NormalizeEst(Q); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionNormalize(FXMVECTOR Q) noexcept +{ + return XMVector4Normalize(Q); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionConjugate(FXMVECTOR Q) noexcept +{ 
+#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + -Q.vector4_f32[0], + -Q.vector4_f32[1], + -Q.vector4_f32[2], + Q.vector4_f32[3] + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 NegativeOne3 = { { { -1.0f, -1.0f, -1.0f, 1.0f } } }; + return vmulq_f32(Q, NegativeOne3.v); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 NegativeOne3 = { { { -1.0f, -1.0f, -1.0f, 1.0f } } }; + return _mm_mul_ps(Q, NegativeOne3); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionInverse(FXMVECTOR Q) noexcept +{ + XMVECTOR L = XMVector4LengthSq(Q); + XMVECTOR Conjugate = XMQuaternionConjugate(Q); + + XMVECTOR Control = XMVectorLessOrEqual(L, g_XMEpsilon.v); + + XMVECTOR Result = XMVectorDivide(Conjugate, L); + + Result = XMVectorSelect(Result, g_XMZero, Control); + + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionLn(FXMVECTOR Q) noexcept +{ + static const XMVECTORF32 OneMinusEpsilon = { { { 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f } } }; + + XMVECTOR QW = XMVectorSplatW(Q); + XMVECTOR Q0 = XMVectorSelect(g_XMSelect1110.v, Q, g_XMSelect1110.v); + + XMVECTOR ControlW = XMVectorInBounds(QW, OneMinusEpsilon.v); + + XMVECTOR Theta = XMVectorACos(QW); + XMVECTOR SinTheta = XMVectorSin(Theta); + + XMVECTOR S = XMVectorDivide(Theta, SinTheta); + + XMVECTOR Result = XMVectorMultiply(Q0, S); + Result = XMVectorSelect(Q0, Result, ControlW); + + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionExp(FXMVECTOR Q) noexcept +{ + XMVECTOR Theta = XMVector3Length(Q); + + XMVECTOR SinTheta, CosTheta; + XMVectorSinCos(&SinTheta, &CosTheta, Theta); + + XMVECTOR S = XMVectorDivide(SinTheta, Theta); + + XMVECTOR Result = XMVectorMultiply(Q, S); + + const XMVECTOR Zero = XMVectorZero(); + XMVECTOR Control = XMVectorNearEqual(Theta, Zero, g_XMEpsilon.v); + Result = XMVectorSelect(Result, Q, Control); + + Result = XMVectorSelect(CosTheta, Result, g_XMSelect1110.v); + + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionSlerp +( + FXMVECTOR Q0, + FXMVECTOR Q1, + float t +) noexcept +{ + XMVECTOR T = XMVectorReplicate(t); + return XMQuaternionSlerpV(Q0, Q1, T); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionSlerpV +( + FXMVECTOR Q0, + FXMVECTOR Q1, + FXMVECTOR T +) noexcept +{ + assert((XMVectorGetY(T) == XMVectorGetX(T)) && (XMVectorGetZ(T) == XMVectorGetX(T)) && (XMVectorGetW(T) == XMVectorGetX(T))); + + // Result = Q0 * sin((1.0 - t) * Omega) / sin(Omega) + Q1 * sin(t * Omega) / sin(Omega) + +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + + const XMVECTORF32 OneMinusEpsilon = { { { 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f } } }; + + XMVECTOR CosOmega = XMQuaternionDot(Q0, Q1); + + const XMVECTOR Zero = XMVectorZero(); + XMVECTOR Control = XMVectorLess(CosOmega, Zero); + XMVECTOR Sign = XMVectorSelect(g_XMOne.v, g_XMNegativeOne.v, Control); + + CosOmega = XMVectorMultiply(CosOmega, Sign); + + Control = XMVectorLess(CosOmega, OneMinusEpsilon); + + XMVECTOR SinOmega = XMVectorNegativeMultiplySubtract(CosOmega, CosOmega, g_XMOne.v); + 
SinOmega = XMVectorSqrt(SinOmega); + + XMVECTOR Omega = XMVectorATan2(SinOmega, CosOmega); + + XMVECTOR SignMask = XMVectorSplatSignMask(); + XMVECTOR V01 = XMVectorShiftLeft(T, Zero, 2); + SignMask = XMVectorShiftLeft(SignMask, Zero, 3); + V01 = XMVectorXorInt(V01, SignMask); + V01 = XMVectorAdd(g_XMIdentityR0.v, V01); + + XMVECTOR InvSinOmega = XMVectorReciprocal(SinOmega); + + XMVECTOR S0 = XMVectorMultiply(V01, Omega); + S0 = XMVectorSin(S0); + S0 = XMVectorMultiply(S0, InvSinOmega); + + S0 = XMVectorSelect(V01, S0, Control); + + XMVECTOR S1 = XMVectorSplatY(S0); + S0 = XMVectorSplatX(S0); + + S1 = XMVectorMultiply(S1, Sign); + + XMVECTOR Result = XMVectorMultiply(Q0, S0); + Result = XMVectorMultiplyAdd(Q1, S1, Result); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 OneMinusEpsilon = { { { 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f } } }; + static const XMVECTORU32 SignMask2 = { { { 0x80000000, 0x00000000, 0x00000000, 0x00000000 } } }; + + XMVECTOR CosOmega = XMQuaternionDot(Q0, Q1); + + const XMVECTOR Zero = XMVectorZero(); + XMVECTOR Control = XMVectorLess(CosOmega, Zero); + XMVECTOR Sign = XMVectorSelect(g_XMOne, g_XMNegativeOne, Control); + + CosOmega = _mm_mul_ps(CosOmega, Sign); + + Control = XMVectorLess(CosOmega, OneMinusEpsilon); + + XMVECTOR SinOmega = _mm_mul_ps(CosOmega, CosOmega); + SinOmega = _mm_sub_ps(g_XMOne, SinOmega); + SinOmega = _mm_sqrt_ps(SinOmega); + + XMVECTOR Omega = XMVectorATan2(SinOmega, CosOmega); + + XMVECTOR V01 = XM_PERMUTE_PS(T, _MM_SHUFFLE(2, 3, 0, 1)); + V01 = _mm_and_ps(V01, g_XMMaskXY); + V01 = _mm_xor_ps(V01, SignMask2); + V01 = _mm_add_ps(g_XMIdentityR0, V01); + + XMVECTOR S0 = _mm_mul_ps(V01, Omega); + S0 = XMVectorSin(S0); + S0 = _mm_div_ps(S0, SinOmega); + + S0 = XMVectorSelect(V01, S0, Control); + + XMVECTOR S1 = XMVectorSplatY(S0); + S0 = XMVectorSplatX(S0); + + S1 = _mm_mul_ps(S1, Sign); + XMVECTOR Result = _mm_mul_ps(Q0, S0); + S1 = _mm_mul_ps(S1, Q1); + Result = _mm_add_ps(Result, S1); + return Result; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionSquad +( + FXMVECTOR Q0, + FXMVECTOR Q1, + FXMVECTOR Q2, + GXMVECTOR Q3, + float t +) noexcept +{ + XMVECTOR T = XMVectorReplicate(t); + return XMQuaternionSquadV(Q0, Q1, Q2, Q3, T); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionSquadV +( + FXMVECTOR Q0, + FXMVECTOR Q1, + FXMVECTOR Q2, + GXMVECTOR Q3, + HXMVECTOR T +) noexcept +{ + assert((XMVectorGetY(T) == XMVectorGetX(T)) && (XMVectorGetZ(T) == XMVectorGetX(T)) && (XMVectorGetW(T) == XMVectorGetX(T))); + + XMVECTOR TP = T; + const XMVECTOR Two = XMVectorSplatConstant(2, 0); + + XMVECTOR Q03 = XMQuaternionSlerpV(Q0, Q3, T); + XMVECTOR Q12 = XMQuaternionSlerpV(Q1, Q2, T); + + TP = XMVectorNegativeMultiplySubtract(TP, TP, TP); + TP = XMVectorMultiply(TP, Two); + + XMVECTOR Result = XMQuaternionSlerpV(Q03, Q12, TP); + + return Result; +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMQuaternionSquadSetup +( + XMVECTOR* pA, + XMVECTOR* pB, + XMVECTOR* pC, + FXMVECTOR Q0, + FXMVECTOR Q1, + FXMVECTOR Q2, + GXMVECTOR Q3 +) noexcept +{ + assert(pA); + assert(pB); + assert(pC); + + XMVECTOR LS12 = XMQuaternionLengthSq(XMVectorAdd(Q1, Q2)); + XMVECTOR LD12 = XMQuaternionLengthSq(XMVectorSubtract(Q1, Q2)); + XMVECTOR SQ2 = 
XMVectorNegate(Q2); + + XMVECTOR Control1 = XMVectorLess(LS12, LD12); + SQ2 = XMVectorSelect(Q2, SQ2, Control1); + + XMVECTOR LS01 = XMQuaternionLengthSq(XMVectorAdd(Q0, Q1)); + XMVECTOR LD01 = XMQuaternionLengthSq(XMVectorSubtract(Q0, Q1)); + XMVECTOR SQ0 = XMVectorNegate(Q0); + + XMVECTOR LS23 = XMQuaternionLengthSq(XMVectorAdd(SQ2, Q3)); + XMVECTOR LD23 = XMQuaternionLengthSq(XMVectorSubtract(SQ2, Q3)); + XMVECTOR SQ3 = XMVectorNegate(Q3); + + XMVECTOR Control0 = XMVectorLess(LS01, LD01); + XMVECTOR Control2 = XMVectorLess(LS23, LD23); + + SQ0 = XMVectorSelect(Q0, SQ0, Control0); + SQ3 = XMVectorSelect(Q3, SQ3, Control2); + + XMVECTOR InvQ1 = XMQuaternionInverse(Q1); + XMVECTOR InvQ2 = XMQuaternionInverse(SQ2); + + XMVECTOR LnQ0 = XMQuaternionLn(XMQuaternionMultiply(InvQ1, SQ0)); + XMVECTOR LnQ2 = XMQuaternionLn(XMQuaternionMultiply(InvQ1, SQ2)); + XMVECTOR LnQ1 = XMQuaternionLn(XMQuaternionMultiply(InvQ2, Q1)); + XMVECTOR LnQ3 = XMQuaternionLn(XMQuaternionMultiply(InvQ2, SQ3)); + + const XMVECTOR NegativeOneQuarter = XMVectorSplatConstant(-1, 2); + + XMVECTOR ExpQ02 = XMVectorMultiply(XMVectorAdd(LnQ0, LnQ2), NegativeOneQuarter); + XMVECTOR ExpQ13 = XMVectorMultiply(XMVectorAdd(LnQ1, LnQ3), NegativeOneQuarter); + ExpQ02 = XMQuaternionExp(ExpQ02); + ExpQ13 = XMQuaternionExp(ExpQ13); + + *pA = XMQuaternionMultiply(Q1, ExpQ02); + *pB = XMQuaternionMultiply(SQ2, ExpQ13); + *pC = SQ2; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionBaryCentric +( + FXMVECTOR Q0, + FXMVECTOR Q1, + FXMVECTOR Q2, + float f, + float g +) noexcept +{ + float s = f + g; + + XMVECTOR Result; + if ((s < 0.00001f) && (s > -0.00001f)) + { + Result = Q0; + } + else + { + XMVECTOR Q01 = XMQuaternionSlerp(Q0, Q1, s); + XMVECTOR Q02 = XMQuaternionSlerp(Q0, Q2, s); + + Result = XMQuaternionSlerp(Q01, Q02, g / s); + } + + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionBaryCentricV +( + FXMVECTOR Q0, + FXMVECTOR Q1, + FXMVECTOR Q2, + GXMVECTOR F, + HXMVECTOR G +) noexcept +{ + assert((XMVectorGetY(F) == XMVectorGetX(F)) && (XMVectorGetZ(F) == XMVectorGetX(F)) && (XMVectorGetW(F) == XMVectorGetX(F))); + assert((XMVectorGetY(G) == XMVectorGetX(G)) && (XMVectorGetZ(G) == XMVectorGetX(G)) && (XMVectorGetW(G) == XMVectorGetX(G))); + + const XMVECTOR Epsilon = XMVectorSplatConstant(1, 16); + + XMVECTOR S = XMVectorAdd(F, G); + + XMVECTOR Result; + if (XMVector4InBounds(S, Epsilon)) + { + Result = Q0; + } + else + { + XMVECTOR Q01 = XMQuaternionSlerpV(Q0, Q1, S); + XMVECTOR Q02 = XMQuaternionSlerpV(Q0, Q2, S); + XMVECTOR GS = XMVectorReciprocal(S); + GS = XMVectorMultiply(G, GS); + + Result = XMQuaternionSlerpV(Q01, Q02, GS); + } + + return Result; +} + +//------------------------------------------------------------------------------ +// Transformation operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionIdentity() noexcept +{ + return g_XMIdentityR3.v; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionRotationRollPitchYaw +( + float Pitch, + float Yaw, + float Roll +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + const float halfpitch = Pitch * 0.5f; + float cp = cosf(halfpitch); + float sp = sinf(halfpitch); 
+
+    const float halfyaw = Yaw * 0.5f;
+    float cy = cosf(halfyaw);
+    float sy = sinf(halfyaw);
+
+    const float halfroll = Roll * 0.5f;
+    float cr = cosf(halfroll);
+    float sr = sinf(halfroll);
+
+    XMVECTORF32 vResult = { { {
+            cr * sp * cy + sr * cp * sy,
+            cr * cp * sy - sr * sp * cy,
+            sr * cp * cy - cr * sp * sy,
+            cr * cp * cy + sr * sp * sy
+        } } };
+    return vResult;
+#else
+    XMVECTOR Angles = XMVectorSet(Pitch, Yaw, Roll, 0.0f);
+    return XMQuaternionRotationRollPitchYawFromVector(Angles);
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMQuaternionRotationRollPitchYawFromVector
+(
+    FXMVECTOR Angles // <Pitch, Yaw, Roll, 0>
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    const float halfpitch = Angles.vector4_f32[0] * 0.5f;
+    float cp = cosf(halfpitch);
+    float sp = sinf(halfpitch);
+
+    const float halfyaw = Angles.vector4_f32[1] * 0.5f;
+    float cy = cosf(halfyaw);
+    float sy = sinf(halfyaw);
+
+    const float halfroll = Angles.vector4_f32[2] * 0.5f;
+    float cr = cosf(halfroll);
+    float sr = sinf(halfroll);
+
+    XMVECTORF32 vResult = { { {
+            cr * sp * cy + sr * cp * sy,
+            cr * cp * sy - sr * sp * cy,
+            sr * cp * cy - cr * sp * sy,
+            cr * cp * cy + sr * sp * sy
+        } } };
+    return vResult;
+#else
+    static const XMVECTORF32 Sign = { { { 1.0f, -1.0f, -1.0f, 1.0f } } };
+
+    XMVECTOR HalfAngles = XMVectorMultiply(Angles, g_XMOneHalf.v);
+
+    XMVECTOR SinAngles, CosAngles;
+    XMVectorSinCos(&SinAngles, &CosAngles, HalfAngles);
+
+    XMVECTOR P0 = XMVectorPermute<XM_PERMUTE_0X, XM_PERMUTE_1X, XM_PERMUTE_1X, XM_PERMUTE_1X>(SinAngles, CosAngles);
+    XMVECTOR Y0 = XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0Y, XM_PERMUTE_1Y, XM_PERMUTE_1Y>(SinAngles, CosAngles);
+    XMVECTOR R0 = XMVectorPermute<XM_PERMUTE_1Z, XM_PERMUTE_1Z, XM_PERMUTE_0Z, XM_PERMUTE_1Z>(SinAngles, CosAngles);
+    XMVECTOR P1 = XMVectorPermute<XM_PERMUTE_0X, XM_PERMUTE_1X, XM_PERMUTE_1X, XM_PERMUTE_1X>(CosAngles, SinAngles);
+    XMVECTOR Y1 = XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0Y, XM_PERMUTE_1Y, XM_PERMUTE_1Y>(CosAngles, SinAngles);
+    XMVECTOR R1 = XMVectorPermute<XM_PERMUTE_1Z, XM_PERMUTE_1Z, XM_PERMUTE_0Z, XM_PERMUTE_1Z>(CosAngles, SinAngles);
+
+    XMVECTOR Q1 = XMVectorMultiply(P1, Sign.v);
+    XMVECTOR Q0 = XMVectorMultiply(P0, Y0);
+    Q1 = XMVectorMultiply(Q1, Y1);
+    Q0 = XMVectorMultiply(Q0, R0);
+    XMVECTOR Q = XMVectorMultiplyAdd(Q1, R1, Q0);
+
+    return Q;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMQuaternionRotationNormal
+(
+    FXMVECTOR NormalAxis,
+    float Angle
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
+
+    XMVECTOR N = XMVectorSelect(g_XMOne.v, NormalAxis, g_XMSelect1110.v);
+
+    float SinV, CosV;
+    XMScalarSinCos(&SinV, &CosV, 0.5f * Angle);
+
+    XMVECTOR Scale = XMVectorSet(SinV, SinV, SinV, CosV);
+    return XMVectorMultiply(N, Scale);
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR N = _mm_and_ps(NormalAxis, g_XMMask3);
+    N = _mm_or_ps(N, g_XMIdentityR3);
+    XMVECTOR Scale = _mm_set_ps1(0.5f * Angle);
+    XMVECTOR vSine;
+    XMVECTOR vCosine;
+    XMVectorSinCos(&vSine, &vCosine, Scale);
+    Scale = _mm_and_ps(vSine, g_XMMask3);
+    vCosine = _mm_and_ps(vCosine, g_XMMaskW);
+    Scale = _mm_or_ps(Scale, vCosine);
+    N = _mm_mul_ps(N, Scale);
+    return N;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMQuaternionRotationAxis
+(
+    FXMVECTOR Axis,
+    float Angle
+) noexcept
+{
+    assert(!XMVector3Equal(Axis, XMVectorZero()));
+    assert(!XMVector3IsInfinite(Axis));
+
+    XMVECTOR Normal = XMVector3Normalize(Axis);
+    XMVECTOR Q = XMQuaternionRotationNormal(Normal, Angle);
+    return Q;
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMQuaternionRotationMatrix(FXMMATRIX M) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_) + + XMVECTORF32 q; + float r22 = M.m[2][2]; + if (r22 <= 0.f) // x^2 + y^2 >= z^2 + w^2 + { + float dif10 = M.m[1][1] - M.m[0][0]; + float omr22 = 1.f - r22; + if (dif10 <= 0.f) // x^2 >= y^2 + { + float fourXSqr = omr22 - dif10; + float inv4x = 0.5f / sqrtf(fourXSqr); + q.f[0] = fourXSqr * inv4x; + q.f[1] = (M.m[0][1] + M.m[1][0]) * inv4x; + q.f[2] = (M.m[0][2] + M.m[2][0]) * inv4x; + q.f[3] = (M.m[1][2] - M.m[2][1]) * inv4x; + } + else // y^2 >= x^2 + { + float fourYSqr = omr22 + dif10; + float inv4y = 0.5f / sqrtf(fourYSqr); + q.f[0] = (M.m[0][1] + M.m[1][0]) * inv4y; + q.f[1] = fourYSqr * inv4y; + q.f[2] = (M.m[1][2] + M.m[2][1]) * inv4y; + q.f[3] = (M.m[2][0] - M.m[0][2]) * inv4y; + } + } + else // z^2 + w^2 >= x^2 + y^2 + { + float sum10 = M.m[1][1] + M.m[0][0]; + float opr22 = 1.f + r22; + if (sum10 <= 0.f) // z^2 >= w^2 + { + float fourZSqr = opr22 - sum10; + float inv4z = 0.5f / sqrtf(fourZSqr); + q.f[0] = (M.m[0][2] + M.m[2][0]) * inv4z; + q.f[1] = (M.m[1][2] + M.m[2][1]) * inv4z; + q.f[2] = fourZSqr * inv4z; + q.f[3] = (M.m[0][1] - M.m[1][0]) * inv4z; + } + else // w^2 >= z^2 + { + float fourWSqr = opr22 + sum10; + float inv4w = 0.5f / sqrtf(fourWSqr); + q.f[0] = (M.m[1][2] - M.m[2][1]) * inv4w; + q.f[1] = (M.m[2][0] - M.m[0][2]) * inv4w; + q.f[2] = (M.m[0][1] - M.m[1][0]) * inv4w; + q.f[3] = fourWSqr * inv4w; + } + } + return q.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 XMPMMP = { { { +1.0f, -1.0f, -1.0f, +1.0f } } }; + static const XMVECTORF32 XMMPMP = { { { -1.0f, +1.0f, -1.0f, +1.0f } } }; + static const XMVECTORF32 XMMMPP = { { { -1.0f, -1.0f, +1.0f, +1.0f } } }; + static const XMVECTORU32 Select0110 = { { { XM_SELECT_0, XM_SELECT_1, XM_SELECT_1, XM_SELECT_0 } } }; + static const XMVECTORU32 Select0010 = { { { XM_SELECT_0, XM_SELECT_0, XM_SELECT_1, XM_SELECT_0 } } }; + + float32x4_t r0 = M.r[0]; + float32x4_t r1 = M.r[1]; + float32x4_t r2 = M.r[2]; + + float32x4_t r00 = vdupq_lane_f32(vget_low_f32(r0), 0); + float32x4_t r11 = vdupq_lane_f32(vget_low_f32(r1), 1); + float32x4_t r22 = vdupq_lane_f32(vget_high_f32(r2), 0); + + // x^2 >= y^2 equivalent to r11 - r00 <= 0 + float32x4_t r11mr00 = vsubq_f32(r11, r00); + uint32x4_t x2gey2 = vcleq_f32(r11mr00, g_XMZero); + + // z^2 >= w^2 equivalent to r11 + r00 <= 0 + float32x4_t r11pr00 = vaddq_f32(r11, r00); + uint32x4_t z2gew2 = vcleq_f32(r11pr00, g_XMZero); + + // x^2 + y^2 >= z^2 + w^2 equivalent to r22 <= 0 + uint32x4_t x2py2gez2pw2 = vcleq_f32(r22, g_XMZero); + + // (4*x^2, 4*y^2, 4*z^2, 4*w^2) + float32x4_t t0 = vmulq_f32(XMPMMP, r00); + float32x4_t x2y2z2w2 = vmlaq_f32(t0, XMMPMP, r11); + x2y2z2w2 = vmlaq_f32(x2y2z2w2, XMMMPP, r22); + x2y2z2w2 = vaddq_f32(x2y2z2w2, g_XMOne); + + // (r01, r02, r12, r11) + t0 = vextq_f32(r0, r0, 1); + float32x4_t t1 = vextq_f32(r1, r1, 1); + t0 = vcombine_f32(vget_low_f32(t0), vrev64_f32(vget_low_f32(t1))); + + // (r10, r20, r21, r10) + t1 = vextq_f32(r2, r2, 3); + float32x4_t r10 = vdupq_lane_f32(vget_low_f32(r1), 0); + t1 = vbslq_f32(Select0110, t1, r10); + + // (4*x*y, 4*x*z, 4*y*z, unused) + float32x4_t xyxzyz = vaddq_f32(t0, t1); + + // (r21, r20, r10, r10) + t0 = vcombine_f32(vrev64_f32(vget_low_f32(r2)), vget_low_f32(r10)); + + // (r12, r02, r01, r12) + float32x4_t t2 = vcombine_f32(vrev64_f32(vget_high_f32(r0)), vrev64_f32(vget_low_f32(r0))); + float32x4_t t3 = vdupq_lane_f32(vget_high_f32(r1), 0); + t1 = vbslq_f32(Select0110, t2, t3); + + // (4*x*w, 4*y*w, 4*z*w, unused) + float32x4_t xwywzw = vsubq_f32(t0, t1); + xwywzw = 
vmulq_f32(XMMPMP, xwywzw); + + // (4*x*x, 4*x*y, 4*x*z, 4*x*w) + t0 = vextq_f32(xyxzyz, xyxzyz, 3); + t1 = vbslq_f32(Select0110, t0, x2y2z2w2); + t2 = vdupq_lane_f32(vget_low_f32(xwywzw), 0); + float32x4_t tensor0 = vbslq_f32(g_XMSelect1110, t1, t2); + + // (4*y*x, 4*y*y, 4*y*z, 4*y*w) + t0 = vbslq_f32(g_XMSelect1011, xyxzyz, x2y2z2w2); + t1 = vdupq_lane_f32(vget_low_f32(xwywzw), 1); + float32x4_t tensor1 = vbslq_f32(g_XMSelect1110, t0, t1); + + // (4*z*x, 4*z*y, 4*z*z, 4*z*w) + t0 = vextq_f32(xyxzyz, xyxzyz, 1); + t1 = vcombine_f32(vget_low_f32(t0), vrev64_f32(vget_high_f32(xwywzw))); + float32x4_t tensor2 = vbslq_f32(Select0010, x2y2z2w2, t1); + + // (4*w*x, 4*w*y, 4*w*z, 4*w*w) + float32x4_t tensor3 = vbslq_f32(g_XMSelect1110, xwywzw, x2y2z2w2); + + // Select the row of the tensor-product matrix that has the largest + // magnitude. + t0 = vbslq_f32(x2gey2, tensor0, tensor1); + t1 = vbslq_f32(z2gew2, tensor2, tensor3); + t2 = vbslq_f32(x2py2gez2pw2, t0, t1); + + // Normalize the row. No division by zero is possible because the + // quaternion is unit-length (and the row is a nonzero multiple of + // the quaternion). + t0 = XMVector4Length(t2); + return XMVectorDivide(t2, t0); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 XMPMMP = { { { +1.0f, -1.0f, -1.0f, +1.0f } } }; + static const XMVECTORF32 XMMPMP = { { { -1.0f, +1.0f, -1.0f, +1.0f } } }; + static const XMVECTORF32 XMMMPP = { { { -1.0f, -1.0f, +1.0f, +1.0f } } }; + + XMVECTOR r0 = M.r[0]; // (r00, r01, r02, 0) + XMVECTOR r1 = M.r[1]; // (r10, r11, r12, 0) + XMVECTOR r2 = M.r[2]; // (r20, r21, r22, 0) + + // (r00, r00, r00, r00) + XMVECTOR r00 = XM_PERMUTE_PS(r0, _MM_SHUFFLE(0, 0, 0, 0)); + // (r11, r11, r11, r11) + XMVECTOR r11 = XM_PERMUTE_PS(r1, _MM_SHUFFLE(1, 1, 1, 1)); + // (r22, r22, r22, r22) + XMVECTOR r22 = XM_PERMUTE_PS(r2, _MM_SHUFFLE(2, 2, 2, 2)); + + // x^2 >= y^2 equivalent to r11 - r00 <= 0 + // (r11 - r00, r11 - r00, r11 - r00, r11 - r00) + XMVECTOR r11mr00 = _mm_sub_ps(r11, r00); + XMVECTOR x2gey2 = _mm_cmple_ps(r11mr00, g_XMZero); + + // z^2 >= w^2 equivalent to r11 + r00 <= 0 + // (r11 + r00, r11 + r00, r11 + r00, r11 + r00) + XMVECTOR r11pr00 = _mm_add_ps(r11, r00); + XMVECTOR z2gew2 = _mm_cmple_ps(r11pr00, g_XMZero); + + // x^2 + y^2 >= z^2 + w^2 equivalent to r22 <= 0 + XMVECTOR x2py2gez2pw2 = _mm_cmple_ps(r22, g_XMZero); + + // (4*x^2, 4*y^2, 4*z^2, 4*w^2) + XMVECTOR t0 = XM_FMADD_PS(XMPMMP, r00, g_XMOne); + XMVECTOR t1 = _mm_mul_ps(XMMPMP, r11); + XMVECTOR t2 = XM_FMADD_PS(XMMMPP, r22, t0); + XMVECTOR x2y2z2w2 = _mm_add_ps(t1, t2); + + // (r01, r02, r12, r11) + t0 = _mm_shuffle_ps(r0, r1, _MM_SHUFFLE(1, 2, 2, 1)); + // (r10, r10, r20, r21) + t1 = _mm_shuffle_ps(r1, r2, _MM_SHUFFLE(1, 0, 0, 0)); + // (r10, r20, r21, r10) + t1 = XM_PERMUTE_PS(t1, _MM_SHUFFLE(1, 3, 2, 0)); + // (4*x*y, 4*x*z, 4*y*z, unused) + XMVECTOR xyxzyz = _mm_add_ps(t0, t1); + + // (r21, r20, r10, r10) + t0 = _mm_shuffle_ps(r2, r1, _MM_SHUFFLE(0, 0, 0, 1)); + // (r12, r12, r02, r01) + t1 = _mm_shuffle_ps(r1, r0, _MM_SHUFFLE(1, 2, 2, 2)); + // (r12, r02, r01, r12) + t1 = XM_PERMUTE_PS(t1, _MM_SHUFFLE(1, 3, 2, 0)); + // (4*x*w, 4*y*w, 4*z*w, unused) + XMVECTOR xwywzw = _mm_sub_ps(t0, t1); + xwywzw = _mm_mul_ps(XMMPMP, xwywzw); + + // (4*x^2, 4*y^2, 4*x*y, unused) + t0 = _mm_shuffle_ps(x2y2z2w2, xyxzyz, _MM_SHUFFLE(0, 0, 1, 0)); + // (4*z^2, 4*w^2, 4*z*w, unused) + t1 = _mm_shuffle_ps(x2y2z2w2, xwywzw, _MM_SHUFFLE(0, 2, 3, 2)); + // (4*x*z, 4*y*z, 4*x*w, 4*y*w) + t2 = _mm_shuffle_ps(xyxzyz, xwywzw, _MM_SHUFFLE(1, 0, 2, 1)); + + 
// (4*x*x, 4*x*y, 4*x*z, 4*x*w) + XMVECTOR tensor0 = _mm_shuffle_ps(t0, t2, _MM_SHUFFLE(2, 0, 2, 0)); + // (4*y*x, 4*y*y, 4*y*z, 4*y*w) + XMVECTOR tensor1 = _mm_shuffle_ps(t0, t2, _MM_SHUFFLE(3, 1, 1, 2)); + // (4*z*x, 4*z*y, 4*z*z, 4*z*w) + XMVECTOR tensor2 = _mm_shuffle_ps(t2, t1, _MM_SHUFFLE(2, 0, 1, 0)); + // (4*w*x, 4*w*y, 4*w*z, 4*w*w) + XMVECTOR tensor3 = _mm_shuffle_ps(t2, t1, _MM_SHUFFLE(1, 2, 3, 2)); + + // Select the row of the tensor-product matrix that has the largest + // magnitude. + t0 = _mm_and_ps(x2gey2, tensor0); + t1 = _mm_andnot_ps(x2gey2, tensor1); + t0 = _mm_or_ps(t0, t1); + t1 = _mm_and_ps(z2gew2, tensor2); + t2 = _mm_andnot_ps(z2gew2, tensor3); + t1 = _mm_or_ps(t1, t2); + t0 = _mm_and_ps(x2py2gez2pw2, t0); + t1 = _mm_andnot_ps(x2py2gez2pw2, t1); + t2 = _mm_or_ps(t0, t1); + + // Normalize the row. No division by zero is possible because the + // quaternion is unit-length (and the row is a nonzero multiple of + // the quaternion). + t0 = XMVector4Length(t2); + return _mm_div_ps(t2, t0); +#endif +} + +//------------------------------------------------------------------------------ +// Conversion operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMQuaternionToAxisAngle +( + XMVECTOR* pAxis, + float* pAngle, + FXMVECTOR Q +) noexcept +{ + assert(pAxis); + assert(pAngle); + + *pAxis = Q; + + *pAngle = 2.0f * XMScalarACos(XMVectorGetW(Q)); +} + +/**************************************************************************** + * + * Plane + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ + // Comparison operations + //------------------------------------------------------------------------------ + + //------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMPlaneEqual +( + FXMVECTOR P1, + FXMVECTOR P2 +) noexcept +{ + return XMVector4Equal(P1, P2); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMPlaneNearEqual +( + FXMVECTOR P1, + FXMVECTOR P2, + FXMVECTOR Epsilon +) noexcept +{ + XMVECTOR NP1 = XMPlaneNormalize(P1); + XMVECTOR NP2 = XMPlaneNormalize(P2); + return XMVector4NearEqual(NP1, NP2, Epsilon); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMPlaneNotEqual +( + FXMVECTOR P1, + FXMVECTOR P2 +) noexcept +{ + return XMVector4NotEqual(P1, P2); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMPlaneIsNaN(FXMVECTOR P) noexcept +{ + return XMVector4IsNaN(P); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMPlaneIsInfinite(FXMVECTOR P) noexcept +{ + return XMVector4IsInfinite(P); +} + +//------------------------------------------------------------------------------ +// Computation operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMPlaneDot +( + FXMVECTOR P, + FXMVECTOR V +) noexcept +{ + return XMVector4Dot(P, V); +} + +//------------------------------------------------------------------------------ + +inline 
XMVECTOR XM_CALLCONV XMPlaneDotCoord +( + FXMVECTOR P, + FXMVECTOR V +) noexcept +{ + // Result = P[0] * V[0] + P[1] * V[1] + P[2] * V[2] + P[3] + + XMVECTOR V3 = XMVectorSelect(g_XMOne.v, V, g_XMSelect1110.v); + XMVECTOR Result = XMVector4Dot(P, V3); + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMPlaneDotNormal +( + FXMVECTOR P, + FXMVECTOR V +) noexcept +{ + return XMVector3Dot(P, V); +} + +//------------------------------------------------------------------------------ +// XMPlaneNormalizeEst uses a reciprocal estimate and +// returns QNaN on zero and infinite vectors. + +inline XMVECTOR XM_CALLCONV XMPlaneNormalizeEst(FXMVECTOR P) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + + XMVECTOR Result = XMVector3ReciprocalLengthEst(P); + return XMVectorMultiply(P, Result); + +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps(P, P, 0x7f); + XMVECTOR vResult = _mm_rsqrt_ps(vTemp); + return _mm_mul_ps(vResult, P); +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product + XMVECTOR vDot = _mm_mul_ps(P, P); + // x=Dot.y, y=Dot.z + XMVECTOR vTemp = XM_PERMUTE_PS(vDot, _MM_SHUFFLE(2, 1, 2, 1)); + // Result.x = x+y + vDot = _mm_add_ss(vDot, vTemp); + // x=Dot.z + vTemp = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(1, 1, 1, 1)); + // Result.x = (x+y)+z + vDot = _mm_add_ss(vDot, vTemp); + // Splat x + vDot = XM_PERMUTE_PS(vDot, _MM_SHUFFLE(0, 0, 0, 0)); + // Get the reciprocal + vDot = _mm_rsqrt_ps(vDot); + // Get the reciprocal + vDot = _mm_mul_ps(vDot, P); + return vDot; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMPlaneNormalize(FXMVECTOR P) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + float fLengthSq = sqrtf((P.vector4_f32[0] * P.vector4_f32[0]) + (P.vector4_f32[1] * P.vector4_f32[1]) + (P.vector4_f32[2] * P.vector4_f32[2])); + // Prevent divide by zero + if (fLengthSq > 0) + { + fLengthSq = 1.0f / fLengthSq; + } + XMVECTORF32 vResult = { { { + P.vector4_f32[0] * fLengthSq, + P.vector4_f32[1] * fLengthSq, + P.vector4_f32[2] * fLengthSq, + P.vector4_f32[3] * fLengthSq + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR vLength = XMVector3ReciprocalLength(P); + return XMVectorMultiply(P, vLength); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vLengthSq = _mm_dp_ps(P, P, 0x7f); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity); + // Reciprocal mul to perform the normalization + vResult = _mm_div_ps(P, vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult, vLengthSq); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y and z only + XMVECTOR vLengthSq = _mm_mul_ps(P, P); + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(2, 1, 2, 1)); + vLengthSq = _mm_add_ss(vLengthSq, vTemp); + vTemp = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(1, 1, 1, 1)); + vLengthSq = _mm_add_ss(vLengthSq, vTemp); + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity); + // Reciprocal mul to perform 
the normalization + vResult = _mm_div_ps(P, vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult, vLengthSq); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMPlaneIntersectLine +( + FXMVECTOR P, + FXMVECTOR LinePoint1, + FXMVECTOR LinePoint2 +) noexcept +{ + XMVECTOR V1 = XMVector3Dot(P, LinePoint1); + XMVECTOR V2 = XMVector3Dot(P, LinePoint2); + XMVECTOR D = XMVectorSubtract(V1, V2); + + XMVECTOR VT = XMPlaneDotCoord(P, LinePoint1); + VT = XMVectorDivide(VT, D); + + XMVECTOR Point = XMVectorSubtract(LinePoint2, LinePoint1); + Point = XMVectorMultiplyAdd(Point, VT, LinePoint1); + + const XMVECTOR Zero = XMVectorZero(); + XMVECTOR Control = XMVectorNearEqual(D, Zero, g_XMEpsilon.v); + + return XMVectorSelect(Point, g_XMQNaN.v, Control); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMPlaneIntersectPlane +( + XMVECTOR* pLinePoint1, + XMVECTOR* pLinePoint2, + FXMVECTOR P1, + FXMVECTOR P2 +) noexcept +{ + assert(pLinePoint1); + assert(pLinePoint2); + + XMVECTOR V1 = XMVector3Cross(P2, P1); + + XMVECTOR LengthSq = XMVector3LengthSq(V1); + + XMVECTOR V2 = XMVector3Cross(P2, V1); + + XMVECTOR P1W = XMVectorSplatW(P1); + XMVECTOR Point = XMVectorMultiply(V2, P1W); + + XMVECTOR V3 = XMVector3Cross(V1, P1); + + XMVECTOR P2W = XMVectorSplatW(P2); + Point = XMVectorMultiplyAdd(V3, P2W, Point); + + XMVECTOR LinePoint1 = XMVectorDivide(Point, LengthSq); + + XMVECTOR LinePoint2 = XMVectorAdd(LinePoint1, V1); + + XMVECTOR Control = XMVectorLessOrEqual(LengthSq, g_XMEpsilon.v); + *pLinePoint1 = XMVectorSelect(LinePoint1, g_XMQNaN.v, Control); + *pLinePoint2 = XMVectorSelect(LinePoint2, g_XMQNaN.v, Control); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMPlaneTransform +( + FXMVECTOR P, + FXMMATRIX ITM +) noexcept +{ + XMVECTOR W = XMVectorSplatW(P); + XMVECTOR Z = XMVectorSplatZ(P); + XMVECTOR Y = XMVectorSplatY(P); + XMVECTOR X = XMVectorSplatX(P); + + XMVECTOR Result = XMVectorMultiply(W, ITM.r[3]); + Result = XMVectorMultiplyAdd(Z, ITM.r[2], Result); + Result = XMVectorMultiplyAdd(Y, ITM.r[1], Result); + Result = XMVectorMultiplyAdd(X, ITM.r[0], Result); + return Result; +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMFLOAT4* XM_CALLCONV XMPlaneTransformStream +( + XMFLOAT4* pOutputStream, + size_t OutputStride, + const XMFLOAT4* pInputStream, + size_t InputStride, + size_t PlaneCount, + FXMMATRIX ITM +) noexcept +{ + return XMVector4TransformStream(pOutputStream, + OutputStride, + pInputStream, + InputStride, + PlaneCount, + ITM); +} + +//------------------------------------------------------------------------------ +// Conversion operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMPlaneFromPointNormal +( + FXMVECTOR Point, + FXMVECTOR Normal +) noexcept +{ + XMVECTOR W = XMVector3Dot(Point, Normal); + W = XMVectorNegate(W); + return XMVectorSelect(W, Normal, g_XMSelect1110.v); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMPlaneFromPoints +( + FXMVECTOR Point1, + FXMVECTOR Point2, + FXMVECTOR Point3 +) noexcept +{ + 
XMVECTOR V21 = XMVectorSubtract(Point1, Point2); + XMVECTOR V31 = XMVectorSubtract(Point1, Point3); + + XMVECTOR N = XMVector3Cross(V21, V31); + N = XMVector3Normalize(N); + + XMVECTOR D = XMPlaneDotNormal(N, Point1); + D = XMVectorNegate(D); + + XMVECTOR Result = XMVectorSelect(D, N, g_XMSelect1110.v); + + return Result; +} + +/**************************************************************************** + * + * Color + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ + // Comparison operations + //------------------------------------------------------------------------------ + + //------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMColorEqual +( + FXMVECTOR C1, + FXMVECTOR C2 +) noexcept +{ + return XMVector4Equal(C1, C2); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMColorNotEqual +( + FXMVECTOR C1, + FXMVECTOR C2 +) noexcept +{ + return XMVector4NotEqual(C1, C2); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMColorGreater +( + FXMVECTOR C1, + FXMVECTOR C2 +) noexcept +{ + return XMVector4Greater(C1, C2); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMColorGreaterOrEqual +( + FXMVECTOR C1, + FXMVECTOR C2 +) noexcept +{ + return XMVector4GreaterOrEqual(C1, C2); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMColorLess +( + FXMVECTOR C1, + FXMVECTOR C2 +) noexcept +{ + return XMVector4Less(C1, C2); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMColorLessOrEqual +( + FXMVECTOR C1, + FXMVECTOR C2 +) noexcept +{ + return XMVector4LessOrEqual(C1, C2); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMColorIsNaN(FXMVECTOR C) noexcept +{ + return XMVector4IsNaN(C); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMColorIsInfinite(FXMVECTOR C) noexcept +{ + return XMVector4IsInfinite(C); +} + +//------------------------------------------------------------------------------ +// Computation operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorNegative(FXMVECTOR vColor) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { { { + 1.0f - vColor.vector4_f32[0], + 1.0f - vColor.vector4_f32[1], + 1.0f - vColor.vector4_f32[2], + vColor.vector4_f32[3] + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vTemp = veorq_u32(vreinterpretq_u32_f32(vColor), g_XMNegate3); + return vaddq_f32(vreinterpretq_f32_u32(vTemp), g_XMOne3); +#elif defined(_XM_SSE_INTRINSICS_) + // Negate only x,y and z. 
+ XMVECTOR vTemp = _mm_xor_ps(vColor, g_XMNegate3); + // Add 1,1,1,0 to -x,-y,-z,w + return _mm_add_ps(vTemp, g_XMOne3); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorModulate +( + FXMVECTOR C1, + FXMVECTOR C2 +) noexcept +{ + return XMVectorMultiply(C1, C2); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorAdjustSaturation +( + FXMVECTOR vColor, + float fSaturation +) noexcept +{ + // Luminance = 0.2125f * C[0] + 0.7154f * C[1] + 0.0721f * C[2]; + // Result = (C - Luminance) * Saturation + Luminance; + + const XMVECTORF32 gvLuminance = { { { 0.2125f, 0.7154f, 0.0721f, 0.0f } } }; +#if defined(_XM_NO_INTRINSICS_) + float fLuminance = (vColor.vector4_f32[0] * gvLuminance.f[0]) + (vColor.vector4_f32[1] * gvLuminance.f[1]) + (vColor.vector4_f32[2] * gvLuminance.f[2]); + XMVECTOR vResult; + vResult.vector4_f32[0] = ((vColor.vector4_f32[0] - fLuminance) * fSaturation) + fLuminance; + vResult.vector4_f32[1] = ((vColor.vector4_f32[1] - fLuminance) * fSaturation) + fLuminance; + vResult.vector4_f32[2] = ((vColor.vector4_f32[2] - fLuminance) * fSaturation) + fLuminance; + vResult.vector4_f32[3] = vColor.vector4_f32[3]; + return vResult; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR vLuminance = XMVector3Dot(vColor, gvLuminance); + XMVECTOR vResult = vsubq_f32(vColor, vLuminance); + vResult = vmlaq_n_f32(vLuminance, vResult, fSaturation); + return vbslq_f32(g_XMSelect1110, vResult, vColor); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vLuminance = XMVector3Dot(vColor, gvLuminance); + // Splat fSaturation + XMVECTOR vSaturation = _mm_set_ps1(fSaturation); + // vResult = ((vColor-vLuminance)*vSaturation)+vLuminance; + XMVECTOR vResult = _mm_sub_ps(vColor, vLuminance); + vResult = XM_FMADD_PS(vResult, vSaturation, vLuminance); + // Retain w from the source color + vLuminance = _mm_shuffle_ps(vResult, vColor, _MM_SHUFFLE(3, 2, 2, 2)); // x = vResult.z,y = vResult.z,z = vColor.z,w=vColor.w + vResult = _mm_shuffle_ps(vResult, vLuminance, _MM_SHUFFLE(3, 0, 1, 0)); // x = vResult.x,y = vResult.y,z = vResult.z,w=vColor.w + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorAdjustContrast +( + FXMVECTOR vColor, + float fContrast +) noexcept +{ + // Result = (vColor - 0.5f) * fContrast + 0.5f; + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { { { + ((vColor.vector4_f32[0] - 0.5f) * fContrast) + 0.5f, + ((vColor.vector4_f32[1] - 0.5f) * fContrast) + 0.5f, + ((vColor.vector4_f32[2] - 0.5f) * fContrast) + 0.5f, + vColor.vector4_f32[3] // Leave W untouched + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR vResult = vsubq_f32(vColor, g_XMOneHalf.v); + vResult = vmlaq_n_f32(g_XMOneHalf.v, vResult, fContrast); + return vbslq_f32(g_XMSelect1110, vResult, vColor); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vScale = _mm_set_ps1(fContrast); // Splat the scale + XMVECTOR vResult = _mm_sub_ps(vColor, g_XMOneHalf); // Subtract 0.5f from the source (Saving source) + vResult = XM_FMADD_PS(vResult, vScale, g_XMOneHalf); +// Retain w from the source color + vScale = _mm_shuffle_ps(vResult, vColor, _MM_SHUFFLE(3, 2, 2, 2)); // x = vResult.z,y = vResult.z,z = vColor.z,w=vColor.w + vResult = _mm_shuffle_ps(vResult, vScale, _MM_SHUFFLE(3, 0, 1, 0)); // x = vResult.x,y = vResult.y,z = vResult.z,w=vColor.w + return vResult; 
+#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorRGBToHSL(FXMVECTOR rgb) noexcept +{ + XMVECTOR r = XMVectorSplatX(rgb); + XMVECTOR g = XMVectorSplatY(rgb); + XMVECTOR b = XMVectorSplatZ(rgb); + + XMVECTOR min = XMVectorMin(r, XMVectorMin(g, b)); + XMVECTOR max = XMVectorMax(r, XMVectorMax(g, b)); + + XMVECTOR l = XMVectorMultiply(XMVectorAdd(min, max), g_XMOneHalf); + + XMVECTOR d = XMVectorSubtract(max, min); + + XMVECTOR la = XMVectorSelect(rgb, l, g_XMSelect1110); + + if (XMVector3Less(d, g_XMEpsilon)) + { + // Achromatic, assume H and S of 0 + return XMVectorSelect(la, g_XMZero, g_XMSelect1100); + } + else + { + XMVECTOR s, h; + + XMVECTOR d2 = XMVectorAdd(min, max); + + if (XMVector3Greater(l, g_XMOneHalf)) + { + // d / (2-max-min) + s = XMVectorDivide(d, XMVectorSubtract(g_XMTwo, d2)); + } + else + { + // d / (max+min) + s = XMVectorDivide(d, d2); + } + + if (XMVector3Equal(r, max)) + { + // Red is max + h = XMVectorDivide(XMVectorSubtract(g, b), d); + } + else if (XMVector3Equal(g, max)) + { + // Green is max + h = XMVectorDivide(XMVectorSubtract(b, r), d); + h = XMVectorAdd(h, g_XMTwo); + } + else + { + // Blue is max + h = XMVectorDivide(XMVectorSubtract(r, g), d); + h = XMVectorAdd(h, g_XMFour); + } + + h = XMVectorDivide(h, g_XMSix); + + if (XMVector3Less(h, g_XMZero)) + h = XMVectorAdd(h, g_XMOne); + + XMVECTOR lha = XMVectorSelect(la, h, g_XMSelect1100); + return XMVectorSelect(s, lha, g_XMSelect1011); + } +} + +//------------------------------------------------------------------------------ + +namespace Internal +{ + + inline XMVECTOR XM_CALLCONV XMColorHue2Clr(FXMVECTOR p, FXMVECTOR q, FXMVECTOR h) noexcept + { + static const XMVECTORF32 oneSixth = { { { 1.0f / 6.0f, 1.0f / 6.0f, 1.0f / 6.0f, 1.0f / 6.0f } } }; + static const XMVECTORF32 twoThirds = { { { 2.0f / 3.0f, 2.0f / 3.0f, 2.0f / 3.0f, 2.0f / 3.0f } } }; + + XMVECTOR t = h; + + if (XMVector3Less(t, g_XMZero)) + t = XMVectorAdd(t, g_XMOne); + + if (XMVector3Greater(t, g_XMOne)) + t = XMVectorSubtract(t, g_XMOne); + + if (XMVector3Less(t, oneSixth)) + { + // p + (q - p) * 6 * t + XMVECTOR t1 = XMVectorSubtract(q, p); + XMVECTOR t2 = XMVectorMultiply(g_XMSix, t); + return XMVectorMultiplyAdd(t1, t2, p); + } + + if (XMVector3Less(t, g_XMOneHalf)) + return q; + + if (XMVector3Less(t, twoThirds)) + { + // p + (q - p) * 6 * (2/3 - t) + XMVECTOR t1 = XMVectorSubtract(q, p); + XMVECTOR t2 = XMVectorMultiply(g_XMSix, XMVectorSubtract(twoThirds, t)); + return XMVectorMultiplyAdd(t1, t2, p); + } + + return p; + } + +} // namespace Internal + +inline XMVECTOR XM_CALLCONV XMColorHSLToRGB(FXMVECTOR hsl) noexcept +{ + static const XMVECTORF32 oneThird = { { { 1.0f / 3.0f, 1.0f / 3.0f, 1.0f / 3.0f, 1.0f / 3.0f } } }; + + XMVECTOR s = XMVectorSplatY(hsl); + XMVECTOR l = XMVectorSplatZ(hsl); + + if (XMVector3NearEqual(s, g_XMZero, g_XMEpsilon)) + { + // Achromatic + return XMVectorSelect(hsl, l, g_XMSelect1110); + } + else + { + XMVECTOR h = XMVectorSplatX(hsl); + + XMVECTOR q; + if (XMVector3Less(l, g_XMOneHalf)) + { + q = XMVectorMultiply(l, XMVectorAdd(g_XMOne, s)); + } + else + { + q = XMVectorSubtract(XMVectorAdd(l, s), XMVectorMultiply(l, s)); + } + + XMVECTOR p = XMVectorSubtract(XMVectorMultiply(g_XMTwo, l), q); + + XMVECTOR r = DirectX::Internal::XMColorHue2Clr(p, q, XMVectorAdd(h, oneThird)); + XMVECTOR g = DirectX::Internal::XMColorHue2Clr(p, q, h); + XMVECTOR b = DirectX::Internal::XMColorHue2Clr(p, q, XMVectorSubtract(h, oneThird)); + + 
XMVECTOR rg = XMVectorSelect(g, r, g_XMSelect1000); + XMVECTOR ba = XMVectorSelect(hsl, b, g_XMSelect1110); + + return XMVectorSelect(ba, rg, g_XMSelect1100); + } +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorRGBToHSV(FXMVECTOR rgb) noexcept +{ + XMVECTOR r = XMVectorSplatX(rgb); + XMVECTOR g = XMVectorSplatY(rgb); + XMVECTOR b = XMVectorSplatZ(rgb); + + XMVECTOR min = XMVectorMin(r, XMVectorMin(g, b)); + XMVECTOR v = XMVectorMax(r, XMVectorMax(g, b)); + + XMVECTOR d = XMVectorSubtract(v, min); + + XMVECTOR s = (XMVector3NearEqual(v, g_XMZero, g_XMEpsilon)) ? g_XMZero : XMVectorDivide(d, v); + + if (XMVector3Less(d, g_XMEpsilon)) + { + // Achromatic, assume H of 0 + XMVECTOR hv = XMVectorSelect(v, g_XMZero, g_XMSelect1000); + XMVECTOR hva = XMVectorSelect(rgb, hv, g_XMSelect1110); + return XMVectorSelect(s, hva, g_XMSelect1011); + } + else + { + XMVECTOR h; + + if (XMVector3Equal(r, v)) + { + // Red is max + h = XMVectorDivide(XMVectorSubtract(g, b), d); + + if (XMVector3Less(g, b)) + h = XMVectorAdd(h, g_XMSix); + } + else if (XMVector3Equal(g, v)) + { + // Green is max + h = XMVectorDivide(XMVectorSubtract(b, r), d); + h = XMVectorAdd(h, g_XMTwo); + } + else + { + // Blue is max + h = XMVectorDivide(XMVectorSubtract(r, g), d); + h = XMVectorAdd(h, g_XMFour); + } + + h = XMVectorDivide(h, g_XMSix); + + XMVECTOR hv = XMVectorSelect(v, h, g_XMSelect1000); + XMVECTOR hva = XMVectorSelect(rgb, hv, g_XMSelect1110); + return XMVectorSelect(s, hva, g_XMSelect1011); + } +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorHSVToRGB(FXMVECTOR hsv) noexcept +{ + XMVECTOR h = XMVectorSplatX(hsv); + XMVECTOR s = XMVectorSplatY(hsv); + XMVECTOR v = XMVectorSplatZ(hsv); + + XMVECTOR h6 = XMVectorMultiply(h, g_XMSix); + + XMVECTOR i = XMVectorFloor(h6); + XMVECTOR f = XMVectorSubtract(h6, i); + + // p = v* (1-s) + XMVECTOR p = XMVectorMultiply(v, XMVectorSubtract(g_XMOne, s)); + + // q = v*(1-f*s) + XMVECTOR q = XMVectorMultiply(v, XMVectorSubtract(g_XMOne, XMVectorMultiply(f, s))); + + // t = v*(1 - (1-f)*s) + XMVECTOR t = XMVectorMultiply(v, XMVectorSubtract(g_XMOne, XMVectorMultiply(XMVectorSubtract(g_XMOne, f), s))); + + auto ii = static_cast(XMVectorGetX(XMVectorMod(i, g_XMSix))); + + XMVECTOR _rgb; + + switch (ii) + { + case 0: // rgb = vtp + { + XMVECTOR vt = XMVectorSelect(t, v, g_XMSelect1000); + _rgb = XMVectorSelect(p, vt, g_XMSelect1100); + } + break; + case 1: // rgb = qvp + { + XMVECTOR qv = XMVectorSelect(v, q, g_XMSelect1000); + _rgb = XMVectorSelect(p, qv, g_XMSelect1100); + } + break; + case 2: // rgb = pvt + { + XMVECTOR pv = XMVectorSelect(v, p, g_XMSelect1000); + _rgb = XMVectorSelect(t, pv, g_XMSelect1100); + } + break; + case 3: // rgb = pqv + { + XMVECTOR pq = XMVectorSelect(q, p, g_XMSelect1000); + _rgb = XMVectorSelect(v, pq, g_XMSelect1100); + } + break; + case 4: // rgb = tpv + { + XMVECTOR tp = XMVectorSelect(p, t, g_XMSelect1000); + _rgb = XMVectorSelect(v, tp, g_XMSelect1100); + } + break; + default: // rgb = vpq + { + XMVECTOR vp = XMVectorSelect(p, v, g_XMSelect1000); + _rgb = XMVectorSelect(q, vp, g_XMSelect1100); + } + break; + } + + return XMVectorSelect(hsv, _rgb, g_XMSelect1110); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorRGBToYUV(FXMVECTOR rgb) noexcept +{ + static const XMVECTORF32 Scale0 = { { { 0.299f, -0.147f, 0.615f, 0.0f } } }; + 
static const XMVECTORF32 Scale1 = { { { 0.587f, -0.289f, -0.515f, 0.0f } } }; + static const XMVECTORF32 Scale2 = { { { 0.114f, 0.436f, -0.100f, 0.0f } } }; + + XMMATRIX M(Scale0, Scale1, Scale2, g_XMZero); + XMVECTOR clr = XMVector3Transform(rgb, M); + + return XMVectorSelect(rgb, clr, g_XMSelect1110); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorYUVToRGB(FXMVECTOR yuv) noexcept +{ + static const XMVECTORF32 Scale1 = { { { 0.0f, -0.395f, 2.032f, 0.0f } } }; + static const XMVECTORF32 Scale2 = { { { 1.140f, -0.581f, 0.0f, 0.0f } } }; + + XMMATRIX M(g_XMOne, Scale1, Scale2, g_XMZero); + XMVECTOR clr = XMVector3Transform(yuv, M); + + return XMVectorSelect(yuv, clr, g_XMSelect1110); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorRGBToYUV_HD(FXMVECTOR rgb) noexcept +{ + static const XMVECTORF32 Scale0 = { { { 0.2126f, -0.0997f, 0.6150f, 0.0f } } }; + static const XMVECTORF32 Scale1 = { { { 0.7152f, -0.3354f, -0.5586f, 0.0f } } }; + static const XMVECTORF32 Scale2 = { { { 0.0722f, 0.4351f, -0.0564f, 0.0f } } }; + + XMMATRIX M(Scale0, Scale1, Scale2, g_XMZero); + XMVECTOR clr = XMVector3Transform(rgb, M); + + return XMVectorSelect(rgb, clr, g_XMSelect1110); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorYUVToRGB_HD(FXMVECTOR yuv) noexcept +{ + static const XMVECTORF32 Scale1 = { { { 0.0f, -0.2153f, 2.1324f, 0.0f } } }; + static const XMVECTORF32 Scale2 = { { { 1.2803f, -0.3806f, 0.0f, 0.0f } } }; + + XMMATRIX M(g_XMOne, Scale1, Scale2, g_XMZero); + XMVECTOR clr = XMVector3Transform(yuv, M); + + return XMVectorSelect(yuv, clr, g_XMSelect1110); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorRGBToYUV_UHD(FXMVECTOR rgb) noexcept +{ + static const XMVECTORF32 Scale0 = { { { 0.2627f, -0.1215f, 0.6150f, 0.0f } } }; + static const XMVECTORF32 Scale1 = { { { 0.6780f, -0.3136f, -0.5655f, 0.0f } } }; + static const XMVECTORF32 Scale2 = { { { 0.0593f, 0.4351f, -0.0495f, 0.0f } } }; + + XMMATRIX M(Scale0, Scale1, Scale2, g_XMZero); + XMVECTOR clr = XMVector3Transform(rgb, M); + + return XMVectorSelect(rgb, clr, g_XMSelect1110); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorYUVToRGB_UHD(FXMVECTOR yuv) noexcept +{ + static const XMVECTORF32 Scale1 = { { { 0.0f, -0.1891f, 2.1620f, 0.0f } } }; + static const XMVECTORF32 Scale2 = { { { 1.1989f, -0.4645f, 0.0f, 0.0f } } }; + + XMMATRIX M(g_XMOne, Scale1, Scale2, g_XMZero); + XMVECTOR clr = XMVector3Transform(yuv, M); + + return XMVectorSelect(yuv, clr, g_XMSelect1110); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorRGBToXYZ(FXMVECTOR rgb) noexcept +{ + static const XMVECTORF32 Scale0 = { { { 0.4887180f, 0.1762044f, 0.0000000f, 0.0f } } }; + static const XMVECTORF32 Scale1 = { { { 0.3106803f, 0.8129847f, 0.0102048f, 0.0f } } }; + static const XMVECTORF32 Scale2 = { { { 0.2006017f, 0.0108109f, 0.9897952f, 0.0f } } }; + static const XMVECTORF32 Scale = { { { 1.f / 0.17697f, 1.f / 0.17697f, 1.f / 0.17697f, 0.0f } } }; + + XMMATRIX M(Scale0, Scale1, Scale2, g_XMZero); + XMVECTOR clr = XMVectorMultiply(XMVector3Transform(rgb, M), Scale); + + return XMVectorSelect(rgb, clr, g_XMSelect1110); +} + +inline 
XMVECTOR XM_CALLCONV XMColorXYZToRGB(FXMVECTOR xyz) noexcept +{ + static const XMVECTORF32 Scale0 = { { { 2.3706743f, -0.5138850f, 0.0052982f, 0.0f } } }; + static const XMVECTORF32 Scale1 = { { { -0.9000405f, 1.4253036f, -0.0146949f, 0.0f } } }; + static const XMVECTORF32 Scale2 = { { { -0.4706338f, 0.0885814f, 1.0093968f, 0.0f } } }; + static const XMVECTORF32 Scale = { { { 0.17697f, 0.17697f, 0.17697f, 0.0f } } }; + + XMMATRIX M(Scale0, Scale1, Scale2, g_XMZero); + XMVECTOR clr = XMVector3Transform(XMVectorMultiply(xyz, Scale), M); + + return XMVectorSelect(xyz, clr, g_XMSelect1110); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorXYZToSRGB(FXMVECTOR xyz) noexcept +{ + static const XMVECTORF32 Scale0 = { { { 3.2406f, -0.9689f, 0.0557f, 0.0f } } }; + static const XMVECTORF32 Scale1 = { { { -1.5372f, 1.8758f, -0.2040f, 0.0f } } }; + static const XMVECTORF32 Scale2 = { { { -0.4986f, 0.0415f, 1.0570f, 0.0f } } }; + static const XMVECTORF32 Cutoff = { { { 0.0031308f, 0.0031308f, 0.0031308f, 0.0f } } }; + static const XMVECTORF32 Exp = { { { 1.0f / 2.4f, 1.0f / 2.4f, 1.0f / 2.4f, 1.0f } } }; + + XMMATRIX M(Scale0, Scale1, Scale2, g_XMZero); + XMVECTOR lclr = XMVector3Transform(xyz, M); + + XMVECTOR sel = XMVectorGreater(lclr, Cutoff); + + // clr = 12.92 * lclr for lclr <= 0.0031308f + XMVECTOR smallC = XMVectorMultiply(lclr, g_XMsrgbScale); + + // clr = (1+a)*pow(lclr, 1/2.4) - a for lclr > 0.0031308 (where a = 0.055) + XMVECTOR largeC = XMVectorSubtract(XMVectorMultiply(g_XMsrgbA1, XMVectorPow(lclr, Exp)), g_XMsrgbA); + + XMVECTOR clr = XMVectorSelect(smallC, largeC, sel); + + return XMVectorSelect(xyz, clr, g_XMSelect1110); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorSRGBToXYZ(FXMVECTOR srgb) noexcept +{ + static const XMVECTORF32 Scale0 = { { { 0.4124f, 0.2126f, 0.0193f, 0.0f } } }; + static const XMVECTORF32 Scale1 = { { { 0.3576f, 0.7152f, 0.1192f, 0.0f } } }; + static const XMVECTORF32 Scale2 = { { { 0.1805f, 0.0722f, 0.9505f, 0.0f } } }; + static const XMVECTORF32 Cutoff = { { { 0.04045f, 0.04045f, 0.04045f, 0.0f } } }; + static const XMVECTORF32 Exp = { { { 2.4f, 2.4f, 2.4f, 1.0f } } }; + + XMVECTOR sel = XMVectorGreater(srgb, Cutoff); + + // lclr = clr / 12.92 + XMVECTOR smallC = XMVectorDivide(srgb, g_XMsrgbScale); + + // lclr = pow( (clr + a) / (1+a), 2.4 ) + XMVECTOR largeC = XMVectorPow(XMVectorDivide(XMVectorAdd(srgb, g_XMsrgbA), g_XMsrgbA1), Exp); + + XMVECTOR lclr = XMVectorSelect(smallC, largeC, sel); + + XMMATRIX M(Scale0, Scale1, Scale2, g_XMZero); + XMVECTOR clr = XMVector3Transform(lclr, M); + + return XMVectorSelect(srgb, clr, g_XMSelect1110); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorRGBToSRGB(FXMVECTOR rgb) noexcept +{ + static const XMVECTORF32 Cutoff = { { { 0.0031308f, 0.0031308f, 0.0031308f, 1.f } } }; + static const XMVECTORF32 Linear = { { { 12.92f, 12.92f, 12.92f, 1.f } } }; + static const XMVECTORF32 Scale = { { { 1.055f, 1.055f, 1.055f, 1.f } } }; + static const XMVECTORF32 Bias = { { { 0.055f, 0.055f, 0.055f, 0.f } } }; + static const XMVECTORF32 InvGamma = { { { 1.0f / 2.4f, 1.0f / 2.4f, 1.0f / 2.4f, 1.f } } }; + + XMVECTOR V = XMVectorSaturate(rgb); + XMVECTOR V0 = XMVectorMultiply(V, Linear); + XMVECTOR V1 = XMVectorSubtract(XMVectorMultiply(Scale, XMVectorPow(V, InvGamma)), Bias); + XMVECTOR select = XMVectorLess(V, 
Cutoff); + V = XMVectorSelect(V1, V0, select); + return XMVectorSelect(rgb, V, g_XMSelect1110); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorSRGBToRGB(FXMVECTOR srgb) noexcept +{ + static const XMVECTORF32 Cutoff = { { { 0.04045f, 0.04045f, 0.04045f, 1.f } } }; + static const XMVECTORF32 ILinear = { { { 1.f / 12.92f, 1.f / 12.92f, 1.f / 12.92f, 1.f } } }; + static const XMVECTORF32 Scale = { { { 1.f / 1.055f, 1.f / 1.055f, 1.f / 1.055f, 1.f } } }; + static const XMVECTORF32 Bias = { { { 0.055f, 0.055f, 0.055f, 0.f } } }; + static const XMVECTORF32 Gamma = { { { 2.4f, 2.4f, 2.4f, 1.f } } }; + + XMVECTOR V = XMVectorSaturate(srgb); + XMVECTOR V0 = XMVectorMultiply(V, ILinear); + XMVECTOR V1 = XMVectorPow(XMVectorMultiply(XMVectorAdd(V, Bias), Scale), Gamma); + XMVECTOR select = XMVectorGreater(V, Cutoff); + V = XMVectorSelect(V0, V1, select); + return XMVectorSelect(srgb, V, g_XMSelect1110); +} + +/**************************************************************************** + * + * Miscellaneous + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ + +inline bool XMVerifyCPUSupport() noexcept +{ +#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + int CPUInfo[4] = { -1 }; +#if defined(__clang__) || defined(__GNUC__) + __cpuid(0, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]); +#else + __cpuid(CPUInfo, 0); +#endif + +#ifdef __AVX2__ + if (CPUInfo[0] < 7) + return false; +#else + if (CPUInfo[0] < 1) + return false; +#endif + +#if defined(__clang__) || defined(__GNUC__) + __cpuid(1, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]); +#else + __cpuid(CPUInfo, 1); +#endif + +#if defined(__AVX2__) || defined(_XM_AVX2_INTRINSICS_) + // The compiler can emit FMA3 instructions even without explicit intrinsics use + if ((CPUInfo[2] & 0x38081001) != 0x38081001) + return false; // No F16C/AVX/OSXSAVE/SSE4.1/FMA3/SSE3 support +#elif defined(_XM_FMA3_INTRINSICS_) && defined(_XM_F16C_INTRINSICS_) + if ((CPUInfo[2] & 0x38081001) != 0x38081001) + return false; // No F16C/AVX/OSXSAVE/SSE4.1/FMA3/SSE3 support +#elif defined(_XM_FMA3_INTRINSICS_) + if ((CPUInfo[2] & 0x18081001) != 0x18081001) + return false; // No AVX/OSXSAVE/SSE4.1/FMA3/SSE3 support +#elif defined(_XM_F16C_INTRINSICS_) + if ((CPUInfo[2] & 0x38080001) != 0x38080001) + return false; // No F16C/AVX/OSXSAVE/SSE4.1/SSE3 support +#elif defined(__AVX__) || defined(_XM_AVX_INTRINSICS_) + if ((CPUInfo[2] & 0x18080001) != 0x18080001) + return false; // No AVX/OSXSAVE/SSE4.1/SSE3 support +#elif defined(_XM_SSE4_INTRINSICS_) + if ((CPUInfo[2] & 0x80001) != 0x80001) + return false; // No SSE3/SSE4.1 support +#elif defined(_XM_SSE3_INTRINSICS_) + if (!(CPUInfo[2] & 0x1)) + return false; // No SSE3 support +#endif + + // The x64 processor model requires SSE2 support, but no harm in checking + if ((CPUInfo[3] & 0x6000000) != 0x6000000) + return false; // No SSE2/SSE support + +#if defined(__AVX2__) || defined(_XM_AVX2_INTRINSICS_) +#if defined(__clang__) || defined(__GNUC__) + __cpuid_count(7, 0, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]); +#else + __cpuidex(CPUInfo, 7, 0); +#endif + if (!(CPUInfo[1] & 0x20)) + return false; // No AVX2 support +#endif + + return true; +#elif defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + // ARM-NEON support is required for the Windows on ARM platform + return true; +#else + // No intrinsics path 
always supported + return true; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMFresnelTerm +( + FXMVECTOR CosIncidentAngle, + FXMVECTOR RefractionIndex +) noexcept +{ + assert(!XMVector4IsInfinite(CosIncidentAngle)); + + // Result = 0.5f * (g - c)^2 / (g + c)^2 * ((c * (g + c) - 1)^2 / (c * (g - c) + 1)^2 + 1) where + // c = CosIncidentAngle + // g = sqrt(c^2 + RefractionIndex^2 - 1) + +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + + XMVECTOR G = XMVectorMultiplyAdd(RefractionIndex, RefractionIndex, g_XMNegativeOne.v); + G = XMVectorMultiplyAdd(CosIncidentAngle, CosIncidentAngle, G); + G = XMVectorAbs(G); + G = XMVectorSqrt(G); + + XMVECTOR S = XMVectorAdd(G, CosIncidentAngle); + XMVECTOR D = XMVectorSubtract(G, CosIncidentAngle); + + XMVECTOR V0 = XMVectorMultiply(D, D); + XMVECTOR V1 = XMVectorMultiply(S, S); + V1 = XMVectorReciprocal(V1); + V0 = XMVectorMultiply(g_XMOneHalf.v, V0); + V0 = XMVectorMultiply(V0, V1); + + XMVECTOR V2 = XMVectorMultiplyAdd(CosIncidentAngle, S, g_XMNegativeOne.v); + XMVECTOR V3 = XMVectorMultiplyAdd(CosIncidentAngle, D, g_XMOne.v); + V2 = XMVectorMultiply(V2, V2); + V3 = XMVectorMultiply(V3, V3); + V3 = XMVectorReciprocal(V3); + V2 = XMVectorMultiplyAdd(V2, V3, g_XMOne.v); + + XMVECTOR Result = XMVectorMultiply(V0, V2); + + Result = XMVectorSaturate(Result); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + // G = sqrt(abs((RefractionIndex^2-1) + CosIncidentAngle^2)) + XMVECTOR G = _mm_mul_ps(RefractionIndex, RefractionIndex); + XMVECTOR vTemp = _mm_mul_ps(CosIncidentAngle, CosIncidentAngle); + G = _mm_sub_ps(G, g_XMOne); + vTemp = _mm_add_ps(vTemp, G); + // max((0-vTemp),vTemp) == abs(vTemp) + // The abs is needed to deal with refraction and cosine being zero + G = _mm_setzero_ps(); + G = _mm_sub_ps(G, vTemp); + G = _mm_max_ps(G, vTemp); + // Last operation, the sqrt() + G = _mm_sqrt_ps(G); + + // Calc G-C and G+C + XMVECTOR GAddC = _mm_add_ps(G, CosIncidentAngle); + XMVECTOR GSubC = _mm_sub_ps(G, CosIncidentAngle); + // Perform the term (0.5f *(g - c)^2) / (g + c)^2 + XMVECTOR vResult = _mm_mul_ps(GSubC, GSubC); + vTemp = _mm_mul_ps(GAddC, GAddC); + vResult = _mm_mul_ps(vResult, g_XMOneHalf); + vResult = _mm_div_ps(vResult, vTemp); + // Perform the term ((c * (g + c) - 1)^2 / (c * (g - c) + 1)^2 + 1) + GAddC = _mm_mul_ps(GAddC, CosIncidentAngle); + GSubC = _mm_mul_ps(GSubC, CosIncidentAngle); + GAddC = _mm_sub_ps(GAddC, g_XMOne); + GSubC = _mm_add_ps(GSubC, g_XMOne); + GAddC = _mm_mul_ps(GAddC, GAddC); + GSubC = _mm_mul_ps(GSubC, GSubC); + GAddC = _mm_div_ps(GAddC, GSubC); + GAddC = _mm_add_ps(GAddC, g_XMOne); + // Multiply the two term parts + vResult = _mm_mul_ps(vResult, GAddC); + // Clamp to 0.0 - 1.0f + vResult = _mm_max_ps(vResult, g_XMZero); + vResult = _mm_min_ps(vResult, g_XMOne); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XMScalarNearEqual +( + float S1, + float S2, + float Epsilon +) noexcept +{ + float Delta = S1 - S2; + return (fabsf(Delta) <= Epsilon); +} + +//------------------------------------------------------------------------------ +// Modulo the range of the given angle such that -XM_PI <= Angle < XM_PI +inline float XMScalarModAngle(float Angle) noexcept +{ + // Note: The modulo is performed with unsigned math only to work + // around a precision error on numbers that are close to PI + + // Normalize the range from 0.0f to XM_2PI + Angle = 
Angle + XM_PI;
+    // Perform the modulo, unsigned
+    float fTemp = fabsf(Angle);
+    fTemp = fTemp - (XM_2PI * static_cast<float>(static_cast<uint32_t>(fTemp / XM_2PI)));
+    // Restore the number to the range of -XM_PI to XM_PI-epsilon
+    fTemp = fTemp - XM_PI;
+    // If the modulo'd value was negative, restore negation
+    if (Angle < 0.0f)
+    {
+        fTemp = -fTemp;
+    }
+    return fTemp;
+}
+
+//------------------------------------------------------------------------------
+
+inline float XMScalarSin(float Value) noexcept
+{
+    // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder.
+    float quotient = XM_1DIV2PI * Value;
+    if (Value >= 0.0f)
+    {
+        quotient = static_cast<float>(static_cast<int>(quotient + 0.5f));
+    }
+    else
+    {
+        quotient = static_cast<float>(static_cast<int>(quotient - 0.5f));
+    }
+    float y = Value - XM_2PI * quotient;
+
+    // Map y to [-pi/2,pi/2] with sin(y) = sin(Value).
+    if (y > XM_PIDIV2)
+    {
+        y = XM_PI - y;
+    }
+    else if (y < -XM_PIDIV2)
+    {
+        y = -XM_PI - y;
+    }
+
+    // 11-degree minimax approximation
+    float y2 = y * y;
+    return (((((-2.3889859e-08f * y2 + 2.7525562e-06f) * y2 - 0.00019840874f) * y2 + 0.0083333310f) * y2 - 0.16666667f) * y2 + 1.0f) * y;
+}
+
+//------------------------------------------------------------------------------
+
+inline float XMScalarSinEst(float Value) noexcept
+{
+    // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder.
+    float quotient = XM_1DIV2PI * Value;
+    if (Value >= 0.0f)
+    {
+        quotient = static_cast<float>(static_cast<int>(quotient + 0.5f));
+    }
+    else
+    {
+        quotient = static_cast<float>(static_cast<int>(quotient - 0.5f));
+    }
+    float y = Value - XM_2PI * quotient;
+
+    // Map y to [-pi/2,pi/2] with sin(y) = sin(Value).
+    if (y > XM_PIDIV2)
+    {
+        y = XM_PI - y;
+    }
+    else if (y < -XM_PIDIV2)
+    {
+        y = -XM_PI - y;
+    }
+
+    // 7-degree minimax approximation
+    float y2 = y * y;
+    return (((-0.00018524670f * y2 + 0.0083139502f) * y2 - 0.16665852f) * y2 + 1.0f) * y;
+}
+
+//------------------------------------------------------------------------------
+
+inline float XMScalarCos(float Value) noexcept
+{
+    // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder.
+    float quotient = XM_1DIV2PI * Value;
+    if (Value >= 0.0f)
+    {
+        quotient = static_cast<float>(static_cast<int>(quotient + 0.5f));
+    }
+    else
+    {
+        quotient = static_cast<float>(static_cast<int>(quotient - 0.5f));
+    }
+    float y = Value - XM_2PI * quotient;
+
+    // Map y to [-pi/2,pi/2] with cos(y) = sign*cos(x).
+    float sign;
+    if (y > XM_PIDIV2)
+    {
+        y = XM_PI - y;
+        sign = -1.0f;
+    }
+    else if (y < -XM_PIDIV2)
+    {
+        y = -XM_PI - y;
+        sign = -1.0f;
+    }
+    else
+    {
+        sign = +1.0f;
+    }
+
+    // 10-degree minimax approximation
+    float y2 = y * y;
+    float p = ((((-2.6051615e-07f * y2 + 2.4760495e-05f) * y2 - 0.0013888378f) * y2 + 0.041666638f) * y2 - 0.5f) * y2 + 1.0f;
+    return sign * p;
+}
+
+//------------------------------------------------------------------------------
+
+inline float XMScalarCosEst(float Value) noexcept
+{
+    // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder.
+    float quotient = XM_1DIV2PI * Value;
+    if (Value >= 0.0f)
+    {
+        quotient = static_cast<float>(static_cast<int>(quotient + 0.5f));
+    }
+    else
+    {
+        quotient = static_cast<float>(static_cast<int>(quotient - 0.5f));
+    }
+    float y = Value - XM_2PI * quotient;
+
+    // Map y to [-pi/2,pi/2] with cos(y) = sign*cos(x).
+ float sign; + if (y > XM_PIDIV2) + { + y = XM_PI - y; + sign = -1.0f; + } + else if (y < -XM_PIDIV2) + { + y = -XM_PI - y; + sign = -1.0f; + } + else + { + sign = +1.0f; + } + + // 6-degree minimax approximation + float y2 = y * y; + float p = ((-0.0012712436f * y2 + 0.041493919f) * y2 - 0.49992746f) * y2 + 1.0f; + return sign * p; +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline void XMScalarSinCos +( + float* pSin, + float* pCos, + float Value +) noexcept +{ + assert(pSin); + assert(pCos); + + // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder. + float quotient = XM_1DIV2PI * Value; + if (Value >= 0.0f) + { + quotient = static_cast(static_cast(quotient + 0.5f)); + } + else + { + quotient = static_cast(static_cast(quotient - 0.5f)); + } + float y = Value - XM_2PI * quotient; + + // Map y to [-pi/2,pi/2] with sin(y) = sin(Value). + float sign; + if (y > XM_PIDIV2) + { + y = XM_PI - y; + sign = -1.0f; + } + else if (y < -XM_PIDIV2) + { + y = -XM_PI - y; + sign = -1.0f; + } + else + { + sign = +1.0f; + } + + float y2 = y * y; + + // 11-degree minimax approximation + *pSin = (((((-2.3889859e-08f * y2 + 2.7525562e-06f) * y2 - 0.00019840874f) * y2 + 0.0083333310f) * y2 - 0.16666667f) * y2 + 1.0f) * y; + + // 10-degree minimax approximation + float p = ((((-2.6051615e-07f * y2 + 2.4760495e-05f) * y2 - 0.0013888378f) * y2 + 0.041666638f) * y2 - 0.5f) * y2 + 1.0f; + *pCos = sign * p; +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline void XMScalarSinCosEst +( + float* pSin, + float* pCos, + float Value +) noexcept +{ + assert(pSin); + assert(pCos); + + // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder. + float quotient = XM_1DIV2PI * Value; + if (Value >= 0.0f) + { + quotient = static_cast(static_cast(quotient + 0.5f)); + } + else + { + quotient = static_cast(static_cast(quotient - 0.5f)); + } + float y = Value - XM_2PI * quotient; + + // Map y to [-pi/2,pi/2] with sin(y) = sin(Value). + float sign; + if (y > XM_PIDIV2) + { + y = XM_PI - y; + sign = -1.0f; + } + else if (y < -XM_PIDIV2) + { + y = -XM_PI - y; + sign = -1.0f; + } + else + { + sign = +1.0f; + } + + float y2 = y * y; + + // 7-degree minimax approximation + *pSin = (((-0.00018524670f * y2 + 0.0083139502f) * y2 - 0.16665852f) * y2 + 1.0f) * y; + + // 6-degree minimax approximation + float p = ((-0.0012712436f * y2 + 0.041493919f) * y2 - 0.49992746f) * y2 + 1.0f; + *pCos = sign * p; +} + +//------------------------------------------------------------------------------ + +inline float XMScalarASin(float Value) noexcept +{ + // Clamp input to [-1,1]. + bool nonnegative = (Value >= 0.0f); + float x = fabsf(Value); + float omx = 1.0f - x; + if (omx < 0.0f) + { + omx = 0.0f; + } + float root = sqrtf(omx); + + // 7-degree minimax approximation + float result = ((((((-0.0012624911f * x + 0.0066700901f) * x - 0.0170881256f) * x + 0.0308918810f) * x - 0.0501743046f) * x + 0.0889789874f) * x - 0.2145988016f) * x + 1.5707963050f; + result *= root; // acos(|x|) + + // acos(x) = pi - acos(-x) when x < 0, asin(x) = pi/2 - acos(x) + return (nonnegative ? XM_PIDIV2 - result : result - XM_PIDIV2); +} + +//------------------------------------------------------------------------------ + +inline float XMScalarASinEst(float Value) noexcept +{ + // Clamp input to [-1,1]. 
+ bool nonnegative = (Value >= 0.0f); + float x = fabsf(Value); + float omx = 1.0f - x; + if (omx < 0.0f) + { + omx = 0.0f; + } + float root = sqrtf(omx); + + // 3-degree minimax approximation + float result = ((-0.0187293f * x + 0.0742610f) * x - 0.2121144f) * x + 1.5707288f; + result *= root; // acos(|x|) + + // acos(x) = pi - acos(-x) when x < 0, asin(x) = pi/2 - acos(x) + return (nonnegative ? XM_PIDIV2 - result : result - XM_PIDIV2); +} + +//------------------------------------------------------------------------------ + +inline float XMScalarACos(float Value) noexcept +{ + // Clamp input to [-1,1]. + bool nonnegative = (Value >= 0.0f); + float x = fabsf(Value); + float omx = 1.0f - x; + if (omx < 0.0f) + { + omx = 0.0f; + } + float root = sqrtf(omx); + + // 7-degree minimax approximation + float result = ((((((-0.0012624911f * x + 0.0066700901f) * x - 0.0170881256f) * x + 0.0308918810f) * x - 0.0501743046f) * x + 0.0889789874f) * x - 0.2145988016f) * x + 1.5707963050f; + result *= root; + + // acos(x) = pi - acos(-x) when x < 0 + return (nonnegative ? result : XM_PI - result); +} + +//------------------------------------------------------------------------------ + +inline float XMScalarACosEst(float Value) noexcept +{ + // Clamp input to [-1,1]. + bool nonnegative = (Value >= 0.0f); + float x = fabsf(Value); + float omx = 1.0f - x; + if (omx < 0.0f) + { + omx = 0.0f; + } + float root = sqrtf(omx); + + // 3-degree minimax approximation + float result = ((-0.0187293f * x + 0.0742610f) * x - 0.2121144f) * x + 1.5707288f; + result *= root; + + // acos(x) = pi - acos(-x) when x < 0 + return (nonnegative ? result : XM_PI - result); +} + diff --git a/src/thirdparty/DirectXMath-dec2022/Inc/DirectXMathVector.inl b/src/thirdparty/DirectXMath-dec2022/Inc/DirectXMathVector.inl new file mode 100644 index 000000000..e3db56a6c --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/Inc/DirectXMathVector.inl @@ -0,0 +1,14819 @@ +//------------------------------------------------------------------------------------- +// DirectXMathVector.inl -- SIMD C++ Math library +// +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
+// +// http://go.microsoft.com/fwlink/?LinkID=615560 +//------------------------------------------------------------------------------------- + +#pragma once + +#if defined(_XM_NO_INTRINSICS_) +#define XMISNAN(x) isnan(x) +#define XMISINF(x) isinf(x) +#endif + +#if defined(_XM_SSE_INTRINSICS_) + +#define XM3UNPACK3INTO4(l1, l2, l3) \ + XMVECTOR V3 = _mm_shuffle_ps(l2, l3, _MM_SHUFFLE(0, 0, 3, 2));\ + XMVECTOR V2 = _mm_shuffle_ps(l2, l1, _MM_SHUFFLE(3, 3, 1, 0));\ + V2 = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 0, 2));\ + XMVECTOR V4 = _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(L3), 32 / 8)) + +#define XM3PACK4INTO3(v2x) \ + v2x = _mm_shuffle_ps(V2, V3, _MM_SHUFFLE(1, 0, 2, 1));\ + V2 = _mm_shuffle_ps(V2, V1, _MM_SHUFFLE(2, 2, 0, 0));\ + V1 = _mm_shuffle_ps(V1, V2, _MM_SHUFFLE(0, 2, 1, 0));\ + V3 = _mm_shuffle_ps(V3, V4, _MM_SHUFFLE(0, 0, 2, 2));\ + V3 = _mm_shuffle_ps(V3, V4, _MM_SHUFFLE(2, 1, 2, 0)) + +#endif + +/**************************************************************************** + * + * General Vector + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ + // Assignment operations + //------------------------------------------------------------------------------ + + //------------------------------------------------------------------------------ + // Return a vector with all elements equaling zero +inline XMVECTOR XM_CALLCONV XMVectorZero() noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { { { 0.0f, 0.0f, 0.0f, 0.0f } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vdupq_n_f32(0); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_setzero_ps(); +#endif +} + +//------------------------------------------------------------------------------ +// Initialize a vector with four floating point values +inline XMVECTOR XM_CALLCONV XMVectorSet +( + float x, + float y, + float z, + float w +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { { { x, y, z, w } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t V0 = vcreate_f32( + static_cast(*reinterpret_cast(&x)) + | (static_cast(*reinterpret_cast(&y)) << 32)); + float32x2_t V1 = vcreate_f32( + static_cast(*reinterpret_cast(&z)) + | (static_cast(*reinterpret_cast(&w)) << 32)); + return vcombine_f32(V0, V1); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_set_ps(w, z, y, x); +#endif +} + +//------------------------------------------------------------------------------ +// Initialize a vector with four integer values +inline XMVECTOR XM_CALLCONV XMVectorSetInt +( + uint32_t x, + uint32_t y, + uint32_t z, + uint32_t w +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORU32 vResult = { { { x, y, z, w } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t V0 = vcreate_u32(static_cast(x) | (static_cast(y) << 32)); + uint32x2_t V1 = vcreate_u32(static_cast(z) | (static_cast(w) << 32)); + return vreinterpretq_f32_u32(vcombine_u32(V0, V1)); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_set_epi32(static_cast(w), static_cast(z), static_cast(y), static_cast(x)); + return _mm_castsi128_ps(V); +#endif +} + +//------------------------------------------------------------------------------ +// Initialize a vector with a replicated floating point value +inline XMVECTOR XM_CALLCONV XMVectorReplicate(float Value) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult; + vResult.f[0] = + vResult.f[1] 
= + vResult.f[2] = + vResult.f[3] = Value; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vdupq_n_f32(Value); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_set_ps1(Value); +#endif +} + +//------------------------------------------------------------------------------ +// Initialize a vector with a replicated floating point value passed by pointer +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMVectorReplicatePtr(const float* pValue) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + float Value = pValue[0]; + XMVECTORF32 vResult; + vResult.f[0] = + vResult.f[1] = + vResult.f[2] = + vResult.f[3] = Value; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vld1q_dup_f32(pValue); +#elif defined(_XM_AVX_INTRINSICS_) + return _mm_broadcast_ss(pValue); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_load_ps1(pValue); +#endif +} + +//------------------------------------------------------------------------------ +// Initialize a vector with a replicated integer value +inline XMVECTOR XM_CALLCONV XMVectorReplicateInt(uint32_t Value) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORU32 vResult; + vResult.u[0] = + vResult.u[1] = + vResult.u[2] = + vResult.u[3] = Value; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vreinterpretq_f32_u32(vdupq_n_u32(Value)); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_set1_epi32(static_cast(Value)); + return _mm_castsi128_ps(vTemp); +#endif +} + +//------------------------------------------------------------------------------ +// Initialize a vector with a replicated integer value passed by pointer +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMVectorReplicateIntPtr(const uint32_t* pValue) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + uint32_t Value = pValue[0]; + XMVECTORU32 vResult; + vResult.u[0] = + vResult.u[1] = + vResult.u[2] = + vResult.u[3] = Value; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vreinterpretq_f32_u32(vld1q_dup_u32(pValue)); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_load_ps1(reinterpret_cast(pValue)); +#endif +} + +//------------------------------------------------------------------------------ +// Initialize a vector with all bits set (true mask) +inline XMVECTOR XM_CALLCONV XMVectorTrueInt() noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORU32 vResult = { { { 0xFFFFFFFFU, 0xFFFFFFFFU, 0xFFFFFFFFU, 0xFFFFFFFFU } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vreinterpretq_f32_s32(vdupq_n_s32(-1)); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_set1_epi32(-1); + return _mm_castsi128_ps(V); +#endif +} + +//------------------------------------------------------------------------------ +// Initialize a vector with all bits clear (false mask) +inline XMVECTOR XM_CALLCONV XMVectorFalseInt() noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { { { 0.0f, 0.0f, 0.0f, 0.0f } } }; + return vResult; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vreinterpretq_f32_u32(vdupq_n_u32(0)); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_setzero_ps(); +#endif +} + +//------------------------------------------------------------------------------ +// Replicate the x component of the vector +inline XMVECTOR XM_CALLCONV XMVectorSplatX(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult; + vResult.f[0] = + vResult.f[1] = + vResult.f[2] = + vResult.f[3] = V.vector4_f32[0]; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return 
vdupq_lane_f32(vget_low_f32(V), 0); +#elif defined(_XM_AVX2_INTRINSICS_) && defined(_XM_FAVOR_INTEL_) + return _mm_broadcastss_ps(V); +#elif defined(_XM_SSE_INTRINSICS_) + return XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); +#endif +} + +//------------------------------------------------------------------------------ +// Replicate the y component of the vector +inline XMVECTOR XM_CALLCONV XMVectorSplatY(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult; + vResult.f[0] = + vResult.f[1] = + vResult.f[2] = + vResult.f[3] = V.vector4_f32[1]; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vdupq_lane_f32(vget_low_f32(V), 1); +#elif defined(_XM_SSE_INTRINSICS_) + return XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); +#endif +} + +//------------------------------------------------------------------------------ +// Replicate the z component of the vector +inline XMVECTOR XM_CALLCONV XMVectorSplatZ(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult; + vResult.f[0] = + vResult.f[1] = + vResult.f[2] = + vResult.f[3] = V.vector4_f32[2]; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vdupq_lane_f32(vget_high_f32(V), 0); +#elif defined(_XM_SSE_INTRINSICS_) + return XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); +#endif +} + +//------------------------------------------------------------------------------ +// Replicate the w component of the vector +inline XMVECTOR XM_CALLCONV XMVectorSplatW(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult; + vResult.f[0] = + vResult.f[1] = + vResult.f[2] = + vResult.f[3] = V.vector4_f32[3]; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vdupq_lane_f32(vget_high_f32(V), 1); +#elif defined(_XM_SSE_INTRINSICS_) + return XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3)); +#endif +} + +//------------------------------------------------------------------------------ +// Return a vector of 1.0f,1.0f,1.0f,1.0f +inline XMVECTOR XM_CALLCONV XMVectorSplatOne() noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult; + vResult.f[0] = + vResult.f[1] = + vResult.f[2] = + vResult.f[3] = 1.0f; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vdupq_n_f32(1.0f); +#elif defined(_XM_SSE_INTRINSICS_) + return g_XMOne; +#endif +} + +//------------------------------------------------------------------------------ +// Return a vector of INF,INF,INF,INF +inline XMVECTOR XM_CALLCONV XMVectorSplatInfinity() noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORU32 vResult; + vResult.u[0] = + vResult.u[1] = + vResult.u[2] = + vResult.u[3] = 0x7F800000; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vreinterpretq_f32_u32(vdupq_n_u32(0x7F800000)); +#elif defined(_XM_SSE_INTRINSICS_) + return g_XMInfinity; +#endif +} + +//------------------------------------------------------------------------------ +// Return a vector of Q_NAN,Q_NAN,Q_NAN,Q_NAN +inline XMVECTOR XM_CALLCONV XMVectorSplatQNaN() noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORU32 vResult; + vResult.u[0] = + vResult.u[1] = + vResult.u[2] = + vResult.u[3] = 0x7FC00000; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vreinterpretq_f32_u32(vdupq_n_u32(0x7FC00000)); +#elif defined(_XM_SSE_INTRINSICS_) + return g_XMQNaN; +#endif +} + +//------------------------------------------------------------------------------ +// Return a vector of 1.192092896e-7f,1.192092896e-7f,1.192092896e-7f,1.192092896e-7f +inline XMVECTOR 
XM_CALLCONV XMVectorSplatEpsilon() noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORU32 vResult; + vResult.u[0] = + vResult.u[1] = + vResult.u[2] = + vResult.u[3] = 0x34000000; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vreinterpretq_f32_u32(vdupq_n_u32(0x34000000)); +#elif defined(_XM_SSE_INTRINSICS_) + return g_XMEpsilon; +#endif +} + +//------------------------------------------------------------------------------ +// Return a vector of -0.0f (0x80000000),-0.0f,-0.0f,-0.0f +inline XMVECTOR XM_CALLCONV XMVectorSplatSignMask() noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORU32 vResult; + vResult.u[0] = + vResult.u[1] = + vResult.u[2] = + vResult.u[3] = 0x80000000U; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vreinterpretq_f32_u32(vdupq_n_u32(0x80000000U)); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_set1_epi32(static_cast(0x80000000)); + return _mm_castsi128_ps(V); +#endif +} + +//------------------------------------------------------------------------------ +// Return a floating point value via an index. This is not a recommended +// function to use due to performance loss. +inline float XM_CALLCONV XMVectorGetByIndex(FXMVECTOR V, size_t i) noexcept +{ + assert(i < 4); + _Analysis_assume_(i < 4); +#if defined(_XM_NO_INTRINSICS_) + return V.vector4_f32[i]; +#else + XMVECTORF32 U; + U.v = V; + return U.f[i]; +#endif +} + +//------------------------------------------------------------------------------ +// Return the X component in an FPU register. +inline float XM_CALLCONV XMVectorGetX(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return V.vector4_f32[0]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vgetq_lane_f32(V, 0); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_cvtss_f32(V); +#endif +} + +// Return the Y component in an FPU register. +inline float XM_CALLCONV XMVectorGetY(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return V.vector4_f32[1]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vgetq_lane_f32(V, 1); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); + return _mm_cvtss_f32(vTemp); +#endif +} + +// Return the Z component in an FPU register. +inline float XM_CALLCONV XMVectorGetZ(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return V.vector4_f32[2]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vgetq_lane_f32(V, 2); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); + return _mm_cvtss_f32(vTemp); +#endif +} + +// Return the W component in an FPU register. +inline float XM_CALLCONV XMVectorGetW(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return V.vector4_f32[3]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vgetq_lane_f32(V, 3); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3)); + return _mm_cvtss_f32(vTemp); +#endif +} + +//------------------------------------------------------------------------------ + +// Store a component indexed by i into a 32 bit float location in memory. 
+_Use_decl_annotations_ +inline void XM_CALLCONV XMVectorGetByIndexPtr(float* f, FXMVECTOR V, size_t i) noexcept +{ + assert(f != nullptr); + assert(i < 4); + _Analysis_assume_(i < 4); +#if defined(_XM_NO_INTRINSICS_) + *f = V.vector4_f32[i]; +#else + XMVECTORF32 U; + U.v = V; + *f = U.f[i]; +#endif +} + +//------------------------------------------------------------------------------ + +// Store the X component into a 32 bit float location in memory. +_Use_decl_annotations_ +inline void XM_CALLCONV XMVectorGetXPtr(float* x, FXMVECTOR V) noexcept +{ + assert(x != nullptr); +#if defined(_XM_NO_INTRINSICS_) + *x = V.vector4_f32[0]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + vst1q_lane_f32(x, V, 0); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_store_ss(x, V); +#endif +} + +// Store the Y component into a 32 bit float location in memory. +_Use_decl_annotations_ +inline void XM_CALLCONV XMVectorGetYPtr(float* y, FXMVECTOR V) noexcept +{ + assert(y != nullptr); +#if defined(_XM_NO_INTRINSICS_) + *y = V.vector4_f32[1]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + vst1q_lane_f32(y, V, 1); +#elif defined(_XM_SSE4_INTRINSICS_) + * (reinterpret_cast(y)) = _mm_extract_ps(V, 1); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); + _mm_store_ss(y, vResult); +#endif +} + +// Store the Z component into a 32 bit float location in memory. +_Use_decl_annotations_ +inline void XM_CALLCONV XMVectorGetZPtr(float* z, FXMVECTOR V) noexcept +{ + assert(z != nullptr); +#if defined(_XM_NO_INTRINSICS_) + *z = V.vector4_f32[2]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + vst1q_lane_f32(z, V, 2); +#elif defined(_XM_SSE4_INTRINSICS_) + * (reinterpret_cast(z)) = _mm_extract_ps(V, 2); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); + _mm_store_ss(z, vResult); +#endif +} + +// Store the W component into a 32 bit float location in memory. +_Use_decl_annotations_ +inline void XM_CALLCONV XMVectorGetWPtr(float* w, FXMVECTOR V) noexcept +{ + assert(w != nullptr); +#if defined(_XM_NO_INTRINSICS_) + *w = V.vector4_f32[3]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + vst1q_lane_f32(w, V, 3); +#elif defined(_XM_SSE4_INTRINSICS_) + * (reinterpret_cast(w)) = _mm_extract_ps(V, 3); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3)); + _mm_store_ss(w, vResult); +#endif +} + +//------------------------------------------------------------------------------ + +// Return an integer value via an index. This is not a recommended +// function to use due to performance loss. +inline uint32_t XM_CALLCONV XMVectorGetIntByIndex(FXMVECTOR V, size_t i) noexcept +{ + assert(i < 4); + _Analysis_assume_(i < 4); +#if defined(_XM_NO_INTRINSICS_) + return V.vector4_u32[i]; +#else + XMVECTORU32 U; + U.v = V; + return U.u[i]; +#endif +} + +//------------------------------------------------------------------------------ + +// Return the X component in an integer register. +inline uint32_t XM_CALLCONV XMVectorGetIntX(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return V.vector4_u32[0]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vgetq_lane_u32(vreinterpretq_u32_f32(V), 0); +#elif defined(_XM_SSE_INTRINSICS_) + return static_cast(_mm_cvtsi128_si32(_mm_castps_si128(V))); +#endif +} + +// Return the Y component in an integer register. 
+inline uint32_t XM_CALLCONV XMVectorGetIntY(FXMVECTOR V) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    return V.vector4_u32[1];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vgetq_lane_u32(vreinterpretq_u32_f32(V), 1);
+#elif defined(_XM_SSE4_INTRINSICS_)
+    __m128i V1 = _mm_castps_si128(V);
+    return static_cast<uint32_t>(_mm_extract_epi32(V1, 1));
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128i vResulti = _mm_shuffle_epi32(_mm_castps_si128(V), _MM_SHUFFLE(1, 1, 1, 1));
+    return static_cast<uint32_t>(_mm_cvtsi128_si32(vResulti));
+#endif
+}
+
+// Return the Z component in an integer register.
+inline uint32_t XM_CALLCONV XMVectorGetIntZ(FXMVECTOR V) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    return V.vector4_u32[2];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vgetq_lane_u32(vreinterpretq_u32_f32(V), 2);
+#elif defined(_XM_SSE4_INTRINSICS_)
+    __m128i V1 = _mm_castps_si128(V);
+    return static_cast<uint32_t>(_mm_extract_epi32(V1, 2));
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128i vResulti = _mm_shuffle_epi32(_mm_castps_si128(V), _MM_SHUFFLE(2, 2, 2, 2));
+    return static_cast<uint32_t>(_mm_cvtsi128_si32(vResulti));
+#endif
+}
+
+// Return the W component in an integer register.
+inline uint32_t XM_CALLCONV XMVectorGetIntW(FXMVECTOR V) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    return V.vector4_u32[3];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vgetq_lane_u32(vreinterpretq_u32_f32(V), 3);
+#elif defined(_XM_SSE4_INTRINSICS_)
+    __m128i V1 = _mm_castps_si128(V);
+    return static_cast<uint32_t>(_mm_extract_epi32(V1, 3));
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128i vResulti = _mm_shuffle_epi32(_mm_castps_si128(V), _MM_SHUFFLE(3, 3, 3, 3));
+    return static_cast<uint32_t>(_mm_cvtsi128_si32(vResulti));
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+// Store a component indexed by i into a 32 bit integer location in memory.
+_Use_decl_annotations_
+inline void XM_CALLCONV XMVectorGetIntByIndexPtr(uint32_t* x, FXMVECTOR V, size_t i) noexcept
+{
+    assert(x != nullptr);
+    assert(i < 4);
+    _Analysis_assume_(i < 4);
+#if defined(_XM_NO_INTRINSICS_)
+    *x = V.vector4_u32[i];
+#else
+    XMVECTORU32 U;
+    U.v = V;
+    *x = U.u[i];
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+// Store the X component into a 32 bit integer location in memory.
+_Use_decl_annotations_
+inline void XM_CALLCONV XMVectorGetIntXPtr(uint32_t* x, FXMVECTOR V) noexcept
+{
+    assert(x != nullptr);
+#if defined(_XM_NO_INTRINSICS_)
+    *x = V.vector4_u32[0];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    vst1q_lane_u32(x, *reinterpret_cast<const uint32x4_t*>(&V), 0);
+#elif defined(_XM_SSE_INTRINSICS_)
+    _mm_store_ss(reinterpret_cast<float*>(x), V);
+#endif
+}
+
+// Store the Y component into a 32 bit integer location in memory.
+_Use_decl_annotations_
+inline void XM_CALLCONV XMVectorGetIntYPtr(uint32_t* y, FXMVECTOR V) noexcept
+{
+    assert(y != nullptr);
+#if defined(_XM_NO_INTRINSICS_)
+    *y = V.vector4_u32[1];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    vst1q_lane_u32(y, *reinterpret_cast<const uint32x4_t*>(&V), 1);
+#elif defined(_XM_SSE4_INTRINSICS_)
+    __m128i V1 = _mm_castps_si128(V);
+    *y = static_cast<uint32_t>(_mm_extract_epi32(V1, 1));
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
+    _mm_store_ss(reinterpret_cast<float*>(y), vResult);
+#endif
+}
+
+// Store the Z component into a 32 bit integer location in memory.
+_Use_decl_annotations_ +inline void XM_CALLCONV XMVectorGetIntZPtr(uint32_t* z, FXMVECTOR V) noexcept +{ + assert(z != nullptr); +#if defined(_XM_NO_INTRINSICS_) + *z = V.vector4_u32[2]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + vst1q_lane_u32(z, *reinterpret_cast(&V), 2); +#elif defined(_XM_SSE4_INTRINSICS_) + __m128i V1 = _mm_castps_si128(V); + *z = static_cast(_mm_extract_epi32(V1, 2)); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); + _mm_store_ss(reinterpret_cast(z), vResult); +#endif +} + +// Store the W component into a 32 bit integer location in memory. +_Use_decl_annotations_ +inline void XM_CALLCONV XMVectorGetIntWPtr(uint32_t* w, FXMVECTOR V) noexcept +{ + assert(w != nullptr); +#if defined(_XM_NO_INTRINSICS_) + *w = V.vector4_u32[3]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + vst1q_lane_u32(w, *reinterpret_cast(&V), 3); +#elif defined(_XM_SSE4_INTRINSICS_) + __m128i V1 = _mm_castps_si128(V); + *w = static_cast(_mm_extract_epi32(V1, 3)); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3)); + _mm_store_ss(reinterpret_cast(w), vResult); +#endif +} + +//------------------------------------------------------------------------------ + +// Set a single indexed floating point component +inline XMVECTOR XM_CALLCONV XMVectorSetByIndex(FXMVECTOR V, float f, size_t i) noexcept +{ + assert(i < 4); + _Analysis_assume_(i < 4); + XMVECTORF32 U; + U.v = V; + U.f[i] = f; + return U.v; +} + +//------------------------------------------------------------------------------ + +// Sets the X component of a vector to a passed floating point value +inline XMVECTOR XM_CALLCONV XMVectorSetX(FXMVECTOR V, float x) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 U = { { { + x, + V.vector4_f32[1], + V.vector4_f32[2], + V.vector4_f32[3] + } } }; + return U.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vsetq_lane_f32(x, V, 0); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = _mm_set_ss(x); + vResult = _mm_move_ss(V, vResult); + return vResult; +#endif +} + +// Sets the Y component of a vector to a passed floating point value +inline XMVECTOR XM_CALLCONV XMVectorSetY(FXMVECTOR V, float y) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 U = { { { + V.vector4_f32[0], + y, + V.vector4_f32[2], + V.vector4_f32[3] + } } }; + return U.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vsetq_lane_f32(y, V, 1); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vResult = _mm_set_ss(y); + vResult = _mm_insert_ps(V, vResult, 0x10); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + // Swap y and x + XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 2, 0, 1)); + // Convert input to vector + XMVECTOR vTemp = _mm_set_ss(y); + // Replace the x component + vResult = _mm_move_ss(vResult, vTemp); + // Swap y and x again + vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(3, 2, 0, 1)); + return vResult; +#endif +} +// Sets the Z component of a vector to a passed floating point value +inline XMVECTOR XM_CALLCONV XMVectorSetZ(FXMVECTOR V, float z) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 U = { { { + V.vector4_f32[0], + V.vector4_f32[1], + z, + V.vector4_f32[3] + } } }; + return U.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vsetq_lane_f32(z, V, 2); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vResult = _mm_set_ss(z); + vResult = _mm_insert_ps(V, vResult, 0x20); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + // Swap z and x + XMVECTOR vResult = 
XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 0, 1, 2)); + // Convert input to vector + XMVECTOR vTemp = _mm_set_ss(z); + // Replace the x component + vResult = _mm_move_ss(vResult, vTemp); + // Swap z and x again + vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(3, 0, 1, 2)); + return vResult; +#endif +} + +// Sets the W component of a vector to a passed floating point value +inline XMVECTOR XM_CALLCONV XMVectorSetW(FXMVECTOR V, float w) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 U = { { { + V.vector4_f32[0], + V.vector4_f32[1], + V.vector4_f32[2], + w + } } }; + return U.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vsetq_lane_f32(w, V, 3); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vResult = _mm_set_ss(w); + vResult = _mm_insert_ps(V, vResult, 0x30); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + // Swap w and x + XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 2, 1, 3)); + // Convert input to vector + XMVECTOR vTemp = _mm_set_ss(w); + // Replace the x component + vResult = _mm_move_ss(vResult, vTemp); + // Swap w and x again + vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(0, 2, 1, 3)); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +// Sets a component of a vector to a floating point value passed by pointer +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMVectorSetByIndexPtr(FXMVECTOR V, const float* f, size_t i) noexcept +{ + assert(f != nullptr); + assert(i < 4); + _Analysis_assume_(i < 4); + XMVECTORF32 U; + U.v = V; + U.f[i] = *f; + return U.v; +} + +//------------------------------------------------------------------------------ + +// Sets the X component of a vector to a floating point value passed by pointer +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMVectorSetXPtr(FXMVECTOR V, const float* x) noexcept +{ + assert(x != nullptr); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 U = { { { + *x, + V.vector4_f32[1], + V.vector4_f32[2], + V.vector4_f32[3] + } } }; + return U.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vld1q_lane_f32(x, V, 0); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = _mm_load_ss(x); + vResult = _mm_move_ss(V, vResult); + return vResult; +#endif +} + +// Sets the Y component of a vector to a floating point value passed by pointer +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMVectorSetYPtr(FXMVECTOR V, const float* y) noexcept +{ + assert(y != nullptr); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 U = { { { + V.vector4_f32[0], + *y, + V.vector4_f32[2], + V.vector4_f32[3] + } } }; + return U.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vld1q_lane_f32(y, V, 1); +#elif defined(_XM_SSE_INTRINSICS_) + // Swap y and x + XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 2, 0, 1)); + // Convert input to vector + XMVECTOR vTemp = _mm_load_ss(y); + // Replace the x component + vResult = _mm_move_ss(vResult, vTemp); + // Swap y and x again + vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(3, 2, 0, 1)); + return vResult; +#endif +} + +// Sets the Z component of a vector to a floating point value passed by pointer +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMVectorSetZPtr(FXMVECTOR V, const float* z) noexcept +{ + assert(z != nullptr); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 U = { { { + V.vector4_f32[0], + V.vector4_f32[1], + *z, + V.vector4_f32[3] + } } }; + return U.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vld1q_lane_f32(z, V, 2); +#elif defined(_XM_SSE_INTRINSICS_) + // Swap z and x + XMVECTOR vResult = 
XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 0, 1, 2)); + // Convert input to vector + XMVECTOR vTemp = _mm_load_ss(z); + // Replace the x component + vResult = _mm_move_ss(vResult, vTemp); + // Swap z and x again + vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(3, 0, 1, 2)); + return vResult; +#endif +} + +// Sets the W component of a vector to a floating point value passed by pointer +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMVectorSetWPtr(FXMVECTOR V, const float* w) noexcept +{ + assert(w != nullptr); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 U = { { { + V.vector4_f32[0], + V.vector4_f32[1], + V.vector4_f32[2], + *w + } } }; + return U.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vld1q_lane_f32(w, V, 3); +#elif defined(_XM_SSE_INTRINSICS_) + // Swap w and x + XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 2, 1, 3)); + // Convert input to vector + XMVECTOR vTemp = _mm_load_ss(w); + // Replace the x component + vResult = _mm_move_ss(vResult, vTemp); + // Swap w and x again + vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(0, 2, 1, 3)); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +// Sets a component of a vector to an integer passed by value +inline XMVECTOR XM_CALLCONV XMVectorSetIntByIndex(FXMVECTOR V, uint32_t x, size_t i) noexcept +{ + assert(i < 4); + _Analysis_assume_(i < 4); + XMVECTORU32 tmp; + tmp.v = V; + tmp.u[i] = x; + return tmp; +} + +//------------------------------------------------------------------------------ + +// Sets the X component of a vector to an integer passed by value +inline XMVECTOR XM_CALLCONV XMVectorSetIntX(FXMVECTOR V, uint32_t x) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORU32 U = { { { + x, + V.vector4_u32[1], + V.vector4_u32[2], + V.vector4_u32[3] + } } }; + return U.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vreinterpretq_f32_u32(vsetq_lane_u32(x, vreinterpretq_u32_f32(V), 0)); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_cvtsi32_si128(static_cast(x)); + XMVECTOR vResult = _mm_move_ss(V, _mm_castsi128_ps(vTemp)); + return vResult; +#endif +} + +// Sets the Y component of a vector to an integer passed by value +inline XMVECTOR XM_CALLCONV XMVectorSetIntY(FXMVECTOR V, uint32_t y) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORU32 U = { { { + V.vector4_u32[0], + y, + V.vector4_u32[2], + V.vector4_u32[3] + } } }; + return U.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vreinterpretq_f32_u32(vsetq_lane_u32(y, vreinterpretq_u32_f32(V), 1)); +#elif defined(_XM_SSE4_INTRINSICS_) + __m128i vResult = _mm_castps_si128(V); + vResult = _mm_insert_epi32(vResult, static_cast(y), 1); + return _mm_castsi128_ps(vResult); +#elif defined(_XM_SSE_INTRINSICS_) + // Swap y and x + XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 2, 0, 1)); + // Convert input to vector + __m128i vTemp = _mm_cvtsi32_si128(static_cast(y)); + // Replace the x component + vResult = _mm_move_ss(vResult, _mm_castsi128_ps(vTemp)); + // Swap y and x again + vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(3, 2, 0, 1)); + return vResult; +#endif +} + +// Sets the Z component of a vector to an integer passed by value +inline XMVECTOR XM_CALLCONV XMVectorSetIntZ(FXMVECTOR V, uint32_t z) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORU32 U = { { { + V.vector4_u32[0], + V.vector4_u32[1], + z, + V.vector4_u32[3] + } } }; + return U.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vreinterpretq_f32_u32(vsetq_lane_u32(z, vreinterpretq_u32_f32(V), 2)); +#elif 
defined(_XM_SSE4_INTRINSICS_) + __m128i vResult = _mm_castps_si128(V); + vResult = _mm_insert_epi32(vResult, static_cast(z), 2); + return _mm_castsi128_ps(vResult); +#elif defined(_XM_SSE_INTRINSICS_) + // Swap z and x + XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 0, 1, 2)); + // Convert input to vector + __m128i vTemp = _mm_cvtsi32_si128(static_cast(z)); + // Replace the x component + vResult = _mm_move_ss(vResult, _mm_castsi128_ps(vTemp)); + // Swap z and x again + vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(3, 0, 1, 2)); + return vResult; +#endif +} + +// Sets the W component of a vector to an integer passed by value +inline XMVECTOR XM_CALLCONV XMVectorSetIntW(FXMVECTOR V, uint32_t w) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORU32 U = { { { + V.vector4_u32[0], + V.vector4_u32[1], + V.vector4_u32[2], + w + } } }; + return U.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vreinterpretq_f32_u32(vsetq_lane_u32(w, vreinterpretq_u32_f32(V), 3)); +#elif defined(_XM_SSE4_INTRINSICS_) + __m128i vResult = _mm_castps_si128(V); + vResult = _mm_insert_epi32(vResult, static_cast(w), 3); + return _mm_castsi128_ps(vResult); +#elif defined(_XM_SSE_INTRINSICS_) + // Swap w and x + XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 2, 1, 3)); + // Convert input to vector + __m128i vTemp = _mm_cvtsi32_si128(static_cast(w)); + // Replace the x component + vResult = _mm_move_ss(vResult, _mm_castsi128_ps(vTemp)); + // Swap w and x again + vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(0, 2, 1, 3)); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +// Sets a component of a vector to an integer value passed by pointer +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMVectorSetIntByIndexPtr(FXMVECTOR V, const uint32_t* x, size_t i) noexcept +{ + assert(x != nullptr); + assert(i < 4); + _Analysis_assume_(i < 4); + XMVECTORU32 tmp; + tmp.v = V; + tmp.u[i] = *x; + return tmp; +} + +//------------------------------------------------------------------------------ + +// Sets the X component of a vector to an integer value passed by pointer +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMVectorSetIntXPtr(FXMVECTOR V, const uint32_t* x) noexcept +{ + assert(x != nullptr); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORU32 U = { { { + *x, + V.vector4_u32[1], + V.vector4_u32[2], + V.vector4_u32[3] + } } }; + return U.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vreinterpretq_f32_u32(vld1q_lane_u32(x, *reinterpret_cast(&V), 0)); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_load_ss(reinterpret_cast(x)); + XMVECTOR vResult = _mm_move_ss(V, vTemp); + return vResult; +#endif +} + +// Sets the Y component of a vector to an integer value passed by pointer +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMVectorSetIntYPtr(FXMVECTOR V, const uint32_t* y) noexcept +{ + assert(y != nullptr); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORU32 U = { { { + V.vector4_u32[0], + *y, + V.vector4_u32[2], + V.vector4_u32[3] + } } }; + return U.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vreinterpretq_f32_u32(vld1q_lane_u32(y, *reinterpret_cast(&V), 1)); +#elif defined(_XM_SSE_INTRINSICS_) + // Swap y and x + XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 2, 0, 1)); + // Convert input to vector + XMVECTOR vTemp = _mm_load_ss(reinterpret_cast(y)); + // Replace the x component + vResult = _mm_move_ss(vResult, vTemp); + // Swap y and x again + vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(3, 2, 0, 1)); + return 
vResult;
+#endif
+}
+
+// Sets the Z component of a vector to an integer value passed by pointer
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMVectorSetIntZPtr(FXMVECTOR V, const uint32_t* z) noexcept
+{
+    assert(z != nullptr);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORU32 U = { { {
+            V.vector4_u32[0],
+            V.vector4_u32[1],
+            *z,
+            V.vector4_u32[3]
+        } } };
+    return U.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vreinterpretq_f32_u32(vld1q_lane_u32(z, *reinterpret_cast<const uint32x4_t*>(&V), 2));
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Swap z and x
+    XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 0, 1, 2));
+    // Convert input to vector
+    XMVECTOR vTemp = _mm_load_ss(reinterpret_cast<const float*>(z));
+    // Replace the x component
+    vResult = _mm_move_ss(vResult, vTemp);
+    // Swap z and x again
+    vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(3, 0, 1, 2));
+    return vResult;
+#endif
+}
+
+// Sets the W component of a vector to an integer value passed by pointer
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMVectorSetIntWPtr(FXMVECTOR V, const uint32_t* w) noexcept
+{
+    assert(w != nullptr);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORU32 U = { { {
+            V.vector4_u32[0],
+            V.vector4_u32[1],
+            V.vector4_u32[2],
+            *w
+        } } };
+    return U.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vreinterpretq_f32_u32(vld1q_lane_u32(w, *reinterpret_cast<const uint32x4_t*>(&V), 3));
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Swap w and x
+    XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 2, 1, 3));
+    // Convert input to vector
+    XMVECTOR vTemp = _mm_load_ss(reinterpret_cast<const float*>(w));
+    // Replace the x component
+    vResult = _mm_move_ss(vResult, vTemp);
+    // Swap w and x again
+    vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(0, 2, 1, 3));
+    return vResult;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorSwizzle
+(
+    FXMVECTOR V,
+    uint32_t E0,
+    uint32_t E1,
+    uint32_t E2,
+    uint32_t E3
+) noexcept
+{
+    assert((E0 < 4) && (E1 < 4) && (E2 < 4) && (E3 < 4));
+    _Analysis_assume_((E0 < 4) && (E1 < 4) && (E2 < 4) && (E3 < 4));
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTORF32 Result = { { {
+            V.vector4_f32[E0],
+            V.vector4_f32[E1],
+            V.vector4_f32[E2],
+            V.vector4_f32[E3]
+        } } };
+    return Result.v;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    static const uint32_t ControlElement[4] =
+    {
+        0x03020100, // XM_SWIZZLE_X
+        0x07060504, // XM_SWIZZLE_Y
+        0x0B0A0908, // XM_SWIZZLE_Z
+        0x0F0E0D0C, // XM_SWIZZLE_W
+    };
+
+    uint8x8x2_t tbl;
+    tbl.val[0] = vreinterpret_u8_f32(vget_low_f32(V));
+    tbl.val[1] = vreinterpret_u8_f32(vget_high_f32(V));
+
+    uint32x2_t idx = vcreate_u32(static_cast<uint64_t>(ControlElement[E0]) | (static_cast<uint64_t>(ControlElement[E1]) << 32));
+    const uint8x8_t rL = vtbl2_u8(tbl, vreinterpret_u8_u32(idx));
+
+    idx = vcreate_u32(static_cast<uint64_t>(ControlElement[E2]) | (static_cast<uint64_t>(ControlElement[E3]) << 32));
+    const uint8x8_t rH = vtbl2_u8(tbl, vreinterpret_u8_u32(idx));
+
+    return vcombine_f32(vreinterpret_f32_u8(rL), vreinterpret_f32_u8(rH));
+#elif defined(_XM_AVX_INTRINSICS_)
+    unsigned int elem[4] = { E0, E1, E2, E3 };
+    __m128i vControl = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&elem[0]));
+    return _mm_permutevar_ps(V, vControl);
+#else
+    auto aPtr = reinterpret_cast<const uint32_t*>(&V);
+
+    XMVECTOR Result;
+    auto pWork = reinterpret_cast<uint32_t*>(&Result);
+
+    pWork[0] = aPtr[E0];
+    pWork[1] = aPtr[E1];
+    pWork[2] = aPtr[E2];
+    pWork[3] = aPtr[E3];
+
+    return Result;
+#endif
+}
+
+//------------------------------------------------------------------------------
+inline XMVECTOR XM_CALLCONV XMVectorPermute
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2,
+    uint32_t PermuteX,
+    uint32_t PermuteY,
+    uint32_t PermuteZ,
+    uint32_t PermuteW
+) noexcept
+{
+    assert(PermuteX <= 7 && PermuteY <= 7 && PermuteZ <= 7 && PermuteW <= 7);
+    _Analysis_assume_(PermuteX <= 7 && PermuteY <= 7 && PermuteZ <= 7 && PermuteW <= 7);
+
+#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+    static const uint32_t ControlElement[8] =
+    {
+        0x03020100, // XM_PERMUTE_0X
+        0x07060504, // XM_PERMUTE_0Y
+        0x0B0A0908, // XM_PERMUTE_0Z
+        0x0F0E0D0C, // XM_PERMUTE_0W
+        0x13121110, // XM_PERMUTE_1X
+        0x17161514, // XM_PERMUTE_1Y
+        0x1B1A1918, // XM_PERMUTE_1Z
+        0x1F1E1D1C, // XM_PERMUTE_1W
+    };
+
+    uint8x8x4_t tbl;
+    tbl.val[0] = vreinterpret_u8_f32(vget_low_f32(V1));
+    tbl.val[1] = vreinterpret_u8_f32(vget_high_f32(V1));
+    tbl.val[2] = vreinterpret_u8_f32(vget_low_f32(V2));
+    tbl.val[3] = vreinterpret_u8_f32(vget_high_f32(V2));
+
+    uint32x2_t idx = vcreate_u32(static_cast<uint64_t>(ControlElement[PermuteX]) | (static_cast<uint64_t>(ControlElement[PermuteY]) << 32));
+    const uint8x8_t rL = vtbl4_u8(tbl, vreinterpret_u8_u32(idx));
+
+    idx = vcreate_u32(static_cast<uint64_t>(ControlElement[PermuteZ]) | (static_cast<uint64_t>(ControlElement[PermuteW]) << 32));
+    const uint8x8_t rH = vtbl4_u8(tbl, vreinterpret_u8_u32(idx));
+
+    return vcombine_f32(vreinterpret_f32_u8(rL), vreinterpret_f32_u8(rH));
+#elif defined(_XM_AVX_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+    static const XMVECTORU32 three = { { { 3, 3, 3, 3 } } };
+
+    XM_ALIGNED_DATA(16) unsigned int elem[4] = { PermuteX, PermuteY, PermuteZ, PermuteW };
+    __m128i vControl = _mm_load_si128(reinterpret_cast<const __m128i*>(&elem[0]));
+
+    __m128i vSelect = _mm_cmpgt_epi32(vControl, three);
+    vControl = _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps(vControl), three));
+
+    __m128 shuffled1 = _mm_permutevar_ps(V1, vControl);
+    __m128 shuffled2 = _mm_permutevar_ps(V2, vControl);
+
+    __m128 masked1 = _mm_andnot_ps(_mm_castsi128_ps(vSelect), shuffled1);
+    __m128 masked2 = _mm_and_ps(_mm_castsi128_ps(vSelect), shuffled2);
+
+    return _mm_or_ps(masked1, masked2);
+#else
+
+    const uint32_t* aPtr[2];
+    aPtr[0] = reinterpret_cast<const uint32_t*>(&V1);
+    aPtr[1] = reinterpret_cast<const uint32_t*>(&V2);
+
+    XMVECTOR Result;
+    auto pWork = reinterpret_cast<uint32_t*>(&Result);
+
+    const uint32_t i0 = PermuteX & 3;
+    const uint32_t vi0 = PermuteX >> 2;
+    pWork[0] = aPtr[vi0][i0];
+
+    const uint32_t i1 = PermuteY & 3;
+    const uint32_t vi1 = PermuteY >> 2;
+    pWork[1] = aPtr[vi1][i1];
+
+    const uint32_t i2 = PermuteZ & 3;
+    const uint32_t vi2 = PermuteZ >> 2;
+    pWork[2] = aPtr[vi2][i2];
+
+    const uint32_t i3 = PermuteW & 3;
+    const uint32_t vi3 = PermuteW >> 2;
+    pWork[3] = aPtr[vi3][i3];
+
+    return Result;
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Define a control vector to be used in XMVectorSelect
+// operations. The four integers specified in XMVectorSelectControl
+// serve as indices to select between components in two vectors.
+// The first index controls selection for the first component of
+// the vectors involved in a select operation, the second index
+// controls selection for the second component etc. A value of
+// zero for an index causes the corresponding component from the first
+// vector to be selected whereas a one causes the component from the
+// second vector to be selected instead.
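+//
+// For example, a control built with XMVectorSelectControl(0, 0, 1, 1)
+// makes XMVectorSelect take x and y from its first argument and z and w
+// from its second (illustrative sketch only, values chosen arbitrarily):
+//
+//     XMVECTOR a    = XMVectorSet(1.f, 2.f, 3.f, 4.f);
+//     XMVECTOR b    = XMVectorSet(5.f, 6.f, 7.f, 8.f);
+//     XMVECTOR ctrl = XMVectorSelectControl(0, 0, 1, 1);
+//     XMVECTOR r    = XMVectorSelect(a, b, ctrl); // r = (1, 2, 7, 8)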
+
+inline XMVECTOR XM_CALLCONV XMVectorSelectControl
+(
+    uint32_t VectorIndex0,
+    uint32_t VectorIndex1,
+    uint32_t VectorIndex2,
+    uint32_t VectorIndex3
+) noexcept
+{
+#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+    // x=Index0,y=Index1,z=Index2,w=Index3
+    __m128i vTemp = _mm_set_epi32(static_cast<int>(VectorIndex3), static_cast<int>(VectorIndex2), static_cast<int>(VectorIndex1), static_cast<int>(VectorIndex0));
+    // Any non-zero entries become 0xFFFFFFFF else 0
+    vTemp = _mm_cmpgt_epi32(vTemp, g_XMZero);
+    return _mm_castsi128_ps(vTemp);
+#elif defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+    int32x2_t V0 = vcreate_s32(static_cast<int64_t>(VectorIndex0) | (static_cast<int64_t>(VectorIndex1) << 32));
+    int32x2_t V1 = vcreate_s32(static_cast<int64_t>(VectorIndex2) | (static_cast<int64_t>(VectorIndex3) << 32));
+    int32x4_t vTemp = vcombine_s32(V0, V1);
+    // Any non-zero entries become 0xFFFFFFFF else 0
+    return vreinterpretq_f32_u32(vcgtq_s32(vTemp, g_XMZero));
+#else
+    XMVECTOR ControlVector;
+    const uint32_t ControlElement[] =
+    {
+        XM_SELECT_0,
+        XM_SELECT_1
+    };
+
+    assert(VectorIndex0 < 2);
+    assert(VectorIndex1 < 2);
+    assert(VectorIndex2 < 2);
+    assert(VectorIndex3 < 2);
+    _Analysis_assume_(VectorIndex0 < 2);
+    _Analysis_assume_(VectorIndex1 < 2);
+    _Analysis_assume_(VectorIndex2 < 2);
+    _Analysis_assume_(VectorIndex3 < 2);
+
+    ControlVector.vector4_u32[0] = ControlElement[VectorIndex0];
+    ControlVector.vector4_u32[1] = ControlElement[VectorIndex1];
+    ControlVector.vector4_u32[2] = ControlElement[VectorIndex2];
+    ControlVector.vector4_u32[3] = ControlElement[VectorIndex3];
+
+    return ControlVector;
+
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorSelect
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2,
+    FXMVECTOR Control
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTORU32 Result = { { {
+            (V1.vector4_u32[0] & ~Control.vector4_u32[0]) | (V2.vector4_u32[0] & Control.vector4_u32[0]),
+            (V1.vector4_u32[1] & ~Control.vector4_u32[1]) | (V2.vector4_u32[1] & Control.vector4_u32[1]),
+            (V1.vector4_u32[2] & ~Control.vector4_u32[2]) | (V2.vector4_u32[2] & Control.vector4_u32[2]),
+            (V1.vector4_u32[3] & ~Control.vector4_u32[3]) | (V2.vector4_u32[3] & Control.vector4_u32[3]),
+        } } };
+    return Result.v;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vbslq_f32(vreinterpretq_u32_f32(Control), V2, V1);
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vTemp1 = _mm_andnot_ps(Control, V1);
+    XMVECTOR vTemp2 = _mm_and_ps(V2, Control);
+    return _mm_or_ps(vTemp1, vTemp2);
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorMergeXY
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTORU32 Result = { { {
+            V1.vector4_u32[0],
+            V2.vector4_u32[0],
+            V1.vector4_u32[1],
+            V2.vector4_u32[1],
+        } } };
+    return Result.v;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vzipq_f32(V1, V2).val[0];
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_unpacklo_ps(V1, V2);
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorMergeZW
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTORU32 Result = { { {
+            V1.vector4_u32[2],
+            V2.vector4_u32[2],
+            V1.vector4_u32[3],
+            V2.vector4_u32[3]
+        } } };
+    return Result.v;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vzipq_f32(V1, V2).val[1];
+#elif
defined(_XM_SSE_INTRINSICS_) + return _mm_unpackhi_ps(V1, V2); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorShiftLeft(FXMVECTOR V1, FXMVECTOR V2, uint32_t Elements) noexcept +{ + assert(Elements < 4); + _Analysis_assume_(Elements < 4); + return XMVectorPermute(V1, V2, Elements, ((Elements)+1), ((Elements)+2), ((Elements)+3)); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorRotateLeft(FXMVECTOR V, uint32_t Elements) noexcept +{ + assert(Elements < 4); + _Analysis_assume_(Elements < 4); + return XMVectorSwizzle(V, Elements & 3, (Elements + 1) & 3, (Elements + 2) & 3, (Elements + 3) & 3); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorRotateRight(FXMVECTOR V, uint32_t Elements) noexcept +{ + assert(Elements < 4); + _Analysis_assume_(Elements < 4); + return XMVectorSwizzle(V, (4 - (Elements)) & 3, (5 - (Elements)) & 3, (6 - (Elements)) & 3, (7 - (Elements)) & 3); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorInsert( + FXMVECTOR VD, FXMVECTOR VS, + uint32_t VSLeftRotateElements, + uint32_t Select0, uint32_t Select1, uint32_t Select2, uint32_t Select3) noexcept +{ + XMVECTOR Control = XMVectorSelectControl(Select0 & 1, Select1 & 1, Select2 & 1, Select3 & 1); + return XMVectorSelect(VD, XMVectorRotateLeft(VS, VSLeftRotateElements), Control); +} + +//------------------------------------------------------------------------------ +// Comparison operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORU32 Control = { { { + (V1.vector4_f32[0] == V2.vector4_f32[0]) ? 0xFFFFFFFF : 0, + (V1.vector4_f32[1] == V2.vector4_f32[1]) ? 0xFFFFFFFF : 0, + (V1.vector4_f32[2] == V2.vector4_f32[2]) ? 0xFFFFFFFF : 0, + (V1.vector4_f32[3] == V2.vector4_f32[3]) ? 0xFFFFFFFF : 0, + } } }; + return Control.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vreinterpretq_f32_u32(vceqq_f32(V1, V2)); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_cmpeq_ps(V1, V2); +#endif +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMVectorEqualR +( + uint32_t* pCR, + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ + assert(pCR != nullptr); +#if defined(_XM_NO_INTRINSICS_) + uint32_t ux = (V1.vector4_f32[0] == V2.vector4_f32[0]) ? 0xFFFFFFFFU : 0; + uint32_t uy = (V1.vector4_f32[1] == V2.vector4_f32[1]) ? 0xFFFFFFFFU : 0; + uint32_t uz = (V1.vector4_f32[2] == V2.vector4_f32[2]) ? 0xFFFFFFFFU : 0; + uint32_t uw = (V1.vector4_f32[3] == V2.vector4_f32[3]) ? 
0xFFFFFFFFU : 0; + uint32_t CR = 0; + if (ux & uy & uz & uw) + { + // All elements are greater + CR = XM_CRMASK_CR6TRUE; + } + else if (!(ux | uy | uz | uw)) + { + // All elements are not greater + CR = XM_CRMASK_CR6FALSE; + } + *pCR = CR; + + XMVECTORU32 Control = { { { ux, uy, uz, uw } } }; + return Control; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vceqq_f32(V1, V2); + uint8x8x2_t vTemp = vzip_u8(vreinterpret_u8_u32(vget_low_u32(vResult)), vreinterpret_u8_u32(vget_high_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1); + uint32_t CR = 0; + if (r == 0xFFFFFFFFU) + { + // All elements are equal + CR = XM_CRMASK_CR6TRUE; + } + else if (!r) + { + // All elements are not equal + CR = XM_CRMASK_CR6FALSE; + } + *pCR = CR; + return vreinterpretq_f32_u32(vResult); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpeq_ps(V1, V2); + uint32_t CR = 0; + int iTest = _mm_movemask_ps(vTemp); + if (iTest == 0xf) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + // All elements are not greater + CR = XM_CRMASK_CR6FALSE; + } + *pCR = CR; + return vTemp; +#endif +} + +//------------------------------------------------------------------------------ +// Treat the components of the vectors as unsigned integers and +// compare individual bits between the two. This is useful for +// comparing control vectors and result vectors returned from +// other comparison operations. + +inline XMVECTOR XM_CALLCONV XMVectorEqualInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORU32 Control = { { { + (V1.vector4_u32[0] == V2.vector4_u32[0]) ? 0xFFFFFFFF : 0, + (V1.vector4_u32[1] == V2.vector4_u32[1]) ? 0xFFFFFFFF : 0, + (V1.vector4_u32[2] == V2.vector4_u32[2]) ? 0xFFFFFFFF : 0, + (V1.vector4_u32[3] == V2.vector4_u32[3]) ? 
0xFFFFFFFF : 0, + } } }; + return Control.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vreinterpretq_f32_u32(vceqq_s32(vreinterpretq_s32_f32(V1), vreinterpretq_s32_f32(V2))); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2)); + return _mm_castsi128_ps(V); +#endif +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMVectorEqualIntR +( + uint32_t* pCR, + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ + assert(pCR != nullptr); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Control = XMVectorEqualInt(V1, V2); + + *pCR = 0; + if (XMVector4EqualInt(Control, XMVectorTrueInt())) + { + // All elements are equal + *pCR |= XM_CRMASK_CR6TRUE; + } + else if (XMVector4EqualInt(Control, XMVectorFalseInt())) + { + // All elements are not equal + *pCR |= XM_CRMASK_CR6FALSE; + } + return Control; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vceqq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2)); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1); + uint32_t CR = 0; + if (r == 0xFFFFFFFFU) + { + // All elements are equal + CR = XM_CRMASK_CR6TRUE; + } + else if (!r) + { + // All elements are not equal + CR = XM_CRMASK_CR6FALSE; + } + *pCR = CR; + return vreinterpretq_f32_u32(vResult); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2)); + int iTemp = _mm_movemask_ps(_mm_castsi128_ps(V)); + uint32_t CR = 0; + if (iTemp == 0x0F) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTemp) + { + CR = XM_CRMASK_CR6FALSE; + } + *pCR = CR; + return _mm_castsi128_ps(V); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorNearEqual +( + FXMVECTOR V1, + FXMVECTOR V2, + FXMVECTOR Epsilon +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + float fDeltax = V1.vector4_f32[0] - V2.vector4_f32[0]; + float fDeltay = V1.vector4_f32[1] - V2.vector4_f32[1]; + float fDeltaz = V1.vector4_f32[2] - V2.vector4_f32[2]; + float fDeltaw = V1.vector4_f32[3] - V2.vector4_f32[3]; + + fDeltax = fabsf(fDeltax); + fDeltay = fabsf(fDeltay); + fDeltaz = fabsf(fDeltaz); + fDeltaw = fabsf(fDeltaw); + + XMVECTORU32 Control = { { { + (fDeltax <= Epsilon.vector4_f32[0]) ? 0xFFFFFFFFU : 0, + (fDeltay <= Epsilon.vector4_f32[1]) ? 0xFFFFFFFFU : 0, + (fDeltaz <= Epsilon.vector4_f32[2]) ? 0xFFFFFFFFU : 0, + (fDeltaw <= Epsilon.vector4_f32[3]) ? 
0xFFFFFFFFU : 0, + } } }; + return Control.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t vDelta = vsubq_f32(V1, V2); +#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES) + return vacleq_f32(vDelta, Epsilon); +#else + return vreinterpretq_f32_u32(vcleq_f32(vabsq_f32(vDelta), Epsilon)); +#endif +#elif defined(_XM_SSE_INTRINSICS_) + // Get the difference + XMVECTOR vDelta = _mm_sub_ps(V1, V2); + // Get the absolute value of the difference + XMVECTOR vTemp = _mm_setzero_ps(); + vTemp = _mm_sub_ps(vTemp, vDelta); + vTemp = _mm_max_ps(vTemp, vDelta); + vTemp = _mm_cmple_ps(vTemp, Epsilon); + return vTemp; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorNotEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORU32 Control = { { { + (V1.vector4_f32[0] != V2.vector4_f32[0]) ? 0xFFFFFFFF : 0, + (V1.vector4_f32[1] != V2.vector4_f32[1]) ? 0xFFFFFFFF : 0, + (V1.vector4_f32[2] != V2.vector4_f32[2]) ? 0xFFFFFFFF : 0, + (V1.vector4_f32[3] != V2.vector4_f32[3]) ? 0xFFFFFFFF : 0, + } } }; + return Control.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(V1, V2))); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_cmpneq_ps(V1, V2); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorNotEqualInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORU32 Control = { { { + (V1.vector4_u32[0] != V2.vector4_u32[0]) ? 0xFFFFFFFFU : 0, + (V1.vector4_u32[1] != V2.vector4_u32[1]) ? 0xFFFFFFFFU : 0, + (V1.vector4_u32[2] != V2.vector4_u32[2]) ? 0xFFFFFFFFU : 0, + (V1.vector4_u32[3] != V2.vector4_u32[3]) ? 0xFFFFFFFFU : 0 + } } }; + return Control.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vreinterpretq_f32_u32(vmvnq_u32( + vceqq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2)))); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2)); + return _mm_xor_ps(_mm_castsi128_ps(V), g_XMNegOneMask); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorGreater +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORU32 Control = { { { + (V1.vector4_f32[0] > V2.vector4_f32[0]) ? 0xFFFFFFFF : 0, + (V1.vector4_f32[1] > V2.vector4_f32[1]) ? 0xFFFFFFFF : 0, + (V1.vector4_f32[2] > V2.vector4_f32[2]) ? 0xFFFFFFFF : 0, + (V1.vector4_f32[3] > V2.vector4_f32[3]) ? 0xFFFFFFFF : 0 + } } }; + return Control.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vreinterpretq_f32_u32(vcgtq_f32(V1, V2)); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_cmpgt_ps(V1, V2); +#endif +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMVectorGreaterR +( + uint32_t* pCR, + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ + assert(pCR != nullptr); +#if defined(_XM_NO_INTRINSICS_) + + uint32_t ux = (V1.vector4_f32[0] > V2.vector4_f32[0]) ? 0xFFFFFFFFU : 0; + uint32_t uy = (V1.vector4_f32[1] > V2.vector4_f32[1]) ? 0xFFFFFFFFU : 0; + uint32_t uz = (V1.vector4_f32[2] > V2.vector4_f32[2]) ? 0xFFFFFFFFU : 0; + uint32_t uw = (V1.vector4_f32[3] > V2.vector4_f32[3]) ? 
0xFFFFFFFFU : 0; + uint32_t CR = 0; + if (ux & uy & uz & uw) + { + // All elements are greater + CR = XM_CRMASK_CR6TRUE; + } + else if (!(ux | uy | uz | uw)) + { + // All elements are not greater + CR = XM_CRMASK_CR6FALSE; + } + *pCR = CR; + + XMVECTORU32 Control = { { { ux, uy, uz, uw } } }; + return Control.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcgtq_f32(V1, V2); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1); + uint32_t CR = 0; + if (r == 0xFFFFFFFFU) + { + // All elements are greater + CR = XM_CRMASK_CR6TRUE; + } + else if (!r) + { + // All elements are not greater + CR = XM_CRMASK_CR6FALSE; + } + *pCR = CR; + return vreinterpretq_f32_u32(vResult); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpgt_ps(V1, V2); + uint32_t CR = 0; + int iTest = _mm_movemask_ps(vTemp); + if (iTest == 0xf) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + // All elements are not greater + CR = XM_CRMASK_CR6FALSE; + } + *pCR = CR; + return vTemp; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorGreaterOrEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORU32 Control = { { { + (V1.vector4_f32[0] >= V2.vector4_f32[0]) ? 0xFFFFFFFF : 0, + (V1.vector4_f32[1] >= V2.vector4_f32[1]) ? 0xFFFFFFFF : 0, + (V1.vector4_f32[2] >= V2.vector4_f32[2]) ? 0xFFFFFFFF : 0, + (V1.vector4_f32[3] >= V2.vector4_f32[3]) ? 0xFFFFFFFF : 0 + } } }; + return Control.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vreinterpretq_f32_u32(vcgeq_f32(V1, V2)); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_cmpge_ps(V1, V2); +#endif +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMVectorGreaterOrEqualR +( + uint32_t* pCR, + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ + assert(pCR != nullptr); +#if defined(_XM_NO_INTRINSICS_) + + uint32_t ux = (V1.vector4_f32[0] >= V2.vector4_f32[0]) ? 0xFFFFFFFFU : 0; + uint32_t uy = (V1.vector4_f32[1] >= V2.vector4_f32[1]) ? 0xFFFFFFFFU : 0; + uint32_t uz = (V1.vector4_f32[2] >= V2.vector4_f32[2]) ? 0xFFFFFFFFU : 0; + uint32_t uw = (V1.vector4_f32[3] >= V2.vector4_f32[3]) ? 
0xFFFFFFFFU : 0; + uint32_t CR = 0; + if (ux & uy & uz & uw) + { + // All elements are greater + CR = XM_CRMASK_CR6TRUE; + } + else if (!(ux | uy | uz | uw)) + { + // All elements are not greater + CR = XM_CRMASK_CR6FALSE; + } + *pCR = CR; + + XMVECTORU32 Control = { { { ux, uy, uz, uw } } }; + return Control.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcgeq_f32(V1, V2); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1); + uint32_t CR = 0; + if (r == 0xFFFFFFFFU) + { + // All elements are greater or equal + CR = XM_CRMASK_CR6TRUE; + } + else if (!r) + { + // All elements are not greater or equal + CR = XM_CRMASK_CR6FALSE; + } + *pCR = CR; + return vreinterpretq_f32_u32(vResult); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpge_ps(V1, V2); + uint32_t CR = 0; + int iTest = _mm_movemask_ps(vTemp); + if (iTest == 0xf) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + // All elements are not greater + CR = XM_CRMASK_CR6FALSE; + } + *pCR = CR; + return vTemp; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorLess +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORU32 Control = { { { + (V1.vector4_f32[0] < V2.vector4_f32[0]) ? 0xFFFFFFFF : 0, + (V1.vector4_f32[1] < V2.vector4_f32[1]) ? 0xFFFFFFFF : 0, + (V1.vector4_f32[2] < V2.vector4_f32[2]) ? 0xFFFFFFFF : 0, + (V1.vector4_f32[3] < V2.vector4_f32[3]) ? 0xFFFFFFFF : 0 + } } }; + return Control.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vreinterpretq_f32_u32(vcltq_f32(V1, V2)); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_cmplt_ps(V1, V2); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorLessOrEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORU32 Control = { { { + (V1.vector4_f32[0] <= V2.vector4_f32[0]) ? 0xFFFFFFFF : 0, + (V1.vector4_f32[1] <= V2.vector4_f32[1]) ? 0xFFFFFFFF : 0, + (V1.vector4_f32[2] <= V2.vector4_f32[2]) ? 0xFFFFFFFF : 0, + (V1.vector4_f32[3] <= V2.vector4_f32[3]) ? 0xFFFFFFFF : 0 + } } }; + return Control.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vreinterpretq_f32_u32(vcleq_f32(V1, V2)); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_cmple_ps(V1, V2); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorInBounds +( + FXMVECTOR V, + FXMVECTOR Bounds +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORU32 Control = { { { + (V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) ? 0xFFFFFFFF : 0, + (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1]) ? 0xFFFFFFFF : 0, + (V.vector4_f32[2] <= Bounds.vector4_f32[2] && V.vector4_f32[2] >= -Bounds.vector4_f32[2]) ? 0xFFFFFFFF : 0, + (V.vector4_f32[3] <= Bounds.vector4_f32[3] && V.vector4_f32[3] >= -Bounds.vector4_f32[3]) ? 
0xFFFFFFFF : 0 + } } }; + return Control.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Test if less than or equal + uint32x4_t vTemp1 = vcleq_f32(V, Bounds); + // Negate the bounds + uint32x4_t vTemp2 = vreinterpretq_u32_f32(vnegq_f32(Bounds)); + // Test if greater or equal (Reversed) + vTemp2 = vcleq_f32(vreinterpretq_f32_u32(vTemp2), V); + // Blend answers + vTemp1 = vandq_u32(vTemp1, vTemp2); + return vreinterpretq_f32_u32(vTemp1); +#elif defined(_XM_SSE_INTRINSICS_) + // Test if less than or equal + XMVECTOR vTemp1 = _mm_cmple_ps(V, Bounds); + // Negate the bounds + XMVECTOR vTemp2 = _mm_mul_ps(Bounds, g_XMNegativeOne); + // Test if greater or equal (Reversed) + vTemp2 = _mm_cmple_ps(vTemp2, V); + // Blend answers + vTemp1 = _mm_and_ps(vTemp1, vTemp2); + return vTemp1; +#endif +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMVectorInBoundsR +( + uint32_t* pCR, + FXMVECTOR V, + FXMVECTOR Bounds +) noexcept +{ + assert(pCR != nullptr); +#if defined(_XM_NO_INTRINSICS_) + + uint32_t ux = (V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) ? 0xFFFFFFFFU : 0; + uint32_t uy = (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1]) ? 0xFFFFFFFFU : 0; + uint32_t uz = (V.vector4_f32[2] <= Bounds.vector4_f32[2] && V.vector4_f32[2] >= -Bounds.vector4_f32[2]) ? 0xFFFFFFFFU : 0; + uint32_t uw = (V.vector4_f32[3] <= Bounds.vector4_f32[3] && V.vector4_f32[3] >= -Bounds.vector4_f32[3]) ? 0xFFFFFFFFU : 0; + + uint32_t CR = 0; + if (ux & uy & uz & uw) + { + // All elements are in bounds + CR = XM_CRMASK_CR6BOUNDS; + } + *pCR = CR; + + XMVECTORU32 Control = { { { ux, uy, uz, uw } } }; + return Control.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Test if less than or equal + uint32x4_t vTemp1 = vcleq_f32(V, Bounds); + // Negate the bounds + uint32x4_t vTemp2 = vreinterpretq_u32_f32(vnegq_f32(Bounds)); + // Test if greater or equal (Reversed) + vTemp2 = vcleq_f32(vreinterpretq_f32_u32(vTemp2), V); + // Blend answers + vTemp1 = vandq_u32(vTemp1, vTemp2); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vTemp1)), vget_high_u8(vreinterpretq_u8_u32(vTemp1))); + uint16x4x2_t vTemp3 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp3.val[1]), 1); + uint32_t CR = 0; + if (r == 0xFFFFFFFFU) + { + // All elements are in bounds + CR = XM_CRMASK_CR6BOUNDS; + } + *pCR = CR; + return vreinterpretq_f32_u32(vTemp1); +#elif defined(_XM_SSE_INTRINSICS_) + // Test if less than or equal + XMVECTOR vTemp1 = _mm_cmple_ps(V, Bounds); + // Negate the bounds + XMVECTOR vTemp2 = _mm_mul_ps(Bounds, g_XMNegativeOne); + // Test if greater or equal (Reversed) + vTemp2 = _mm_cmple_ps(vTemp2, V); + // Blend answers + vTemp1 = _mm_and_ps(vTemp1, vTemp2); + + uint32_t CR = 0; + if (_mm_movemask_ps(vTemp1) == 0xf) + { + // All elements are in bounds + CR = XM_CRMASK_CR6BOUNDS; + } + *pCR = CR; + return vTemp1; +#endif +} + +//------------------------------------------------------------------------------ + +#if !defined(_XM_NO_INTRINSICS_) && defined(_MSC_VER) && !defined(__clang__) && !defined(__INTEL_COMPILER) +#pragma float_control(push) +#pragma float_control(precise, on) +#endif + +inline XMVECTOR XM_CALLCONV XMVectorIsNaN(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORU32 Control = { { { + XMISNAN(V.vector4_f32[0]) ? 
0xFFFFFFFFU : 0, + XMISNAN(V.vector4_f32[1]) ? 0xFFFFFFFFU : 0, + XMISNAN(V.vector4_f32[2]) ? 0xFFFFFFFFU : 0, + XMISNAN(V.vector4_f32[3]) ? 0xFFFFFFFFU : 0 + } } }; + return Control.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Test against itself. NaN is always not equal + uint32x4_t vTempNan = vceqq_f32(V, V); + // Flip results + return vreinterpretq_f32_u32(vmvnq_u32(vTempNan)); +#elif defined(_XM_SSE_INTRINSICS_) + // Test against itself. NaN is always not equal + return _mm_cmpneq_ps(V, V); +#endif +} + +#if !defined(_XM_NO_INTRINSICS_) && defined(_MSC_VER) && !defined(__clang__) && !defined(__INTEL_COMPILER) +#pragma float_control(pop) +#endif + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorIsInfinite(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORU32 Control = { { { + XMISINF(V.vector4_f32[0]) ? 0xFFFFFFFFU : 0, + XMISINF(V.vector4_f32[1]) ? 0xFFFFFFFFU : 0, + XMISINF(V.vector4_f32[2]) ? 0xFFFFFFFFU : 0, + XMISINF(V.vector4_f32[3]) ? 0xFFFFFFFFU : 0 + } } }; + return Control.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Mask off the sign bit + uint32x4_t vTemp = vandq_u32(vreinterpretq_u32_f32(V), g_XMAbsMask); + // Compare to infinity + vTemp = vceqq_f32(vreinterpretq_f32_u32(vTemp), g_XMInfinity); + // If any are infinity, the signs are true. + return vreinterpretq_f32_u32(vTemp); +#elif defined(_XM_SSE_INTRINSICS_) + // Mask off the sign bit + __m128 vTemp = _mm_and_ps(V, g_XMAbsMask); + // Compare to infinity + vTemp = _mm_cmpeq_ps(vTemp, g_XMInfinity); + // If any are infinity, the signs are true. + return vTemp; +#endif +} + +//------------------------------------------------------------------------------ +// Rounding and clamping operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorMin +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORF32 Result = { { { + (V1.vector4_f32[0] < V2.vector4_f32[0]) ? V1.vector4_f32[0] : V2.vector4_f32[0], + (V1.vector4_f32[1] < V2.vector4_f32[1]) ? V1.vector4_f32[1] : V2.vector4_f32[1], + (V1.vector4_f32[2] < V2.vector4_f32[2]) ? V1.vector4_f32[2] : V2.vector4_f32[2], + (V1.vector4_f32[3] < V2.vector4_f32[3]) ? V1.vector4_f32[3] : V2.vector4_f32[3] + } } }; + return Result.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vminq_f32(V1, V2); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_min_ps(V1, V2); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorMax +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORF32 Result = { { { + (V1.vector4_f32[0] > V2.vector4_f32[0]) ? V1.vector4_f32[0] : V2.vector4_f32[0], + (V1.vector4_f32[1] > V2.vector4_f32[1]) ? V1.vector4_f32[1] : V2.vector4_f32[1], + (V1.vector4_f32[2] > V2.vector4_f32[2]) ? V1.vector4_f32[2] : V2.vector4_f32[2], + (V1.vector4_f32[3] > V2.vector4_f32[3]) ? V1.vector4_f32[3] : V2.vector4_f32[3] + } } }; + return Result.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vmaxq_f32(V1, V2); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_max_ps(V1, V2); +#endif +} + +//------------------------------------------------------------------------------ + +namespace Internal +{ + // Round to nearest (even) a.k.a. 
banker's rounding + inline float round_to_nearest(float x) noexcept + { + float i = floorf(x); + x -= i; + if (x < 0.5f) + return i; + if (x > 0.5f) + return i + 1.f; + + float int_part; + (void)modff(i / 2.f, &int_part); + if ((2.f * int_part) == i) + { + return i; + } + + return i + 1.f; + } +} + +#if !defined(_XM_NO_INTRINSICS_) && defined(_MSC_VER) && !defined(__clang__) && !defined(__INTEL_COMPILER) +#pragma float_control(push) +#pragma float_control(precise, on) +#endif + +inline XMVECTOR XM_CALLCONV XMVectorRound(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORF32 Result = { { { + Internal::round_to_nearest(V.vector4_f32[0]), + Internal::round_to_nearest(V.vector4_f32[1]), + Internal::round_to_nearest(V.vector4_f32[2]), + Internal::round_to_nearest(V.vector4_f32[3]) + } } }; + return Result.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) +#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__ + return vrndnq_f32(V); +#else + uint32x4_t sign = vandq_u32(vreinterpretq_u32_f32(V), g_XMNegativeZero); + float32x4_t sMagic = vreinterpretq_f32_u32(vorrq_u32(g_XMNoFraction, sign)); + float32x4_t R1 = vaddq_f32(V, sMagic); + R1 = vsubq_f32(R1, sMagic); + float32x4_t R2 = vabsq_f32(V); + uint32x4_t mask = vcleq_f32(R2, g_XMNoFraction); + return vbslq_f32(mask, R1, V); +#endif +#elif defined(_XM_SSE4_INTRINSICS_) + return _mm_round_ps(V, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); +#elif defined(_XM_SSE_INTRINSICS_) + __m128 sign = _mm_and_ps(V, g_XMNegativeZero); + __m128 sMagic = _mm_or_ps(g_XMNoFraction, sign); + __m128 R1 = _mm_add_ps(V, sMagic); + R1 = _mm_sub_ps(R1, sMagic); + __m128 R2 = _mm_and_ps(V, g_XMAbsMask); + __m128 mask = _mm_cmple_ps(R2, g_XMNoFraction); + R2 = _mm_andnot_ps(mask, V); + R1 = _mm_and_ps(R1, mask); + XMVECTOR vResult = _mm_xor_ps(R1, R2); + return vResult; +#endif +} + +#if !defined(_XM_NO_INTRINSICS_) && defined(_MSC_VER) && !defined(__clang__) && !defined(__INTEL_COMPILER) +#pragma float_control(pop) +#endif + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorTruncate(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + uint32_t i; + + // Avoid C4701 + Result.vector4_f32[0] = 0.0f; + + for (i = 0; i < 4; i++) + { + if (XMISNAN(V.vector4_f32[i])) + { + Result.vector4_u32[i] = 0x7FC00000; + } + else if (fabsf(V.vector4_f32[i]) < 8388608.0f) + { + Result.vector4_f32[i] = static_cast(static_cast(V.vector4_f32[i])); + } + else + { + Result.vector4_f32[i] = V.vector4_f32[i]; + } + } + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) +#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__ + return vrndq_f32(V); +#else + float32x4_t vTest = vabsq_f32(V); + vTest = vreinterpretq_f32_u32(vcltq_f32(vTest, g_XMNoFraction)); + + int32x4_t vInt = vcvtq_s32_f32(V); + float32x4_t vResult = vcvtq_f32_s32(vInt); + + // All numbers less than 8388608 will use the round to int + // All others, use the ORIGINAL value + return vbslq_f32(vreinterpretq_u32_f32(vTest), vResult, V); +#endif +#elif defined(_XM_SSE4_INTRINSICS_) + return _mm_round_ps(V, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +#elif defined(_XM_SSE_INTRINSICS_) + // To handle NAN, INF and numbers greater than 8388608, use masking + // Get the abs value + __m128i vTest = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask); + // Test for greater than 8388608 (All floats with NO fractionals, NAN and INF + vTest = 
_mm_cmplt_epi32(vTest, g_XMNoFraction); + // Convert to int and back to float for rounding with truncation + __m128i vInt = _mm_cvttps_epi32(V); + // Convert back to floats + XMVECTOR vResult = _mm_cvtepi32_ps(vInt); + // All numbers less than 8388608 will use the round to int + vResult = _mm_and_ps(vResult, _mm_castsi128_ps(vTest)); + // All others, use the ORIGINAL value + vTest = _mm_andnot_si128(vTest, _mm_castps_si128(V)); + vResult = _mm_or_ps(vResult, _mm_castsi128_ps(vTest)); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorFloor(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + floorf(V.vector4_f32[0]), + floorf(V.vector4_f32[1]), + floorf(V.vector4_f32[2]), + floorf(V.vector4_f32[3]) + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) +#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__ + return vrndmq_f32(V); +#else + float32x4_t vTest = vabsq_f32(V); + vTest = vreinterpretq_f32_u32(vcltq_f32(vTest, g_XMNoFraction)); + // Truncate + int32x4_t vInt = vcvtq_s32_f32(V); + float32x4_t vResult = vcvtq_f32_s32(vInt); + uint32x4_t vLargerMask = vcgtq_f32(vResult, V); + // 0 -> 0, 0xffffffff -> -1.0f + float32x4_t vLarger = vcvtq_f32_s32(vreinterpretq_s32_u32(vLargerMask)); + vResult = vaddq_f32(vResult, vLarger); + // All numbers less than 8388608 will use the round to int + // All others, use the ORIGINAL value + return vbslq_f32(vreinterpretq_u32_f32(vTest), vResult, V); +#endif +#elif defined(_XM_SSE4_INTRINSICS_) + return _mm_floor_ps(V); +#elif defined(_XM_SSE_INTRINSICS_) + // To handle NAN, INF and numbers greater than 8388608, use masking + __m128i vTest = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask); + vTest = _mm_cmplt_epi32(vTest, g_XMNoFraction); + // Truncate + __m128i vInt = _mm_cvttps_epi32(V); + XMVECTOR vResult = _mm_cvtepi32_ps(vInt); + __m128 vLarger = _mm_cmpgt_ps(vResult, V); + // 0 -> 0, 0xffffffff -> -1.0f + vLarger = _mm_cvtepi32_ps(_mm_castps_si128(vLarger)); + vResult = _mm_add_ps(vResult, vLarger); + // All numbers less than 8388608 will use the round to int + vResult = _mm_and_ps(vResult, _mm_castsi128_ps(vTest)); + // All others, use the ORIGINAL value + vTest = _mm_andnot_si128(vTest, _mm_castps_si128(V)); + vResult = _mm_or_ps(vResult, _mm_castsi128_ps(vTest)); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorCeiling(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + ceilf(V.vector4_f32[0]), + ceilf(V.vector4_f32[1]), + ceilf(V.vector4_f32[2]), + ceilf(V.vector4_f32[3]) + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) +#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__ + return vrndpq_f32(V); +#else + float32x4_t vTest = vabsq_f32(V); + vTest = vreinterpretq_f32_u32(vcltq_f32(vTest, g_XMNoFraction)); + // Truncate + int32x4_t vInt = vcvtq_s32_f32(V); + float32x4_t vResult = vcvtq_f32_s32(vInt); + uint32x4_t vSmallerMask = vcltq_f32(vResult, V); + // 0 -> 0, 0xffffffff -> -1.0f + float32x4_t vSmaller = vcvtq_f32_s32(vreinterpretq_s32_u32(vSmallerMask)); + vResult = vsubq_f32(vResult, vSmaller); + // All numbers less than 8388608 will use the round to int + // All others, use the ORIGINAL value + return vbslq_f32(vreinterpretq_u32_f32(vTest), vResult, V); 
+#endif +#elif defined(_XM_SSE4_INTRINSICS_) + return _mm_ceil_ps(V); +#elif defined(_XM_SSE_INTRINSICS_) + // To handle NAN, INF and numbers greater than 8388608, use masking + __m128i vTest = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask); + vTest = _mm_cmplt_epi32(vTest, g_XMNoFraction); + // Truncate + __m128i vInt = _mm_cvttps_epi32(V); + XMVECTOR vResult = _mm_cvtepi32_ps(vInt); + __m128 vSmaller = _mm_cmplt_ps(vResult, V); + // 0 -> 0, 0xffffffff -> -1.0f + vSmaller = _mm_cvtepi32_ps(_mm_castps_si128(vSmaller)); + vResult = _mm_sub_ps(vResult, vSmaller); + // All numbers less than 8388608 will use the round to int + vResult = _mm_and_ps(vResult, _mm_castsi128_ps(vTest)); + // All others, use the ORIGINAL value + vTest = _mm_andnot_si128(vTest, _mm_castps_si128(V)); + vResult = _mm_or_ps(vResult, _mm_castsi128_ps(vTest)); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorClamp +( + FXMVECTOR V, + FXMVECTOR Min, + FXMVECTOR Max +) noexcept +{ + assert(XMVector4LessOrEqual(Min, Max)); + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result = XMVectorMax(Min, V); + Result = XMVectorMin(Max, Result); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t vResult = vmaxq_f32(Min, V); + vResult = vminq_f32(Max, vResult); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult; + vResult = _mm_max_ps(Min, V); + vResult = _mm_min_ps(Max, vResult); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorSaturate(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + const XMVECTOR Zero = XMVectorZero(); + + return XMVectorClamp(V, Zero, g_XMOne.v); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Set <0 to 0 + float32x4_t vResult = vmaxq_f32(V, vdupq_n_f32(0)); + // Set>1 to 1 + return vminq_f32(vResult, vdupq_n_f32(1.0f)); +#elif defined(_XM_SSE_INTRINSICS_) + // Set <0 to 0 + XMVECTOR vResult = _mm_max_ps(V, g_XMZero); + // Set>1 to 1 + return _mm_min_ps(vResult, g_XMOne); +#endif +} + +//------------------------------------------------------------------------------ +// Bitwise logical operations +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorAndInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORU32 Result = { { { + V1.vector4_u32[0] & V2.vector4_u32[0], + V1.vector4_u32[1] & V2.vector4_u32[1], + V1.vector4_u32[2] & V2.vector4_u32[2], + V1.vector4_u32[3] & V2.vector4_u32[3] + } } }; + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2))); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_and_ps(V1, V2); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorAndCInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORU32 Result = { { { + V1.vector4_u32[0] & ~V2.vector4_u32[0], + V1.vector4_u32[1] & ~V2.vector4_u32[1], + V1.vector4_u32[2] & ~V2.vector4_u32[2], + V1.vector4_u32[3] & ~V2.vector4_u32[3] + } } }; + return Result.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2))); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V 
= _mm_andnot_si128(_mm_castps_si128(V2), _mm_castps_si128(V1)); + return _mm_castsi128_ps(V); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorOrInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORU32 Result = { { { + V1.vector4_u32[0] | V2.vector4_u32[0], + V1.vector4_u32[1] | V2.vector4_u32[1], + V1.vector4_u32[2] | V2.vector4_u32[2], + V1.vector4_u32[3] | V2.vector4_u32[3] + } } }; + return Result.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2))); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_or_si128(_mm_castps_si128(V1), _mm_castps_si128(V2)); + return _mm_castsi128_ps(V); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorNorInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORU32 Result = { { { + ~(V1.vector4_u32[0] | V2.vector4_u32[0]), + ~(V1.vector4_u32[1] | V2.vector4_u32[1]), + ~(V1.vector4_u32[2] | V2.vector4_u32[2]), + ~(V1.vector4_u32[3] | V2.vector4_u32[3]) + } } }; + return Result.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t Result = vorrq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2)); + return vreinterpretq_f32_u32(vbicq_u32(g_XMNegOneMask, Result)); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i Result; + Result = _mm_or_si128(_mm_castps_si128(V1), _mm_castps_si128(V2)); + Result = _mm_andnot_si128(Result, g_XMNegOneMask); + return _mm_castsi128_ps(Result); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorXorInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORU32 Result = { { { + V1.vector4_u32[0] ^ V2.vector4_u32[0], + V1.vector4_u32[1] ^ V2.vector4_u32[1], + V1.vector4_u32[2] ^ V2.vector4_u32[2], + V1.vector4_u32[3] ^ V2.vector4_u32[3] + } } }; + return Result.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2))); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_xor_si128(_mm_castps_si128(V1), _mm_castps_si128(V2)); + return _mm_castsi128_ps(V); +#endif +} + +//------------------------------------------------------------------------------ +// Computation operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorNegate(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORF32 Result = { { { + -V.vector4_f32[0], + -V.vector4_f32[1], + -V.vector4_f32[2], + -V.vector4_f32[3] + } } }; + return Result.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vnegq_f32(V); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR Z; + + Z = _mm_setzero_ps(); + + return _mm_sub_ps(Z, V); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorAdd +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORF32 Result = { { { + V1.vector4_f32[0] + V2.vector4_f32[0], + V1.vector4_f32[1] + V2.vector4_f32[1], + V1.vector4_f32[2] + V2.vector4_f32[2], + V1.vector4_f32[3] + V2.vector4_f32[3] + } } }; + return Result.v; + 
+#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vaddq_f32(V1, V2); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_add_ps(V1, V2); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorSum(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORF32 Result; + Result.f[0] = + Result.f[1] = + Result.f[2] = + Result.f[3] = V.vector4_f32[0] + V.vector4_f32[1] + V.vector4_f32[2] + V.vector4_f32[3]; + return Result.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) +#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__ + float32x4_t vTemp = vpaddq_f32(V, V); + return vpaddq_f32(vTemp, vTemp); +#else + float32x2_t v1 = vget_low_f32(V); + float32x2_t v2 = vget_high_f32(V); + v1 = vadd_f32(v1, v2); + v1 = vpadd_f32(v1, v1); + return vcombine_f32(v1, v1); +#endif +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vTemp = _mm_hadd_ps(V, V); + return _mm_hadd_ps(vTemp, vTemp); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 3, 0, 1)); + XMVECTOR vTemp2 = _mm_add_ps(V, vTemp); + vTemp = XM_PERMUTE_PS(vTemp2, _MM_SHUFFLE(1, 0, 3, 2)); + return _mm_add_ps(vTemp, vTemp2); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorAddAngles +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + const XMVECTOR Zero = XMVectorZero(); + + // Add the given angles together. If the range of V1 is such + // that -Pi <= V1 < Pi and the range of V2 is such that + // -2Pi <= V2 <= 2Pi, then the range of the resulting angle + // will be -Pi <= Result < Pi. + XMVECTOR Result = XMVectorAdd(V1, V2); + + XMVECTOR Mask = XMVectorLess(Result, g_XMNegativePi.v); + XMVECTOR Offset = XMVectorSelect(Zero, g_XMTwoPi.v, Mask); + + Mask = XMVectorGreaterOrEqual(Result, g_XMPi.v); + Offset = XMVectorSelect(Offset, g_XMNegativeTwoPi.v, Mask); + + Result = XMVectorAdd(Result, Offset); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Adjust the angles + float32x4_t vResult = vaddq_f32(V1, V2); + // Less than Pi? + uint32x4_t vOffset = vcltq_f32(vResult, g_XMNegativePi); + vOffset = vandq_u32(vOffset, g_XMTwoPi); + // Add 2Pi to all entries less than -Pi + vResult = vaddq_f32(vResult, vreinterpretq_f32_u32(vOffset)); + // Greater than or equal to Pi? + vOffset = vcgeq_f32(vResult, g_XMPi); + vOffset = vandq_u32(vOffset, g_XMTwoPi); + // Sub 2Pi to all entries greater than Pi + vResult = vsubq_f32(vResult, vreinterpretq_f32_u32(vOffset)); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + // Adjust the angles + XMVECTOR vResult = _mm_add_ps(V1, V2); + // Less than Pi? + XMVECTOR vOffset = _mm_cmplt_ps(vResult, g_XMNegativePi); + vOffset = _mm_and_ps(vOffset, g_XMTwoPi); + // Add 2Pi to all entries less than -Pi + vResult = _mm_add_ps(vResult, vOffset); + // Greater than or equal to Pi? 
+ vOffset = _mm_cmpge_ps(vResult, g_XMPi); + vOffset = _mm_and_ps(vOffset, g_XMTwoPi); + // Sub 2Pi to all entries greater than Pi + vResult = _mm_sub_ps(vResult, vOffset); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorSubtract +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORF32 Result = { { { + V1.vector4_f32[0] - V2.vector4_f32[0], + V1.vector4_f32[1] - V2.vector4_f32[1], + V1.vector4_f32[2] - V2.vector4_f32[2], + V1.vector4_f32[3] - V2.vector4_f32[3] + } } }; + return Result.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vsubq_f32(V1, V2); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_sub_ps(V1, V2); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorSubtractAngles +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + const XMVECTOR Zero = XMVectorZero(); + + // Subtract the given angles. If the range of V1 is such + // that -Pi <= V1 < Pi and the range of V2 is such that + // -2Pi <= V2 <= 2Pi, then the range of the resulting angle + // will be -Pi <= Result < Pi. + XMVECTOR Result = XMVectorSubtract(V1, V2); + + XMVECTOR Mask = XMVectorLess(Result, g_XMNegativePi.v); + XMVECTOR Offset = XMVectorSelect(Zero, g_XMTwoPi.v, Mask); + + Mask = XMVectorGreaterOrEqual(Result, g_XMPi.v); + Offset = XMVectorSelect(Offset, g_XMNegativeTwoPi.v, Mask); + + Result = XMVectorAdd(Result, Offset); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Adjust the angles + XMVECTOR vResult = vsubq_f32(V1, V2); + // Less than Pi? + uint32x4_t vOffset = vcltq_f32(vResult, g_XMNegativePi); + vOffset = vandq_u32(vOffset, g_XMTwoPi); + // Add 2Pi to all entries less than -Pi + vResult = vaddq_f32(vResult, vreinterpretq_f32_u32(vOffset)); + // Greater than or equal to Pi? + vOffset = vcgeq_f32(vResult, g_XMPi); + vOffset = vandq_u32(vOffset, g_XMTwoPi); + // Sub 2Pi to all entries greater than Pi + vResult = vsubq_f32(vResult, vreinterpretq_f32_u32(vOffset)); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + // Adjust the angles + XMVECTOR vResult = _mm_sub_ps(V1, V2); + // Less than Pi? + XMVECTOR vOffset = _mm_cmplt_ps(vResult, g_XMNegativePi); + vOffset = _mm_and_ps(vOffset, g_XMTwoPi); + // Add 2Pi to all entries less than -Pi + vResult = _mm_add_ps(vResult, vOffset); + // Greater than or equal to Pi? 
+ vOffset = _mm_cmpge_ps(vResult, g_XMPi); + vOffset = _mm_and_ps(vOffset, g_XMTwoPi); + // Sub 2Pi to all entries greater than Pi + vResult = _mm_sub_ps(vResult, vOffset); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorMultiply +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + V1.vector4_f32[0] * V2.vector4_f32[0], + V1.vector4_f32[1] * V2.vector4_f32[1], + V1.vector4_f32[2] * V2.vector4_f32[2], + V1.vector4_f32[3] * V2.vector4_f32[3] + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vmulq_f32(V1, V2); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_mul_ps(V1, V2); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorMultiplyAdd +( + FXMVECTOR V1, + FXMVECTOR V2, + FXMVECTOR V3 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + V1.vector4_f32[0] * V2.vector4_f32[0] + V3.vector4_f32[0], + V1.vector4_f32[1] * V2.vector4_f32[1] + V3.vector4_f32[1], + V1.vector4_f32[2] * V2.vector4_f32[2] + V3.vector4_f32[2], + V1.vector4_f32[3] * V2.vector4_f32[3] + V3.vector4_f32[3] + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) +#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__ + return vfmaq_f32(V3, V1, V2); +#else + return vmlaq_f32(V3, V1, V2); +#endif +#elif defined(_XM_SSE_INTRINSICS_) + return XM_FMADD_PS(V1, V2, V3); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorDivide +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + V1.vector4_f32[0] / V2.vector4_f32[0], + V1.vector4_f32[1] / V2.vector4_f32[1], + V1.vector4_f32[2] / V2.vector4_f32[2], + V1.vector4_f32[3] / V2.vector4_f32[3] + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) +#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__ + return vdivq_f32(V1, V2); +#else + // 2 iterations of Newton-Raphson refinement of reciprocal + float32x4_t Reciprocal = vrecpeq_f32(V2); + float32x4_t S = vrecpsq_f32(Reciprocal, V2); + Reciprocal = vmulq_f32(S, Reciprocal); + S = vrecpsq_f32(Reciprocal, V2); + Reciprocal = vmulq_f32(S, Reciprocal); + return vmulq_f32(V1, Reciprocal); +#endif +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_div_ps(V1, V2); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorNegativeMultiplySubtract +( + FXMVECTOR V1, + FXMVECTOR V2, + FXMVECTOR V3 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + V3.vector4_f32[0] - (V1.vector4_f32[0] * V2.vector4_f32[0]), + V3.vector4_f32[1] - (V1.vector4_f32[1] * V2.vector4_f32[1]), + V3.vector4_f32[2] - (V1.vector4_f32[2] * V2.vector4_f32[2]), + V3.vector4_f32[3] - (V1.vector4_f32[3] * V2.vector4_f32[3]) + } } }; + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) +#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__ + return vfmsq_f32(V3, V1, V2); +#else + return vmlsq_f32(V3, V1, V2); +#endif +#elif defined(_XM_SSE_INTRINSICS_) + return XM_FNMADD_PS(V1, V2, V3); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR 
XM_CALLCONV XMVectorScale +( + FXMVECTOR V, + float ScaleFactor +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + V.vector4_f32[0] * ScaleFactor, + V.vector4_f32[1] * ScaleFactor, + V.vector4_f32[2] * ScaleFactor, + V.vector4_f32[3] * ScaleFactor + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vmulq_n_f32(V, ScaleFactor); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = _mm_set_ps1(ScaleFactor); + return _mm_mul_ps(vResult, V); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorReciprocalEst(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + 1.f / V.vector4_f32[0], + 1.f / V.vector4_f32[1], + 1.f / V.vector4_f32[2], + 1.f / V.vector4_f32[3] + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vrecpeq_f32(V); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_rcp_ps(V); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorReciprocal(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + 1.f / V.vector4_f32[0], + 1.f / V.vector4_f32[1], + 1.f / V.vector4_f32[2], + 1.f / V.vector4_f32[3] + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) +#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__ + float32x4_t one = vdupq_n_f32(1.0f); + return vdivq_f32(one, V); +#else + // 2 iterations of Newton-Raphson refinement + float32x4_t Reciprocal = vrecpeq_f32(V); + float32x4_t S = vrecpsq_f32(Reciprocal, V); + Reciprocal = vmulq_f32(S, Reciprocal); + S = vrecpsq_f32(Reciprocal, V); + return vmulq_f32(S, Reciprocal); +#endif +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_div_ps(g_XMOne, V); +#endif +} + +//------------------------------------------------------------------------------ +// Return an estimated square root +inline XMVECTOR XM_CALLCONV XMVectorSqrtEst(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + sqrtf(V.vector4_f32[0]), + sqrtf(V.vector4_f32[1]), + sqrtf(V.vector4_f32[2]), + sqrtf(V.vector4_f32[3]) + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // 1 iteration of Newton-Raphson refinment of sqrt + float32x4_t S0 = vrsqrteq_f32(V); + float32x4_t P0 = vmulq_f32(V, S0); + float32x4_t R0 = vrsqrtsq_f32(P0, S0); + float32x4_t S1 = vmulq_f32(S0, R0); + + XMVECTOR VEqualsInfinity = XMVectorEqualInt(V, g_XMInfinity.v); + XMVECTOR VEqualsZero = XMVectorEqual(V, vdupq_n_f32(0)); + XMVECTOR Result = vmulq_f32(V, S1); + XMVECTOR Select = XMVectorEqualInt(VEqualsInfinity, VEqualsZero); + return XMVectorSelect(V, Result, Select); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_sqrt_ps(V); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorSqrt(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + sqrtf(V.vector4_f32[0]), + sqrtf(V.vector4_f32[1]), + sqrtf(V.vector4_f32[2]), + sqrtf(V.vector4_f32[3]) + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // 3 iterations of Newton-Raphson refinment of sqrt + float32x4_t S0 = vrsqrteq_f32(V); + float32x4_t P0 = vmulq_f32(V, S0); + float32x4_t R0 = vrsqrtsq_f32(P0, S0); + float32x4_t S1 = vmulq_f32(S0, R0); + float32x4_t P1 = vmulq_f32(V, S1); + float32x4_t R1 = vrsqrtsq_f32(P1, S1); + 
float32x4_t S2 = vmulq_f32(S1, R1); + float32x4_t P2 = vmulq_f32(V, S2); + float32x4_t R2 = vrsqrtsq_f32(P2, S2); + float32x4_t S3 = vmulq_f32(S2, R2); + + XMVECTOR VEqualsInfinity = XMVectorEqualInt(V, g_XMInfinity.v); + XMVECTOR VEqualsZero = XMVectorEqual(V, vdupq_n_f32(0)); + XMVECTOR Result = vmulq_f32(V, S3); + XMVECTOR Select = XMVectorEqualInt(VEqualsInfinity, VEqualsZero); + return XMVectorSelect(V, Result, Select); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_sqrt_ps(V); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorReciprocalSqrtEst(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + 1.f / sqrtf(V.vector4_f32[0]), + 1.f / sqrtf(V.vector4_f32[1]), + 1.f / sqrtf(V.vector4_f32[2]), + 1.f / sqrtf(V.vector4_f32[3]) + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vrsqrteq_f32(V); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_rsqrt_ps(V); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorReciprocalSqrt(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + 1.f / sqrtf(V.vector4_f32[0]), + 1.f / sqrtf(V.vector4_f32[1]), + 1.f / sqrtf(V.vector4_f32[2]), + 1.f / sqrtf(V.vector4_f32[3]) + } } }; + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // 2 iterations of Newton-Raphson refinement of reciprocal + float32x4_t S0 = vrsqrteq_f32(V); + + float32x4_t P0 = vmulq_f32(V, S0); + float32x4_t R0 = vrsqrtsq_f32(P0, S0); + + float32x4_t S1 = vmulq_f32(S0, R0); + float32x4_t P1 = vmulq_f32(V, S1); + float32x4_t R1 = vrsqrtsq_f32(P1, S1); + + return vmulq_f32(S1, R1); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = _mm_sqrt_ps(V); + vResult = _mm_div_ps(g_XMOne, vResult); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorExp2(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + exp2f(V.vector4_f32[0]), + exp2f(V.vector4_f32[1]), + exp2f(V.vector4_f32[2]), + exp2f(V.vector4_f32[3]) + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + int32x4_t itrunc = vcvtq_s32_f32(V); + float32x4_t ftrunc = vcvtq_f32_s32(itrunc); + float32x4_t y = vsubq_f32(V, ftrunc); + + float32x4_t poly = vmlaq_f32(g_XMExpEst6, g_XMExpEst7, y); + poly = vmlaq_f32(g_XMExpEst5, poly, y); + poly = vmlaq_f32(g_XMExpEst4, poly, y); + poly = vmlaq_f32(g_XMExpEst3, poly, y); + poly = vmlaq_f32(g_XMExpEst2, poly, y); + poly = vmlaq_f32(g_XMExpEst1, poly, y); + poly = vmlaq_f32(g_XMOne, poly, y); + + int32x4_t biased = vaddq_s32(itrunc, g_XMExponentBias); + biased = vshlq_n_s32(biased, 23); + float32x4_t result0 = XMVectorDivide(vreinterpretq_f32_s32(biased), poly); + + biased = vaddq_s32(itrunc, g_XM253); + biased = vshlq_n_s32(biased, 23); + float32x4_t result1 = XMVectorDivide(vreinterpretq_f32_s32(biased), poly); + result1 = vmulq_f32(g_XMMinNormal.v, result1); + + // Use selection to handle the cases + // if (V is NaN) -> QNaN; + // else if (V sign bit set) + // if (V > -150) + // if (V.exponent < -126) -> result1 + // else -> result0 + // else -> +0 + // else + // if (V < 128) -> result0 + // else -> +inf + + uint32x4_t comp = vcltq_s32(vreinterpretq_s32_f32(V), g_XMBin128); + float32x4_t result2 = vbslq_f32(comp, result0, g_XMInfinity); + + comp = vcltq_s32(itrunc, 
g_XMSubnormalExponent); + float32x4_t result3 = vbslq_f32(comp, result1, result0); + + comp = vcltq_s32(vreinterpretq_s32_f32(V), g_XMBinNeg150); + float32x4_t result4 = vbslq_f32(comp, result3, g_XMZero); + + int32x4_t sign = vandq_s32(vreinterpretq_s32_f32(V), g_XMNegativeZero); + comp = vceqq_s32(sign, g_XMNegativeZero); + float32x4_t result5 = vbslq_f32(comp, result4, result2); + + int32x4_t t0 = vandq_s32(vreinterpretq_s32_f32(V), g_XMQNaNTest); + int32x4_t t1 = vandq_s32(vreinterpretq_s32_f32(V), g_XMInfinity); + t0 = vreinterpretq_s32_u32(vceqq_s32(t0, g_XMZero)); + t1 = vreinterpretq_s32_u32(vceqq_s32(t1, g_XMInfinity)); + int32x4_t isNaN = vbicq_s32(t1, t0); + + float32x4_t vResult = vbslq_f32(vreinterpretq_u32_s32(isNaN), g_XMQNaN, result5); + return vResult; +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_exp2_ps(V); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + __m128i itrunc = _mm_cvttps_epi32(V); + __m128 ftrunc = _mm_cvtepi32_ps(itrunc); + __m128 y = _mm_sub_ps(V, ftrunc); + + __m128 poly = XM_FMADD_PS(g_XMExpEst7, y, g_XMExpEst6); + poly = XM_FMADD_PS(poly, y, g_XMExpEst5); + poly = XM_FMADD_PS(poly, y, g_XMExpEst4); + poly = XM_FMADD_PS(poly, y, g_XMExpEst3); + poly = XM_FMADD_PS(poly, y, g_XMExpEst2); + poly = XM_FMADD_PS(poly, y, g_XMExpEst1); + poly = XM_FMADD_PS(poly, y, g_XMOne); + + __m128i biased = _mm_add_epi32(itrunc, g_XMExponentBias); + biased = _mm_slli_epi32(biased, 23); + __m128 result0 = _mm_div_ps(_mm_castsi128_ps(biased), poly); + + biased = _mm_add_epi32(itrunc, g_XM253); + biased = _mm_slli_epi32(biased, 23); + __m128 result1 = _mm_div_ps(_mm_castsi128_ps(biased), poly); + result1 = _mm_mul_ps(g_XMMinNormal.v, result1); + + // Use selection to handle the cases + // if (V is NaN) -> QNaN; + // else if (V sign bit set) + // if (V > -150) + // if (V.exponent < -126) -> result1 + // else -> result0 + // else -> +0 + // else + // if (V < 128) -> result0 + // else -> +inf + + __m128i comp = _mm_cmplt_epi32(_mm_castps_si128(V), g_XMBin128); + __m128i select0 = _mm_and_si128(comp, _mm_castps_si128(result0)); + __m128i select1 = _mm_andnot_si128(comp, g_XMInfinity); + __m128i result2 = _mm_or_si128(select0, select1); + + comp = _mm_cmplt_epi32(itrunc, g_XMSubnormalExponent); + select1 = _mm_and_si128(comp, _mm_castps_si128(result1)); + select0 = _mm_andnot_si128(comp, _mm_castps_si128(result0)); + __m128i result3 = _mm_or_si128(select0, select1); + + comp = _mm_cmplt_epi32(_mm_castps_si128(V), g_XMBinNeg150); + select0 = _mm_and_si128(comp, result3); + select1 = _mm_andnot_si128(comp, g_XMZero); + __m128i result4 = _mm_or_si128(select0, select1); + + __m128i sign = _mm_and_si128(_mm_castps_si128(V), g_XMNegativeZero); + comp = _mm_cmpeq_epi32(sign, g_XMNegativeZero); + select0 = _mm_and_si128(comp, result4); + select1 = _mm_andnot_si128(comp, result2); + __m128i result5 = _mm_or_si128(select0, select1); + + __m128i t0 = _mm_and_si128(_mm_castps_si128(V), g_XMQNaNTest); + __m128i t1 = _mm_and_si128(_mm_castps_si128(V), g_XMInfinity); + t0 = _mm_cmpeq_epi32(t0, g_XMZero); + t1 = _mm_cmpeq_epi32(t1, g_XMInfinity); + __m128i isNaN = _mm_andnot_si128(t0, t1); + + select0 = _mm_and_si128(isNaN, g_XMQNaN); + select1 = _mm_andnot_si128(isNaN, result5); + __m128i vResult = _mm_or_si128(select0, select1); + + return _mm_castsi128_ps(vResult); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorExp10(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + 
XMVECTORF32 Result = { { { + powf(10.0f, V.vector4_f32[0]), + powf(10.0f, V.vector4_f32[1]), + powf(10.0f, V.vector4_f32[2]), + powf(10.0f, V.vector4_f32[3]) + } } }; + return Result.v; + +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_exp10_ps(V); + return Result; +#else + // exp10(V) = exp2(vin*log2(10)) + XMVECTOR Vten = XMVectorMultiply(g_XMLg10, V); + return XMVectorExp2(Vten); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorExpE(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORF32 Result = { { { + expf(V.vector4_f32[0]), + expf(V.vector4_f32[1]), + expf(V.vector4_f32[2]), + expf(V.vector4_f32[3]) + } } }; + return Result.v; + +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_exp_ps(V); + return Result; +#else + // expE(V) = exp2(vin*log2(e)) + XMVECTOR Ve = XMVectorMultiply(g_XMLgE, V); + return XMVectorExp2(Ve); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorExp(FXMVECTOR V) noexcept +{ + return XMVectorExp2(V); +} + +//------------------------------------------------------------------------------ + +#if defined(_XM_SSE_INTRINSICS_) + +namespace Internal +{ + inline __m128i multi_sll_epi32(__m128i value, __m128i count) noexcept + { + __m128i v = _mm_shuffle_epi32(value, _MM_SHUFFLE(0, 0, 0, 0)); + __m128i c = _mm_shuffle_epi32(count, _MM_SHUFFLE(0, 0, 0, 0)); + c = _mm_and_si128(c, g_XMMaskX); + __m128i r0 = _mm_sll_epi32(v, c); + + v = _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 1, 1, 1)); + c = _mm_shuffle_epi32(count, _MM_SHUFFLE(1, 1, 1, 1)); + c = _mm_and_si128(c, g_XMMaskX); + __m128i r1 = _mm_sll_epi32(v, c); + + v = _mm_shuffle_epi32(value, _MM_SHUFFLE(2, 2, 2, 2)); + c = _mm_shuffle_epi32(count, _MM_SHUFFLE(2, 2, 2, 2)); + c = _mm_and_si128(c, g_XMMaskX); + __m128i r2 = _mm_sll_epi32(v, c); + + v = _mm_shuffle_epi32(value, _MM_SHUFFLE(3, 3, 3, 3)); + c = _mm_shuffle_epi32(count, _MM_SHUFFLE(3, 3, 3, 3)); + c = _mm_and_si128(c, g_XMMaskX); + __m128i r3 = _mm_sll_epi32(v, c); + + // (r0,r0,r1,r1) + __m128 r01 = _mm_shuffle_ps(_mm_castsi128_ps(r0), _mm_castsi128_ps(r1), _MM_SHUFFLE(0, 0, 0, 0)); + // (r2,r2,r3,r3) + __m128 r23 = _mm_shuffle_ps(_mm_castsi128_ps(r2), _mm_castsi128_ps(r3), _MM_SHUFFLE(0, 0, 0, 0)); + // (r0,r1,r2,r3) + __m128 result = _mm_shuffle_ps(r01, r23, _MM_SHUFFLE(2, 0, 2, 0)); + return _mm_castps_si128(result); + } + + inline __m128i multi_srl_epi32(__m128i value, __m128i count) noexcept + { + __m128i v = _mm_shuffle_epi32(value, _MM_SHUFFLE(0, 0, 0, 0)); + __m128i c = _mm_shuffle_epi32(count, _MM_SHUFFLE(0, 0, 0, 0)); + c = _mm_and_si128(c, g_XMMaskX); + __m128i r0 = _mm_srl_epi32(v, c); + + v = _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 1, 1, 1)); + c = _mm_shuffle_epi32(count, _MM_SHUFFLE(1, 1, 1, 1)); + c = _mm_and_si128(c, g_XMMaskX); + __m128i r1 = _mm_srl_epi32(v, c); + + v = _mm_shuffle_epi32(value, _MM_SHUFFLE(2, 2, 2, 2)); + c = _mm_shuffle_epi32(count, _MM_SHUFFLE(2, 2, 2, 2)); + c = _mm_and_si128(c, g_XMMaskX); + __m128i r2 = _mm_srl_epi32(v, c); + + v = _mm_shuffle_epi32(value, _MM_SHUFFLE(3, 3, 3, 3)); + c = _mm_shuffle_epi32(count, _MM_SHUFFLE(3, 3, 3, 3)); + c = _mm_and_si128(c, g_XMMaskX); + __m128i r3 = _mm_srl_epi32(v, c); + + // (r0,r0,r1,r1) + __m128 r01 = _mm_shuffle_ps(_mm_castsi128_ps(r0), _mm_castsi128_ps(r1), _MM_SHUFFLE(0, 0, 0, 0)); + // (r2,r2,r3,r3) + __m128 r23 = _mm_shuffle_ps(_mm_castsi128_ps(r2), 
_mm_castsi128_ps(r3), _MM_SHUFFLE(0, 0, 0, 0)); + // (r0,r1,r2,r3) + __m128 result = _mm_shuffle_ps(r01, r23, _MM_SHUFFLE(2, 0, 2, 0)); + return _mm_castps_si128(result); + } + + inline __m128i GetLeadingBit(const __m128i value) noexcept + { + static const XMVECTORI32 g_XM0000FFFF = { { { 0x0000FFFF, 0x0000FFFF, 0x0000FFFF, 0x0000FFFF } } }; + static const XMVECTORI32 g_XM000000FF = { { { 0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF } } }; + static const XMVECTORI32 g_XM0000000F = { { { 0x0000000F, 0x0000000F, 0x0000000F, 0x0000000F } } }; + static const XMVECTORI32 g_XM00000003 = { { { 0x00000003, 0x00000003, 0x00000003, 0x00000003 } } }; + + __m128i v = value, r, c, b, s; + + c = _mm_cmpgt_epi32(v, g_XM0000FFFF); // c = (v > 0xFFFF) + b = _mm_srli_epi32(c, 31); // b = (c ? 1 : 0) + r = _mm_slli_epi32(b, 4); // r = (b << 4) + v = multi_srl_epi32(v, r); // v = (v >> r) + + c = _mm_cmpgt_epi32(v, g_XM000000FF); // c = (v > 0xFF) + b = _mm_srli_epi32(c, 31); // b = (c ? 1 : 0) + s = _mm_slli_epi32(b, 3); // s = (b << 3) + v = multi_srl_epi32(v, s); // v = (v >> s) + r = _mm_or_si128(r, s); // r = (r | s) + + c = _mm_cmpgt_epi32(v, g_XM0000000F); // c = (v > 0xF) + b = _mm_srli_epi32(c, 31); // b = (c ? 1 : 0) + s = _mm_slli_epi32(b, 2); // s = (b << 2) + v = multi_srl_epi32(v, s); // v = (v >> s) + r = _mm_or_si128(r, s); // r = (r | s) + + c = _mm_cmpgt_epi32(v, g_XM00000003); // c = (v > 0x3) + b = _mm_srli_epi32(c, 31); // b = (c ? 1 : 0) + s = _mm_slli_epi32(b, 1); // s = (b << 1) + v = multi_srl_epi32(v, s); // v = (v >> s) + r = _mm_or_si128(r, s); // r = (r | s) + + s = _mm_srli_epi32(v, 1); + r = _mm_or_si128(r, s); + return r; + } +} // namespace Internal + +#endif // _XM_SSE_INTRINSICS_ + +#if defined(_XM_ARM_NEON_INTRINSICS_) + +namespace Internal +{ + inline int32x4_t GetLeadingBit(const int32x4_t value) noexcept + { + static const XMVECTORI32 g_XM0000FFFF = { { { 0x0000FFFF, 0x0000FFFF, 0x0000FFFF, 0x0000FFFF } } }; + static const XMVECTORI32 g_XM000000FF = { { { 0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF } } }; + static const XMVECTORI32 g_XM0000000F = { { { 0x0000000F, 0x0000000F, 0x0000000F, 0x0000000F } } }; + static const XMVECTORI32 g_XM00000003 = { { { 0x00000003, 0x00000003, 0x00000003, 0x00000003 } } }; + + uint32x4_t c = vcgtq_s32(value, g_XM0000FFFF); // c = (v > 0xFFFF) + int32x4_t b = vshrq_n_s32(vreinterpretq_s32_u32(c), 31); // b = (c ? 1 : 0) + int32x4_t r = vshlq_n_s32(b, 4); // r = (b << 4) + r = vnegq_s32(r); + int32x4_t v = vshlq_s32(value, r); // v = (v >> r) + + c = vcgtq_s32(v, g_XM000000FF); // c = (v > 0xFF) + b = vshrq_n_s32(vreinterpretq_s32_u32(c), 31); // b = (c ? 1 : 0) + int32x4_t s = vshlq_n_s32(b, 3); // s = (b << 3) + s = vnegq_s32(s); + v = vshlq_s32(v, s); // v = (v >> s) + r = vorrq_s32(r, s); // r = (r | s) + + c = vcgtq_s32(v, g_XM0000000F); // c = (v > 0xF) + b = vshrq_n_s32(vreinterpretq_s32_u32(c), 31); // b = (c ? 1 : 0) + s = vshlq_n_s32(b, 2); // s = (b << 2) + s = vnegq_s32(s); + v = vshlq_s32(v, s); // v = (v >> s) + r = vorrq_s32(r, s); // r = (r | s) + + c = vcgtq_s32(v, g_XM00000003); // c = (v > 0x3) + b = vshrq_n_s32(vreinterpretq_s32_u32(c), 31); // b = (c ? 
1 : 0) + s = vshlq_n_s32(b, 1); // s = (b << 1) + s = vnegq_s32(s); + v = vshlq_s32(v, s); // v = (v >> s) + r = vorrq_s32(r, s); // r = (r | s) + + s = vshrq_n_s32(v, 1); + r = vorrq_s32(r, s); + return r; + } + +} // namespace Internal + +#endif + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorLog2(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + log2f(V.vector4_f32[0]), + log2f(V.vector4_f32[1]), + log2f(V.vector4_f32[2]), + log2f(V.vector4_f32[3]) + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + int32x4_t rawBiased = vandq_s32(vreinterpretq_s32_f32(V), g_XMInfinity); + int32x4_t trailing = vandq_s32(vreinterpretq_s32_f32(V), g_XMQNaNTest); + uint32x4_t isExponentZero = vceqq_s32(vreinterpretq_s32_f32(g_XMZero), rawBiased); + + // Compute exponent and significand for normals. + int32x4_t biased = vshrq_n_s32(rawBiased, 23); + int32x4_t exponentNor = vsubq_s32(biased, g_XMExponentBias); + int32x4_t trailingNor = trailing; + + // Compute exponent and significand for subnormals. + int32x4_t leading = Internal::GetLeadingBit(trailing); + int32x4_t shift = vsubq_s32(g_XMNumTrailing, leading); + int32x4_t exponentSub = vsubq_s32(g_XMSubnormalExponent, shift); + int32x4_t trailingSub = vshlq_s32(trailing, shift); + trailingSub = vandq_s32(trailingSub, g_XMQNaNTest); + int32x4_t e = vbslq_s32(isExponentZero, exponentSub, exponentNor); + int32x4_t t = vbslq_s32(isExponentZero, trailingSub, trailingNor); + + // Compute the approximation. + int32x4_t tmp = vorrq_s32(vreinterpretq_s32_f32(g_XMOne), t); + float32x4_t y = vsubq_f32(vreinterpretq_f32_s32(tmp), g_XMOne); + + float32x4_t log2 = vmlaq_f32(g_XMLogEst6, g_XMLogEst7, y); + log2 = vmlaq_f32(g_XMLogEst5, log2, y); + log2 = vmlaq_f32(g_XMLogEst4, log2, y); + log2 = vmlaq_f32(g_XMLogEst3, log2, y); + log2 = vmlaq_f32(g_XMLogEst2, log2, y); + log2 = vmlaq_f32(g_XMLogEst1, log2, y); + log2 = vmlaq_f32(g_XMLogEst0, log2, y); + log2 = vmlaq_f32(vcvtq_f32_s32(e), log2, y); + + // if (x is NaN) -> QNaN + // else if (V is positive) + // if (V is infinite) -> +inf + // else -> log2(V) + // else + // if (V is zero) -> -inf + // else -> -QNaN + + uint32x4_t isInfinite = vandq_u32(vreinterpretq_u32_f32(V), g_XMAbsMask); + isInfinite = vceqq_u32(isInfinite, g_XMInfinity); + + uint32x4_t isGreaterZero = vcgtq_f32(V, g_XMZero); + uint32x4_t isNotFinite = vcgtq_f32(V, g_XMInfinity); + uint32x4_t isPositive = vbicq_u32(isGreaterZero, isNotFinite); + + uint32x4_t isZero = vandq_u32(vreinterpretq_u32_f32(V), g_XMAbsMask); + isZero = vceqq_u32(isZero, g_XMZero); + + uint32x4_t t0 = vandq_u32(vreinterpretq_u32_f32(V), g_XMQNaNTest); + uint32x4_t t1 = vandq_u32(vreinterpretq_u32_f32(V), g_XMInfinity); + t0 = vceqq_u32(t0, g_XMZero); + t1 = vceqq_u32(t1, g_XMInfinity); + uint32x4_t isNaN = vbicq_u32(t1, t0); + + float32x4_t result = vbslq_f32(isInfinite, g_XMInfinity, log2); + float32x4_t tmp2 = vbslq_f32(isZero, g_XMNegInfinity, g_XMNegQNaN); + result = vbslq_f32(isPositive, result, tmp2); + result = vbslq_f32(isNaN, g_XMQNaN, result); + return result; +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_log2_ps(V); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + __m128i rawBiased = _mm_and_si128(_mm_castps_si128(V), g_XMInfinity); + __m128i trailing = _mm_and_si128(_mm_castps_si128(V), g_XMQNaNTest); + __m128i isExponentZero = _mm_cmpeq_epi32(g_XMZero, rawBiased); + + // Compute exponent and significand for normals. 
+ __m128i biased = _mm_srli_epi32(rawBiased, 23); + __m128i exponentNor = _mm_sub_epi32(biased, g_XMExponentBias); + __m128i trailingNor = trailing; + + // Compute exponent and significand for subnormals. + __m128i leading = Internal::GetLeadingBit(trailing); + __m128i shift = _mm_sub_epi32(g_XMNumTrailing, leading); + __m128i exponentSub = _mm_sub_epi32(g_XMSubnormalExponent, shift); + __m128i trailingSub = Internal::multi_sll_epi32(trailing, shift); + trailingSub = _mm_and_si128(trailingSub, g_XMQNaNTest); + + __m128i select0 = _mm_and_si128(isExponentZero, exponentSub); + __m128i select1 = _mm_andnot_si128(isExponentZero, exponentNor); + __m128i e = _mm_or_si128(select0, select1); + + select0 = _mm_and_si128(isExponentZero, trailingSub); + select1 = _mm_andnot_si128(isExponentZero, trailingNor); + __m128i t = _mm_or_si128(select0, select1); + + // Compute the approximation. + __m128i tmp = _mm_or_si128(g_XMOne, t); + __m128 y = _mm_sub_ps(_mm_castsi128_ps(tmp), g_XMOne); + + __m128 log2 = XM_FMADD_PS(g_XMLogEst7, y, g_XMLogEst6); + log2 = XM_FMADD_PS(log2, y, g_XMLogEst5); + log2 = XM_FMADD_PS(log2, y, g_XMLogEst4); + log2 = XM_FMADD_PS(log2, y, g_XMLogEst3); + log2 = XM_FMADD_PS(log2, y, g_XMLogEst2); + log2 = XM_FMADD_PS(log2, y, g_XMLogEst1); + log2 = XM_FMADD_PS(log2, y, g_XMLogEst0); + log2 = XM_FMADD_PS(log2, y, _mm_cvtepi32_ps(e)); + + // if (x is NaN) -> QNaN + // else if (V is positive) + // if (V is infinite) -> +inf + // else -> log2(V) + // else + // if (V is zero) -> -inf + // else -> -QNaN + + __m128i isInfinite = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask); + isInfinite = _mm_cmpeq_epi32(isInfinite, g_XMInfinity); + + __m128i isGreaterZero = _mm_cmpgt_epi32(_mm_castps_si128(V), g_XMZero); + __m128i isNotFinite = _mm_cmpgt_epi32(_mm_castps_si128(V), g_XMInfinity); + __m128i isPositive = _mm_andnot_si128(isNotFinite, isGreaterZero); + + __m128i isZero = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask); + isZero = _mm_cmpeq_epi32(isZero, g_XMZero); + + __m128i t0 = _mm_and_si128(_mm_castps_si128(V), g_XMQNaNTest); + __m128i t1 = _mm_and_si128(_mm_castps_si128(V), g_XMInfinity); + t0 = _mm_cmpeq_epi32(t0, g_XMZero); + t1 = _mm_cmpeq_epi32(t1, g_XMInfinity); + __m128i isNaN = _mm_andnot_si128(t0, t1); + + select0 = _mm_and_si128(isInfinite, g_XMInfinity); + select1 = _mm_andnot_si128(isInfinite, _mm_castps_si128(log2)); + __m128i result = _mm_or_si128(select0, select1); + + select0 = _mm_and_si128(isZero, g_XMNegInfinity); + select1 = _mm_andnot_si128(isZero, g_XMNegQNaN); + tmp = _mm_or_si128(select0, select1); + + select0 = _mm_and_si128(isPositive, result); + select1 = _mm_andnot_si128(isPositive, tmp); + result = _mm_or_si128(select0, select1); + + select0 = _mm_and_si128(isNaN, g_XMQNaN); + select1 = _mm_andnot_si128(isNaN, result); + result = _mm_or_si128(select0, select1); + + return _mm_castsi128_ps(result); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorLog10(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORF32 Result = { { { + log10f(V.vector4_f32[0]), + log10f(V.vector4_f32[1]), + log10f(V.vector4_f32[2]), + log10f(V.vector4_f32[3]) + } } }; + return Result.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + int32x4_t rawBiased = vandq_s32(vreinterpretq_s32_f32(V), g_XMInfinity); + int32x4_t trailing = vandq_s32(vreinterpretq_s32_f32(V), g_XMQNaNTest); + uint32x4_t isExponentZero = vceqq_s32(g_XMZero, rawBiased); + + // Compute exponent and significand for 
normals. + int32x4_t biased = vshrq_n_s32(rawBiased, 23); + int32x4_t exponentNor = vsubq_s32(biased, g_XMExponentBias); + int32x4_t trailingNor = trailing; + + // Compute exponent and significand for subnormals. + int32x4_t leading = Internal::GetLeadingBit(trailing); + int32x4_t shift = vsubq_s32(g_XMNumTrailing, leading); + int32x4_t exponentSub = vsubq_s32(g_XMSubnormalExponent, shift); + int32x4_t trailingSub = vshlq_s32(trailing, shift); + trailingSub = vandq_s32(trailingSub, g_XMQNaNTest); + int32x4_t e = vbslq_s32(isExponentZero, exponentSub, exponentNor); + int32x4_t t = vbslq_s32(isExponentZero, trailingSub, trailingNor); + + // Compute the approximation. + int32x4_t tmp = vorrq_s32(g_XMOne, t); + float32x4_t y = vsubq_f32(vreinterpretq_f32_s32(tmp), g_XMOne); + + float32x4_t log2 = vmlaq_f32(g_XMLogEst6, g_XMLogEst7, y); + log2 = vmlaq_f32(g_XMLogEst5, log2, y); + log2 = vmlaq_f32(g_XMLogEst4, log2, y); + log2 = vmlaq_f32(g_XMLogEst3, log2, y); + log2 = vmlaq_f32(g_XMLogEst2, log2, y); + log2 = vmlaq_f32(g_XMLogEst1, log2, y); + log2 = vmlaq_f32(g_XMLogEst0, log2, y); + log2 = vmlaq_f32(vcvtq_f32_s32(e), log2, y); + + log2 = vmulq_f32(g_XMInvLg10, log2); + + // if (x is NaN) -> QNaN + // else if (V is positive) + // if (V is infinite) -> +inf + // else -> log2(V) + // else + // if (V is zero) -> -inf + // else -> -QNaN + + uint32x4_t isInfinite = vandq_u32(vreinterpretq_u32_f32(V), g_XMAbsMask); + isInfinite = vceqq_u32(isInfinite, g_XMInfinity); + + uint32x4_t isGreaterZero = vcgtq_s32(vreinterpretq_s32_f32(V), g_XMZero); + uint32x4_t isNotFinite = vcgtq_s32(vreinterpretq_s32_f32(V), g_XMInfinity); + uint32x4_t isPositive = vbicq_u32(isGreaterZero, isNotFinite); + + uint32x4_t isZero = vandq_u32(vreinterpretq_u32_f32(V), g_XMAbsMask); + isZero = vceqq_u32(isZero, g_XMZero); + + uint32x4_t t0 = vandq_u32(vreinterpretq_u32_f32(V), g_XMQNaNTest); + uint32x4_t t1 = vandq_u32(vreinterpretq_u32_f32(V), g_XMInfinity); + t0 = vceqq_u32(t0, g_XMZero); + t1 = vceqq_u32(t1, g_XMInfinity); + uint32x4_t isNaN = vbicq_u32(t1, t0); + + float32x4_t result = vbslq_f32(isInfinite, g_XMInfinity, log2); + float32x4_t tmp2 = vbslq_f32(isZero, g_XMNegInfinity, g_XMNegQNaN); + result = vbslq_f32(isPositive, result, tmp2); + result = vbslq_f32(isNaN, g_XMQNaN, result); + return result; +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_log10_ps(V); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + __m128i rawBiased = _mm_and_si128(_mm_castps_si128(V), g_XMInfinity); + __m128i trailing = _mm_and_si128(_mm_castps_si128(V), g_XMQNaNTest); + __m128i isExponentZero = _mm_cmpeq_epi32(g_XMZero, rawBiased); + + // Compute exponent and significand for normals. + __m128i biased = _mm_srli_epi32(rawBiased, 23); + __m128i exponentNor = _mm_sub_epi32(biased, g_XMExponentBias); + __m128i trailingNor = trailing; + + // Compute exponent and significand for subnormals. 
+ __m128i leading = Internal::GetLeadingBit(trailing); + __m128i shift = _mm_sub_epi32(g_XMNumTrailing, leading); + __m128i exponentSub = _mm_sub_epi32(g_XMSubnormalExponent, shift); + __m128i trailingSub = Internal::multi_sll_epi32(trailing, shift); + trailingSub = _mm_and_si128(trailingSub, g_XMQNaNTest); + + __m128i select0 = _mm_and_si128(isExponentZero, exponentSub); + __m128i select1 = _mm_andnot_si128(isExponentZero, exponentNor); + __m128i e = _mm_or_si128(select0, select1); + + select0 = _mm_and_si128(isExponentZero, trailingSub); + select1 = _mm_andnot_si128(isExponentZero, trailingNor); + __m128i t = _mm_or_si128(select0, select1); + + // Compute the approximation. + __m128i tmp = _mm_or_si128(g_XMOne, t); + __m128 y = _mm_sub_ps(_mm_castsi128_ps(tmp), g_XMOne); + + __m128 log2 = XM_FMADD_PS(g_XMLogEst7, y, g_XMLogEst6); + log2 = XM_FMADD_PS(log2, y, g_XMLogEst5); + log2 = XM_FMADD_PS(log2, y, g_XMLogEst4); + log2 = XM_FMADD_PS(log2, y, g_XMLogEst3); + log2 = XM_FMADD_PS(log2, y, g_XMLogEst2); + log2 = XM_FMADD_PS(log2, y, g_XMLogEst1); + log2 = XM_FMADD_PS(log2, y, g_XMLogEst0); + log2 = XM_FMADD_PS(log2, y, _mm_cvtepi32_ps(e)); + + log2 = _mm_mul_ps(g_XMInvLg10, log2); + + // if (x is NaN) -> QNaN + // else if (V is positive) + // if (V is infinite) -> +inf + // else -> log2(V) + // else + // if (V is zero) -> -inf + // else -> -QNaN + + __m128i isInfinite = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask); + isInfinite = _mm_cmpeq_epi32(isInfinite, g_XMInfinity); + + __m128i isGreaterZero = _mm_cmpgt_epi32(_mm_castps_si128(V), g_XMZero); + __m128i isNotFinite = _mm_cmpgt_epi32(_mm_castps_si128(V), g_XMInfinity); + __m128i isPositive = _mm_andnot_si128(isNotFinite, isGreaterZero); + + __m128i isZero = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask); + isZero = _mm_cmpeq_epi32(isZero, g_XMZero); + + __m128i t0 = _mm_and_si128(_mm_castps_si128(V), g_XMQNaNTest); + __m128i t1 = _mm_and_si128(_mm_castps_si128(V), g_XMInfinity); + t0 = _mm_cmpeq_epi32(t0, g_XMZero); + t1 = _mm_cmpeq_epi32(t1, g_XMInfinity); + __m128i isNaN = _mm_andnot_si128(t0, t1); + + select0 = _mm_and_si128(isInfinite, g_XMInfinity); + select1 = _mm_andnot_si128(isInfinite, _mm_castps_si128(log2)); + __m128i result = _mm_or_si128(select0, select1); + + select0 = _mm_and_si128(isZero, g_XMNegInfinity); + select1 = _mm_andnot_si128(isZero, g_XMNegQNaN); + tmp = _mm_or_si128(select0, select1); + + select0 = _mm_and_si128(isPositive, result); + select1 = _mm_andnot_si128(isPositive, tmp); + result = _mm_or_si128(select0, select1); + + select0 = _mm_and_si128(isNaN, g_XMQNaN); + select1 = _mm_andnot_si128(isNaN, result); + result = _mm_or_si128(select0, select1); + + return _mm_castsi128_ps(result); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorLogE(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORF32 Result = { { { + logf(V.vector4_f32[0]), + logf(V.vector4_f32[1]), + logf(V.vector4_f32[2]), + logf(V.vector4_f32[3]) + } } }; + return Result.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + int32x4_t rawBiased = vandq_s32(vreinterpretq_s32_f32(V), g_XMInfinity); + int32x4_t trailing = vandq_s32(vreinterpretq_s32_f32(V), g_XMQNaNTest); + uint32x4_t isExponentZero = vceqq_s32(g_XMZero, rawBiased); + + // Compute exponent and significand for normals. 
+ int32x4_t biased = vshrq_n_s32(rawBiased, 23); + int32x4_t exponentNor = vsubq_s32(biased, g_XMExponentBias); + int32x4_t trailingNor = trailing; + + // Compute exponent and significand for subnormals. + int32x4_t leading = Internal::GetLeadingBit(trailing); + int32x4_t shift = vsubq_s32(g_XMNumTrailing, leading); + int32x4_t exponentSub = vsubq_s32(g_XMSubnormalExponent, shift); + int32x4_t trailingSub = vshlq_s32(trailing, shift); + trailingSub = vandq_s32(trailingSub, g_XMQNaNTest); + int32x4_t e = vbslq_s32(isExponentZero, exponentSub, exponentNor); + int32x4_t t = vbslq_s32(isExponentZero, trailingSub, trailingNor); + + // Compute the approximation. + int32x4_t tmp = vorrq_s32(g_XMOne, t); + float32x4_t y = vsubq_f32(vreinterpretq_f32_s32(tmp), g_XMOne); + + float32x4_t log2 = vmlaq_f32(g_XMLogEst6, g_XMLogEst7, y); + log2 = vmlaq_f32(g_XMLogEst5, log2, y); + log2 = vmlaq_f32(g_XMLogEst4, log2, y); + log2 = vmlaq_f32(g_XMLogEst3, log2, y); + log2 = vmlaq_f32(g_XMLogEst2, log2, y); + log2 = vmlaq_f32(g_XMLogEst1, log2, y); + log2 = vmlaq_f32(g_XMLogEst0, log2, y); + log2 = vmlaq_f32(vcvtq_f32_s32(e), log2, y); + + log2 = vmulq_f32(g_XMInvLgE, log2); + + // if (x is NaN) -> QNaN + // else if (V is positive) + // if (V is infinite) -> +inf + // else -> log2(V) + // else + // if (V is zero) -> -inf + // else -> -QNaN + + uint32x4_t isInfinite = vandq_u32(vreinterpretq_u32_f32(V), g_XMAbsMask); + isInfinite = vceqq_u32(isInfinite, g_XMInfinity); + + uint32x4_t isGreaterZero = vcgtq_s32(vreinterpretq_s32_f32(V), g_XMZero); + uint32x4_t isNotFinite = vcgtq_s32(vreinterpretq_s32_f32(V), g_XMInfinity); + uint32x4_t isPositive = vbicq_u32(isGreaterZero, isNotFinite); + + uint32x4_t isZero = vandq_u32(vreinterpretq_u32_f32(V), g_XMAbsMask); + isZero = vceqq_u32(isZero, g_XMZero); + + uint32x4_t t0 = vandq_u32(vreinterpretq_u32_f32(V), g_XMQNaNTest); + uint32x4_t t1 = vandq_u32(vreinterpretq_u32_f32(V), g_XMInfinity); + t0 = vceqq_u32(t0, g_XMZero); + t1 = vceqq_u32(t1, g_XMInfinity); + uint32x4_t isNaN = vbicq_u32(t1, t0); + + float32x4_t result = vbslq_f32(isInfinite, g_XMInfinity, log2); + float32x4_t tmp2 = vbslq_f32(isZero, g_XMNegInfinity, g_XMNegQNaN); + result = vbslq_f32(isPositive, result, tmp2); + result = vbslq_f32(isNaN, g_XMQNaN, result); + return result; +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_log_ps(V); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + __m128i rawBiased = _mm_and_si128(_mm_castps_si128(V), g_XMInfinity); + __m128i trailing = _mm_and_si128(_mm_castps_si128(V), g_XMQNaNTest); + __m128i isExponentZero = _mm_cmpeq_epi32(g_XMZero, rawBiased); + + // Compute exponent and significand for normals. + __m128i biased = _mm_srli_epi32(rawBiased, 23); + __m128i exponentNor = _mm_sub_epi32(biased, g_XMExponentBias); + __m128i trailingNor = trailing; + + // Compute exponent and significand for subnormals. 
+ __m128i leading = Internal::GetLeadingBit(trailing); + __m128i shift = _mm_sub_epi32(g_XMNumTrailing, leading); + __m128i exponentSub = _mm_sub_epi32(g_XMSubnormalExponent, shift); + __m128i trailingSub = Internal::multi_sll_epi32(trailing, shift); + trailingSub = _mm_and_si128(trailingSub, g_XMQNaNTest); + + __m128i select0 = _mm_and_si128(isExponentZero, exponentSub); + __m128i select1 = _mm_andnot_si128(isExponentZero, exponentNor); + __m128i e = _mm_or_si128(select0, select1); + + select0 = _mm_and_si128(isExponentZero, trailingSub); + select1 = _mm_andnot_si128(isExponentZero, trailingNor); + __m128i t = _mm_or_si128(select0, select1); + + // Compute the approximation. + __m128i tmp = _mm_or_si128(g_XMOne, t); + __m128 y = _mm_sub_ps(_mm_castsi128_ps(tmp), g_XMOne); + + __m128 log2 = XM_FMADD_PS(g_XMLogEst7, y, g_XMLogEst6); + log2 = XM_FMADD_PS(log2, y, g_XMLogEst5); + log2 = XM_FMADD_PS(log2, y, g_XMLogEst4); + log2 = XM_FMADD_PS(log2, y, g_XMLogEst3); + log2 = XM_FMADD_PS(log2, y, g_XMLogEst2); + log2 = XM_FMADD_PS(log2, y, g_XMLogEst1); + log2 = XM_FMADD_PS(log2, y, g_XMLogEst0); + log2 = XM_FMADD_PS(log2, y, _mm_cvtepi32_ps(e)); + + log2 = _mm_mul_ps(g_XMInvLgE, log2); + + // if (x is NaN) -> QNaN + // else if (V is positive) + // if (V is infinite) -> +inf + // else -> log2(V) + // else + // if (V is zero) -> -inf + // else -> -QNaN + + __m128i isInfinite = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask); + isInfinite = _mm_cmpeq_epi32(isInfinite, g_XMInfinity); + + __m128i isGreaterZero = _mm_cmpgt_epi32(_mm_castps_si128(V), g_XMZero); + __m128i isNotFinite = _mm_cmpgt_epi32(_mm_castps_si128(V), g_XMInfinity); + __m128i isPositive = _mm_andnot_si128(isNotFinite, isGreaterZero); + + __m128i isZero = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask); + isZero = _mm_cmpeq_epi32(isZero, g_XMZero); + + __m128i t0 = _mm_and_si128(_mm_castps_si128(V), g_XMQNaNTest); + __m128i t1 = _mm_and_si128(_mm_castps_si128(V), g_XMInfinity); + t0 = _mm_cmpeq_epi32(t0, g_XMZero); + t1 = _mm_cmpeq_epi32(t1, g_XMInfinity); + __m128i isNaN = _mm_andnot_si128(t0, t1); + + select0 = _mm_and_si128(isInfinite, g_XMInfinity); + select1 = _mm_andnot_si128(isInfinite, _mm_castps_si128(log2)); + __m128i result = _mm_or_si128(select0, select1); + + select0 = _mm_and_si128(isZero, g_XMNegInfinity); + select1 = _mm_andnot_si128(isZero, g_XMNegQNaN); + tmp = _mm_or_si128(select0, select1); + + select0 = _mm_and_si128(isPositive, result); + select1 = _mm_andnot_si128(isPositive, tmp); + result = _mm_or_si128(select0, select1); + + select0 = _mm_and_si128(isNaN, g_XMQNaN); + select1 = _mm_andnot_si128(isNaN, result); + result = _mm_or_si128(select0, select1); + + return _mm_castsi128_ps(result); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorLog(FXMVECTOR V) noexcept +{ + return XMVectorLog2(V); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorPow +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORF32 Result = { { { + powf(V1.vector4_f32[0], V2.vector4_f32[0]), + powf(V1.vector4_f32[1], V2.vector4_f32[1]), + powf(V1.vector4_f32[2], V2.vector4_f32[2]), + powf(V1.vector4_f32[3], V2.vector4_f32[3]) + } } }; + return Result.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTORF32 vResult = { { { + powf(vgetq_lane_f32(V1, 0), vgetq_lane_f32(V2, 0)), + powf(vgetq_lane_f32(V1, 1), vgetq_lane_f32(V2, 1)), + 
powf(vgetq_lane_f32(V1, 2), vgetq_lane_f32(V2, 2)), + powf(vgetq_lane_f32(V1, 3), vgetq_lane_f32(V2, 3)) + } } }; + return vResult.v; +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_pow_ps(V1, V2); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + XM_ALIGNED_DATA(16) float a[4]; + XM_ALIGNED_DATA(16) float b[4]; + _mm_store_ps(a, V1); + _mm_store_ps(b, V2); + XMVECTOR vResult = _mm_setr_ps( + powf(a[0], b[0]), + powf(a[1], b[1]), + powf(a[2], b[2]), + powf(a[3], b[3])); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorAbs(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { { { + fabsf(V.vector4_f32[0]), + fabsf(V.vector4_f32[1]), + fabsf(V.vector4_f32[2]), + fabsf(V.vector4_f32[3]) + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vabsq_f32(V); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = _mm_setzero_ps(); + vResult = _mm_sub_ps(vResult, V); + vResult = _mm_max_ps(vResult, V); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorMod +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ + // V1 % V2 = V1 - V2 * truncate(V1 / V2) + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Quotient = XMVectorDivide(V1, V2); + Quotient = XMVectorTruncate(Quotient); + XMVECTOR Result = XMVectorNegativeMultiplySubtract(V2, Quotient, V1); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR vResult = XMVectorDivide(V1, V2); + vResult = XMVectorTruncate(vResult); + return vmlsq_f32(V1, vResult, V2); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = _mm_div_ps(V1, V2); + vResult = XMVectorTruncate(vResult); + return XM_FNMADD_PS(vResult, V2, V1); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorModAngles(FXMVECTOR Angles) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V; + XMVECTOR Result; + + // Modulo the range of the given angles such that -XM_PI <= Angles < XM_PI + V = XMVectorMultiply(Angles, g_XMReciprocalTwoPi.v); + V = XMVectorRound(V); + Result = XMVectorNegativeMultiplySubtract(g_XMTwoPi.v, V, Angles); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Modulo the range of the given angles such that -XM_PI <= Angles < XM_PI + XMVECTOR vResult = vmulq_f32(Angles, g_XMReciprocalTwoPi); + // Use the inline function due to complexity for rounding + vResult = XMVectorRound(vResult); + return vmlsq_f32(Angles, vResult, g_XMTwoPi); +#elif defined(_XM_SSE_INTRINSICS_) + // Modulo the range of the given angles such that -XM_PI <= Angles < XM_PI + XMVECTOR vResult = _mm_mul_ps(Angles, g_XMReciprocalTwoPi); + // Use the inline function due to complexity for rounding + vResult = XMVectorRound(vResult); + return XM_FNMADD_PS(vResult, g_XMTwoPi, Angles); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorSin(FXMVECTOR V) noexcept +{ + // 11-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + sinf(V.vector4_f32[0]), + sinf(V.vector4_f32[1]), + sinf(V.vector4_f32[2]), + sinf(V.vector4_f32[3]) + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Force the value within the bounds of pi + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with sin(y) 
= sin(x). + uint32x4_t sign = vandq_u32(vreinterpretq_u32_f32(x), g_XMNegativeZero); + uint32x4_t c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + float32x4_t absx = vabsq_f32(x); + float32x4_t rflx = vsubq_f32(vreinterpretq_f32_u32(c), x); + uint32x4_t comp = vcleq_f32(absx, g_XMHalfPi); + x = vbslq_f32(comp, x, rflx); + + float32x4_t x2 = vmulq_f32(x, x); + + // Compute polynomial approximation + const XMVECTOR SC1 = g_XMSinCoefficients1; + const XMVECTOR SC0 = g_XMSinCoefficients0; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(SC0), 1); + XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_low_f32(SC1), 0); + + vConstants = vdupq_lane_f32(vget_high_f32(SC0), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(SC0), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(SC0), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + Result = vmlaq_f32(g_XMOne, Result, x2); + Result = vmulq_f32(Result, x); + return Result; +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_sin_ps(V); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + // Force the value within the bounds of pi + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with sin(y) = sin(x). + __m128 sign = _mm_and_ps(x, g_XMNegativeZero); + __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + __m128 absx = _mm_andnot_ps(sign, x); // |x| + __m128 rflx = _mm_sub_ps(c, x); + __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi); + __m128 select0 = _mm_and_ps(comp, x); + __m128 select1 = _mm_andnot_ps(comp, rflx); + x = _mm_or_ps(select0, select1); + + __m128 x2 = _mm_mul_ps(x, x); + + // Compute polynomial approximation + const XMVECTOR SC1 = g_XMSinCoefficients1; + __m128 vConstantsB = XM_PERMUTE_PS(SC1, _MM_SHUFFLE(0, 0, 0, 0)); + const XMVECTOR SC0 = g_XMSinCoefficients0; + __m128 vConstants = XM_PERMUTE_PS(SC0, _MM_SHUFFLE(3, 3, 3, 3)); + __m128 Result = XM_FMADD_PS(vConstantsB, x2, vConstants); + + vConstants = XM_PERMUTE_PS(SC0, _MM_SHUFFLE(2, 2, 2, 2)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + vConstants = XM_PERMUTE_PS(SC0, _MM_SHUFFLE(1, 1, 1, 1)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + vConstants = XM_PERMUTE_PS(SC0, _MM_SHUFFLE(0, 0, 0, 0)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + Result = XM_FMADD_PS(Result, x2, g_XMOne); + Result = _mm_mul_ps(Result, x); + return Result; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorCos(FXMVECTOR V) noexcept +{ + // 10-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + cosf(V.vector4_f32[0]), + cosf(V.vector4_f32[1]), + cosf(V.vector4_f32[2]), + cosf(V.vector4_f32[3]) + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Map V to x in [-pi,pi]. + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x). 
+ uint32x4_t sign = vandq_u32(vreinterpretq_u32_f32(x), g_XMNegativeZero); + uint32x4_t c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + float32x4_t absx = vabsq_f32(x); + float32x4_t rflx = vsubq_f32(vreinterpretq_f32_u32(c), x); + uint32x4_t comp = vcleq_f32(absx, g_XMHalfPi); + x = vbslq_f32(comp, x, rflx); + float32x4_t fsign = vbslq_f32(comp, g_XMOne, g_XMNegativeOne); + + float32x4_t x2 = vmulq_f32(x, x); + + // Compute polynomial approximation + const XMVECTOR CC1 = g_XMCosCoefficients1; + const XMVECTOR CC0 = g_XMCosCoefficients0; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(CC0), 1); + XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_low_f32(CC1), 0); + + vConstants = vdupq_lane_f32(vget_high_f32(CC0), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(CC0), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(CC0), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + Result = vmlaq_f32(g_XMOne, Result, x2); + Result = vmulq_f32(Result, fsign); + return Result; +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_cos_ps(V); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + // Map V to x in [-pi,pi]. + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x). + XMVECTOR sign = _mm_and_ps(x, g_XMNegativeZero); + __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + __m128 absx = _mm_andnot_ps(sign, x); // |x| + __m128 rflx = _mm_sub_ps(c, x); + __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi); + __m128 select0 = _mm_and_ps(comp, x); + __m128 select1 = _mm_andnot_ps(comp, rflx); + x = _mm_or_ps(select0, select1); + select0 = _mm_and_ps(comp, g_XMOne); + select1 = _mm_andnot_ps(comp, g_XMNegativeOne); + sign = _mm_or_ps(select0, select1); + + __m128 x2 = _mm_mul_ps(x, x); + + // Compute polynomial approximation + const XMVECTOR CC1 = g_XMCosCoefficients1; + __m128 vConstantsB = XM_PERMUTE_PS(CC1, _MM_SHUFFLE(0, 0, 0, 0)); + const XMVECTOR CC0 = g_XMCosCoefficients0; + __m128 vConstants = XM_PERMUTE_PS(CC0, _MM_SHUFFLE(3, 3, 3, 3)); + __m128 Result = XM_FMADD_PS(vConstantsB, x2, vConstants); + + vConstants = XM_PERMUTE_PS(CC0, _MM_SHUFFLE(2, 2, 2, 2)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + vConstants = XM_PERMUTE_PS(CC0, _MM_SHUFFLE(1, 1, 1, 1)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + vConstants = XM_PERMUTE_PS(CC0, _MM_SHUFFLE(0, 0, 0, 0)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + Result = XM_FMADD_PS(Result, x2, g_XMOne); + Result = _mm_mul_ps(Result, sign); + return Result; +#endif +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline void XM_CALLCONV XMVectorSinCos +( + XMVECTOR* pSin, + XMVECTOR* pCos, + FXMVECTOR V +) noexcept +{ + assert(pSin != nullptr); + assert(pCos != nullptr); + + // 11/10-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Sin = { { { + sinf(V.vector4_f32[0]), + sinf(V.vector4_f32[1]), + sinf(V.vector4_f32[2]), + sinf(V.vector4_f32[3]) + } } }; + + XMVECTORF32 Cos = { { { + cosf(V.vector4_f32[0]), + cosf(V.vector4_f32[1]), + cosf(V.vector4_f32[2]), + cosf(V.vector4_f32[3]) + } } }; + + *pSin = Sin.v; + *pCos = Cos.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Force the value within the bounds of pi + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x). 
+ uint32x4_t sign = vandq_u32(vreinterpretq_u32_f32(x), g_XMNegativeZero); + uint32x4_t c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + float32x4_t absx = vabsq_f32(x); + float32x4_t rflx = vsubq_f32(vreinterpretq_f32_u32(c), x); + uint32x4_t comp = vcleq_f32(absx, g_XMHalfPi); + x = vbslq_f32(comp, x, rflx); + float32x4_t fsign = vbslq_f32(comp, g_XMOne, g_XMNegativeOne); + + float32x4_t x2 = vmulq_f32(x, x); + + // Compute polynomial approximation for sine + const XMVECTOR SC1 = g_XMSinCoefficients1; + const XMVECTOR SC0 = g_XMSinCoefficients0; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(SC0), 1); + XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_low_f32(SC1), 0); + + vConstants = vdupq_lane_f32(vget_high_f32(SC0), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(SC0), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(SC0), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + Result = vmlaq_f32(g_XMOne, Result, x2); + *pSin = vmulq_f32(Result, x); + + // Compute polynomial approximation for cosine + const XMVECTOR CC1 = g_XMCosCoefficients1; + const XMVECTOR CC0 = g_XMCosCoefficients0; + vConstants = vdupq_lane_f32(vget_high_f32(CC0), 1); + Result = vmlaq_lane_f32(vConstants, x2, vget_low_f32(CC1), 0); + + vConstants = vdupq_lane_f32(vget_high_f32(CC0), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(CC0), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(CC0), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + Result = vmlaq_f32(g_XMOne, Result, x2); + *pCos = vmulq_f32(Result, fsign); +#elif defined(_XM_SVML_INTRINSICS_) + *pSin = _mm_sincos_ps(pCos, V); +#elif defined(_XM_SSE_INTRINSICS_) + // Force the value within the bounds of pi + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with sin(y) = sin(x), cos(y) = sign*cos(x). 
+ XMVECTOR sign = _mm_and_ps(x, g_XMNegativeZero); + __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + __m128 absx = _mm_andnot_ps(sign, x); // |x| + __m128 rflx = _mm_sub_ps(c, x); + __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi); + __m128 select0 = _mm_and_ps(comp, x); + __m128 select1 = _mm_andnot_ps(comp, rflx); + x = _mm_or_ps(select0, select1); + select0 = _mm_and_ps(comp, g_XMOne); + select1 = _mm_andnot_ps(comp, g_XMNegativeOne); + sign = _mm_or_ps(select0, select1); + + __m128 x2 = _mm_mul_ps(x, x); + + // Compute polynomial approximation of sine + const XMVECTOR SC1 = g_XMSinCoefficients1; + __m128 vConstantsB = XM_PERMUTE_PS(SC1, _MM_SHUFFLE(0, 0, 0, 0)); + const XMVECTOR SC0 = g_XMSinCoefficients0; + __m128 vConstants = XM_PERMUTE_PS(SC0, _MM_SHUFFLE(3, 3, 3, 3)); + __m128 Result = XM_FMADD_PS(vConstantsB, x2, vConstants); + + vConstants = XM_PERMUTE_PS(SC0, _MM_SHUFFLE(2, 2, 2, 2)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + vConstants = XM_PERMUTE_PS(SC0, _MM_SHUFFLE(1, 1, 1, 1)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + vConstants = XM_PERMUTE_PS(SC0, _MM_SHUFFLE(0, 0, 0, 0)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + Result = XM_FMADD_PS(Result, x2, g_XMOne); + Result = _mm_mul_ps(Result, x); + *pSin = Result; + + // Compute polynomial approximation of cosine + const XMVECTOR CC1 = g_XMCosCoefficients1; + vConstantsB = XM_PERMUTE_PS(CC1, _MM_SHUFFLE(0, 0, 0, 0)); + const XMVECTOR CC0 = g_XMCosCoefficients0; + vConstants = XM_PERMUTE_PS(CC0, _MM_SHUFFLE(3, 3, 3, 3)); + Result = XM_FMADD_PS(vConstantsB, x2, vConstants); + + vConstants = XM_PERMUTE_PS(CC0, _MM_SHUFFLE(2, 2, 2, 2)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + vConstants = XM_PERMUTE_PS(CC0, _MM_SHUFFLE(1, 1, 1, 1)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + vConstants = XM_PERMUTE_PS(CC0, _MM_SHUFFLE(0, 0, 0, 0)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + Result = XM_FMADD_PS(Result, x2, g_XMOne); + Result = _mm_mul_ps(Result, sign); + *pCos = Result; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorTan(FXMVECTOR V) noexcept +{ + // Cody and Waite algorithm to compute tangent. 
+
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTORF32 Result = { { {
+ tanf(V.vector4_f32[0]),
+ tanf(V.vector4_f32[1]),
+ tanf(V.vector4_f32[2]),
+ tanf(V.vector4_f32[3])
+ } } };
+ return Result.v;
+#elif defined(_XM_SVML_INTRINSICS_)
+ XMVECTOR Result = _mm_tan_ps(V);
+ return Result;
+#elif defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
+
+ static const XMVECTORF32 TanCoefficients0 = { { { 1.0f, -4.667168334e-1f, 2.566383229e-2f, -3.118153191e-4f } } };
+ static const XMVECTORF32 TanCoefficients1 = { { { 4.981943399e-7f, -1.333835001e-1f, 3.424887824e-3f, -1.786170734e-5f } } };
+ static const XMVECTORF32 TanConstants = { { { 1.570796371f, 6.077100628e-11f, 0.000244140625f, 0.63661977228f /*2 / Pi*/ } } };
+ static const XMVECTORU32 Mask = { { { 0x1, 0x1, 0x1, 0x1 } } };
+
+ XMVECTOR TwoDivPi = XMVectorSplatW(TanConstants.v);
+
+ XMVECTOR Zero = XMVectorZero();
+
+ XMVECTOR C0 = XMVectorSplatX(TanConstants.v);
+ XMVECTOR C1 = XMVectorSplatY(TanConstants.v);
+ XMVECTOR Epsilon = XMVectorSplatZ(TanConstants.v);
+
+ XMVECTOR VA = XMVectorMultiply(V, TwoDivPi);
+
+ VA = XMVectorRound(VA);
+
+ XMVECTOR VC = XMVectorNegativeMultiplySubtract(VA, C0, V);
+
+ XMVECTOR VB = XMVectorAbs(VA);
+
+ VC = XMVectorNegativeMultiplySubtract(VA, C1, VC);
+
+#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+ VB = vreinterpretq_f32_u32(vcvtq_u32_f32(VB));
+#elif defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+ reinterpret_cast<__m128i*>(&VB)[0] = _mm_cvttps_epi32(VB);
+#else
+ for (size_t i = 0; i < 4; i++)
+ {
+ VB.vector4_u32[i] = static_cast<uint32_t>(VB.vector4_f32[i]);
+ }
+#endif
+
+ XMVECTOR VC2 = XMVectorMultiply(VC, VC);
+
+ XMVECTOR T7 = XMVectorSplatW(TanCoefficients1.v);
+ XMVECTOR T6 = XMVectorSplatZ(TanCoefficients1.v);
+ XMVECTOR T4 = XMVectorSplatX(TanCoefficients1.v);
+ XMVECTOR T3 = XMVectorSplatW(TanCoefficients0.v);
+ XMVECTOR T5 = XMVectorSplatY(TanCoefficients1.v);
+ XMVECTOR T2 = XMVectorSplatZ(TanCoefficients0.v);
+ XMVECTOR T1 = XMVectorSplatY(TanCoefficients0.v);
+ XMVECTOR T0 = XMVectorSplatX(TanCoefficients0.v);
+
+ XMVECTOR VBIsEven = XMVectorAndInt(VB, Mask.v);
+ VBIsEven = XMVectorEqualInt(VBIsEven, Zero);
+
+ XMVECTOR N = XMVectorMultiplyAdd(VC2, T7, T6);
+ XMVECTOR D = XMVectorMultiplyAdd(VC2, T4, T3);
+ N = XMVectorMultiplyAdd(VC2, N, T5);
+ D = XMVectorMultiplyAdd(VC2, D, T2);
+ N = XMVectorMultiply(VC2, N);
+ D = XMVectorMultiplyAdd(VC2, D, T1);
+ N = XMVectorMultiplyAdd(VC, N, VC);
+ XMVECTOR VCNearZero = XMVectorInBounds(VC, Epsilon);
+ D = XMVectorMultiplyAdd(VC2, D, T0);
+
+ N = XMVectorSelect(N, VC, VCNearZero);
+ D = XMVectorSelect(D, g_XMOne.v, VCNearZero);
+
+ XMVECTOR R0 = XMVectorNegate(N);
+ XMVECTOR R1 = XMVectorDivide(N, D);
+ R0 = XMVectorDivide(D, R0);
+
+ XMVECTOR VIsZero = XMVectorEqual(V, Zero);
+
+ XMVECTOR Result = XMVectorSelect(R0, R1, VBIsEven);
+
+ Result = XMVectorSelect(Result, Zero, VIsZero);
+
+ return Result;
+
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorSinH(FXMVECTOR V) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTORF32 Result = { { {
+ sinhf(V.vector4_f32[0]),
+ sinhf(V.vector4_f32[1]),
+ sinhf(V.vector4_f32[2]),
+ sinhf(V.vector4_f32[3])
+ } } };
+ return Result.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+ static const XMVECTORF32 Scale = { { { 1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f } } }; // 1.0f / ln(2.0f)
+
+ XMVECTOR V1 = vmlaq_f32(g_XMNegativeOne.v, V, 
Scale.v); + XMVECTOR V2 = vmlsq_f32(g_XMNegativeOne.v, V, Scale.v); + XMVECTOR E1 = XMVectorExp(V1); + XMVECTOR E2 = XMVectorExp(V2); + + return vsubq_f32(E1, E2); +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_sinh_ps(V); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Scale = { { { 1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f } } }; // 1.0f / ln(2.0f) + + XMVECTOR V1 = XM_FMADD_PS(V, Scale, g_XMNegativeOne); + XMVECTOR V2 = XM_FNMADD_PS(V, Scale, g_XMNegativeOne); + XMVECTOR E1 = XMVectorExp(V1); + XMVECTOR E2 = XMVectorExp(V2); + + return _mm_sub_ps(E1, E2); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorCosH(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + coshf(V.vector4_f32[0]), + coshf(V.vector4_f32[1]), + coshf(V.vector4_f32[2]), + coshf(V.vector4_f32[3]) + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 Scale = { { { 1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f } } }; // 1.0f / ln(2.0f) + + XMVECTOR V1 = vmlaq_f32(g_XMNegativeOne.v, V, Scale.v); + XMVECTOR V2 = vmlsq_f32(g_XMNegativeOne.v, V, Scale.v); + XMVECTOR E1 = XMVectorExp(V1); + XMVECTOR E2 = XMVectorExp(V2); + return vaddq_f32(E1, E2); +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_cosh_ps(V); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Scale = { { { 1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f } } }; // 1.0f / ln(2.0f) + + XMVECTOR V1 = XM_FMADD_PS(V, Scale.v, g_XMNegativeOne.v); + XMVECTOR V2 = XM_FNMADD_PS(V, Scale.v, g_XMNegativeOne.v); + XMVECTOR E1 = XMVectorExp(V1); + XMVECTOR E2 = XMVectorExp(V2); + return _mm_add_ps(E1, E2); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorTanH(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + tanhf(V.vector4_f32[0]), + tanhf(V.vector4_f32[1]), + tanhf(V.vector4_f32[2]), + tanhf(V.vector4_f32[3]) + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 Scale = { { { 2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f } } }; // 2.0f / ln(2.0f) + + XMVECTOR E = vmulq_f32(V, Scale.v); + E = XMVectorExp(E); + E = vmlaq_f32(g_XMOneHalf.v, E, g_XMOneHalf.v); + E = XMVectorReciprocal(E); + return vsubq_f32(g_XMOne.v, E); +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_tanh_ps(V); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Scale = { { { 2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f } } }; // 2.0f / ln(2.0f) + + XMVECTOR E = _mm_mul_ps(V, Scale.v); + E = XMVectorExp(E); + E = XM_FMADD_PS(E, g_XMOneHalf.v, g_XMOneHalf.v); + E = _mm_div_ps(g_XMOne.v, E); + return _mm_sub_ps(g_XMOne.v, E); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorASin(FXMVECTOR V) noexcept +{ + // 7-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + asinf(V.vector4_f32[0]), + asinf(V.vector4_f32[1]), + asinf(V.vector4_f32[2]), + asinf(V.vector4_f32[3]) + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t nonnegative = 
vcgeq_f32(V, g_XMZero); + float32x4_t x = vabsq_f32(V); + + // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. + float32x4_t oneMValue = vsubq_f32(g_XMOne, x); + float32x4_t clampOneMValue = vmaxq_f32(g_XMZero, oneMValue); + float32x4_t root = XMVectorSqrt(clampOneMValue); + + // Compute polynomial approximation + const XMVECTOR AC1 = g_XMArcCoefficients1; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AC1), 0); + XMVECTOR t0 = vmlaq_lane_f32(vConstants, x, vget_high_f32(AC1), 1); + + vConstants = vdupq_lane_f32(vget_low_f32(AC1), 1); + t0 = vmlaq_f32(vConstants, t0, x); + + vConstants = vdupq_lane_f32(vget_low_f32(AC1), 0); + t0 = vmlaq_f32(vConstants, t0, x); + + const XMVECTOR AC0 = g_XMArcCoefficients0; + vConstants = vdupq_lane_f32(vget_high_f32(AC0), 1); + t0 = vmlaq_f32(vConstants, t0, x); + + vConstants = vdupq_lane_f32(vget_high_f32(AC0), 0); + t0 = vmlaq_f32(vConstants, t0, x); + + vConstants = vdupq_lane_f32(vget_low_f32(AC0), 1); + t0 = vmlaq_f32(vConstants, t0, x); + + vConstants = vdupq_lane_f32(vget_low_f32(AC0), 0); + t0 = vmlaq_f32(vConstants, t0, x); + t0 = vmulq_f32(t0, root); + + float32x4_t t1 = vsubq_f32(g_XMPi, t0); + t0 = vbslq_f32(nonnegative, t0, t1); + t0 = vsubq_f32(g_XMHalfPi, t0); + return t0; +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_asin_ps(V); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + __m128 nonnegative = _mm_cmpge_ps(V, g_XMZero); + __m128 mvalue = _mm_sub_ps(g_XMZero, V); + __m128 x = _mm_max_ps(V, mvalue); // |V| + + // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. + __m128 oneMValue = _mm_sub_ps(g_XMOne, x); + __m128 clampOneMValue = _mm_max_ps(g_XMZero, oneMValue); + __m128 root = _mm_sqrt_ps(clampOneMValue); // sqrt(1-|V|) + + // Compute polynomial approximation + const XMVECTOR AC1 = g_XMArcCoefficients1; + __m128 vConstantsB = XM_PERMUTE_PS(AC1, _MM_SHUFFLE(3, 3, 3, 3)); + __m128 vConstants = XM_PERMUTE_PS(AC1, _MM_SHUFFLE(2, 2, 2, 2)); + __m128 t0 = XM_FMADD_PS(vConstantsB, x, vConstants); + + vConstants = XM_PERMUTE_PS(AC1, _MM_SHUFFLE(1, 1, 1, 1)); + t0 = XM_FMADD_PS(t0, x, vConstants); + + vConstants = XM_PERMUTE_PS(AC1, _MM_SHUFFLE(0, 0, 0, 0)); + t0 = XM_FMADD_PS(t0, x, vConstants); + + const XMVECTOR AC0 = g_XMArcCoefficients0; + vConstants = XM_PERMUTE_PS(AC0, _MM_SHUFFLE(3, 3, 3, 3)); + t0 = XM_FMADD_PS(t0, x, vConstants); + + vConstants = XM_PERMUTE_PS(AC0, _MM_SHUFFLE(2, 2, 2, 2)); + t0 = XM_FMADD_PS(t0, x, vConstants); + + vConstants = XM_PERMUTE_PS(AC0, _MM_SHUFFLE(1, 1, 1, 1)); + t0 = XM_FMADD_PS(t0, x, vConstants); + + vConstants = XM_PERMUTE_PS(AC0, _MM_SHUFFLE(0, 0, 0, 0)); + t0 = XM_FMADD_PS(t0, x, vConstants); + t0 = _mm_mul_ps(t0, root); + + __m128 t1 = _mm_sub_ps(g_XMPi, t0); + t0 = _mm_and_ps(nonnegative, t0); + t1 = _mm_andnot_ps(nonnegative, t1); + t0 = _mm_or_ps(t0, t1); + t0 = _mm_sub_ps(g_XMHalfPi, t0); + return t0; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorACos(FXMVECTOR V) noexcept +{ + // 7-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + acosf(V.vector4_f32[0]), + acosf(V.vector4_f32[1]), + acosf(V.vector4_f32[2]), + acosf(V.vector4_f32[3]) + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t nonnegative = vcgeq_f32(V, g_XMZero); + float32x4_t x = vabsq_f32(V); + + // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. 
+ float32x4_t oneMValue = vsubq_f32(g_XMOne, x); + float32x4_t clampOneMValue = vmaxq_f32(g_XMZero, oneMValue); + float32x4_t root = XMVectorSqrt(clampOneMValue); + + // Compute polynomial approximation + const XMVECTOR AC1 = g_XMArcCoefficients1; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AC1), 0); + XMVECTOR t0 = vmlaq_lane_f32(vConstants, x, vget_high_f32(AC1), 1); + + vConstants = vdupq_lane_f32(vget_low_f32(AC1), 1); + t0 = vmlaq_f32(vConstants, t0, x); + + vConstants = vdupq_lane_f32(vget_low_f32(AC1), 0); + t0 = vmlaq_f32(vConstants, t0, x); + + const XMVECTOR AC0 = g_XMArcCoefficients0; + vConstants = vdupq_lane_f32(vget_high_f32(AC0), 1); + t0 = vmlaq_f32(vConstants, t0, x); + + vConstants = vdupq_lane_f32(vget_high_f32(AC0), 0); + t0 = vmlaq_f32(vConstants, t0, x); + + vConstants = vdupq_lane_f32(vget_low_f32(AC0), 1); + t0 = vmlaq_f32(vConstants, t0, x); + + vConstants = vdupq_lane_f32(vget_low_f32(AC0), 0); + t0 = vmlaq_f32(vConstants, t0, x); + t0 = vmulq_f32(t0, root); + + float32x4_t t1 = vsubq_f32(g_XMPi, t0); + t0 = vbslq_f32(nonnegative, t0, t1); + return t0; +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_acos_ps(V); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + __m128 nonnegative = _mm_cmpge_ps(V, g_XMZero); + __m128 mvalue = _mm_sub_ps(g_XMZero, V); + __m128 x = _mm_max_ps(V, mvalue); // |V| + + // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. + __m128 oneMValue = _mm_sub_ps(g_XMOne, x); + __m128 clampOneMValue = _mm_max_ps(g_XMZero, oneMValue); + __m128 root = _mm_sqrt_ps(clampOneMValue); // sqrt(1-|V|) + + // Compute polynomial approximation + const XMVECTOR AC1 = g_XMArcCoefficients1; + __m128 vConstantsB = XM_PERMUTE_PS(AC1, _MM_SHUFFLE(3, 3, 3, 3)); + __m128 vConstants = XM_PERMUTE_PS(AC1, _MM_SHUFFLE(2, 2, 2, 2)); + __m128 t0 = XM_FMADD_PS(vConstantsB, x, vConstants); + + vConstants = XM_PERMUTE_PS(AC1, _MM_SHUFFLE(1, 1, 1, 1)); + t0 = XM_FMADD_PS(t0, x, vConstants); + + vConstants = XM_PERMUTE_PS(AC1, _MM_SHUFFLE(0, 0, 0, 0)); + t0 = XM_FMADD_PS(t0, x, vConstants); + + const XMVECTOR AC0 = g_XMArcCoefficients0; + vConstants = XM_PERMUTE_PS(AC0, _MM_SHUFFLE(3, 3, 3, 3)); + t0 = XM_FMADD_PS(t0, x, vConstants); + + vConstants = XM_PERMUTE_PS(AC0, _MM_SHUFFLE(2, 2, 2, 2)); + t0 = XM_FMADD_PS(t0, x, vConstants); + + vConstants = XM_PERMUTE_PS(AC0, _MM_SHUFFLE(1, 1, 1, 1)); + t0 = XM_FMADD_PS(t0, x, vConstants); + + vConstants = XM_PERMUTE_PS(AC0, _MM_SHUFFLE(0, 0, 0, 0)); + t0 = XM_FMADD_PS(t0, x, vConstants); + t0 = _mm_mul_ps(t0, root); + + __m128 t1 = _mm_sub_ps(g_XMPi, t0); + t0 = _mm_and_ps(nonnegative, t0); + t1 = _mm_andnot_ps(nonnegative, t1); + t0 = _mm_or_ps(t0, t1); + return t0; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorATan(FXMVECTOR V) noexcept +{ + // 17-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + atanf(V.vector4_f32[0]), + atanf(V.vector4_f32[1]), + atanf(V.vector4_f32[2]), + atanf(V.vector4_f32[3]) + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t absV = vabsq_f32(V); + float32x4_t invV = XMVectorReciprocal(V); + uint32x4_t comp = vcgtq_f32(V, g_XMOne); + float32x4_t sign = vbslq_f32(comp, g_XMOne, g_XMNegativeOne); + comp = vcleq_f32(absV, g_XMOne); + sign = vbslq_f32(comp, g_XMZero, sign); + float32x4_t x = vbslq_f32(comp, V, invV); + + float32x4_t x2 = vmulq_f32(x, x); + + // Compute polynomial approximation + const XMVECTOR 
TC1 = g_XMATanCoefficients1; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(TC1), 0); + XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_high_f32(TC1), 1); + + vConstants = vdupq_lane_f32(vget_low_f32(TC1), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(TC1), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + const XMVECTOR TC0 = g_XMATanCoefficients0; + vConstants = vdupq_lane_f32(vget_high_f32(TC0), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_high_f32(TC0), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(TC0), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(TC0), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + Result = vmlaq_f32(g_XMOne, Result, x2); + Result = vmulq_f32(Result, x); + + float32x4_t result1 = vmulq_f32(sign, g_XMHalfPi); + result1 = vsubq_f32(result1, Result); + + comp = vceqq_f32(sign, g_XMZero); + Result = vbslq_f32(comp, Result, result1); + return Result; +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_atan_ps(V); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + __m128 absV = XMVectorAbs(V); + __m128 invV = _mm_div_ps(g_XMOne, V); + __m128 comp = _mm_cmpgt_ps(V, g_XMOne); + __m128 select0 = _mm_and_ps(comp, g_XMOne); + __m128 select1 = _mm_andnot_ps(comp, g_XMNegativeOne); + __m128 sign = _mm_or_ps(select0, select1); + comp = _mm_cmple_ps(absV, g_XMOne); + select0 = _mm_and_ps(comp, g_XMZero); + select1 = _mm_andnot_ps(comp, sign); + sign = _mm_or_ps(select0, select1); + select0 = _mm_and_ps(comp, V); + select1 = _mm_andnot_ps(comp, invV); + __m128 x = _mm_or_ps(select0, select1); + + __m128 x2 = _mm_mul_ps(x, x); + + // Compute polynomial approximation + const XMVECTOR TC1 = g_XMATanCoefficients1; + __m128 vConstantsB = XM_PERMUTE_PS(TC1, _MM_SHUFFLE(3, 3, 3, 3)); + __m128 vConstants = XM_PERMUTE_PS(TC1, _MM_SHUFFLE(2, 2, 2, 2)); + __m128 Result = XM_FMADD_PS(vConstantsB, x2, vConstants); + + vConstants = XM_PERMUTE_PS(TC1, _MM_SHUFFLE(1, 1, 1, 1)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + vConstants = XM_PERMUTE_PS(TC1, _MM_SHUFFLE(0, 0, 0, 0)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + const XMVECTOR TC0 = g_XMATanCoefficients0; + vConstants = XM_PERMUTE_PS(TC0, _MM_SHUFFLE(3, 3, 3, 3)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + vConstants = XM_PERMUTE_PS(TC0, _MM_SHUFFLE(2, 2, 2, 2)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + vConstants = XM_PERMUTE_PS(TC0, _MM_SHUFFLE(1, 1, 1, 1)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + vConstants = XM_PERMUTE_PS(TC0, _MM_SHUFFLE(0, 0, 0, 0)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + Result = XM_FMADD_PS(Result, x2, g_XMOne); + + Result = _mm_mul_ps(Result, x); + __m128 result1 = _mm_mul_ps(sign, g_XMHalfPi); + result1 = _mm_sub_ps(result1, Result); + + comp = _mm_cmpeq_ps(sign, g_XMZero); + select0 = _mm_and_ps(comp, Result); + select1 = _mm_andnot_ps(comp, result1); + Result = _mm_or_ps(select0, select1); + return Result; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorATan2 +( + FXMVECTOR Y, + FXMVECTOR X +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + atan2f(Y.vector4_f32[0], X.vector4_f32[0]), + atan2f(Y.vector4_f32[1], X.vector4_f32[1]), + atan2f(Y.vector4_f32[2], X.vector4_f32[2]), + atan2f(Y.vector4_f32[3], X.vector4_f32[3]) 
+ } } }; + return Result.v; +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_atan2_ps(Y, X); + return Result; +#else + + // Return the inverse tangent of Y / X in the range of -Pi to Pi with the following exceptions: + + // Y == 0 and X is Negative -> Pi with the sign of Y + // y == 0 and x is positive -> 0 with the sign of y + // Y != 0 and X == 0 -> Pi / 2 with the sign of Y + // Y != 0 and X is Negative -> atan(y/x) + (PI with the sign of Y) + // X == -Infinity and Finite Y -> Pi with the sign of Y + // X == +Infinity and Finite Y -> 0 with the sign of Y + // Y == Infinity and X is Finite -> Pi / 2 with the sign of Y + // Y == Infinity and X == -Infinity -> 3Pi / 4 with the sign of Y + // Y == Infinity and X == +Infinity -> Pi / 4 with the sign of Y + + static const XMVECTORF32 ATan2Constants = { { { XM_PI, XM_PIDIV2, XM_PIDIV4, XM_PI * 3.0f / 4.0f } } }; + + XMVECTOR Zero = XMVectorZero(); + XMVECTOR ATanResultValid = XMVectorTrueInt(); + + XMVECTOR Pi = XMVectorSplatX(ATan2Constants); + XMVECTOR PiOverTwo = XMVectorSplatY(ATan2Constants); + XMVECTOR PiOverFour = XMVectorSplatZ(ATan2Constants); + XMVECTOR ThreePiOverFour = XMVectorSplatW(ATan2Constants); + + XMVECTOR YEqualsZero = XMVectorEqual(Y, Zero); + XMVECTOR XEqualsZero = XMVectorEqual(X, Zero); + XMVECTOR XIsPositive = XMVectorAndInt(X, g_XMNegativeZero.v); + XIsPositive = XMVectorEqualInt(XIsPositive, Zero); + XMVECTOR YEqualsInfinity = XMVectorIsInfinite(Y); + XMVECTOR XEqualsInfinity = XMVectorIsInfinite(X); + + XMVECTOR YSign = XMVectorAndInt(Y, g_XMNegativeZero.v); + Pi = XMVectorOrInt(Pi, YSign); + PiOverTwo = XMVectorOrInt(PiOverTwo, YSign); + PiOverFour = XMVectorOrInt(PiOverFour, YSign); + ThreePiOverFour = XMVectorOrInt(ThreePiOverFour, YSign); + + XMVECTOR R1 = XMVectorSelect(Pi, YSign, XIsPositive); + XMVECTOR R2 = XMVectorSelect(ATanResultValid, PiOverTwo, XEqualsZero); + XMVECTOR R3 = XMVectorSelect(R2, R1, YEqualsZero); + XMVECTOR R4 = XMVectorSelect(ThreePiOverFour, PiOverFour, XIsPositive); + XMVECTOR R5 = XMVectorSelect(PiOverTwo, R4, XEqualsInfinity); + XMVECTOR Result = XMVectorSelect(R3, R5, YEqualsInfinity); + ATanResultValid = XMVectorEqualInt(Result, ATanResultValid); + + XMVECTOR V = XMVectorDivide(Y, X); + + XMVECTOR R0 = XMVectorATan(V); + + R1 = XMVectorSelect(Pi, g_XMNegativeZero, XIsPositive); + R2 = XMVectorAdd(R0, R1); + + return XMVectorSelect(Result, R2, ATanResultValid); + +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorSinEst(FXMVECTOR V) noexcept +{ + // 7-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + sinf(V.vector4_f32[0]), + sinf(V.vector4_f32[1]), + sinf(V.vector4_f32[2]), + sinf(V.vector4_f32[3]) + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Force the value within the bounds of pi + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with sin(y) = sin(x). 
+ uint32x4_t sign = vandq_u32(vreinterpretq_u32_f32(x), g_XMNegativeZero); + uint32x4_t c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + float32x4_t absx = vabsq_f32(x); + float32x4_t rflx = vsubq_f32(vreinterpretq_f32_u32(c), x); + uint32x4_t comp = vcleq_f32(absx, g_XMHalfPi); + x = vbslq_f32(comp, x, rflx); + + float32x4_t x2 = vmulq_f32(x, x); + + // Compute polynomial approximation + const XMVECTOR SEC = g_XMSinCoefficients1; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(SEC), 0); + XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_high_f32(SEC), 1); + + vConstants = vdupq_lane_f32(vget_low_f32(SEC), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + Result = vmlaq_f32(g_XMOne, Result, x2); + Result = vmulq_f32(Result, x); + return Result; +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_sin_ps(V); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + // Force the value within the bounds of pi + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with sin(y) = sin(x). + __m128 sign = _mm_and_ps(x, g_XMNegativeZero); + __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + __m128 absx = _mm_andnot_ps(sign, x); // |x| + __m128 rflx = _mm_sub_ps(c, x); + __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi); + __m128 select0 = _mm_and_ps(comp, x); + __m128 select1 = _mm_andnot_ps(comp, rflx); + x = _mm_or_ps(select0, select1); + + __m128 x2 = _mm_mul_ps(x, x); + + // Compute polynomial approximation + const XMVECTOR SEC = g_XMSinCoefficients1; + __m128 vConstantsB = XM_PERMUTE_PS(SEC, _MM_SHUFFLE(3, 3, 3, 3)); + __m128 vConstants = XM_PERMUTE_PS(SEC, _MM_SHUFFLE(2, 2, 2, 2)); + __m128 Result = XM_FMADD_PS(vConstantsB, x2, vConstants); + + vConstants = XM_PERMUTE_PS(SEC, _MM_SHUFFLE(1, 1, 1, 1)); + Result = XM_FMADD_PS(Result, x2, vConstants); + Result = XM_FMADD_PS(Result, x2, g_XMOne); + Result = _mm_mul_ps(Result, x); + return Result; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorCosEst(FXMVECTOR V) noexcept +{ + // 6-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + cosf(V.vector4_f32[0]), + cosf(V.vector4_f32[1]), + cosf(V.vector4_f32[2]), + cosf(V.vector4_f32[3]) + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Map V to x in [-pi,pi]. + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x). + uint32x4_t sign = vandq_u32(vreinterpretq_u32_f32(x), g_XMNegativeZero); + uint32x4_t c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + float32x4_t absx = vabsq_f32(x); + float32x4_t rflx = vsubq_f32(vreinterpretq_f32_u32(c), x); + uint32x4_t comp = vcleq_f32(absx, g_XMHalfPi); + x = vbslq_f32(comp, x, rflx); + float32x4_t fsign = vbslq_f32(comp, g_XMOne, g_XMNegativeOne); + + float32x4_t x2 = vmulq_f32(x, x); + + // Compute polynomial approximation + const XMVECTOR CEC = g_XMCosCoefficients1; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(CEC), 0); + XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_high_f32(CEC), 1); + + vConstants = vdupq_lane_f32(vget_low_f32(CEC), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + Result = vmlaq_f32(g_XMOne, Result, x2); + Result = vmulq_f32(Result, fsign); + return Result; +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_cos_ps(V); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + // Map V to x in [-pi,pi]. 
+ XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x). + XMVECTOR sign = _mm_and_ps(x, g_XMNegativeZero); + __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + __m128 absx = _mm_andnot_ps(sign, x); // |x| + __m128 rflx = _mm_sub_ps(c, x); + __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi); + __m128 select0 = _mm_and_ps(comp, x); + __m128 select1 = _mm_andnot_ps(comp, rflx); + x = _mm_or_ps(select0, select1); + select0 = _mm_and_ps(comp, g_XMOne); + select1 = _mm_andnot_ps(comp, g_XMNegativeOne); + sign = _mm_or_ps(select0, select1); + + __m128 x2 = _mm_mul_ps(x, x); + + // Compute polynomial approximation + const XMVECTOR CEC = g_XMCosCoefficients1; + __m128 vConstantsB = XM_PERMUTE_PS(CEC, _MM_SHUFFLE(3, 3, 3, 3)); + __m128 vConstants = XM_PERMUTE_PS(CEC, _MM_SHUFFLE(2, 2, 2, 2)); + __m128 Result = XM_FMADD_PS(vConstantsB, x2, vConstants); + + vConstants = XM_PERMUTE_PS(CEC, _MM_SHUFFLE(1, 1, 1, 1)); + Result = XM_FMADD_PS(Result, x2, vConstants); + Result = XM_FMADD_PS(Result, x2, g_XMOne); + Result = _mm_mul_ps(Result, sign); + return Result; +#endif +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline void XM_CALLCONV XMVectorSinCosEst +( + XMVECTOR* pSin, + XMVECTOR* pCos, + FXMVECTOR V +) noexcept +{ + assert(pSin != nullptr); + assert(pCos != nullptr); + + // 7/6-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Sin = { { { + sinf(V.vector4_f32[0]), + sinf(V.vector4_f32[1]), + sinf(V.vector4_f32[2]), + sinf(V.vector4_f32[3]) + } } }; + + XMVECTORF32 Cos = { { { + cosf(V.vector4_f32[0]), + cosf(V.vector4_f32[1]), + cosf(V.vector4_f32[2]), + cosf(V.vector4_f32[3]) + } } }; + + *pSin = Sin.v; + *pCos = Cos.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Force the value within the bounds of pi + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x). + uint32x4_t sign = vandq_u32(vreinterpretq_u32_f32(x), g_XMNegativeZero); + uint32x4_t c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + float32x4_t absx = vabsq_f32(x); + float32x4_t rflx = vsubq_f32(vreinterpretq_f32_u32(c), x); + uint32x4_t comp = vcleq_f32(absx, g_XMHalfPi); + x = vbslq_f32(comp, x, rflx); + float32x4_t fsign = vbslq_f32(comp, g_XMOne, g_XMNegativeOne); + + float32x4_t x2 = vmulq_f32(x, x); + + // Compute polynomial approximation for sine + const XMVECTOR SEC = g_XMSinCoefficients1; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(SEC), 0); + XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_high_f32(SEC), 1); + + vConstants = vdupq_lane_f32(vget_low_f32(SEC), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + Result = vmlaq_f32(g_XMOne, Result, x2); + *pSin = vmulq_f32(Result, x); + + // Compute polynomial approximation + const XMVECTOR CEC = g_XMCosCoefficients1; + vConstants = vdupq_lane_f32(vget_high_f32(CEC), 0); + Result = vmlaq_lane_f32(vConstants, x2, vget_high_f32(CEC), 1); + + vConstants = vdupq_lane_f32(vget_low_f32(CEC), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + Result = vmlaq_f32(g_XMOne, Result, x2); + *pCos = vmulq_f32(Result, fsign); +#elif defined(_XM_SSE_INTRINSICS_) + // Force the value within the bounds of pi + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with sin(y) = sin(x), cos(y) = sign*cos(x). 
+ XMVECTOR sign = _mm_and_ps(x, g_XMNegativeZero); + __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + __m128 absx = _mm_andnot_ps(sign, x); // |x| + __m128 rflx = _mm_sub_ps(c, x); + __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi); + __m128 select0 = _mm_and_ps(comp, x); + __m128 select1 = _mm_andnot_ps(comp, rflx); + x = _mm_or_ps(select0, select1); + select0 = _mm_and_ps(comp, g_XMOne); + select1 = _mm_andnot_ps(comp, g_XMNegativeOne); + sign = _mm_or_ps(select0, select1); + + __m128 x2 = _mm_mul_ps(x, x); + + // Compute polynomial approximation for sine + const XMVECTOR SEC = g_XMSinCoefficients1; + __m128 vConstantsB = XM_PERMUTE_PS(SEC, _MM_SHUFFLE(3, 3, 3, 3)); + __m128 vConstants = XM_PERMUTE_PS(SEC, _MM_SHUFFLE(2, 2, 2, 2)); + __m128 Result = XM_FMADD_PS(vConstantsB, x2, vConstants); + + vConstants = XM_PERMUTE_PS(SEC, _MM_SHUFFLE(1, 1, 1, 1)); + Result = XM_FMADD_PS(Result, x2, vConstants); + Result = XM_FMADD_PS(Result, x2, g_XMOne); + Result = _mm_mul_ps(Result, x); + *pSin = Result; + + // Compute polynomial approximation for cosine + const XMVECTOR CEC = g_XMCosCoefficients1; + vConstantsB = XM_PERMUTE_PS(CEC, _MM_SHUFFLE(3, 3, 3, 3)); + vConstants = XM_PERMUTE_PS(CEC, _MM_SHUFFLE(2, 2, 2, 2)); + Result = XM_FMADD_PS(vConstantsB, x2, vConstants); + + vConstants = XM_PERMUTE_PS(CEC, _MM_SHUFFLE(1, 1, 1, 1)); + Result = XM_FMADD_PS(Result, x2, vConstants); + Result = XM_FMADD_PS(Result, x2, g_XMOne); + Result = _mm_mul_ps(Result, sign); + *pCos = Result; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorTanEst(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + tanf(V.vector4_f32[0]), + tanf(V.vector4_f32[1]), + tanf(V.vector4_f32[2]), + tanf(V.vector4_f32[3]) + } } }; + return Result.v; +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_tan_ps(V); + return Result; +#else + + XMVECTOR OneOverPi = XMVectorSplatW(g_XMTanEstCoefficients.v); + + XMVECTOR V1 = XMVectorMultiply(V, OneOverPi); + V1 = XMVectorRound(V1); + + V1 = XMVectorNegativeMultiplySubtract(g_XMPi.v, V1, V); + + XMVECTOR T0 = XMVectorSplatX(g_XMTanEstCoefficients.v); + XMVECTOR T1 = XMVectorSplatY(g_XMTanEstCoefficients.v); + XMVECTOR T2 = XMVectorSplatZ(g_XMTanEstCoefficients.v); + + XMVECTOR V2T2 = XMVectorNegativeMultiplySubtract(V1, V1, T2); + XMVECTOR V2 = XMVectorMultiply(V1, V1); + XMVECTOR V1T0 = XMVectorMultiply(V1, T0); + XMVECTOR V1T1 = XMVectorMultiply(V1, T1); + + XMVECTOR D = XMVectorReciprocalEst(V2T2); + XMVECTOR N = XMVectorMultiplyAdd(V2, V1T1, V1T0); + + return XMVectorMultiply(N, D); + +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorASinEst(FXMVECTOR V) noexcept +{ + // 3-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result; + Result.f[0] = asinf(V.vector4_f32[0]); + Result.f[1] = asinf(V.vector4_f32[1]); + Result.f[2] = asinf(V.vector4_f32[2]); + Result.f[3] = asinf(V.vector4_f32[3]); + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t nonnegative = vcgeq_f32(V, g_XMZero); + float32x4_t x = vabsq_f32(V); + + // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. 
+ float32x4_t oneMValue = vsubq_f32(g_XMOne, x); + float32x4_t clampOneMValue = vmaxq_f32(g_XMZero, oneMValue); + float32x4_t root = XMVectorSqrt(clampOneMValue); + + // Compute polynomial approximation + const XMVECTOR AEC = g_XMArcEstCoefficients; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AEC), 0); + XMVECTOR t0 = vmlaq_lane_f32(vConstants, x, vget_high_f32(AEC), 1); + + vConstants = vdupq_lane_f32(vget_low_f32(AEC), 1); + t0 = vmlaq_f32(vConstants, t0, x); + + vConstants = vdupq_lane_f32(vget_low_f32(AEC), 0); + t0 = vmlaq_f32(vConstants, t0, x); + t0 = vmulq_f32(t0, root); + + float32x4_t t1 = vsubq_f32(g_XMPi, t0); + t0 = vbslq_f32(nonnegative, t0, t1); + t0 = vsubq_f32(g_XMHalfPi, t0); + return t0; +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_asin_ps(V); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + __m128 nonnegative = _mm_cmpge_ps(V, g_XMZero); + __m128 mvalue = _mm_sub_ps(g_XMZero, V); + __m128 x = _mm_max_ps(V, mvalue); // |V| + + // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. + __m128 oneMValue = _mm_sub_ps(g_XMOne, x); + __m128 clampOneMValue = _mm_max_ps(g_XMZero, oneMValue); + __m128 root = _mm_sqrt_ps(clampOneMValue); // sqrt(1-|V|) + + // Compute polynomial approximation + const XMVECTOR AEC = g_XMArcEstCoefficients; + __m128 vConstantsB = XM_PERMUTE_PS(AEC, _MM_SHUFFLE(3, 3, 3, 3)); + __m128 vConstants = XM_PERMUTE_PS(AEC, _MM_SHUFFLE(2, 2, 2, 2)); + __m128 t0 = XM_FMADD_PS(vConstantsB, x, vConstants); + + vConstants = XM_PERMUTE_PS(AEC, _MM_SHUFFLE(1, 1, 1, 1)); + t0 = XM_FMADD_PS(t0, x, vConstants); + + vConstants = XM_PERMUTE_PS(AEC, _MM_SHUFFLE(0, 0, 0, 0)); + t0 = XM_FMADD_PS(t0, x, vConstants); + t0 = _mm_mul_ps(t0, root); + + __m128 t1 = _mm_sub_ps(g_XMPi, t0); + t0 = _mm_and_ps(nonnegative, t0); + t1 = _mm_andnot_ps(nonnegative, t1); + t0 = _mm_or_ps(t0, t1); + t0 = _mm_sub_ps(g_XMHalfPi, t0); + return t0; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorACosEst(FXMVECTOR V) noexcept +{ + // 3-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + acosf(V.vector4_f32[0]), + acosf(V.vector4_f32[1]), + acosf(V.vector4_f32[2]), + acosf(V.vector4_f32[3]) + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t nonnegative = vcgeq_f32(V, g_XMZero); + float32x4_t x = vabsq_f32(V); + + // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. + float32x4_t oneMValue = vsubq_f32(g_XMOne, x); + float32x4_t clampOneMValue = vmaxq_f32(g_XMZero, oneMValue); + float32x4_t root = XMVectorSqrt(clampOneMValue); + + // Compute polynomial approximation + const XMVECTOR AEC = g_XMArcEstCoefficients; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AEC), 0); + XMVECTOR t0 = vmlaq_lane_f32(vConstants, x, vget_high_f32(AEC), 1); + + vConstants = vdupq_lane_f32(vget_low_f32(AEC), 1); + t0 = vmlaq_f32(vConstants, t0, x); + + vConstants = vdupq_lane_f32(vget_low_f32(AEC), 0); + t0 = vmlaq_f32(vConstants, t0, x); + t0 = vmulq_f32(t0, root); + + float32x4_t t1 = vsubq_f32(g_XMPi, t0); + t0 = vbslq_f32(nonnegative, t0, t1); + return t0; +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_acos_ps(V); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + __m128 nonnegative = _mm_cmpge_ps(V, g_XMZero); + __m128 mvalue = _mm_sub_ps(g_XMZero, V); + __m128 x = _mm_max_ps(V, mvalue); // |V| + + // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. 
+ __m128 oneMValue = _mm_sub_ps(g_XMOne, x); + __m128 clampOneMValue = _mm_max_ps(g_XMZero, oneMValue); + __m128 root = _mm_sqrt_ps(clampOneMValue); // sqrt(1-|V|) + + // Compute polynomial approximation + const XMVECTOR AEC = g_XMArcEstCoefficients; + __m128 vConstantsB = XM_PERMUTE_PS(AEC, _MM_SHUFFLE(3, 3, 3, 3)); + __m128 vConstants = XM_PERMUTE_PS(AEC, _MM_SHUFFLE(2, 2, 2, 2)); + __m128 t0 = XM_FMADD_PS(vConstantsB, x, vConstants); + + vConstants = XM_PERMUTE_PS(AEC, _MM_SHUFFLE(1, 1, 1, 1)); + t0 = XM_FMADD_PS(t0, x, vConstants); + + vConstants = XM_PERMUTE_PS(AEC, _MM_SHUFFLE(0, 0, 0, 0)); + t0 = XM_FMADD_PS(t0, x, vConstants); + t0 = _mm_mul_ps(t0, root); + + __m128 t1 = _mm_sub_ps(g_XMPi, t0); + t0 = _mm_and_ps(nonnegative, t0); + t1 = _mm_andnot_ps(nonnegative, t1); + t0 = _mm_or_ps(t0, t1); + return t0; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorATanEst(FXMVECTOR V) noexcept +{ + // 9-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + atanf(V.vector4_f32[0]), + atanf(V.vector4_f32[1]), + atanf(V.vector4_f32[2]), + atanf(V.vector4_f32[3]) + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t absV = vabsq_f32(V); + float32x4_t invV = XMVectorReciprocalEst(V); + uint32x4_t comp = vcgtq_f32(V, g_XMOne); + float32x4_t sign = vbslq_f32(comp, g_XMOne, g_XMNegativeOne); + comp = vcleq_f32(absV, g_XMOne); + sign = vbslq_f32(comp, g_XMZero, sign); + float32x4_t x = vbslq_f32(comp, V, invV); + + float32x4_t x2 = vmulq_f32(x, x); + + // Compute polynomial approximation + const XMVECTOR AEC = g_XMATanEstCoefficients1; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AEC), 0); + XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_high_f32(AEC), 1); + + vConstants = vdupq_lane_f32(vget_low_f32(AEC), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(AEC), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + // ATanEstCoefficients0 is already splatted + Result = vmlaq_f32(g_XMATanEstCoefficients0, Result, x2); + Result = vmulq_f32(Result, x); + + float32x4_t result1 = vmulq_f32(sign, g_XMHalfPi); + result1 = vsubq_f32(result1, Result); + + comp = vceqq_f32(sign, g_XMZero); + Result = vbslq_f32(comp, Result, result1); + return Result; +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_atan_ps(V); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + __m128 absV = XMVectorAbs(V); + __m128 invV = _mm_div_ps(g_XMOne, V); + __m128 comp = _mm_cmpgt_ps(V, g_XMOne); + __m128 select0 = _mm_and_ps(comp, g_XMOne); + __m128 select1 = _mm_andnot_ps(comp, g_XMNegativeOne); + __m128 sign = _mm_or_ps(select0, select1); + comp = _mm_cmple_ps(absV, g_XMOne); + select0 = _mm_and_ps(comp, g_XMZero); + select1 = _mm_andnot_ps(comp, sign); + sign = _mm_or_ps(select0, select1); + select0 = _mm_and_ps(comp, V); + select1 = _mm_andnot_ps(comp, invV); + __m128 x = _mm_or_ps(select0, select1); + + __m128 x2 = _mm_mul_ps(x, x); + + // Compute polynomial approximation + const XMVECTOR AEC = g_XMATanEstCoefficients1; + __m128 vConstantsB = XM_PERMUTE_PS(AEC, _MM_SHUFFLE(3, 3, 3, 3)); + __m128 vConstants = XM_PERMUTE_PS(AEC, _MM_SHUFFLE(2, 2, 2, 2)); + __m128 Result = XM_FMADD_PS(vConstantsB, x2, vConstants); + + vConstants = XM_PERMUTE_PS(AEC, _MM_SHUFFLE(1, 1, 1, 1)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + vConstants = XM_PERMUTE_PS(AEC, _MM_SHUFFLE(0, 0, 0, 0)); + Result = 
XM_FMADD_PS(Result, x2, vConstants); + // ATanEstCoefficients0 is already splatted + Result = XM_FMADD_PS(Result, x2, g_XMATanEstCoefficients0); + Result = _mm_mul_ps(Result, x); + __m128 result1 = _mm_mul_ps(sign, g_XMHalfPi); + result1 = _mm_sub_ps(result1, Result); + + comp = _mm_cmpeq_ps(sign, g_XMZero); + select0 = _mm_and_ps(comp, Result); + select1 = _mm_andnot_ps(comp, result1); + Result = _mm_or_ps(select0, select1); + return Result; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorATan2Est +( + FXMVECTOR Y, + FXMVECTOR X +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + atan2f(Y.vector4_f32[0], X.vector4_f32[0]), + atan2f(Y.vector4_f32[1], X.vector4_f32[1]), + atan2f(Y.vector4_f32[2], X.vector4_f32[2]), + atan2f(Y.vector4_f32[3], X.vector4_f32[3]), + } } }; + return Result.v; +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_atan2_ps(Y, X); + return Result; +#else + + static const XMVECTORF32 ATan2Constants = { { { XM_PI, XM_PIDIV2, XM_PIDIV4, 2.3561944905f /* Pi*3/4 */ } } }; + + const XMVECTOR Zero = XMVectorZero(); + XMVECTOR ATanResultValid = XMVectorTrueInt(); + + XMVECTOR Pi = XMVectorSplatX(ATan2Constants); + XMVECTOR PiOverTwo = XMVectorSplatY(ATan2Constants); + XMVECTOR PiOverFour = XMVectorSplatZ(ATan2Constants); + XMVECTOR ThreePiOverFour = XMVectorSplatW(ATan2Constants); + + XMVECTOR YEqualsZero = XMVectorEqual(Y, Zero); + XMVECTOR XEqualsZero = XMVectorEqual(X, Zero); + XMVECTOR XIsPositive = XMVectorAndInt(X, g_XMNegativeZero.v); + XIsPositive = XMVectorEqualInt(XIsPositive, Zero); + XMVECTOR YEqualsInfinity = XMVectorIsInfinite(Y); + XMVECTOR XEqualsInfinity = XMVectorIsInfinite(X); + + XMVECTOR YSign = XMVectorAndInt(Y, g_XMNegativeZero.v); + Pi = XMVectorOrInt(Pi, YSign); + PiOverTwo = XMVectorOrInt(PiOverTwo, YSign); + PiOverFour = XMVectorOrInt(PiOverFour, YSign); + ThreePiOverFour = XMVectorOrInt(ThreePiOverFour, YSign); + + XMVECTOR R1 = XMVectorSelect(Pi, YSign, XIsPositive); + XMVECTOR R2 = XMVectorSelect(ATanResultValid, PiOverTwo, XEqualsZero); + XMVECTOR R3 = XMVectorSelect(R2, R1, YEqualsZero); + XMVECTOR R4 = XMVectorSelect(ThreePiOverFour, PiOverFour, XIsPositive); + XMVECTOR R5 = XMVectorSelect(PiOverTwo, R4, XEqualsInfinity); + XMVECTOR Result = XMVectorSelect(R3, R5, YEqualsInfinity); + ATanResultValid = XMVectorEqualInt(Result, ATanResultValid); + + XMVECTOR Reciprocal = XMVectorReciprocalEst(X); + XMVECTOR V = XMVectorMultiply(Y, Reciprocal); + XMVECTOR R0 = XMVectorATanEst(V); + + R1 = XMVectorSelect(Pi, g_XMNegativeZero, XIsPositive); + R2 = XMVectorAdd(R0, R1); + + Result = XMVectorSelect(Result, R2, ATanResultValid); + + return Result; + +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorLerp +( + FXMVECTOR V0, + FXMVECTOR V1, + float t +) noexcept +{ + // V0 + t * (V1 - V0) + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Scale = XMVectorReplicate(t); + XMVECTOR Length = XMVectorSubtract(V1, V0); + return XMVectorMultiplyAdd(Length, Scale, V0); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR L = vsubq_f32(V1, V0); + return vmlaq_n_f32(V0, L, t); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR L = _mm_sub_ps(V1, V0); + XMVECTOR S = _mm_set_ps1(t); + return XM_FMADD_PS(L, S, V0); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorLerpV +( + FXMVECTOR V0, 
+ FXMVECTOR V1, + FXMVECTOR T +) noexcept +{ + // V0 + T * (V1 - V0) + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Length = XMVectorSubtract(V1, V0); + return XMVectorMultiplyAdd(Length, T, V0); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR L = vsubq_f32(V1, V0); + return vmlaq_f32(V0, L, T); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR Length = _mm_sub_ps(V1, V0); + return XM_FMADD_PS(Length, T, V0); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorHermite +( + FXMVECTOR Position0, + FXMVECTOR Tangent0, + FXMVECTOR Position1, + GXMVECTOR Tangent1, + float t +) noexcept +{ + // Result = (2 * t^3 - 3 * t^2 + 1) * Position0 + + // (t^3 - 2 * t^2 + t) * Tangent0 + + // (-2 * t^3 + 3 * t^2) * Position1 + + // (t^3 - t^2) * Tangent1 + +#if defined(_XM_NO_INTRINSICS_) + + float t2 = t * t; + float t3 = t * t2; + + XMVECTOR P0 = XMVectorReplicate(2.0f * t3 - 3.0f * t2 + 1.0f); + XMVECTOR T0 = XMVectorReplicate(t3 - 2.0f * t2 + t); + XMVECTOR P1 = XMVectorReplicate(-2.0f * t3 + 3.0f * t2); + XMVECTOR T1 = XMVectorReplicate(t3 - t2); + + XMVECTOR Result = XMVectorMultiply(P0, Position0); + Result = XMVectorMultiplyAdd(T0, Tangent0, Result); + Result = XMVectorMultiplyAdd(P1, Position1, Result); + Result = XMVectorMultiplyAdd(T1, Tangent1, Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float t2 = t * t; + float t3 = t * t2; + + float p0 = 2.0f * t3 - 3.0f * t2 + 1.0f; + float t0 = t3 - 2.0f * t2 + t; + float p1 = -2.0f * t3 + 3.0f * t2; + float t1 = t3 - t2; + + XMVECTOR vResult = vmulq_n_f32(Position0, p0); + vResult = vmlaq_n_f32(vResult, Tangent0, t0); + vResult = vmlaq_n_f32(vResult, Position1, p1); + vResult = vmlaq_n_f32(vResult, Tangent1, t1); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + float t2 = t * t; + float t3 = t * t2; + + XMVECTOR P0 = _mm_set_ps1(2.0f * t3 - 3.0f * t2 + 1.0f); + XMVECTOR T0 = _mm_set_ps1(t3 - 2.0f * t2 + t); + XMVECTOR P1 = _mm_set_ps1(-2.0f * t3 + 3.0f * t2); + XMVECTOR T1 = _mm_set_ps1(t3 - t2); + + XMVECTOR vResult = _mm_mul_ps(P0, Position0); + vResult = XM_FMADD_PS(Tangent0, T0, vResult); + vResult = XM_FMADD_PS(Position1, P1, vResult); + vResult = XM_FMADD_PS(Tangent1, T1, vResult); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorHermiteV +( + FXMVECTOR Position0, + FXMVECTOR Tangent0, + FXMVECTOR Position1, + GXMVECTOR Tangent1, + HXMVECTOR T +) noexcept +{ + // Result = (2 * t^3 - 3 * t^2 + 1) * Position0 + + // (t^3 - 2 * t^2 + t) * Tangent0 + + // (-2 * t^3 + 3 * t^2) * Position1 + + // (t^3 - t^2) * Tangent1 + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR T2 = XMVectorMultiply(T, T); + XMVECTOR T3 = XMVectorMultiply(T, T2); + + XMVECTOR P0 = XMVectorReplicate(2.0f * T3.vector4_f32[0] - 3.0f * T2.vector4_f32[0] + 1.0f); + XMVECTOR T0 = XMVectorReplicate(T3.vector4_f32[1] - 2.0f * T2.vector4_f32[1] + T.vector4_f32[1]); + XMVECTOR P1 = XMVectorReplicate(-2.0f * T3.vector4_f32[2] + 3.0f * T2.vector4_f32[2]); + XMVECTOR T1 = XMVectorReplicate(T3.vector4_f32[3] - T2.vector4_f32[3]); + + XMVECTOR Result = XMVectorMultiply(P0, Position0); + Result = XMVectorMultiplyAdd(T0, Tangent0, Result); + Result = XMVectorMultiplyAdd(P1, Position1, Result); + Result = XMVectorMultiplyAdd(T1, Tangent1, Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 CatMulT2 = { { { -3.0f, -2.0f, 3.0f, -1.0f } 
} }; + static const XMVECTORF32 CatMulT3 = { { { 2.0f, 1.0f, -2.0f, 1.0f } } }; + + XMVECTOR T2 = vmulq_f32(T, T); + XMVECTOR T3 = vmulq_f32(T, T2); + // Mul by the constants against t^2 + T2 = vmulq_f32(T2, CatMulT2); + // Mul by the constants against t^3 + T3 = vmlaq_f32(T2, T3, CatMulT3); + // T3 now has the pre-result. + // I need to add t.y only + T2 = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T), g_XMMaskY)); + T3 = vaddq_f32(T3, T2); + // Add 1.0f to x + T3 = vaddq_f32(T3, g_XMIdentityR0); + // Now, I have the constants created + // Mul the x constant to Position0 + XMVECTOR vResult = vmulq_lane_f32(Position0, vget_low_f32(T3), 0); // T3[0] + // Mul the y constant to Tangent0 + vResult = vmlaq_lane_f32(vResult, Tangent0, vget_low_f32(T3), 1); // T3[1] + // Mul the z constant to Position1 + vResult = vmlaq_lane_f32(vResult, Position1, vget_high_f32(T3), 0); // T3[2] + // Mul the w constant to Tangent1 + vResult = vmlaq_lane_f32(vResult, Tangent1, vget_high_f32(T3), 1); // T3[3] + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 CatMulT2 = { { { -3.0f, -2.0f, 3.0f, -1.0f } } }; + static const XMVECTORF32 CatMulT3 = { { { 2.0f, 1.0f, -2.0f, 1.0f } } }; + + XMVECTOR T2 = _mm_mul_ps(T, T); + XMVECTOR T3 = _mm_mul_ps(T, T2); + // Mul by the constants against t^2 + T2 = _mm_mul_ps(T2, CatMulT2); + // Mul by the constants against t^3 + T3 = XM_FMADD_PS(T3, CatMulT3, T2); + // T3 now has the pre-result. + // I need to add t.y only + T2 = _mm_and_ps(T, g_XMMaskY); + T3 = _mm_add_ps(T3, T2); + // Add 1.0f to x + T3 = _mm_add_ps(T3, g_XMIdentityR0); + // Now, I have the constants created + // Mul the x constant to Position0 + XMVECTOR vResult = XM_PERMUTE_PS(T3, _MM_SHUFFLE(0, 0, 0, 0)); + vResult = _mm_mul_ps(vResult, Position0); + // Mul the y constant to Tangent0 + T2 = XM_PERMUTE_PS(T3, _MM_SHUFFLE(1, 1, 1, 1)); + vResult = XM_FMADD_PS(T2, Tangent0, vResult); + // Mul the z constant to Position1 + T2 = XM_PERMUTE_PS(T3, _MM_SHUFFLE(2, 2, 2, 2)); + vResult = XM_FMADD_PS(T2, Position1, vResult); + // Mul the w constant to Tangent1 + T3 = XM_PERMUTE_PS(T3, _MM_SHUFFLE(3, 3, 3, 3)); + vResult = XM_FMADD_PS(T3, Tangent1, vResult); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorCatmullRom +( + FXMVECTOR Position0, + FXMVECTOR Position1, + FXMVECTOR Position2, + GXMVECTOR Position3, + float t +) noexcept +{ + // Result = ((-t^3 + 2 * t^2 - t) * Position0 + + // (3 * t^3 - 5 * t^2 + 2) * Position1 + + // (-3 * t^3 + 4 * t^2 + t) * Position2 + + // (t^3 - t^2) * Position3) * 0.5 + +#if defined(_XM_NO_INTRINSICS_) + + float t2 = t * t; + float t3 = t * t2; + + XMVECTOR P0 = XMVectorReplicate((-t3 + 2.0f * t2 - t) * 0.5f); + XMVECTOR P1 = XMVectorReplicate((3.0f * t3 - 5.0f * t2 + 2.0f) * 0.5f); + XMVECTOR P2 = XMVectorReplicate((-3.0f * t3 + 4.0f * t2 + t) * 0.5f); + XMVECTOR P3 = XMVectorReplicate((t3 - t2) * 0.5f); + + XMVECTOR Result = XMVectorMultiply(P0, Position0); + Result = XMVectorMultiplyAdd(P1, Position1, Result); + Result = XMVectorMultiplyAdd(P2, Position2, Result); + Result = XMVectorMultiplyAdd(P3, Position3, Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float t2 = t * t; + float t3 = t * t2; + + float p0 = (-t3 + 2.0f * t2 - t) * 0.5f; + float p1 = (3.0f * t3 - 5.0f * t2 + 2.0f) * 0.5f; + float p2 = (-3.0f * t3 + 4.0f * t2 + t) * 0.5f; + float p3 = (t3 - t2) * 0.5f; + + XMVECTOR P1 = vmulq_n_f32(Position1, p1); + 
XMVECTOR P0 = vmlaq_n_f32(P1, Position0, p0); + XMVECTOR P3 = vmulq_n_f32(Position3, p3); + XMVECTOR P2 = vmlaq_n_f32(P3, Position2, p2); + P0 = vaddq_f32(P0, P2); + return P0; +#elif defined(_XM_SSE_INTRINSICS_) + float t2 = t * t; + float t3 = t * t2; + + XMVECTOR P0 = _mm_set_ps1((-t3 + 2.0f * t2 - t) * 0.5f); + XMVECTOR P1 = _mm_set_ps1((3.0f * t3 - 5.0f * t2 + 2.0f) * 0.5f); + XMVECTOR P2 = _mm_set_ps1((-3.0f * t3 + 4.0f * t2 + t) * 0.5f); + XMVECTOR P3 = _mm_set_ps1((t3 - t2) * 0.5f); + + P1 = _mm_mul_ps(Position1, P1); + P0 = XM_FMADD_PS(Position0, P0, P1); + P3 = _mm_mul_ps(Position3, P3); + P2 = XM_FMADD_PS(Position2, P2, P3); + P0 = _mm_add_ps(P0, P2); + return P0; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorCatmullRomV +( + FXMVECTOR Position0, + FXMVECTOR Position1, + FXMVECTOR Position2, + GXMVECTOR Position3, + HXMVECTOR T +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + float fx = T.vector4_f32[0]; + float fy = T.vector4_f32[1]; + float fz = T.vector4_f32[2]; + float fw = T.vector4_f32[3]; + XMVECTORF32 vResult = { { { + 0.5f * ((-fx * fx * fx + 2 * fx * fx - fx) * Position0.vector4_f32[0] + + (3 * fx * fx * fx - 5 * fx * fx + 2) * Position1.vector4_f32[0] + + (-3 * fx * fx * fx + 4 * fx * fx + fx) * Position2.vector4_f32[0] + + (fx * fx * fx - fx * fx) * Position3.vector4_f32[0]), + + 0.5f * ((-fy * fy * fy + 2 * fy * fy - fy) * Position0.vector4_f32[1] + + (3 * fy * fy * fy - 5 * fy * fy + 2) * Position1.vector4_f32[1] + + (-3 * fy * fy * fy + 4 * fy * fy + fy) * Position2.vector4_f32[1] + + (fy * fy * fy - fy * fy) * Position3.vector4_f32[1]), + + 0.5f * ((-fz * fz * fz + 2 * fz * fz - fz) * Position0.vector4_f32[2] + + (3 * fz * fz * fz - 5 * fz * fz + 2) * Position1.vector4_f32[2] + + (-3 * fz * fz * fz + 4 * fz * fz + fz) * Position2.vector4_f32[2] + + (fz * fz * fz - fz * fz) * Position3.vector4_f32[2]), + + 0.5f * ((-fw * fw * fw + 2 * fw * fw - fw) * Position0.vector4_f32[3] + + (3 * fw * fw * fw - 5 * fw * fw + 2) * Position1.vector4_f32[3] + + (-3 * fw * fw * fw + 4 * fw * fw + fw) * Position2.vector4_f32[3] + + (fw * fw * fw - fw * fw) * Position3.vector4_f32[3]) + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 Catmul2 = { { { 2.0f, 2.0f, 2.0f, 2.0f } } }; + static const XMVECTORF32 Catmul3 = { { { 3.0f, 3.0f, 3.0f, 3.0f } } }; + static const XMVECTORF32 Catmul4 = { { { 4.0f, 4.0f, 4.0f, 4.0f } } }; + static const XMVECTORF32 Catmul5 = { { { 5.0f, 5.0f, 5.0f, 5.0f } } }; + // Cache T^2 and T^3 + XMVECTOR T2 = vmulq_f32(T, T); + XMVECTOR T3 = vmulq_f32(T, T2); + // Perform the Position0 term + XMVECTOR vResult = vaddq_f32(T2, T2); + vResult = vsubq_f32(vResult, T); + vResult = vsubq_f32(vResult, T3); + vResult = vmulq_f32(vResult, Position0); + // Perform the Position1 term and add + XMVECTOR vTemp = vmulq_f32(T3, Catmul3); + vTemp = vmlsq_f32(vTemp, T2, Catmul5); + vTemp = vaddq_f32(vTemp, Catmul2); + vResult = vmlaq_f32(vResult, vTemp, Position1); + // Perform the Position2 term and add + vTemp = vmulq_f32(T2, Catmul4); + vTemp = vmlsq_f32(vTemp, T3, Catmul3); + vTemp = vaddq_f32(vTemp, T); + vResult = vmlaq_f32(vResult, vTemp, Position2); + // Position3 is the last term + T3 = vsubq_f32(T3, T2); + vResult = vmlaq_f32(vResult, T3, Position3); + // Multiply by 0.5f and exit + vResult = vmulq_f32(vResult, g_XMOneHalf); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Catmul2 = { { { 2.0f, 2.0f, 
2.0f, 2.0f } } }; + static const XMVECTORF32 Catmul3 = { { { 3.0f, 3.0f, 3.0f, 3.0f } } }; + static const XMVECTORF32 Catmul4 = { { { 4.0f, 4.0f, 4.0f, 4.0f } } }; + static const XMVECTORF32 Catmul5 = { { { 5.0f, 5.0f, 5.0f, 5.0f } } }; + // Cache T^2 and T^3 + XMVECTOR T2 = _mm_mul_ps(T, T); + XMVECTOR T3 = _mm_mul_ps(T, T2); + // Perform the Position0 term + XMVECTOR vResult = _mm_add_ps(T2, T2); + vResult = _mm_sub_ps(vResult, T); + vResult = _mm_sub_ps(vResult, T3); + vResult = _mm_mul_ps(vResult, Position0); + // Perform the Position1 term and add + XMVECTOR vTemp = _mm_mul_ps(T3, Catmul3); + vTemp = XM_FNMADD_PS(T2, Catmul5, vTemp); + vTemp = _mm_add_ps(vTemp, Catmul2); + vResult = XM_FMADD_PS(vTemp, Position1, vResult); + // Perform the Position2 term and add + vTemp = _mm_mul_ps(T2, Catmul4); + vTemp = XM_FNMADD_PS(T3, Catmul3, vTemp); + vTemp = _mm_add_ps(vTemp, T); + vResult = XM_FMADD_PS(vTemp, Position2, vResult); + // Position3 is the last term + T3 = _mm_sub_ps(T3, T2); + vResult = XM_FMADD_PS(T3, Position3, vResult); + // Multiply by 0.5f and exit + vResult = _mm_mul_ps(vResult, g_XMOneHalf); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorBaryCentric +( + FXMVECTOR Position0, + FXMVECTOR Position1, + FXMVECTOR Position2, + float f, + float g +) noexcept +{ + // Result = Position0 + f * (Position1 - Position0) + g * (Position2 - Position0) + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR P10 = XMVectorSubtract(Position1, Position0); + XMVECTOR ScaleF = XMVectorReplicate(f); + + XMVECTOR P20 = XMVectorSubtract(Position2, Position0); + XMVECTOR ScaleG = XMVectorReplicate(g); + + XMVECTOR Result = XMVectorMultiplyAdd(P10, ScaleF, Position0); + Result = XMVectorMultiplyAdd(P20, ScaleG, Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR R1 = vsubq_f32(Position1, Position0); + XMVECTOR R2 = vsubq_f32(Position2, Position0); + R1 = vmlaq_n_f32(Position0, R1, f); + return vmlaq_n_f32(R1, R2, g); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR R1 = _mm_sub_ps(Position1, Position0); + XMVECTOR R2 = _mm_sub_ps(Position2, Position0); + XMVECTOR SF = _mm_set_ps1(f); + R1 = XM_FMADD_PS(R1, SF, Position0); + XMVECTOR SG = _mm_set_ps1(g); + return XM_FMADD_PS(R2, SG, R1); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorBaryCentricV +( + FXMVECTOR Position0, + FXMVECTOR Position1, + FXMVECTOR Position2, + GXMVECTOR F, + HXMVECTOR G +) noexcept +{ + // Result = Position0 + f * (Position1 - Position0) + g * (Position2 - Position0) + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR P10 = XMVectorSubtract(Position1, Position0); + XMVECTOR P20 = XMVectorSubtract(Position2, Position0); + + XMVECTOR Result = XMVectorMultiplyAdd(P10, F, Position0); + Result = XMVectorMultiplyAdd(P20, G, Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR R1 = vsubq_f32(Position1, Position0); + XMVECTOR R2 = vsubq_f32(Position2, Position0); + R1 = vmlaq_f32(Position0, R1, F); + return vmlaq_f32(R1, R2, G); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR R1 = _mm_sub_ps(Position1, Position0); + XMVECTOR R2 = _mm_sub_ps(Position2, Position0); + R1 = XM_FMADD_PS(R1, F, Position0); + return XM_FMADD_PS(R2, G, R1); +#endif +} + +/**************************************************************************** + * + * 2D Vector + * + 
****************************************************************************/ + + //------------------------------------------------------------------------------ + // Comparison operations + //------------------------------------------------------------------------------ + + //------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2Equal +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] == V2.vector4_f32[0]) && (V1.vector4_f32[1] == V2.vector4_f32[1])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vceq_f32(vget_low_f32(V1), vget_low_f32(V2)); + return (vget_lane_u64(vreinterpret_u64_u32(vTemp), 0) == 0xFFFFFFFFFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpeq_ps(V1, V2); + // z and w are don't care + return (((_mm_movemask_ps(vTemp) & 3) == 3) != 0); +#endif +} + + +//------------------------------------------------------------------------------ + +inline uint32_t XM_CALLCONV XMVector2EqualR +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + uint32_t CR = 0; + if ((V1.vector4_f32[0] == V2.vector4_f32[0]) && + (V1.vector4_f32[1] == V2.vector4_f32[1])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_f32[0] != V2.vector4_f32[0]) && + (V1.vector4_f32[1] != V2.vector4_f32[1])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vceq_f32(vget_low_f32(V1), vget_low_f32(V2)); + uint64_t r = vget_lane_u64(vreinterpret_u64_u32(vTemp), 0); + uint32_t CR = 0; + if (r == 0xFFFFFFFFFFFFFFFFU) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!r) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpeq_ps(V1, V2); + // z and w are don't care + int iTest = _mm_movemask_ps(vTemp) & 3; + uint32_t CR = 0; + if (iTest == 3) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2EqualInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_u32[0] == V2.vector4_u32[0]) && (V1.vector4_u32[1] == V2.vector4_u32[1])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vceq_u32(vget_low_u32(vreinterpretq_u32_f32(V1)), vget_low_u32(vreinterpretq_u32_f32(V2))); + return (vget_lane_u64(vreinterpret_u64_u32(vTemp), 0) == 0xFFFFFFFFFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2)); + return (((_mm_movemask_ps(_mm_castsi128_ps(vTemp)) & 3) == 3) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline uint32_t XM_CALLCONV XMVector2EqualIntR +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + uint32_t CR = 0; + if ((V1.vector4_u32[0] == V2.vector4_u32[0]) && + (V1.vector4_u32[1] == V2.vector4_u32[1])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_u32[0] != V2.vector4_u32[0]) && + (V1.vector4_u32[1] != V2.vector4_u32[1])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vceq_u32(vget_low_u32(vreinterpretq_u32_f32(V1)), vget_low_u32(vreinterpretq_u32_f32(V2))); + uint64_t r = vget_lane_u64(vreinterpret_u64_u32(vTemp), 0); + 
uint32_t CR = 0; + if (r == 0xFFFFFFFFFFFFFFFFU) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!r) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2)); + int iTest = _mm_movemask_ps(_mm_castsi128_ps(vTemp)) & 3; + uint32_t CR = 0; + if (iTest == 3) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2NearEqual +( + FXMVECTOR V1, + FXMVECTOR V2, + FXMVECTOR Epsilon +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + float dx = fabsf(V1.vector4_f32[0] - V2.vector4_f32[0]); + float dy = fabsf(V1.vector4_f32[1] - V2.vector4_f32[1]); + return ((dx <= Epsilon.vector4_f32[0]) && + (dy <= Epsilon.vector4_f32[1])); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t vDelta = vsub_f32(vget_low_f32(V1), vget_low_f32(V2)); +#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES) + uint32x2_t vTemp = vacle_f32(vDelta, vget_low_u32(Epsilon)); +#else + uint32x2_t vTemp = vcle_f32(vabs_f32(vDelta), vget_low_f32(Epsilon)); +#endif + uint64_t r = vget_lane_u64(vreinterpret_u64_u32(vTemp), 0); + return (r == 0xFFFFFFFFFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + // Get the difference + XMVECTOR vDelta = _mm_sub_ps(V1, V2); + // Get the absolute value of the difference + XMVECTOR vTemp = _mm_setzero_ps(); + vTemp = _mm_sub_ps(vTemp, vDelta); + vTemp = _mm_max_ps(vTemp, vDelta); + vTemp = _mm_cmple_ps(vTemp, Epsilon); + // z and w are don't care + return (((_mm_movemask_ps(vTemp) & 3) == 0x3) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2NotEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] != V2.vector4_f32[0]) || (V1.vector4_f32[1] != V2.vector4_f32[1])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vceq_f32(vget_low_f32(V1), vget_low_f32(V2)); + return (vget_lane_u64(vreinterpret_u64_u32(vTemp), 0) != 0xFFFFFFFFFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpeq_ps(V1, V2); + // z and w are don't care + return (((_mm_movemask_ps(vTemp) & 3) != 3) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2NotEqualInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_u32[0] != V2.vector4_u32[0]) || (V1.vector4_u32[1] != V2.vector4_u32[1])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vceq_u32(vget_low_u32(vreinterpretq_u32_f32(V1)), vget_low_u32(vreinterpretq_u32_f32(V2))); + return (vget_lane_u64(vreinterpret_u64_u32(vTemp), 0) != 0xFFFFFFFFFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2)); + return (((_mm_movemask_ps(_mm_castsi128_ps(vTemp)) & 3) != 3) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2Greater +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] > V2.vector4_f32[0]) && (V1.vector4_f32[1] > V2.vector4_f32[1])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = 
vcgt_f32(vget_low_f32(V1), vget_low_f32(V2)); + return (vget_lane_u64(vreinterpret_u64_u32(vTemp), 0) == 0xFFFFFFFFFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpgt_ps(V1, V2); + // z and w are don't care + return (((_mm_movemask_ps(vTemp) & 3) == 3) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline uint32_t XM_CALLCONV XMVector2GreaterR +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + uint32_t CR = 0; + if ((V1.vector4_f32[0] > V2.vector4_f32[0]) && + (V1.vector4_f32[1] > V2.vector4_f32[1])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_f32[0] <= V2.vector4_f32[0]) && + (V1.vector4_f32[1] <= V2.vector4_f32[1])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vcgt_f32(vget_low_f32(V1), vget_low_f32(V2)); + uint64_t r = vget_lane_u64(vreinterpret_u64_u32(vTemp), 0); + uint32_t CR = 0; + if (r == 0xFFFFFFFFFFFFFFFFU) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!r) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpgt_ps(V1, V2); + int iTest = _mm_movemask_ps(vTemp) & 3; + uint32_t CR = 0; + if (iTest == 3) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2GreaterOrEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] >= V2.vector4_f32[0]) && (V1.vector4_f32[1] >= V2.vector4_f32[1])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vcge_f32(vget_low_f32(V1), vget_low_f32(V2)); + return (vget_lane_u64(vreinterpret_u64_u32(vTemp), 0) == 0xFFFFFFFFFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpge_ps(V1, V2); + return (((_mm_movemask_ps(vTemp) & 3) == 3) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline uint32_t XM_CALLCONV XMVector2GreaterOrEqualR +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + uint32_t CR = 0; + if ((V1.vector4_f32[0] >= V2.vector4_f32[0]) && + (V1.vector4_f32[1] >= V2.vector4_f32[1])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_f32[0] < V2.vector4_f32[0]) && + (V1.vector4_f32[1] < V2.vector4_f32[1])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vcge_f32(vget_low_f32(V1), vget_low_f32(V2)); + uint64_t r = vget_lane_u64(vreinterpret_u64_u32(vTemp), 0); + uint32_t CR = 0; + if (r == 0xFFFFFFFFFFFFFFFFU) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!r) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpge_ps(V1, V2); + int iTest = _mm_movemask_ps(vTemp) & 3; + uint32_t CR = 0; + if (iTest == 3) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2Less +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] < V2.vector4_f32[0]) && (V1.vector4_f32[1] < V2.vector4_f32[1])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vclt_f32(vget_low_f32(V1), 
vget_low_f32(V2)); + return (vget_lane_u64(vreinterpret_u64_u32(vTemp), 0) == 0xFFFFFFFFFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmplt_ps(V1, V2); + return (((_mm_movemask_ps(vTemp) & 3) == 3) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2LessOrEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] <= V2.vector4_f32[0]) && (V1.vector4_f32[1] <= V2.vector4_f32[1])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vcle_f32(vget_low_f32(V1), vget_low_f32(V2)); + return (vget_lane_u64(vreinterpret_u64_u32(vTemp), 0) == 0xFFFFFFFFFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmple_ps(V1, V2); + return (((_mm_movemask_ps(vTemp) & 3) == 3) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2InBounds +( + FXMVECTOR V, + FXMVECTOR Bounds +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) && + (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); + float32x2_t B = vget_low_f32(Bounds); + // Test if less than or equal + uint32x2_t ivTemp1 = vcle_f32(VL, B); + // Negate the bounds + float32x2_t vTemp2 = vneg_f32(B); + // Test if greater or equal (Reversed) + uint32x2_t ivTemp2 = vcle_f32(vTemp2, VL); + // Blend answers + ivTemp1 = vand_u32(ivTemp1, ivTemp2); + // x and y in bounds? + return (vget_lane_u64(vreinterpret_u64_u32(ivTemp1), 0) == 0xFFFFFFFFFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + // Test if less than or equal + XMVECTOR vTemp1 = _mm_cmple_ps(V, Bounds); + // Negate the bounds + XMVECTOR vTemp2 = _mm_mul_ps(Bounds, g_XMNegativeOne); + // Test if greater or equal (Reversed) + vTemp2 = _mm_cmple_ps(vTemp2, V); + // Blend answers + vTemp1 = _mm_and_ps(vTemp1, vTemp2); + // x and y in bounds? (z and w are don't care) + return (((_mm_movemask_ps(vTemp1) & 0x3) == 0x3) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +#if !defined(_XM_NO_INTRINSICS_) && defined(_MSC_VER) && !defined(__clang__) && !defined(__INTEL_COMPILER) +#pragma float_control(push) +#pragma float_control(precise, on) +#endif + +inline bool XM_CALLCONV XMVector2IsNaN(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (XMISNAN(V.vector4_f32[0]) || + XMISNAN(V.vector4_f32[1])); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); + // Test against itself. NaN is always not equal + uint32x2_t vTempNan = vceq_f32(VL, VL); + // If x or y are NaN, the mask is zero + return (vget_lane_u64(vreinterpret_u64_u32(vTempNan), 0) != 0xFFFFFFFFFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + // Test against itself. 
NaN is always not equal + XMVECTOR vTempNan = _mm_cmpneq_ps(V, V); + // If x or y are NaN, the mask is non-zero + return ((_mm_movemask_ps(vTempNan) & 3) != 0); +#endif +} + +#if !defined(_XM_NO_INTRINSICS_) && defined(_MSC_VER) && !defined(__clang__) && !defined(__INTEL_COMPILER) +#pragma float_control(pop) +#endif + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2IsInfinite(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + return (XMISINF(V.vector4_f32[0]) || + XMISINF(V.vector4_f32[1])); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Mask off the sign bit + uint32x2_t vTemp = vand_u32(vget_low_u32(vreinterpretq_u32_f32(V)), vget_low_u32(g_XMAbsMask)); + // Compare to infinity + vTemp = vceq_f32(vreinterpret_f32_u32(vTemp), vget_low_f32(g_XMInfinity)); + // If any are infinity, the signs are true. + return vget_lane_u64(vreinterpret_u64_u32(vTemp), 0) != 0; +#elif defined(_XM_SSE_INTRINSICS_) + // Mask off the sign bit + __m128 vTemp = _mm_and_ps(V, g_XMAbsMask); + // Compare to infinity + vTemp = _mm_cmpeq_ps(vTemp, g_XMInfinity); + // If x or z are infinity, the signs are true. + return ((_mm_movemask_ps(vTemp) & 3) != 0); +#endif +} + +//------------------------------------------------------------------------------ +// Computation operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2Dot +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORF32 Result; + Result.f[0] = + Result.f[1] = + Result.f[2] = + Result.f[3] = V1.vector4_f32[0] * V2.vector4_f32[0] + V1.vector4_f32[1] * V2.vector4_f32[1]; + return Result.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Perform the dot product on x and y + float32x2_t vTemp = vmul_f32(vget_low_f32(V1), vget_low_f32(V2)); + vTemp = vpadd_f32(vTemp, vTemp); + return vcombine_f32(vTemp, vTemp); +#elif defined(_XM_SSE4_INTRINSICS_) + return _mm_dp_ps(V1, V2, 0x3f); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vDot = _mm_mul_ps(V1, V2); + vDot = _mm_hadd_ps(vDot, vDot); + vDot = _mm_moveldup_ps(vDot); + return vDot; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x and y + XMVECTOR vLengthSq = _mm_mul_ps(V1, V2); + // vTemp has y splatted + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 1, 1, 1)); + // x+y + vLengthSq = _mm_add_ss(vLengthSq, vTemp); + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2Cross +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ + // [ V1.x*V2.y - V1.y*V2.x, V1.x*V2.y - V1.y*V2.x ] + +#if defined(_XM_NO_INTRINSICS_) + float fCross = (V1.vector4_f32[0] * V2.vector4_f32[1]) - (V1.vector4_f32[1] * V2.vector4_f32[0]); + XMVECTORF32 vResult; + vResult.f[0] = + vResult.f[1] = + vResult.f[2] = + vResult.f[3] = fCross; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 Negate = { { { 1.f, -1.f, 0, 0 } } }; + + float32x2_t vTemp = vmul_f32(vget_low_f32(V1), vrev64_f32(vget_low_f32(V2))); + vTemp = vmul_f32(vTemp, vget_low_f32(Negate)); + vTemp = vpadd_f32(vTemp, vTemp); + return vcombine_f32(vTemp, vTemp); +#elif defined(_XM_SSE_INTRINSICS_) + // Swap x and y + XMVECTOR vResult = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 1, 0, 1)); 
+ // Perform the muls + vResult = _mm_mul_ps(vResult, V1); + // Splat y + XMVECTOR vTemp = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(1, 1, 1, 1)); + // Sub the values + vResult = _mm_sub_ss(vResult, vTemp); + // Splat the cross product + vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(0, 0, 0, 0)); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2LengthSq(FXMVECTOR V) noexcept +{ + return XMVector2Dot(V, V); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2ReciprocalLengthEst(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result = XMVector2LengthSq(V); + Result = XMVectorReciprocalSqrtEst(Result); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); + // Dot2 + float32x2_t vTemp = vmul_f32(VL, VL); + vTemp = vpadd_f32(vTemp, vTemp); + // Reciprocal sqrt (estimate) + vTemp = vrsqrte_f32(vTemp); + return vcombine_f32(vTemp, vTemp); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps(V, V, 0x3f); + return _mm_rsqrt_ps(vTemp); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + XMVECTOR vTemp = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_rsqrt_ss(vTemp); + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x and y + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + // vTemp has y splatted + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 1, 1, 1)); + // x+y + vLengthSq = _mm_add_ss(vLengthSq, vTemp); + vLengthSq = _mm_rsqrt_ss(vLengthSq); + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2ReciprocalLength(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result = XMVector2LengthSq(V); + Result = XMVectorReciprocalSqrt(Result); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); + // Dot2 + float32x2_t vTemp = vmul_f32(VL, VL); + vTemp = vpadd_f32(vTemp, vTemp); + // Reciprocal sqrt + float32x2_t S0 = vrsqrte_f32(vTemp); + float32x2_t P0 = vmul_f32(vTemp, S0); + float32x2_t R0 = vrsqrts_f32(P0, S0); + float32x2_t S1 = vmul_f32(S0, R0); + float32x2_t P1 = vmul_f32(vTemp, S1); + float32x2_t R1 = vrsqrts_f32(P1, S1); + float32x2_t Result = vmul_f32(S1, R1); + return vcombine_f32(Result, Result); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps(V, V, 0x3f); + XMVECTOR vLengthSq = _mm_sqrt_ps(vTemp); + return _mm_div_ps(g_XMOne, vLengthSq); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + XMVECTOR vTemp = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_sqrt_ss(vTemp); + vLengthSq = _mm_div_ss(g_XMOne, vLengthSq); + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x and y + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + // vTemp has y splatted + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 1, 1, 1)); + // x+y + vLengthSq = _mm_add_ss(vLengthSq, vTemp); + vLengthSq = _mm_sqrt_ss(vLengthSq); + vLengthSq = _mm_div_ss(g_XMOne, vLengthSq); + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + return 
vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2LengthEst(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result = XMVector2LengthSq(V); + Result = XMVectorSqrtEst(Result); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); + // Dot2 + float32x2_t vTemp = vmul_f32(VL, VL); + vTemp = vpadd_f32(vTemp, vTemp); + const float32x2_t zero = vdup_n_f32(0); + uint32x2_t VEqualsZero = vceq_f32(vTemp, zero); + // Sqrt (estimate) + float32x2_t Result = vrsqrte_f32(vTemp); + Result = vmul_f32(vTemp, Result); + Result = vbsl_f32(VEqualsZero, zero, Result); + return vcombine_f32(Result, Result); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps(V, V, 0x3f); + return _mm_sqrt_ps(vTemp); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + XMVECTOR vTemp = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_sqrt_ss(vTemp); + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x and y + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + // vTemp has y splatted + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 1, 1, 1)); + // x+y + vLengthSq = _mm_add_ss(vLengthSq, vTemp); + vLengthSq = _mm_sqrt_ss(vLengthSq); + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2Length(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result = XMVector2LengthSq(V); + Result = XMVectorSqrt(Result); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); + // Dot2 + float32x2_t vTemp = vmul_f32(VL, VL); + vTemp = vpadd_f32(vTemp, vTemp); + const float32x2_t zero = vdup_n_f32(0); + uint32x2_t VEqualsZero = vceq_f32(vTemp, zero); + // Sqrt + float32x2_t S0 = vrsqrte_f32(vTemp); + float32x2_t P0 = vmul_f32(vTemp, S0); + float32x2_t R0 = vrsqrts_f32(P0, S0); + float32x2_t S1 = vmul_f32(S0, R0); + float32x2_t P1 = vmul_f32(vTemp, S1); + float32x2_t R1 = vrsqrts_f32(P1, S1); + float32x2_t Result = vmul_f32(S1, R1); + Result = vmul_f32(vTemp, Result); + Result = vbsl_f32(VEqualsZero, zero, Result); + return vcombine_f32(Result, Result); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps(V, V, 0x3f); + return _mm_sqrt_ps(vTemp); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + XMVECTOR vTemp = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_sqrt_ss(vTemp); + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x and y + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + // vTemp has y splatted + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 1, 1, 1)); + // x+y + vLengthSq = _mm_add_ss(vLengthSq, vTemp); + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + vLengthSq = _mm_sqrt_ps(vLengthSq); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ +// XMVector2NormalizeEst uses a reciprocal estimate and +// returns QNaN on zero and infinite vectors. 
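+// For full precision, or when the input may be zero-length, prefer
+// XMVector2Normalize below, which handles the zero and infinite cases explicitly.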
+ +inline XMVECTOR XM_CALLCONV XMVector2NormalizeEst(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result = XMVector2ReciprocalLength(V); + Result = XMVectorMultiply(V, Result); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); + // Dot2 + float32x2_t vTemp = vmul_f32(VL, VL); + vTemp = vpadd_f32(vTemp, vTemp); + // Reciprocal sqrt (estimate) + vTemp = vrsqrte_f32(vTemp); + // Normalize + float32x2_t Result = vmul_f32(VL, vTemp); + return vcombine_f32(Result, Result); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps(V, V, 0x3f); + XMVECTOR vResult = _mm_rsqrt_ps(vTemp); + return _mm_mul_ps(vResult, V); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_rsqrt_ss(vLengthSq); + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + vLengthSq = _mm_mul_ps(vLengthSq, V); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x and y + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + // vTemp has y splatted + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 1, 1, 1)); + // x+y + vLengthSq = _mm_add_ss(vLengthSq, vTemp); + vLengthSq = _mm_rsqrt_ss(vLengthSq); + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + vLengthSq = _mm_mul_ps(vLengthSq, V); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2Normalize(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR vResult = XMVector2Length(V); + float fLength = vResult.vector4_f32[0]; + + // Prevent divide by zero + if (fLength > 0) + { + fLength = 1.0f / fLength; + } + + vResult.vector4_f32[0] = V.vector4_f32[0] * fLength; + vResult.vector4_f32[1] = V.vector4_f32[1] * fLength; + vResult.vector4_f32[2] = V.vector4_f32[2] * fLength; + vResult.vector4_f32[3] = V.vector4_f32[3] * fLength; + return vResult; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); + // Dot2 + float32x2_t vTemp = vmul_f32(VL, VL); + vTemp = vpadd_f32(vTemp, vTemp); + uint32x2_t VEqualsZero = vceq_f32(vTemp, vdup_n_f32(0)); + uint32x2_t VEqualsInf = vceq_f32(vTemp, vget_low_f32(g_XMInfinity)); + // Reciprocal sqrt (2 iterations of Newton-Raphson) + float32x2_t S0 = vrsqrte_f32(vTemp); + float32x2_t P0 = vmul_f32(vTemp, S0); + float32x2_t R0 = vrsqrts_f32(P0, S0); + float32x2_t S1 = vmul_f32(S0, R0); + float32x2_t P1 = vmul_f32(vTemp, S1); + float32x2_t R1 = vrsqrts_f32(P1, S1); + vTemp = vmul_f32(S1, R1); + // Normalize + float32x2_t Result = vmul_f32(VL, vTemp); + Result = vbsl_f32(VEqualsZero, vdup_n_f32(0), Result); + Result = vbsl_f32(VEqualsInf, vget_low_f32(g_XMQNaN), Result); + return vcombine_f32(Result, Result); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vLengthSq = _mm_dp_ps(V, V, 0x3f); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Create zero with a single instruction + XMVECTOR vZeroMask = _mm_setzero_ps(); + // Test for a divide by zero (Must be FP to detect -0.0) + vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity); + // Reciprocal mul to perform the normalization + vResult = _mm_div_ps(V, vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult, vZeroMask); + 
// Select qnan or result based on infinite length + XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq, g_XMQNaN); + XMVECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq); + vResult = _mm_or_ps(vTemp1, vTemp2); + return vResult; +#elif defined(_XM_SSE3_INTRINSICS_) + // Perform the dot product on x and y only + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_moveldup_ps(vLengthSq); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Create zero with a single instruction + XMVECTOR vZeroMask = _mm_setzero_ps(); + // Test for a divide by zero (Must be FP to detect -0.0) + vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity); + // Reciprocal mul to perform the normalization + vResult = _mm_div_ps(V, vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult, vZeroMask); + // Select qnan or result based on infinite length + XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq, g_XMQNaN); + XMVECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq); + vResult = _mm_or_ps(vTemp1, vTemp2); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x and y only + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 1, 1, 1)); + vLengthSq = _mm_add_ss(vLengthSq, vTemp); + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Create zero with a single instruction + XMVECTOR vZeroMask = _mm_setzero_ps(); + // Test for a divide by zero (Must be FP to detect -0.0) + vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity); + // Reciprocal mul to perform the normalization + vResult = _mm_div_ps(V, vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult, vZeroMask); + // Select qnan or result based on infinite length + XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq, g_XMQNaN); + XMVECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq); + vResult = _mm_or_ps(vTemp1, vTemp2); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2ClampLength +( + FXMVECTOR V, + float LengthMin, + float LengthMax +) noexcept +{ + XMVECTOR ClampMax = XMVectorReplicate(LengthMax); + XMVECTOR ClampMin = XMVectorReplicate(LengthMin); + return XMVector2ClampLengthV(V, ClampMin, ClampMax); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2ClampLengthV +( + FXMVECTOR V, + FXMVECTOR LengthMin, + FXMVECTOR LengthMax +) noexcept +{ + assert((XMVectorGetY(LengthMin) == XMVectorGetX(LengthMin))); + assert((XMVectorGetY(LengthMax) == XMVectorGetX(LengthMax))); + assert(XMVector2GreaterOrEqual(LengthMin, g_XMZero)); + assert(XMVector2GreaterOrEqual(LengthMax, g_XMZero)); + assert(XMVector2GreaterOrEqual(LengthMax, LengthMin)); + + XMVECTOR LengthSq = XMVector2LengthSq(V); + + const XMVECTOR Zero = XMVectorZero(); + + XMVECTOR RcpLength = XMVectorReciprocalSqrt(LengthSq); + + XMVECTOR InfiniteLength = XMVectorEqualInt(LengthSq, g_XMInfinity.v); + XMVECTOR ZeroLength = XMVectorEqual(LengthSq, Zero); + + XMVECTOR Length = 
XMVectorMultiply(LengthSq, RcpLength); + + XMVECTOR Normal = XMVectorMultiply(V, RcpLength); + + XMVECTOR Select = XMVectorEqualInt(InfiniteLength, ZeroLength); + Length = XMVectorSelect(LengthSq, Length, Select); + Normal = XMVectorSelect(LengthSq, Normal, Select); + + XMVECTOR ControlMax = XMVectorGreater(Length, LengthMax); + XMVECTOR ControlMin = XMVectorLess(Length, LengthMin); + + XMVECTOR ClampLength = XMVectorSelect(Length, LengthMax, ControlMax); + ClampLength = XMVectorSelect(ClampLength, LengthMin, ControlMin); + + XMVECTOR Result = XMVectorMultiply(Normal, ClampLength); + + // Preserve the original vector (with no precision loss) if the length falls within the given range + XMVECTOR Control = XMVectorEqualInt(ControlMax, ControlMin); + Result = XMVectorSelect(Result, V, Control); + + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2Reflect +( + FXMVECTOR Incident, + FXMVECTOR Normal +) noexcept +{ + // Result = Incident - (2 * dot(Incident, Normal)) * Normal + + XMVECTOR Result; + Result = XMVector2Dot(Incident, Normal); + Result = XMVectorAdd(Result, Result); + Result = XMVectorNegativeMultiplySubtract(Result, Normal, Incident); + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2Refract +( + FXMVECTOR Incident, + FXMVECTOR Normal, + float RefractionIndex +) noexcept +{ + XMVECTOR Index = XMVectorReplicate(RefractionIndex); + return XMVector2RefractV(Incident, Normal, Index); +} + +//------------------------------------------------------------------------------ + +// Return the refraction of a 2D vector +inline XMVECTOR XM_CALLCONV XMVector2RefractV +( + FXMVECTOR Incident, + FXMVECTOR Normal, + FXMVECTOR RefractionIndex +) noexcept +{ + // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) + + // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal)))) + +#if defined(_XM_NO_INTRINSICS_) + + float IDotN = (Incident.vector4_f32[0] * Normal.vector4_f32[0]) + (Incident.vector4_f32[1] * Normal.vector4_f32[1]); + // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) + float RY = 1.0f - (IDotN * IDotN); + float RX = 1.0f - (RY * RefractionIndex.vector4_f32[0] * RefractionIndex.vector4_f32[0]); + RY = 1.0f - (RY * RefractionIndex.vector4_f32[1] * RefractionIndex.vector4_f32[1]); + if (RX >= 0.0f) + { + RX = (RefractionIndex.vector4_f32[0] * Incident.vector4_f32[0]) - (Normal.vector4_f32[0] * ((RefractionIndex.vector4_f32[0] * IDotN) + sqrtf(RX))); + } + else + { + RX = 0.0f; + } + if (RY >= 0.0f) + { + RY = (RefractionIndex.vector4_f32[1] * Incident.vector4_f32[1]) - (Normal.vector4_f32[1] * ((RefractionIndex.vector4_f32[1] * IDotN) + sqrtf(RY))); + } + else + { + RY = 0.0f; + } + + XMVECTOR vResult; + vResult.vector4_f32[0] = RX; + vResult.vector4_f32[1] = RY; + vResult.vector4_f32[2] = 0.0f; + vResult.vector4_f32[3] = 0.0f; + return vResult; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t IL = vget_low_f32(Incident); + float32x2_t NL = vget_low_f32(Normal); + float32x2_t RIL = vget_low_f32(RefractionIndex); + // Get the 2D Dot product of Incident-Normal + float32x2_t vTemp = vmul_f32(IL, NL); + float32x2_t IDotN = vpadd_f32(vTemp, vTemp); + // vTemp = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) + vTemp = vmls_f32(vget_low_f32(g_XMOne), IDotN, IDotN); + vTemp = 
vmul_f32(vTemp, RIL); + vTemp = vmls_f32(vget_low_f32(g_XMOne), vTemp, RIL); + // If any terms are <=0, sqrt() will fail, punt to zero + uint32x2_t vMask = vcgt_f32(vTemp, vget_low_f32(g_XMZero)); + // Sqrt(vTemp) + float32x2_t S0 = vrsqrte_f32(vTemp); + float32x2_t P0 = vmul_f32(vTemp, S0); + float32x2_t R0 = vrsqrts_f32(P0, S0); + float32x2_t S1 = vmul_f32(S0, R0); + float32x2_t P1 = vmul_f32(vTemp, S1); + float32x2_t R1 = vrsqrts_f32(P1, S1); + float32x2_t S2 = vmul_f32(S1, R1); + vTemp = vmul_f32(vTemp, S2); + // R = RefractionIndex * IDotN + sqrt(R) + vTemp = vmla_f32(vTemp, RIL, IDotN); + // Result = RefractionIndex * Incident - Normal * R + float32x2_t vResult = vmul_f32(RIL, IL); + vResult = vmls_f32(vResult, vTemp, NL); + vResult = vreinterpret_f32_u32(vand_u32(vreinterpret_u32_f32(vResult), vMask)); + return vcombine_f32(vResult, vResult); +#elif defined(_XM_SSE_INTRINSICS_) + // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) + + // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal)))) + // Get the 2D Dot product of Incident-Normal + XMVECTOR IDotN = XMVector2Dot(Incident, Normal); + // vTemp = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) + XMVECTOR vTemp = XM_FNMADD_PS(IDotN, IDotN, g_XMOne); + vTemp = _mm_mul_ps(vTemp, RefractionIndex); + vTemp = XM_FNMADD_PS(vTemp, RefractionIndex, g_XMOne); + // If any terms are <=0, sqrt() will fail, punt to zero + XMVECTOR vMask = _mm_cmpgt_ps(vTemp, g_XMZero); + // R = RefractionIndex * IDotN + sqrt(R) + vTemp = _mm_sqrt_ps(vTemp); + vTemp = XM_FMADD_PS(RefractionIndex, IDotN, vTemp); + // Result = RefractionIndex * Incident - Normal * R + XMVECTOR vResult = _mm_mul_ps(RefractionIndex, Incident); + vResult = XM_FNMADD_PS(vTemp, Normal, vResult); + vResult = _mm_and_ps(vResult, vMask); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2Orthogonal(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORF32 Result = { { { + -V.vector4_f32[1], + V.vector4_f32[0], + 0.f, + 0.f + } } }; + return Result.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 Negate = { { { -1.f, 1.f, 0, 0 } } }; + const float32x2_t zero = vdup_n_f32(0); + + float32x2_t VL = vget_low_f32(V); + float32x2_t Result = vmul_f32(vrev64_f32(VL), vget_low_f32(Negate)); + return vcombine_f32(Result, zero); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 2, 0, 1)); + vResult = _mm_mul_ps(vResult, g_XMNegateX); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2AngleBetweenNormalsEst +( + FXMVECTOR N1, + FXMVECTOR N2 +) noexcept +{ + XMVECTOR Result = XMVector2Dot(N1, N2); + Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v); + Result = XMVectorACosEst(Result); + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2AngleBetweenNormals +( + FXMVECTOR N1, + FXMVECTOR N2 +) noexcept +{ + XMVECTOR Result = XMVector2Dot(N1, N2); + Result = XMVectorClamp(Result, g_XMNegativeOne, g_XMOne); + Result = XMVectorACos(Result); + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2AngleBetweenVectors +( + FXMVECTOR V1, + 
FXMVECTOR V2 +) noexcept +{ + XMVECTOR L1 = XMVector2ReciprocalLength(V1); + XMVECTOR L2 = XMVector2ReciprocalLength(V2); + + XMVECTOR Dot = XMVector2Dot(V1, V2); + + L1 = XMVectorMultiply(L1, L2); + + XMVECTOR CosAngle = XMVectorMultiply(Dot, L1); + CosAngle = XMVectorClamp(CosAngle, g_XMNegativeOne.v, g_XMOne.v); + + return XMVectorACos(CosAngle); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2LinePointDistance +( + FXMVECTOR LinePoint1, + FXMVECTOR LinePoint2, + FXMVECTOR Point +) noexcept +{ + // Given a vector PointVector from LinePoint1 to Point and a vector + // LineVector from LinePoint1 to LinePoint2, the scaled distance + // PointProjectionScale from LinePoint1 to the perpendicular projection + // of PointVector onto the line is defined as: + // + // PointProjectionScale = dot(PointVector, LineVector) / LengthSq(LineVector) + + XMVECTOR PointVector = XMVectorSubtract(Point, LinePoint1); + XMVECTOR LineVector = XMVectorSubtract(LinePoint2, LinePoint1); + + XMVECTOR LengthSq = XMVector2LengthSq(LineVector); + + XMVECTOR PointProjectionScale = XMVector2Dot(PointVector, LineVector); + PointProjectionScale = XMVectorDivide(PointProjectionScale, LengthSq); + + XMVECTOR DistanceVector = XMVectorMultiply(LineVector, PointProjectionScale); + DistanceVector = XMVectorSubtract(PointVector, DistanceVector); + + return XMVector2Length(DistanceVector); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2IntersectLine +( + FXMVECTOR Line1Point1, + FXMVECTOR Line1Point2, + FXMVECTOR Line2Point1, + GXMVECTOR Line2Point2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + + XMVECTOR V1 = XMVectorSubtract(Line1Point2, Line1Point1); + XMVECTOR V2 = XMVectorSubtract(Line2Point2, Line2Point1); + XMVECTOR V3 = XMVectorSubtract(Line1Point1, Line2Point1); + + XMVECTOR C1 = XMVector2Cross(V1, V2); + XMVECTOR C2 = XMVector2Cross(V2, V3); + + XMVECTOR Result; + const XMVECTOR Zero = XMVectorZero(); + if (XMVector2NearEqual(C1, Zero, g_XMEpsilon.v)) + { + if (XMVector2NearEqual(C2, Zero, g_XMEpsilon.v)) + { + // Coincident + Result = g_XMInfinity.v; + } + else + { + // Parallel + Result = g_XMQNaN.v; + } + } + else + { + // Intersection point = Line1Point1 + V1 * (C2 / C1) + XMVECTOR Scale = XMVectorReciprocal(C1); + Scale = XMVectorMultiply(C2, Scale); + Result = XMVectorMultiplyAdd(V1, Scale, Line1Point1); + } + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR V1 = _mm_sub_ps(Line1Point2, Line1Point1); + XMVECTOR V2 = _mm_sub_ps(Line2Point2, Line2Point1); + XMVECTOR V3 = _mm_sub_ps(Line1Point1, Line2Point1); + // Generate the cross products + XMVECTOR C1 = XMVector2Cross(V1, V2); + XMVECTOR C2 = XMVector2Cross(V2, V3); + // If C1 is not close to epsilon, use the calculated value + XMVECTOR vResultMask = _mm_setzero_ps(); + vResultMask = _mm_sub_ps(vResultMask, C1); + vResultMask = _mm_max_ps(vResultMask, C1); + // 0xFFFFFFFF if the calculated value is to be used + vResultMask = _mm_cmpgt_ps(vResultMask, g_XMEpsilon); + // If C1 is close to epsilon, which fail type is it? INFINITY or NAN? 
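+ // (|C1| <= epsilon means the two direction vectors are parallel; C2 then
+ // distinguishes the cases: |C2| <= epsilon means the lines are coincident,
+ // so INFINITY is selected, otherwise the lines never meet and QNaN is selected.)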
+ XMVECTOR vFailMask = _mm_setzero_ps(); + vFailMask = _mm_sub_ps(vFailMask, C2); + vFailMask = _mm_max_ps(vFailMask, C2); + vFailMask = _mm_cmple_ps(vFailMask, g_XMEpsilon); + XMVECTOR vFail = _mm_and_ps(vFailMask, g_XMInfinity); + vFailMask = _mm_andnot_ps(vFailMask, g_XMQNaN); + // vFail is NAN or INF + vFail = _mm_or_ps(vFail, vFailMask); + // Intersection point = Line1Point1 + V1 * (C2 / C1) + XMVECTOR vResult = _mm_div_ps(C2, C1); + vResult = XM_FMADD_PS(vResult, V1, Line1Point1); + // Use result, or failure value + vResult = _mm_and_ps(vResult, vResultMask); + vResultMask = _mm_andnot_ps(vResultMask, vFail); + vResult = _mm_or_ps(vResult, vResultMask); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2Transform +( + FXMVECTOR V, + FXMMATRIX M +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Y = XMVectorSplatY(V); + XMVECTOR X = XMVectorSplatX(V); + + XMVECTOR Result = XMVectorMultiplyAdd(Y, M.r[1], M.r[3]); + Result = XMVectorMultiplyAdd(X, M.r[0], Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); + float32x4_t Result = vmlaq_lane_f32(M.r[3], M.r[1], VL, 1); // Y + return vmlaq_lane_f32(Result, M.r[0], VL, 0); // X +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); // Y + vResult = XM_FMADD_PS(vResult, M.r[1], M.r[3]); + XMVECTOR vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); // X + vResult = XM_FMADD_PS(vTemp, M.r[0], vResult); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline XMFLOAT4* XM_CALLCONV XMVector2TransformStream +( + XMFLOAT4* pOutputStream, + size_t OutputStride, + const XMFLOAT2* pInputStream, + size_t InputStride, + size_t VectorCount, + FXMMATRIX M +) noexcept +{ + assert(pOutputStream != nullptr); + assert(pInputStream != nullptr); + + assert(InputStride >= sizeof(XMFLOAT2)); + _Analysis_assume_(InputStride >= sizeof(XMFLOAT2)); + + assert(OutputStride >= sizeof(XMFLOAT4)); + _Analysis_assume_(OutputStride >= sizeof(XMFLOAT4)); + +#if defined(_XM_NO_INTRINSICS_) + + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row3 = M.r[3]; + + for (size_t i = 0; i < VectorCount; i++) + { + XMVECTOR V = XMLoadFloat2(reinterpret_cast(pInputVector)); + XMVECTOR Y = XMVectorSplatY(V); + XMVECTOR X = XMVectorSplatX(V); + + XMVECTOR Result = XMVectorMultiplyAdd(Y, row1, row3); + Result = XMVectorMultiplyAdd(X, row0, Result); + +#ifdef _PREFAST_ +#pragma prefast(push) +#pragma prefast(disable : 26015, "PREfast noise: Esp:1307" ) +#endif + + XMStoreFloat4(reinterpret_cast(pOutputVector), Result); + +#ifdef _PREFAST_ +#pragma prefast(pop) +#endif + + pInputVector += InputStride; + pOutputVector += OutputStride; + } + + return pOutputStream; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row3 = M.r[3]; + + size_t i = 0; + size_t four = VectorCount >> 2; + if (four > 0) + { + if ((InputStride == sizeof(XMFLOAT2)) && (OutputStride == sizeof(XMFLOAT4))) + { + for (size_t j = 0; j < four; ++j) + { + float32x4x2_t V = 
vld2q_f32(reinterpret_cast(pInputVector)); + pInputVector += sizeof(XMFLOAT2) * 4; + + float32x2_t r3 = vget_low_f32(row3); + float32x2_t r = vget_low_f32(row0); + XMVECTOR vResult0 = vmlaq_lane_f32(vdupq_lane_f32(r3, 0), V.val[0], r, 0); // Ax+M + XMVECTOR vResult1 = vmlaq_lane_f32(vdupq_lane_f32(r3, 1), V.val[0], r, 1); // Bx+N + + XM_PREFETCH(pInputVector); + + r3 = vget_high_f32(row3); + r = vget_high_f32(row0); + XMVECTOR vResult2 = vmlaq_lane_f32(vdupq_lane_f32(r3, 0), V.val[0], r, 0); // Cx+O + XMVECTOR vResult3 = vmlaq_lane_f32(vdupq_lane_f32(r3, 1), V.val[0], r, 1); // Dx+P + + XM_PREFETCH(pInputVector + XM_CACHE_LINE_SIZE); + + r = vget_low_f32(row1); + vResult0 = vmlaq_lane_f32(vResult0, V.val[1], r, 0); // Ax+Ey+M + vResult1 = vmlaq_lane_f32(vResult1, V.val[1], r, 1); // Bx+Fy+N + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 2)); + + r = vget_high_f32(row1); + vResult2 = vmlaq_lane_f32(vResult2, V.val[1], r, 0); // Cx+Gy+O + vResult3 = vmlaq_lane_f32(vResult3, V.val[1], r, 1); // Dx+Hy+P + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 3)); + + float32x4x4_t R; + R.val[0] = vResult0; + R.val[1] = vResult1; + R.val[2] = vResult2; + R.val[3] = vResult3; + + vst4q_f32(reinterpret_cast(pOutputVector), R); + pOutputVector += sizeof(XMFLOAT4) * 4; + + i += 4; + } + } + } + + for (; i < VectorCount; i++) + { + float32x2_t V = vld1_f32(reinterpret_cast(pInputVector)); + pInputVector += InputStride; + + XMVECTOR vResult = vmlaq_lane_f32(row3, row0, V, 0); // X + vResult = vmlaq_lane_f32(vResult, row1, V, 1); // Y + + vst1q_f32(reinterpret_cast(pOutputVector), vResult); + pOutputVector += OutputStride; + } + + return pOutputStream; +#elif defined(_XM_AVX2_INTRINSICS_) + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + size_t i = 0; + size_t four = VectorCount >> 2; + if (four > 0) + { + __m256 row0 = _mm256_broadcast_ps(&M.r[0]); + __m256 row1 = _mm256_broadcast_ps(&M.r[1]); + __m256 row3 = _mm256_broadcast_ps(&M.r[3]); + + if (InputStride == sizeof(XMFLOAT2)) + { + if (OutputStride == sizeof(XMFLOAT4)) + { + if (!(reinterpret_cast(pOutputStream) & 0x1F)) + { + // Packed input, aligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m256 VV = _mm256_loadu_ps(reinterpret_cast(pInputVector)); + pInputVector += sizeof(XMFLOAT2) * 4; + + __m256 Y2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3)); + __m256 X2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2)); + __m256 Y1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1)); + __m256 X1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0)); + + __m256 vTempB = _mm256_fmadd_ps(Y1, row1, row3); + __m256 vTempB2 = _mm256_fmadd_ps(Y2, row1, row3); + __m256 vTempA = _mm256_mul_ps(X1, row0); + __m256 vTempA2 = _mm256_mul_ps(X2, row0); + vTempA = _mm256_add_ps(vTempA, vTempB); + vTempA2 = _mm256_add_ps(vTempA2, vTempB2); + + X1 = _mm256_insertf128_ps(vTempA, _mm256_castps256_ps128(vTempA2), 1); + XM256_STREAM_PS(reinterpret_cast(pOutputVector), X1); + pOutputVector += sizeof(XMFLOAT4) * 2; + + X2 = _mm256_insertf128_ps(vTempA2, _mm256_extractf128_ps(vTempA, 1), 0); + XM256_STREAM_PS(reinterpret_cast(pOutputVector), X2); + pOutputVector += sizeof(XMFLOAT4) * 2; + + i += 4; + } + } + else + { + // Packed input, packed output + for (size_t j = 0; j < four; ++j) + { + __m256 VV = _mm256_loadu_ps(reinterpret_cast(pInputVector)); + pInputVector += sizeof(XMFLOAT2) * 4; + + __m256 Y2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3)); + __m256 X2 = 
_mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2)); + __m256 Y1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1)); + __m256 X1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0)); + + __m256 vTempB = _mm256_fmadd_ps(Y1, row1, row3); + __m256 vTempB2 = _mm256_fmadd_ps(Y2, row1, row3); + __m256 vTempA = _mm256_mul_ps(X1, row0); + __m256 vTempA2 = _mm256_mul_ps(X2, row0); + vTempA = _mm256_add_ps(vTempA, vTempB); + vTempA2 = _mm256_add_ps(vTempA2, vTempB2); + + X1 = _mm256_insertf128_ps(vTempA, _mm256_castps256_ps128(vTempA2), 1); + _mm256_storeu_ps(reinterpret_cast(pOutputVector), X1); + pOutputVector += sizeof(XMFLOAT4) * 2; + + X2 = _mm256_insertf128_ps(vTempA2, _mm256_extractf128_ps(vTempA, 1), 0); + _mm256_storeu_ps(reinterpret_cast(pOutputVector), X2); + pOutputVector += sizeof(XMFLOAT4) * 2; + + i += 4; + } + } + } + else + { + // Packed input, unpacked output + for (size_t j = 0; j < four; ++j) + { + __m256 VV = _mm256_loadu_ps(reinterpret_cast(pInputVector)); + pInputVector += sizeof(XMFLOAT2) * 4; + + __m256 Y2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3)); + __m256 X2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2)); + __m256 Y1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1)); + __m256 X1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0)); + + __m256 vTempB = _mm256_fmadd_ps(Y1, row1, row3); + __m256 vTempB2 = _mm256_fmadd_ps(Y2, row1, row3); + __m256 vTempA = _mm256_mul_ps(X1, row0); + __m256 vTempA2 = _mm256_mul_ps(X2, row0); + vTempA = _mm256_add_ps(vTempA, vTempB); + vTempA2 = _mm256_add_ps(vTempA2, vTempB2); + + _mm_storeu_ps(reinterpret_cast(pOutputVector), _mm256_castps256_ps128(vTempA)); + pOutputVector += OutputStride; + + _mm_storeu_ps(reinterpret_cast(pOutputVector), _mm256_castps256_ps128(vTempA2)); + pOutputVector += OutputStride; + + _mm_storeu_ps(reinterpret_cast(pOutputVector), _mm256_extractf128_ps(vTempA, 1)); + pOutputVector += OutputStride; + + _mm_storeu_ps(reinterpret_cast(pOutputVector), _mm256_extractf128_ps(vTempA2, 1)); + pOutputVector += OutputStride; + + i += 4; + } + } + } + } + + if (i < VectorCount) + { + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row3 = M.r[3]; + + for (; i < VectorCount; i++) + { + __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(pInputVector))); + pInputVector += InputStride; + + XMVECTOR Y = XM_PERMUTE_PS(xy, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(xy, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Y, row1, row3); + XMVECTOR vTemp2 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + + _mm_storeu_ps(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + } + } + + XM_SFENCE(); + + return pOutputStream; +#elif defined(_XM_SSE_INTRINSICS_) + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row3 = M.r[3]; + + size_t i = 0; + size_t two = VectorCount >> 1; + if (two > 0) + { + if (InputStride == sizeof(XMFLOAT2)) + { + if (!(reinterpret_cast(pOutputStream) & 0xF) && !(OutputStride & 0xF)) + { + // Packed input, aligned output + for (size_t j = 0; j < two; ++j) + { + XMVECTOR V = _mm_loadu_ps(reinterpret_cast(pInputVector)); + pInputVector += sizeof(XMFLOAT2) * 2; + + XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Y, row1, row3); + XMVECTOR vTemp2 = _mm_mul_ps(X, row0); + vTemp = 
_mm_add_ps(vTemp, vTemp2); + + XM_STREAM_PS(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3)); + X = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); + + vTemp = XM_FMADD_PS(Y, row1, row3); + vTemp2 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + + XM_STREAM_PS(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + i += 2; + } + } + else + { + // Packed input, unaligned output + for (size_t j = 0; j < two; ++j) + { + XMVECTOR V = _mm_loadu_ps(reinterpret_cast(pInputVector)); + pInputVector += sizeof(XMFLOAT2) * 2; + + XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Y, row1, row3); + XMVECTOR vTemp2 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + + _mm_storeu_ps(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3)); + X = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); + + vTemp = XM_FMADD_PS(Y, row1, row3); + vTemp2 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + + _mm_storeu_ps(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + i += 2; + } + } + } + } + + if (!(reinterpret_cast(pInputVector) & 0xF) && !(InputStride & 0xF)) + { + if (!(reinterpret_cast(pOutputStream) & 0xF) && !(OutputStride & 0xF)) + { + // Aligned input, aligned output + for (; i < VectorCount; i++) + { + XMVECTOR V = _mm_castsi128_ps(_mm_loadl_epi64(reinterpret_cast(pInputVector))); + pInputVector += InputStride; + + XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Y, row1, row3); + XMVECTOR vTemp2 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + + XM_STREAM_PS(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + } + } + else + { + // Aligned input, unaligned output + for (; i < VectorCount; i++) + { + XMVECTOR V = _mm_castsi128_ps(_mm_loadl_epi64(reinterpret_cast(pInputVector))); + pInputVector += InputStride; + + XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Y, row1, row3); + XMVECTOR vTemp2 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + + _mm_storeu_ps(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + } + } + } + else + { + // Unaligned input + for (; i < VectorCount; i++) + { + __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(pInputVector))); + pInputVector += InputStride; + + XMVECTOR Y = XM_PERMUTE_PS(xy, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(xy, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Y, row1, row3); + XMVECTOR vTemp2 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + + _mm_storeu_ps(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + } + } + + XM_SFENCE(); + + return pOutputStream; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2TransformCoord +( + FXMVECTOR V, + FXMMATRIX M +) noexcept +{ + XMVECTOR Y = XMVectorSplatY(V); + XMVECTOR X = XMVectorSplatX(V); + + XMVECTOR Result = XMVectorMultiplyAdd(Y, M.r[1], M.r[3]); + Result = XMVectorMultiplyAdd(X, M.r[0], Result); + + XMVECTOR W = XMVectorSplatW(Result); + return XMVectorDivide(Result, W); +} + 
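+// Illustrative usage sketch (assumes a caller-provided XMMATRIX M, e.g. a
+// world-view-projection matrix); it is not part of the function above, just an
+// example of transforming a single 2D point and reading back the x/w, y/w result:
+//
+//     XMFLOAT2 pt = { 3.0f, 4.0f };
+//     XMVECTOR v  = XMVector2TransformCoord(XMLoadFloat2(&pt), M);
+//     XMFLOAT2 out;
+//     XMStoreFloat2(&out, v);
+//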
+//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline XMFLOAT2* XM_CALLCONV XMVector2TransformCoordStream +( + XMFLOAT2* pOutputStream, + size_t OutputStride, + const XMFLOAT2* pInputStream, + size_t InputStride, + size_t VectorCount, + FXMMATRIX M +) noexcept +{ + assert(pOutputStream != nullptr); + assert(pInputStream != nullptr); + + assert(InputStride >= sizeof(XMFLOAT2)); + _Analysis_assume_(InputStride >= sizeof(XMFLOAT2)); + + assert(OutputStride >= sizeof(XMFLOAT2)); + _Analysis_assume_(OutputStride >= sizeof(XMFLOAT2)); + +#if defined(_XM_NO_INTRINSICS_) + + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row3 = M.r[3]; + + for (size_t i = 0; i < VectorCount; i++) + { + XMVECTOR V = XMLoadFloat2(reinterpret_cast(pInputVector)); + XMVECTOR Y = XMVectorSplatY(V); + XMVECTOR X = XMVectorSplatX(V); + + XMVECTOR Result = XMVectorMultiplyAdd(Y, row1, row3); + Result = XMVectorMultiplyAdd(X, row0, Result); + + XMVECTOR W = XMVectorSplatW(Result); + + Result = XMVectorDivide(Result, W); + +#ifdef _PREFAST_ +#pragma prefast(push) +#pragma prefast(disable : 26015, "PREfast noise: Esp:1307" ) +#endif + + XMStoreFloat2(reinterpret_cast(pOutputVector), Result); + +#ifdef _PREFAST_ +#pragma prefast(pop) +#endif + + pInputVector += InputStride; + pOutputVector += OutputStride; + } + + return pOutputStream; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row3 = M.r[3]; + + size_t i = 0; + size_t four = VectorCount >> 2; + if (four > 0) + { + if ((InputStride == sizeof(XMFLOAT2)) && (OutputStride == sizeof(XMFLOAT2))) + { + for (size_t j = 0; j < four; ++j) + { + float32x4x2_t V = vld2q_f32(reinterpret_cast(pInputVector)); + pInputVector += sizeof(XMFLOAT2) * 4; + + float32x2_t r3 = vget_low_f32(row3); + float32x2_t r = vget_low_f32(row0); + XMVECTOR vResult0 = vmlaq_lane_f32(vdupq_lane_f32(r3, 0), V.val[0], r, 0); // Ax+M + XMVECTOR vResult1 = vmlaq_lane_f32(vdupq_lane_f32(r3, 1), V.val[0], r, 1); // Bx+N + + XM_PREFETCH(pInputVector); + + r3 = vget_high_f32(row3); + r = vget_high_f32(row0); + XMVECTOR W = vmlaq_lane_f32(vdupq_lane_f32(r3, 1), V.val[0], r, 1); // Dx+P + + XM_PREFETCH(pInputVector + XM_CACHE_LINE_SIZE); + + r = vget_low_f32(row1); + vResult0 = vmlaq_lane_f32(vResult0, V.val[1], r, 0); // Ax+Ey+M + vResult1 = vmlaq_lane_f32(vResult1, V.val[1], r, 1); // Bx+Fy+N + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 2)); + + r = vget_high_f32(row1); + W = vmlaq_lane_f32(W, V.val[1], r, 1); // Dx+Hy+P + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 3)); + +#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__ + V.val[0] = vdivq_f32(vResult0, W); + V.val[1] = vdivq_f32(vResult1, W); +#else + // 2 iterations of Newton-Raphson refinement of reciprocal + float32x4_t Reciprocal = vrecpeq_f32(W); + float32x4_t S = vrecpsq_f32(Reciprocal, W); + Reciprocal = vmulq_f32(S, Reciprocal); + S = vrecpsq_f32(Reciprocal, W); + Reciprocal = vmulq_f32(S, Reciprocal); + + V.val[0] = vmulq_f32(vResult0, Reciprocal); + V.val[1] = vmulq_f32(vResult1, Reciprocal); +#endif + + vst2q_f32(reinterpret_cast(pOutputVector), V); + pOutputVector += sizeof(XMFLOAT2) * 4; + + i += 4; + } + 
} + } + + for (; i < VectorCount; i++) + { + float32x2_t V = vld1_f32(reinterpret_cast(pInputVector)); + pInputVector += InputStride; + + XMVECTOR vResult = vmlaq_lane_f32(row3, row0, V, 0); // X + vResult = vmlaq_lane_f32(vResult, row1, V, 1); // Y + + V = vget_high_f32(vResult); + float32x2_t W = vdup_lane_f32(V, 1); + +#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__ + V = vget_low_f32(vResult); + V = vdiv_f32(V, W); +#else + // 2 iterations of Newton-Raphson refinement of reciprocal for W + float32x2_t Reciprocal = vrecpe_f32(W); + float32x2_t S = vrecps_f32(Reciprocal, W); + Reciprocal = vmul_f32(S, Reciprocal); + S = vrecps_f32(Reciprocal, W); + Reciprocal = vmul_f32(S, Reciprocal); + + V = vget_low_f32(vResult); + V = vmul_f32(V, Reciprocal); +#endif + + vst1_f32(reinterpret_cast(pOutputVector), V); + pOutputVector += OutputStride; + } + + return pOutputStream; +#elif defined(_XM_AVX2_INTRINSICS_) + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + size_t i = 0; + size_t four = VectorCount >> 2; + if (four > 0) + { + __m256 row0 = _mm256_broadcast_ps(&M.r[0]); + __m256 row1 = _mm256_broadcast_ps(&M.r[1]); + __m256 row3 = _mm256_broadcast_ps(&M.r[3]); + + if (InputStride == sizeof(XMFLOAT2)) + { + if (OutputStride == sizeof(XMFLOAT2)) + { + if (!(reinterpret_cast(pOutputStream) & 0x1F)) + { + // Packed input, aligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m256 VV = _mm256_loadu_ps(reinterpret_cast(pInputVector)); + pInputVector += sizeof(XMFLOAT2) * 4; + + __m256 Y2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3)); + __m256 X2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2)); + __m256 Y1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1)); + __m256 X1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0)); + + __m256 vTempB = _mm256_fmadd_ps(Y1, row1, row3); + __m256 vTempB2 = _mm256_fmadd_ps(Y2, row1, row3); + __m256 vTempA = _mm256_mul_ps(X1, row0); + __m256 vTempA2 = _mm256_mul_ps(X2, row0); + vTempA = _mm256_add_ps(vTempA, vTempB); + vTempA2 = _mm256_add_ps(vTempA2, vTempB2); + + __m256 W = _mm256_shuffle_ps(vTempA, vTempA, _MM_SHUFFLE(3, 3, 3, 3)); + vTempA = _mm256_div_ps(vTempA, W); + + W = _mm256_shuffle_ps(vTempA2, vTempA2, _MM_SHUFFLE(3, 3, 3, 3)); + vTempA2 = _mm256_div_ps(vTempA2, W); + + X1 = _mm256_shuffle_ps(vTempA, vTempA2, 0x44); + XM256_STREAM_PS(reinterpret_cast(pOutputVector), X1); + pOutputVector += sizeof(XMFLOAT2) * 4; + + i += 4; + } + } + else + { + // Packed input, packed output + for (size_t j = 0; j < four; ++j) + { + __m256 VV = _mm256_loadu_ps(reinterpret_cast(pInputVector)); + pInputVector += sizeof(XMFLOAT2) * 4; + + __m256 Y2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3)); + __m256 X2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2)); + __m256 Y1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1)); + __m256 X1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0)); + + __m256 vTempB = _mm256_fmadd_ps(Y1, row1, row3); + __m256 vTempB2 = _mm256_fmadd_ps(Y2, row1, row3); + __m256 vTempA = _mm256_mul_ps(X1, row0); + __m256 vTempA2 = _mm256_mul_ps(X2, row0); + vTempA = _mm256_add_ps(vTempA, vTempB); + vTempA2 = _mm256_add_ps(vTempA2, vTempB2); + + __m256 W = _mm256_shuffle_ps(vTempA, vTempA, _MM_SHUFFLE(3, 3, 3, 3)); + vTempA = _mm256_div_ps(vTempA, W); + + W = _mm256_shuffle_ps(vTempA2, vTempA2, _MM_SHUFFLE(3, 3, 3, 3)); + vTempA2 = _mm256_div_ps(vTempA2, W); + + X1 = _mm256_shuffle_ps(vTempA, vTempA2, 0x44); + 
+                        _mm256_storeu_ps(reinterpret_cast<float*>(pOutputVector), X1);
+                        pOutputVector += sizeof(XMFLOAT2) * 4;
+
+                        i += 4;
+                    }
+                }
+            }
+            else
+            {
+                // Packed input, unpacked output
+                for (size_t j = 0; j < four; ++j)
+                {
+                    __m256 VV = _mm256_loadu_ps(reinterpret_cast<const float*>(pInputVector));
+                    pInputVector += sizeof(XMFLOAT2) * 4;
+
+                    __m256 Y2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3));
+                    __m256 X2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2));
+                    __m256 Y1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1));
+                    __m256 X1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0));
+
+                    __m256 vTempB = _mm256_fmadd_ps(Y1, row1, row3);
+                    __m256 vTempB2 = _mm256_fmadd_ps(Y2, row1, row3);
+                    __m256 vTempA = _mm256_mul_ps(X1, row0);
+                    __m256 vTempA2 = _mm256_mul_ps(X2, row0);
+                    vTempA = _mm256_add_ps(vTempA, vTempB);
+                    vTempA2 = _mm256_add_ps(vTempA2, vTempB2);
+
+                    __m256 W = _mm256_shuffle_ps(vTempA, vTempA, _MM_SHUFFLE(3, 3, 3, 3));
+                    vTempA = _mm256_div_ps(vTempA, W);
+
+                    W = _mm256_shuffle_ps(vTempA2, vTempA2, _MM_SHUFFLE(3, 3, 3, 3));
+                    vTempA2 = _mm256_div_ps(vTempA2, W);
+
+                    _mm_store_sd(reinterpret_cast<double*>(pOutputVector),
+                        _mm_castps_pd(_mm256_castps256_ps128(vTempA)));
+                    pOutputVector += OutputStride;
+
+                    _mm_store_sd(reinterpret_cast<double*>(pOutputVector),
+                        _mm_castps_pd(_mm256_castps256_ps128(vTempA2)));
+                    pOutputVector += OutputStride;
+
+                    _mm_store_sd(reinterpret_cast<double*>(pOutputVector),
+                        _mm_castps_pd(_mm256_extractf128_ps(vTempA, 1)));
+                    pOutputVector += OutputStride;
+
+                    _mm_store_sd(reinterpret_cast<double*>(pOutputVector),
+                        _mm_castps_pd(_mm256_extractf128_ps(vTempA2, 1)));
+                    pOutputVector += OutputStride;
+
+                    i += 4;
+                }
+            }
+        }
+    }
+
+    if (i < VectorCount)
+    {
+        const XMVECTOR row0 = M.r[0];
+        const XMVECTOR row1 = M.r[1];
+        const XMVECTOR row3 = M.r[3];
+
+        for (; i < VectorCount; i++)
+        {
+            __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pInputVector)));
+            pInputVector += InputStride;
+
+            XMVECTOR Y = XM_PERMUTE_PS(xy, _MM_SHUFFLE(1, 1, 1, 1));
+            XMVECTOR X = XM_PERMUTE_PS(xy, _MM_SHUFFLE(0, 0, 0, 0));
+
+            XMVECTOR vTemp = XM_FMADD_PS(Y, row1, row3);
+            XMVECTOR vTemp2 = _mm_mul_ps(X, row0);
+            vTemp = _mm_add_ps(vTemp, vTemp2);
+
+            XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
+            vTemp = _mm_div_ps(vTemp, W);
+
+            _mm_store_sd(reinterpret_cast<double*>(pOutputVector), _mm_castps_pd(vTemp));
+            pOutputVector += OutputStride;
+        }
+    }
+
+    XM_SFENCE();
+
+    return pOutputStream;
+#elif defined(_XM_SSE_INTRINSICS_)
+    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
+    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
+
+    const XMVECTOR row0 = M.r[0];
+    const XMVECTOR row1 = M.r[1];
+    const XMVECTOR row3 = M.r[3];
+
+    size_t i = 0;
+    size_t two = VectorCount >> 1;
+    if (two > 0)
+    {
+        if (InputStride == sizeof(XMFLOAT2))
+        {
+            if (OutputStride == sizeof(XMFLOAT2))
+            {
+                if (!(reinterpret_cast<uintptr_t>(pOutputStream) & 0xF))
+                {
+                    // Packed input, aligned & packed output
+                    for (size_t j = 0; j < two; ++j)
+                    {
+                        XMVECTOR V = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
+                        pInputVector += sizeof(XMFLOAT2) * 2;
+
+                        // Result 1
+                        XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
+                        XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
+
+                        XMVECTOR vTemp = XM_FMADD_PS(Y, row1, row3);
+                        XMVECTOR vTemp2 = _mm_mul_ps(X, row0);
+                        vTemp = _mm_add_ps(vTemp, vTemp2);
+
+                        XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
+
+                        XMVECTOR V1 = _mm_div_ps(vTemp, W);
+
+                        // Result 2
+                        Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
+                        X = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
+
+                        vTemp = XM_FMADD_PS(Y, row1, row3);
+                        vTemp2 = _mm_mul_ps(X, row0);
+                        vTemp = _mm_add_ps(vTemp, vTemp2);
+
+                        W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
+
+                        XMVECTOR V2 = _mm_div_ps(vTemp, W);
+
+                        vTemp = _mm_movelh_ps(V1, V2);
+
+                        XM_STREAM_PS(reinterpret_cast<float*>(pOutputVector), vTemp);
+                        pOutputVector += sizeof(XMFLOAT2) * 2;
+
+                        i += 2;
+                    }
+                }
+                else
+                {
+                    // Packed input, unaligned & packed output
+                    for (size_t j = 0; j < two; ++j)
+                    {
+                        XMVECTOR V = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
+                        pInputVector += sizeof(XMFLOAT2) * 2;
+
+                        // Result 1
+                        XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
+                        XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
+
+                        XMVECTOR vTemp = XM_FMADD_PS(Y, row1, row3);
+                        XMVECTOR vTemp2 = _mm_mul_ps(X, row0);
+                        vTemp = _mm_add_ps(vTemp, vTemp2);
+
+                        XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
+
+                        XMVECTOR V1 = _mm_div_ps(vTemp, W);
+
+                        // Result 2
+                        Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
+                        X = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
+
+                        vTemp = XM_FMADD_PS(Y, row1, row3);
+                        vTemp2 = _mm_mul_ps(X, row0);
+                        vTemp = _mm_add_ps(vTemp, vTemp2);
+
+                        W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
+
+                        XMVECTOR V2 = _mm_div_ps(vTemp, W);
+
+                        vTemp = _mm_movelh_ps(V1, V2);
+
+                        _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), vTemp);
+                        pOutputVector += sizeof(XMFLOAT2) * 2;
+
+                        i += 2;
+                    }
+                }
+            }
+            else
+            {
+                // Packed input, unpacked output
+                for (size_t j = 0; j < two; ++j)
+                {
+                    XMVECTOR V = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
+                    pInputVector += sizeof(XMFLOAT2) * 2;
+
+                    // Result 1
+                    XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
+                    XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
+
+                    XMVECTOR vTemp = XM_FMADD_PS(Y, row1, row3);
+                    XMVECTOR vTemp2 = _mm_mul_ps(X, row0);
+                    vTemp = _mm_add_ps(vTemp, vTemp2);
+
+                    XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
+
+                    vTemp = _mm_div_ps(vTemp, W);
+
+                    _mm_store_sd(reinterpret_cast<double*>(pOutputVector), _mm_castps_pd(vTemp));
+                    pOutputVector += OutputStride;
+
+                    // Result 2
+                    Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
+                    X = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
+
+                    vTemp = XM_FMADD_PS(Y, row1, row3);
+                    vTemp2 = _mm_mul_ps(X, row0);
+                    vTemp = _mm_add_ps(vTemp, vTemp2);
+
+                    W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
+
+                    vTemp = _mm_div_ps(vTemp, W);
+
+                    _mm_store_sd(reinterpret_cast<double*>(pOutputVector), _mm_castps_pd(vTemp));
+                    pOutputVector += OutputStride;
+
+                    i += 2;
+                }
+            }
+        }
+    }
+
+    if (!(reinterpret_cast<uintptr_t>(pInputVector) & 0xF) && !(InputStride & 0xF))
+    {
+        // Aligned input
+        for (; i < VectorCount; i++)
+        {
+            XMVECTOR V = _mm_castsi128_ps(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(pInputVector)));
+            pInputVector += InputStride;
+
+            XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
+            XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
+
+            XMVECTOR vTemp = XM_FMADD_PS(Y, row1, row3);
+            XMVECTOR vTemp2 = _mm_mul_ps(X, row0);
+            vTemp = _mm_add_ps(vTemp, vTemp2);
+
+            XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
+
+            vTemp = _mm_div_ps(vTemp, W);
+
+            _mm_store_sd(reinterpret_cast<double*>(pOutputVector), _mm_castps_pd(vTemp));
+            pOutputVector += OutputStride;
+        }
+    }
+    else
+    {
+        // Unaligned input
+        for (; i < VectorCount; i++)
+        {
+            __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pInputVector)));
+            pInputVector += InputStride;
+
+            XMVECTOR Y = XM_PERMUTE_PS(xy, _MM_SHUFFLE(1, 1, 1, 1));
+            XMVECTOR X = XM_PERMUTE_PS(xy, _MM_SHUFFLE(0, 0, 0, 0));
+
+            XMVECTOR vTemp = XM_FMADD_PS(Y, row1, row3);
+            XMVECTOR vTemp2 = _mm_mul_ps(X, row0);
+            vTemp = _mm_add_ps(vTemp, vTemp2);
+
+            XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
+
+            vTemp = _mm_div_ps(vTemp, W);
+
+            _mm_store_sd(reinterpret_cast<double*>(pOutputVector), _mm_castps_pd(vTemp));
+            pOutputVector += OutputStride;
+        }
+    }
+
+    XM_SFENCE();
+
+    return pOutputStream;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVector2TransformNormal
+(
+    FXMVECTOR V,
+    FXMMATRIX M
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR Y = XMVectorSplatY(V);
+    XMVECTOR X = XMVectorSplatX(V);
+
+    XMVECTOR Result = XMVectorMultiply(Y, M.r[1]);
+    Result = XMVectorMultiplyAdd(X, M.r[0], Result);
+
+    return Result;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x2_t VL = vget_low_f32(V);
+    float32x4_t Result = vmulq_lane_f32(M.r[1], VL, 1); // Y
+    return vmlaq_lane_f32(Result, M.r[0], VL, 0); // X
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); // Y
+    vResult = _mm_mul_ps(vResult, M.r[1]);
+    XMVECTOR vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); // X
+    vResult = XM_FMADD_PS(vTemp, M.r[0], vResult);
+    return vResult;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+_Use_decl_annotations_
+inline XMFLOAT2* XM_CALLCONV XMVector2TransformNormalStream
+(
+    XMFLOAT2* pOutputStream,
+    size_t OutputStride,
+    const XMFLOAT2* pInputStream,
+    size_t InputStride,
+    size_t VectorCount,
+    FXMMATRIX M
+) noexcept
+{
+    assert(pOutputStream != nullptr);
+    assert(pInputStream != nullptr);
+
+    assert(InputStride >= sizeof(XMFLOAT2));
+    _Analysis_assume_(InputStride >= sizeof(XMFLOAT2));
+
+    assert(OutputStride >= sizeof(XMFLOAT2));
+    _Analysis_assume_(OutputStride >= sizeof(XMFLOAT2));
+
+#if defined(_XM_NO_INTRINSICS_)
+
+    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
+    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
+
+    const XMVECTOR row0 = M.r[0];
+    const XMVECTOR row1 = M.r[1];
+
+    for (size_t i = 0; i < VectorCount; i++)
+    {
+        XMVECTOR V = XMLoadFloat2(reinterpret_cast<const XMFLOAT2*>(pInputVector));
+        XMVECTOR Y = XMVectorSplatY(V);
+        XMVECTOR X = XMVectorSplatX(V);
+
+        XMVECTOR Result = XMVectorMultiply(Y, row1);
+        Result = XMVectorMultiplyAdd(X, row0, Result);
+
+#ifdef _PREFAST_
+#pragma prefast(push)
+#pragma prefast(disable : 26015, "PREfast noise: Esp:1307" )
+#endif
+
+        XMStoreFloat2(reinterpret_cast<XMFLOAT2*>(pOutputVector), Result);
+
+#ifdef _PREFAST_
+#pragma prefast(pop)
+#endif
+
+        pInputVector += InputStride;
+        pOutputVector += OutputStride;
+    }
+
+    return pOutputStream;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
+    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
+
+    const XMVECTOR row0 = M.r[0];
+    const XMVECTOR row1 = M.r[1];
+
+    size_t i = 0;
+    size_t four = VectorCount >> 2;
+    if (four > 0)
+    {
+        if ((InputStride == sizeof(XMFLOAT2)) && (OutputStride == sizeof(XMFLOAT2)))
+        {
+            for (size_t j = 0; j < four; ++j)
+            {
+                float32x4x2_t V = vld2q_f32(reinterpret_cast<const float*>(pInputVector));
+                pInputVector += sizeof(XMFLOAT2) * 4;
+
+                float32x2_t r = vget_low_f32(row0);
+                XMVECTOR vResult0 = vmulq_lane_f32(V.val[0], r, 0); // Ax
+                XMVECTOR vResult1 = vmulq_lane_f32(V.val[0], r, 1); // Bx
+
+                XM_PREFETCH(pInputVector);
+                XM_PREFETCH(pInputVector + XM_CACHE_LINE_SIZE);
+
+                r = vget_low_f32(row1);
+                vResult0 = vmlaq_lane_f32(vResult0, V.val[1], r, 0); // Ax+Ey
+                vResult1 = vmlaq_lane_f32(vResult1, V.val[1], r, 1); // Bx+Fy
+
+                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 2));
+                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 3));
+
+                V.val[0] = vResult0;
+                V.val[1] = vResult1;
+
+                vst2q_f32(reinterpret_cast<float*>(pOutputVector), V);
+                pOutputVector += sizeof(XMFLOAT2) * 4;
+
+                i += 4;
+            }
+        }
+    }
+
+    for (; i < VectorCount; i++)
+    {
+        float32x2_t V = vld1_f32(reinterpret_cast<const float*>(pInputVector));
+        pInputVector += InputStride;
+
+        XMVECTOR vResult = vmulq_lane_f32(row0, V, 0); // X
+        vResult = vmlaq_lane_f32(vResult, row1, V, 1); // Y
+
+        V = vget_low_f32(vResult);
+        vst1_f32(reinterpret_cast<float*>(pOutputVector), V);
+        pOutputVector += OutputStride;
+    }
+
+    return pOutputStream;
+#elif defined(_XM_AVX2_INTRINSICS_)
+    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
+    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
+
+    size_t i = 0;
+    size_t four = VectorCount >> 2;
+    if (four > 0)
+    {
+        __m256 row0 = _mm256_broadcast_ps(&M.r[0]);
+        __m256 row1 = _mm256_broadcast_ps(&M.r[1]);
+
+        if (InputStride == sizeof(XMFLOAT2))
+        {
+            if (OutputStride == sizeof(XMFLOAT2))
+            {
+                if (!(reinterpret_cast<uintptr_t>(pOutputStream) & 0x1F))
+                {
+                    // Packed input, aligned & packed output
+                    for (size_t j = 0; j < four; ++j)
+                    {
+                        __m256 VV = _mm256_loadu_ps(reinterpret_cast<const float*>(pInputVector));
+                        pInputVector += sizeof(XMFLOAT2) * 4;
+
+                        __m256 Y2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3));
+                        __m256 X2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2));
+                        __m256 Y1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1));
+                        __m256 X1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0));
+
+                        __m256 vTempA = _mm256_mul_ps(Y1, row1);
+                        __m256 vTempB = _mm256_mul_ps(Y2, row1);
+                        vTempA = _mm256_fmadd_ps(X1, row0, vTempA);
+                        vTempB = _mm256_fmadd_ps(X2, row0, vTempB);
+
+                        X1 = _mm256_shuffle_ps(vTempA, vTempB, 0x44);
+                        XM256_STREAM_PS(reinterpret_cast<float*>(pOutputVector), X1);
+                        pOutputVector += sizeof(XMFLOAT2) * 4;
+
+                        i += 4;
+                    }
+                }
+                else
+                {
+                    // Packed input, packed output
+                    for (size_t j = 0; j < four; ++j)
+                    {
+                        __m256 VV = _mm256_loadu_ps(reinterpret_cast<const float*>(pInputVector));
+                        pInputVector += sizeof(XMFLOAT2) * 4;
+
+                        __m256 Y2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3));
+                        __m256 X2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2));
+                        __m256 Y1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1));
+                        __m256 X1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0));
+
+                        __m256 vTempA = _mm256_mul_ps(Y1, row1);
+                        __m256 vTempB = _mm256_mul_ps(Y2, row1);
+                        vTempA = _mm256_fmadd_ps(X1, row0, vTempA);
+                        vTempB = _mm256_fmadd_ps(X2, row0, vTempB);
+
+                        X1 = _mm256_shuffle_ps(vTempA, vTempB, 0x44);
+                        _mm256_storeu_ps(reinterpret_cast<float*>(pOutputVector), X1);
+                        pOutputVector += sizeof(XMFLOAT2) * 4;
+
+                        i += 4;
+                    }
+                }
+            }
+            else
+            {
+                // Packed input, unpacked output
+                for (size_t j = 0; j < four; ++j)
+                {
+                    __m256 VV = _mm256_loadu_ps(reinterpret_cast<const float*>(pInputVector));
+                    pInputVector += sizeof(XMFLOAT2) * 4;
+
+                    __m256 Y2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3));
+                    __m256 X2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2));
+                    __m256 Y1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1));
+                    __m256 X1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0));
+
+                    __m256 vTempA = _mm256_mul_ps(Y1, row1);
+                    __m256 vTempB = _mm256_mul_ps(Y2, row1);
+                    vTempA = _mm256_fmadd_ps(X1, row0, vTempA);
+                    vTempB = _mm256_fmadd_ps(X2, row0, vTempB);
+
+                    _mm_store_sd(reinterpret_cast<double*>(pOutputVector),
+                        _mm_castps_pd(_mm256_castps256_ps128(vTempA)));
+                    pOutputVector += OutputStride;
+
+                    _mm_store_sd(reinterpret_cast<double*>(pOutputVector),
+                        _mm_castps_pd(_mm256_castps256_ps128(vTempB)));
+                    pOutputVector += OutputStride;
+
+                    _mm_store_sd(reinterpret_cast<double*>(pOutputVector),
+                        _mm_castps_pd(_mm256_extractf128_ps(vTempA, 1)));
+                    pOutputVector += OutputStride;
+
+                    _mm_store_sd(reinterpret_cast<double*>(pOutputVector),
+                        _mm_castps_pd(_mm256_extractf128_ps(vTempB, 1)));
+                    pOutputVector += OutputStride;
+
+                    i += 4;
+                }
+            }
+        }
+    }
+
+    if (i < VectorCount)
+    {
+        const XMVECTOR row0 = M.r[0];
+        const XMVECTOR row1 = M.r[1];
+
+        for (; i < VectorCount; i++)
+        {
+            __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pInputVector)));
+            pInputVector += InputStride;
+
+            XMVECTOR Y = XM_PERMUTE_PS(xy, _MM_SHUFFLE(1, 1, 1, 1));
+            XMVECTOR X = XM_PERMUTE_PS(xy, _MM_SHUFFLE(0, 0, 0, 0));
+
+            XMVECTOR vTemp = _mm_mul_ps(Y, row1);
+            vTemp = XM_FMADD_PS(X, row0, vTemp);
+
+            _mm_store_sd(reinterpret_cast<double*>(pOutputVector), _mm_castps_pd(vTemp));
+            pOutputVector += OutputStride;
+        }
+    }
+
+    XM_SFENCE();
+
+    return pOutputStream;
+#elif defined(_XM_SSE_INTRINSICS_)
+    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
+    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
+
+    const XMVECTOR row0 = M.r[0];
+    const XMVECTOR row1 = M.r[1];
+
+    size_t i = 0;
+    size_t two = VectorCount >> 1;
+    if (two > 0)
+    {
+        if (InputStride == sizeof(XMFLOAT2))
+        {
+            if (OutputStride == sizeof(XMFLOAT2))
+            {
+                if (!(reinterpret_cast<uintptr_t>(pOutputStream) & 0xF))
+                {
+                    // Packed input, aligned & packed output
+                    for (size_t j = 0; j < two; ++j)
+                    {
+                        XMVECTOR V = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
+                        pInputVector += sizeof(XMFLOAT2) * 2;
+
+                        // Result 1
+                        XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
+                        XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
+
+                        XMVECTOR vTemp = _mm_mul_ps(Y, row1);
+                        XMVECTOR V1 = XM_FMADD_PS(X, row0, vTemp);
+
+                        // Result 2
+                        Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
+                        X = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
+
+                        vTemp = _mm_mul_ps(Y, row1);
+                        XMVECTOR V2 = XM_FMADD_PS(X, row0, vTemp);
+
+                        vTemp = _mm_movelh_ps(V1, V2);
+
+                        XM_STREAM_PS(reinterpret_cast<float*>(pOutputVector), vTemp);
+                        pOutputVector += sizeof(XMFLOAT2) * 2;
+
+                        i += 2;
+                    }
+                }
+                else
+                {
+                    // Packed input, unaligned & packed output
+                    for (size_t j = 0; j < two; ++j)
+                    {
+                        XMVECTOR V = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
+                        pInputVector += sizeof(XMFLOAT2) * 2;
+
+                        // Result 1
+                        XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
+                        XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
+
+                        XMVECTOR vTemp = _mm_mul_ps(Y, row1);
+                        XMVECTOR V1 = XM_FMADD_PS(X, row0, vTemp);
+
+                        // Result 2
+                        Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
+                        X = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
+
+                        vTemp = _mm_mul_ps(Y, row1);
+                        XMVECTOR V2 = XM_FMADD_PS(X, row0, vTemp);
+
+                        vTemp = _mm_movelh_ps(V1, V2);
+
+                        _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), vTemp);
+                        pOutputVector += sizeof(XMFLOAT2) * 2;
+
+                        i += 2;
+                    }
+                }
+            }
+            else
+            {
+                // Packed input, unpacked output
+                for (size_t j = 0; j < two; ++j)
+                {
+                    XMVECTOR V = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
+                    pInputVector += sizeof(XMFLOAT2) * 2;
+
+                    // Result 1
+                    XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
+                    XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
+
+                    XMVECTOR vTemp = _mm_mul_ps(Y, row1);
+                    vTemp = XM_FMADD_PS(X, row0, vTemp);
+
+                    _mm_store_sd(reinterpret_cast<double*>(pOutputVector), _mm_castps_pd(vTemp));
+                    pOutputVector += OutputStride;
+
+                    // Result 2
+                    Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
+                    X = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
+
+                    vTemp = _mm_mul_ps(Y, row1);
+                    vTemp = XM_FMADD_PS(X, row0, vTemp);
+
+                    _mm_store_sd(reinterpret_cast<double*>(pOutputVector), _mm_castps_pd(vTemp));
+                    pOutputVector += OutputStride;
+
+                    i += 2;
+                }
+            }
+        }
+    }
+
+    if (!(reinterpret_cast<uintptr_t>(pInputVector) & 0xF) && !(InputStride & 0xF))
+    {
+        // Aligned input
+        for (; i < VectorCount; i++)
+        {
+            XMVECTOR V = _mm_castsi128_ps(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(pInputVector)));
+            pInputVector += InputStride;
+
+            XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
+            XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
+
+            XMVECTOR vTemp = _mm_mul_ps(Y, row1);
+            vTemp = XM_FMADD_PS(X, row0, vTemp);
+
+            _mm_store_sd(reinterpret_cast<double*>(pOutputVector), _mm_castps_pd(vTemp));
+            pOutputVector += OutputStride;
+        }
+    }
+    else
+    {
+        // Unaligned input
+        for (; i < VectorCount; i++)
+        {
+            __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pInputVector)));
+            pInputVector += InputStride;
+
+            XMVECTOR Y = XM_PERMUTE_PS(xy, _MM_SHUFFLE(1, 1, 1, 1));
+            XMVECTOR X = XM_PERMUTE_PS(xy, _MM_SHUFFLE(0, 0, 0, 0));
+
+            XMVECTOR vTemp = _mm_mul_ps(Y, row1);
+            vTemp = XM_FMADD_PS(X, row0, vTemp);
+
+            _mm_store_sd(reinterpret_cast<double*>(pOutputVector), _mm_castps_pd(vTemp));
+            pOutputVector += OutputStride;
+        }
+    }
+
+    XM_SFENCE();
+
+    return pOutputStream;
+#endif
+}
+
+/****************************************************************************
+ *
+ * 3D Vector
+ *
+ ****************************************************************************/
+
+ //------------------------------------------------------------------------------
+ // Comparison operations
+ //------------------------------------------------------------------------------
+
+ //------------------------------------------------------------------------------
+
+inline bool XM_CALLCONV XMVector3Equal
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    return (((V1.vector4_f32[0] == V2.vector4_f32[0]) && (V1.vector4_f32[1] == V2.vector4_f32[1]) && (V1.vector4_f32[2] == V2.vector4_f32[2])) != 0);
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x4_t vResult = vceqq_f32(V1, V2);
+    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult)));
+    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1]));
+    return ((vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) & 0xFFFFFFU) == 0xFFFFFFU);
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vTemp = _mm_cmpeq_ps(V1, V2);
+    return (((_mm_movemask_ps(vTemp) & 7) == 7) != 0);
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline uint32_t XM_CALLCONV XMVector3EqualR
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    uint32_t CR = 0;
+    if ((V1.vector4_f32[0] == V2.vector4_f32[0]) &&
+        (V1.vector4_f32[1] == V2.vector4_f32[1]) &&
+        (V1.vector4_f32[2] == V2.vector4_f32[2]))
+    {
+        CR = XM_CRMASK_CR6TRUE;
+    }
+    else if ((V1.vector4_f32[0] != V2.vector4_f32[0]) &&
+        (V1.vector4_f32[1] != V2.vector4_f32[1]) &&
+        (V1.vector4_f32[2] != V2.vector4_f32[2]))
+    {
+        CR = XM_CRMASK_CR6FALSE;
+    }
+    return CR;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x4_t vResult = vceqq_f32(V1, V2);
+    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult)));
+    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1]));
+    uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) & 0xFFFFFFU;
+
+    uint32_t CR = 0;
+    if (r == 0xFFFFFFU)
+    {
+        CR = 
XM_CRMASK_CR6TRUE; + } + else if (!r) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpeq_ps(V1, V2); + int iTest = _mm_movemask_ps(vTemp) & 7; + uint32_t CR = 0; + if (iTest == 7) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3EqualInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_u32[0] == V2.vector4_u32[0]) && (V1.vector4_u32[1] == V2.vector4_u32[1]) && (V1.vector4_u32[2] == V2.vector4_u32[2])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vceqq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2)); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return ((vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) & 0xFFFFFFU) == 0xFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2)); + return (((_mm_movemask_ps(_mm_castsi128_ps(vTemp)) & 7) == 7) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline uint32_t XM_CALLCONV XMVector3EqualIntR +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + uint32_t CR = 0; + if ((V1.vector4_u32[0] == V2.vector4_u32[0]) && + (V1.vector4_u32[1] == V2.vector4_u32[1]) && + (V1.vector4_u32[2] == V2.vector4_u32[2])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_u32[0] != V2.vector4_u32[0]) && + (V1.vector4_u32[1] != V2.vector4_u32[1]) && + (V1.vector4_u32[2] != V2.vector4_u32[2])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vceqq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2)); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) & 0xFFFFFFU; + + uint32_t CR = 0; + if (r == 0xFFFFFFU) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!r) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2)); + int iTemp = _mm_movemask_ps(_mm_castsi128_ps(vTemp)) & 7; + uint32_t CR = 0; + if (iTemp == 7) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTemp) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3NearEqual +( + FXMVECTOR V1, + FXMVECTOR V2, + FXMVECTOR Epsilon +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + float dx, dy, dz; + + dx = fabsf(V1.vector4_f32[0] - V2.vector4_f32[0]); + dy = fabsf(V1.vector4_f32[1] - V2.vector4_f32[1]); + dz = fabsf(V1.vector4_f32[2] - V2.vector4_f32[2]); + return (((dx <= Epsilon.vector4_f32[0]) && + (dy <= Epsilon.vector4_f32[1]) && + (dz <= Epsilon.vector4_f32[2])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t vDelta = vsubq_f32(V1, V2); +#if defined(_MSC_VER) && !defined(__clang__) && 
!defined(_ARM64_DISTINCT_NEON_TYPES) + uint32x4_t vResult = vacleq_f32(vDelta, Epsilon); +#else + uint32x4_t vResult = vcleq_f32(vabsq_f32(vDelta), Epsilon); +#endif + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return ((vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) & 0xFFFFFFU) == 0xFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + // Get the difference + XMVECTOR vDelta = _mm_sub_ps(V1, V2); + // Get the absolute value of the difference + XMVECTOR vTemp = _mm_setzero_ps(); + vTemp = _mm_sub_ps(vTemp, vDelta); + vTemp = _mm_max_ps(vTemp, vDelta); + vTemp = _mm_cmple_ps(vTemp, Epsilon); + // w is don't care + return (((_mm_movemask_ps(vTemp) & 7) == 0x7) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3NotEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] != V2.vector4_f32[0]) || (V1.vector4_f32[1] != V2.vector4_f32[1]) || (V1.vector4_f32[2] != V2.vector4_f32[2])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vceqq_f32(V1, V2); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return ((vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) & 0xFFFFFFU) != 0xFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpeq_ps(V1, V2); + return (((_mm_movemask_ps(vTemp) & 7) != 7) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3NotEqualInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_u32[0] != V2.vector4_u32[0]) || (V1.vector4_u32[1] != V2.vector4_u32[1]) || (V1.vector4_u32[2] != V2.vector4_u32[2])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vceqq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2)); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return ((vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) & 0xFFFFFFU) != 0xFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2)); + return (((_mm_movemask_ps(_mm_castsi128_ps(vTemp)) & 7) != 7) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3Greater +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] > V2.vector4_f32[0]) && (V1.vector4_f32[1] > V2.vector4_f32[1]) && (V1.vector4_f32[2] > V2.vector4_f32[2])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcgtq_f32(V1, V2); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return ((vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) & 0xFFFFFFU) == 0xFFFFFFU); +#elif 
defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpgt_ps(V1, V2); + return (((_mm_movemask_ps(vTemp) & 7) == 7) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline uint32_t XM_CALLCONV XMVector3GreaterR +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + uint32_t CR = 0; + if ((V1.vector4_f32[0] > V2.vector4_f32[0]) && + (V1.vector4_f32[1] > V2.vector4_f32[1]) && + (V1.vector4_f32[2] > V2.vector4_f32[2])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_f32[0] <= V2.vector4_f32[0]) && + (V1.vector4_f32[1] <= V2.vector4_f32[1]) && + (V1.vector4_f32[2] <= V2.vector4_f32[2])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcgtq_f32(V1, V2); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) & 0xFFFFFFU; + + uint32_t CR = 0; + if (r == 0xFFFFFFU) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!r) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpgt_ps(V1, V2); + uint32_t CR = 0; + int iTest = _mm_movemask_ps(vTemp) & 7; + if (iTest == 7) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3GreaterOrEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] >= V2.vector4_f32[0]) && (V1.vector4_f32[1] >= V2.vector4_f32[1]) && (V1.vector4_f32[2] >= V2.vector4_f32[2])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcgeq_f32(V1, V2); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return ((vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) & 0xFFFFFFU) == 0xFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpge_ps(V1, V2); + return (((_mm_movemask_ps(vTemp) & 7) == 7) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline uint32_t XM_CALLCONV XMVector3GreaterOrEqualR +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + uint32_t CR = 0; + if ((V1.vector4_f32[0] >= V2.vector4_f32[0]) && + (V1.vector4_f32[1] >= V2.vector4_f32[1]) && + (V1.vector4_f32[2] >= V2.vector4_f32[2])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_f32[0] < V2.vector4_f32[0]) && + (V1.vector4_f32[1] < V2.vector4_f32[1]) && + (V1.vector4_f32[2] < V2.vector4_f32[2])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcgeq_f32(V1, V2); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) & 0xFFFFFFU; + + uint32_t CR = 0; + if (r == 0xFFFFFFU) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!r) + { + CR = 
XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpge_ps(V1, V2); + uint32_t CR = 0; + int iTest = _mm_movemask_ps(vTemp) & 7; + if (iTest == 7) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3Less +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] < V2.vector4_f32[0]) && (V1.vector4_f32[1] < V2.vector4_f32[1]) && (V1.vector4_f32[2] < V2.vector4_f32[2])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcltq_f32(V1, V2); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return ((vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) & 0xFFFFFFU) == 0xFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmplt_ps(V1, V2); + return (((_mm_movemask_ps(vTemp) & 7) == 7) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3LessOrEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] <= V2.vector4_f32[0]) && (V1.vector4_f32[1] <= V2.vector4_f32[1]) && (V1.vector4_f32[2] <= V2.vector4_f32[2])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcleq_f32(V1, V2); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return ((vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) & 0xFFFFFFU) == 0xFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmple_ps(V1, V2); + return (((_mm_movemask_ps(vTemp) & 7) == 7) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3InBounds +( + FXMVECTOR V, + FXMVECTOR Bounds +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) && + (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1]) && + (V.vector4_f32[2] <= Bounds.vector4_f32[2] && V.vector4_f32[2] >= -Bounds.vector4_f32[2])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Test if less than or equal + uint32x4_t ivTemp1 = vcleq_f32(V, Bounds); + // Negate the bounds + float32x4_t vTemp2 = vnegq_f32(Bounds); + // Test if greater or equal (Reversed) + uint32x4_t ivTemp2 = vcleq_f32(vTemp2, V); + // Blend answers + ivTemp1 = vandq_u32(ivTemp1, ivTemp2); + // in bounds? 
+ uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(ivTemp1)), vget_high_u8(vreinterpretq_u8_u32(ivTemp1))); + uint16x4x2_t vTemp3 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return ((vget_lane_u32(vreinterpret_u32_u16(vTemp3.val[1]), 1) & 0xFFFFFFU) == 0xFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + // Test if less than or equal + XMVECTOR vTemp1 = _mm_cmple_ps(V, Bounds); + // Negate the bounds + XMVECTOR vTemp2 = _mm_mul_ps(Bounds, g_XMNegativeOne); + // Test if greater or equal (Reversed) + vTemp2 = _mm_cmple_ps(vTemp2, V); + // Blend answers + vTemp1 = _mm_and_ps(vTemp1, vTemp2); + // x,y and z in bounds? (w is don't care) + return (((_mm_movemask_ps(vTemp1) & 0x7) == 0x7) != 0); +#else + return XMComparisonAllInBounds(XMVector3InBoundsR(V, Bounds)); +#endif +} + +//------------------------------------------------------------------------------ + +#if !defined(_XM_NO_INTRINSICS_) && defined(_MSC_VER) && !defined(__clang__) && !defined(__INTEL_COMPILER) +#pragma float_control(push) +#pragma float_control(precise, on) +#endif + +inline bool XM_CALLCONV XMVector3IsNaN(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + return (XMISNAN(V.vector4_f32[0]) || + XMISNAN(V.vector4_f32[1]) || + XMISNAN(V.vector4_f32[2])); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Test against itself. NaN is always not equal + uint32x4_t vTempNan = vceqq_f32(V, V); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vTempNan)), vget_high_u8(vreinterpretq_u8_u32(vTempNan))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + // If x or y or z are NaN, the mask is zero + return ((vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) & 0xFFFFFFU) != 0xFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + // Test against itself. NaN is always not equal + XMVECTOR vTempNan = _mm_cmpneq_ps(V, V); + // If x or y or z are NaN, the mask is non-zero + return ((_mm_movemask_ps(vTempNan) & 7) != 0); +#endif +} + +#if !defined(_XM_NO_INTRINSICS_) && defined(_MSC_VER) && !defined(__clang__) && !defined(__INTEL_COMPILER) +#pragma float_control(pop) +#endif + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3IsInfinite(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (XMISINF(V.vector4_f32[0]) || + XMISINF(V.vector4_f32[1]) || + XMISINF(V.vector4_f32[2])); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Mask off the sign bit + uint32x4_t vTempInf = vandq_u32(vreinterpretq_u32_f32(V), g_XMAbsMask); + // Compare to infinity + vTempInf = vceqq_f32(vreinterpretq_f32_u32(vTempInf), g_XMInfinity); + // If any are infinity, the signs are true. + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vTempInf)), vget_high_u8(vreinterpretq_u8_u32(vTempInf))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return ((vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) & 0xFFFFFFU) != 0); +#elif defined(_XM_SSE_INTRINSICS_) + // Mask off the sign bit + __m128 vTemp = _mm_and_ps(V, g_XMAbsMask); + // Compare to infinity + vTemp = _mm_cmpeq_ps(vTemp, g_XMInfinity); + // If x,y or z are infinity, the signs are true. 
+ return ((_mm_movemask_ps(vTemp) & 7) != 0); +#endif +} + +//------------------------------------------------------------------------------ +// Computation operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3Dot +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + float fValue = V1.vector4_f32[0] * V2.vector4_f32[0] + V1.vector4_f32[1] * V2.vector4_f32[1] + V1.vector4_f32[2] * V2.vector4_f32[2]; + XMVECTORF32 vResult; + vResult.f[0] = + vResult.f[1] = + vResult.f[2] = + vResult.f[3] = fValue; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t vTemp = vmulq_f32(V1, V2); + float32x2_t v1 = vget_low_f32(vTemp); + float32x2_t v2 = vget_high_f32(vTemp); + v1 = vpadd_f32(v1, v1); + v2 = vdup_lane_f32(v2, 0); + v1 = vadd_f32(v1, v2); + return vcombine_f32(v1, v1); +#elif defined(_XM_SSE4_INTRINSICS_) + return _mm_dp_ps(V1, V2, 0x7f); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vTemp = _mm_mul_ps(V1, V2); + vTemp = _mm_and_ps(vTemp, g_XMMask3); + vTemp = _mm_hadd_ps(vTemp, vTemp); + return _mm_hadd_ps(vTemp, vTemp); +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product + XMVECTOR vDot = _mm_mul_ps(V1, V2); + // x=Dot.vector4_f32[1], y=Dot.vector4_f32[2] + XMVECTOR vTemp = XM_PERMUTE_PS(vDot, _MM_SHUFFLE(2, 1, 2, 1)); + // Result.vector4_f32[0] = x+y + vDot = _mm_add_ss(vDot, vTemp); + // x=Dot.vector4_f32[2] + vTemp = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(1, 1, 1, 1)); + // Result.vector4_f32[0] = (x+y)+z + vDot = _mm_add_ss(vDot, vTemp); + // Splat x + return XM_PERMUTE_PS(vDot, _MM_SHUFFLE(0, 0, 0, 0)); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3Cross +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ + // [ V1.y*V2.z - V1.z*V2.y, V1.z*V2.x - V1.x*V2.z, V1.x*V2.y - V1.y*V2.x ] + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { { { + (V1.vector4_f32[1] * V2.vector4_f32[2]) - (V1.vector4_f32[2] * V2.vector4_f32[1]), + (V1.vector4_f32[2] * V2.vector4_f32[0]) - (V1.vector4_f32[0] * V2.vector4_f32[2]), + (V1.vector4_f32[0] * V2.vector4_f32[1]) - (V1.vector4_f32[1] * V2.vector4_f32[0]), + 0.0f + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t v1xy = vget_low_f32(V1); + float32x2_t v2xy = vget_low_f32(V2); + + float32x2_t v1yx = vrev64_f32(v1xy); + float32x2_t v2yx = vrev64_f32(v2xy); + + float32x2_t v1zz = vdup_lane_f32(vget_high_f32(V1), 0); + float32x2_t v2zz = vdup_lane_f32(vget_high_f32(V2), 0); + + XMVECTOR vResult = vmulq_f32(vcombine_f32(v1yx, v1xy), vcombine_f32(v2zz, v2yx)); + vResult = vmlsq_f32(vResult, vcombine_f32(v1zz, v1yx), vcombine_f32(v2yx, v2xy)); + vResult = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(vResult), g_XMFlipY)); + return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(vResult), g_XMMask3)); +#elif defined(_XM_SSE_INTRINSICS_) + // y1,z1,x1,w1 + XMVECTOR vTemp1 = XM_PERMUTE_PS(V1, _MM_SHUFFLE(3, 0, 2, 1)); + // z2,x2,y2,w2 + XMVECTOR vTemp2 = XM_PERMUTE_PS(V2, _MM_SHUFFLE(3, 1, 0, 2)); + // Perform the left operation + XMVECTOR vResult = _mm_mul_ps(vTemp1, vTemp2); + // z1,x1,y1,w1 + vTemp1 = XM_PERMUTE_PS(vTemp1, _MM_SHUFFLE(3, 0, 2, 1)); + // y2,z2,x2,w2 + vTemp2 = XM_PERMUTE_PS(vTemp2, _MM_SHUFFLE(3, 1, 0, 2)); + // Perform the right operation + vResult = XM_FNMADD_PS(vTemp1, vTemp2, vResult); 
+ // Set w to zero + return _mm_and_ps(vResult, g_XMMask3); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3LengthSq(FXMVECTOR V) noexcept +{ + return XMVector3Dot(V, V); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3ReciprocalLengthEst(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result = XMVector3LengthSq(V); + Result = XMVectorReciprocalSqrtEst(Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot3 + float32x4_t vTemp = vmulq_f32(V, V); + float32x2_t v1 = vget_low_f32(vTemp); + float32x2_t v2 = vget_high_f32(vTemp); + v1 = vpadd_f32(v1, v1); + v2 = vdup_lane_f32(v2, 0); + v1 = vadd_f32(v1, v2); + // Reciprocal sqrt (estimate) + v2 = vrsqrte_f32(v1); + return vcombine_f32(v2, v2); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps(V, V, 0x7f); + return _mm_rsqrt_ps(vTemp); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + vLengthSq = _mm_and_ps(vLengthSq, g_XMMask3); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_rsqrt_ps(vLengthSq); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y and z + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + // vTemp has z and y + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 2, 1, 2)); + // x+z, y + vLengthSq = _mm_add_ss(vLengthSq, vTemp); + // y,y,y,y + vTemp = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(1, 1, 1, 1)); + // x+z+y,??,??,?? + vLengthSq = _mm_add_ss(vLengthSq, vTemp); + // Splat the length squared + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + // Get the reciprocal + vLengthSq = _mm_rsqrt_ps(vLengthSq); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3ReciprocalLength(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result = XMVector3LengthSq(V); + Result = XMVectorReciprocalSqrt(Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot3 + float32x4_t vTemp = vmulq_f32(V, V); + float32x2_t v1 = vget_low_f32(vTemp); + float32x2_t v2 = vget_high_f32(vTemp); + v1 = vpadd_f32(v1, v1); + v2 = vdup_lane_f32(v2, 0); + v1 = vadd_f32(v1, v2); + // Reciprocal sqrt + float32x2_t S0 = vrsqrte_f32(v1); + float32x2_t P0 = vmul_f32(v1, S0); + float32x2_t R0 = vrsqrts_f32(P0, S0); + float32x2_t S1 = vmul_f32(S0, R0); + float32x2_t P1 = vmul_f32(v1, S1); + float32x2_t R1 = vrsqrts_f32(P1, S1); + float32x2_t Result = vmul_f32(S1, R1); + return vcombine_f32(Result, Result); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps(V, V, 0x7f); + XMVECTOR vLengthSq = _mm_sqrt_ps(vTemp); + return _mm_div_ps(g_XMOne, vLengthSq); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vDot = _mm_mul_ps(V, V); + vDot = _mm_and_ps(vDot, g_XMMask3); + vDot = _mm_hadd_ps(vDot, vDot); + vDot = _mm_hadd_ps(vDot, vDot); + vDot = _mm_sqrt_ps(vDot); + vDot = _mm_div_ps(g_XMOne, vDot); + return vDot; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product + XMVECTOR vDot = _mm_mul_ps(V, V); + // x=Dot.y, y=Dot.z + XMVECTOR vTemp = XM_PERMUTE_PS(vDot, _MM_SHUFFLE(2, 1, 2, 1)); + // Result.x = x+y + vDot = _mm_add_ss(vDot, vTemp); + // x=Dot.z + vTemp = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(1, 1, 1, 1)); + // Result.x = 
(x+y)+z + vDot = _mm_add_ss(vDot, vTemp); + // Splat x + vDot = XM_PERMUTE_PS(vDot, _MM_SHUFFLE(0, 0, 0, 0)); + // Get the reciprocal + vDot = _mm_sqrt_ps(vDot); + // Get the reciprocal + vDot = _mm_div_ps(g_XMOne, vDot); + return vDot; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3LengthEst(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result = XMVector3LengthSq(V); + Result = XMVectorSqrtEst(Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot3 + float32x4_t vTemp = vmulq_f32(V, V); + float32x2_t v1 = vget_low_f32(vTemp); + float32x2_t v2 = vget_high_f32(vTemp); + v1 = vpadd_f32(v1, v1); + v2 = vdup_lane_f32(v2, 0); + v1 = vadd_f32(v1, v2); + const float32x2_t zero = vdup_n_f32(0); + uint32x2_t VEqualsZero = vceq_f32(v1, zero); + // Sqrt (estimate) + float32x2_t Result = vrsqrte_f32(v1); + Result = vmul_f32(v1, Result); + Result = vbsl_f32(VEqualsZero, zero, Result); + return vcombine_f32(Result, Result); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps(V, V, 0x7f); + return _mm_sqrt_ps(vTemp); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + vLengthSq = _mm_and_ps(vLengthSq, g_XMMask3); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_sqrt_ps(vLengthSq); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y and z + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + // vTemp has z and y + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 2, 1, 2)); + // x+z, y + vLengthSq = _mm_add_ss(vLengthSq, vTemp); + // y,y,y,y + vTemp = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(1, 1, 1, 1)); + // x+z+y,??,??,?? 
+ vLengthSq = _mm_add_ss(vLengthSq, vTemp); + // Splat the length squared + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + // Get the length + vLengthSq = _mm_sqrt_ps(vLengthSq); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3Length(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result = XMVector3LengthSq(V); + Result = XMVectorSqrt(Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot3 + float32x4_t vTemp = vmulq_f32(V, V); + float32x2_t v1 = vget_low_f32(vTemp); + float32x2_t v2 = vget_high_f32(vTemp); + v1 = vpadd_f32(v1, v1); + v2 = vdup_lane_f32(v2, 0); + v1 = vadd_f32(v1, v2); + const float32x2_t zero = vdup_n_f32(0); + uint32x2_t VEqualsZero = vceq_f32(v1, zero); + // Sqrt + float32x2_t S0 = vrsqrte_f32(v1); + float32x2_t P0 = vmul_f32(v1, S0); + float32x2_t R0 = vrsqrts_f32(P0, S0); + float32x2_t S1 = vmul_f32(S0, R0); + float32x2_t P1 = vmul_f32(v1, S1); + float32x2_t R1 = vrsqrts_f32(P1, S1); + float32x2_t Result = vmul_f32(S1, R1); + Result = vmul_f32(v1, Result); + Result = vbsl_f32(VEqualsZero, zero, Result); + return vcombine_f32(Result, Result); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps(V, V, 0x7f); + return _mm_sqrt_ps(vTemp); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + vLengthSq = _mm_and_ps(vLengthSq, g_XMMask3); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_sqrt_ps(vLengthSq); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y and z + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + // vTemp has z and y + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 2, 1, 2)); + // x+z, y + vLengthSq = _mm_add_ss(vLengthSq, vTemp); + // y,y,y,y + vTemp = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(1, 1, 1, 1)); + // x+z+y,??,??,?? + vLengthSq = _mm_add_ss(vLengthSq, vTemp); + // Splat the length squared + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + // Get the length + vLengthSq = _mm_sqrt_ps(vLengthSq); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ +// XMVector3NormalizeEst uses a reciprocal estimate and +// returns QNaN on zero and infinite vectors. 
+ +inline XMVECTOR XM_CALLCONV XMVector3NormalizeEst(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result = XMVector3ReciprocalLength(V); + Result = XMVectorMultiply(V, Result); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot3 + float32x4_t vTemp = vmulq_f32(V, V); + float32x2_t v1 = vget_low_f32(vTemp); + float32x2_t v2 = vget_high_f32(vTemp); + v1 = vpadd_f32(v1, v1); + v2 = vdup_lane_f32(v2, 0); + v1 = vadd_f32(v1, v2); + // Reciprocal sqrt (estimate) + v2 = vrsqrte_f32(v1); + // Normalize + return vmulq_f32(V, vcombine_f32(v2, v2)); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps(V, V, 0x7f); + XMVECTOR vResult = _mm_rsqrt_ps(vTemp); + return _mm_mul_ps(vResult, V); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vDot = _mm_mul_ps(V, V); + vDot = _mm_and_ps(vDot, g_XMMask3); + vDot = _mm_hadd_ps(vDot, vDot); + vDot = _mm_hadd_ps(vDot, vDot); + vDot = _mm_rsqrt_ps(vDot); + vDot = _mm_mul_ps(vDot, V); + return vDot; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product + XMVECTOR vDot = _mm_mul_ps(V, V); + // x=Dot.y, y=Dot.z + XMVECTOR vTemp = XM_PERMUTE_PS(vDot, _MM_SHUFFLE(2, 1, 2, 1)); + // Result.x = x+y + vDot = _mm_add_ss(vDot, vTemp); + // x=Dot.z + vTemp = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(1, 1, 1, 1)); + // Result.x = (x+y)+z + vDot = _mm_add_ss(vDot, vTemp); + // Splat x + vDot = XM_PERMUTE_PS(vDot, _MM_SHUFFLE(0, 0, 0, 0)); + // Get the reciprocal + vDot = _mm_rsqrt_ps(vDot); + // Perform the normalization + vDot = _mm_mul_ps(vDot, V); + return vDot; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3Normalize(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + float fLength; + XMVECTOR vResult; + + vResult = XMVector3Length(V); + fLength = vResult.vector4_f32[0]; + + // Prevent divide by zero + if (fLength > 0) + { + fLength = 1.0f / fLength; + } + + vResult.vector4_f32[0] = V.vector4_f32[0] * fLength; + vResult.vector4_f32[1] = V.vector4_f32[1] * fLength; + vResult.vector4_f32[2] = V.vector4_f32[2] * fLength; + vResult.vector4_f32[3] = V.vector4_f32[3] * fLength; + return vResult; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot3 + float32x4_t vTemp = vmulq_f32(V, V); + float32x2_t v1 = vget_low_f32(vTemp); + float32x2_t v2 = vget_high_f32(vTemp); + v1 = vpadd_f32(v1, v1); + v2 = vdup_lane_f32(v2, 0); + v1 = vadd_f32(v1, v2); + uint32x2_t VEqualsZero = vceq_f32(v1, vdup_n_f32(0)); + uint32x2_t VEqualsInf = vceq_f32(v1, vget_low_f32(g_XMInfinity)); + // Reciprocal sqrt (2 iterations of Newton-Raphson) + float32x2_t S0 = vrsqrte_f32(v1); + float32x2_t P0 = vmul_f32(v1, S0); + float32x2_t R0 = vrsqrts_f32(P0, S0); + float32x2_t S1 = vmul_f32(S0, R0); + float32x2_t P1 = vmul_f32(v1, S1); + float32x2_t R1 = vrsqrts_f32(P1, S1); + v2 = vmul_f32(S1, R1); + // Normalize + XMVECTOR vResult = vmulq_f32(V, vcombine_f32(v2, v2)); + vResult = vbslq_f32(vcombine_u32(VEqualsZero, VEqualsZero), vdupq_n_f32(0), vResult); + return vbslq_f32(vcombine_u32(VEqualsInf, VEqualsInf), g_XMQNaN, vResult); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vLengthSq = _mm_dp_ps(V, V, 0x7f); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Create zero with a single instruction + XMVECTOR vZeroMask = _mm_setzero_ps(); + // Test for a divide by zero (Must be FP to detect -0.0) + vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult); + // Failsafe on zero (Or epsilon) length planes + // If the length is 
infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity); + // Divide to perform the normalization + vResult = _mm_div_ps(V, vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult, vZeroMask); + // Select qnan or result based on infinite length + XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq, g_XMQNaN); + XMVECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq); + vResult = _mm_or_ps(vTemp1, vTemp2); + return vResult; +#elif defined(_XM_SSE3_INTRINSICS_) + // Perform the dot product on x,y and z only + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + vLengthSq = _mm_and_ps(vLengthSq, g_XMMask3); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Create zero with a single instruction + XMVECTOR vZeroMask = _mm_setzero_ps(); + // Test for a divide by zero (Must be FP to detect -0.0) + vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity); + // Divide to perform the normalization + vResult = _mm_div_ps(V, vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult, vZeroMask); + // Select qnan or result based on infinite length + XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq, g_XMQNaN); + XMVECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq); + vResult = _mm_or_ps(vTemp1, vTemp2); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y and z only + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(2, 1, 2, 1)); + vLengthSq = _mm_add_ss(vLengthSq, vTemp); + vTemp = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(1, 1, 1, 1)); + vLengthSq = _mm_add_ss(vLengthSq, vTemp); + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Create zero with a single instruction + XMVECTOR vZeroMask = _mm_setzero_ps(); + // Test for a divide by zero (Must be FP to detect -0.0) + vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity); + // Divide to perform the normalization + vResult = _mm_div_ps(V, vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult, vZeroMask); + // Select qnan or result based on infinite length + XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq, g_XMQNaN); + XMVECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq); + vResult = _mm_or_ps(vTemp1, vTemp2); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3ClampLength +( + FXMVECTOR V, + float LengthMin, + float LengthMax +) noexcept +{ + XMVECTOR ClampMax = XMVectorReplicate(LengthMax); + XMVECTOR ClampMin = XMVectorReplicate(LengthMin); + + return XMVector3ClampLengthV(V, ClampMin, ClampMax); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3ClampLengthV +( + FXMVECTOR V, + FXMVECTOR LengthMin, + FXMVECTOR LengthMax +) noexcept +{ + assert((XMVectorGetY(LengthMin) == XMVectorGetX(LengthMin)) && (XMVectorGetZ(LengthMin) == XMVectorGetX(LengthMin))); + assert((XMVectorGetY(LengthMax) == XMVectorGetX(LengthMax)) && 
(XMVectorGetZ(LengthMax) == XMVectorGetX(LengthMax))); + assert(XMVector3GreaterOrEqual(LengthMin, XMVectorZero())); + assert(XMVector3GreaterOrEqual(LengthMax, XMVectorZero())); + assert(XMVector3GreaterOrEqual(LengthMax, LengthMin)); + + XMVECTOR LengthSq = XMVector3LengthSq(V); + + const XMVECTOR Zero = XMVectorZero(); + + XMVECTOR RcpLength = XMVectorReciprocalSqrt(LengthSq); + + XMVECTOR InfiniteLength = XMVectorEqualInt(LengthSq, g_XMInfinity.v); + XMVECTOR ZeroLength = XMVectorEqual(LengthSq, Zero); + + XMVECTOR Normal = XMVectorMultiply(V, RcpLength); + + XMVECTOR Length = XMVectorMultiply(LengthSq, RcpLength); + + XMVECTOR Select = XMVectorEqualInt(InfiniteLength, ZeroLength); + Length = XMVectorSelect(LengthSq, Length, Select); + Normal = XMVectorSelect(LengthSq, Normal, Select); + + XMVECTOR ControlMax = XMVectorGreater(Length, LengthMax); + XMVECTOR ControlMin = XMVectorLess(Length, LengthMin); + + XMVECTOR ClampLength = XMVectorSelect(Length, LengthMax, ControlMax); + ClampLength = XMVectorSelect(ClampLength, LengthMin, ControlMin); + + XMVECTOR Result = XMVectorMultiply(Normal, ClampLength); + + // Preserve the original vector (with no precision loss) if the length falls within the given range + XMVECTOR Control = XMVectorEqualInt(ControlMax, ControlMin); + Result = XMVectorSelect(Result, V, Control); + + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3Reflect +( + FXMVECTOR Incident, + FXMVECTOR Normal +) noexcept +{ + // Result = Incident - (2 * dot(Incident, Normal)) * Normal + + XMVECTOR Result = XMVector3Dot(Incident, Normal); + Result = XMVectorAdd(Result, Result); + Result = XMVectorNegativeMultiplySubtract(Result, Normal, Incident); + + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3Refract +( + FXMVECTOR Incident, + FXMVECTOR Normal, + float RefractionIndex +) noexcept +{ + XMVECTOR Index = XMVectorReplicate(RefractionIndex); + return XMVector3RefractV(Incident, Normal, Index); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3RefractV +( + FXMVECTOR Incident, + FXMVECTOR Normal, + FXMVECTOR RefractionIndex +) noexcept +{ + // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) + + // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal)))) + +#if defined(_XM_NO_INTRINSICS_) + + const XMVECTOR Zero = XMVectorZero(); + + XMVECTOR IDotN = XMVector3Dot(Incident, Normal); + + // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) + XMVECTOR R = XMVectorNegativeMultiplySubtract(IDotN, IDotN, g_XMOne.v); + R = XMVectorMultiply(R, RefractionIndex); + R = XMVectorNegativeMultiplySubtract(R, RefractionIndex, g_XMOne.v); + + if (XMVector4LessOrEqual(R, Zero)) + { + // Total internal reflection + return Zero; + } + else + { + // R = RefractionIndex * IDotN + sqrt(R) + R = XMVectorSqrt(R); + R = XMVectorMultiplyAdd(RefractionIndex, IDotN, R); + + // Result = RefractionIndex * Incident - Normal * R + XMVECTOR Result = XMVectorMultiply(RefractionIndex, Incident); + Result = XMVectorNegativeMultiplySubtract(Normal, R, Result); + + return Result; + } + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR IDotN = XMVector3Dot(Incident, Normal); + + // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) + 
float32x4_t R = vmlsq_f32(g_XMOne, IDotN, IDotN); + R = vmulq_f32(R, RefractionIndex); + R = vmlsq_f32(g_XMOne, R, RefractionIndex); + + uint32x4_t isrzero = vcleq_f32(R, g_XMZero); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(isrzero)), vget_high_u8(vreinterpretq_u8_u32(isrzero))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + + float32x4_t vResult; + if (vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) == 0xFFFFFFFFU) + { + // Total internal reflection + vResult = g_XMZero; + } + else + { + // Sqrt(R) + float32x4_t S0 = vrsqrteq_f32(R); + float32x4_t P0 = vmulq_f32(R, S0); + float32x4_t R0 = vrsqrtsq_f32(P0, S0); + float32x4_t S1 = vmulq_f32(S0, R0); + float32x4_t P1 = vmulq_f32(R, S1); + float32x4_t R1 = vrsqrtsq_f32(P1, S1); + float32x4_t S2 = vmulq_f32(S1, R1); + R = vmulq_f32(R, S2); + // R = RefractionIndex * IDotN + sqrt(R) + R = vmlaq_f32(R, RefractionIndex, IDotN); + // Result = RefractionIndex * Incident - Normal * R + vResult = vmulq_f32(RefractionIndex, Incident); + vResult = vmlsq_f32(vResult, R, Normal); + } + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) + + // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal)))) + XMVECTOR IDotN = XMVector3Dot(Incident, Normal); + // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) + XMVECTOR R = XM_FNMADD_PS(IDotN, IDotN, g_XMOne); + XMVECTOR R2 = _mm_mul_ps(RefractionIndex, RefractionIndex); + R = XM_FNMADD_PS(R, R2, g_XMOne); + + XMVECTOR vResult = _mm_cmple_ps(R, g_XMZero); + if (_mm_movemask_ps(vResult) == 0x0f) + { + // Total internal reflection + vResult = g_XMZero; + } + else + { + // R = RefractionIndex * IDotN + sqrt(R) + R = _mm_sqrt_ps(R); + R = XM_FMADD_PS(RefractionIndex, IDotN, R); + // Result = RefractionIndex * Incident - Normal * R + vResult = _mm_mul_ps(RefractionIndex, Incident); + vResult = XM_FNMADD_PS(R, Normal, vResult); + } + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3Orthogonal(FXMVECTOR V) noexcept +{ + XMVECTOR Zero = XMVectorZero(); + XMVECTOR Z = XMVectorSplatZ(V); + XMVECTOR YZYY = XMVectorSwizzle(V); + + XMVECTOR NegativeV = XMVectorSubtract(Zero, V); + + XMVECTOR ZIsNegative = XMVectorLess(Z, Zero); + XMVECTOR YZYYIsNegative = XMVectorLess(YZYY, Zero); + + XMVECTOR S = XMVectorAdd(YZYY, Z); + XMVECTOR D = XMVectorSubtract(YZYY, Z); + + XMVECTOR Select = XMVectorEqualInt(ZIsNegative, YZYYIsNegative); + + XMVECTOR R0 = XMVectorPermute(NegativeV, S); + XMVECTOR R1 = XMVectorPermute(V, D); + + return XMVectorSelect(R1, R0, Select); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3AngleBetweenNormalsEst +( + FXMVECTOR N1, + FXMVECTOR N2 +) noexcept +{ + XMVECTOR Result = XMVector3Dot(N1, N2); + Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v); + Result = XMVectorACosEst(Result); + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3AngleBetweenNormals +( + FXMVECTOR N1, + FXMVECTOR N2 +) noexcept +{ + XMVECTOR Result = XMVector3Dot(N1, N2); + Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v); + Result = XMVectorACos(Result); + return Result; +} + 
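// Illustrative usage (editorial sketch, not part of the upstream header): the
// angle helpers above return the angle in radians splatted across all lanes, so
// a caller typically extracts a single component. Variable names here are
// hypothetical.
//
//   XMVECTOR n0 = XMVector3Normalize(XMVectorSet(1.0f, 0.0f, 0.0f, 0.0f));
//   XMVECTOR n1 = XMVector3Normalize(XMVectorSet(0.0f, 1.0f, 0.0f, 0.0f));
//   float radians = XMVectorGetX(XMVector3AngleBetweenNormals(n0, n1));
//   // radians is approximately XM_PIDIV2 for perpendicular unit vectors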
+//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3AngleBetweenVectors +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ + XMVECTOR L1 = XMVector3ReciprocalLength(V1); + XMVECTOR L2 = XMVector3ReciprocalLength(V2); + + XMVECTOR Dot = XMVector3Dot(V1, V2); + + L1 = XMVectorMultiply(L1, L2); + + XMVECTOR CosAngle = XMVectorMultiply(Dot, L1); + CosAngle = XMVectorClamp(CosAngle, g_XMNegativeOne.v, g_XMOne.v); + + return XMVectorACos(CosAngle); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3LinePointDistance +( + FXMVECTOR LinePoint1, + FXMVECTOR LinePoint2, + FXMVECTOR Point +) noexcept +{ + // Given a vector PointVector from LinePoint1 to Point and a vector + // LineVector from LinePoint1 to LinePoint2, the scaled distance + // PointProjectionScale from LinePoint1 to the perpendicular projection + // of PointVector onto the line is defined as: + // + // PointProjectionScale = dot(PointVector, LineVector) / LengthSq(LineVector) + + XMVECTOR PointVector = XMVectorSubtract(Point, LinePoint1); + XMVECTOR LineVector = XMVectorSubtract(LinePoint2, LinePoint1); + + XMVECTOR LengthSq = XMVector3LengthSq(LineVector); + + XMVECTOR PointProjectionScale = XMVector3Dot(PointVector, LineVector); + PointProjectionScale = XMVectorDivide(PointProjectionScale, LengthSq); + + XMVECTOR DistanceVector = XMVectorMultiply(LineVector, PointProjectionScale); + DistanceVector = XMVectorSubtract(PointVector, DistanceVector); + + return XMVector3Length(DistanceVector); +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline void XM_CALLCONV XMVector3ComponentsFromNormal +( + XMVECTOR* pParallel, + XMVECTOR* pPerpendicular, + FXMVECTOR V, + FXMVECTOR Normal +) noexcept +{ + assert(pParallel != nullptr); + assert(pPerpendicular != nullptr); + + XMVECTOR Scale = XMVector3Dot(V, Normal); + + XMVECTOR Parallel = XMVectorMultiply(Normal, Scale); + + *pParallel = Parallel; + *pPerpendicular = XMVectorSubtract(V, Parallel); +} + +//------------------------------------------------------------------------------ +// Transform a vector using a rotation expressed as a unit quaternion + +inline XMVECTOR XM_CALLCONV XMVector3Rotate +( + FXMVECTOR V, + FXMVECTOR RotationQuaternion +) noexcept +{ + XMVECTOR A = XMVectorSelect(g_XMSelect1110.v, V, g_XMSelect1110.v); + XMVECTOR Q = XMQuaternionConjugate(RotationQuaternion); + XMVECTOR Result = XMQuaternionMultiply(Q, A); + return XMQuaternionMultiply(Result, RotationQuaternion); +} + +//------------------------------------------------------------------------------ +// Transform a vector using the inverse of a rotation expressed as a unit quaternion + +inline XMVECTOR XM_CALLCONV XMVector3InverseRotate +( + FXMVECTOR V, + FXMVECTOR RotationQuaternion +) noexcept +{ + XMVECTOR A = XMVectorSelect(g_XMSelect1110.v, V, g_XMSelect1110.v); + XMVECTOR Result = XMQuaternionMultiply(RotationQuaternion, A); + XMVECTOR Q = XMQuaternionConjugate(RotationQuaternion); + return XMQuaternionMultiply(Result, Q); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3Transform +( + FXMVECTOR V, + FXMMATRIX M +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Z = XMVectorSplatZ(V); + XMVECTOR Y = XMVectorSplatY(V); + XMVECTOR X = XMVectorSplatX(V); + + XMVECTOR Result = XMVectorMultiplyAdd(Z, M.r[2], M.r[3]); + 
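// The accumulation below treats the input as (x, y, z, 1): M.r[3] is folded in
// with the Z term, and the vector's own w component never contributes, which is
// the documented contract of XMVector3Transform (transform a point with w
// assumed to be 1).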
Result = XMVectorMultiplyAdd(Y, M.r[1], Result); + Result = XMVectorMultiplyAdd(X, M.r[0], Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); + XMVECTOR vResult = vmlaq_lane_f32(M.r[3], M.r[0], VL, 0); // X + vResult = vmlaq_lane_f32(vResult, M.r[1], VL, 1); // Y + return vmlaq_lane_f32(vResult, M.r[2], vget_high_f32(V), 0); // Z +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); // Z + vResult = XM_FMADD_PS(vResult, M.r[2], M.r[3]); + XMVECTOR vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); // Y + vResult = XM_FMADD_PS(vTemp, M.r[1], vResult); + vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); // X + vResult = XM_FMADD_PS(vTemp, M.r[0], vResult); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +#ifdef _PREFAST_ +#pragma prefast(push) +#pragma prefast(disable : 26015 26019, "PREfast noise: Esp:1307" ) +#endif + +_Use_decl_annotations_ +inline XMFLOAT4* XM_CALLCONV XMVector3TransformStream +( + XMFLOAT4* pOutputStream, + size_t OutputStride, + const XMFLOAT3* pInputStream, + size_t InputStride, + size_t VectorCount, + FXMMATRIX M +) noexcept +{ + assert(pOutputStream != nullptr); + assert(pInputStream != nullptr); + + assert(InputStride >= sizeof(XMFLOAT3)); + _Analysis_assume_(InputStride >= sizeof(XMFLOAT3)); + + assert(OutputStride >= sizeof(XMFLOAT4)); + _Analysis_assume_(OutputStride >= sizeof(XMFLOAT4)); + +#if defined(_XM_NO_INTRINSICS_) + + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row2 = M.r[2]; + const XMVECTOR row3 = M.r[3]; + + for (size_t i = 0; i < VectorCount; i++) + { + XMVECTOR V = XMLoadFloat3(reinterpret_cast(pInputVector)); + XMVECTOR Z = XMVectorSplatZ(V); + XMVECTOR Y = XMVectorSplatY(V); + XMVECTOR X = XMVectorSplatX(V); + + XMVECTOR Result = XMVectorMultiplyAdd(Z, row2, row3); + Result = XMVectorMultiplyAdd(Y, row1, Result); + Result = XMVectorMultiplyAdd(X, row0, Result); + + XMStoreFloat4(reinterpret_cast(pOutputVector), Result); + + pInputVector += InputStride; + pOutputVector += OutputStride; + } + + return pOutputStream; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row2 = M.r[2]; + const XMVECTOR row3 = M.r[3]; + + size_t i = 0; + size_t four = VectorCount >> 2; + if (four > 0) + { + if ((InputStride == sizeof(XMFLOAT3)) && (OutputStride == sizeof(XMFLOAT4))) + { + for (size_t j = 0; j < four; ++j) + { + float32x4x3_t V = vld3q_f32(reinterpret_cast(pInputVector)); + pInputVector += sizeof(XMFLOAT3) * 4; + + float32x2_t r3 = vget_low_f32(row3); + float32x2_t r = vget_low_f32(row0); + XMVECTOR vResult0 = vmlaq_lane_f32(vdupq_lane_f32(r3, 0), V.val[0], r, 0); // Ax+M + XMVECTOR vResult1 = vmlaq_lane_f32(vdupq_lane_f32(r3, 1), V.val[0], r, 1); // Bx+N + + XM_PREFETCH(pInputVector); + + r3 = vget_high_f32(row3); + r = vget_high_f32(row0); + XMVECTOR vResult2 = vmlaq_lane_f32(vdupq_lane_f32(r3, 0), V.val[0], r, 0); // Cx+O + XMVECTOR vResult3 = vmlaq_lane_f32(vdupq_lane_f32(r3, 1), V.val[0], r, 1); // Dx+P + + XM_PREFETCH(pInputVector + XM_CACHE_LINE_SIZE); + + r = vget_low_f32(row1); + vResult0 = vmlaq_lane_f32(vResult0, V.val[1], r, 0); // Ax+Ey+M + vResult1 
= vmlaq_lane_f32(vResult1, V.val[1], r, 1); // Bx+Fy+N + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 2)); + + r = vget_high_f32(row1); + vResult2 = vmlaq_lane_f32(vResult2, V.val[1], r, 0); // Cx+Gy+O + vResult3 = vmlaq_lane_f32(vResult3, V.val[1], r, 1); // Dx+Hy+P + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 3)); + + r = vget_low_f32(row2); + vResult0 = vmlaq_lane_f32(vResult0, V.val[2], r, 0); // Ax+Ey+Iz+M + vResult1 = vmlaq_lane_f32(vResult1, V.val[2], r, 1); // Bx+Fy+Jz+N + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 4)); + + r = vget_high_f32(row2); + vResult2 = vmlaq_lane_f32(vResult2, V.val[2], r, 0); // Cx+Gy+Kz+O + vResult3 = vmlaq_lane_f32(vResult3, V.val[2], r, 1); // Dx+Hy+Lz+P + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 5)); + + float32x4x4_t R; + R.val[0] = vResult0; + R.val[1] = vResult1; + R.val[2] = vResult2; + R.val[3] = vResult3; + + vst4q_f32(reinterpret_cast(pOutputVector), R); + pOutputVector += sizeof(XMFLOAT4) * 4; + + i += 4; + } + } + } + + for (; i < VectorCount; i++) + { + float32x2_t VL = vld1_f32(reinterpret_cast(pInputVector)); + float32x2_t zero = vdup_n_f32(0); + float32x2_t VH = vld1_lane_f32(reinterpret_cast(pInputVector) + 2, zero, 0); + pInputVector += InputStride; + + XMVECTOR vResult = vmlaq_lane_f32(row3, row0, VL, 0); // X + vResult = vmlaq_lane_f32(vResult, row1, VL, 1); // Y + vResult = vmlaq_lane_f32(vResult, row2, VH, 0); // Z + + vst1q_f32(reinterpret_cast(pOutputVector), vResult); + pOutputVector += OutputStride; + } + + return pOutputStream; +#elif defined(_XM_SSE_INTRINSICS_) + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row2 = M.r[2]; + const XMVECTOR row3 = M.r[3]; + + size_t i = 0; + size_t four = VectorCount >> 2; + if (four > 0) + { + if (InputStride == sizeof(XMFLOAT3)) + { + if (!(reinterpret_cast(pOutputStream) & 0xF) && !(OutputStride & 0xF)) + { + // Packed input, aligned output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps(reinterpret_cast(pInputVector)); + __m128 L2 = _mm_loadu_ps(reinterpret_cast(pInputVector + 16)); + __m128 L3 = _mm_loadu_ps(reinterpret_cast(pInputVector + 32)); + pInputVector += sizeof(XMFLOAT3) * 4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1, L2, L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Z, row2, row3); + XMVECTOR vTemp2 = _mm_mul_ps(Y, row1); + XMVECTOR vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + XM_STREAM_PS(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 2 + Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, row2, row3); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + XM_STREAM_PS(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 3 + Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, row2, row3); + vTemp2 = _mm_mul_ps(Y, 
row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + XM_STREAM_PS(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 4 + Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, row2, row3); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + XM_STREAM_PS(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + i += 4; + } + } + else + { + // Packed input, unaligned output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps(reinterpret_cast(pInputVector)); + __m128 L2 = _mm_loadu_ps(reinterpret_cast(pInputVector + 16)); + __m128 L3 = _mm_loadu_ps(reinterpret_cast(pInputVector + 32)); + pInputVector += sizeof(XMFLOAT3) * 4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1, L2, L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Z, row2, row3); + XMVECTOR vTemp2 = _mm_mul_ps(Y, row1); + XMVECTOR vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + _mm_storeu_ps(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 2 + Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, row2, row3); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + _mm_storeu_ps(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 3 + Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, row2, row3); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + _mm_storeu_ps(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 4 + Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, row2, row3); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + _mm_storeu_ps(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + i += 4; + } + } + } + } + + if (!(reinterpret_cast(pOutputStream) & 0xF) && !(OutputStride & 0xF)) + { + // Aligned output + for (; i < VectorCount; ++i) + { + XMVECTOR V = XMLoadFloat3(reinterpret_cast(pInputVector)); + pInputVector += InputStride; + + XMVECTOR Z = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Z, row2, row3); + XMVECTOR vTemp2 = _mm_mul_ps(Y, row1); + XMVECTOR vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + XM_STREAM_PS(reinterpret_cast(pOutputVector), vTemp); + pOutputVector 
+= OutputStride; + } + } + else + { + // Unaligned output + for (; i < VectorCount; ++i) + { + XMVECTOR V = XMLoadFloat3(reinterpret_cast(pInputVector)); + pInputVector += InputStride; + + XMVECTOR Z = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Z, row2, row3); + XMVECTOR vTemp2 = _mm_mul_ps(Y, row1); + XMVECTOR vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + _mm_storeu_ps(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + } + } + + XM_SFENCE(); + + return pOutputStream; +#endif +} + +#ifdef _PREFAST_ +#pragma prefast(pop) +#endif + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3TransformCoord +( + FXMVECTOR V, + FXMMATRIX M +) noexcept +{ + XMVECTOR Z = XMVectorSplatZ(V); + XMVECTOR Y = XMVectorSplatY(V); + XMVECTOR X = XMVectorSplatX(V); + + XMVECTOR Result = XMVectorMultiplyAdd(Z, M.r[2], M.r[3]); + Result = XMVectorMultiplyAdd(Y, M.r[1], Result); + Result = XMVectorMultiplyAdd(X, M.r[0], Result); + + XMVECTOR W = XMVectorSplatW(Result); + return XMVectorDivide(Result, W); +} + +//------------------------------------------------------------------------------ + +#ifdef _PREFAST_ +#pragma prefast(push) +#pragma prefast(disable : 26015 26019, "PREfast noise: Esp:1307" ) +#endif + +_Use_decl_annotations_ +inline XMFLOAT3* XM_CALLCONV XMVector3TransformCoordStream +( + XMFLOAT3* pOutputStream, + size_t OutputStride, + const XMFLOAT3* pInputStream, + size_t InputStride, + size_t VectorCount, + FXMMATRIX M +) noexcept +{ + assert(pOutputStream != nullptr); + assert(pInputStream != nullptr); + + assert(InputStride >= sizeof(XMFLOAT3)); + _Analysis_assume_(InputStride >= sizeof(XMFLOAT3)); + + assert(OutputStride >= sizeof(XMFLOAT3)); + _Analysis_assume_(OutputStride >= sizeof(XMFLOAT3)); + +#if defined(_XM_NO_INTRINSICS_) + + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row2 = M.r[2]; + const XMVECTOR row3 = M.r[3]; + + for (size_t i = 0; i < VectorCount; i++) + { + XMVECTOR V = XMLoadFloat3(reinterpret_cast(pInputVector)); + XMVECTOR Z = XMVectorSplatZ(V); + XMVECTOR Y = XMVectorSplatY(V); + XMVECTOR X = XMVectorSplatX(V); + + XMVECTOR Result = XMVectorMultiplyAdd(Z, row2, row3); + Result = XMVectorMultiplyAdd(Y, row1, Result); + Result = XMVectorMultiplyAdd(X, row0, Result); + + XMVECTOR W = XMVectorSplatW(Result); + + Result = XMVectorDivide(Result, W); + + XMStoreFloat3(reinterpret_cast(pOutputVector), Result); + + pInputVector += InputStride; + pOutputVector += OutputStride; + } + + return pOutputStream; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row2 = M.r[2]; + const XMVECTOR row3 = M.r[3]; + + size_t i = 0; + size_t four = VectorCount >> 2; + if (four > 0) + { + if ((InputStride == sizeof(XMFLOAT3)) && (OutputStride == sizeof(XMFLOAT3))) + { + for (size_t j = 0; j < four; ++j) + { + float32x4x3_t V = vld3q_f32(reinterpret_cast(pInputVector)); + pInputVector += sizeof(XMFLOAT3) * 4; + + float32x2_t r3 = vget_low_f32(row3); + float32x2_t r = 
vget_low_f32(row0); + XMVECTOR vResult0 = vmlaq_lane_f32(vdupq_lane_f32(r3, 0), V.val[0], r, 0); // Ax+M + XMVECTOR vResult1 = vmlaq_lane_f32(vdupq_lane_f32(r3, 1), V.val[0], r, 1); // Bx+N + + XM_PREFETCH(pInputVector); + + r3 = vget_high_f32(row3); + r = vget_high_f32(row0); + XMVECTOR vResult2 = vmlaq_lane_f32(vdupq_lane_f32(r3, 0), V.val[0], r, 0); // Cx+O + XMVECTOR W = vmlaq_lane_f32(vdupq_lane_f32(r3, 1), V.val[0], r, 1); // Dx+P + + XM_PREFETCH(pInputVector + XM_CACHE_LINE_SIZE); + + r = vget_low_f32(row1); + vResult0 = vmlaq_lane_f32(vResult0, V.val[1], r, 0); // Ax+Ey+M + vResult1 = vmlaq_lane_f32(vResult1, V.val[1], r, 1); // Bx+Fy+N + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 2)); + + r = vget_high_f32(row1); + vResult2 = vmlaq_lane_f32(vResult2, V.val[1], r, 0); // Cx+Gy+O + W = vmlaq_lane_f32(W, V.val[1], r, 1); // Dx+Hy+P + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 3)); + + r = vget_low_f32(row2); + vResult0 = vmlaq_lane_f32(vResult0, V.val[2], r, 0); // Ax+Ey+Iz+M + vResult1 = vmlaq_lane_f32(vResult1, V.val[2], r, 1); // Bx+Fy+Jz+N + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 4)); + + r = vget_high_f32(row2); + vResult2 = vmlaq_lane_f32(vResult2, V.val[2], r, 0); // Cx+Gy+Kz+O + W = vmlaq_lane_f32(W, V.val[2], r, 1); // Dx+Hy+Lz+P + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 5)); + +#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__ + V.val[0] = vdivq_f32(vResult0, W); + V.val[1] = vdivq_f32(vResult1, W); + V.val[2] = vdivq_f32(vResult2, W); +#else + // 2 iterations of Newton-Raphson refinement of reciprocal + float32x4_t Reciprocal = vrecpeq_f32(W); + float32x4_t S = vrecpsq_f32(Reciprocal, W); + Reciprocal = vmulq_f32(S, Reciprocal); + S = vrecpsq_f32(Reciprocal, W); + Reciprocal = vmulq_f32(S, Reciprocal); + + V.val[0] = vmulq_f32(vResult0, Reciprocal); + V.val[1] = vmulq_f32(vResult1, Reciprocal); + V.val[2] = vmulq_f32(vResult2, Reciprocal); +#endif + + vst3q_f32(reinterpret_cast(pOutputVector), V); + pOutputVector += sizeof(XMFLOAT3) * 4; + + i += 4; + } + } + } + + for (; i < VectorCount; i++) + { + float32x2_t VL = vld1_f32(reinterpret_cast(pInputVector)); + float32x2_t zero = vdup_n_f32(0); + float32x2_t VH = vld1_lane_f32(reinterpret_cast(pInputVector) + 2, zero, 0); + pInputVector += InputStride; + + XMVECTOR vResult = vmlaq_lane_f32(row3, row0, VL, 0); // X + vResult = vmlaq_lane_f32(vResult, row1, VL, 1); // Y + vResult = vmlaq_lane_f32(vResult, row2, VH, 0); // Z + + VH = vget_high_f32(vResult); + XMVECTOR W = vdupq_lane_f32(VH, 1); + +#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__ + vResult = vdivq_f32(vResult, W); +#else + // 2 iterations of Newton-Raphson refinement of reciprocal for W + float32x4_t Reciprocal = vrecpeq_f32(W); + float32x4_t S = vrecpsq_f32(Reciprocal, W); + Reciprocal = vmulq_f32(S, Reciprocal); + S = vrecpsq_f32(Reciprocal, W); + Reciprocal = vmulq_f32(S, Reciprocal); + + vResult = vmulq_f32(vResult, Reciprocal); +#endif + + VL = vget_low_f32(vResult); + vst1_f32(reinterpret_cast(pOutputVector), VL); + vst1q_lane_f32(reinterpret_cast(pOutputVector) + 2, vResult, 2); + pOutputVector += OutputStride; + } + + return pOutputStream; +#elif defined(_XM_SSE_INTRINSICS_) + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row2 = M.r[2]; + const XMVECTOR row3 = M.r[3]; + + size_t i = 0; + 
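// Strategy note for the SSE path below: when the input stream is tightly packed
// (InputStride == sizeof(XMFLOAT3)), vectors are processed four at a time via
// XM3UNPACK3INTO4, with specialized branches for packed and aligned outputs; any
// remainder, and any non-packed stride, falls through to the scalar per-vector
// loop at the end, which finishes with the homogeneous divide by w.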
size_t four = VectorCount >> 2; + if (four > 0) + { + if (InputStride == sizeof(XMFLOAT3)) + { + if (OutputStride == sizeof(XMFLOAT3)) + { + if (!(reinterpret_cast(pOutputStream) & 0xF)) + { + // Packed input, aligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps(reinterpret_cast(pInputVector)); + __m128 L2 = _mm_loadu_ps(reinterpret_cast(pInputVector + 16)); + __m128 L3 = _mm_loadu_ps(reinterpret_cast(pInputVector + 32)); + pInputVector += sizeof(XMFLOAT3) * 4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1, L2, L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Z, row2, row3); + XMVECTOR vTemp2 = _mm_mul_ps(Y, row1); + XMVECTOR vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + + V1 = _mm_div_ps(vTemp, W); + + // Result 2 + Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, row2, row3); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + + V2 = _mm_div_ps(vTemp, W); + + // Result 3 + Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, row2, row3); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + + V3 = _mm_div_ps(vTemp, W); + + // Result 4 + Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, row2, row3); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + + V4 = _mm_div_ps(vTemp, W); + + // Pack and store the vectors + XM3PACK4INTO3(vTemp); + XM_STREAM_PS(reinterpret_cast(pOutputVector), V1); + XM_STREAM_PS(reinterpret_cast(pOutputVector + 16), vTemp); + XM_STREAM_PS(reinterpret_cast(pOutputVector + 32), V3); + pOutputVector += sizeof(XMFLOAT3) * 4; + i += 4; + } + } + else + { + // Packed input, unaligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps(reinterpret_cast(pInputVector)); + __m128 L2 = _mm_loadu_ps(reinterpret_cast(pInputVector + 16)); + __m128 L3 = _mm_loadu_ps(reinterpret_cast(pInputVector + 32)); + pInputVector += sizeof(XMFLOAT3) * 4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1, L2, L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Z, row2, row3); + XMVECTOR vTemp2 = _mm_mul_ps(Y, row1); + XMVECTOR vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + + V1 = 
_mm_div_ps(vTemp, W); + + // Result 2 + Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, row2, row3); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + + V2 = _mm_div_ps(vTemp, W); + + // Result 3 + Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, row2, row3); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + + V3 = _mm_div_ps(vTemp, W); + + // Result 4 + Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, row2, row3); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + + V4 = _mm_div_ps(vTemp, W); + + // Pack and store the vectors + XM3PACK4INTO3(vTemp); + _mm_storeu_ps(reinterpret_cast(pOutputVector), V1); + _mm_storeu_ps(reinterpret_cast(pOutputVector + 16), vTemp); + _mm_storeu_ps(reinterpret_cast(pOutputVector + 32), V3); + pOutputVector += sizeof(XMFLOAT3) * 4; + i += 4; + } + } + } + else + { + // Packed input, unpacked output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps(reinterpret_cast(pInputVector)); + __m128 L2 = _mm_loadu_ps(reinterpret_cast(pInputVector + 16)); + __m128 L3 = _mm_loadu_ps(reinterpret_cast(pInputVector + 32)); + pInputVector += sizeof(XMFLOAT3) * 4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1, L2, L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Z, row2, row3); + XMVECTOR vTemp2 = _mm_mul_ps(Y, row1); + XMVECTOR vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + + vTemp = _mm_div_ps(vTemp, W); + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 2 + Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, row2, row3); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + + vTemp = _mm_div_ps(vTemp, W); + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 3 + Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, row2, row3); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + + vTemp = _mm_div_ps(vTemp, W); + 
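// Dividing by the splatted w lane performs the homogeneous (perspective)
// divide, mirroring what XMVector3TransformCoord does for a single vector
// before the result is stored as an XMFLOAT3.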
XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 4 + Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, row2, row3); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + + vTemp = _mm_div_ps(vTemp, W); + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + i += 4; + } + } + } + } + + for (; i < VectorCount; i++) + { + XMVECTOR V = XMLoadFloat3(reinterpret_cast(pInputVector)); + pInputVector += InputStride; + + XMVECTOR Z = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Z, row2, row3); + XMVECTOR vTemp2 = _mm_mul_ps(Y, row1); + XMVECTOR vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + + vTemp = _mm_div_ps(vTemp, W); + + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + } + + XM_SFENCE(); + + return pOutputStream; +#endif +} + +#ifdef _PREFAST_ +#pragma prefast(pop) +#endif + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3TransformNormal +( + FXMVECTOR V, + FXMMATRIX M +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Z = XMVectorSplatZ(V); + XMVECTOR Y = XMVectorSplatY(V); + XMVECTOR X = XMVectorSplatX(V); + + XMVECTOR Result = XMVectorMultiply(Z, M.r[2]); + Result = XMVectorMultiplyAdd(Y, M.r[1], Result); + Result = XMVectorMultiplyAdd(X, M.r[0], Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); + XMVECTOR vResult = vmulq_lane_f32(M.r[0], VL, 0); // X + vResult = vmlaq_lane_f32(vResult, M.r[1], VL, 1); // Y + return vmlaq_lane_f32(vResult, M.r[2], vget_high_f32(V), 0); // Z +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); // Z + vResult = _mm_mul_ps(vResult, M.r[2]); + XMVECTOR vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); // Y + vResult = XM_FMADD_PS(vTemp, M.r[1], vResult); + vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); // X + vResult = XM_FMADD_PS(vTemp, M.r[0], vResult); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +#ifdef _PREFAST_ +#pragma prefast(push) +#pragma prefast(disable : 26015 26019, "PREfast noise: Esp:1307" ) +#endif + +_Use_decl_annotations_ +inline XMFLOAT3* XM_CALLCONV XMVector3TransformNormalStream +( + XMFLOAT3* pOutputStream, + size_t OutputStride, + const XMFLOAT3* pInputStream, + size_t InputStride, + size_t VectorCount, + FXMMATRIX M +) noexcept +{ + assert(pOutputStream != nullptr); + assert(pInputStream != nullptr); + + assert(InputStride >= sizeof(XMFLOAT3)); + _Analysis_assume_(InputStride >= sizeof(XMFLOAT3)); + + assert(OutputStride >= sizeof(XMFLOAT3)); + _Analysis_assume_(OutputStride >= sizeof(XMFLOAT3)); + +#if defined(_XM_NO_INTRINSICS_) + + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR 
row2 = M.r[2]; + + for (size_t i = 0; i < VectorCount; i++) + { + XMVECTOR V = XMLoadFloat3(reinterpret_cast(pInputVector)); + XMVECTOR Z = XMVectorSplatZ(V); + XMVECTOR Y = XMVectorSplatY(V); + XMVECTOR X = XMVectorSplatX(V); + + XMVECTOR Result = XMVectorMultiply(Z, row2); + Result = XMVectorMultiplyAdd(Y, row1, Result); + Result = XMVectorMultiplyAdd(X, row0, Result); + + XMStoreFloat3(reinterpret_cast(pOutputVector), Result); + + pInputVector += InputStride; + pOutputVector += OutputStride; + } + + return pOutputStream; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row2 = M.r[2]; + + size_t i = 0; + size_t four = VectorCount >> 2; + if (four > 0) + { + if ((InputStride == sizeof(XMFLOAT3)) && (OutputStride == sizeof(XMFLOAT3))) + { + for (size_t j = 0; j < four; ++j) + { + float32x4x3_t V = vld3q_f32(reinterpret_cast(pInputVector)); + pInputVector += sizeof(XMFLOAT3) * 4; + + float32x2_t r = vget_low_f32(row0); + XMVECTOR vResult0 = vmulq_lane_f32(V.val[0], r, 0); // Ax + XMVECTOR vResult1 = vmulq_lane_f32(V.val[0], r, 1); // Bx + + XM_PREFETCH(pInputVector); + + r = vget_high_f32(row0); + XMVECTOR vResult2 = vmulq_lane_f32(V.val[0], r, 0); // Cx + + XM_PREFETCH(pInputVector + XM_CACHE_LINE_SIZE); + + r = vget_low_f32(row1); + vResult0 = vmlaq_lane_f32(vResult0, V.val[1], r, 0); // Ax+Ey + vResult1 = vmlaq_lane_f32(vResult1, V.val[1], r, 1); // Bx+Fy + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 2)); + + r = vget_high_f32(row1); + vResult2 = vmlaq_lane_f32(vResult2, V.val[1], r, 0); // Cx+Gy + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 3)); + + r = vget_low_f32(row2); + vResult0 = vmlaq_lane_f32(vResult0, V.val[2], r, 0); // Ax+Ey+Iz + vResult1 = vmlaq_lane_f32(vResult1, V.val[2], r, 1); // Bx+Fy+Jz + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 4)); + + r = vget_high_f32(row2); + vResult2 = vmlaq_lane_f32(vResult2, V.val[2], r, 0); // Cx+Gy+Kz + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 5)); + + V.val[0] = vResult0; + V.val[1] = vResult1; + V.val[2] = vResult2; + + vst3q_f32(reinterpret_cast(pOutputVector), V); + pOutputVector += sizeof(XMFLOAT3) * 4; + + i += 4; + } + } + } + + for (; i < VectorCount; i++) + { + float32x2_t VL = vld1_f32(reinterpret_cast(pInputVector)); + float32x2_t zero = vdup_n_f32(0); + float32x2_t VH = vld1_lane_f32(reinterpret_cast(pInputVector) + 2, zero, 0); + pInputVector += InputStride; + + XMVECTOR vResult = vmulq_lane_f32(row0, VL, 0); // X + vResult = vmlaq_lane_f32(vResult, row1, VL, 1); // Y + vResult = vmlaq_lane_f32(vResult, row2, VH, 0); // Z + + VL = vget_low_f32(vResult); + vst1_f32(reinterpret_cast(pOutputVector), VL); + vst1q_lane_f32(reinterpret_cast(pOutputVector) + 2, vResult, 2); + pOutputVector += OutputStride; + } + + return pOutputStream; +#elif defined(_XM_SSE_INTRINSICS_) + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row2 = M.r[2]; + + size_t i = 0; + size_t four = VectorCount >> 2; + if (four > 0) + { + if (InputStride == sizeof(XMFLOAT3)) + { + if (OutputStride == sizeof(XMFLOAT3)) + { + if (!(reinterpret_cast(pOutputStream) & 0xF)) + { + // Packed input, aligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = 
_mm_loadu_ps(reinterpret_cast(pInputVector)); + __m128 L2 = _mm_loadu_ps(reinterpret_cast(pInputVector + 16)); + __m128 L3 = _mm_loadu_ps(reinterpret_cast(pInputVector + 32)); + pInputVector += sizeof(XMFLOAT3) * 4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1, L2, L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = _mm_mul_ps(Z, row2); + XMVECTOR vTemp2 = _mm_mul_ps(Y, row1); + XMVECTOR vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + V1 = _mm_add_ps(vTemp, vTemp3); + + // Result 2 + Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = _mm_mul_ps(Z, row2); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + V2 = _mm_add_ps(vTemp, vTemp3); + + // Result 3 + Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = _mm_mul_ps(Z, row2); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + V3 = _mm_add_ps(vTemp, vTemp3); + + // Result 4 + Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = _mm_mul_ps(Z, row2); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + V4 = _mm_add_ps(vTemp, vTemp3); + + // Pack and store the vectors + XM3PACK4INTO3(vTemp); + XM_STREAM_PS(reinterpret_cast(pOutputVector), V1); + XM_STREAM_PS(reinterpret_cast(pOutputVector + 16), vTemp); + XM_STREAM_PS(reinterpret_cast(pOutputVector + 32), V3); + pOutputVector += sizeof(XMFLOAT3) * 4; + i += 4; + } + } + else + { + // Packed input, unaligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps(reinterpret_cast(pInputVector)); + __m128 L2 = _mm_loadu_ps(reinterpret_cast(pInputVector + 16)); + __m128 L3 = _mm_loadu_ps(reinterpret_cast(pInputVector + 32)); + pInputVector += sizeof(XMFLOAT3) * 4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1, L2, L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = _mm_mul_ps(Z, row2); + XMVECTOR vTemp2 = _mm_mul_ps(Y, row1); + XMVECTOR vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + V1 = _mm_add_ps(vTemp, vTemp3); + + // Result 2 + Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = _mm_mul_ps(Z, row2); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + V2 = _mm_add_ps(vTemp, vTemp3); + + // Result 3 + Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = _mm_mul_ps(Z, row2); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + V3 = _mm_add_ps(vTemp, vTemp3); + + // Result 4 + Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 
1, 1)); + X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = _mm_mul_ps(Z, row2); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + V4 = _mm_add_ps(vTemp, vTemp3); + + // Pack and store the vectors + XM3PACK4INTO3(vTemp); + _mm_storeu_ps(reinterpret_cast(pOutputVector), V1); + _mm_storeu_ps(reinterpret_cast(pOutputVector + 16), vTemp); + _mm_storeu_ps(reinterpret_cast(pOutputVector + 32), V3); + pOutputVector += sizeof(XMFLOAT3) * 4; + i += 4; + } + } + } + else + { + // Packed input, unpacked output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps(reinterpret_cast(pInputVector)); + __m128 L2 = _mm_loadu_ps(reinterpret_cast(pInputVector + 16)); + __m128 L3 = _mm_loadu_ps(reinterpret_cast(pInputVector + 32)); + pInputVector += sizeof(XMFLOAT3) * 4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1, L2, L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = _mm_mul_ps(Z, row2); + XMVECTOR vTemp2 = _mm_mul_ps(Y, row1); + XMVECTOR vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 2 + Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = _mm_mul_ps(Z, row2); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 3 + Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = _mm_mul_ps(Z, row2); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 4 + Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = _mm_mul_ps(Z, row2); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + i += 4; + } + } + } + } + + for (; i < VectorCount; i++) + { + XMVECTOR V = XMLoadFloat3(reinterpret_cast(pInputVector)); + pInputVector += InputStride; + + XMVECTOR Z = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = _mm_mul_ps(Z, row2); + XMVECTOR vTemp2 = _mm_mul_ps(Y, row1); + XMVECTOR vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + } + + XM_SFENCE(); + + return pOutputStream; +#endif +} + +#ifdef _PREFAST_ +#pragma prefast(pop) +#endif + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3Project 
+( + FXMVECTOR V, + float ViewportX, + float ViewportY, + float ViewportWidth, + float ViewportHeight, + float ViewportMinZ, + float ViewportMaxZ, + FXMMATRIX Projection, + CXMMATRIX View, + CXMMATRIX World +) noexcept +{ + const float HalfViewportWidth = ViewportWidth * 0.5f; + const float HalfViewportHeight = ViewportHeight * 0.5f; + + XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight, ViewportMaxZ - ViewportMinZ, 0.0f); + XMVECTOR Offset = XMVectorSet(ViewportX + HalfViewportWidth, ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f); + + XMMATRIX Transform = XMMatrixMultiply(World, View); + Transform = XMMatrixMultiply(Transform, Projection); + + XMVECTOR Result = XMVector3TransformCoord(V, Transform); + + Result = XMVectorMultiplyAdd(Result, Scale, Offset); + + return Result; +} + +//------------------------------------------------------------------------------ + +#ifdef _PREFAST_ +#pragma prefast(push) +#pragma prefast(disable : 26015 26019, "PREfast noise: Esp:1307" ) +#endif + +_Use_decl_annotations_ +inline XMFLOAT3* XM_CALLCONV XMVector3ProjectStream +( + XMFLOAT3* pOutputStream, + size_t OutputStride, + const XMFLOAT3* pInputStream, + size_t InputStride, + size_t VectorCount, + float ViewportX, + float ViewportY, + float ViewportWidth, + float ViewportHeight, + float ViewportMinZ, + float ViewportMaxZ, + FXMMATRIX Projection, + CXMMATRIX View, + CXMMATRIX World +) noexcept +{ + assert(pOutputStream != nullptr); + assert(pInputStream != nullptr); + + assert(InputStride >= sizeof(XMFLOAT3)); + _Analysis_assume_(InputStride >= sizeof(XMFLOAT3)); + + assert(OutputStride >= sizeof(XMFLOAT3)); + _Analysis_assume_(OutputStride >= sizeof(XMFLOAT3)); + +#if defined(_XM_NO_INTRINSICS_) + + const float HalfViewportWidth = ViewportWidth * 0.5f; + const float HalfViewportHeight = ViewportHeight * 0.5f; + + XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight, ViewportMaxZ - ViewportMinZ, 1.0f); + XMVECTOR Offset = XMVectorSet(ViewportX + HalfViewportWidth, ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f); + + XMMATRIX Transform = XMMatrixMultiply(World, View); + Transform = XMMatrixMultiply(Transform, Projection); + + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + for (size_t i = 0; i < VectorCount; i++) + { + XMVECTOR V = XMLoadFloat3(reinterpret_cast(pInputVector)); + + XMVECTOR Result = XMVector3TransformCoord(V, Transform); + Result = XMVectorMultiplyAdd(Result, Scale, Offset); + + XMStoreFloat3(reinterpret_cast(pOutputVector), Result); + + pInputVector += InputStride; + pOutputVector += OutputStride; + } + + return pOutputStream; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + const float HalfViewportWidth = ViewportWidth * 0.5f; + const float HalfViewportHeight = ViewportHeight * 0.5f; + + XMMATRIX Transform = XMMatrixMultiply(World, View); + Transform = XMMatrixMultiply(Transform, Projection); + + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + size_t i = 0; + size_t four = VectorCount >> 2; + if (four > 0) + { + if ((InputStride == sizeof(XMFLOAT3)) && (OutputStride == sizeof(XMFLOAT3))) + { + XMVECTOR ScaleX = vdupq_n_f32(HalfViewportWidth); + XMVECTOR ScaleY = vdupq_n_f32(-HalfViewportHeight); + XMVECTOR ScaleZ = vdupq_n_f32(ViewportMaxZ - ViewportMinZ); + + XMVECTOR OffsetX = vdupq_n_f32(ViewportX + HalfViewportWidth); + XMVECTOR OffsetY = vdupq_n_f32(ViewportY + HalfViewportHeight); + XMVECTOR OffsetZ = 
vdupq_n_f32(ViewportMinZ); + + for (size_t j = 0; j < four; ++j) + { + float32x4x3_t V = vld3q_f32(reinterpret_cast(pInputVector)); + pInputVector += sizeof(XMFLOAT3) * 4; + + float32x2_t r3 = vget_low_f32(Transform.r[3]); + float32x2_t r = vget_low_f32(Transform.r[0]); + XMVECTOR vResult0 = vmlaq_lane_f32(vdupq_lane_f32(r3, 0), V.val[0], r, 0); // Ax+M + XMVECTOR vResult1 = vmlaq_lane_f32(vdupq_lane_f32(r3, 1), V.val[0], r, 1); // Bx+N + + XM_PREFETCH(pInputVector); + + r3 = vget_high_f32(Transform.r[3]); + r = vget_high_f32(Transform.r[0]); + XMVECTOR vResult2 = vmlaq_lane_f32(vdupq_lane_f32(r3, 0), V.val[0], r, 0); // Cx+O + XMVECTOR W = vmlaq_lane_f32(vdupq_lane_f32(r3, 1), V.val[0], r, 1); // Dx+P + + XM_PREFETCH(pInputVector + XM_CACHE_LINE_SIZE); + + r = vget_low_f32(Transform.r[1]); + vResult0 = vmlaq_lane_f32(vResult0, V.val[1], r, 0); // Ax+Ey+M + vResult1 = vmlaq_lane_f32(vResult1, V.val[1], r, 1); // Bx+Fy+N + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 2)); + + r = vget_high_f32(Transform.r[1]); + vResult2 = vmlaq_lane_f32(vResult2, V.val[1], r, 0); // Cx+Gy+O + W = vmlaq_lane_f32(W, V.val[1], r, 1); // Dx+Hy+P + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 3)); + + r = vget_low_f32(Transform.r[2]); + vResult0 = vmlaq_lane_f32(vResult0, V.val[2], r, 0); // Ax+Ey+Iz+M + vResult1 = vmlaq_lane_f32(vResult1, V.val[2], r, 1); // Bx+Fy+Jz+N + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 4)); + + r = vget_high_f32(Transform.r[2]); + vResult2 = vmlaq_lane_f32(vResult2, V.val[2], r, 0); // Cx+Gy+Kz+O + W = vmlaq_lane_f32(W, V.val[2], r, 1); // Dx+Hy+Lz+P + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 5)); + +#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__ + vResult0 = vdivq_f32(vResult0, W); + vResult1 = vdivq_f32(vResult1, W); + vResult2 = vdivq_f32(vResult2, W); +#else + // 2 iterations of Newton-Raphson refinement of reciprocal + float32x4_t Reciprocal = vrecpeq_f32(W); + float32x4_t S = vrecpsq_f32(Reciprocal, W); + Reciprocal = vmulq_f32(S, Reciprocal); + S = vrecpsq_f32(Reciprocal, W); + Reciprocal = vmulq_f32(S, Reciprocal); + + vResult0 = vmulq_f32(vResult0, Reciprocal); + vResult1 = vmulq_f32(vResult1, Reciprocal); + vResult2 = vmulq_f32(vResult2, Reciprocal); +#endif + + V.val[0] = vmlaq_f32(OffsetX, vResult0, ScaleX); + V.val[1] = vmlaq_f32(OffsetY, vResult1, ScaleY); + V.val[2] = vmlaq_f32(OffsetZ, vResult2, ScaleZ); + + vst3q_f32(reinterpret_cast(pOutputVector), V); + pOutputVector += sizeof(XMFLOAT3) * 4; + + i += 4; + } + } + } + + if (i < VectorCount) + { + XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight, ViewportMaxZ - ViewportMinZ, 1.0f); + XMVECTOR Offset = XMVectorSet(ViewportX + HalfViewportWidth, ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f); + + for (; i < VectorCount; i++) + { + float32x2_t VL = vld1_f32(reinterpret_cast(pInputVector)); + float32x2_t zero = vdup_n_f32(0); + float32x2_t VH = vld1_lane_f32(reinterpret_cast(pInputVector) + 2, zero, 0); + pInputVector += InputStride; + + XMVECTOR vResult = vmlaq_lane_f32(Transform.r[3], Transform.r[0], VL, 0); // X + vResult = vmlaq_lane_f32(vResult, Transform.r[1], VL, 1); // Y + vResult = vmlaq_lane_f32(vResult, Transform.r[2], VH, 0); // Z + + VH = vget_high_f32(vResult); + XMVECTOR W = vdupq_lane_f32(VH, 1); + +#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__ + vResult = vdivq_f32(vResult, W); +#else + // 2 iterations of Newton-Raphson refinement of reciprocal for W + 
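// vrecpeq_f32 only yields a rough estimate of 1/w; each vrecpsq_f32/vmulq_f32
// pair below applies one Newton-Raphson step, x' = x * (2 - w * x), so two
// steps recover close to full single-precision accuracy before the multiply
// that replaces the divide.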
float32x4_t Reciprocal = vrecpeq_f32(W); + float32x4_t S = vrecpsq_f32(Reciprocal, W); + Reciprocal = vmulq_f32(S, Reciprocal); + S = vrecpsq_f32(Reciprocal, W); + Reciprocal = vmulq_f32(S, Reciprocal); + + vResult = vmulq_f32(vResult, Reciprocal); +#endif + + vResult = vmlaq_f32(Offset, vResult, Scale); + + VL = vget_low_f32(vResult); + vst1_f32(reinterpret_cast(pOutputVector), VL); + vst1q_lane_f32(reinterpret_cast(pOutputVector) + 2, vResult, 2); + pOutputVector += OutputStride; + } + } + + return pOutputStream; +#elif defined(_XM_SSE_INTRINSICS_) + const float HalfViewportWidth = ViewportWidth * 0.5f; + const float HalfViewportHeight = ViewportHeight * 0.5f; + + XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight, ViewportMaxZ - ViewportMinZ, 1.0f); + XMVECTOR Offset = XMVectorSet(ViewportX + HalfViewportWidth, ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f); + + XMMATRIX Transform = XMMatrixMultiply(World, View); + Transform = XMMatrixMultiply(Transform, Projection); + + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + size_t i = 0; + size_t four = VectorCount >> 2; + if (four > 0) + { + if (InputStride == sizeof(XMFLOAT3)) + { + if (OutputStride == sizeof(XMFLOAT3)) + { + if (!(reinterpret_cast(pOutputStream) & 0xF)) + { + // Packed input, aligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps(reinterpret_cast(pInputVector)); + __m128 L2 = _mm_loadu_ps(reinterpret_cast(pInputVector + 16)); + __m128 L3 = _mm_loadu_ps(reinterpret_cast(pInputVector + 32)); + pInputVector += sizeof(XMFLOAT3) * 4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1, L2, L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + XMVECTOR vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + XMVECTOR vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + vTemp = _mm_div_ps(vTemp, W); + V1 = XM_FMADD_PS(vTemp, Scale, Offset); + + // Result 2 + Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + vTemp = _mm_div_ps(vTemp, W); + V2 = XM_FMADD_PS(vTemp, Scale, Offset); + + // Result 3 + Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + vTemp = _mm_div_ps(vTemp, W); + V3 = XM_FMADD_PS(vTemp, Scale, Offset); + + // Result 4 + Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); 
+ vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + vTemp = _mm_div_ps(vTemp, W); + V4 = XM_FMADD_PS(vTemp, Scale, Offset); + + // Pack and store the vectors + XM3PACK4INTO3(vTemp); + XM_STREAM_PS(reinterpret_cast(pOutputVector), V1); + XM_STREAM_PS(reinterpret_cast(pOutputVector + 16), vTemp); + XM_STREAM_PS(reinterpret_cast(pOutputVector + 32), V3); + pOutputVector += sizeof(XMFLOAT3) * 4; + i += 4; + } + } + else + { + // Packed input, unaligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps(reinterpret_cast(pInputVector)); + __m128 L2 = _mm_loadu_ps(reinterpret_cast(pInputVector + 16)); + __m128 L3 = _mm_loadu_ps(reinterpret_cast(pInputVector + 32)); + pInputVector += sizeof(XMFLOAT3) * 4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1, L2, L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + XMVECTOR vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + XMVECTOR vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + vTemp = _mm_div_ps(vTemp, W); + V1 = XM_FMADD_PS(vTemp, Scale, Offset); + + // Result 2 + Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + vTemp = _mm_div_ps(vTemp, W); + V2 = XM_FMADD_PS(vTemp, Scale, Offset); + + // Result 3 + Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + vTemp = _mm_div_ps(vTemp, W); + V3 = XM_FMADD_PS(vTemp, Scale, Offset); + + // Result 4 + Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + vTemp = _mm_div_ps(vTemp, W); + V4 = XM_FMADD_PS(vTemp, Scale, Offset); + + // Pack and store the vectors + XM3PACK4INTO3(vTemp); + _mm_storeu_ps(reinterpret_cast(pOutputVector), V1); + _mm_storeu_ps(reinterpret_cast(pOutputVector + 16), vTemp); + _mm_storeu_ps(reinterpret_cast(pOutputVector + 32), V3); + pOutputVector += sizeof(XMFLOAT3) * 4; + i += 4; + } + } + } + else + { + // Packed input, unpacked output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps(reinterpret_cast(pInputVector)); + __m128 L2 
= _mm_loadu_ps(reinterpret_cast(pInputVector + 16)); + __m128 L3 = _mm_loadu_ps(reinterpret_cast(pInputVector + 32)); + pInputVector += sizeof(XMFLOAT3) * 4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1, L2, L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + XMVECTOR vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + XMVECTOR vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + vTemp = _mm_div_ps(vTemp, W); + vTemp = XM_FMADD_PS(vTemp, Scale, Offset); + + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 2 + Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + vTemp = _mm_div_ps(vTemp, W); + vTemp = XM_FMADD_PS(vTemp, Scale, Offset); + + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 3 + Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + vTemp = _mm_div_ps(vTemp, W); + vTemp = XM_FMADD_PS(vTemp, Scale, Offset); + + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 4 + Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + vTemp = _mm_div_ps(vTemp, W); + vTemp = XM_FMADD_PS(vTemp, Scale, Offset); + + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + i += 4; + } + } + } + } + + for (; i < VectorCount; i++) + { + XMVECTOR V = XMLoadFloat3(reinterpret_cast(pInputVector)); + pInputVector += InputStride; + + XMVECTOR Z = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + XMVECTOR vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + XMVECTOR vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + vTemp = _mm_div_ps(vTemp, W); + vTemp = XM_FMADD_PS(vTemp, Scale, Offset); + + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + } + 
+    XM_SFENCE();
+
+    return pOutputStream;
+#endif
+}
+
+#ifdef _PREFAST_
+#pragma prefast(pop)
+#endif
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVector3Unproject
+(
+    FXMVECTOR V,
+    float ViewportX,
+    float ViewportY,
+    float ViewportWidth,
+    float ViewportHeight,
+    float ViewportMinZ,
+    float ViewportMaxZ,
+    FXMMATRIX Projection,
+    CXMMATRIX View,
+    CXMMATRIX World
+) noexcept
+{
+    static const XMVECTORF32 D = { { { -1.0f, 1.0f, 0.0f, 0.0f } } };
+
+    XMVECTOR Scale = XMVectorSet(ViewportWidth * 0.5f, -ViewportHeight * 0.5f, ViewportMaxZ - ViewportMinZ, 1.0f);
+    Scale = XMVectorReciprocal(Scale);
+
+    XMVECTOR Offset = XMVectorSet(-ViewportX, -ViewportY, -ViewportMinZ, 0.0f);
+    Offset = XMVectorMultiplyAdd(Scale, Offset, D.v);
+
+    XMMATRIX Transform = XMMatrixMultiply(World, View);
+    Transform = XMMatrixMultiply(Transform, Projection);
+    Transform = XMMatrixInverse(nullptr, Transform);
+
+    XMVECTOR Result = XMVectorMultiplyAdd(V, Scale, Offset);
+
+    return XMVector3TransformCoord(Result, Transform);
+}
+
+//------------------------------------------------------------------------------
+
+#ifdef _PREFAST_
+#pragma prefast(push)
+#pragma prefast(disable : 26015 26019, "PREfast noise: Esp:1307" )
+#endif
+
+_Use_decl_annotations_
+inline XMFLOAT3* XM_CALLCONV XMVector3UnprojectStream
+(
+    XMFLOAT3* pOutputStream,
+    size_t OutputStride,
+    const XMFLOAT3* pInputStream,
+    size_t InputStride,
+    size_t VectorCount,
+    float ViewportX,
+    float ViewportY,
+    float ViewportWidth,
+    float ViewportHeight,
+    float ViewportMinZ,
+    float ViewportMaxZ,
+    FXMMATRIX Projection,
+    CXMMATRIX View,
+    CXMMATRIX World
+) noexcept
+{
+    assert(pOutputStream != nullptr);
+    assert(pInputStream != nullptr);
+
+    assert(InputStride >= sizeof(XMFLOAT3));
+    _Analysis_assume_(InputStride >= sizeof(XMFLOAT3));
+
+    assert(OutputStride >= sizeof(XMFLOAT3));
+    _Analysis_assume_(OutputStride >= sizeof(XMFLOAT3));
+
+#if defined(_XM_NO_INTRINSICS_)
+
+    static const XMVECTORF32 D = { { { -1.0f, 1.0f, 0.0f, 0.0f } } };
+
+    XMVECTOR Scale = XMVectorSet(ViewportWidth * 0.5f, -ViewportHeight * 0.5f, ViewportMaxZ - ViewportMinZ, 1.0f);
+    Scale = XMVectorReciprocal(Scale);
+
+    XMVECTOR Offset = XMVectorSet(-ViewportX, -ViewportY, -ViewportMinZ, 0.0f);
+    Offset = XMVectorMultiplyAdd(Scale, Offset, D.v);
+
+    XMMATRIX Transform = XMMatrixMultiply(World, View);
+    Transform = XMMatrixMultiply(Transform, Projection);
+    Transform = XMMatrixInverse(nullptr, Transform);
+
+    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
+    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
+
+    for (size_t i = 0; i < VectorCount; i++)
+    {
+        XMVECTOR V = XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(pInputVector));
+
+        XMVECTOR Result = XMVectorMultiplyAdd(V, Scale, Offset);
+
+        Result = XMVector3TransformCoord(Result, Transform);
+
+        XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), Result);
+
+        pInputVector += InputStride;
+        pOutputVector += OutputStride;
+    }
+
+    return pOutputStream;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    XMMATRIX Transform = XMMatrixMultiply(World, View);
+    Transform = XMMatrixMultiply(Transform, Projection);
+    Transform = XMMatrixInverse(nullptr, Transform);
+
+    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
+    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
+
+    float sx = 1.f / (ViewportWidth * 0.5f);
+    float sy = 1.f / (-ViewportHeight * 0.5f);
+    float sz = 1.f / (ViewportMaxZ - ViewportMinZ);
+
+    float ox = (-ViewportX * sx) - 1.f;
+    float oy = (-ViewportY * sy) +
1.f; + float oz = (-ViewportMinZ * sz); + + size_t i = 0; + size_t four = VectorCount >> 2; + if (four > 0) + { + if ((InputStride == sizeof(XMFLOAT3)) && (OutputStride == sizeof(XMFLOAT3))) + { + for (size_t j = 0; j < four; ++j) + { + float32x4x3_t V = vld3q_f32(reinterpret_cast(pInputVector)); + pInputVector += sizeof(XMFLOAT3) * 4; + + XMVECTOR ScaleX = vdupq_n_f32(sx); + XMVECTOR OffsetX = vdupq_n_f32(ox); + XMVECTOR VX = vmlaq_f32(OffsetX, ScaleX, V.val[0]); + + float32x2_t r3 = vget_low_f32(Transform.r[3]); + float32x2_t r = vget_low_f32(Transform.r[0]); + XMVECTOR vResult0 = vmlaq_lane_f32(vdupq_lane_f32(r3, 0), VX, r, 0); // Ax+M + XMVECTOR vResult1 = vmlaq_lane_f32(vdupq_lane_f32(r3, 1), VX, r, 1); // Bx+N + + XM_PREFETCH(pInputVector); + + r3 = vget_high_f32(Transform.r[3]); + r = vget_high_f32(Transform.r[0]); + XMVECTOR vResult2 = vmlaq_lane_f32(vdupq_lane_f32(r3, 0), VX, r, 0); // Cx+O + XMVECTOR W = vmlaq_lane_f32(vdupq_lane_f32(r3, 1), VX, r, 1); // Dx+P + + XM_PREFETCH(pInputVector + XM_CACHE_LINE_SIZE); + + XMVECTOR ScaleY = vdupq_n_f32(sy); + XMVECTOR OffsetY = vdupq_n_f32(oy); + XMVECTOR VY = vmlaq_f32(OffsetY, ScaleY, V.val[1]); + + r = vget_low_f32(Transform.r[1]); + vResult0 = vmlaq_lane_f32(vResult0, VY, r, 0); // Ax+Ey+M + vResult1 = vmlaq_lane_f32(vResult1, VY, r, 1); // Bx+Fy+N + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 2)); + + r = vget_high_f32(Transform.r[1]); + vResult2 = vmlaq_lane_f32(vResult2, VY, r, 0); // Cx+Gy+O + W = vmlaq_lane_f32(W, VY, r, 1); // Dx+Hy+P + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 3)); + + XMVECTOR ScaleZ = vdupq_n_f32(sz); + XMVECTOR OffsetZ = vdupq_n_f32(oz); + XMVECTOR VZ = vmlaq_f32(OffsetZ, ScaleZ, V.val[2]); + + r = vget_low_f32(Transform.r[2]); + vResult0 = vmlaq_lane_f32(vResult0, VZ, r, 0); // Ax+Ey+Iz+M + vResult1 = vmlaq_lane_f32(vResult1, VZ, r, 1); // Bx+Fy+Jz+N + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 4)); + + r = vget_high_f32(Transform.r[2]); + vResult2 = vmlaq_lane_f32(vResult2, VZ, r, 0); // Cx+Gy+Kz+O + W = vmlaq_lane_f32(W, VZ, r, 1); // Dx+Hy+Lz+P + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 5)); + +#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__ + V.val[0] = vdivq_f32(vResult0, W); + V.val[1] = vdivq_f32(vResult1, W); + V.val[2] = vdivq_f32(vResult2, W); +#else + // 2 iterations of Newton-Raphson refinement of reciprocal + float32x4_t Reciprocal = vrecpeq_f32(W); + float32x4_t S = vrecpsq_f32(Reciprocal, W); + Reciprocal = vmulq_f32(S, Reciprocal); + S = vrecpsq_f32(Reciprocal, W); + Reciprocal = vmulq_f32(S, Reciprocal); + + V.val[0] = vmulq_f32(vResult0, Reciprocal); + V.val[1] = vmulq_f32(vResult1, Reciprocal); + V.val[2] = vmulq_f32(vResult2, Reciprocal); +#endif + + vst3q_f32(reinterpret_cast(pOutputVector), V); + pOutputVector += sizeof(XMFLOAT3) * 4; + + i += 4; + } + } + } + + if (i < VectorCount) + { + float32x2_t ScaleL = vcreate_f32( + static_cast(*reinterpret_cast(&sx)) + | (static_cast(*reinterpret_cast(&sy)) << 32)); + float32x2_t ScaleH = vcreate_f32(static_cast(*reinterpret_cast(&sz))); + + float32x2_t OffsetL = vcreate_f32( + static_cast(*reinterpret_cast(&ox)) + | (static_cast(*reinterpret_cast(&oy)) << 32)); + float32x2_t OffsetH = vcreate_f32(static_cast(*reinterpret_cast(&oz))); + + for (; i < VectorCount; i++) + { + float32x2_t VL = vld1_f32(reinterpret_cast(pInputVector)); + float32x2_t zero = vdup_n_f32(0); + float32x2_t VH = vld1_lane_f32(reinterpret_cast(pInputVector) + 2, zero, 0); + pInputVector += 
InputStride; + + VL = vmla_f32(OffsetL, VL, ScaleL); + VH = vmla_f32(OffsetH, VH, ScaleH); + + XMVECTOR vResult = vmlaq_lane_f32(Transform.r[3], Transform.r[0], VL, 0); // X + vResult = vmlaq_lane_f32(vResult, Transform.r[1], VL, 1); // Y + vResult = vmlaq_lane_f32(vResult, Transform.r[2], VH, 0); // Z + + VH = vget_high_f32(vResult); + XMVECTOR W = vdupq_lane_f32(VH, 1); + +#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__ + vResult = vdivq_f32(vResult, W); +#else + // 2 iterations of Newton-Raphson refinement of reciprocal for W + float32x4_t Reciprocal = vrecpeq_f32(W); + float32x4_t S = vrecpsq_f32(Reciprocal, W); + Reciprocal = vmulq_f32(S, Reciprocal); + S = vrecpsq_f32(Reciprocal, W); + Reciprocal = vmulq_f32(S, Reciprocal); + + vResult = vmulq_f32(vResult, Reciprocal); +#endif + + VL = vget_low_f32(vResult); + vst1_f32(reinterpret_cast(pOutputVector), VL); + vst1q_lane_f32(reinterpret_cast(pOutputVector) + 2, vResult, 2); + pOutputVector += OutputStride; + } + } + + return pOutputStream; +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 D = { { { -1.0f, 1.0f, 0.0f, 0.0f } } }; + + XMVECTOR Scale = XMVectorSet(ViewportWidth * 0.5f, -ViewportHeight * 0.5f, ViewportMaxZ - ViewportMinZ, 1.0f); + Scale = XMVectorReciprocal(Scale); + + XMVECTOR Offset = XMVectorSet(-ViewportX, -ViewportY, -ViewportMinZ, 0.0f); + Offset = _mm_mul_ps(Scale, Offset); + Offset = _mm_add_ps(Offset, D); + + XMMATRIX Transform = XMMatrixMultiply(World, View); + Transform = XMMatrixMultiply(Transform, Projection); + Transform = XMMatrixInverse(nullptr, Transform); + + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + size_t i = 0; + size_t four = VectorCount >> 2; + if (four > 0) + { + if (InputStride == sizeof(XMFLOAT3)) + { + if (OutputStride == sizeof(XMFLOAT3)) + { + if (!(reinterpret_cast(pOutputStream) & 0xF)) + { + // Packed input, aligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps(reinterpret_cast(pInputVector)); + __m128 L2 = _mm_loadu_ps(reinterpret_cast(pInputVector + 16)); + __m128 L3 = _mm_loadu_ps(reinterpret_cast(pInputVector + 32)); + pInputVector += sizeof(XMFLOAT3) * 4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1, L2, L3); + + // Result 1 + V1 = XM_FMADD_PS(V1, Scale, Offset); + + XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + XMVECTOR vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + XMVECTOR vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + V1 = _mm_div_ps(vTemp, W); + + // Result 2 + V2 = XM_FMADD_PS(V2, Scale, Offset); + + Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + V2 = _mm_div_ps(vTemp, W); + + // Result 3 + V3 = XM_FMADD_PS(V3, Scale, Offset); + + Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V3, 
_MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + V3 = _mm_div_ps(vTemp, W); + + // Result 4 + V4 = XM_FMADD_PS(V4, Scale, Offset); + + Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + V4 = _mm_div_ps(vTemp, W); + + // Pack and store the vectors + XM3PACK4INTO3(vTemp); + XM_STREAM_PS(reinterpret_cast(pOutputVector), V1); + XM_STREAM_PS(reinterpret_cast(pOutputVector + 16), vTemp); + XM_STREAM_PS(reinterpret_cast(pOutputVector + 32), V3); + pOutputVector += sizeof(XMFLOAT3) * 4; + i += 4; + } + } + else + { + // Packed input, unaligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps(reinterpret_cast(pInputVector)); + __m128 L2 = _mm_loadu_ps(reinterpret_cast(pInputVector + 16)); + __m128 L3 = _mm_loadu_ps(reinterpret_cast(pInputVector + 32)); + pInputVector += sizeof(XMFLOAT3) * 4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1, L2, L3); + + // Result 1 + V1 = XM_FMADD_PS(V1, Scale, Offset); + + XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + XMVECTOR vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + XMVECTOR vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + V1 = _mm_div_ps(vTemp, W); + + // Result 2 + V2 = XM_FMADD_PS(V2, Scale, Offset); + + Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + V2 = _mm_div_ps(vTemp, W); + + // Result 3 + V3 = XM_FMADD_PS(V3, Scale, Offset); + + Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + V3 = _mm_div_ps(vTemp, W); + + // Result 4 + V4 = XM_FMADD_PS(V4, Scale, Offset); + + Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, 
vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + V4 = _mm_div_ps(vTemp, W); + + // Pack and store the vectors + XM3PACK4INTO3(vTemp); + _mm_storeu_ps(reinterpret_cast(pOutputVector), V1); + _mm_storeu_ps(reinterpret_cast(pOutputVector + 16), vTemp); + _mm_storeu_ps(reinterpret_cast(pOutputVector + 32), V3); + pOutputVector += sizeof(XMFLOAT3) * 4; + i += 4; + } + } + } + else + { + // Packed input, unpacked output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps(reinterpret_cast(pInputVector)); + __m128 L2 = _mm_loadu_ps(reinterpret_cast(pInputVector + 16)); + __m128 L3 = _mm_loadu_ps(reinterpret_cast(pInputVector + 32)); + pInputVector += sizeof(XMFLOAT3) * 4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1, L2, L3); + + // Result 1 + V1 = XM_FMADD_PS(V1, Scale, Offset); + + XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + XMVECTOR vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + XMVECTOR vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + vTemp = _mm_div_ps(vTemp, W); + + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 2 + V2 = XM_FMADD_PS(V2, Scale, Offset); + + Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + vTemp = _mm_div_ps(vTemp, W); + + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 3 + V3 = XM_FMADD_PS(V3, Scale, Offset); + + Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + vTemp = _mm_div_ps(vTemp, W); + + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 4 + V4 = XM_FMADD_PS(V4, Scale, Offset); + + Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + vTemp = _mm_div_ps(vTemp, W); + + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + i += 4; + } + } + } + } + + for (; i < VectorCount; i++) + { + XMVECTOR V = XMLoadFloat3(reinterpret_cast(pInputVector)); + pInputVector += InputStride; + + V = _mm_mul_ps(V, Scale); + V = _mm_add_ps(V, Offset); + + XMVECTOR Z = XM_PERMUTE_PS(V, 
_MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + XMVECTOR vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + XMVECTOR vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + vTemp = _mm_div_ps(vTemp, W); + + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + } + + XM_SFENCE(); + + return pOutputStream; +#endif +} + +#ifdef _PREFAST_ +#pragma prefast(pop) +#endif + +/**************************************************************************** + * + * 4D Vector + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ + // Comparison operations + //------------------------------------------------------------------------------ + + //------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector4Equal +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] == V2.vector4_f32[0]) && (V1.vector4_f32[1] == V2.vector4_f32[1]) && (V1.vector4_f32[2] == V2.vector4_f32[2]) && (V1.vector4_f32[3] == V2.vector4_f32[3])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vceqq_f32(V1, V2); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return (vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) == 0xFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpeq_ps(V1, V2); + return ((_mm_movemask_ps(vTemp) == 0x0f) != 0); +#else + return XMComparisonAllTrue(XMVector4EqualR(V1, V2)); +#endif +} + +//------------------------------------------------------------------------------ + +inline uint32_t XM_CALLCONV XMVector4EqualR +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + uint32_t CR = 0; + + if ((V1.vector4_f32[0] == V2.vector4_f32[0]) && + (V1.vector4_f32[1] == V2.vector4_f32[1]) && + (V1.vector4_f32[2] == V2.vector4_f32[2]) && + (V1.vector4_f32[3] == V2.vector4_f32[3])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_f32[0] != V2.vector4_f32[0]) && + (V1.vector4_f32[1] != V2.vector4_f32[1]) && + (V1.vector4_f32[2] != V2.vector4_f32[2]) && + (V1.vector4_f32[3] != V2.vector4_f32[3])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vceqq_f32(V1, V2); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1); + + uint32_t CR = 0; + if (r == 0xFFFFFFFFU) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!r) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpeq_ps(V1, V2); + int iTest = _mm_movemask_ps(vTemp); + uint32_t CR = 0; + if (iTest == 0xf) // All equal? + { + CR = XM_CRMASK_CR6TRUE; + } + else if (iTest == 0) // All not equal? 
+ { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector4EqualInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_u32[0] == V2.vector4_u32[0]) && (V1.vector4_u32[1] == V2.vector4_u32[1]) && (V1.vector4_u32[2] == V2.vector4_u32[2]) && (V1.vector4_u32[3] == V2.vector4_u32[3])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vceqq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2)); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return (vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) == 0xFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2)); + return ((_mm_movemask_ps(_mm_castsi128_ps(vTemp)) == 0xf) != 0); +#else + return XMComparisonAllTrue(XMVector4EqualIntR(V1, V2)); +#endif +} + +//------------------------------------------------------------------------------ + +inline uint32_t XM_CALLCONV XMVector4EqualIntR +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + uint32_t CR = 0; + if (V1.vector4_u32[0] == V2.vector4_u32[0] && + V1.vector4_u32[1] == V2.vector4_u32[1] && + V1.vector4_u32[2] == V2.vector4_u32[2] && + V1.vector4_u32[3] == V2.vector4_u32[3]) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (V1.vector4_u32[0] != V2.vector4_u32[0] && + V1.vector4_u32[1] != V2.vector4_u32[1] && + V1.vector4_u32[2] != V2.vector4_u32[2] && + V1.vector4_u32[3] != V2.vector4_u32[3]) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vceqq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2)); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1); + + uint32_t CR = 0; + if (r == 0xFFFFFFFFU) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!r) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2)); + int iTest = _mm_movemask_ps(_mm_castsi128_ps(vTemp)); + uint32_t CR = 0; + if (iTest == 0xf) // All equal? + { + CR = XM_CRMASK_CR6TRUE; + } + else if (iTest == 0) // All not equal? 
+ { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +inline bool XM_CALLCONV XMVector4NearEqual +( + FXMVECTOR V1, + FXMVECTOR V2, + FXMVECTOR Epsilon +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + float dx, dy, dz, dw; + + dx = fabsf(V1.vector4_f32[0] - V2.vector4_f32[0]); + dy = fabsf(V1.vector4_f32[1] - V2.vector4_f32[1]); + dz = fabsf(V1.vector4_f32[2] - V2.vector4_f32[2]); + dw = fabsf(V1.vector4_f32[3] - V2.vector4_f32[3]); + return (((dx <= Epsilon.vector4_f32[0]) && + (dy <= Epsilon.vector4_f32[1]) && + (dz <= Epsilon.vector4_f32[2]) && + (dw <= Epsilon.vector4_f32[3])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t vDelta = vsubq_f32(V1, V2); +#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES) + uint32x4_t vResult = vacleq_f32(vDelta, Epsilon); +#else + uint32x4_t vResult = vcleq_f32(vabsq_f32(vDelta), Epsilon); +#endif + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return (vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) == 0xFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + // Get the difference + XMVECTOR vDelta = _mm_sub_ps(V1, V2); + // Get the absolute value of the difference + XMVECTOR vTemp = _mm_setzero_ps(); + vTemp = _mm_sub_ps(vTemp, vDelta); + vTemp = _mm_max_ps(vTemp, vDelta); + vTemp = _mm_cmple_ps(vTemp, Epsilon); + return ((_mm_movemask_ps(vTemp) == 0xf) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector4NotEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] != V2.vector4_f32[0]) || (V1.vector4_f32[1] != V2.vector4_f32[1]) || (V1.vector4_f32[2] != V2.vector4_f32[2]) || (V1.vector4_f32[3] != V2.vector4_f32[3])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vceqq_f32(V1, V2); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return (vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) != 0xFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpneq_ps(V1, V2); + return ((_mm_movemask_ps(vTemp)) != 0); +#else + return XMComparisonAnyFalse(XMVector4EqualR(V1, V2)); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector4NotEqualInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_u32[0] != V2.vector4_u32[0]) || (V1.vector4_u32[1] != V2.vector4_u32[1]) || (V1.vector4_u32[2] != V2.vector4_u32[2]) || (V1.vector4_u32[3] != V2.vector4_u32[3])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vceqq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2)); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return (vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) != 0xFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2)); + return 
((_mm_movemask_ps(_mm_castsi128_ps(vTemp)) != 0xF) != 0); +#else + return XMComparisonAnyFalse(XMVector4EqualIntR(V1, V2)); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector4Greater +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] > V2.vector4_f32[0]) && (V1.vector4_f32[1] > V2.vector4_f32[1]) && (V1.vector4_f32[2] > V2.vector4_f32[2]) && (V1.vector4_f32[3] > V2.vector4_f32[3])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcgtq_f32(V1, V2); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return (vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) == 0xFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpgt_ps(V1, V2); + return ((_mm_movemask_ps(vTemp) == 0x0f) != 0); +#else + return XMComparisonAllTrue(XMVector4GreaterR(V1, V2)); +#endif +} + +//------------------------------------------------------------------------------ + +inline uint32_t XM_CALLCONV XMVector4GreaterR +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + uint32_t CR = 0; + if (V1.vector4_f32[0] > V2.vector4_f32[0] && + V1.vector4_f32[1] > V2.vector4_f32[1] && + V1.vector4_f32[2] > V2.vector4_f32[2] && + V1.vector4_f32[3] > V2.vector4_f32[3]) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (V1.vector4_f32[0] <= V2.vector4_f32[0] && + V1.vector4_f32[1] <= V2.vector4_f32[1] && + V1.vector4_f32[2] <= V2.vector4_f32[2] && + V1.vector4_f32[3] <= V2.vector4_f32[3]) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcgtq_f32(V1, V2); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1); + + uint32_t CR = 0; + if (r == 0xFFFFFFFFU) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!r) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + uint32_t CR = 0; + XMVECTOR vTemp = _mm_cmpgt_ps(V1, V2); + int iTest = _mm_movemask_ps(vTemp); + if (iTest == 0xf) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector4GreaterOrEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] >= V2.vector4_f32[0]) && (V1.vector4_f32[1] >= V2.vector4_f32[1]) && (V1.vector4_f32[2] >= V2.vector4_f32[2]) && (V1.vector4_f32[3] >= V2.vector4_f32[3])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcgeq_f32(V1, V2); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return (vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) == 0xFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpge_ps(V1, V2); + return ((_mm_movemask_ps(vTemp) == 0x0f) != 0); +#else + return 
XMComparisonAllTrue(XMVector4GreaterOrEqualR(V1, V2)); +#endif +} + +//------------------------------------------------------------------------------ + +inline uint32_t XM_CALLCONV XMVector4GreaterOrEqualR +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + uint32_t CR = 0; + if ((V1.vector4_f32[0] >= V2.vector4_f32[0]) && + (V1.vector4_f32[1] >= V2.vector4_f32[1]) && + (V1.vector4_f32[2] >= V2.vector4_f32[2]) && + (V1.vector4_f32[3] >= V2.vector4_f32[3])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_f32[0] < V2.vector4_f32[0]) && + (V1.vector4_f32[1] < V2.vector4_f32[1]) && + (V1.vector4_f32[2] < V2.vector4_f32[2]) && + (V1.vector4_f32[3] < V2.vector4_f32[3])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcgeq_f32(V1, V2); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1); + + uint32_t CR = 0; + if (r == 0xFFFFFFFFU) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!r) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + uint32_t CR = 0; + XMVECTOR vTemp = _mm_cmpge_ps(V1, V2); + int iTest = _mm_movemask_ps(vTemp); + if (iTest == 0x0f) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector4Less +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] < V2.vector4_f32[0]) && (V1.vector4_f32[1] < V2.vector4_f32[1]) && (V1.vector4_f32[2] < V2.vector4_f32[2]) && (V1.vector4_f32[3] < V2.vector4_f32[3])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcltq_f32(V1, V2); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return (vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) == 0xFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmplt_ps(V1, V2); + return ((_mm_movemask_ps(vTemp) == 0x0f) != 0); +#else + return XMComparisonAllTrue(XMVector4GreaterR(V2, V1)); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector4LessOrEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] <= V2.vector4_f32[0]) && (V1.vector4_f32[1] <= V2.vector4_f32[1]) && (V1.vector4_f32[2] <= V2.vector4_f32[2]) && (V1.vector4_f32[3] <= V2.vector4_f32[3])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcleq_f32(V1, V2); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return (vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) == 0xFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmple_ps(V1, V2); + return ((_mm_movemask_ps(vTemp) == 0x0f) != 0); +#else + return XMComparisonAllTrue(XMVector4GreaterOrEqualR(V2, V1)); +#endif +} + 
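Not part of the upstream DirectXMath header: below is a minimal sketch of how the XMVector4 comparison, dot, and normalize helpers defined in this hunk could be called from engine code once the dependency is vendored. The function name ExampleVector4Checks and the tolerance values are illustrative assumptions introduced here, not anything defined by this patch.

// Illustrative usage sketch only; hypothetical helper, not part of DirectXMath or this patch.
#include <DirectXMath.h>
#include <cmath>

inline bool ExampleVector4Checks()
{
    using namespace DirectX;

    // Two nearly identical 4D vectors compared with a per-component epsilon.
    XMVECTOR a   = XMVectorSet(1.0f, 2.0f, 3.0f, 4.0f);
    XMVECTOR b   = XMVectorSet(1.0f, 2.0f, 3.0f, 4.0f + 1e-6f);
    XMVECTOR eps = XMVectorReplicate(1e-4f);
    bool nearlyEqual = XMVector4NearEqual(a, b, eps);

    // A normalized vector dotted with itself should have squared length ~1.
    XMVECTOR n  = XMVector4Normalize(a);
    float lenSq = XMVectorGetX(XMVector4Dot(n, n));

    return nearlyEqual && std::fabs(lenSq - 1.0f) < 1e-4f;
}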
+//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector4InBounds +( + FXMVECTOR V, + FXMVECTOR Bounds +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) && + (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1]) && + (V.vector4_f32[2] <= Bounds.vector4_f32[2] && V.vector4_f32[2] >= -Bounds.vector4_f32[2]) && + (V.vector4_f32[3] <= Bounds.vector4_f32[3] && V.vector4_f32[3] >= -Bounds.vector4_f32[3])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Test if less than or equal + uint32x4_t ivTemp1 = vcleq_f32(V, Bounds); + // Negate the bounds + float32x4_t vTemp2 = vnegq_f32(Bounds); + // Test if greater or equal (Reversed) + uint32x4_t ivTemp2 = vcleq_f32(vTemp2, V); + // Blend answers + ivTemp1 = vandq_u32(ivTemp1, ivTemp2); + // in bounds? + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(ivTemp1)), vget_high_u8(vreinterpretq_u8_u32(ivTemp1))); + uint16x4x2_t vTemp3 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return (vget_lane_u32(vreinterpret_u32_u16(vTemp3.val[1]), 1) == 0xFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + // Test if less than or equal + XMVECTOR vTemp1 = _mm_cmple_ps(V, Bounds); + // Negate the bounds + XMVECTOR vTemp2 = _mm_mul_ps(Bounds, g_XMNegativeOne); + // Test if greater or equal (Reversed) + vTemp2 = _mm_cmple_ps(vTemp2, V); + // Blend answers + vTemp1 = _mm_and_ps(vTemp1, vTemp2); + // All in bounds? + return ((_mm_movemask_ps(vTemp1) == 0x0f) != 0); +#else + return XMComparisonAllInBounds(XMVector4InBoundsR(V, Bounds)); +#endif +} + +//------------------------------------------------------------------------------ + +#if !defined(_XM_NO_INTRINSICS_) && defined(_MSC_VER) && !defined(__clang__) && !defined(__INTEL_COMPILER) +#pragma float_control(push) +#pragma float_control(precise, on) +#endif + +inline bool XM_CALLCONV XMVector4IsNaN(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (XMISNAN(V.vector4_f32[0]) || + XMISNAN(V.vector4_f32[1]) || + XMISNAN(V.vector4_f32[2]) || + XMISNAN(V.vector4_f32[3])); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Test against itself. NaN is always not equal + uint32x4_t vTempNan = vceqq_f32(V, V); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vTempNan)), vget_high_u8(vreinterpretq_u8_u32(vTempNan))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + // If any are NaN, the mask is zero + return (vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) != 0xFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + // Test against itself. 
NaN is always not equal + XMVECTOR vTempNan = _mm_cmpneq_ps(V, V); + // If any are NaN, the mask is non-zero + return (_mm_movemask_ps(vTempNan) != 0); +#endif +} + +#if !defined(_XM_NO_INTRINSICS_) && defined(_MSC_VER) && !defined(__clang__) && !defined(__INTEL_COMPILER) +#pragma float_control(pop) +#endif + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector4IsInfinite(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + return (XMISINF(V.vector4_f32[0]) || + XMISINF(V.vector4_f32[1]) || + XMISINF(V.vector4_f32[2]) || + XMISINF(V.vector4_f32[3])); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Mask off the sign bit + uint32x4_t vTempInf = vandq_u32(vreinterpretq_u32_f32(V), g_XMAbsMask); + // Compare to infinity + vTempInf = vceqq_f32(vreinterpretq_f32_u32(vTempInf), g_XMInfinity); + // If any are infinity, the signs are true. + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vTempInf)), vget_high_u8(vreinterpretq_u8_u32(vTempInf))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return (vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) != 0); +#elif defined(_XM_SSE_INTRINSICS_) + // Mask off the sign bit + XMVECTOR vTemp = _mm_and_ps(V, g_XMAbsMask); + // Compare to infinity + vTemp = _mm_cmpeq_ps(vTemp, g_XMInfinity); + // If any are infinity, the signs are true. + return (_mm_movemask_ps(vTemp) != 0); +#endif +} + +//------------------------------------------------------------------------------ +// Computation operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4Dot +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORF32 Result; + Result.f[0] = + Result.f[1] = + Result.f[2] = + Result.f[3] = V1.vector4_f32[0] * V2.vector4_f32[0] + V1.vector4_f32[1] * V2.vector4_f32[1] + V1.vector4_f32[2] * V2.vector4_f32[2] + V1.vector4_f32[3] * V2.vector4_f32[3]; + return Result.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t vTemp = vmulq_f32(V1, V2); + float32x2_t v1 = vget_low_f32(vTemp); + float32x2_t v2 = vget_high_f32(vTemp); + v1 = vadd_f32(v1, v2); + v1 = vpadd_f32(v1, v1); + return vcombine_f32(v1, v1); +#elif defined(_XM_SSE4_INTRINSICS_) + return _mm_dp_ps(V1, V2, 0xff); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vTemp = _mm_mul_ps(V1, V2); + vTemp = _mm_hadd_ps(vTemp, vTemp); + return _mm_hadd_ps(vTemp, vTemp); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp2 = V2; + XMVECTOR vTemp = _mm_mul_ps(V1, vTemp2); + vTemp2 = _mm_shuffle_ps(vTemp2, vTemp, _MM_SHUFFLE(1, 0, 0, 0)); // Copy X to the Z position and Y to the W position + vTemp2 = _mm_add_ps(vTemp2, vTemp); // Add Z = X+Z; W = Y+W; + vTemp = _mm_shuffle_ps(vTemp, vTemp2, _MM_SHUFFLE(0, 3, 0, 0)); // Copy W to the Z position + vTemp = _mm_add_ps(vTemp, vTemp2); // Add Z and W together + return XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(2, 2, 2, 2)); // Splat Z and return +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4Cross +( + FXMVECTOR V1, + FXMVECTOR V2, + FXMVECTOR V3 +) noexcept +{ + // [ ((v2.z*v3.w-v2.w*v3.z)*v1.y)-((v2.y*v3.w-v2.w*v3.y)*v1.z)+((v2.y*v3.z-v2.z*v3.y)*v1.w), + // ((v2.w*v3.z-v2.z*v3.w)*v1.x)-((v2.w*v3.x-v2.x*v3.w)*v1.z)+((v2.z*v3.x-v2.x*v3.z)*v1.w), + // 
((v2.y*v3.w-v2.w*v3.y)*v1.x)-((v2.x*v3.w-v2.w*v3.x)*v1.y)+((v2.x*v3.y-v2.y*v3.x)*v1.w), + // ((v2.z*v3.y-v2.y*v3.z)*v1.x)-((v2.z*v3.x-v2.x*v3.z)*v1.y)+((v2.y*v3.x-v2.x*v3.y)*v1.z) ] + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORF32 Result = { { { + (((V2.vector4_f32[2] * V3.vector4_f32[3]) - (V2.vector4_f32[3] * V3.vector4_f32[2])) * V1.vector4_f32[1]) - (((V2.vector4_f32[1] * V3.vector4_f32[3]) - (V2.vector4_f32[3] * V3.vector4_f32[1])) * V1.vector4_f32[2]) + (((V2.vector4_f32[1] * V3.vector4_f32[2]) - (V2.vector4_f32[2] * V3.vector4_f32[1])) * V1.vector4_f32[3]), + (((V2.vector4_f32[3] * V3.vector4_f32[2]) - (V2.vector4_f32[2] * V3.vector4_f32[3])) * V1.vector4_f32[0]) - (((V2.vector4_f32[3] * V3.vector4_f32[0]) - (V2.vector4_f32[0] * V3.vector4_f32[3])) * V1.vector4_f32[2]) + (((V2.vector4_f32[2] * V3.vector4_f32[0]) - (V2.vector4_f32[0] * V3.vector4_f32[2])) * V1.vector4_f32[3]), + (((V2.vector4_f32[1] * V3.vector4_f32[3]) - (V2.vector4_f32[3] * V3.vector4_f32[1])) * V1.vector4_f32[0]) - (((V2.vector4_f32[0] * V3.vector4_f32[3]) - (V2.vector4_f32[3] * V3.vector4_f32[0])) * V1.vector4_f32[1]) + (((V2.vector4_f32[0] * V3.vector4_f32[1]) - (V2.vector4_f32[1] * V3.vector4_f32[0])) * V1.vector4_f32[3]), + (((V2.vector4_f32[2] * V3.vector4_f32[1]) - (V2.vector4_f32[1] * V3.vector4_f32[2])) * V1.vector4_f32[0]) - (((V2.vector4_f32[2] * V3.vector4_f32[0]) - (V2.vector4_f32[0] * V3.vector4_f32[2])) * V1.vector4_f32[1]) + (((V2.vector4_f32[1] * V3.vector4_f32[0]) - (V2.vector4_f32[0] * V3.vector4_f32[1])) * V1.vector4_f32[2]), + } } }; + return Result.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + const uint32x2_t select = vget_low_u32(g_XMMaskX); + + // Term1: V2zwyz * V3wzwy + const float32x2_t v2xy = vget_low_f32(V2); + const float32x2_t v2zw = vget_high_f32(V2); + const float32x2_t v2yx = vrev64_f32(v2xy); + const float32x2_t v2wz = vrev64_f32(v2zw); + const float32x2_t v2yz = vbsl_f32(select, v2yx, v2wz); + + const float32x2_t v3zw = vget_high_f32(V3); + const float32x2_t v3wz = vrev64_f32(v3zw); + const float32x2_t v3xy = vget_low_f32(V3); + const float32x2_t v3wy = vbsl_f32(select, v3wz, v3xy); + + float32x4_t vTemp1 = vcombine_f32(v2zw, v2yz); + float32x4_t vTemp2 = vcombine_f32(v3wz, v3wy); + XMVECTOR vResult = vmulq_f32(vTemp1, vTemp2); + + // - V2wzwy * V3zwyz + const float32x2_t v2wy = vbsl_f32(select, v2wz, v2xy); + + const float32x2_t v3yx = vrev64_f32(v3xy); + const float32x2_t v3yz = vbsl_f32(select, v3yx, v3wz); + + vTemp1 = vcombine_f32(v2wz, v2wy); + vTemp2 = vcombine_f32(v3zw, v3yz); + vResult = vmlsq_f32(vResult, vTemp1, vTemp2); + + // term1 * V1yxxx + const float32x2_t v1xy = vget_low_f32(V1); + const float32x2_t v1yx = vrev64_f32(v1xy); + + vTemp1 = vcombine_f32(v1yx, vdup_lane_f32(v1yx, 1)); + vResult = vmulq_f32(vResult, vTemp1); + + // Term2: V2ywxz * V3wxwx + const float32x2_t v2yw = vrev64_f32(v2wy); + const float32x2_t v2xz = vbsl_f32(select, v2xy, v2wz); + + const float32x2_t v3wx = vbsl_f32(select, v3wz, v3yx); + + vTemp1 = vcombine_f32(v2yw, v2xz); + vTemp2 = vcombine_f32(v3wx, v3wx); + float32x4_t vTerm = vmulq_f32(vTemp1, vTemp2); + + // - V2wxwx * V3ywxz + const float32x2_t v2wx = vbsl_f32(select, v2wz, v2yx); + + const float32x2_t v3yw = vrev64_f32(v3wy); + const float32x2_t v3xz = vbsl_f32(select, v3xy, v3wz); + + vTemp1 = vcombine_f32(v2wx, v2wx); + vTemp2 = vcombine_f32(v3yw, v3xz); + vTerm = vmlsq_f32(vTerm, vTemp1, vTemp2); + + // vResult - term2 * V1zzyy + const float32x2_t v1zw = vget_high_f32(V1); + + vTemp1 = vcombine_f32(vdup_lane_f32(v1zw, 0), 
vdup_lane_f32(v1yx, 0)); + vResult = vmlsq_f32(vResult, vTerm, vTemp1); + + // Term3: V2yzxy * V3zxyx + const float32x2_t v3zx = vrev64_f32(v3xz); + + vTemp1 = vcombine_f32(v2yz, v2xy); + vTemp2 = vcombine_f32(v3zx, v3yx); + vTerm = vmulq_f32(vTemp1, vTemp2); + + // - V2zxyx * V3yzxy + const float32x2_t v2zx = vrev64_f32(v2xz); + + vTemp1 = vcombine_f32(v2zx, v2yx); + vTemp2 = vcombine_f32(v3yz, v3xy); + vTerm = vmlsq_f32(vTerm, vTemp1, vTemp2); + + // vResult + term3 * V1wwwz + const float32x2_t v1wz = vrev64_f32(v1zw); + + vTemp1 = vcombine_f32(vdup_lane_f32(v1wz, 0), v1wz); + return vmlaq_f32(vResult, vTerm, vTemp1); +#elif defined(_XM_SSE_INTRINSICS_) + // V2zwyz * V3wzwy + XMVECTOR vResult = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 1, 3, 2)); + XMVECTOR vTemp3 = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 3, 2, 3)); + vResult = _mm_mul_ps(vResult, vTemp3); + // - V2wzwy * V3zwyz + XMVECTOR vTemp2 = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 3, 2, 3)); + vTemp3 = XM_PERMUTE_PS(vTemp3, _MM_SHUFFLE(1, 3, 0, 1)); + vResult = XM_FNMADD_PS(vTemp2, vTemp3, vResult); + // term1 * V1yxxx + XMVECTOR vTemp1 = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 1)); + vResult = _mm_mul_ps(vResult, vTemp1); + + // V2ywxz * V3wxwx + vTemp2 = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 0, 3, 1)); + vTemp3 = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 3, 0, 3)); + vTemp3 = _mm_mul_ps(vTemp3, vTemp2); + // - V2wxwx * V3ywxz + vTemp2 = XM_PERMUTE_PS(vTemp2, _MM_SHUFFLE(2, 1, 2, 1)); + vTemp1 = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 0, 3, 1)); + vTemp3 = XM_FNMADD_PS(vTemp2, vTemp1, vTemp3); + // vResult - temp * V1zzyy + vTemp1 = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 2, 2)); + vResult = XM_FNMADD_PS(vTemp1, vTemp3, vResult); + + // V2yzxy * V3zxyx + vTemp2 = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 0, 2, 1)); + vTemp3 = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 1, 0, 2)); + vTemp3 = _mm_mul_ps(vTemp3, vTemp2); + // - V2zxyx * V3yzxy + vTemp2 = XM_PERMUTE_PS(vTemp2, _MM_SHUFFLE(2, 0, 2, 1)); + vTemp1 = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 0, 2, 1)); + vTemp3 = XM_FNMADD_PS(vTemp1, vTemp2, vTemp3); + // vResult + term * V1wwwz + vTemp1 = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 3, 3, 3)); + vResult = XM_FMADD_PS(vTemp3, vTemp1, vResult); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4LengthSq(FXMVECTOR V) noexcept +{ + return XMVector4Dot(V, V); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4ReciprocalLengthEst(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result = XMVector4LengthSq(V); + Result = XMVectorReciprocalSqrtEst(Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot4 + float32x4_t vTemp = vmulq_f32(V, V); + float32x2_t v1 = vget_low_f32(vTemp); + float32x2_t v2 = vget_high_f32(vTemp); + v1 = vadd_f32(v1, v2); + v1 = vpadd_f32(v1, v1); + // Reciprocal sqrt (estimate) + v2 = vrsqrte_f32(v1); + return vcombine_f32(v2, v2); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps(V, V, 0xff); + return _mm_rsqrt_ps(vTemp); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_rsqrt_ps(vLengthSq); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y,z and w + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + // vTemp has z and w + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, 
_MM_SHUFFLE(3, 2, 3, 2)); + // x+z, y+w + vLengthSq = _mm_add_ps(vLengthSq, vTemp); + // x+z,x+z,x+z,y+w + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 0, 0, 0)); + // ??,??,y+w,y+w + vTemp = _mm_shuffle_ps(vTemp, vLengthSq, _MM_SHUFFLE(3, 3, 0, 0)); + // ??,??,x+z+y+w,?? + vLengthSq = _mm_add_ps(vLengthSq, vTemp); + // Splat the length + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(2, 2, 2, 2)); + // Get the reciprocal + vLengthSq = _mm_rsqrt_ps(vLengthSq); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4ReciprocalLength(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result = XMVector4LengthSq(V); + Result = XMVectorReciprocalSqrt(Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot4 + float32x4_t vTemp = vmulq_f32(V, V); + float32x2_t v1 = vget_low_f32(vTemp); + float32x2_t v2 = vget_high_f32(vTemp); + v1 = vadd_f32(v1, v2); + v1 = vpadd_f32(v1, v1); + // Reciprocal sqrt + float32x2_t S0 = vrsqrte_f32(v1); + float32x2_t P0 = vmul_f32(v1, S0); + float32x2_t R0 = vrsqrts_f32(P0, S0); + float32x2_t S1 = vmul_f32(S0, R0); + float32x2_t P1 = vmul_f32(v1, S1); + float32x2_t R1 = vrsqrts_f32(P1, S1); + float32x2_t Result = vmul_f32(S1, R1); + return vcombine_f32(Result, Result); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps(V, V, 0xff); + XMVECTOR vLengthSq = _mm_sqrt_ps(vTemp); + return _mm_div_ps(g_XMOne, vLengthSq); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_sqrt_ps(vLengthSq); + vLengthSq = _mm_div_ps(g_XMOne, vLengthSq); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y,z and w + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + // vTemp has z and w + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(3, 2, 3, 2)); + // x+z, y+w + vLengthSq = _mm_add_ps(vLengthSq, vTemp); + // x+z,x+z,x+z,y+w + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 0, 0, 0)); + // ??,??,y+w,y+w + vTemp = _mm_shuffle_ps(vTemp, vLengthSq, _MM_SHUFFLE(3, 3, 0, 0)); + // ??,??,x+z+y+w,?? + vLengthSq = _mm_add_ps(vLengthSq, vTemp); + // Splat the length + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(2, 2, 2, 2)); + // Get the reciprocal + vLengthSq = _mm_sqrt_ps(vLengthSq); + // Accurate! 
+ vLengthSq = _mm_div_ps(g_XMOne, vLengthSq); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4LengthEst(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result = XMVector4LengthSq(V); + Result = XMVectorSqrtEst(Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot4 + float32x4_t vTemp = vmulq_f32(V, V); + float32x2_t v1 = vget_low_f32(vTemp); + float32x2_t v2 = vget_high_f32(vTemp); + v1 = vadd_f32(v1, v2); + v1 = vpadd_f32(v1, v1); + const float32x2_t zero = vdup_n_f32(0); + uint32x2_t VEqualsZero = vceq_f32(v1, zero); + // Sqrt (estimate) + float32x2_t Result = vrsqrte_f32(v1); + Result = vmul_f32(v1, Result); + Result = vbsl_f32(VEqualsZero, zero, Result); + return vcombine_f32(Result, Result); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps(V, V, 0xff); + return _mm_sqrt_ps(vTemp); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_sqrt_ps(vLengthSq); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y,z and w + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + // vTemp has z and w + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(3, 2, 3, 2)); + // x+z, y+w + vLengthSq = _mm_add_ps(vLengthSq, vTemp); + // x+z,x+z,x+z,y+w + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 0, 0, 0)); + // ??,??,y+w,y+w + vTemp = _mm_shuffle_ps(vTemp, vLengthSq, _MM_SHUFFLE(3, 3, 0, 0)); + // ??,??,x+z+y+w,?? + vLengthSq = _mm_add_ps(vLengthSq, vTemp); + // Splat the length + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(2, 2, 2, 2)); + // Get the length + vLengthSq = _mm_sqrt_ps(vLengthSq); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4Length(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result = XMVector4LengthSq(V); + Result = XMVectorSqrt(Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot4 + float32x4_t vTemp = vmulq_f32(V, V); + float32x2_t v1 = vget_low_f32(vTemp); + float32x2_t v2 = vget_high_f32(vTemp); + v1 = vadd_f32(v1, v2); + v1 = vpadd_f32(v1, v1); + const float32x2_t zero = vdup_n_f32(0); + uint32x2_t VEqualsZero = vceq_f32(v1, zero); + // Sqrt + float32x2_t S0 = vrsqrte_f32(v1); + float32x2_t P0 = vmul_f32(v1, S0); + float32x2_t R0 = vrsqrts_f32(P0, S0); + float32x2_t S1 = vmul_f32(S0, R0); + float32x2_t P1 = vmul_f32(v1, S1); + float32x2_t R1 = vrsqrts_f32(P1, S1); + float32x2_t Result = vmul_f32(S1, R1); + Result = vmul_f32(v1, Result); + Result = vbsl_f32(VEqualsZero, zero, Result); + return vcombine_f32(Result, Result); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps(V, V, 0xff); + return _mm_sqrt_ps(vTemp); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_sqrt_ps(vLengthSq); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y,z and w + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + // vTemp has z and w + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(3, 2, 3, 2)); + // x+z, y+w + vLengthSq = _mm_add_ps(vLengthSq, vTemp); + // x+z,x+z,x+z,y+w + 
vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 0, 0, 0)); + // ??,??,y+w,y+w + vTemp = _mm_shuffle_ps(vTemp, vLengthSq, _MM_SHUFFLE(3, 3, 0, 0)); + // ??,??,x+z+y+w,?? + vLengthSq = _mm_add_ps(vLengthSq, vTemp); + // Splat the length + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(2, 2, 2, 2)); + // Get the length + vLengthSq = _mm_sqrt_ps(vLengthSq); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ +// XMVector4NormalizeEst uses a reciprocal estimate and +// returns QNaN on zero and infinite vectors. + +inline XMVECTOR XM_CALLCONV XMVector4NormalizeEst(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result = XMVector4ReciprocalLength(V); + Result = XMVectorMultiply(V, Result); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot4 + float32x4_t vTemp = vmulq_f32(V, V); + float32x2_t v1 = vget_low_f32(vTemp); + float32x2_t v2 = vget_high_f32(vTemp); + v1 = vadd_f32(v1, v2); + v1 = vpadd_f32(v1, v1); + // Reciprocal sqrt (estimate) + v2 = vrsqrte_f32(v1); + // Normalize + return vmulq_f32(V, vcombine_f32(v2, v2)); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps(V, V, 0xff); + XMVECTOR vResult = _mm_rsqrt_ps(vTemp); + return _mm_mul_ps(vResult, V); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vDot = _mm_mul_ps(V, V); + vDot = _mm_hadd_ps(vDot, vDot); + vDot = _mm_hadd_ps(vDot, vDot); + vDot = _mm_rsqrt_ps(vDot); + vDot = _mm_mul_ps(vDot, V); + return vDot; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y,z and w + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + // vTemp has z and w + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(3, 2, 3, 2)); + // x+z, y+w + vLengthSq = _mm_add_ps(vLengthSq, vTemp); + // x+z,x+z,x+z,y+w + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 0, 0, 0)); + // ??,??,y+w,y+w + vTemp = _mm_shuffle_ps(vTemp, vLengthSq, _MM_SHUFFLE(3, 3, 0, 0)); + // ??,??,x+z+y+w,?? 
+ vLengthSq = _mm_add_ps(vLengthSq, vTemp); + // Splat the length + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(2, 2, 2, 2)); + // Get the reciprocal + XMVECTOR vResult = _mm_rsqrt_ps(vLengthSq); + // Reciprocal mul to perform the normalization + vResult = _mm_mul_ps(vResult, V); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4Normalize(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + float fLength; + XMVECTOR vResult; + + vResult = XMVector4Length(V); + fLength = vResult.vector4_f32[0]; + + // Prevent divide by zero + if (fLength > 0) + { + fLength = 1.0f / fLength; + } + + vResult.vector4_f32[0] = V.vector4_f32[0] * fLength; + vResult.vector4_f32[1] = V.vector4_f32[1] * fLength; + vResult.vector4_f32[2] = V.vector4_f32[2] * fLength; + vResult.vector4_f32[3] = V.vector4_f32[3] * fLength; + return vResult; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot4 + float32x4_t vTemp = vmulq_f32(V, V); + float32x2_t v1 = vget_low_f32(vTemp); + float32x2_t v2 = vget_high_f32(vTemp); + v1 = vadd_f32(v1, v2); + v1 = vpadd_f32(v1, v1); + uint32x2_t VEqualsZero = vceq_f32(v1, vdup_n_f32(0)); + uint32x2_t VEqualsInf = vceq_f32(v1, vget_low_f32(g_XMInfinity)); + // Reciprocal sqrt (2 iterations of Newton-Raphson) + float32x2_t S0 = vrsqrte_f32(v1); + float32x2_t P0 = vmul_f32(v1, S0); + float32x2_t R0 = vrsqrts_f32(P0, S0); + float32x2_t S1 = vmul_f32(S0, R0); + float32x2_t P1 = vmul_f32(v1, S1); + float32x2_t R1 = vrsqrts_f32(P1, S1); + v2 = vmul_f32(S1, R1); + // Normalize + XMVECTOR vResult = vmulq_f32(V, vcombine_f32(v2, v2)); + vResult = vbslq_f32(vcombine_u32(VEqualsZero, VEqualsZero), vdupq_n_f32(0), vResult); + return vbslq_f32(vcombine_u32(VEqualsInf, VEqualsInf), g_XMQNaN, vResult); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vLengthSq = _mm_dp_ps(V, V, 0xff); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Create zero with a single instruction + XMVECTOR vZeroMask = _mm_setzero_ps(); + // Test for a divide by zero (Must be FP to detect -0.0) + vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity); + // Divide to perform the normalization + vResult = _mm_div_ps(V, vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult, vZeroMask); + // Select qnan or result based on infinite length + XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq, g_XMQNaN); + XMVECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq); + vResult = _mm_or_ps(vTemp1, vTemp2); + return vResult; +#elif defined(_XM_SSE3_INTRINSICS_) + // Perform the dot product on x,y,z and w + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Create zero with a single instruction + XMVECTOR vZeroMask = _mm_setzero_ps(); + // Test for a divide by zero (Must be FP to detect -0.0) + vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity); + // Divide to perform the normalization + vResult = _mm_div_ps(V, vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult, vZeroMask); + // Select qnan or result 
based on infinite length + XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq, g_XMQNaN); + XMVECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq); + vResult = _mm_or_ps(vTemp1, vTemp2); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y,z and w + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + // vTemp has z and w + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(3, 2, 3, 2)); + // x+z, y+w + vLengthSq = _mm_add_ps(vLengthSq, vTemp); + // x+z,x+z,x+z,y+w + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 0, 0, 0)); + // ??,??,y+w,y+w + vTemp = _mm_shuffle_ps(vTemp, vLengthSq, _MM_SHUFFLE(3, 3, 0, 0)); + // ??,??,x+z+y+w,?? + vLengthSq = _mm_add_ps(vLengthSq, vTemp); + // Splat the length + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(2, 2, 2, 2)); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Create zero with a single instruction + XMVECTOR vZeroMask = _mm_setzero_ps(); + // Test for a divide by zero (Must be FP to detect -0.0) + vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity); + // Divide to perform the normalization + vResult = _mm_div_ps(V, vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult, vZeroMask); + // Select qnan or result based on infinite length + XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq, g_XMQNaN); + XMVECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq); + vResult = _mm_or_ps(vTemp1, vTemp2); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4ClampLength +( + FXMVECTOR V, + float LengthMin, + float LengthMax +) noexcept +{ + XMVECTOR ClampMax = XMVectorReplicate(LengthMax); + XMVECTOR ClampMin = XMVectorReplicate(LengthMin); + + return XMVector4ClampLengthV(V, ClampMin, ClampMax); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4ClampLengthV +( + FXMVECTOR V, + FXMVECTOR LengthMin, + FXMVECTOR LengthMax +) noexcept +{ + assert((XMVectorGetY(LengthMin) == XMVectorGetX(LengthMin)) && (XMVectorGetZ(LengthMin) == XMVectorGetX(LengthMin)) && (XMVectorGetW(LengthMin) == XMVectorGetX(LengthMin))); + assert((XMVectorGetY(LengthMax) == XMVectorGetX(LengthMax)) && (XMVectorGetZ(LengthMax) == XMVectorGetX(LengthMax)) && (XMVectorGetW(LengthMax) == XMVectorGetX(LengthMax))); + assert(XMVector4GreaterOrEqual(LengthMin, XMVectorZero())); + assert(XMVector4GreaterOrEqual(LengthMax, XMVectorZero())); + assert(XMVector4GreaterOrEqual(LengthMax, LengthMin)); + + XMVECTOR LengthSq = XMVector4LengthSq(V); + + const XMVECTOR Zero = XMVectorZero(); + + XMVECTOR RcpLength = XMVectorReciprocalSqrt(LengthSq); + + XMVECTOR InfiniteLength = XMVectorEqualInt(LengthSq, g_XMInfinity.v); + XMVECTOR ZeroLength = XMVectorEqual(LengthSq, Zero); + + XMVECTOR Normal = XMVectorMultiply(V, RcpLength); + + XMVECTOR Length = XMVectorMultiply(LengthSq, RcpLength); + + XMVECTOR Select = XMVectorEqualInt(InfiniteLength, ZeroLength); + Length = XMVectorSelect(LengthSq, Length, Select); + Normal = XMVectorSelect(LengthSq, Normal, Select); + + XMVECTOR ControlMax = XMVectorGreater(Length, LengthMax); + XMVECTOR ControlMin = XMVectorLess(Length, LengthMin); + + XMVECTOR ClampLength = XMVectorSelect(Length, LengthMax, ControlMax); + ClampLength = XMVectorSelect(ClampLength, LengthMin, ControlMin); + + 
XMVECTOR Result = XMVectorMultiply(Normal, ClampLength); + + // Preserve the original vector (with no precision loss) if the length falls within the given range + XMVECTOR Control = XMVectorEqualInt(ControlMax, ControlMin); + Result = XMVectorSelect(Result, V, Control); + + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4Reflect +( + FXMVECTOR Incident, + FXMVECTOR Normal +) noexcept +{ + // Result = Incident - (2 * dot(Incident, Normal)) * Normal + + XMVECTOR Result = XMVector4Dot(Incident, Normal); + Result = XMVectorAdd(Result, Result); + Result = XMVectorNegativeMultiplySubtract(Result, Normal, Incident); + + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4Refract +( + FXMVECTOR Incident, + FXMVECTOR Normal, + float RefractionIndex +) noexcept +{ + XMVECTOR Index = XMVectorReplicate(RefractionIndex); + return XMVector4RefractV(Incident, Normal, Index); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4RefractV +( + FXMVECTOR Incident, + FXMVECTOR Normal, + FXMVECTOR RefractionIndex +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR IDotN; + XMVECTOR R; + const XMVECTOR Zero = XMVectorZero(); + + // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) + + // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal)))) + + IDotN = XMVector4Dot(Incident, Normal); + + // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) + R = XMVectorNegativeMultiplySubtract(IDotN, IDotN, g_XMOne.v); + R = XMVectorMultiply(R, RefractionIndex); + R = XMVectorNegativeMultiplySubtract(R, RefractionIndex, g_XMOne.v); + + if (XMVector4LessOrEqual(R, Zero)) + { + // Total internal reflection + return Zero; + } + else + { + XMVECTOR Result; + + // R = RefractionIndex * IDotN + sqrt(R) + R = XMVectorSqrt(R); + R = XMVectorMultiplyAdd(RefractionIndex, IDotN, R); + + // Result = RefractionIndex * Incident - Normal * R + Result = XMVectorMultiply(RefractionIndex, Incident); + Result = XMVectorNegativeMultiplySubtract(Normal, R, Result); + + return Result; + } + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR IDotN = XMVector4Dot(Incident, Normal); + + // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) + float32x4_t R = vmlsq_f32(g_XMOne, IDotN, IDotN); + R = vmulq_f32(R, RefractionIndex); + R = vmlsq_f32(g_XMOne, R, RefractionIndex); + + uint32x4_t isrzero = vcleq_f32(R, g_XMZero); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(isrzero)), vget_high_u8(vreinterpretq_u8_u32(isrzero))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + + float32x4_t vResult; + if (vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) == 0xFFFFFFFFU) + { + // Total internal reflection + vResult = g_XMZero; + } + else + { + // Sqrt(R) + float32x4_t S0 = vrsqrteq_f32(R); + float32x4_t P0 = vmulq_f32(R, S0); + float32x4_t R0 = vrsqrtsq_f32(P0, S0); + float32x4_t S1 = vmulq_f32(S0, R0); + float32x4_t P1 = vmulq_f32(R, S1); + float32x4_t R1 = vrsqrtsq_f32(P1, S1); + float32x4_t S2 = vmulq_f32(S1, R1); + R = vmulq_f32(R, S2); + // R = RefractionIndex * IDotN + sqrt(R) + R = vmlaq_f32(R, RefractionIndex, IDotN); + // Result = RefractionIndex * Incident - Normal * R + vResult = 
vmulq_f32(RefractionIndex, Incident); + vResult = vmlsq_f32(vResult, R, Normal); + } + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR IDotN = XMVector4Dot(Incident, Normal); + + // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) + XMVECTOR R = XM_FNMADD_PS(IDotN, IDotN, g_XMOne); + XMVECTOR R2 = _mm_mul_ps(RefractionIndex, RefractionIndex); + R = XM_FNMADD_PS(R, R2, g_XMOne); + + XMVECTOR vResult = _mm_cmple_ps(R, g_XMZero); + if (_mm_movemask_ps(vResult) == 0x0f) + { + // Total internal reflection + vResult = g_XMZero; + } + else + { + // R = RefractionIndex * IDotN + sqrt(R) + R = _mm_sqrt_ps(R); + R = XM_FMADD_PS(RefractionIndex, IDotN, R); + // Result = RefractionIndex * Incident - Normal * R + vResult = _mm_mul_ps(RefractionIndex, Incident); + vResult = XM_FNMADD_PS(R, Normal, vResult); + } + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4Orthogonal(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORF32 Result = { { { + V.vector4_f32[2], + V.vector4_f32[3], + -V.vector4_f32[0], + -V.vector4_f32[1] + } } }; + return Result.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 Negate = { { { 1.f, 1.f, -1.f, -1.f } } }; + + float32x4_t Result = vcombine_f32(vget_high_f32(V), vget_low_f32(V)); + return vmulq_f32(Result, Negate); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 FlipZW = { { { 1.0f, 1.0f, -1.0f, -1.0f } } }; + XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 0, 3, 2)); + vResult = _mm_mul_ps(vResult, FlipZW); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4AngleBetweenNormalsEst +( + FXMVECTOR N1, + FXMVECTOR N2 +) noexcept +{ + XMVECTOR Result = XMVector4Dot(N1, N2); + Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v); + Result = XMVectorACosEst(Result); + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4AngleBetweenNormals +( + FXMVECTOR N1, + FXMVECTOR N2 +) noexcept +{ + XMVECTOR Result = XMVector4Dot(N1, N2); + Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v); + Result = XMVectorACos(Result); + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4AngleBetweenVectors +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ + XMVECTOR L1 = XMVector4ReciprocalLength(V1); + XMVECTOR L2 = XMVector4ReciprocalLength(V2); + + XMVECTOR Dot = XMVector4Dot(V1, V2); + + L1 = XMVectorMultiply(L1, L2); + + XMVECTOR CosAngle = XMVectorMultiply(Dot, L1); + CosAngle = XMVectorClamp(CosAngle, g_XMNegativeOne.v, g_XMOne.v); + + return XMVectorACos(CosAngle); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4Transform +( + FXMVECTOR V, + FXMMATRIX M +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + float fX = (M.m[0][0] * V.vector4_f32[0]) + (M.m[1][0] * V.vector4_f32[1]) + (M.m[2][0] * V.vector4_f32[2]) + (M.m[3][0] * V.vector4_f32[3]); + float fY = (M.m[0][1] * V.vector4_f32[0]) + (M.m[1][1] * V.vector4_f32[1]) + (M.m[2][1] * V.vector4_f32[2]) + (M.m[3][1] * V.vector4_f32[3]); + float fZ = (M.m[0][2] * V.vector4_f32[0]) + (M.m[1][2] * V.vector4_f32[1]) + (M.m[2][2] * V.vector4_f32[2]) + (M.m[3][2] * V.vector4_f32[3]); + 
    float fW = (M.m[0][3] * V.vector4_f32[0]) + (M.m[1][3] * V.vector4_f32[1]) + (M.m[2][3] * V.vector4_f32[2]) + (M.m[3][3] * V.vector4_f32[3]);
+    XMVECTORF32 vResult = { { { fX, fY, fZ, fW } } };
+    return vResult.v;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x2_t VL = vget_low_f32(V);
+    XMVECTOR vResult = vmulq_lane_f32(M.r[0], VL, 0); // X
+    vResult = vmlaq_lane_f32(vResult, M.r[1], VL, 1); // Y
+    float32x2_t VH = vget_high_f32(V);
+    vResult = vmlaq_lane_f32(vResult, M.r[2], VH, 0); // Z
+    return vmlaq_lane_f32(vResult, M.r[3], VH, 1); // W
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3)); // W
+    vResult = _mm_mul_ps(vResult, M.r[3]);
+    XMVECTOR vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); // Z
+    vResult = XM_FMADD_PS(vTemp, M.r[2], vResult);
+    vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); // Y
+    vResult = XM_FMADD_PS(vTemp, M.r[1], vResult);
+    vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); // X
+    vResult = XM_FMADD_PS(vTemp, M.r[0], vResult);
+    return vResult;
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMFLOAT4* XM_CALLCONV XMVector4TransformStream
+(
+    XMFLOAT4* pOutputStream,
+    size_t OutputStride,
+    const XMFLOAT4* pInputStream,
+    size_t InputStride,
+    size_t VectorCount,
+    FXMMATRIX M
+) noexcept
+{
+    assert(pOutputStream != nullptr);
+    assert(pInputStream != nullptr);
+
+    assert(InputStride >= sizeof(XMFLOAT4));
+    _Analysis_assume_(InputStride >= sizeof(XMFLOAT4));
+
+    assert(OutputStride >= sizeof(XMFLOAT4));
+    _Analysis_assume_(OutputStride >= sizeof(XMFLOAT4));
+
+#if defined(_XM_NO_INTRINSICS_)
+
+    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
+    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
+
+    const XMVECTOR row0 = M.r[0];
+    const XMVECTOR row1 = M.r[1];
+    const XMVECTOR row2 = M.r[2];
+    const XMVECTOR row3 = M.r[3];
+
+    for (size_t i = 0; i < VectorCount; i++)
+    {
+        XMVECTOR V = XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pInputVector));
+        XMVECTOR W = XMVectorSplatW(V);
+        XMVECTOR Z = XMVectorSplatZ(V);
+        XMVECTOR Y = XMVectorSplatY(V);
+        XMVECTOR X = XMVectorSplatX(V);
+
+        XMVECTOR Result = XMVectorMultiply(W, row3);
+        Result = XMVectorMultiplyAdd(Z, row2, Result);
+        Result = XMVectorMultiplyAdd(Y, row1, Result);
+        Result = XMVectorMultiplyAdd(X, row0, Result);
+
+#ifdef _PREFAST_
+#pragma prefast(push)
+#pragma prefast(disable : 26015, "PREfast noise: Esp:1307" )
+#endif
+
+        XMStoreFloat4(reinterpret_cast<XMFLOAT4*>(pOutputVector), Result);
+
+#ifdef _PREFAST_
+#pragma prefast(pop)
+#endif
+
+        pInputVector += InputStride;
+        pOutputVector += OutputStride;
+    }
+
+    return pOutputStream;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
+    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
+
+    const XMVECTOR row0 = M.r[0];
+    const XMVECTOR row1 = M.r[1];
+    const XMVECTOR row2 = M.r[2];
+    const XMVECTOR row3 = M.r[3];
+
+    size_t i = 0;
+    size_t four = VectorCount >> 2;
+    if (four > 0)
+    {
+        if ((InputStride == sizeof(XMFLOAT4)) && (OutputStride == sizeof(XMFLOAT4)))
+        {
+            for (size_t j = 0; j < four; ++j)
+            {
+                float32x4x4_t V = vld4q_f32(reinterpret_cast<const float*>(pInputVector));
+                pInputVector += sizeof(XMFLOAT4) * 4;
+
+                float32x2_t r = vget_low_f32(row0);
+                XMVECTOR vResult0 = vmulq_lane_f32(V.val[0], r, 0); // Ax
+                XMVECTOR vResult1 = vmulq_lane_f32(V.val[0], r, 1); // Bx
+
+                XM_PREFETCH(pInputVector);
+
+                r = vget_high_f32(row0);
+                XMVECTOR vResult2 = vmulq_lane_f32(V.val[0], r, 0); // Cx
+                XMVECTOR vResult3 = vmulq_lane_f32(V.val[0], r, 1); // Dx
+
+                XM_PREFETCH(pInputVector + XM_CACHE_LINE_SIZE);
+
+                r = vget_low_f32(row1);
+                vResult0 = vmlaq_lane_f32(vResult0, V.val[1], r, 0); // Ax+Ey
+                vResult1 = vmlaq_lane_f32(vResult1, V.val[1], r, 1); // Bx+Fy
+
+                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 2));
+
+                r = vget_high_f32(row1);
+                vResult2 = vmlaq_lane_f32(vResult2, V.val[1], r, 0); // Cx+Gy
+                vResult3 = vmlaq_lane_f32(vResult3, V.val[1], r, 1); // Dx+Hy
+
+                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 3));
+
+                r = vget_low_f32(row2);
+                vResult0 = vmlaq_lane_f32(vResult0, V.val[2], r, 0); // Ax+Ey+Iz
+                vResult1 = vmlaq_lane_f32(vResult1, V.val[2], r, 1); // Bx+Fy+Jz
+
+                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 4));
+
+                r = vget_high_f32(row2);
+                vResult2 = vmlaq_lane_f32(vResult2, V.val[2], r, 0); // Cx+Gy+Kz
+                vResult3 = vmlaq_lane_f32(vResult3, V.val[2], r, 1); // Dx+Hy+Lz
+
+                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 5));
+
+                r = vget_low_f32(row3);
+                vResult0 = vmlaq_lane_f32(vResult0, V.val[3], r, 0); // Ax+Ey+Iz+Mw
+                vResult1 = vmlaq_lane_f32(vResult1, V.val[3], r, 1); // Bx+Fy+Jz+Nw
+
+                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 6));
+
+                r = vget_high_f32(row3);
+                vResult2 = vmlaq_lane_f32(vResult2, V.val[3], r, 0); // Cx+Gy+Kz+Ow
+                vResult3 = vmlaq_lane_f32(vResult3, V.val[3], r, 1); // Dx+Hy+Lz+Pw
+
+                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 7));
+
+                V.val[0] = vResult0;
+                V.val[1] = vResult1;
+                V.val[2] = vResult2;
+                V.val[3] = vResult3;
+
+                vst4q_f32(reinterpret_cast<float*>(pOutputVector), V);
+                pOutputVector += sizeof(XMFLOAT4) * 4;
+
+                i += 4;
+            }
+        }
+    }
+
+    for (; i < VectorCount; i++)
+    {
+        XMVECTOR V = vld1q_f32(reinterpret_cast<const float*>(pInputVector));
+        pInputVector += InputStride;
+
+        float32x2_t VL = vget_low_f32(V);
+        XMVECTOR vResult = vmulq_lane_f32(row0, VL, 0); // X
+        vResult = vmlaq_lane_f32(vResult, row1, VL, 1); // Y
+        float32x2_t VH = vget_high_f32(V);
+        vResult = vmlaq_lane_f32(vResult, row2, VH, 0); // Z
+        vResult = vmlaq_lane_f32(vResult, row3, VH, 1); // W
+
+        vst1q_f32(reinterpret_cast<float*>(pOutputVector), vResult);
+        pOutputVector += OutputStride;
+    }
+
+    return pOutputStream;
+#elif defined(_XM_AVX2_INTRINSICS_)
+    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
+    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
+
+    size_t i = 0;
+    size_t two = VectorCount >> 1;
+    if (two > 0)
+    {
+        __m256 row0 = _mm256_broadcast_ps(&M.r[0]);
+        __m256 row1 = _mm256_broadcast_ps(&M.r[1]);
+        __m256 row2 = _mm256_broadcast_ps(&M.r[2]);
+        __m256 row3 = _mm256_broadcast_ps(&M.r[3]);
+
+        if (InputStride == sizeof(XMFLOAT4))
+        {
+            if (OutputStride == sizeof(XMFLOAT4))
+            {
+                if (!(reinterpret_cast<uintptr_t>(pOutputStream) & 0x1F))
+                {
+                    // Packed input, aligned & packed output
+                    for (size_t j = 0; j < two; ++j)
+                    {
+                        __m256 VV = _mm256_loadu_ps(reinterpret_cast<const float*>(pInputVector));
+                        pInputVector += sizeof(XMFLOAT4) * 2;
+
+                        __m256 vTempX = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0));
+                        __m256 vTempY = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1));
+                        __m256 vTempZ = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2));
+                        __m256 vTempW = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3));
+
+                        vTempX = _mm256_mul_ps(vTempX, row0);
+                        vTempY = _mm256_mul_ps(vTempY, row1);
+                        vTempZ = _mm256_fmadd_ps(vTempZ, row2, vTempX);
+                        vTempW = _mm256_fmadd_ps(vTempW, row3, vTempY);
+                        vTempX = _mm256_add_ps(vTempZ, vTempW);
+
+                        XM256_STREAM_PS(reinterpret_cast<float*>(pOutputVector), vTempX);
+                        pOutputVector += sizeof(XMFLOAT4) * 2;
+
+                        i += 2;
+                    }
+                }
+                else
+                {
+                    // Packed input, packed output
+                    for (size_t j = 0; j < two; ++j)
+                    {
+                        __m256 VV = _mm256_loadu_ps(reinterpret_cast<const float*>(pInputVector));
+                        pInputVector += sizeof(XMFLOAT4) * 2;
+
+                        __m256 vTempX = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0));
+                        __m256 vTempY = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1));
+                        __m256 vTempZ = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2));
+                        __m256 vTempW = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3));
+
+                        vTempX = _mm256_mul_ps(vTempX, row0);
+                        vTempY = _mm256_mul_ps(vTempY, row1);
+                        vTempZ = _mm256_fmadd_ps(vTempZ, row2, vTempX);
+                        vTempW = _mm256_fmadd_ps(vTempW, row3, vTempY);
+                        vTempX = _mm256_add_ps(vTempZ, vTempW);
+
+                        _mm256_storeu_ps(reinterpret_cast<float*>(pOutputVector), vTempX);
+                        pOutputVector += sizeof(XMFLOAT4) * 2;
+
+                        i += 2;
+                    }
+                }
+            }
+            else
+            {
+                // Packed input, unpacked output
+                for (size_t j = 0; j < two; ++j)
+                {
+                    __m256 VV = _mm256_loadu_ps(reinterpret_cast<const float*>(pInputVector));
+                    pInputVector += sizeof(XMFLOAT4) * 2;
+
+                    __m256 vTempX = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0));
+                    __m256 vTempY = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1));
+                    __m256 vTempZ = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2));
+                    __m256 vTempW = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3));
+
+                    vTempX = _mm256_mul_ps(vTempX, row0);
+                    vTempY = _mm256_mul_ps(vTempY, row1);
+                    vTempZ = _mm256_fmadd_ps(vTempZ, row2, vTempX);
+                    vTempW = _mm256_fmadd_ps(vTempW, row3, vTempY);
+                    vTempX = _mm256_add_ps(vTempZ, vTempW);
+
+                    _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), _mm256_castps256_ps128(vTempX));
+                    pOutputVector += OutputStride;
+
+                    _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), _mm256_extractf128_ps(vTempX, 1));
+                    pOutputVector += OutputStride;
+                    i += 2;
+                }
+            }
+        }
+    }
+
+    if (i < VectorCount)
+    {
+        const XMVECTOR row0 = M.r[0];
+        const XMVECTOR row1 = M.r[1];
+        const XMVECTOR row2 = M.r[2];
+        const XMVECTOR row3 = M.r[3];
+
+        for (; i < VectorCount; i++)
+        {
+            __m128 V = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
+            pInputVector += InputStride;
+
+            XMVECTOR vTempX = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
+            XMVECTOR vTempY = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
+            XMVECTOR vTempZ = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
+            XMVECTOR vTempW = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
+
+            vTempX = _mm_mul_ps(vTempX, row0);
+            vTempY = _mm_mul_ps(vTempY, row1);
+            vTempZ = XM_FMADD_PS(vTempZ, row2, vTempX);
+            vTempW = XM_FMADD_PS(vTempW, row3, vTempY);
+            vTempX = _mm_add_ps(vTempZ, vTempW);
+
+            _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), vTempX);
+            pOutputVector += OutputStride;
+        }
+    }
+
+    XM_SFENCE();
+
+    return pOutputStream;
+#elif defined(_XM_SSE_INTRINSICS_)
+    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
+    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
+
+    const XMVECTOR row0 = M.r[0];
+    const XMVECTOR row1 = M.r[1];
+    const XMVECTOR row2 = M.r[2];
+    const XMVECTOR row3 = M.r[3];
+
+    if (!(reinterpret_cast<uintptr_t>(pOutputStream) & 0xF) && !(OutputStride & 0xF))
+    {
+        if (!(reinterpret_cast<uintptr_t>(pInputStream) & 0xF) && !(InputStride & 0xF))
+        {
+            // Aligned input, aligned output
+            for (size_t i = 0; i < VectorCount; i++)
+            {
+                __m128 V = _mm_load_ps(reinterpret_cast<const float*>(pInputVector));
+                pInputVector += InputStride;
+
+                XMVECTOR vTempX = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
+                XMVECTOR vTempY = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
+                XMVECTOR vTempZ = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
+                XMVECTOR vTempW = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
+
+                vTempX = _mm_mul_ps(vTempX, row0);
+                vTempY = _mm_mul_ps(vTempY, row1);
+                vTempZ = XM_FMADD_PS(vTempZ, row2, vTempX);
+                vTempW = XM_FMADD_PS(vTempW, row3, vTempY);
+                vTempX = _mm_add_ps(vTempZ, vTempW);
+
+                XM_STREAM_PS(reinterpret_cast<float*>(pOutputVector), vTempX);
+                pOutputVector += OutputStride;
+            }
+        }
+        else
+        {
+            // Unaligned input, aligned output
+            for (size_t i = 0; i < VectorCount; i++)
+            {
+                __m128 V = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
+                pInputVector += InputStride;
+
+                XMVECTOR vTempX = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
+                XMVECTOR vTempY = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
+                XMVECTOR vTempZ = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
+                XMVECTOR vTempW = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
+
+                vTempX = _mm_mul_ps(vTempX, row0);
+                vTempY = _mm_mul_ps(vTempY, row1);
+                vTempZ = XM_FMADD_PS(vTempZ, row2, vTempX);
+                vTempW = XM_FMADD_PS(vTempW, row3, vTempY);
+                vTempX = _mm_add_ps(vTempZ, vTempW);
+
+                XM_STREAM_PS(reinterpret_cast<float*>(pOutputVector), vTempX);
+                pOutputVector += OutputStride;
+            }
+        }
+    }
+    else
+    {
+        if (!(reinterpret_cast<uintptr_t>(pInputStream) & 0xF) && !(InputStride & 0xF))
+        {
+            // Aligned input, unaligned output
+            for (size_t i = 0; i < VectorCount; i++)
+            {
+                __m128 V = _mm_load_ps(reinterpret_cast<const float*>(pInputVector));
+                pInputVector += InputStride;
+
+                XMVECTOR vTempX = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
+                XMVECTOR vTempY = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
+                XMVECTOR vTempZ = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
+                XMVECTOR vTempW = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
+
+                vTempX = _mm_mul_ps(vTempX, row0);
+                vTempY = _mm_mul_ps(vTempY, row1);
+                vTempZ = XM_FMADD_PS(vTempZ, row2, vTempX);
+                vTempW = XM_FMADD_PS(vTempW, row3, vTempY);
+                vTempX = _mm_add_ps(vTempZ, vTempW);
+
+                _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), vTempX);
+                pOutputVector += OutputStride;
+            }
+        }
+        else
+        {
+            // Unaligned input, unaligned output
+            for (size_t i = 0; i < VectorCount; i++)
+            {
+                __m128 V = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
+                pInputVector += InputStride;
+
+                XMVECTOR vTempX = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
+                XMVECTOR vTempY = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
+                XMVECTOR vTempZ = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
+                XMVECTOR vTempW = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
+
+                vTempX = _mm_mul_ps(vTempX, row0);
+                vTempY = _mm_mul_ps(vTempY, row1);
+                vTempZ = XM_FMADD_PS(vTempZ, row2, vTempX);
+                vTempW = XM_FMADD_PS(vTempW, row3, vTempY);
+                vTempX = _mm_add_ps(vTempZ, vTempW);
+
+                _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), vTempX);
+                pOutputVector += OutputStride;
+            }
+        }
+    }
+
+    XM_SFENCE();
+
+    return pOutputStream;
+#endif
+}
+
+/****************************************************************************
+ *
+ * XMVECTOR operators
+ *
+ ****************************************************************************/
+
+#ifndef _XM_NO_XMVECTOR_OVERLOADS_
+
+    //------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV operator+ (FXMVECTOR V) noexcept
+{
+    return V;
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV operator- (FXMVECTOR V) noexcept
+{
+    return XMVectorNegate(V);
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR& XM_CALLCONV operator+=
+(
+    XMVECTOR& V1,
+    FXMVECTOR V2
+) noexcept
+{
+    V1 = XMVectorAdd(V1, V2);
+    return V1;
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR& XM_CALLCONV
operator-= +( + XMVECTOR& V1, + FXMVECTOR V2 +) noexcept +{ + V1 = XMVectorSubtract(V1, V2); + return V1; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR& XM_CALLCONV operator*= +( + XMVECTOR& V1, + FXMVECTOR V2 +) noexcept +{ + V1 = XMVectorMultiply(V1, V2); + return V1; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR& XM_CALLCONV operator/= +( + XMVECTOR& V1, + FXMVECTOR V2 +) noexcept +{ + V1 = XMVectorDivide(V1, V2); + return V1; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR& operator*= +( + XMVECTOR& V, + const float S +) noexcept +{ + V = XMVectorScale(V, S); + return V; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR& operator/= +( + XMVECTOR& V, + const float S +) noexcept +{ + XMVECTOR vS = XMVectorReplicate(S); + V = XMVectorDivide(V, vS); + return V; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV operator+ +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ + return XMVectorAdd(V1, V2); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV operator- +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ + return XMVectorSubtract(V1, V2); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV operator* +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ + return XMVectorMultiply(V1, V2); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV operator/ +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ + return XMVectorDivide(V1, V2); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV operator* +( + FXMVECTOR V, + const float S +) noexcept +{ + return XMVectorScale(V, S); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV operator/ +( + FXMVECTOR V, + const float S +) noexcept +{ + XMVECTOR vS = XMVectorReplicate(S); + return XMVectorDivide(V, vS); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV operator* +( + float S, + FXMVECTOR V +) noexcept +{ + return XMVectorScale(V, S); +} + +#endif /* !_XM_NO_XMVECTOR_OVERLOADS_ */ + +#if defined(_XM_NO_INTRINSICS_) +#undef XMISNAN +#undef XMISINF +#endif + +#if defined(_XM_SSE_INTRINSICS_) +#undef XM3UNPACK3INTO4 +#undef XM3PACK4INTO3 +#endif + diff --git a/src/thirdparty/DirectXMath-dec2022/Inc/DirectXPackedVector.h b/src/thirdparty/DirectXMath-dec2022/Inc/DirectXPackedVector.h new file mode 100644 index 000000000..1484b476a --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/Inc/DirectXPackedVector.h @@ -0,0 +1,1224 @@ +//------------------------------------------------------------------------------------- +// DirectXPackedVector.h -- SIMD C++ Math library +// +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
+// +// http://go.microsoft.com/fwlink/?LinkID=615560 +//------------------------------------------------------------------------------------- + +#pragma once + +#include "DirectXMath.h" + +namespace DirectX +{ + + namespace PackedVector + { +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable:4201 4365 4324 4996) + // C4201: nonstandard extension used + // C4365: Off by default noise + // C4324: alignment padding warnings + // C4996: deprecation warnings +#endif + +#ifdef __clang__ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wgnu-anonymous-struct" +#pragma clang diagnostic ignored "-Wnested-anon-types" +#endif + + //------------------------------------------------------------------------------ + // ARGB Color; 8-8-8-8 bit unsigned normalized integer components packed into + // a 32 bit integer. The normalized color is packed into 32 bits using 8 bit + // unsigned, normalized integers for the alpha, red, green, and blue components. + // The alpha component is stored in the most significant bits and the blue + // component in the least significant bits (A8R8G8B8): + // [32] aaaaaaaa rrrrrrrr gggggggg bbbbbbbb [0] + struct XMCOLOR + { + union + { + struct + { + uint8_t b; // Blue: 0/255 to 255/255 + uint8_t g; // Green: 0/255 to 255/255 + uint8_t r; // Red: 0/255 to 255/255 + uint8_t a; // Alpha: 0/255 to 255/255 + }; + uint32_t c; + }; + + XMCOLOR() = default; + + XMCOLOR(const XMCOLOR&) = default; + XMCOLOR& operator=(const XMCOLOR&) = default; + + XMCOLOR(XMCOLOR&&) = default; + XMCOLOR& operator=(XMCOLOR&&) = default; + + constexpr XMCOLOR(uint32_t Color) noexcept : c(Color) {} + XMCOLOR(float _r, float _g, float _b, float _a) noexcept; + explicit XMCOLOR(_In_reads_(4) const float* pArray) noexcept; + + operator uint32_t () const noexcept { return c; } + + XMCOLOR& operator= (const uint32_t Color) noexcept { c = Color; return *this; } + }; + + //------------------------------------------------------------------------------ + // 16 bit floating point number consisting of a sign bit, a 5 bit biased + // exponent, and a 10 bit mantissa + using HALF = uint16_t; + + //------------------------------------------------------------------------------ + // 2D Vector; 16 bit floating point components + struct XMHALF2 + { + union + { + struct + { + HALF x; + HALF y; + }; + uint32_t v; + }; + + XMHALF2() = default; + + XMHALF2(const XMHALF2&) = default; + XMHALF2& operator=(const XMHALF2&) = default; + + XMHALF2(XMHALF2&&) = default; + XMHALF2& operator=(XMHALF2&&) = default; + + explicit constexpr XMHALF2(uint32_t Packed) noexcept : v(Packed) {} + constexpr XMHALF2(HALF _x, HALF _y) noexcept : x(_x), y(_y) {} + explicit XMHALF2(_In_reads_(2) const HALF* pArray) noexcept : x(pArray[0]), y(pArray[1]) {} + XMHALF2(float _x, float _y) noexcept; + explicit XMHALF2(_In_reads_(2) const float* pArray) noexcept; + + XMHALF2& operator= (uint32_t Packed) noexcept { v = Packed; return *this; } + }; + + //------------------------------------------------------------------------------ + // 2D Vector; 16 bit signed normalized integer components + struct XMSHORTN2 + { + union + { + struct + { + int16_t x; + int16_t y; + }; + uint32_t v; + }; + + XMSHORTN2() = default; + + XMSHORTN2(const XMSHORTN2&) = default; + XMSHORTN2& operator=(const XMSHORTN2&) = default; + + XMSHORTN2(XMSHORTN2&&) = default; + XMSHORTN2& operator=(XMSHORTN2&&) = default; + + explicit constexpr XMSHORTN2(uint32_t Packed) noexcept : v(Packed) {} + constexpr XMSHORTN2(int16_t _x, int16_t _y) noexcept : x(_x), 
y(_y) {} + explicit XMSHORTN2(_In_reads_(2) const int16_t* pArray) noexcept : x(pArray[0]), y(pArray[1]) {} + XMSHORTN2(float _x, float _y) noexcept; + explicit XMSHORTN2(_In_reads_(2) const float* pArray) noexcept; + + XMSHORTN2& operator= (uint32_t Packed) noexcept { v = Packed; return *this; } + }; + + // 2D Vector; 16 bit signed integer components + struct XMSHORT2 + { + union + { + struct + { + int16_t x; + int16_t y; + }; + uint32_t v; + }; + + XMSHORT2() = default; + + XMSHORT2(const XMSHORT2&) = default; + XMSHORT2& operator=(const XMSHORT2&) = default; + + XMSHORT2(XMSHORT2&&) = default; + XMSHORT2& operator=(XMSHORT2&&) = default; + + explicit constexpr XMSHORT2(uint32_t Packed) noexcept : v(Packed) {} + constexpr XMSHORT2(int16_t _x, int16_t _y) noexcept : x(_x), y(_y) {} + explicit XMSHORT2(_In_reads_(2) const int16_t* pArray) noexcept : x(pArray[0]), y(pArray[1]) {} + XMSHORT2(float _x, float _y) noexcept; + explicit XMSHORT2(_In_reads_(2) const float* pArray) noexcept; + + XMSHORT2& operator= (uint32_t Packed) noexcept { v = Packed; return *this; } + }; + + // 2D Vector; 16 bit unsigned normalized integer components + struct XMUSHORTN2 + { + union + { + struct + { + uint16_t x; + uint16_t y; + }; + uint32_t v; + }; + + XMUSHORTN2() = default; + + XMUSHORTN2(const XMUSHORTN2&) = default; + XMUSHORTN2& operator=(const XMUSHORTN2&) = default; + + XMUSHORTN2(XMUSHORTN2&&) = default; + XMUSHORTN2& operator=(XMUSHORTN2&&) = default; + + explicit constexpr XMUSHORTN2(uint32_t Packed) noexcept : v(Packed) {} + constexpr XMUSHORTN2(uint16_t _x, uint16_t _y) noexcept : x(_x), y(_y) {} + explicit XMUSHORTN2(_In_reads_(2) const uint16_t* pArray) noexcept : x(pArray[0]), y(pArray[1]) {} + XMUSHORTN2(float _x, float _y) noexcept; + explicit XMUSHORTN2(_In_reads_(2) const float* pArray) noexcept; + + XMUSHORTN2& operator= (uint32_t Packed) noexcept { v = Packed; return *this; } + }; + + // 2D Vector; 16 bit unsigned integer components + struct XMUSHORT2 + { + union + { + struct + { + uint16_t x; + uint16_t y; + }; + uint32_t v; + }; + + XMUSHORT2() = default; + + XMUSHORT2(const XMUSHORT2&) = default; + XMUSHORT2& operator=(const XMUSHORT2&) = default; + + XMUSHORT2(XMUSHORT2&&) = default; + XMUSHORT2& operator=(XMUSHORT2&&) = default; + + explicit constexpr XMUSHORT2(uint32_t Packed) noexcept : v(Packed) {} + constexpr XMUSHORT2(uint16_t _x, uint16_t _y) noexcept : x(_x), y(_y) {} + explicit XMUSHORT2(_In_reads_(2) const uint16_t* pArray) noexcept : x(pArray[0]), y(pArray[1]) {} + XMUSHORT2(float _x, float _y) noexcept; + explicit XMUSHORT2(_In_reads_(2) const float* pArray) noexcept; + + XMUSHORT2& operator= (uint32_t Packed) noexcept { v = Packed; return *this; } + }; + + //------------------------------------------------------------------------------ + // 2D Vector; 8 bit signed normalized integer components + struct XMBYTEN2 + { + union + { + struct + { + int8_t x; + int8_t y; + }; + uint16_t v; + }; + + XMBYTEN2() = default; + + XMBYTEN2(const XMBYTEN2&) = default; + XMBYTEN2& operator=(const XMBYTEN2&) = default; + + XMBYTEN2(XMBYTEN2&&) = default; + XMBYTEN2& operator=(XMBYTEN2&&) = default; + + explicit constexpr XMBYTEN2(uint16_t Packed) noexcept : v(Packed) {} + constexpr XMBYTEN2(int8_t _x, int8_t _y) noexcept : x(_x), y(_y) {} + explicit XMBYTEN2(_In_reads_(2) const int8_t* pArray) noexcept : x(pArray[0]), y(pArray[1]) {} + XMBYTEN2(float _x, float _y) noexcept; + explicit XMBYTEN2(_In_reads_(2) const float* pArray) noexcept; + + XMBYTEN2& operator= (uint16_t Packed) noexcept { 
v = Packed; return *this; } + }; + + // 2D Vector; 8 bit signed integer components + struct XMBYTE2 + { + union + { + struct + { + int8_t x; + int8_t y; + }; + uint16_t v; + }; + + XMBYTE2() = default; + + XMBYTE2(const XMBYTE2&) = default; + XMBYTE2& operator=(const XMBYTE2&) = default; + + XMBYTE2(XMBYTE2&&) = default; + XMBYTE2& operator=(XMBYTE2&&) = default; + + explicit constexpr XMBYTE2(uint16_t Packed) noexcept : v(Packed) {} + constexpr XMBYTE2(int8_t _x, int8_t _y) noexcept : x(_x), y(_y) {} + explicit XMBYTE2(_In_reads_(2) const int8_t* pArray) noexcept : x(pArray[0]), y(pArray[1]) {} + XMBYTE2(float _x, float _y) noexcept; + explicit XMBYTE2(_In_reads_(2) const float* pArray) noexcept; + + XMBYTE2& operator= (uint16_t Packed) noexcept { v = Packed; return *this; } + }; + + // 2D Vector; 8 bit unsigned normalized integer components + struct XMUBYTEN2 + { + union + { + struct + { + uint8_t x; + uint8_t y; + }; + uint16_t v; + }; + + XMUBYTEN2() = default; + + XMUBYTEN2(const XMUBYTEN2&) = default; + XMUBYTEN2& operator=(const XMUBYTEN2&) = default; + + XMUBYTEN2(XMUBYTEN2&&) = default; + XMUBYTEN2& operator=(XMUBYTEN2&&) = default; + + explicit constexpr XMUBYTEN2(uint16_t Packed) noexcept : v(Packed) {} + constexpr XMUBYTEN2(uint8_t _x, uint8_t _y) noexcept : x(_x), y(_y) {} + explicit XMUBYTEN2(_In_reads_(2) const uint8_t* pArray) noexcept : x(pArray[0]), y(pArray[1]) {} + XMUBYTEN2(float _x, float _y) noexcept; + explicit XMUBYTEN2(_In_reads_(2) const float* pArray) noexcept; + + XMUBYTEN2& operator= (uint16_t Packed) noexcept { v = Packed; return *this; } + }; + + // 2D Vector; 8 bit unsigned integer components + struct XMUBYTE2 + { + union + { + struct + { + uint8_t x; + uint8_t y; + }; + uint16_t v; + }; + + XMUBYTE2() = default; + + XMUBYTE2(const XMUBYTE2&) = default; + XMUBYTE2& operator=(const XMUBYTE2&) = default; + + XMUBYTE2(XMUBYTE2&&) = default; + XMUBYTE2& operator=(XMUBYTE2&&) = default; + + explicit constexpr XMUBYTE2(uint16_t Packed) noexcept : v(Packed) {} + constexpr XMUBYTE2(uint8_t _x, uint8_t _y) noexcept : x(_x), y(_y) {} + explicit XMUBYTE2(_In_reads_(2) const uint8_t* pArray) noexcept : x(pArray[0]), y(pArray[1]) {} + XMUBYTE2(float _x, float _y) noexcept; + explicit XMUBYTE2(_In_reads_(2) const float* pArray) noexcept; + + XMUBYTE2& operator= (uint16_t Packed) noexcept { v = Packed; return *this; } + }; + + //------------------------------------------------------------------------------ + // 3D vector: 5/6/5 unsigned integer components + struct XMU565 + { + union + { + struct + { + uint16_t x : 5; // 0 to 31 + uint16_t y : 6; // 0 to 63 + uint16_t z : 5; // 0 to 31 + }; + uint16_t v; + }; + + XMU565() = default; + + XMU565(const XMU565&) = default; + XMU565& operator=(const XMU565&) = default; + + XMU565(XMU565&&) = default; + XMU565& operator=(XMU565&&) = default; + + explicit constexpr XMU565(uint16_t Packed) noexcept : v(Packed) {} + constexpr XMU565(uint8_t _x, uint8_t _y, uint8_t _z) noexcept : x(_x), y(_y), z(_z) {} + explicit XMU565(_In_reads_(3) const uint8_t* pArray) noexcept : x(pArray[0]), y(pArray[1]), z(pArray[2]) {} + XMU565(float _x, float _y, float _z) noexcept; + explicit XMU565(_In_reads_(3) const float* pArray) noexcept; + + operator uint16_t () const noexcept { return v; } + + XMU565& operator= (uint16_t Packed) noexcept { v = Packed; return *this; } + }; + + //------------------------------------------------------------------------------ + // 3D vector: 11/11/10 floating-point components + // The 3D vector is packed into 32 bits as 
follows: a 5-bit biased exponent + // and 6-bit mantissa for x component, a 5-bit biased exponent and + // 6-bit mantissa for y component, a 5-bit biased exponent and a 5-bit + // mantissa for z. The z component is stored in the most significant bits + // and the x component in the least significant bits. No sign bits so + // all partial-precision numbers are positive. + // (Z10Y11X11): [32] ZZZZZzzz zzzYYYYY yyyyyyXX XXXxxxxx [0] + struct XMFLOAT3PK + { + union + { + struct + { + uint32_t xm : 6; // x-mantissa + uint32_t xe : 5; // x-exponent + uint32_t ym : 6; // y-mantissa + uint32_t ye : 5; // y-exponent + uint32_t zm : 5; // z-mantissa + uint32_t ze : 5; // z-exponent + }; + uint32_t v; + }; + + XMFLOAT3PK() = default; + + XMFLOAT3PK(const XMFLOAT3PK&) = default; + XMFLOAT3PK& operator=(const XMFLOAT3PK&) = default; + + XMFLOAT3PK(XMFLOAT3PK&&) = default; + XMFLOAT3PK& operator=(XMFLOAT3PK&&) = default; + + explicit constexpr XMFLOAT3PK(uint32_t Packed) noexcept : v(Packed) {} + XMFLOAT3PK(float _x, float _y, float _z) noexcept; + explicit XMFLOAT3PK(_In_reads_(3) const float* pArray) noexcept; + + operator uint32_t () const noexcept { return v; } + + XMFLOAT3PK& operator= (uint32_t Packed) noexcept { v = Packed; return *this; } + }; + + //------------------------------------------------------------------------------ + // 3D vector: 9/9/9 floating-point components with shared 5-bit exponent + // The 3D vector is packed into 32 bits as follows: a 5-bit biased exponent + // with 9-bit mantissa for the x, y, and z component. The shared exponent + // is stored in the most significant bits and the x component mantissa is in + // the least significant bits. No sign bits so all partial-precision numbers + // are positive. + // (E5Z9Y9X9): [32] EEEEEzzz zzzzzzyy yyyyyyyx xxxxxxxx [0] + struct XMFLOAT3SE + { + union + { + struct + { + uint32_t xm : 9; // x-mantissa + uint32_t ym : 9; // y-mantissa + uint32_t zm : 9; // z-mantissa + uint32_t e : 5; // shared exponent + }; + uint32_t v; + }; + + XMFLOAT3SE() = default; + + XMFLOAT3SE(const XMFLOAT3SE&) = default; + XMFLOAT3SE& operator=(const XMFLOAT3SE&) = default; + + XMFLOAT3SE(XMFLOAT3SE&&) = default; + XMFLOAT3SE& operator=(XMFLOAT3SE&&) = default; + + explicit constexpr XMFLOAT3SE(uint32_t Packed) noexcept : v(Packed) {} + XMFLOAT3SE(float _x, float _y, float _z) noexcept; + explicit XMFLOAT3SE(_In_reads_(3) const float* pArray) noexcept; + + operator uint32_t () const noexcept { return v; } + + XMFLOAT3SE& operator= (uint32_t Packed) noexcept { v = Packed; return *this; } + }; + + //------------------------------------------------------------------------------ + // 4D Vector; 16 bit floating point components + struct XMHALF4 + { + union + { + struct + { + HALF x; + HALF y; + HALF z; + HALF w; + }; + uint64_t v; + }; + + XMHALF4() = default; + + XMHALF4(const XMHALF4&) = default; + XMHALF4& operator=(const XMHALF4&) = default; + + XMHALF4(XMHALF4&&) = default; + XMHALF4& operator=(XMHALF4&&) = default; + + explicit constexpr XMHALF4(uint64_t Packed) noexcept : v(Packed) {} + constexpr XMHALF4(HALF _x, HALF _y, HALF _z, HALF _w) noexcept : x(_x), y(_y), z(_z), w(_w) {} + explicit XMHALF4(_In_reads_(4) const HALF* pArray) noexcept : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + XMHALF4(float _x, float _y, float _z, float _w) noexcept; + explicit XMHALF4(_In_reads_(4) const float* pArray) noexcept; + + XMHALF4& operator= (uint64_t Packed) noexcept { v = Packed; return *this; } + }; + + 
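+        // Illustrative note (an editorial example, not additional library API):
+        // HALF above is a standard IEEE-754 binary16 value with 1 sign bit,
+        // 5 exponent bits (bias 15) and 10 mantissa bits. For example, 1.0f
+        // packs to 0x3C00, 0.5f to 0x3800 and -2.0f to 0xC000. The
+        // XMConvertFloatToHalf / XMConvertHalfToFloat helpers declared later
+        // in this header perform the scalar conversions, e.g.
+        //     HALF h = XMConvertFloatToHalf(1.0f);   // 0x3C00
+        //     float f = XMConvertHalfToFloat(h);     // 1.0f
+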
//------------------------------------------------------------------------------ + // 4D Vector; 16 bit signed normalized integer components + struct XMSHORTN4 + { + union + { + struct + { + int16_t x; + int16_t y; + int16_t z; + int16_t w; + }; + uint64_t v; + }; + + XMSHORTN4() = default; + + XMSHORTN4(const XMSHORTN4&) = default; + XMSHORTN4& operator=(const XMSHORTN4&) = default; + + XMSHORTN4(XMSHORTN4&&) = default; + XMSHORTN4& operator=(XMSHORTN4&&) = default; + + explicit constexpr XMSHORTN4(uint64_t Packed) noexcept : v(Packed) {} + constexpr XMSHORTN4(int16_t _x, int16_t _y, int16_t _z, int16_t _w) noexcept : x(_x), y(_y), z(_z), w(_w) {} + explicit XMSHORTN4(_In_reads_(4) const int16_t* pArray) noexcept : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + XMSHORTN4(float _x, float _y, float _z, float _w) noexcept; + explicit XMSHORTN4(_In_reads_(4) const float* pArray) noexcept; + + XMSHORTN4& operator= (uint64_t Packed) noexcept { v = Packed; return *this; } + }; + + // 4D Vector; 16 bit signed integer components + struct XMSHORT4 + { + union + { + struct + { + int16_t x; + int16_t y; + int16_t z; + int16_t w; + }; + uint64_t v; + }; + + XMSHORT4() = default; + + XMSHORT4(const XMSHORT4&) = default; + XMSHORT4& operator=(const XMSHORT4&) = default; + + XMSHORT4(XMSHORT4&&) = default; + XMSHORT4& operator=(XMSHORT4&&) = default; + + explicit constexpr XMSHORT4(uint64_t Packed) noexcept : v(Packed) {} + constexpr XMSHORT4(int16_t _x, int16_t _y, int16_t _z, int16_t _w) noexcept : x(_x), y(_y), z(_z), w(_w) {} + explicit XMSHORT4(_In_reads_(4) const int16_t* pArray) noexcept : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + XMSHORT4(float _x, float _y, float _z, float _w) noexcept; + explicit XMSHORT4(_In_reads_(4) const float* pArray) noexcept; + + XMSHORT4& operator= (uint64_t Packed) noexcept { v = Packed; return *this; } + }; + + // 4D Vector; 16 bit unsigned normalized integer components + struct XMUSHORTN4 + { + union + { + struct + { + uint16_t x; + uint16_t y; + uint16_t z; + uint16_t w; + }; + uint64_t v; + }; + + XMUSHORTN4() = default; + + XMUSHORTN4(const XMUSHORTN4&) = default; + XMUSHORTN4& operator=(const XMUSHORTN4&) = default; + + XMUSHORTN4(XMUSHORTN4&&) = default; + XMUSHORTN4& operator=(XMUSHORTN4&&) = default; + + explicit constexpr XMUSHORTN4(uint64_t Packed) noexcept : v(Packed) {} + constexpr XMUSHORTN4(uint16_t _x, uint16_t _y, uint16_t _z, uint16_t _w) noexcept : x(_x), y(_y), z(_z), w(_w) {} + explicit XMUSHORTN4(_In_reads_(4) const uint16_t* pArray) noexcept : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + XMUSHORTN4(float _x, float _y, float _z, float _w) noexcept; + explicit XMUSHORTN4(_In_reads_(4) const float* pArray) noexcept; + + XMUSHORTN4& operator= (uint64_t Packed) noexcept { v = Packed; return *this; } + }; + + // 4D Vector; 16 bit unsigned integer components + struct XMUSHORT4 + { + union + { + struct + { + uint16_t x; + uint16_t y; + uint16_t z; + uint16_t w; + }; + uint64_t v; + }; + + XMUSHORT4() = default; + + XMUSHORT4(const XMUSHORT4&) = default; + XMUSHORT4& operator=(const XMUSHORT4&) = default; + + XMUSHORT4(XMUSHORT4&&) = default; + XMUSHORT4& operator=(XMUSHORT4&&) = default; + + explicit constexpr XMUSHORT4(uint64_t Packed) noexcept : v(Packed) {} + constexpr XMUSHORT4(uint16_t _x, uint16_t _y, uint16_t _z, uint16_t _w) noexcept : x(_x), y(_y), z(_z), w(_w) {} + explicit XMUSHORT4(_In_reads_(4) const uint16_t* pArray) noexcept : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + 
XMUSHORT4(float _x, float _y, float _z, float _w) noexcept; + explicit XMUSHORT4(_In_reads_(4) const float* pArray) noexcept; + + XMUSHORT4& operator= (uint32_t Packed) noexcept { v = Packed; return *this; } + }; + + //------------------------------------------------------------------------------ + // 4D Vector; 10-10-10-2 bit normalized components packed into a 32 bit integer + // The normalized 4D Vector is packed into 32 bits as follows: a 2 bit unsigned, + // normalized integer for the w component and 10 bit signed, normalized + // integers for the z, y, and x components. The w component is stored in the + // most significant bits and the x component in the least significant bits + // (W2Z10Y10X10): [32] wwzzzzzz zzzzyyyy yyyyyyxx xxxxxxxx [0] + struct XMXDECN4 + { + union + { + struct + { + int32_t x : 10; // -511/511 to 511/511 + int32_t y : 10; // -511/511 to 511/511 + int32_t z : 10; // -511/511 to 511/511 + uint32_t w : 2; // 0/3 to 3/3 + }; + uint32_t v; + }; + + XMXDECN4() = default; + + XMXDECN4(const XMXDECN4&) = default; + XMXDECN4& operator=(const XMXDECN4&) = default; + + XMXDECN4(XMXDECN4&&) = default; + XMXDECN4& operator=(XMXDECN4&&) = default; + + explicit constexpr XMXDECN4(uint32_t Packed) : v(Packed) {} + XMXDECN4(float _x, float _y, float _z, float _w) noexcept; + explicit XMXDECN4(_In_reads_(4) const float* pArray) noexcept; + + operator uint32_t () const noexcept { return v; } + + XMXDECN4& operator= (uint32_t Packed) noexcept { v = Packed; return *this; } + }; + + // 4D Vector; 10-10-10-2 bit components packed into a 32 bit integer + // The normalized 4D Vector is packed into 32 bits as follows: a 2 bit unsigned + // integer for the w component and 10 bit signed integers for the + // z, y, and x components. The w component is stored in the + // most significant bits and the x component in the least significant bits + // (W2Z10Y10X10): [32] wwzzzzzz zzzzyyyy yyyyyyxx xxxxxxxx [0] + struct XM_DEPRECATED XMXDEC4 + { + union + { + struct + { + int32_t x : 10; // -511 to 511 + int32_t y : 10; // -511 to 511 + int32_t z : 10; // -511 to 511 + uint32_t w : 2; // 0 to 3 + }; + uint32_t v; + }; + + XMXDEC4() = default; + + XMXDEC4(const XMXDEC4&) = default; + XMXDEC4& operator=(const XMXDEC4&) = default; + + XMXDEC4(XMXDEC4&&) = default; + XMXDEC4& operator=(XMXDEC4&&) = default; + + explicit constexpr XMXDEC4(uint32_t Packed) noexcept : v(Packed) {} + XMXDEC4(float _x, float _y, float _z, float _w) noexcept; + explicit XMXDEC4(_In_reads_(4) const float* pArray) noexcept; + + operator uint32_t () const noexcept { return v; } + + XMXDEC4& operator= (uint32_t Packed) noexcept { v = Packed; return *this; } + }; + + // 4D Vector; 10-10-10-2 bit normalized components packed into a 32 bit integer + // The normalized 4D Vector is packed into 32 bits as follows: a 2 bit signed, + // normalized integer for the w component and 10 bit signed, normalized + // integers for the z, y, and x components. 
The w component is stored in the + // most significant bits and the x component in the least significant bits + // (W2Z10Y10X10): [32] wwzzzzzz zzzzyyyy yyyyyyxx xxxxxxxx [0] + struct XM_DEPRECATED XMDECN4 + { + union + { + struct + { + int32_t x : 10; // -511/511 to 511/511 + int32_t y : 10; // -511/511 to 511/511 + int32_t z : 10; // -511/511 to 511/511 + int32_t w : 2; // -1/1 to 1/1 + }; + uint32_t v; + }; + + XMDECN4() = default; + + XMDECN4(const XMDECN4&) = default; + XMDECN4& operator=(const XMDECN4&) = default; + + XMDECN4(XMDECN4&&) = default; + XMDECN4& operator=(XMDECN4&&) = default; + + explicit constexpr XMDECN4(uint32_t Packed) noexcept : v(Packed) {} + XMDECN4(float _x, float _y, float _z, float _w) noexcept; + explicit XMDECN4(_In_reads_(4) const float* pArray) noexcept; + + operator uint32_t () const noexcept { return v; } + + XMDECN4& operator= (uint32_t Packed) noexcept { v = Packed; return *this; } + }; + + // 4D Vector; 10-10-10-2 bit components packed into a 32 bit integer + // The 4D Vector is packed into 32 bits as follows: a 2 bit signed, + // integer for the w component and 10 bit signed integers for the + // z, y, and x components. The w component is stored in the + // most significant bits and the x component in the least significant bits + // (W2Z10Y10X10): [32] wwzzzzzz zzzzyyyy yyyyyyxx xxxxxxxx [0] + struct XM_DEPRECATED XMDEC4 + { + union + { + struct + { + int32_t x : 10; // -511 to 511 + int32_t y : 10; // -511 to 511 + int32_t z : 10; // -511 to 511 + int32_t w : 2; // -1 to 1 + }; + uint32_t v; + }; + + XMDEC4() = default; + + XMDEC4(const XMDEC4&) = default; + XMDEC4& operator=(const XMDEC4&) = default; + + XMDEC4(XMDEC4&&) = default; + XMDEC4& operator=(XMDEC4&&) = default; + + explicit constexpr XMDEC4(uint32_t Packed) noexcept : v(Packed) {} + XMDEC4(float _x, float _y, float _z, float _w) noexcept; + explicit XMDEC4(_In_reads_(4) const float* pArray) noexcept; + + operator uint32_t () const noexcept { return v; } + + XMDEC4& operator= (uint32_t Packed) noexcept { v = Packed; return *this; } + }; + + // 4D Vector; 10-10-10-2 bit normalized components packed into a 32 bit integer + // The normalized 4D Vector is packed into 32 bits as follows: a 2 bit unsigned, + // normalized integer for the w component and 10 bit unsigned, normalized + // integers for the z, y, and x components. The w component is stored in the + // most significant bits and the x component in the least significant bits + // (W2Z10Y10X10): [32] wwzzzzzz zzzzyyyy yyyyyyxx xxxxxxxx [0] + struct XMUDECN4 + { + union + { + struct + { + uint32_t x : 10; // 0/1023 to 1023/1023 + uint32_t y : 10; // 0/1023 to 1023/1023 + uint32_t z : 10; // 0/1023 to 1023/1023 + uint32_t w : 2; // 0/3 to 3/3 + }; + uint32_t v; + }; + + XMUDECN4() = default; + + XMUDECN4(const XMUDECN4&) = default; + XMUDECN4& operator=(const XMUDECN4&) = default; + + XMUDECN4(XMUDECN4&&) = default; + XMUDECN4& operator=(XMUDECN4&&) = default; + + explicit constexpr XMUDECN4(uint32_t Packed) noexcept : v(Packed) {} + XMUDECN4(float _x, float _y, float _z, float _w) noexcept; + explicit XMUDECN4(_In_reads_(4) const float* pArray) noexcept; + + operator uint32_t () const noexcept { return v; } + + XMUDECN4& operator= (uint32_t Packed) noexcept { v = Packed; return *this; } + }; + + // 4D Vector; 10-10-10-2 bit components packed into a 32 bit integer + // The 4D Vector is packed into 32 bits as follows: a 2 bit unsigned, + // integer for the w component and 10 bit unsigned integers + // for the z, y, and x components. 
The w component is stored in the + // most significant bits and the x component in the least significant bits + // (W2Z10Y10X10): [32] wwzzzzzz zzzzyyyy yyyyyyxx xxxxxxxx [0] + struct XMUDEC4 + { + union + { + struct + { + uint32_t x : 10; // 0 to 1023 + uint32_t y : 10; // 0 to 1023 + uint32_t z : 10; // 0 to 1023 + uint32_t w : 2; // 0 to 3 + }; + uint32_t v; + }; + + XMUDEC4() = default; + + XMUDEC4(const XMUDEC4&) = default; + XMUDEC4& operator=(const XMUDEC4&) = default; + + XMUDEC4(XMUDEC4&&) = default; + XMUDEC4& operator=(XMUDEC4&&) = default; + + explicit constexpr XMUDEC4(uint32_t Packed) noexcept : v(Packed) {} + XMUDEC4(float _x, float _y, float _z, float _w) noexcept; + explicit XMUDEC4(_In_reads_(4) const float* pArray) noexcept; + + operator uint32_t () const noexcept { return v; } + + XMUDEC4& operator= (uint32_t Packed) noexcept { v = Packed; return *this; } + }; + + //------------------------------------------------------------------------------ + // 4D Vector; 8 bit signed normalized integer components + struct XMBYTEN4 + { + union + { + struct + { + int8_t x; + int8_t y; + int8_t z; + int8_t w; + }; + uint32_t v; + }; + + XMBYTEN4() = default; + + XMBYTEN4(const XMBYTEN4&) = default; + XMBYTEN4& operator=(const XMBYTEN4&) = default; + + XMBYTEN4(XMBYTEN4&&) = default; + XMBYTEN4& operator=(XMBYTEN4&&) = default; + + constexpr XMBYTEN4(int8_t _x, int8_t _y, int8_t _z, int8_t _w) noexcept : x(_x), y(_y), z(_z), w(_w) {} + explicit constexpr XMBYTEN4(uint32_t Packed) noexcept : v(Packed) {} + explicit XMBYTEN4(_In_reads_(4) const int8_t* pArray) noexcept : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + XMBYTEN4(float _x, float _y, float _z, float _w) noexcept; + explicit XMBYTEN4(_In_reads_(4) const float* pArray) noexcept; + + XMBYTEN4& operator= (uint32_t Packed) noexcept { v = Packed; return *this; } + }; + + // 4D Vector; 8 bit signed integer components + struct XMBYTE4 + { + union + { + struct + { + int8_t x; + int8_t y; + int8_t z; + int8_t w; + }; + uint32_t v; + }; + + XMBYTE4() = default; + + XMBYTE4(const XMBYTE4&) = default; + XMBYTE4& operator=(const XMBYTE4&) = default; + + XMBYTE4(XMBYTE4&&) = default; + XMBYTE4& operator=(XMBYTE4&&) = default; + + constexpr XMBYTE4(int8_t _x, int8_t _y, int8_t _z, int8_t _w) noexcept : x(_x), y(_y), z(_z), w(_w) {} + explicit constexpr XMBYTE4(uint32_t Packed) noexcept : v(Packed) {} + explicit XMBYTE4(_In_reads_(4) const int8_t* pArray) noexcept : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + XMBYTE4(float _x, float _y, float _z, float _w) noexcept; + explicit XMBYTE4(_In_reads_(4) const float* pArray) noexcept; + + XMBYTE4& operator= (uint32_t Packed) noexcept { v = Packed; return *this; } + }; + + // 4D Vector; 8 bit unsigned normalized integer components + struct XMUBYTEN4 + { + union + { + struct + { + uint8_t x; + uint8_t y; + uint8_t z; + uint8_t w; + }; + uint32_t v; + }; + + XMUBYTEN4() = default; + + XMUBYTEN4(const XMUBYTEN4&) = default; + XMUBYTEN4& operator=(const XMUBYTEN4&) = default; + + XMUBYTEN4(XMUBYTEN4&&) = default; + XMUBYTEN4& operator=(XMUBYTEN4&&) = default; + + constexpr XMUBYTEN4(uint8_t _x, uint8_t _y, uint8_t _z, uint8_t _w) noexcept : x(_x), y(_y), z(_z), w(_w) {} + explicit constexpr XMUBYTEN4(uint32_t Packed) noexcept : v(Packed) {} + explicit XMUBYTEN4(_In_reads_(4) const uint8_t* pArray) noexcept : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + XMUBYTEN4(float _x, float _y, float _z, float _w) noexcept; + explicit XMUBYTEN4(_In_reads_(4) const float* 
pArray) noexcept; + + XMUBYTEN4& operator= (uint32_t Packed) noexcept { v = Packed; return *this; } + }; + + // 4D Vector; 8 bit unsigned integer components + struct XMUBYTE4 + { + union + { + struct + { + uint8_t x; + uint8_t y; + uint8_t z; + uint8_t w; + }; + uint32_t v; + }; + + XMUBYTE4() = default; + + XMUBYTE4(const XMUBYTE4&) = default; + XMUBYTE4& operator=(const XMUBYTE4&) = default; + + XMUBYTE4(XMUBYTE4&&) = default; + XMUBYTE4& operator=(XMUBYTE4&&) = default; + + constexpr XMUBYTE4(uint8_t _x, uint8_t _y, uint8_t _z, uint8_t _w) noexcept : x(_x), y(_y), z(_z), w(_w) {} + explicit constexpr XMUBYTE4(uint32_t Packed) noexcept : v(Packed) {} + explicit XMUBYTE4(_In_reads_(4) const uint8_t* pArray) noexcept : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + XMUBYTE4(float _x, float _y, float _z, float _w) noexcept; + explicit XMUBYTE4(_In_reads_(4) const float* pArray) noexcept; + + XMUBYTE4& operator= (uint32_t Packed) noexcept { v = Packed; return *this; } + }; + + //------------------------------------------------------------------------------ + // 4D vector; 4 bit unsigned integer components + struct XMUNIBBLE4 + { + union + { + struct + { + uint16_t x : 4; // 0 to 15 + uint16_t y : 4; // 0 to 15 + uint16_t z : 4; // 0 to 15 + uint16_t w : 4; // 0 to 15 + }; + uint16_t v; + }; + + XMUNIBBLE4() = default; + + XMUNIBBLE4(const XMUNIBBLE4&) = default; + XMUNIBBLE4& operator=(const XMUNIBBLE4&) = default; + + XMUNIBBLE4(XMUNIBBLE4&&) = default; + XMUNIBBLE4& operator=(XMUNIBBLE4&&) = default; + + explicit constexpr XMUNIBBLE4(uint16_t Packed) noexcept : v(Packed) {} + constexpr XMUNIBBLE4(uint8_t _x, uint8_t _y, uint8_t _z, uint8_t _w) noexcept : x(_x), y(_y), z(_z), w(_w) {} + explicit XMUNIBBLE4(_In_reads_(4) const uint8_t* pArray) noexcept : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + XMUNIBBLE4(float _x, float _y, float _z, float _w) noexcept; + explicit XMUNIBBLE4(_In_reads_(4) const float* pArray) noexcept; + + operator uint16_t () const noexcept { return v; } + + XMUNIBBLE4& operator= (uint16_t Packed) noexcept { v = Packed; return *this; } + }; + + //------------------------------------------------------------------------------ + // 4D vector: 5/5/5/1 unsigned integer components + struct XMU555 + { + union + { + struct + { + uint16_t x : 5; // 0 to 31 + uint16_t y : 5; // 0 to 31 + uint16_t z : 5; // 0 to 31 + uint16_t w : 1; // 0 or 1 + }; + uint16_t v; + }; + + XMU555() = default; + + XMU555(const XMU555&) = default; + XMU555& operator=(const XMU555&) = default; + + XMU555(XMU555&&) = default; + XMU555& operator=(XMU555&&) = default; + + explicit constexpr XMU555(uint16_t Packed) noexcept : v(Packed) {} + constexpr XMU555(uint8_t _x, uint8_t _y, uint8_t _z, bool _w) noexcept : x(_x), y(_y), z(_z), w(_w ? 0x1 : 0) {} + XMU555(_In_reads_(3) const uint8_t* pArray, _In_ bool _w) noexcept : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(_w ? 
0x1 : 0) {} + XMU555(float _x, float _y, float _z, bool _w) noexcept; + XMU555(_In_reads_(3) const float* pArray, _In_ bool _w) noexcept; + + operator uint16_t () const noexcept { return v; } + + XMU555& operator= (uint16_t Packed) noexcept { v = Packed; return *this; } + }; + +#ifdef __clang__ +#pragma clang diagnostic pop +#endif +#ifdef _MSC_VER +#pragma warning(pop) +#endif + + /**************************************************************************** + * + * Data conversion operations + * + ****************************************************************************/ + + float XMConvertHalfToFloat(HALF Value) noexcept; + float* XMConvertHalfToFloatStream(_Out_writes_bytes_(sizeof(float) + OutputStride * (HalfCount - 1)) float* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(HALF) + InputStride * (HalfCount - 1)) const HALF* pInputStream, + _In_ size_t InputStride, _In_ size_t HalfCount) noexcept; + HALF XMConvertFloatToHalf(float Value) noexcept; + HALF* XMConvertFloatToHalfStream(_Out_writes_bytes_(sizeof(HALF) + OutputStride * (FloatCount - 1)) HALF* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(float) + InputStride * (FloatCount - 1)) const float* pInputStream, + _In_ size_t InputStride, _In_ size_t FloatCount) noexcept; + + /**************************************************************************** + * + * Load operations + * + ****************************************************************************/ + + XMVECTOR XM_CALLCONV XMLoadColor(_In_ const XMCOLOR* pSource) noexcept; + + XMVECTOR XM_CALLCONV XMLoadHalf2(_In_ const XMHALF2* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadShortN2(_In_ const XMSHORTN2* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadShort2(_In_ const XMSHORT2* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadUShortN2(_In_ const XMUSHORTN2* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadUShort2(_In_ const XMUSHORT2* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadByteN2(_In_ const XMBYTEN2* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadByte2(_In_ const XMBYTE2* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadUByteN2(_In_ const XMUBYTEN2* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadUByte2(_In_ const XMUBYTE2* pSource) noexcept; + + XMVECTOR XM_CALLCONV XMLoadU565(_In_ const XMU565* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadFloat3PK(_In_ const XMFLOAT3PK* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadFloat3SE(_In_ const XMFLOAT3SE* pSource) noexcept; + + XMVECTOR XM_CALLCONV XMLoadHalf4(_In_ const XMHALF4* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadShortN4(_In_ const XMSHORTN4* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadShort4(_In_ const XMSHORT4* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadUShortN4(_In_ const XMUSHORTN4* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadUShort4(_In_ const XMUSHORT4* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadXDecN4(_In_ const XMXDECN4* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadUDecN4(_In_ const XMUDECN4* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadUDecN4_XR(_In_ const XMUDECN4* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadUDec4(_In_ const XMUDEC4* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadByteN4(_In_ const XMBYTEN4* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadByte4(_In_ const XMBYTE4* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadUByteN4(_In_ const XMUBYTEN4* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadUByte4(_In_ const XMUBYTE4* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadUNibble4(_In_ const XMUNIBBLE4* pSource) 
noexcept; + XMVECTOR XM_CALLCONV XMLoadU555(_In_ const XMU555* pSource) noexcept; + +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4996) + // C4996: ignore deprecation warning +#endif + +#ifdef __GNUC__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#endif + + XMVECTOR XM_DEPRECATED XM_CALLCONV XMLoadDecN4(_In_ const XMDECN4* pSource) noexcept; + XMVECTOR XM_DEPRECATED XM_CALLCONV XMLoadDec4(_In_ const XMDEC4* pSource) noexcept; + XMVECTOR XM_DEPRECATED XM_CALLCONV XMLoadXDec4(_In_ const XMXDEC4* pSource) noexcept; + +#ifdef __GNUC__ +#pragma GCC diagnostic pop +#endif +#ifdef _MSC_VER +#pragma warning(pop) +#endif + + /**************************************************************************** + * + * Store operations + * + ****************************************************************************/ + + void XM_CALLCONV XMStoreColor(_Out_ XMCOLOR* pDestination, _In_ FXMVECTOR V) noexcept; + + void XM_CALLCONV XMStoreHalf2(_Out_ XMHALF2* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreShortN2(_Out_ XMSHORTN2* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreShort2(_Out_ XMSHORT2* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreUShortN2(_Out_ XMUSHORTN2* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreUShort2(_Out_ XMUSHORT2* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreByteN2(_Out_ XMBYTEN2* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreByte2(_Out_ XMBYTE2* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreUByteN2(_Out_ XMUBYTEN2* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreUByte2(_Out_ XMUBYTE2* pDestination, _In_ FXMVECTOR V) noexcept; + + void XM_CALLCONV XMStoreU565(_Out_ XMU565* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreFloat3PK(_Out_ XMFLOAT3PK* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreFloat3SE(_Out_ XMFLOAT3SE* pDestination, _In_ FXMVECTOR V) noexcept; + + void XM_CALLCONV XMStoreHalf4(_Out_ XMHALF4* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreShortN4(_Out_ XMSHORTN4* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreShort4(_Out_ XMSHORT4* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreUShortN4(_Out_ XMUSHORTN4* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreUShort4(_Out_ XMUSHORT4* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreXDecN4(_Out_ XMXDECN4* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreUDecN4(_Out_ XMUDECN4* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreUDecN4_XR(_Out_ XMUDECN4* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreUDec4(_Out_ XMUDEC4* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreByteN4(_Out_ XMBYTEN4* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreByte4(_Out_ XMBYTE4* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreUByteN4(_Out_ XMUBYTEN4* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreUByte4(_Out_ XMUBYTE4* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreUNibble4(_Out_ XMUNIBBLE4* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreU555(_Out_ XMU555* pDestination, _In_ FXMVECTOR V) noexcept; + +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4996) + // C4996: ignore deprecation 
warning +#endif + +#ifdef __GNUC__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#endif + + void XM_DEPRECATED XM_CALLCONV XMStoreDecN4(_Out_ XMDECN4* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_DEPRECATED XM_CALLCONV XMStoreDec4(_Out_ XMDEC4* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_DEPRECATED XM_CALLCONV XMStoreXDec4(_Out_ XMXDEC4* pDestination, _In_ FXMVECTOR V) noexcept; + +#ifdef __GNUC__ +#pragma GCC diagnostic pop +#endif +#ifdef _MSC_VER +#pragma warning(pop) +#endif + + /**************************************************************************** + * + * Implementation + * + ****************************************************************************/ +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable:4068 4214 4204 4365 4616 6001 6101) + // C4068/4616: ignore unknown pragmas + // C4214/4204: nonstandard extension used + // C4365: Off by default noise + // C6001/6101: False positives +#endif + +#ifdef _PREFAST_ +#pragma prefast(push) +#pragma prefast(disable : 25000, "FXMVECTOR is 16 bytes") +#pragma prefast(disable : 26495, "Union initialization confuses /analyze") +#endif + +#include "DirectXPackedVector.inl" + +#ifdef _PREFAST_ +#pragma prefast(pop) +#endif +#ifdef _MSC_VER +#pragma warning(pop) +#endif + } // namespace PackedVector + +} // namespace DirectX + diff --git a/src/thirdparty/DirectXMath-dec2022/Inc/DirectXPackedVector.inl b/src/thirdparty/DirectXMath-dec2022/Inc/DirectXPackedVector.inl new file mode 100644 index 000000000..5f7e5d775 --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/Inc/DirectXPackedVector.inl @@ -0,0 +1,4459 @@ +//------------------------------------------------------------------------------------- +// DirectXPackedVector.inl -- SIMD C++ Math library +// +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
+// +// http://go.microsoft.com/fwlink/?LinkID=615560 +//------------------------------------------------------------------------------------- + +#pragma once + +/**************************************************************************** + * + * Data conversion + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ + +inline float XMConvertHalfToFloat(HALF Value) noexcept +{ +#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + __m128i V1 = _mm_cvtsi32_si128(static_cast(Value)); + __m128 V2 = _mm_cvtph_ps(V1); + return _mm_cvtss_f32(V2); +#elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__) && !defined(_XM_NO_INTRINSICS_) && (!defined(__GNUC__) || (__ARM_FP & 2)) + uint16x4_t vHalf = vdup_n_u16(Value); + float32x4_t vFloat = vcvt_f32_f16(vreinterpret_f16_u16(vHalf)); + return vgetq_lane_f32(vFloat, 0); +#else + auto Mantissa = static_cast(Value & 0x03FF); + + uint32_t Exponent = (Value & 0x7C00); + if (Exponent == 0x7C00) // INF/NAN + { + Exponent = 0x8f; + } + else if (Exponent != 0) // The value is normalized + { + Exponent = static_cast((static_cast(Value) >> 10) & 0x1F); + } + else if (Mantissa != 0) // The value is denormalized + { + // Normalize the value in the resulting float + Exponent = 1; + + do + { + Exponent--; + Mantissa <<= 1; + } while ((Mantissa & 0x0400) == 0); + + Mantissa &= 0x03FF; + } + else // The value is zero + { + Exponent = static_cast(-112); + } + + uint32_t Result = + ((static_cast(Value) & 0x8000) << 16) // Sign + | ((Exponent + 112) << 23) // Exponent + | (Mantissa << 13); // Mantissa + + return reinterpret_cast(&Result)[0]; +#endif // !_XM_F16C_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +#ifdef _PREFAST_ +#pragma prefast(push) +#pragma prefast(disable : 26015 26019, "PREfast noise: Esp:1307" ) +#endif + +_Use_decl_annotations_ +inline float* XMConvertHalfToFloatStream +( + float* pOutputStream, + size_t OutputStride, + const HALF* pInputStream, + size_t InputStride, + size_t HalfCount +) noexcept +{ + assert(pOutputStream); + assert(pInputStream); + + assert(InputStride >= sizeof(HALF)); + _Analysis_assume_(InputStride >= sizeof(HALF)); + + assert(OutputStride >= sizeof(float)); + _Analysis_assume_(OutputStride >= sizeof(float)); + +#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + auto pHalf = reinterpret_cast(pInputStream); + auto pFloat = reinterpret_cast(pOutputStream); + + size_t i = 0; + size_t four = HalfCount >> 2; + if (four > 0) + { + if (InputStride == sizeof(HALF)) + { + if (OutputStride == sizeof(float)) + { + if ((reinterpret_cast(pFloat) & 0xF) == 0) + { + // Packed input, aligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m128i HV = _mm_loadl_epi64(reinterpret_cast(pHalf)); + pHalf += InputStride * 4; + + __m128 FV = _mm_cvtph_ps(HV); + + XM_STREAM_PS(reinterpret_cast(pFloat), FV); + pFloat += OutputStride * 4; + i += 4; + } + } + else + { + // Packed input, packed output + for (size_t j = 0; j < four; ++j) + { + __m128i HV = _mm_loadl_epi64(reinterpret_cast(pHalf)); + pHalf += InputStride * 4; + + __m128 FV = _mm_cvtph_ps(HV); + + _mm_storeu_ps(reinterpret_cast(pFloat), FV); + pFloat += OutputStride * 4; + i += 4; + } + } + } + else + { + // Packed input, scattered output + for (size_t j = 0; j < four; ++j) + { + __m128i HV = 
_mm_loadl_epi64(reinterpret_cast(pHalf)); + pHalf += InputStride * 4; + + __m128 FV = _mm_cvtph_ps(HV); + + _mm_store_ss(reinterpret_cast(pFloat), FV); + pFloat += OutputStride; + *reinterpret_cast(pFloat) = _mm_extract_ps(FV, 1); + pFloat += OutputStride; + *reinterpret_cast(pFloat) = _mm_extract_ps(FV, 2); + pFloat += OutputStride; + *reinterpret_cast(pFloat) = _mm_extract_ps(FV, 3); + pFloat += OutputStride; + i += 4; + } + } + } + else if (OutputStride == sizeof(float)) + { + if ((reinterpret_cast(pFloat) & 0xF) == 0) + { + // Scattered input, aligned & packed output + for (size_t j = 0; j < four; ++j) + { + uint16_t H1 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H2 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H3 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H4 = *reinterpret_cast(pHalf); + pHalf += InputStride; + + __m128i HV = _mm_setzero_si128(); + HV = _mm_insert_epi16(HV, H1, 0); + HV = _mm_insert_epi16(HV, H2, 1); + HV = _mm_insert_epi16(HV, H3, 2); + HV = _mm_insert_epi16(HV, H4, 3); + __m128 FV = _mm_cvtph_ps(HV); + + XM_STREAM_PS(reinterpret_cast(pFloat), FV); + pFloat += OutputStride * 4; + i += 4; + } + } + else + { + // Scattered input, packed output + for (size_t j = 0; j < four; ++j) + { + uint16_t H1 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H2 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H3 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H4 = *reinterpret_cast(pHalf); + pHalf += InputStride; + + __m128i HV = _mm_setzero_si128(); + HV = _mm_insert_epi16(HV, H1, 0); + HV = _mm_insert_epi16(HV, H2, 1); + HV = _mm_insert_epi16(HV, H3, 2); + HV = _mm_insert_epi16(HV, H4, 3); + __m128 FV = _mm_cvtph_ps(HV); + + _mm_storeu_ps(reinterpret_cast(pFloat), FV); + pFloat += OutputStride * 4; + i += 4; + } + } + } + else + { + // Scattered input, scattered output + for (size_t j = 0; j < four; ++j) + { + uint16_t H1 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H2 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H3 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H4 = *reinterpret_cast(pHalf); + pHalf += InputStride; + + __m128i HV = _mm_setzero_si128(); + HV = _mm_insert_epi16(HV, H1, 0); + HV = _mm_insert_epi16(HV, H2, 1); + HV = _mm_insert_epi16(HV, H3, 2); + HV = _mm_insert_epi16(HV, H4, 3); + __m128 FV = _mm_cvtph_ps(HV); + + _mm_store_ss(reinterpret_cast(pFloat), FV); + pFloat += OutputStride; + *reinterpret_cast(pFloat) = _mm_extract_ps(FV, 1); + pFloat += OutputStride; + *reinterpret_cast(pFloat) = _mm_extract_ps(FV, 2); + pFloat += OutputStride; + *reinterpret_cast(pFloat) = _mm_extract_ps(FV, 3); + pFloat += OutputStride; + i += 4; + } + } + } + + for (; i < HalfCount; ++i) + { + *reinterpret_cast(pFloat) = XMConvertHalfToFloat(reinterpret_cast(pHalf)[0]); + pHalf += InputStride; + pFloat += OutputStride; + } + + XM_SFENCE(); + + return pOutputStream; +#elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) ||__aarch64__) && !defined(_XM_NO_INTRINSICS_) && (!defined(__GNUC__) || (__ARM_FP & 2)) + auto pHalf = reinterpret_cast(pInputStream); + auto pFloat = reinterpret_cast(pOutputStream); + + size_t i = 0; + size_t four = HalfCount >> 2; + if (four > 0) + { + if (InputStride == sizeof(HALF)) + { + if (OutputStride == sizeof(float)) + { + // Packed input, packed output + for (size_t j = 0; j < four; ++j) + { + uint16x4_t vHalf = vld1_u16(reinterpret_cast(pHalf)); + 
pHalf += InputStride * 4; + + float32x4_t vFloat = vcvt_f32_f16(vreinterpret_f16_u16(vHalf)); + + vst1q_f32(reinterpret_cast(pFloat), vFloat); + pFloat += OutputStride * 4; + i += 4; + } + } + else + { + // Packed input, scattered output + for (size_t j = 0; j < four; ++j) + { + uint16x4_t vHalf = vld1_u16(reinterpret_cast(pHalf)); + pHalf += InputStride * 4; + + float32x4_t vFloat = vcvt_f32_f16(vreinterpret_f16_u16(vHalf)); + + vst1q_lane_f32(reinterpret_cast(pFloat), vFloat, 0); + pFloat += OutputStride; + vst1q_lane_f32(reinterpret_cast(pFloat), vFloat, 1); + pFloat += OutputStride; + vst1q_lane_f32(reinterpret_cast(pFloat), vFloat, 2); + pFloat += OutputStride; + vst1q_lane_f32(reinterpret_cast(pFloat), vFloat, 3); + pFloat += OutputStride; + i += 4; + } + } + } + else if (OutputStride == sizeof(float)) + { + // Scattered input, packed output + for (size_t j = 0; j < four; ++j) + { + uint16_t H1 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H2 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H3 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H4 = *reinterpret_cast(pHalf); + pHalf += InputStride; + + uint64_t iHalf = uint64_t(H1) | (uint64_t(H2) << 16) | (uint64_t(H3) << 32) | (uint64_t(H4) << 48); + uint16x4_t vHalf = vcreate_u16(iHalf); + + float32x4_t vFloat = vcvt_f32_f16(vreinterpret_f16_u16(vHalf)); + + vst1q_f32(reinterpret_cast(pFloat), vFloat); + pFloat += OutputStride * 4; + i += 4; + } + } + else + { + // Scattered input, scattered output + for (size_t j = 0; j < four; ++j) + { + uint16_t H1 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H2 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H3 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H4 = *reinterpret_cast(pHalf); + pHalf += InputStride; + + uint64_t iHalf = uint64_t(H1) | (uint64_t(H2) << 16) | (uint64_t(H3) << 32) | (uint64_t(H4) << 48); + uint16x4_t vHalf = vcreate_u16(iHalf); + + float32x4_t vFloat = vcvt_f32_f16(vreinterpret_f16_u16(vHalf)); + + vst1q_lane_f32(reinterpret_cast(pFloat), vFloat, 0); + pFloat += OutputStride; + vst1q_lane_f32(reinterpret_cast(pFloat), vFloat, 1); + pFloat += OutputStride; + vst1q_lane_f32(reinterpret_cast(pFloat), vFloat, 2); + pFloat += OutputStride; + vst1q_lane_f32(reinterpret_cast(pFloat), vFloat, 3); + pFloat += OutputStride; + i += 4; + } + } + } + + for (; i < HalfCount; ++i) + { + *reinterpret_cast(pFloat) = XMConvertHalfToFloat(reinterpret_cast(pHalf)[0]); + pHalf += InputStride; + pFloat += OutputStride; + } + + return pOutputStream; +#else + auto pHalf = reinterpret_cast(pInputStream); + auto pFloat = reinterpret_cast(pOutputStream); + + for (size_t i = 0; i < HalfCount; i++) + { + *reinterpret_cast(pFloat) = XMConvertHalfToFloat(reinterpret_cast(pHalf)[0]); + pHalf += InputStride; + pFloat += OutputStride; + } + + return pOutputStream; +#endif // !_XM_F16C_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline HALF XMConvertFloatToHalf(float Value) noexcept +{ +#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + __m128 V1 = _mm_set_ss(Value); + __m128i V2 = _mm_cvtps_ph(V1, _MM_FROUND_TO_NEAREST_INT); + return static_cast(_mm_extract_epi16(V2, 0)); +#elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__) && !defined(_XM_NO_INTRINSICS_) && (!defined(__GNUC__) || (__ARM_FP & 2)) + float32x4_t vFloat = vdupq_n_f32(Value); + float16x4_t vHalf 
= vcvt_f16_f32(vFloat); + return vget_lane_u16(vreinterpret_u16_f16(vHalf), 0); +#else + uint32_t Result; + + auto IValue = reinterpret_cast(&Value)[0]; + uint32_t Sign = (IValue & 0x80000000U) >> 16U; + IValue = IValue & 0x7FFFFFFFU; // Hack off the sign + if (IValue >= 0x47800000 /*e+16*/) + { + // The number is too large to be represented as a half. Return infinity or NaN + Result = 0x7C00U | ((IValue > 0x7F800000) ? (0x200 | ((IValue >> 13U) & 0x3FFU)) : 0U); + } + else if (IValue <= 0x33000000U /*e-25*/) + { + Result = 0; + } + else if (IValue < 0x38800000U /*e-14*/) + { + // The number is too small to be represented as a normalized half. + // Convert it to a denormalized value. + uint32_t Shift = 125U - (IValue >> 23U); + IValue = 0x800000U | (IValue & 0x7FFFFFU); + Result = IValue >> (Shift + 1); + uint32_t s = (IValue & ((1U << Shift) - 1)) != 0; + Result += (Result | s) & ((IValue >> Shift) & 1U); + } + else + { + // Rebias the exponent to represent the value as a normalized half. + IValue += 0xC8000000U; + Result = ((IValue + 0x0FFFU + ((IValue >> 13U) & 1U)) >> 13U) & 0x7FFFU; + } + return static_cast(Result | Sign); +#endif // !_XM_F16C_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline HALF* XMConvertFloatToHalfStream +( + HALF* pOutputStream, + size_t OutputStride, + const float* pInputStream, + size_t InputStride, + size_t FloatCount +) noexcept +{ + assert(pOutputStream); + assert(pInputStream); + + assert(InputStride >= sizeof(float)); + _Analysis_assume_(InputStride >= sizeof(float)); + + assert(OutputStride >= sizeof(HALF)); + _Analysis_assume_(OutputStride >= sizeof(HALF)); + +#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + auto pFloat = reinterpret_cast(pInputStream); + auto pHalf = reinterpret_cast(pOutputStream); + + size_t i = 0; + size_t four = FloatCount >> 2; + if (four > 0) + { + if (InputStride == sizeof(float)) + { + if (OutputStride == sizeof(HALF)) + { + if ((reinterpret_cast(pFloat) & 0xF) == 0) + { + // Aligned and packed input, packed output + for (size_t j = 0; j < four; ++j) + { + __m128 FV = _mm_load_ps(reinterpret_cast(pFloat)); + pFloat += InputStride * 4; + + __m128i HV = _mm_cvtps_ph(FV, _MM_FROUND_TO_NEAREST_INT); + + _mm_storel_epi64(reinterpret_cast<__m128i*>(pHalf), HV); + pHalf += OutputStride * 4; + i += 4; + } + } + else + { + // Packed input, packed output + for (size_t j = 0; j < four; ++j) + { + __m128 FV = _mm_loadu_ps(reinterpret_cast(pFloat)); + pFloat += InputStride * 4; + + __m128i HV = _mm_cvtps_ph(FV, _MM_FROUND_TO_NEAREST_INT); + + _mm_storel_epi64(reinterpret_cast<__m128i*>(pHalf), HV); + pHalf += OutputStride * 4; + i += 4; + } + } + } + else + { + if ((reinterpret_cast(pFloat) & 0xF) == 0) + { + // Aligned & packed input, scattered output + for (size_t j = 0; j < four; ++j) + { + __m128 FV = _mm_load_ps(reinterpret_cast(pFloat)); + pFloat += InputStride * 4; + + __m128i HV = _mm_cvtps_ph(FV, _MM_FROUND_TO_NEAREST_INT); + + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 0)); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 1)); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 2)); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 3)); + pHalf += OutputStride; + i += 4; + } + } + else + { + // Packed input, scattered output + for (size_t j = 0; j < four; ++j) + { + __m128 FV = 
_mm_loadu_ps(reinterpret_cast(pFloat)); + pFloat += InputStride * 4; + + __m128i HV = _mm_cvtps_ph(FV, _MM_FROUND_TO_NEAREST_INT); + + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 0)); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 1)); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 2)); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 3)); + pHalf += OutputStride; + i += 4; + } + } + } + } + else if (OutputStride == sizeof(HALF)) + { + // Scattered input, packed output + for (size_t j = 0; j < four; ++j) + { + __m128 FV1 = _mm_load_ss(reinterpret_cast(pFloat)); + pFloat += InputStride; + + __m128 FV2 = _mm_broadcast_ss(reinterpret_cast(pFloat)); + pFloat += InputStride; + + __m128 FV3 = _mm_broadcast_ss(reinterpret_cast(pFloat)); + pFloat += InputStride; + + __m128 FV4 = _mm_broadcast_ss(reinterpret_cast(pFloat)); + pFloat += InputStride; + + __m128 FV = _mm_blend_ps(FV1, FV2, 0x2); + __m128 FT = _mm_blend_ps(FV3, FV4, 0x8); + FV = _mm_blend_ps(FV, FT, 0xC); + + __m128i HV = _mm_cvtps_ph(FV, _MM_FROUND_TO_NEAREST_INT); + + _mm_storel_epi64(reinterpret_cast<__m128i*>(pHalf), HV); + pHalf += OutputStride * 4; + i += 4; + } + } + else + { + // Scattered input, scattered output + for (size_t j = 0; j < four; ++j) + { + __m128 FV1 = _mm_load_ss(reinterpret_cast(pFloat)); + pFloat += InputStride; + + __m128 FV2 = _mm_broadcast_ss(reinterpret_cast(pFloat)); + pFloat += InputStride; + + __m128 FV3 = _mm_broadcast_ss(reinterpret_cast(pFloat)); + pFloat += InputStride; + + __m128 FV4 = _mm_broadcast_ss(reinterpret_cast(pFloat)); + pFloat += InputStride; + + __m128 FV = _mm_blend_ps(FV1, FV2, 0x2); + __m128 FT = _mm_blend_ps(FV3, FV4, 0x8); + FV = _mm_blend_ps(FV, FT, 0xC); + + __m128i HV = _mm_cvtps_ph(FV, _MM_FROUND_TO_NEAREST_INT); + + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 0)); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 1)); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 2)); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 3)); + pHalf += OutputStride; + i += 4; + } + } + } + + for (; i < FloatCount; ++i) + { + *reinterpret_cast(pHalf) = XMConvertFloatToHalf(reinterpret_cast(pFloat)[0]); + pFloat += InputStride; + pHalf += OutputStride; + } + + return pOutputStream; +#elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__) && !defined(_XM_NO_INTRINSICS_) && (!defined(__GNUC__) || (__ARM_FP & 2)) + auto pFloat = reinterpret_cast(pInputStream); + auto pHalf = reinterpret_cast(pOutputStream); + + size_t i = 0; + size_t four = FloatCount >> 2; + if (four > 0) + { + if (InputStride == sizeof(float)) + { + if (OutputStride == sizeof(HALF)) + { + // Packed input, packed output + for (size_t j = 0; j < four; ++j) + { + float32x4_t vFloat = vld1q_f32(reinterpret_cast(pFloat)); + pFloat += InputStride * 4; + + uint16x4_t vHalf = vreinterpret_u16_f16(vcvt_f16_f32(vFloat)); + + vst1_u16(reinterpret_cast(pHalf), vHalf); + pHalf += OutputStride * 4; + i += 4; + } + } + else + { + // Packed input, scattered output + for (size_t j = 0; j < four; ++j) + { + float32x4_t vFloat = vld1q_f32(reinterpret_cast(pFloat)); + pFloat += InputStride * 4; + + uint16x4_t vHalf = vreinterpret_u16_f16(vcvt_f16_f32(vFloat)); + + vst1_lane_u16(reinterpret_cast(pHalf), vHalf, 
0); + pHalf += OutputStride; + vst1_lane_u16(reinterpret_cast(pHalf), vHalf, 1); + pHalf += OutputStride; + vst1_lane_u16(reinterpret_cast(pHalf), vHalf, 2); + pHalf += OutputStride; + vst1_lane_u16(reinterpret_cast(pHalf), vHalf, 3); + pHalf += OutputStride; + i += 4; + } + } + } + else if (OutputStride == sizeof(HALF)) + { + // Scattered input, packed output + for (size_t j = 0; j < four; ++j) + { + float32x4_t vFloat = vdupq_n_f32(0); + vFloat = vld1q_lane_f32(reinterpret_cast(pFloat), vFloat, 0); + pFloat += InputStride; + + vFloat = vld1q_lane_f32(reinterpret_cast(pFloat), vFloat, 1); + pFloat += InputStride; + + vFloat = vld1q_lane_f32(reinterpret_cast(pFloat), vFloat, 2); + pFloat += InputStride; + + vFloat = vld1q_lane_f32(reinterpret_cast(pFloat), vFloat, 3); + pFloat += InputStride; + + uint16x4_t vHalf = vreinterpret_u16_f16(vcvt_f16_f32(vFloat)); + + vst1_u16(reinterpret_cast(pHalf), vHalf); + pHalf += OutputStride * 4; + i += 4; + } + } + else + { + // Scattered input, scattered output + for (size_t j = 0; j < four; ++j) + { + float32x4_t vFloat = vdupq_n_f32(0); + vFloat = vld1q_lane_f32(reinterpret_cast(pFloat), vFloat, 0); + pFloat += InputStride; + + vFloat = vld1q_lane_f32(reinterpret_cast(pFloat), vFloat, 1); + pFloat += InputStride; + + vFloat = vld1q_lane_f32(reinterpret_cast(pFloat), vFloat, 2); + pFloat += InputStride; + + vFloat = vld1q_lane_f32(reinterpret_cast(pFloat), vFloat, 3); + pFloat += InputStride; + + uint16x4_t vHalf = vreinterpret_u16_f16(vcvt_f16_f32(vFloat)); + + vst1_lane_u16(reinterpret_cast(pHalf), vHalf, 0); + pHalf += OutputStride; + vst1_lane_u16(reinterpret_cast(pHalf), vHalf, 1); + pHalf += OutputStride; + vst1_lane_u16(reinterpret_cast(pHalf), vHalf, 2); + pHalf += OutputStride; + vst1_lane_u16(reinterpret_cast(pHalf), vHalf, 3); + pHalf += OutputStride; + i += 4; + } + } + } + + for (; i < FloatCount; ++i) + { + *reinterpret_cast(pHalf) = XMConvertFloatToHalf(reinterpret_cast(pFloat)[0]); + pFloat += InputStride; + pHalf += OutputStride; + } + + return pOutputStream; +#else + auto pFloat = reinterpret_cast(pInputStream); + auto pHalf = reinterpret_cast(pOutputStream); + + for (size_t i = 0; i < FloatCount; i++) + { + *reinterpret_cast(pHalf) = XMConvertFloatToHalf(reinterpret_cast(pFloat)[0]); + pFloat += InputStride; + pHalf += OutputStride; + } + return pOutputStream; +#endif // !_XM_F16C_INTRINSICS_ +} + +#ifdef _PREFAST_ +#pragma prefast(pop) +#endif + +/**************************************************************************** + * + * Vector and matrix load operations + * + ****************************************************************************/ + +#ifdef _PREFAST_ +#pragma prefast(push) +#pragma prefast(disable:28931, "PREfast noise: Esp:1266") +#endif + +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadColor(const XMCOLOR* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + // int32_t -> Float conversions are done in one instruction. + // uint32_t -> Float calls a runtime function. 
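+    // (Illustrative note: x86 has no single unsigned-32-bit-to-float instruction
+    // before AVX-512, so uint32_t -> float costs extra fix-up code or, on older
+    // 32-bit toolchains, a helper call, while int32_t -> float is one instruction.
+    // Each masked channel here fits in 8 bits, so reading the packed color
+    // through int32_t is lossless and keeps the cheap conversion.)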
Keep in int32_t + auto iColor = static_cast(pSource->c); + XMVECTORF32 vColor = { { { + static_cast((iColor >> 16) & 0xFF)* (1.0f / 255.0f), + static_cast((iColor >> 8) & 0xFF)* (1.0f / 255.0f), + static_cast(iColor & 0xFF)* (1.0f / 255.0f), + static_cast((iColor >> 24) & 0xFF)* (1.0f / 255.0f) + } } }; + return vColor.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32_t bgra = pSource->c; + uint32_t rgba = (bgra & 0xFF00FF00) | ((bgra >> 16) & 0xFF) | ((bgra << 16) & 0xFF0000); + uint32x2_t vInt8 = vdup_n_u32(rgba); + uint16x8_t vInt16 = vmovl_u8(vreinterpret_u8_u32(vInt8)); + uint32x4_t vInt = vmovl_u16(vget_low_u16(vInt16)); + float32x4_t R = vcvtq_f32_u32(vInt); + return vmulq_n_f32(R, 1.0f / 255.0f); +#elif defined(_XM_SSE_INTRINSICS_) + // Splat the color in all four entries + __m128i vInt = _mm_set1_epi32(static_cast(pSource->c)); + // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000 + vInt = _mm_and_si128(vInt, g_XMMaskA8R8G8B8); + // a is unsigned! Flip the bit to convert the order to signed + vInt = _mm_xor_si128(vInt, g_XMFlipA8R8G8B8); + // Convert to floating point numbers + XMVECTOR vTemp = _mm_cvtepi32_ps(vInt); + // RGB + 0, A + 0x80000000.f to undo the signed order. + vTemp = _mm_add_ps(vTemp, g_XMFixAA8R8G8B8); + // Convert 0-255 to 0.0f-1.0f + return _mm_mul_ps(vTemp, g_XMNormalizeA8R8G8B8); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadHalf2(const XMHALF2* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + __m128 V = _mm_load_ss(reinterpret_cast(pSource)); + return _mm_cvtph_ps(_mm_castps_si128(V)); +#else + XMVECTORF32 vResult = { { { + XMConvertHalfToFloat(pSource->x), + XMConvertHalfToFloat(pSource->y), + 0.0f, + 0.0f + } } }; + return vResult.v; +#endif // !_XM_F16C_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadShortN2(const XMSHORTN2* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { { { + (pSource->x == -32768) ? -1.f : (static_cast(pSource->x)* (1.0f / 32767.0f)), + (pSource->y == -32768) ? -1.f : (static_cast(pSource->y)* (1.0f / 32767.0f)), + 0.0f, + 0.0f + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vInt16 = vld1_dup_u32(reinterpret_cast(pSource)); + int32x4_t vInt = vmovl_s16(vreinterpret_s16_u32(vInt16)); + vInt = vandq_s32(vInt, g_XMMaskXY); + float32x4_t R = vcvtq_f32_s32(vInt); + R = vmulq_n_f32(R, 1.0f / 32767.0f); + return vmaxq_f32(R, vdupq_n_f32(-1.f)); +#elif defined(_XM_SSE_INTRINSICS_) + // Splat the two shorts in all four entries (WORD alignment okay, + // DWORD alignment preferred) + __m128 vTemp = _mm_load_ps1(reinterpret_cast(&pSource->x)); + // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0 + vTemp = _mm_and_ps(vTemp, g_XMMaskX16Y16); + // x needs to be sign extended + vTemp = _mm_xor_ps(vTemp, g_XMFlipX16Y16); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // x - 0x8000 to undo the signed order. 
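+    // (Illustrative note on the pattern used by these SSE2 loaders: the packed
+    // signed field is XOR'd with its sign bit so it becomes a biased unsigned
+    // value, converted with the cheap signed cvtepi32 path, then the bias is
+    // removed by the add below and the result scaled to [-1, 1].  A rough scalar
+    // sketch of the x lane, names hypothetical:
+    //
+    //     float LoadShortNLaneX(uint16_t raw)
+    //     {
+    //         uint32_t biased = uint32_t(raw) ^ 0x8000u;    // excess-32768
+    //         float f = float(int32_t(biased)) - 32768.0f;  // undo the bias
+    //         return f * (1.0f / 32767.0f);                 // -32768 is clamped to -1 later
+    //     }
+    //
+    // e.g. raw = 0x7FFF -> 0xFFFF -> 65535.0f - 32768.0f = 32767.0f -> 1.0f.)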
+ vTemp = _mm_add_ps(vTemp, g_XMFixX16Y16); + // Convert -1.0f - 1.0f + vTemp = _mm_mul_ps(vTemp, g_XMNormalizeX16Y16); + // Clamp result (for case of -32768) + return _mm_max_ps(vTemp, g_XMNegativeOne); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadShort2(const XMSHORT2* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { { { + static_cast(pSource->x), + static_cast(pSource->y), + 0.f, + 0.f + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vInt16 = vld1_dup_u32(reinterpret_cast(pSource)); + int32x4_t vInt = vmovl_s16(vreinterpret_s16_u32(vInt16)); + vInt = vandq_s32(vInt, g_XMMaskXY); + return vcvtq_f32_s32(vInt); +#elif defined(_XM_SSE_INTRINSICS_) + // Splat the two shorts in all four entries (WORD alignment okay, + // DWORD alignment preferred) + __m128 vTemp = _mm_load_ps1(reinterpret_cast(&pSource->x)); + // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0 + vTemp = _mm_and_ps(vTemp, g_XMMaskX16Y16); + // x needs to be sign extended + vTemp = _mm_xor_ps(vTemp, g_XMFlipX16Y16); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // x - 0x8000 to undo the signed order. + vTemp = _mm_add_ps(vTemp, g_XMFixX16Y16); + // Y is 65536 too large + return _mm_mul_ps(vTemp, g_XMFixupY16); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadUShortN2(const XMUSHORTN2* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { { { + static_cast(pSource->x) / 65535.0f, + static_cast(pSource->y) / 65535.0f, + 0.f, + 0.f + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vInt16 = vld1_dup_u32(reinterpret_cast(pSource)); + uint32x4_t vInt = vmovl_u16(vreinterpret_u16_u32(vInt16)); + vInt = vandq_u32(vInt, g_XMMaskXY); + float32x4_t R = vcvtq_f32_u32(vInt); + R = vmulq_n_f32(R, 1.0f / 65535.0f); + return vmaxq_f32(R, vdupq_n_f32(-1.f)); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 FixupY16 = { { { 1.0f / 65535.0f, 1.0f / (65535.0f * 65536.0f), 0.0f, 0.0f } } }; + static const XMVECTORF32 FixaddY16 = { { { 0, 32768.0f * 65536.0f, 0, 0 } } }; + // Splat the two shorts in all four entries (WORD alignment okay, + // DWORD alignment preferred) + __m128 vTemp = _mm_load_ps1(reinterpret_cast(&pSource->x)); + // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0 + vTemp = _mm_and_ps(vTemp, g_XMMaskX16Y16); + // y needs to be sign flipped + vTemp = _mm_xor_ps(vTemp, g_XMFlipY); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // y + 0x8000 to undo the signed order. 
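+    // (Illustrative note: y sits in bits 16-31 of the dword, so after the masked
+    // load the lane really holds y * 65536.  The XOR above flips bit 31 so the
+    // value survives the signed cvtepi32 conversion; the add below restores that
+    // 0x80000000 (= 32768 * 65536), and FixupY16 folds the 1/65536 positional
+    // factor into the 1/65535 normalization.  For example y = 32768: dword
+    // 0x80000000 -> flipped to 0 -> 0.0f -> + 2147483648.0f ->
+    // * 1/(65535*65536) -> 32768/65535, roughly 0.5.)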
+ vTemp = _mm_add_ps(vTemp, FixaddY16); + // Y is 65536 times too large + vTemp = _mm_mul_ps(vTemp, FixupY16); + return vTemp; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadUShort2(const XMUSHORT2* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { { { + static_cast(pSource->x), + static_cast(pSource->y), + 0.f, + 0.f + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vInt16 = vld1_dup_u32(reinterpret_cast(pSource)); + uint32x4_t vInt = vmovl_u16(vreinterpret_u16_u32(vInt16)); + vInt = vandq_u32(vInt, g_XMMaskXY); + return vcvtq_f32_u32(vInt); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 FixaddY16 = { { { 0, 32768.0f, 0, 0 } } }; + // Splat the two shorts in all four entries (WORD alignment okay, + // DWORD alignment preferred) + __m128 vTemp = _mm_load_ps1(reinterpret_cast(&pSource->x)); + // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0 + vTemp = _mm_and_ps(vTemp, g_XMMaskX16Y16); + // y needs to be sign flipped + vTemp = _mm_xor_ps(vTemp, g_XMFlipY); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // Y is 65536 times too large + vTemp = _mm_mul_ps(vTemp, g_XMFixupY16); + // y + 0x8000 to undo the signed order. + vTemp = _mm_add_ps(vTemp, FixaddY16); + return vTemp; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadByteN2(const XMBYTEN2* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { { { + (pSource->x == -128) ? -1.f : (static_cast(pSource->x)* (1.0f / 127.0f)), + (pSource->y == -128) ? -1.f : (static_cast(pSource->y)* (1.0f / 127.0f)), + 0.0f, + 0.0f + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint16x4_t vInt8 = vld1_dup_u16(reinterpret_cast(pSource)); + int16x8_t vInt16 = vmovl_s8(vreinterpret_s8_u16(vInt8)); + int32x4_t vInt = vmovl_s16(vget_low_s16(vInt16)); + vInt = vandq_s32(vInt, g_XMMaskXY); + float32x4_t R = vcvtq_f32_s32(vInt); + R = vmulq_n_f32(R, 1.0f / 127.0f); + return vmaxq_f32(R, vdupq_n_f32(-1.f)); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Scale = { { { 1.0f / 127.0f, 1.0f / (127.0f * 256.0f), 0, 0 } } }; + static const XMVECTORU32 Mask = { { { 0xFF, 0xFF00, 0, 0 } } }; + // Splat the color in all four entries (x,z,y,w) + __m128i vInt = XM_LOADU_SI16(&pSource->v); + XMVECTOR vTemp = XM_PERMUTE_PS(_mm_castsi128_ps(vInt), _MM_SHUFFLE(0, 0, 0, 0)); + // Mask + vTemp = _mm_and_ps(vTemp, Mask); + // x,y and z are unsigned! 
Flip the bits to convert the order to signed + vTemp = _mm_xor_ps(vTemp, g_XMXorByte4); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // x, y and z - 0x80 to complete the conversion + vTemp = _mm_add_ps(vTemp, g_XMAddByte4); + // Fix y, z and w because they are too large + vTemp = _mm_mul_ps(vTemp, Scale); + // Clamp result (for case of -128) + return _mm_max_ps(vTemp, g_XMNegativeOne); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadByte2(const XMBYTE2* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { { { + static_cast(pSource->x), + static_cast(pSource->y), + 0.0f, + 0.0f + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint16x4_t vInt8 = vld1_dup_u16(reinterpret_cast(pSource)); + int16x8_t vInt16 = vmovl_s8(vreinterpret_s8_u16(vInt8)); + int32x4_t vInt = vmovl_s16(vget_low_s16(vInt16)); + vInt = vandq_s32(vInt, g_XMMaskXY); + return vcvtq_f32_s32(vInt); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Scale = { { { 1.0f, 1.0f / 256.0f, 1.0f / 65536.0f, 1.0f / (65536.0f * 256.0f) } } }; + static const XMVECTORU32 Mask = { { { 0xFF, 0xFF00, 0, 0 } } }; + // Splat the color in all four entries (x,z,y,w) + __m128i vInt = XM_LOADU_SI16(&pSource->v); + XMVECTOR vTemp = XM_PERMUTE_PS(_mm_castsi128_ps(vInt), _MM_SHUFFLE(0, 0, 0, 0)); + // Mask + vTemp = _mm_and_ps(vTemp, Mask); + // x,y and z are unsigned! Flip the bits to convert the order to signed + vTemp = _mm_xor_ps(vTemp, g_XMXorByte4); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // x, y and z - 0x80 to complete the conversion + vTemp = _mm_add_ps(vTemp, g_XMAddByte4); + // Fix y, z and w because they are too large + return _mm_mul_ps(vTemp, Scale); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadUByteN2(const XMUBYTEN2* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { { { + static_cast(pSource->x)* (1.0f / 255.0f), + static_cast(pSource->y)* (1.0f / 255.0f), + 0.0f, + 0.0f + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint16x4_t vInt8 = vld1_dup_u16(reinterpret_cast(pSource)); + uint16x8_t vInt16 = vmovl_u8(vreinterpret_u8_u16(vInt8)); + uint32x4_t vInt = vmovl_u16(vget_low_u16(vInt16)); + vInt = vandq_u32(vInt, g_XMMaskXY); + float32x4_t R = vcvtq_f32_u32(vInt); + return vmulq_n_f32(R, 1.0f / 255.0f); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Scale = { { { 1.0f / 255.0f, 1.0f / (255.0f * 256.0f), 0, 0 } } }; + static const XMVECTORU32 Mask = { { { 0xFF, 0xFF00, 0, 0 } } }; + // Splat the color in all four entries (x,z,y,w) + __m128i vInt = XM_LOADU_SI16(&pSource->v); + XMVECTOR vTemp = XM_PERMUTE_PS(_mm_castsi128_ps(vInt), _MM_SHUFFLE(0, 0, 0, 0)); + // Mask + vTemp = _mm_and_ps(vTemp, Mask); + // w is signed! 
Flip the bits to convert the order to unsigned + vTemp = _mm_xor_ps(vTemp, g_XMFlipW); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // w + 0x80 to complete the conversion + vTemp = _mm_add_ps(vTemp, g_XMAddUDec4); + // Fix y, z and w because they are too large + return _mm_mul_ps(vTemp, Scale); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadUByte2(const XMUBYTE2* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { { { + static_cast(pSource->x), + static_cast(pSource->y), + 0.0f, + 0.0f + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint16x4_t vInt8 = vld1_dup_u16(reinterpret_cast(pSource)); + uint16x8_t vInt16 = vmovl_u8(vreinterpret_u8_u16(vInt8)); + uint32x4_t vInt = vmovl_u16(vget_low_u16(vInt16)); + vInt = vandq_u32(vInt, g_XMMaskXY); + return vcvtq_f32_u32(vInt); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Scale = { { { 1.0f, 1.0f / 256.0f, 0, 0 } } }; + static const XMVECTORU32 Mask = { { { 0xFF, 0xFF00, 0, 0 } } }; + // Splat the color in all four entries (x,z,y,w) + __m128i vInt = XM_LOADU_SI16(&pSource->v); + XMVECTOR vTemp = XM_PERMUTE_PS(_mm_castsi128_ps(vInt), _MM_SHUFFLE(0, 0, 0, 0)); + // Mask + vTemp = _mm_and_ps(vTemp, Mask); + // w is signed! Flip the bits to convert the order to unsigned + vTemp = _mm_xor_ps(vTemp, g_XMFlipW); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // w + 0x80 to complete the conversion + vTemp = _mm_add_ps(vTemp, g_XMAddUDec4); + // Fix y, z and w because they are too large + return _mm_mul_ps(vTemp, Scale); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadU565(const XMU565* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { { { + float(pSource->v & 0x1F), + float((pSource->v >> 5) & 0x3F), + float((pSource->v >> 11) & 0x1F), + 0.f, + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORI32 U565And = { { { 0x1F, 0x3F << 5, 0x1F << 11, 0 } } }; + static const XMVECTORF32 U565Mul = { { { 1.0f, 1.0f / 32.0f, 1.0f / 2048.f, 0 } } }; + uint16x4_t vInt16 = vld1_dup_u16(reinterpret_cast(pSource)); + uint32x4_t vInt = vmovl_u16(vInt16); + vInt = vandq_u32(vInt, U565And); + float32x4_t R = vcvtq_f32_u32(vInt); + return vmulq_f32(R, U565Mul); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORI32 U565And = { { { 0x1F, 0x3F << 5, 0x1F << 11, 0 } } }; + static const XMVECTORF32 U565Mul = { { { 1.0f, 1.0f / 32.0f, 1.0f / 2048.f, 0 } } }; + // Get the 16 bit value and splat it + __m128i vInt = XM_LOADU_SI16(&pSource->v); + XMVECTOR vResult = XM_PERMUTE_PS(_mm_castsi128_ps(vInt), _MM_SHUFFLE(0, 0, 0, 0)); + // Mask off x, y and z + vResult = _mm_and_ps(vResult, U565And); + // Convert to float + vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult)); + // Normalize x, y, and z + vResult = _mm_mul_ps(vResult, U565Mul); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadFloat3PK(const XMFLOAT3PK* pSource) noexcept +{ + assert(pSource); + + XM_ALIGNED_DATA(16) uint32_t Result[4]; + uint32_t Mantissa; + uint32_t Exponent; + + // X Channel (6-bit mantissa) + Mantissa = 
pSource->xm; + + if (pSource->xe == 0x1f) // INF or NAN + { + Result[0] = static_cast(0x7f800000 | (static_cast(pSource->xm) << 17)); + } + else + { + if (pSource->xe != 0) // The value is normalized + { + Exponent = pSource->xe; + } + else if (Mantissa != 0) // The value is denormalized + { + // Normalize the value in the resulting float + Exponent = 1; + + do + { + Exponent--; + Mantissa <<= 1; + } while ((Mantissa & 0x40) == 0); + + Mantissa &= 0x3F; + } + else // The value is zero + { + Exponent = static_cast(-112); + } + + Result[0] = ((Exponent + 112) << 23) | (Mantissa << 17); + } + + // Y Channel (6-bit mantissa) + Mantissa = pSource->ym; + + if (pSource->ye == 0x1f) // INF or NAN + { + Result[1] = static_cast(0x7f800000 | (static_cast(pSource->ym) << 17)); + } + else + { + if (pSource->ye != 0) // The value is normalized + { + Exponent = pSource->ye; + } + else if (Mantissa != 0) // The value is denormalized + { + // Normalize the value in the resulting float + Exponent = 1; + + do + { + Exponent--; + Mantissa <<= 1; + } while ((Mantissa & 0x40) == 0); + + Mantissa &= 0x3F; + } + else // The value is zero + { + Exponent = static_cast(-112); + } + + Result[1] = ((Exponent + 112) << 23) | (Mantissa << 17); + } + + // Z Channel (5-bit mantissa) + Mantissa = pSource->zm; + + if (pSource->ze == 0x1f) // INF or NAN + { + Result[2] = static_cast(0x7f800000 | (static_cast(pSource->zm) << 17)); + } + else + { + if (pSource->ze != 0) // The value is normalized + { + Exponent = pSource->ze; + } + else if (Mantissa != 0) // The value is denormalized + { + // Normalize the value in the resulting float + Exponent = 1; + + do + { + Exponent--; + Mantissa <<= 1; + } while ((Mantissa & 0x20) == 0); + + Mantissa &= 0x1F; + } + else // The value is zero + { + Exponent = static_cast(-112); + } + + Result[2] = ((Exponent + 112) << 23) | (Mantissa << 18); + } + + return XMLoadFloat3A(reinterpret_cast(&Result)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadFloat3SE(const XMFLOAT3SE* pSource) noexcept +{ + assert(pSource); + + union { float f; int32_t i; } fi; + fi.i = 0x33800000 + (pSource->e << 23); + float Scale = fi.f; + + XMVECTORF32 v = { { { + Scale * float(pSource->xm), + Scale * float(pSource->ym), + Scale * float(pSource->zm), + 1.0f } } }; + return v; +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadHalf4(const XMHALF4* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + __m128i V = _mm_loadl_epi64(reinterpret_cast(pSource)); + return _mm_cvtph_ps(V); +#else + XMVECTORF32 vResult = { { { + XMConvertHalfToFloat(pSource->x), + XMConvertHalfToFloat(pSource->y), + XMConvertHalfToFloat(pSource->z), + XMConvertHalfToFloat(pSource->w) + } } }; + return vResult.v; +#endif // !_XM_F16C_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadShortN4(const XMSHORTN4* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { { { + (pSource->x == -32768) ? -1.f : (static_cast(pSource->x)* (1.0f / 32767.0f)), + (pSource->y == -32768) ? -1.f : (static_cast(pSource->y)* (1.0f / 32767.0f)), + (pSource->z == -32768) ? -1.f : (static_cast(pSource->z)* (1.0f / 32767.0f)), + (pSource->w == -32768) ? 
-1.f : (static_cast(pSource->w)* (1.0f / 32767.0f)) + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + int16x4_t vInt = vld1_s16(reinterpret_cast(pSource)); + int32x4_t V = vmovl_s16(vInt); + float32x4_t vResult = vcvtq_f32_s32(V); + vResult = vmulq_n_f32(vResult, 1.0f / 32767.0f); + return vmaxq_f32(vResult, vdupq_n_f32(-1.f)); +#elif defined(_XM_SSE_INTRINSICS_) + // Splat the color in all four entries (x,z,y,w) + __m128d vIntd = _mm_load1_pd(reinterpret_cast(&pSource->x)); + // Shift x&0ffff,z&0xffff,y&0xffff0000,w&0xffff0000 + __m128 vTemp = _mm_and_ps(_mm_castpd_ps(vIntd), g_XMMaskX16Y16Z16W16); + // x and z are unsigned! Flip the bits to convert the order to signed + vTemp = _mm_xor_ps(vTemp, g_XMFlipX16Y16Z16W16); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // x and z - 0x8000 to complete the conversion + vTemp = _mm_add_ps(vTemp, g_XMFixX16Y16Z16W16); + // Convert to -1.0f - 1.0f + vTemp = _mm_mul_ps(vTemp, g_XMNormalizeX16Y16Z16W16); + // Very important! The entries are x,z,y,w, flip it to x,y,z,w + vTemp = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 1, 2, 0)); + // Clamp result (for case of -32768) + return _mm_max_ps(vTemp, g_XMNegativeOne); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadShort4(const XMSHORT4* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { { { + static_cast(pSource->x), + static_cast(pSource->y), + static_cast(pSource->z), + static_cast(pSource->w) + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + int16x4_t vInt = vld1_s16(reinterpret_cast(pSource)); + int32x4_t V = vmovl_s16(vInt); + return vcvtq_f32_s32(V); +#elif defined(_XM_SSE_INTRINSICS_) + // Splat the color in all four entries (x,z,y,w) + __m128d vIntd = _mm_load1_pd(reinterpret_cast(&pSource->x)); + // Shift x&0ffff,z&0xffff,y&0xffff0000,w&0xffff0000 + __m128 vTemp = _mm_and_ps(_mm_castpd_ps(vIntd), g_XMMaskX16Y16Z16W16); + // x and z are unsigned! Flip the bits to convert the order to signed + vTemp = _mm_xor_ps(vTemp, g_XMFlipX16Y16Z16W16); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // x and z - 0x8000 to complete the conversion + vTemp = _mm_add_ps(vTemp, g_XMFixX16Y16Z16W16); + // Fix y and w because they are 65536 too large + vTemp = _mm_mul_ps(vTemp, g_XMFixupY16W16); + // Very important! 
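+    // (Illustrative note: _mm_load1_pd above splats the whole 8-byte struct into
+    // both halves of the register, so the 32-bit lanes are (y<<16|x, w<<16|z,
+    // y<<16|x, w<<16|z).  The mask then leaves x in lane 0, z in lane 1,
+    // y*65536 in lane 2 and w*65536 in lane 3, which is why y and w get the
+    // extra 1/65536 scale and why the shuffle below swaps lanes 1 and 2 back
+    // into x,y,z,w order.)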
The entries are x,z,y,w, flip it to x,y,z,w + return XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 1, 2, 0)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadUShortN4(const XMUSHORTN4* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { { { + static_cast(pSource->x) / 65535.0f, + static_cast(pSource->y) / 65535.0f, + static_cast(pSource->z) / 65535.0f, + static_cast(pSource->w) / 65535.0f + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint16x4_t vInt = vld1_u16(reinterpret_cast(pSource)); + uint32x4_t V = vmovl_u16(vInt); + float32x4_t vResult = vcvtq_f32_u32(V); + return vmulq_n_f32(vResult, 1.0f / 65535.0f); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 FixupY16W16 = { { { 1.0f / 65535.0f, 1.0f / 65535.0f, 1.0f / (65535.0f * 65536.0f), 1.0f / (65535.0f * 65536.0f) } } }; + static const XMVECTORF32 FixaddY16W16 = { { { 0, 0, 32768.0f * 65536.0f, 32768.0f * 65536.0f } } }; + // Splat the color in all four entries (x,z,y,w) + __m128d vIntd = _mm_load1_pd(reinterpret_cast(&pSource->x)); + // Shift x&0ffff,z&0xffff,y&0xffff0000,w&0xffff0000 + __m128 vTemp = _mm_and_ps(_mm_castpd_ps(vIntd), g_XMMaskX16Y16Z16W16); + // y and w are signed! Flip the bits to convert the order to unsigned + vTemp = _mm_xor_ps(vTemp, g_XMFlipZW); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // y and w + 0x8000 to complete the conversion + vTemp = _mm_add_ps(vTemp, FixaddY16W16); + // Fix y and w because they are 65536 too large + vTemp = _mm_mul_ps(vTemp, FixupY16W16); + // Very important! The entries are x,z,y,w, flip it to x,y,z,w + return XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 1, 2, 0)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadUShort4(const XMUSHORT4* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { { { + static_cast(pSource->x), + static_cast(pSource->y), + static_cast(pSource->z), + static_cast(pSource->w) + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint16x4_t vInt = vld1_u16(reinterpret_cast(pSource)); + uint32x4_t V = vmovl_u16(vInt); + return vcvtq_f32_u32(V); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 FixaddY16W16 = { { { 0, 0, 32768.0f, 32768.0f } } }; + // Splat the color in all four entries (x,z,y,w) + __m128d vIntd = _mm_load1_pd(reinterpret_cast(&pSource->x)); + // Shift x&0ffff,z&0xffff,y&0xffff0000,w&0xffff0000 + __m128 vTemp = _mm_and_ps(_mm_castpd_ps(vIntd), g_XMMaskX16Y16Z16W16); + // y and w are signed! Flip the bits to convert the order to unsigned + vTemp = _mm_xor_ps(vTemp, g_XMFlipZW); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // Fix y and w because they are 65536 too large + vTemp = _mm_mul_ps(vTemp, g_XMFixupY16W16); + // y and w + 0x8000 to complete the conversion + vTemp = _mm_add_ps(vTemp, FixaddY16W16); + // Very important! 
The entries are x,z,y,w, flip it to x,y,z,w + return XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 1, 2, 0)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadXDecN4(const XMXDECN4* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + static const uint32_t SignExtend[] = { 0x00000000, 0xFFFFFC00 }; + + uint32_t ElementX = pSource->v & 0x3FF; + uint32_t ElementY = (pSource->v >> 10) & 0x3FF; + uint32_t ElementZ = (pSource->v >> 20) & 0x3FF; + + XMVECTORF32 vResult = { { { + (ElementX == 0x200) ? -1.f : (static_cast(static_cast(ElementX | SignExtend[ElementX >> 9])) / 511.0f), + (ElementY == 0x200) ? -1.f : (static_cast(static_cast(ElementY | SignExtend[ElementY >> 9])) / 511.0f), + (ElementZ == 0x200) ? -1.f : (static_cast(static_cast(ElementZ | SignExtend[ElementZ >> 9])) / 511.0f), + static_cast(pSource->v >> 30) / 3.0f + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vInt = vld1q_dup_u32(reinterpret_cast(pSource)); + vInt = vandq_u32(vInt, g_XMMaskA2B10G10R10); + vInt = veorq_u32(vInt, g_XMFlipA2B10G10R10); + float32x4_t R = vcvtq_f32_s32(vreinterpretq_s32_u32(vInt)); + R = vaddq_f32(R, g_XMFixAA2B10G10R10); + R = vmulq_f32(R, g_XMNormalizeA2B10G10R10); + return vmaxq_f32(R, vdupq_n_f32(-1.0f)); +#elif defined(_XM_SSE_INTRINSICS_) + // Splat the color in all four entries + __m128 vTemp = _mm_load_ps1(reinterpret_cast(&pSource->v)); + // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000 + vTemp = _mm_and_ps(vTemp, g_XMMaskA2B10G10R10); + // a is unsigned! Flip the bit to convert the order to signed + vTemp = _mm_xor_ps(vTemp, g_XMFlipA2B10G10R10); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // RGB + 0, A + 0x80000000.f to undo the signed order. 
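+    // (Note: each lane still holds its field at the original bit offset within the
+    // dword, so the fix-up and normalize constants below appear to be pre-scaled per
+    // lane by 2^10 / 2^20 / 2^30 to divide that positional factor back out.)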
+ vTemp = _mm_add_ps(vTemp, g_XMFixAA2B10G10R10); + // Convert 0-255 to 0.0f-1.0f + vTemp = _mm_mul_ps(vTemp, g_XMNormalizeA2B10G10R10); + // Clamp result (for case of -512) + return _mm_max_ps(vTemp, g_XMNegativeOne); +#endif +} + +//------------------------------------------------------------------------------ +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4996) +// C4996: ignore deprecation warning +#endif + +#ifdef __GNUC__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#endif + +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadXDec4(const XMXDEC4* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + static const uint32_t SignExtend[] = { 0x00000000, 0xFFFFFC00 }; + + uint32_t ElementX = pSource->v & 0x3FF; + uint32_t ElementY = (pSource->v >> 10) & 0x3FF; + uint32_t ElementZ = (pSource->v >> 20) & 0x3FF; + + XMVECTORF32 vResult = { { { + static_cast(static_cast(ElementX | SignExtend[ElementX >> 9])), + static_cast(static_cast(ElementY | SignExtend[ElementY >> 9])), + static_cast(static_cast(ElementZ | SignExtend[ElementZ >> 9])), + static_cast(pSource->v >> 30) + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORU32 XDec4Xor = { { { 0x200, 0x200 << 10, 0x200 << 20, 0x80000000 } } }; + static const XMVECTORF32 XDec4Add = { { { -512.0f, -512.0f * 1024.0f, -512.0f * 1024.0f * 1024.0f, 32768 * 65536.0f } } }; + uint32x4_t vInt = vld1q_dup_u32(reinterpret_cast(pSource)); + vInt = vandq_u32(vInt, g_XMMaskDec4); + vInt = veorq_u32(vInt, XDec4Xor); + float32x4_t R = vcvtq_f32_s32(vreinterpretq_s32_u32(vInt)); + R = vaddq_f32(R, XDec4Add); + return vmulq_f32(R, g_XMMulDec4); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORU32 XDec4Xor = { { { 0x200, 0x200 << 10, 0x200 << 20, 0x80000000 } } }; + static const XMVECTORF32 XDec4Add = { { { -512.0f, -512.0f * 1024.0f, -512.0f * 1024.0f * 1024.0f, 32768 * 65536.0f } } }; + // Splat the color in all four entries + XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast(&pSource->v)); + // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000 + vTemp = _mm_and_ps(vTemp, g_XMMaskDec4); + // a is unsigned! Flip the bit to convert the order to signed + vTemp = _mm_xor_ps(vTemp, XDec4Xor); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // RGB + 0, A + 0x80000000.f to undo the signed order. 
+ vTemp = _mm_add_ps(vTemp, XDec4Add); + // Convert 0-255 to 0.0f-1.0f + vTemp = _mm_mul_ps(vTemp, g_XMMulDec4); + return vTemp; +#endif +} + +#ifdef __GNUC__ +#pragma GCC diagnostic pop +#endif +#ifdef _MSC_VER +#pragma warning(pop) +#endif + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadUDecN4(const XMUDECN4* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + + uint32_t ElementX = pSource->v & 0x3FF; + uint32_t ElementY = (pSource->v >> 10) & 0x3FF; + uint32_t ElementZ = (pSource->v >> 20) & 0x3FF; + + XMVECTORF32 vResult = { { { + static_cast(ElementX) / 1023.0f, + static_cast(ElementY) / 1023.0f, + static_cast(ElementZ) / 1023.0f, + static_cast(pSource->v >> 30) / 3.0f + } } }; + return vResult.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 UDecN4Mul = { { { 1.0f / 1023.0f, 1.0f / (1023.0f * 1024.0f), 1.0f / (1023.0f * 1024.0f * 1024.0f), 1.0f / (3.0f * 1024.0f * 1024.0f * 1024.0f) } } }; + uint32x4_t vInt = vld1q_dup_u32(reinterpret_cast(pSource)); + vInt = vandq_u32(vInt, g_XMMaskDec4); + float32x4_t R = vcvtq_f32_u32(vInt); + return vmulq_f32(R, UDecN4Mul); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 UDecN4Mul = { { { 1.0f / 1023.0f, 1.0f / (1023.0f * 1024.0f), 1.0f / (1023.0f * 1024.0f * 1024.0f), 1.0f / (3.0f * 1024.0f * 1024.0f * 1024.0f) } } }; + // Splat the color in all four entries + XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast(&pSource->v)); + // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000 + vTemp = _mm_and_ps(vTemp, g_XMMaskDec4); + // a is unsigned! Flip the bit to convert the order to signed + vTemp = _mm_xor_ps(vTemp, g_XMFlipW); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // RGB + 0, A + 0x80000000.f to undo the signed order. 
+ vTemp = _mm_add_ps(vTemp, g_XMAddUDec4); + // Convert 0-255 to 0.0f-1.0f + vTemp = _mm_mul_ps(vTemp, UDecN4Mul); + return vTemp; +#endif +} + + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadUDecN4_XR(const XMUDECN4* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + + int32_t ElementX = pSource->v & 0x3FF; + int32_t ElementY = (pSource->v >> 10) & 0x3FF; + int32_t ElementZ = (pSource->v >> 20) & 0x3FF; + + XMVECTORF32 vResult = { { { + static_cast(ElementX - 0x180) / 510.0f, + static_cast(ElementY - 0x180) / 510.0f, + static_cast(ElementZ - 0x180) / 510.0f, + static_cast(pSource->v >> 30) / 3.0f + } } }; + + return vResult.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 XRMul = { { { 1.0f / 510.0f, 1.0f / (510.0f * 1024.0f), 1.0f / (510.0f * 1024.0f * 1024.0f), 1.0f / (3.0f * 1024.0f * 1024.0f * 1024.0f) } } }; + static const XMVECTORI32 XRBias = { { { 0x180, 0x180 * 1024, 0x180 * 1024 * 1024, 0 } } }; + uint32x4_t vInt = vld1q_dup_u32(reinterpret_cast(pSource)); + vInt = vandq_u32(vInt, g_XMMaskDec4); + int32x4_t vTemp = vsubq_s32(vreinterpretq_s32_u32(vInt), XRBias); + vTemp = veorq_s32(vTemp, g_XMFlipW); + float32x4_t R = vcvtq_f32_s32(vTemp); + R = vaddq_f32(R, g_XMAddUDec4); + return vmulq_f32(R, XRMul); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 XRMul = { { { 1.0f / 510.0f, 1.0f / (510.0f * 1024.0f), 1.0f / (510.0f * 1024.0f * 1024.0f), 1.0f / (3.0f * 1024.0f * 1024.0f * 1024.0f) } } }; + static const XMVECTORI32 XRBias = { { { 0x180, 0x180 * 1024, 0x180 * 1024 * 1024, 0 } } }; + // Splat the color in all four entries + XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast(&pSource->v)); + // Mask channels + vTemp = _mm_and_ps(vTemp, g_XMMaskDec4); + // Subtract bias + vTemp = _mm_castsi128_ps(_mm_sub_epi32(_mm_castps_si128(vTemp), XRBias)); + // a is unsigned! Flip the bit to convert the order to signed + vTemp = _mm_xor_ps(vTemp, g_XMFlipW); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // RGB + 0, A + 0x80000000.f to undo the signed order. + vTemp = _mm_add_ps(vTemp, g_XMAddUDec4); + // Convert to 0.0f-1.0f + return _mm_mul_ps(vTemp, XRMul); +#endif +} + + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadUDec4(const XMUDEC4* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + uint32_t ElementX = pSource->v & 0x3FF; + uint32_t ElementY = (pSource->v >> 10) & 0x3FF; + uint32_t ElementZ = (pSource->v >> 20) & 0x3FF; + + XMVECTORF32 vResult = { { { + static_cast(ElementX), + static_cast(ElementY), + static_cast(ElementZ), + static_cast(pSource->v >> 30) + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vInt = vld1q_dup_u32(reinterpret_cast(pSource)); + vInt = vandq_u32(vInt, g_XMMaskDec4); + float32x4_t R = vcvtq_f32_u32(vInt); + return vmulq_f32(R, g_XMMulDec4); +#elif defined(_XM_SSE_INTRINSICS_) + // Splat the color in all four entries + XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast(&pSource->v)); + // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000 + vTemp = _mm_and_ps(vTemp, g_XMMaskDec4); + // a is unsigned! 
Flip the bit to convert the order to signed + vTemp = _mm_xor_ps(vTemp, g_XMFlipW); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // RGB + 0, A + 0x80000000.f to undo the signed order. + vTemp = _mm_add_ps(vTemp, g_XMAddUDec4); + // Convert 0-255 to 0.0f-1.0f + vTemp = _mm_mul_ps(vTemp, g_XMMulDec4); + return vTemp; +#endif +} + +//------------------------------------------------------------------------------ +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4996) +// C4996: ignore deprecation warning +#endif + +#ifdef __GNUC__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#endif + +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadDecN4(const XMDECN4* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + static const uint32_t SignExtend[] = { 0x00000000, 0xFFFFFC00 }; + static const uint32_t SignExtendW[] = { 0x00000000, 0xFFFFFFFC }; + + uint32_t ElementX = pSource->v & 0x3FF; + uint32_t ElementY = (pSource->v >> 10) & 0x3FF; + uint32_t ElementZ = (pSource->v >> 20) & 0x3FF; + uint32_t ElementW = pSource->v >> 30; + + XMVECTORF32 vResult = { { { + (ElementX == 0x200) ? -1.f : (static_cast(static_cast(ElementX | SignExtend[ElementX >> 9])) / 511.0f), + (ElementY == 0x200) ? -1.f : (static_cast(static_cast(ElementY | SignExtend[ElementY >> 9])) / 511.0f), + (ElementZ == 0x200) ? -1.f : (static_cast(static_cast(ElementZ | SignExtend[ElementZ >> 9])) / 511.0f), + (ElementW == 0x2) ? -1.f : static_cast(static_cast(ElementW | SignExtendW[(ElementW >> 1) & 1])) + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 DecN4Mul = { { { 1.0f / 511.0f, 1.0f / (511.0f * 1024.0f), 1.0f / (511.0f * 1024.0f * 1024.0f), 1.0f / (1024.0f * 1024.0f * 1024.0f) } } }; + uint32x4_t vInt = vld1q_dup_u32(reinterpret_cast(pSource)); + vInt = vandq_u32(vInt, g_XMMaskDec4); + vInt = veorq_u32(vInt, g_XMXorDec4); + float32x4_t R = vcvtq_f32_s32(vreinterpretq_s32_u32(vInt)); + R = vaddq_f32(R, g_XMAddDec4); + R = vmulq_f32(R, DecN4Mul); + return vmaxq_f32(R, vdupq_n_f32(-1.0f)); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 DecN4Mul = { { { 1.0f / 511.0f, 1.0f / (511.0f * 1024.0f), 1.0f / (511.0f * 1024.0f * 1024.0f), 1.0f / (1024.0f * 1024.0f * 1024.0f) } } }; + // Splat the color in all four entries + XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast(&pSource->v)); + // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000 + vTemp = _mm_and_ps(vTemp, g_XMMaskDec4); + // a is unsigned! Flip the bit to convert the order to signed + vTemp = _mm_xor_ps(vTemp, g_XMXorDec4); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // RGB + 0, A + 0x80000000.f to undo the signed order. 
+ vTemp = _mm_add_ps(vTemp, g_XMAddDec4); + // Convert 0-255 to 0.0f-1.0f + vTemp = _mm_mul_ps(vTemp, DecN4Mul); + // Clamp result (for case of -512/-1) + return _mm_max_ps(vTemp, g_XMNegativeOne); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadDec4(const XMDEC4* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + static const uint32_t SignExtend[] = { 0x00000000, 0xFFFFFC00 }; + static const uint32_t SignExtendW[] = { 0x00000000, 0xFFFFFFFC }; + + uint32_t ElementX = pSource->v & 0x3FF; + uint32_t ElementY = (pSource->v >> 10) & 0x3FF; + uint32_t ElementZ = (pSource->v >> 20) & 0x3FF; + uint32_t ElementW = pSource->v >> 30; + + XMVECTORF32 vResult = { { { + static_cast(static_cast(ElementX | SignExtend[ElementX >> 9])), + static_cast(static_cast(ElementY | SignExtend[ElementY >> 9])), + static_cast(static_cast(ElementZ | SignExtend[ElementZ >> 9])), + static_cast(static_cast(ElementW | SignExtendW[ElementW >> 1])) + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vInt = vld1q_dup_u32(reinterpret_cast(pSource)); + vInt = vandq_u32(vInt, g_XMMaskDec4); + vInt = veorq_u32(vInt, g_XMXorDec4); + float32x4_t R = vcvtq_f32_s32(vreinterpretq_s32_u32(vInt)); + R = vaddq_f32(R, g_XMAddDec4); + return vmulq_f32(R, g_XMMulDec4); +#elif defined(_XM_SSE_INTRINSICS_) + // Splat the color in all four entries + XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast(&pSource->v)); + // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000 + vTemp = _mm_and_ps(vTemp, g_XMMaskDec4); + // a is unsigned! Flip the bit to convert the order to signed + vTemp = _mm_xor_ps(vTemp, g_XMXorDec4); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // RGB + 0, A + 0x80000000.f to undo the signed order. + vTemp = _mm_add_ps(vTemp, g_XMAddDec4); + // Convert 0-255 to 0.0f-1.0f + vTemp = _mm_mul_ps(vTemp, g_XMMulDec4); + return vTemp; +#endif +} + +#ifdef __GNUC__ +#pragma GCC diagnostic pop +#endif +#ifdef _MSC_VER +#pragma warning(pop) +#endif + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadUByteN4(const XMUBYTEN4* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { { { + static_cast(pSource->x) / 255.0f, + static_cast(pSource->y) / 255.0f, + static_cast(pSource->z) / 255.0f, + static_cast(pSource->w) / 255.0f + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vInt8 = vld1_dup_u32(reinterpret_cast(pSource)); + uint16x8_t vInt16 = vmovl_u8(vreinterpret_u8_u32(vInt8)); + uint32x4_t vInt = vmovl_u16(vget_low_u16(vInt16)); + float32x4_t R = vcvtq_f32_u32(vInt); + return vmulq_n_f32(R, 1.0f / 255.0f); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 LoadUByteN4Mul = { { { 1.0f / 255.0f, 1.0f / (255.0f * 256.0f), 1.0f / (255.0f * 65536.0f), 1.0f / (255.0f * 65536.0f * 256.0f) } } }; + // Splat the color in all four entries (x,z,y,w) + XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast(&pSource->x)); + // Mask x&0ff,y&0xff00,z&0xff0000,w&0xff000000 + vTemp = _mm_and_ps(vTemp, g_XMMaskByte4); + // w is signed! 
Flip the bits to convert the order to unsigned + vTemp = _mm_xor_ps(vTemp, g_XMFlipW); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // w + 0x80 to complete the conversion + vTemp = _mm_add_ps(vTemp, g_XMAddUDec4); + // Fix y, z and w because they are too large + vTemp = _mm_mul_ps(vTemp, LoadUByteN4Mul); + return vTemp; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadUByte4(const XMUBYTE4* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { { { + static_cast(pSource->x), + static_cast(pSource->y), + static_cast(pSource->z), + static_cast(pSource->w) + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vInt8 = vld1_dup_u32(reinterpret_cast(pSource)); + uint16x8_t vInt16 = vmovl_u8(vreinterpret_u8_u32(vInt8)); + uint32x4_t vInt = vmovl_u16(vget_low_u16(vInt16)); + return vcvtq_f32_u32(vInt); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 LoadUByte4Mul = { { { 1.0f, 1.0f / 256.0f, 1.0f / 65536.0f, 1.0f / (65536.0f * 256.0f) } } }; + // Splat the color in all four entries (x,z,y,w) + XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast(&pSource->x)); + // Mask x&0ff,y&0xff00,z&0xff0000,w&0xff000000 + vTemp = _mm_and_ps(vTemp, g_XMMaskByte4); + // w is signed! Flip the bits to convert the order to unsigned + vTemp = _mm_xor_ps(vTemp, g_XMFlipW); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // w + 0x80 to complete the conversion + vTemp = _mm_add_ps(vTemp, g_XMAddUDec4); + // Fix y, z and w because they are too large + vTemp = _mm_mul_ps(vTemp, LoadUByte4Mul); + return vTemp; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadByteN4(const XMBYTEN4* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { { { + (pSource->x == -128) ? -1.f : (static_cast(pSource->x) / 127.0f), + (pSource->y == -128) ? -1.f : (static_cast(pSource->y) / 127.0f), + (pSource->z == -128) ? -1.f : (static_cast(pSource->z) / 127.0f), + (pSource->w == -128) ? -1.f : (static_cast(pSource->w) / 127.0f) + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vInt8 = vld1_dup_u32(reinterpret_cast(pSource)); + int16x8_t vInt16 = vmovl_s8(vreinterpret_s8_u32(vInt8)); + int32x4_t vInt = vmovl_s16(vget_low_s16(vInt16)); + float32x4_t R = vcvtq_f32_s32(vInt); + R = vmulq_n_f32(R, 1.0f / 127.0f); + return vmaxq_f32(R, vdupq_n_f32(-1.f)); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 LoadByteN4Mul = { { { 1.0f / 127.0f, 1.0f / (127.0f * 256.0f), 1.0f / (127.0f * 65536.0f), 1.0f / (127.0f * 65536.0f * 256.0f) } } }; + // Splat the color in all four entries (x,z,y,w) + XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast(&pSource->x)); + // Mask x&0ff,y&0xff00,z&0xff0000,w&0xff000000 + vTemp = _mm_and_ps(vTemp, g_XMMaskByte4); + // x,y and z are unsigned! 
Flip the bits to convert the order to signed + vTemp = _mm_xor_ps(vTemp, g_XMXorByte4); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // x, y and z - 0x80 to complete the conversion + vTemp = _mm_add_ps(vTemp, g_XMAddByte4); + // Fix y, z and w because they are too large + vTemp = _mm_mul_ps(vTemp, LoadByteN4Mul); + // Clamp result (for case of -128) + return _mm_max_ps(vTemp, g_XMNegativeOne); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadByte4(const XMBYTE4* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { { { + static_cast(pSource->x), + static_cast(pSource->y), + static_cast(pSource->z), + static_cast(pSource->w) + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vInt8 = vld1_dup_u32(reinterpret_cast(pSource)); + int16x8_t vInt16 = vmovl_s8(vreinterpret_s8_u32(vInt8)); + int32x4_t vInt = vmovl_s16(vget_low_s16(vInt16)); + return vcvtq_f32_s32(vInt); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 LoadByte4Mul = { { { 1.0f, 1.0f / 256.0f, 1.0f / 65536.0f, 1.0f / (65536.0f * 256.0f) } } }; + // Splat the color in all four entries (x,z,y,w) + XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast(&pSource->x)); + // Mask x&0ff,y&0xff00,z&0xff0000,w&0xff000000 + vTemp = _mm_and_ps(vTemp, g_XMMaskByte4); + // x,y and z are unsigned! Flip the bits to convert the order to signed + vTemp = _mm_xor_ps(vTemp, g_XMXorByte4); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // x, y and z - 0x80 to complete the conversion + vTemp = _mm_add_ps(vTemp, g_XMAddByte4); + // Fix y, z and w because they are too large + vTemp = _mm_mul_ps(vTemp, LoadByte4Mul); + return vTemp; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadUNibble4(const XMUNIBBLE4* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { { { + float(pSource->v & 0xF), + float((pSource->v >> 4) & 0xF), + float((pSource->v >> 8) & 0xF), + float((pSource->v >> 12) & 0xF) + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORI32 UNibble4And = { { { 0xF, 0xF0, 0xF00, 0xF000 } } }; + static const XMVECTORF32 UNibble4Mul = { { { 1.0f, 1.0f / 16.f, 1.0f / 256.f, 1.0f / 4096.f } } }; + uint16x4_t vInt16 = vld1_dup_u16(reinterpret_cast(pSource)); + uint32x4_t vInt = vmovl_u16(vInt16); + vInt = vandq_u32(vInt, UNibble4And); + float32x4_t R = vcvtq_f32_u32(vInt); + return vmulq_f32(R, UNibble4Mul); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORI32 UNibble4And = { { { 0xF, 0xF0, 0xF00, 0xF000 } } }; + static const XMVECTORF32 UNibble4Mul = { { { 1.0f, 1.0f / 16.f, 1.0f / 256.f, 1.0f / 4096.f } } }; + // Get the 16 bit value and splat it + __m128i vInt = XM_LOADU_SI16(&pSource->v); + XMVECTOR vResult = XM_PERMUTE_PS(_mm_castsi128_ps(vInt), _MM_SHUFFLE(0,0,0,0)); + // Mask off x, y and z + vResult = _mm_and_ps(vResult, UNibble4And); + // Convert to float + vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult)); + // Normalize x, y, and z + vResult = _mm_mul_ps(vResult, UNibble4Mul); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadU555(const XMU555* 
pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { { { + float(pSource->v & 0x1F), + float((pSource->v >> 5) & 0x1F), + float((pSource->v >> 10) & 0x1F), + float((pSource->v >> 15) & 0x1) + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORI32 U555And = { { { 0x1F, 0x1F << 5, 0x1F << 10, 0x8000 } } }; + static const XMVECTORF32 U555Mul = { { { 1.0f, 1.0f / 32.f, 1.0f / 1024.f, 1.0f / 32768.f } } }; + uint16x4_t vInt16 = vld1_dup_u16(reinterpret_cast(pSource)); + uint32x4_t vInt = vmovl_u16(vInt16); + vInt = vandq_u32(vInt, U555And); + float32x4_t R = vcvtq_f32_u32(vInt); + return vmulq_f32(R, U555Mul); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORI32 U555And = { { { 0x1F, 0x1F << 5, 0x1F << 10, 0x8000 } } }; + static const XMVECTORF32 U555Mul = { { { 1.0f, 1.0f / 32.f, 1.0f / 1024.f, 1.0f / 32768.f } } }; + // Get the 16bit value and splat it + __m128i vInt = XM_LOADU_SI16(&pSource->v); + XMVECTOR vResult = XM_PERMUTE_PS(_mm_castsi128_ps(vInt), _MM_SHUFFLE(0, 0, 0, 0)); + // Mask off x, y and z + vResult = _mm_and_ps(vResult, U555And); + // Convert to float + vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult)); + // Normalize x, y, and z + vResult = _mm_mul_ps(vResult, U555Mul); + return vResult; +#endif +} + +#ifdef _PREFAST_ +#pragma prefast(pop) +#endif + +/**************************************************************************** + * + * Vector and matrix store operations + * + ****************************************************************************/ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreColor +( + XMCOLOR* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N = XMVectorSaturate(V); + N = XMVectorMultiply(N, g_UByteMax); + N = XMVectorRound(N); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N); + + pDestination->c = (static_cast(tmp.w) << 24) | + (static_cast(tmp.x) << 16) | + (static_cast(tmp.y) << 8) | + static_cast(tmp.z); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t R = vmaxq_f32(V, vdupq_n_f32(0)); + R = vminq_f32(R, vdupq_n_f32(1.0f)); + R = vmulq_n_f32(R, 255.0f); + R = XMVectorRound(R); + uint32x4_t vInt32 = vcvtq_u32_f32(R); + uint16x4_t vInt16 = vqmovn_u32(vInt32); + uint8x8_t vInt8 = vqmovn_u16(vcombine_u16(vInt16, vInt16)); + uint32_t rgba = vget_lane_u32(vreinterpret_u32_u8(vInt8), 0); + pDestination->c = (rgba & 0xFF00FF00) | ((rgba >> 16) & 0xFF) | ((rgba << 16) & 0xFF0000); +#elif defined(_XM_SSE_INTRINSICS_) + // Set <0 to 0 + XMVECTOR vResult = _mm_max_ps(V, g_XMZero); + // Set>1 to 1 + vResult = _mm_min_ps(vResult, g_XMOne); + // Convert to 0-255 + vResult = _mm_mul_ps(vResult, g_UByteMax); + // Shuffle RGBA to ARGB + vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(3, 0, 1, 2)); + // Convert to int + __m128i vInt = _mm_cvtps_epi32(vResult); + // Mash to shorts + vInt = _mm_packs_epi32(vInt, vInt); + // Mash to bytes + vInt = _mm_packus_epi16(vInt, vInt); + // Store the color + _mm_store_ss(reinterpret_cast(&pDestination->c), _mm_castsi128_ps(vInt)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreHalf2 +( + XMHALF2* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + __m128i V1 = _mm_cvtps_ph(V, _MM_FROUND_TO_NEAREST_INT); + _mm_store_ss(reinterpret_cast(pDestination), _mm_castsi128_ps(V1)); 
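+    // (_mm_cvtps_ph packs four half-precision values into the low 64 bits; the
+    // 32-bit scalar store above keeps only the x and y halves needed for XMHALF2.)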
+#else + pDestination->x = XMConvertFloatToHalf(XMVectorGetX(V)); + pDestination->y = XMConvertFloatToHalf(XMVectorGetY(V)); +#endif // !_XM_F16C_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreShortN2 +( + XMSHORTN2* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v); + N = XMVectorMultiply(N, g_ShortMax); + N = XMVectorRound(N); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N); + + pDestination->x = static_cast(tmp.x); + pDestination->y = static_cast(tmp.y); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t R = vmaxq_f32(V, vdupq_n_f32(-1.f)); + R = vminq_f32(R, vdupq_n_f32(1.0f)); + R = vmulq_n_f32(R, 32767.0f); + int32x4_t vInt32 = vcvtq_s32_f32(R); + int16x4_t vInt16 = vqmovn_s32(vInt32); + vst1_lane_u32(&pDestination->v, vreinterpret_u32_s16(vInt16), 0); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = _mm_max_ps(V, g_XMNegativeOne); + vResult = _mm_min_ps(vResult, g_XMOne); + vResult = _mm_mul_ps(vResult, g_ShortMax); + __m128i vResulti = _mm_cvtps_epi32(vResult); + vResulti = _mm_packs_epi32(vResulti, vResulti); + _mm_store_ss(reinterpret_cast(&pDestination->x), _mm_castsi128_ps(vResulti)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreShort2 +( + XMSHORT2* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N = XMVectorClamp(V, g_ShortMin, g_ShortMax); + N = XMVectorRound(N); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N); + + pDestination->x = static_cast(tmp.x); + pDestination->y = static_cast(tmp.y); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t R = vmaxq_f32(V, vdupq_n_f32(-32767.f)); + R = vminq_f32(R, vdupq_n_f32(32767.0f)); + int32x4_t vInt32 = vcvtq_s32_f32(R); + int16x4_t vInt16 = vqmovn_s32(vInt32); + vst1_lane_u32(&pDestination->v, vreinterpret_u32_s16(vInt16), 0); +#elif defined(_XM_SSE_INTRINSICS_) + // Bounds check + XMVECTOR vResult = _mm_max_ps(V, g_ShortMin); + vResult = _mm_min_ps(vResult, g_ShortMax); + // Convert to int with rounding + __m128i vInt = _mm_cvtps_epi32(vResult); + // Pack the ints into shorts + vInt = _mm_packs_epi32(vInt, vInt); + _mm_store_ss(reinterpret_cast(&pDestination->x), _mm_castsi128_ps(vInt)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreUShortN2 +( + XMUSHORTN2* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N = XMVectorSaturate(V); + N = XMVectorMultiplyAdd(N, g_UShortMax, g_XMOneHalf.v); + N = XMVectorTruncate(N); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N); + + pDestination->x = static_cast(tmp.x); + pDestination->y = static_cast(tmp.y); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t R = vmaxq_f32(V, vdupq_n_f32(0.f)); + R = vminq_f32(R, vdupq_n_f32(1.0f)); + R = vmulq_n_f32(R, 65535.0f); + R = vaddq_f32(R, g_XMOneHalf); + uint32x4_t vInt32 = vcvtq_u32_f32(R); + uint16x4_t vInt16 = vqmovn_u32(vInt32); + vst1_lane_u32(&pDestination->v, vreinterpret_u32_u16(vInt16), 0); +#elif defined(_XM_SSE_INTRINSICS_) + // Bounds check + XMVECTOR vResult = _mm_max_ps(V, g_XMZero); + vResult = _mm_min_ps(vResult, g_XMOne); + vResult = _mm_mul_ps(vResult, g_UShortMax); + 
vResult = _mm_add_ps(vResult, g_XMOneHalf); + // Convert to int + __m128i vInt = _mm_cvttps_epi32(vResult); + // Since the SSE pack instruction clamps using signed rules, + // manually extract the values to store them to memory + pDestination->x = static_cast(_mm_extract_epi16(vInt, 0)); + pDestination->y = static_cast(_mm_extract_epi16(vInt, 2)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreUShort2 +( + XMUSHORT2* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N = XMVectorClamp(V, XMVectorZero(), g_UShortMax); + N = XMVectorRound(N); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N); + + pDestination->x = static_cast(tmp.x); + pDestination->y = static_cast(tmp.y); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t R = vmaxq_f32(V, vdupq_n_f32(0.f)); + R = vminq_f32(R, vdupq_n_f32(65535.0f)); + uint32x4_t vInt32 = vcvtq_u32_f32(R); + uint16x4_t vInt16 = vqmovn_u32(vInt32); + vst1_lane_u32(&pDestination->v, vreinterpret_u32_u16(vInt16), 0); +#elif defined(_XM_SSE_INTRINSICS_) + // Bounds check + XMVECTOR vResult = _mm_max_ps(V, g_XMZero); + vResult = _mm_min_ps(vResult, g_UShortMax); + // Convert to int with rounding + __m128i vInt = _mm_cvtps_epi32(vResult); + // Since the SSE pack instruction clamps using signed rules, + // manually extract the values to store them to memory + pDestination->x = static_cast(_mm_extract_epi16(vInt, 0)); + pDestination->y = static_cast(_mm_extract_epi16(vInt, 2)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreByteN2 +( + XMBYTEN2* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v); + N = XMVectorMultiply(N, g_ByteMax); + N = XMVectorRound(N); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N); + + pDestination->x = static_cast(tmp.x); + pDestination->y = static_cast(tmp.y); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t R = vmaxq_f32(V, vdupq_n_f32(-1.f)); + R = vminq_f32(R, vdupq_n_f32(1.0f)); + R = vmulq_n_f32(R, 127.0f); + int32x4_t vInt32 = vcvtq_s32_f32(R); + int16x4_t vInt16 = vqmovn_s32(vInt32); + int8x8_t vInt8 = vqmovn_s16(vcombine_s16(vInt16, vInt16)); + vst1_lane_u16(reinterpret_cast(pDestination), vreinterpret_u16_s8(vInt8), 0); +#elif defined(_XM_SSE_INTRINSICS_) + // Clamp to bounds + XMVECTOR vResult = _mm_max_ps(V, g_XMNegativeOne); + vResult = _mm_min_ps(vResult, g_XMOne); + // Scale by multiplication + vResult = _mm_mul_ps(vResult, g_ByteMax); + // Convert to int by rounding + __m128i vInt = _mm_cvtps_epi32(vResult); + // No SSE operations will write to 16-bit values, so we have to extract them manually + auto x = static_cast(_mm_extract_epi16(vInt, 0)); + auto y = static_cast(_mm_extract_epi16(vInt, 2)); + pDestination->v = static_cast(((static_cast(y) & 0xFF) << 8) | (static_cast(x) & 0xFF)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreByte2 +( + XMBYTE2* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N = XMVectorClamp(V, g_ByteMin, g_ByteMax); + N = XMVectorRound(N); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N); + + pDestination->x = static_cast(tmp.x); + pDestination->y = 
static_cast(tmp.y); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t R = vmaxq_f32(V, vdupq_n_f32(-127.f)); + R = vminq_f32(R, vdupq_n_f32(127.0f)); + int32x4_t vInt32 = vcvtq_s32_f32(R); + int16x4_t vInt16 = vqmovn_s32(vInt32); + int8x8_t vInt8 = vqmovn_s16(vcombine_s16(vInt16, vInt16)); + vst1_lane_u16(reinterpret_cast(pDestination), vreinterpret_u16_s8(vInt8), 0); +#elif defined(_XM_SSE_INTRINSICS_) + // Clamp to bounds + XMVECTOR vResult = _mm_max_ps(V, g_ByteMin); + vResult = _mm_min_ps(vResult, g_ByteMax); + // Convert to int by rounding + __m128i vInt = _mm_cvtps_epi32(vResult); + // No SSE operations will write to 16-bit values, so we have to extract them manually + auto x = static_cast(_mm_extract_epi16(vInt, 0)); + auto y = static_cast(_mm_extract_epi16(vInt, 2)); + pDestination->v = static_cast(((static_cast(y) & 0xFF) << 8) | (static_cast(x) & 0xFF)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreUByteN2 +( + XMUBYTEN2* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N = XMVectorSaturate(V); + N = XMVectorMultiplyAdd(N, g_UByteMax, g_XMOneHalf.v); + N = XMVectorTruncate(N); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N); + + pDestination->x = static_cast(tmp.x); + pDestination->y = static_cast(tmp.y); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t R = vmaxq_f32(V, vdupq_n_f32(0.f)); + R = vminq_f32(R, vdupq_n_f32(1.0f)); + R = vmulq_n_f32(R, 255.0f); + R = vaddq_f32(R, g_XMOneHalf); + uint32x4_t vInt32 = vcvtq_u32_f32(R); + uint16x4_t vInt16 = vqmovn_u32(vInt32); + uint8x8_t vInt8 = vqmovn_u16(vcombine_u16(vInt16, vInt16)); + vst1_lane_u16(reinterpret_cast(pDestination), vreinterpret_u16_u8(vInt8), 0); +#elif defined(_XM_SSE_INTRINSICS_) + // Clamp to bounds + XMVECTOR vResult = _mm_max_ps(V, g_XMZero); + vResult = _mm_min_ps(vResult, g_XMOne); + // Scale by multiplication + vResult = _mm_mul_ps(vResult, g_UByteMax); + vResult = _mm_add_ps(vResult, g_XMOneHalf); + // Convert to int + __m128i vInt = _mm_cvttps_epi32(vResult); + // No SSE operations will write to 16-bit values, so we have to extract them manually + auto x = static_cast(_mm_extract_epi16(vInt, 0)); + auto y = static_cast(_mm_extract_epi16(vInt, 2)); + pDestination->v = static_cast(((static_cast(y) & 0xFF) << 8) | (static_cast(x) & 0xFF)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreUByte2 +( + XMUBYTE2* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N = XMVectorClamp(V, XMVectorZero(), g_UByteMax); + N = XMVectorRound(N); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N); + + pDestination->x = static_cast(tmp.x); + pDestination->y = static_cast(tmp.y); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t R = vmaxq_f32(V, vdupq_n_f32(0.f)); + R = vminq_f32(R, vdupq_n_f32(255.0f)); + uint32x4_t vInt32 = vcvtq_u32_f32(R); + uint16x4_t vInt16 = vqmovn_u32(vInt32); + uint8x8_t vInt8 = vqmovn_u16(vcombine_u16(vInt16, vInt16)); + vst1_lane_u16(reinterpret_cast(pDestination), vreinterpret_u16_u8(vInt8), 0); +#elif defined(_XM_SSE_INTRINSICS_) + // Clamp to bounds + XMVECTOR vResult = _mm_max_ps(V, g_XMZero); + vResult = _mm_min_ps(vResult, g_UByteMax); + // Convert to int by rounding + __m128i vInt = _mm_cvtps_epi32(vResult); + // No SSE operations will write to 16-bit 
values, so we have to extract them manually + auto x = static_cast(_mm_extract_epi16(vInt, 0)); + auto y = static_cast(_mm_extract_epi16(vInt, 2)); + pDestination->v = static_cast(((static_cast(y) & 0xFF) << 8) | (static_cast(x) & 0xFF)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreU565 +( + XMU565* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); + static const XMVECTORF32 Max = { { { 31.0f, 63.0f, 31.0f, 0.0f } } }; + +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR N = XMVectorClamp(V, XMVectorZero(), Max.v); + N = XMVectorRound(N); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N); + + pDestination->v = static_cast( + ((static_cast(tmp.z) & 0x1F) << 11) + | ((static_cast(tmp.y) & 0x3F) << 5) + | ((static_cast(tmp.x) & 0x1F))); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 Scale = { { { 1.0f, 32.f, 32.f * 64.f, 0.f } } }; + static const XMVECTORU32 Mask = { { { 0x1F, 0x3F << 5, 0x1F << 11, 0 } } }; + float32x4_t vResult = vmaxq_f32(V, vdupq_n_f32(0)); + vResult = vminq_f32(vResult, Max); + vResult = vmulq_f32(vResult, Scale); + uint32x4_t vResulti = vcvtq_u32_f32(vResult); + vResulti = vandq_u32(vResulti, Mask); + // Do a horizontal or of 4 entries + uint32x2_t vTemp = vget_low_u32(vResulti); + uint32x2_t vhi = vget_high_u32(vResulti); + vTemp = vorr_u32(vTemp, vhi); + vTemp = vpadd_u32(vTemp, vTemp); + vst1_lane_u16(&pDestination->v, vreinterpret_u16_u32(vTemp), 0); +#elif defined(_XM_SSE_INTRINSICS_) + // Bounds check + XMVECTOR vResult = _mm_max_ps(V, g_XMZero); + vResult = _mm_min_ps(vResult, Max); + // Convert to int with rounding + __m128i vInt = _mm_cvtps_epi32(vResult); + // No SSE operations will write to 16-bit values, so we have to extract them manually + auto x = static_cast(_mm_extract_epi16(vInt, 0)); + auto y = static_cast(_mm_extract_epi16(vInt, 2)); + auto z = static_cast(_mm_extract_epi16(vInt, 4)); + pDestination->v = static_cast( + ((static_cast(z) & 0x1F) << 11) + | ((static_cast(y) & 0x3F) << 5) + | ((static_cast(x) & 0x1F))); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat3PK +( + XMFLOAT3PK* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); + + XM_ALIGNED_DATA(16) uint32_t IValue[4]; + XMStoreFloat3A(reinterpret_cast(&IValue), V); + + uint32_t Result[3]; + + // X & Y Channels (5-bit exponent, 6-bit mantissa) + for (uint32_t j = 0; j < 2; ++j) + { + uint32_t Sign = IValue[j] & 0x80000000; + uint32_t I = IValue[j] & 0x7FFFFFFF; + + if ((I & 0x7F800000) == 0x7F800000) + { + // INF or NAN + Result[j] = 0x7C0U; + if ((I & 0x7FFFFF) != 0) + { + Result[j] = 0x7FFU; + } + else if (Sign) + { + // -INF is clamped to 0 since 3PK is positive only + Result[j] = 0; + } + } + else if (Sign || I < 0x35800000) + { + // 3PK is positive only, so clamp to zero + Result[j] = 0; + } + else if (I > 0x477E0000U) + { + // The number is too large to be represented as a float11, set to max + Result[j] = 0x7BFU; + } + else + { + if (I < 0x38800000U) + { + // The number is too small to be represented as a normalized float11 + // Convert it to a denormalized value. 
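+                // (float11 has a 5-bit exponent biased by 15, so its smallest normalized
+                // value is 2^-14, i.e. a float32 biased exponent of 113; smaller inputs
+                // keep the implicit leading 1 and are shifted right into the denormal range.)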
+ uint32_t Shift = 113U - (I >> 23U); + I = (0x800000U | (I & 0x7FFFFFU)) >> Shift; + } + else + { + // Rebias the exponent to represent the value as a normalized float11 + I += 0xC8000000U; + } + + Result[j] = ((I + 0xFFFFU + ((I >> 17U) & 1U)) >> 17U) & 0x7ffU; + } + } + + // Z Channel (5-bit exponent, 5-bit mantissa) + uint32_t Sign = IValue[2] & 0x80000000; + uint32_t I = IValue[2] & 0x7FFFFFFF; + + if ((I & 0x7F800000) == 0x7F800000) + { + // INF or NAN + Result[2] = 0x3E0U; + if (I & 0x7FFFFF) + { + Result[2] = 0x3FFU; + } + else if (Sign || I < 0x36000000) + { + // -INF is clamped to 0 since 3PK is positive only + Result[2] = 0; + } + } + else if (Sign) + { + // 3PK is positive only, so clamp to zero + Result[2] = 0; + } + else if (I > 0x477C0000U) + { + // The number is too large to be represented as a float10, set to max + Result[2] = 0x3DFU; + } + else + { + if (I < 0x38800000U) + { + // The number is too small to be represented as a normalized float10 + // Convert it to a denormalized value. + uint32_t Shift = 113U - (I >> 23U); + I = (0x800000U | (I & 0x7FFFFFU)) >> Shift; + } + else + { + // Rebias the exponent to represent the value as a normalized float10 + I += 0xC8000000U; + } + + Result[2] = ((I + 0x1FFFFU + ((I >> 18U) & 1U)) >> 18U) & 0x3ffU; + } + + // Pack Result into memory + pDestination->v = (Result[0] & 0x7ff) + | ((Result[1] & 0x7ff) << 11) + | ((Result[2] & 0x3ff) << 22); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat3SE +( + XMFLOAT3SE* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); + + XMFLOAT3A tmp; + XMStoreFloat3A(&tmp, V); + + static constexpr float maxf9 = float(0x1FF << 7); + static constexpr float minf9 = float(1.f / (1 << 16)); + + float x = (tmp.x >= 0.f) ? ((tmp.x > maxf9) ? maxf9 : tmp.x) : 0.f; + float y = (tmp.y >= 0.f) ? ((tmp.y > maxf9) ? maxf9 : tmp.y) : 0.f; + float z = (tmp.z >= 0.f) ? ((tmp.z > maxf9) ? maxf9 : tmp.z) : 0.f; + + const float max_xy = (x > y) ? x : y; + const float max_xyz = (max_xy > z) ? max_xy : z; + + const float maxColor = (max_xyz > minf9) ? 
max_xyz : minf9; + + union { float f; int32_t i; } fi; + fi.f = maxColor; + fi.i += 0x00004000; // round up leaving 9 bits in fraction (including assumed 1) + + auto exp = static_cast(fi.i) >> 23; + pDestination->e = exp - 0x6f; + + fi.i = static_cast(0x83000000 - (exp << 23)); + float ScaleR = fi.f; + + pDestination->xm = static_cast(Internal::round_to_nearest(x * ScaleR)); + pDestination->ym = static_cast(Internal::round_to_nearest(y * ScaleR)); + pDestination->zm = static_cast(Internal::round_to_nearest(z * ScaleR)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreHalf4 +( + XMHALF4* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + __m128i V1 = _mm_cvtps_ph(V, _MM_FROUND_TO_NEAREST_INT); + _mm_storel_epi64(reinterpret_cast<__m128i*>(pDestination), V1); +#else + XMFLOAT4A t; + XMStoreFloat4A(&t, V); + + pDestination->x = XMConvertFloatToHalf(t.x); + pDestination->y = XMConvertFloatToHalf(t.y); + pDestination->z = XMConvertFloatToHalf(t.z); + pDestination->w = XMConvertFloatToHalf(t.w); +#endif // !_XM_F16C_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreShortN4 +( + XMSHORTN4* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v); + N = XMVectorMultiply(N, g_ShortMax); + N = XMVectorRound(N); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N); + + pDestination->x = static_cast(tmp.x); + pDestination->y = static_cast(tmp.y); + pDestination->z = static_cast(tmp.z); + pDestination->w = static_cast(tmp.w); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t vResult = vmaxq_f32(V, vdupq_n_f32(-1.f)); + vResult = vminq_f32(vResult, vdupq_n_f32(1.0f)); + vResult = vmulq_n_f32(vResult, 32767.0f); + int16x4_t vInt = vmovn_s32(vcvtq_s32_f32(vResult)); + vst1_s16(reinterpret_cast(pDestination), vInt); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = _mm_max_ps(V, g_XMNegativeOne); + vResult = _mm_min_ps(vResult, g_XMOne); + vResult = _mm_mul_ps(vResult, g_ShortMax); + __m128i vResulti = _mm_cvtps_epi32(vResult); + vResulti = _mm_packs_epi32(vResulti, vResulti); + _mm_store_sd(reinterpret_cast(&pDestination->x), _mm_castsi128_pd(vResulti)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreShort4 +( + XMSHORT4* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N = XMVectorClamp(V, g_ShortMin, g_ShortMax); + N = XMVectorRound(N); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N); + + pDestination->x = static_cast(tmp.x); + pDestination->y = static_cast(tmp.y); + pDestination->z = static_cast(tmp.z); + pDestination->w = static_cast(tmp.w); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t vResult = vmaxq_f32(V, g_ShortMin); + vResult = vminq_f32(vResult, g_ShortMax); + int16x4_t vInt = vmovn_s32(vcvtq_s32_f32(vResult)); + vst1_s16(reinterpret_cast(pDestination), vInt); +#elif defined(_XM_SSE_INTRINSICS_) + // Bounds check + XMVECTOR vResult = _mm_max_ps(V, g_ShortMin); + vResult = _mm_min_ps(vResult, g_ShortMax); + // Convert to int with rounding + __m128i vInt = _mm_cvtps_epi32(vResult); + // Pack the ints into shorts + vInt = 
_mm_packs_epi32(vInt, vInt); + _mm_store_sd(reinterpret_cast(&pDestination->x), _mm_castsi128_pd(vInt)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreUShortN4 +( + XMUSHORTN4* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N = XMVectorSaturate(V); + N = XMVectorMultiplyAdd(N, g_UShortMax, g_XMOneHalf.v); + N = XMVectorTruncate(N); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N); + + pDestination->x = static_cast(tmp.x); + pDestination->y = static_cast(tmp.y); + pDestination->z = static_cast(tmp.z); + pDestination->w = static_cast(tmp.w); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t vResult = vmaxq_f32(V, vdupq_n_f32(0)); + vResult = vminq_f32(vResult, vdupq_n_f32(1.0f)); + vResult = vmulq_n_f32(vResult, 65535.0f); + vResult = vaddq_f32(vResult, g_XMOneHalf); + uint16x4_t vInt = vmovn_u32(vcvtq_u32_f32(vResult)); + vst1_u16(reinterpret_cast(pDestination), vInt); +#elif defined(_XM_SSE_INTRINSICS_) + // Bounds check + XMVECTOR vResult = _mm_max_ps(V, g_XMZero); + vResult = _mm_min_ps(vResult, g_XMOne); + vResult = _mm_mul_ps(vResult, g_UShortMax); + vResult = _mm_add_ps(vResult, g_XMOneHalf); + // Convert to int + __m128i vInt = _mm_cvttps_epi32(vResult); + // Since the SSE pack instruction clamps using signed rules, + // manually extract the values to store them to memory + pDestination->x = static_cast(_mm_extract_epi16(vInt, 0)); + pDestination->y = static_cast(_mm_extract_epi16(vInt, 2)); + pDestination->z = static_cast(_mm_extract_epi16(vInt, 4)); + pDestination->w = static_cast(_mm_extract_epi16(vInt, 6)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreUShort4 +( + XMUSHORT4* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N = XMVectorClamp(V, XMVectorZero(), g_UShortMax); + N = XMVectorRound(N); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N); + + pDestination->x = static_cast(tmp.x); + pDestination->y = static_cast(tmp.y); + pDestination->z = static_cast(tmp.z); + pDestination->w = static_cast(tmp.w); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t vResult = vmaxq_f32(V, vdupq_n_f32(0)); + vResult = vminq_f32(vResult, g_UShortMax); + uint16x4_t vInt = vmovn_u32(vcvtq_u32_f32(vResult)); + vst1_u16(reinterpret_cast(pDestination), vInt); +#elif defined(_XM_SSE_INTRINSICS_) + // Bounds check + XMVECTOR vResult = _mm_max_ps(V, g_XMZero); + vResult = _mm_min_ps(vResult, g_UShortMax); + // Convert to int with rounding + __m128i vInt = _mm_cvtps_epi32(vResult); + // Since the SSE pack instruction clamps using signed rules, + // manually extract the values to store them to memory + pDestination->x = static_cast(_mm_extract_epi16(vInt, 0)); + pDestination->y = static_cast(_mm_extract_epi16(vInt, 2)); + pDestination->z = static_cast(_mm_extract_epi16(vInt, 4)); + pDestination->w = static_cast(_mm_extract_epi16(vInt, 6)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreXDecN4 +( + XMXDECN4* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); + static const XMVECTORF32 Min = { { { -1.0f, -1.0f, -1.0f, 0.0f } } }; + +#if defined(_XM_NO_INTRINSICS_) + + static const XMVECTORF32 Scale = { { { 511.0f, 511.0f, 511.0f, 3.0f } } }; + + 
XMVECTOR N = XMVectorClamp(V, Min.v, g_XMOne.v); + N = XMVectorMultiply(N, Scale.v); + N = XMVectorRound(N); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N); + + pDestination->v = static_cast( + (static_cast(tmp.w) << 30) + | ((static_cast(tmp.z) & 0x3FF) << 20) + | ((static_cast(tmp.y) & 0x3FF) << 10) + | (static_cast(tmp.x) & 0x3FF)); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 Scale = { { { 511.0f, 511.0f * 1024.0f, 511.0f * 1048576.0f, 3.0f * 536870912.0f } } }; + static const XMVECTORI32 ScaleMask = { { { 0x3FF, 0x3FF << 10, 0x3FF << 20, 0x3 << 29 } } }; + float32x4_t vResult = vmaxq_f32(V, Min); + vResult = vminq_f32(vResult, vdupq_n_f32(1.0f)); + vResult = vmulq_f32(vResult, Scale); + int32x4_t vResulti = vcvtq_s32_f32(vResult); + vResulti = vandq_s32(vResulti, ScaleMask); + int32x4_t vResultw = vandq_s32(vResulti, g_XMMaskW); + vResulti = vaddq_s32(vResulti, vResultw); + // Do a horizontal or of all 4 entries + uint32x2_t vTemp = vget_low_u32(vreinterpretq_u32_s32(vResulti)); + uint32x2_t vhi = vget_high_u32(vreinterpretq_u32_s32(vResulti)); + vTemp = vorr_u32(vTemp, vhi); + vTemp = vpadd_u32(vTemp, vTemp); + vst1_lane_u32(&pDestination->v, vTemp, 0); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Scale = { { { 511.0f, 511.0f * 1024.0f, 511.0f * 1048576.0f, 3.0f * 536870912.0f } } }; + static const XMVECTORI32 ScaleMask = { { { 0x3FF, 0x3FF << 10, 0x3FF << 20, 0x3 << 29 } } }; + XMVECTOR vResult = _mm_max_ps(V, Min); + vResult = _mm_min_ps(vResult, g_XMOne); + // Scale by multiplication + vResult = _mm_mul_ps(vResult, Scale); + // Convert to int (W is unsigned) + __m128i vResulti = _mm_cvtps_epi32(vResult); + // Mask off any fraction + vResulti = _mm_and_si128(vResulti, ScaleMask); + // To fix W, add itself to shift it up to <<30 instead of <<29 + __m128i vResultw = _mm_and_si128(vResulti, g_XMMaskW); + vResulti = _mm_add_epi32(vResulti, vResultw); + // Do a horizontal or of all 4 entries + vResult = XM_PERMUTE_PS(_mm_castsi128_ps(vResulti), _MM_SHUFFLE(0, 3, 2, 1)); + vResulti = _mm_or_si128(vResulti, _mm_castps_si128(vResult)); + vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(0, 3, 2, 1)); + vResulti = _mm_or_si128(vResulti, _mm_castps_si128(vResult)); + vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(0, 3, 2, 1)); + vResulti = _mm_or_si128(vResulti, _mm_castps_si128(vResult)); + _mm_store_ss(reinterpret_cast(&pDestination->v), _mm_castsi128_ps(vResulti)); +#endif +} + +//------------------------------------------------------------------------------ +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4996) +// C4996: ignore deprecation warning +#endif + +#ifdef __GNUC__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#endif + +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreXDec4 +( + XMXDEC4* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); + static const XMVECTORF32 MinXDec4 = { { { -511.0f, -511.0f, -511.0f, 0.0f } } }; + static const XMVECTORF32 MaxXDec4 = { { { 511.0f, 511.0f, 511.0f, 3.0f } } }; + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N = XMVectorClamp(V, MinXDec4, MaxXDec4); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N); + + pDestination->v = static_cast( + (static_cast(tmp.w) << 30) + | ((static_cast(tmp.z) & 0x3FF) << 20) + | ((static_cast(tmp.y) & 0x3FF) << 10) + | ((static_cast(tmp.x) & 0x3FF))); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 ScaleXDec4 = { { { 1.0f, 1024.0f / 2.0f, 1024.0f * 1024.0f, 1024.0f * 1024.0f * 
1024.0f / 2.0f } } }; + static const XMVECTORI32 MaskXDec4 = { { { 0x3FF, 0x3FF << (10 - 1), 0x3FF << 20, 0x3 << (30 - 1) } } }; + float32x4_t vResult = vmaxq_f32(V, MinXDec4); + vResult = vminq_f32(vResult, MaxXDec4); + vResult = vmulq_f32(vResult, ScaleXDec4); + int32x4_t vResulti = vcvtq_s32_f32(vResult); + vResulti = vandq_s32(vResulti, MaskXDec4); + // Do a horizontal or of 4 entries + uint32x2_t vTemp = vget_low_u32(vreinterpretq_u32_s32(vResulti)); + uint32x2_t vTemp2 = vget_high_u32(vreinterpretq_u32_s32(vResulti)); + vTemp = vorr_u32(vTemp, vTemp2); + // Perform a single bit left shift on y|w + vTemp2 = vdup_lane_u32(vTemp, 1); + vTemp2 = vadd_u32(vTemp2, vTemp2); + vTemp = vorr_u32(vTemp, vTemp2); + vst1_lane_u32(&pDestination->v, vTemp, 0); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 ScaleXDec4 = { { { 1.0f, 1024.0f / 2.0f, 1024.0f * 1024.0f, 1024.0f * 1024.0f * 1024.0f / 2.0f } } }; + static const XMVECTORI32 MaskXDec4 = { { { 0x3FF, 0x3FF << (10 - 1), 0x3FF << 20, 0x3 << (30 - 1) } } }; + // Clamp to bounds + XMVECTOR vResult = _mm_max_ps(V, MinXDec4); + vResult = _mm_min_ps(vResult, MaxXDec4); + // Scale by multiplication + vResult = _mm_mul_ps(vResult, ScaleXDec4); + // Convert to int + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Mask off any fraction + vResulti = _mm_and_si128(vResulti, MaskXDec4); + // Do a horizontal or of 4 entries + __m128i vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(3, 2, 3, 2)); + // x = x|z, y = y|w + vResulti = _mm_or_si128(vResulti, vResulti2); + // Move Z to the x position + vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(1, 1, 1, 1)); + // Perform a single bit left shift on y|w + vResulti2 = _mm_add_epi32(vResulti2, vResulti2); + // i = x|y|z|w + vResulti = _mm_or_si128(vResulti, vResulti2); + _mm_store_ss(reinterpret_cast(&pDestination->v), _mm_castsi128_ps(vResulti)); +#endif +} + +#ifdef __GNUC__ +#pragma GCC diagnostic pop +#endif +#ifdef _MSC_VER +#pragma warning(pop) +#endif + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreUDecN4 +( + XMUDECN4* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + static const XMVECTORF32 Scale = { { { 1023.0f, 1023.0f, 1023.0f, 3.0f } } }; + + XMVECTOR N = XMVectorSaturate(V); + N = XMVectorMultiply(N, Scale.v); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N); + + pDestination->v = static_cast( + (static_cast(tmp.w) << 30) + | ((static_cast(tmp.z) & 0x3FF) << 20) + | ((static_cast(tmp.y) & 0x3FF) << 10) + | ((static_cast(tmp.x) & 0x3FF))); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 ScaleUDecN4 = { { { 1023.0f, 1023.0f * 1024.0f * 0.5f, 1023.0f * 1024.0f * 1024.0f, 3.0f * 1024.0f * 1024.0f * 1024.0f * 0.5f } } }; + static const XMVECTORI32 MaskUDecN4 = { { { 0x3FF, 0x3FF << (10 - 1), 0x3FF << 20, 0x3 << (30 - 1) } } }; + float32x4_t vResult = vmaxq_f32(V, vdupq_n_f32(0.f)); + vResult = vminq_f32(vResult, vdupq_n_f32(1.f)); + vResult = vmulq_f32(vResult, ScaleUDecN4); + uint32x4_t vResulti = vcvtq_u32_f32(vResult); + vResulti = vandq_u32(vResulti, MaskUDecN4); + // Do a horizontal or of 4 entries + uint32x2_t vTemp = vget_low_u32(vResulti); + uint32x2_t vTemp2 = vget_high_u32(vResulti); + vTemp = vorr_u32(vTemp, vTemp2); + // Perform a single bit left shift on y|w + vTemp2 = vdup_lane_u32(vTemp, 1); + vTemp2 = vadd_u32(vTemp2, vTemp2); + vTemp = vorr_u32(vTemp, vTemp2); + vst1_lane_u32(&pDestination->v, 
vTemp, 0); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 ScaleUDecN4 = { { { 1023.0f, 1023.0f * 1024.0f * 0.5f, 1023.0f * 1024.0f * 1024.0f, 3.0f * 1024.0f * 1024.0f * 1024.0f * 0.5f } } }; + static const XMVECTORI32 MaskUDecN4 = { { { 0x3FF, 0x3FF << (10 - 1), 0x3FF << 20, 0x3 << (30 - 1) } } }; + // Clamp to bounds + XMVECTOR vResult = _mm_max_ps(V, g_XMZero); + vResult = _mm_min_ps(vResult, g_XMOne); + // Scale by multiplication + vResult = _mm_mul_ps(vResult, ScaleUDecN4); + // Convert to int + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Mask off any fraction + vResulti = _mm_and_si128(vResulti, MaskUDecN4); + // Do a horizontal or of 4 entries + __m128i vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(3, 2, 3, 2)); + // x = x|z, y = y|w + vResulti = _mm_or_si128(vResulti, vResulti2); + // Move Z to the x position + vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(1, 1, 1, 1)); + // Perform a left shift by one bit on y|w + vResulti2 = _mm_add_epi32(vResulti2, vResulti2); + // i = x|y|z|w + vResulti = _mm_or_si128(vResulti, vResulti2); + _mm_store_ss(reinterpret_cast(&pDestination->v), _mm_castsi128_ps(vResulti)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreUDecN4_XR +( + XMUDECN4* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); + static const XMVECTORF32 Scale = { { { 510.0f, 510.0f, 510.0f, 3.0f } } }; + static const XMVECTORF32 Bias = { { { 384.0f, 384.0f, 384.0f, 0.0f } } }; + static const XMVECTORF32 C = { { { 1023.f, 1023.f, 1023.f, 3.f } } }; + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N = XMVectorMultiplyAdd(V, Scale, Bias); + N = XMVectorClamp(N, g_XMZero, C); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N); + + pDestination->v = static_cast( + (static_cast(tmp.w) << 30) + | ((static_cast(tmp.z) & 0x3FF) << 20) + | ((static_cast(tmp.y) & 0x3FF) << 10) + | ((static_cast(tmp.x) & 0x3FF))); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 Shift = { { { 1.0f, 1024.0f * 0.5f, 1024.0f * 1024.0f, 1024.0f * 1024.0f * 1024.0f * 0.5f } } }; + static const XMVECTORU32 MaskUDecN4 = { { { 0x3FF, 0x3FF << (10 - 1), 0x3FF << 20, 0x3 << (30 - 1) } } }; + float32x4_t vResult = vmlaq_f32(Bias, V, Scale); + vResult = vmaxq_f32(vResult, vdupq_n_f32(0.f)); + vResult = vminq_f32(vResult, C); + vResult = vmulq_f32(vResult, Shift); + uint32x4_t vResulti = vcvtq_u32_f32(vResult); + vResulti = vandq_u32(vResulti, MaskUDecN4); + // Do a horizontal or of 4 entries + uint32x2_t vTemp = vget_low_u32(vResulti); + uint32x2_t vTemp2 = vget_high_u32(vResulti); + vTemp = vorr_u32(vTemp, vTemp2); + // Perform a single bit left shift on y|w + vTemp2 = vdup_lane_u32(vTemp, 1); + vTemp2 = vadd_u32(vTemp2, vTemp2); + vTemp = vorr_u32(vTemp, vTemp2); + vst1_lane_u32(&pDestination->v, vTemp, 0); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Shift = { { { 1.0f, 1024.0f * 0.5f, 1024.0f * 1024.0f, 1024.0f * 1024.0f * 1024.0f * 0.5f } } }; + static const XMVECTORU32 MaskUDecN4 = { { { 0x3FF, 0x3FF << (10 - 1), 0x3FF << 20, 0x3 << (30 - 1) } } }; + // Scale & bias + XMVECTOR vResult = XM_FMADD_PS(V, Scale, Bias); + // Clamp to bounds + vResult = _mm_max_ps(vResult, g_XMZero); + vResult = _mm_min_ps(vResult, C); + // Scale by shift values + vResult = _mm_mul_ps(vResult, Shift); + // Convert to int + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Mask off any fraction + vResulti = _mm_and_si128(vResulti, MaskUDecN4); + // Do a 
horizontal or of 4 entries + __m128i vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(3, 2, 3, 2)); + // x = x|z, y = y|w + vResulti = _mm_or_si128(vResulti, vResulti2); + // Move Z to the x position + vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(1, 1, 1, 1)); + // Perform a left shift by one bit on y|w + vResulti2 = _mm_add_epi32(vResulti2, vResulti2); + // i = x|y|z|w + vResulti = _mm_or_si128(vResulti, vResulti2); + _mm_store_ss(reinterpret_cast(&pDestination->v), _mm_castsi128_ps(vResulti)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreUDec4 +( + XMUDEC4* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); + static const XMVECTORF32 MaxUDec4 = { { { 1023.0f, 1023.0f, 1023.0f, 3.0f } } }; + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N = XMVectorClamp(V, XMVectorZero(), MaxUDec4); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N); + + pDestination->v = static_cast( + (static_cast(tmp.w) << 30) + | ((static_cast(tmp.z) & 0x3FF) << 20) + | ((static_cast(tmp.y) & 0x3FF) << 10) + | ((static_cast(tmp.x) & 0x3FF))); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 ScaleUDec4 = { { { 1.0f, 1024.0f / 2.0f, 1024.0f * 1024.0f, 1024.0f * 1024.0f * 1024.0f / 2.0f } } }; + static const XMVECTORI32 MaskUDec4 = { { { 0x3FF, 0x3FF << (10 - 1), 0x3FF << 20, 0x3 << (30 - 1) } } }; + float32x4_t vResult = vmaxq_f32(V, vdupq_n_f32(0.f)); + vResult = vminq_f32(vResult, MaxUDec4); + vResult = vmulq_f32(vResult, ScaleUDec4); + uint32x4_t vResulti = vcvtq_u32_f32(vResult); + vResulti = vandq_u32(vResulti, MaskUDec4); + // Do a horizontal or of 4 entries + uint32x2_t vTemp = vget_low_u32(vResulti); + uint32x2_t vTemp2 = vget_high_u32(vResulti); + vTemp = vorr_u32(vTemp, vTemp2); + // Perform a single bit left shift on y|w + vTemp2 = vdup_lane_u32(vTemp, 1); + vTemp2 = vadd_u32(vTemp2, vTemp2); + vTemp = vorr_u32(vTemp, vTemp2); + vst1_lane_u32(&pDestination->v, vTemp, 0); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 ScaleUDec4 = { { { 1.0f, 1024.0f / 2.0f, 1024.0f * 1024.0f, 1024.0f * 1024.0f * 1024.0f / 2.0f } } }; + static const XMVECTORI32 MaskUDec4 = { { { 0x3FF, 0x3FF << (10 - 1), 0x3FF << 20, 0x3 << (30 - 1) } } }; + // Clamp to bounds + XMVECTOR vResult = _mm_max_ps(V, g_XMZero); + vResult = _mm_min_ps(vResult, MaxUDec4); + // Scale by multiplication + vResult = _mm_mul_ps(vResult, ScaleUDec4); + // Convert to int + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Mask off any fraction + vResulti = _mm_and_si128(vResulti, MaskUDec4); + // Do a horizontal or of 4 entries + __m128i vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(3, 2, 3, 2)); + // x = x|z, y = y|w + vResulti = _mm_or_si128(vResulti, vResulti2); + // Move Z to the x position + vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(1, 1, 1, 1)); + // Perform a left shift by one bit on y|w + vResulti2 = _mm_add_epi32(vResulti2, vResulti2); + // i = x|y|z|w + vResulti = _mm_or_si128(vResulti, vResulti2); + _mm_store_ss(reinterpret_cast(&pDestination->v), _mm_castsi128_ps(vResulti)); +#endif +} + +//------------------------------------------------------------------------------ +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4996) +// C4996: ignore deprecation warning +#endif + +#ifdef __GNUC__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#endif + +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreDecN4 +( + 
XMDECN4* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + static const XMVECTORF32 Scale = { { { 511.0f, 511.0f, 511.0f, 1.0f } } }; + + XMVECTOR N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v); + N = XMVectorMultiply(N, Scale.v); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N); + + pDestination->v = static_cast( + (static_cast(tmp.w) << 30) + | ((static_cast(tmp.z) & 0x3FF) << 20) + | ((static_cast(tmp.y) & 0x3FF) << 10) + | ((static_cast(tmp.x) & 0x3FF))); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 ScaleDecN4 = { { { 511.0f, 511.0f * 1024.0f, 511.0f * 1024.0f * 1024.0f, 1.0f * 1024.0f * 1024.0f * 1024.0f } } }; + float32x4_t vResult = vmaxq_f32(V, vdupq_n_f32(-1.f)); + vResult = vminq_f32(vResult, vdupq_n_f32(1.f)); + vResult = vmulq_f32(vResult, ScaleDecN4); + int32x4_t vResulti = vcvtq_s32_f32(vResult); + vResulti = vandq_s32(vResulti, g_XMMaskDec4); + // Do a horizontal or of 4 entries + uint32x2_t vTemp = vget_low_u32(vreinterpretq_u32_s32(vResulti)); + uint32x2_t vhi = vget_high_u32(vreinterpretq_u32_s32(vResulti)); + vTemp = vorr_u32(vTemp, vhi); + vTemp = vpadd_u32(vTemp, vTemp); + vst1_lane_u32(&pDestination->v, vTemp, 0); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 ScaleDecN4 = { { { 511.0f, 511.0f * 1024.0f, 511.0f * 1024.0f * 1024.0f, 1.0f * 1024.0f * 1024.0f * 1024.0f } } }; + // Clamp to bounds + XMVECTOR vResult = _mm_max_ps(V, g_XMNegativeOne); + vResult = _mm_min_ps(vResult, g_XMOne); + // Scale by multiplication + vResult = _mm_mul_ps(vResult, ScaleDecN4); + // Convert to int + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Mask off any fraction + vResulti = _mm_and_si128(vResulti, g_XMMaskDec4); + // Do a horizontal or of 4 entries + __m128i vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(3, 2, 3, 2)); + // x = x|z, y = y|w + vResulti = _mm_or_si128(vResulti, vResulti2); + // Move Z to the x position + vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(1, 1, 1, 1)); + // i = x|y|z|w + vResulti = _mm_or_si128(vResulti, vResulti2); + _mm_store_ss(reinterpret_cast(&pDestination->v), _mm_castsi128_ps(vResulti)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreDec4 +( + XMDEC4* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); + static const XMVECTORF32 MinDec4 = { { { -511.0f, -511.0f, -511.0f, -1.0f } } }; + static const XMVECTORF32 MaxDec4 = { { { 511.0f, 511.0f, 511.0f, 1.0f } } }; + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N = XMVectorClamp(V, MinDec4, MaxDec4); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N); + + pDestination->v = static_cast( + (static_cast(tmp.w) << 30) + | ((static_cast(tmp.z) & 0x3FF) << 20) + | ((static_cast(tmp.y) & 0x3FF) << 10) + | ((static_cast(tmp.x) & 0x3FF))); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 ScaleDec4 = { { { 1.0f, 1024.0f, 1024.0f * 1024.0f, 1024.0f * 1024.0f * 1024.0f } } }; + float32x4_t vResult = vmaxq_f32(V, MinDec4); + vResult = vminq_f32(vResult, MaxDec4); + vResult = vmulq_f32(vResult, ScaleDec4); + int32x4_t vResulti = vcvtq_s32_f32(vResult); + vResulti = vandq_s32(vResulti, g_XMMaskDec4); + // Do a horizontal or of all 4 entries + uint32x2_t vTemp = vget_low_u32(vreinterpretq_u32_s32(vResulti)); + uint32x2_t vhi = vget_high_u32(vreinterpretq_u32_s32(vResulti)); + vTemp = vorr_u32(vTemp, vhi); + vTemp = vpadd_u32(vTemp, vTemp); + vst1_lane_u32(&pDestination->v, 
vTemp, 0); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 ScaleDec4 = { { { 1.0f, 1024.0f, 1024.0f * 1024.0f, 1024.0f * 1024.0f * 1024.0f } } }; + // Clamp to bounds + XMVECTOR vResult = _mm_max_ps(V, MinDec4); + vResult = _mm_min_ps(vResult, MaxDec4); + // Scale by multiplication + vResult = _mm_mul_ps(vResult, ScaleDec4); + // Convert to int + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Mask off any fraction + vResulti = _mm_and_si128(vResulti, g_XMMaskDec4); + // Do a horizontal or of 4 entries + __m128i vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(3, 2, 3, 2)); + // x = x|z, y = y|w + vResulti = _mm_or_si128(vResulti, vResulti2); + // Move Z to the x position + vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(1, 1, 1, 1)); + // i = x|y|z|w + vResulti = _mm_or_si128(vResulti, vResulti2); + _mm_store_ss(reinterpret_cast(&pDestination->v), _mm_castsi128_ps(vResulti)); +#endif +} + +#ifdef __GNUC__ +#pragma GCC diagnostic pop +#endif +#ifdef _MSC_VER +#pragma warning(pop) +#endif + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreUByteN4 +( + XMUBYTEN4* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N = XMVectorSaturate(V); + N = XMVectorMultiply(N, g_UByteMax); + N = XMVectorTruncate(N); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N); + + pDestination->x = static_cast(tmp.x); + pDestination->y = static_cast(tmp.y); + pDestination->z = static_cast(tmp.z); + pDestination->w = static_cast(tmp.w); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t R = vmaxq_f32(V, vdupq_n_f32(0)); + R = vminq_f32(R, vdupq_n_f32(1.0f)); + R = vmulq_n_f32(R, 255.0f); + uint32x4_t vInt32 = vcvtq_u32_f32(R); + uint16x4_t vInt16 = vqmovn_u32(vInt32); + uint8x8_t vInt8 = vqmovn_u16(vcombine_u16(vInt16, vInt16)); + vst1_lane_u32(&pDestination->v, vreinterpret_u32_u8(vInt8), 0); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 ScaleUByteN4 = { { { 255.0f, 255.0f * 256.0f * 0.5f, 255.0f * 256.0f * 256.0f, 255.0f * 256.0f * 256.0f * 256.0f * 0.5f } } }; + static const XMVECTORI32 MaskUByteN4 = { { { 0xFF, 0xFF << (8 - 1), 0xFF << 16, 0xFF << (24 - 1) } } }; + // Clamp to bounds + XMVECTOR vResult = _mm_max_ps(V, g_XMZero); + vResult = _mm_min_ps(vResult, g_XMOne); + // Scale by multiplication + vResult = _mm_mul_ps(vResult, ScaleUByteN4); + // Convert to int + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Mask off any fraction + vResulti = _mm_and_si128(vResulti, MaskUByteN4); + // Do a horizontal or of 4 entries + __m128i vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(3, 2, 3, 2)); + // x = x|z, y = y|w + vResulti = _mm_or_si128(vResulti, vResulti2); + // Move Z to the x position + vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(1, 1, 1, 1)); + // Perform a single bit left shift to fix y|w + vResulti2 = _mm_add_epi32(vResulti2, vResulti2); + // i = x|y|z|w + vResulti = _mm_or_si128(vResulti, vResulti2); + _mm_store_ss(reinterpret_cast(&pDestination->v), _mm_castsi128_ps(vResulti)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreUByte4 +( + XMUBYTE4* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N = XMVectorClamp(V, XMVectorZero(), g_UByteMax); + N = XMVectorRound(N); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N); + + pDestination->x = 
static_cast(tmp.x); + pDestination->y = static_cast(tmp.y); + pDestination->z = static_cast(tmp.z); + pDestination->w = static_cast(tmp.w); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t R = vmaxq_f32(V, vdupq_n_f32(0)); + R = vminq_f32(R, vdupq_n_f32(255.0f)); + uint32x4_t vInt32 = vcvtq_u32_f32(R); + uint16x4_t vInt16 = vqmovn_u32(vInt32); + uint8x8_t vInt8 = vqmovn_u16(vcombine_u16(vInt16, vInt16)); + vst1_lane_u32(&pDestination->v, vreinterpret_u32_u8(vInt8), 0); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 ScaleUByte4 = { { { 1.0f, 256.0f * 0.5f, 256.0f * 256.0f, 256.0f * 256.0f * 256.0f * 0.5f } } }; + static const XMVECTORI32 MaskUByte4 = { { { 0xFF, 0xFF << (8 - 1), 0xFF << 16, 0xFF << (24 - 1) } } }; + // Clamp to bounds + XMVECTOR vResult = _mm_max_ps(V, g_XMZero); + vResult = _mm_min_ps(vResult, g_UByteMax); + // Scale by multiplication + vResult = _mm_mul_ps(vResult, ScaleUByte4); + // Convert to int by rounding + __m128i vResulti = _mm_cvtps_epi32(vResult); + // Mask off any fraction + vResulti = _mm_and_si128(vResulti, MaskUByte4); + // Do a horizontal or of 4 entries + __m128i vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(3, 2, 3, 2)); + // x = x|z, y = y|w + vResulti = _mm_or_si128(vResulti, vResulti2); + // Move Z to the x position + vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(1, 1, 1, 1)); + // Perform a single bit left shift to fix y|w + vResulti2 = _mm_add_epi32(vResulti2, vResulti2); + // i = x|y|z|w + vResulti = _mm_or_si128(vResulti, vResulti2); + _mm_store_ss(reinterpret_cast(&pDestination->v), _mm_castsi128_ps(vResulti)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreByteN4 +( + XMBYTEN4* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v); + N = XMVectorMultiply(N, g_ByteMax); + N = XMVectorTruncate(N); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N); + + pDestination->x = static_cast(tmp.x); + pDestination->y = static_cast(tmp.y); + pDestination->z = static_cast(tmp.z); + pDestination->w = static_cast(tmp.w); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t R = vmaxq_f32(V, vdupq_n_f32(-1.f)); + R = vminq_f32(R, vdupq_n_f32(1.0f)); + R = vmulq_n_f32(R, 127.0f); + int32x4_t vInt32 = vcvtq_s32_f32(R); + int16x4_t vInt16 = vqmovn_s32(vInt32); + int8x8_t vInt8 = vqmovn_s16(vcombine_s16(vInt16, vInt16)); + vst1_lane_u32(&pDestination->v, vreinterpret_u32_s8(vInt8), 0); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 ScaleByteN4 = { { { 127.0f, 127.0f * 256.0f, 127.0f * 256.0f * 256.0f, 127.0f * 256.0f * 256.0f * 256.0f } } }; + static const XMVECTORI32 MaskByteN4 = { { { 0xFF, 0xFF << 8, 0xFF << 16, static_cast(0xFF000000) } } }; + // Clamp to bounds + XMVECTOR vResult = _mm_max_ps(V, g_XMNegativeOne); + vResult = _mm_min_ps(vResult, g_XMOne); + // Scale by multiplication + vResult = _mm_mul_ps(vResult, ScaleByteN4); + // Convert to int + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Mask off any fraction + vResulti = _mm_and_si128(vResulti, MaskByteN4); + // Do a horizontal or of 4 entries + __m128i vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(3, 2, 3, 2)); + // x = x|z, y = y|w + vResulti = _mm_or_si128(vResulti, vResulti2); + // Move Z to the x position + vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(1, 1, 1, 1)); + // i = x|y|z|w + vResulti = _mm_or_si128(vResulti, 
vResulti2); + _mm_store_ss(reinterpret_cast(&pDestination->v), _mm_castsi128_ps(vResulti)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreByte4 +( + XMBYTE4* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N = XMVectorClamp(V, g_ByteMin, g_ByteMax); + N = XMVectorRound(N); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N); + + pDestination->x = static_cast(tmp.x); + pDestination->y = static_cast(tmp.y); + pDestination->z = static_cast(tmp.z); + pDestination->w = static_cast(tmp.w); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t R = vmaxq_f32(V, vdupq_n_f32(-127.f)); + R = vminq_f32(R, vdupq_n_f32(127.f)); + int32x4_t vInt32 = vcvtq_s32_f32(R); + int16x4_t vInt16 = vqmovn_s32(vInt32); + int8x8_t vInt8 = vqmovn_s16(vcombine_s16(vInt16, vInt16)); + vst1_lane_u32(&pDestination->v, vreinterpret_u32_s8(vInt8), 0); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 ScaleByte4 = { { { 1.0f, 256.0f, 256.0f * 256.0f, 256.0f * 256.0f * 256.0f } } }; + static const XMVECTORI32 MaskByte4 = { { { 0xFF, 0xFF << 8, 0xFF << 16, static_cast(0xFF000000) } } }; + // Clamp to bounds + XMVECTOR vResult = _mm_max_ps(V, g_ByteMin); + vResult = _mm_min_ps(vResult, g_ByteMax); + // Scale by multiplication + vResult = _mm_mul_ps(vResult, ScaleByte4); + // Convert to int by rounding + __m128i vResulti = _mm_cvtps_epi32(vResult); + // Mask off any fraction + vResulti = _mm_and_si128(vResulti, MaskByte4); + // Do a horizontal or of 4 entries + __m128i vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(3, 2, 3, 2)); + // x = x|z, y = y|w + vResulti = _mm_or_si128(vResulti, vResulti2); + // Move Z to the x position + vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(1, 1, 1, 1)); + // i = x|y|z|w + vResulti = _mm_or_si128(vResulti, vResulti2); + _mm_store_ss(reinterpret_cast(&pDestination->v), _mm_castsi128_ps(vResulti)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreUNibble4 +( + XMUNIBBLE4* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); + static const XMVECTORF32 Max = { { { 15.0f, 15.0f, 15.0f, 15.0f } } }; +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N = XMVectorClamp(V, XMVectorZero(), Max.v); + N = XMVectorRound(N); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N); + + pDestination->v = static_cast( + ((static_cast(tmp.w) & 0xF) << 12) + | ((static_cast(tmp.z) & 0xF) << 8) + | ((static_cast(tmp.y) & 0xF) << 4) + | (static_cast(tmp.x) & 0xF)); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 Scale = { { { 1.0f, 16.f, 16.f * 16.f, 16.f * 16.f * 16.f } } }; + static const XMVECTORU32 Mask = { { { 0xF, 0xF << 4, 0xF << 8, 0xF << 12 } } }; + float32x4_t vResult = vmaxq_f32(V, vdupq_n_f32(0)); + vResult = vminq_f32(vResult, Max); + vResult = vmulq_f32(vResult, Scale); + uint32x4_t vResulti = vcvtq_u32_f32(vResult); + vResulti = vandq_u32(vResulti, Mask); + // Do a horizontal or of 4 entries + uint32x2_t vTemp = vget_low_u32(vResulti); + uint32x2_t vhi = vget_high_u32(vResulti); + vTemp = vorr_u32(vTemp, vhi); + vTemp = vpadd_u32(vTemp, vTemp); + vst1_lane_u16(&pDestination->v, vreinterpret_u16_u32(vTemp), 0); +#elif defined(_XM_SSE_INTRINSICS_) + // Bounds check + XMVECTOR vResult = _mm_max_ps(V, g_XMZero); + vResult = _mm_min_ps(vResult, Max); + // Convert to int with rounding + __m128i vInt = 
_mm_cvtps_epi32(vResult); + // No SSE operations will write to 16-bit values, so we have to extract them manually + auto x = static_cast(_mm_extract_epi16(vInt, 0)); + auto y = static_cast(_mm_extract_epi16(vInt, 2)); + auto z = static_cast(_mm_extract_epi16(vInt, 4)); + auto w = static_cast(_mm_extract_epi16(vInt, 6)); + pDestination->v = static_cast( + ((static_cast(w) & 0xF) << 12) + | ((static_cast(z) & 0xF) << 8) + | ((static_cast(y) & 0xF) << 4) + | ((static_cast(x) & 0xF))); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreU555 +( + XMU555* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); + static const XMVECTORF32 Max = { { { 31.0f, 31.0f, 31.0f, 1.0f } } }; + +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR N = XMVectorClamp(V, XMVectorZero(), Max.v); + N = XMVectorRound(N); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N); + + pDestination->v = static_cast( + ((tmp.w > 0.f) ? 0x8000 : 0) + | ((static_cast(tmp.z) & 0x1F) << 10) + | ((static_cast(tmp.y) & 0x1F) << 5) + | (static_cast(tmp.x) & 0x1F)); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 Scale = { { { 1.0f, 32.f / 2.f, 32.f * 32.f, 32.f * 32.f * 32.f / 2.f } } }; + static const XMVECTORU32 Mask = { { { 0x1F, 0x1F << (5 - 1), 0x1F << 10, 0x1 << (15 - 1) } } }; + float32x4_t vResult = vmaxq_f32(V, vdupq_n_f32(0)); + vResult = vminq_f32(vResult, Max); + vResult = vmulq_f32(vResult, Scale); + uint32x4_t vResulti = vcvtq_u32_f32(vResult); + vResulti = vandq_u32(vResulti, Mask); + // Do a horizontal or of 4 entries + uint32x2_t vTemp = vget_low_u32(vResulti); + uint32x2_t vTemp2 = vget_high_u32(vResulti); + vTemp = vorr_u32(vTemp, vTemp2); + // Perform a single bit left shift on y|w + vTemp2 = vdup_lane_u32(vTemp, 1); + vTemp2 = vadd_u32(vTemp2, vTemp2); + vTemp = vorr_u32(vTemp, vTemp2); + vst1_lane_u16(&pDestination->v, vreinterpret_u16_u32(vTemp), 0); +#elif defined(_XM_SSE_INTRINSICS_) + // Bounds check + XMVECTOR vResult = _mm_max_ps(V, g_XMZero); + vResult = _mm_min_ps(vResult, Max); + // Convert to int with rounding + __m128i vInt = _mm_cvtps_epi32(vResult); + // No SSE operations will write to 16-bit values, so we have to extract them manually + auto x = static_cast(_mm_extract_epi16(vInt, 0)); + auto y = static_cast(_mm_extract_epi16(vInt, 2)); + auto z = static_cast(_mm_extract_epi16(vInt, 4)); + auto w = static_cast(_mm_extract_epi16(vInt, 6)); + pDestination->v = static_cast( + (static_cast(w) ? 
0x8000 : 0) + | ((static_cast(z) & 0x1F) << 10) + | ((static_cast(y) & 0x1F) << 5) + | ((static_cast(x) & 0x1F))); +#endif +} + + +/**************************************************************************** + * + * XMCOLOR operators + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ + +inline XMCOLOR::XMCOLOR +( + float _r, + float _g, + float _b, + float _a +) noexcept +{ + XMStoreColor(this, XMVectorSet(_r, _g, _b, _a)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMCOLOR::XMCOLOR(const float* pArray) noexcept +{ + XMStoreColor(this, XMLoadFloat4(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMHALF2 operators + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ + +inline XMHALF2::XMHALF2 +( + float _x, + float _y +) noexcept +{ + x = XMConvertFloatToHalf(_x); + y = XMConvertFloatToHalf(_y); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMHALF2::XMHALF2(const float* pArray) noexcept +{ + assert(pArray != nullptr); + x = XMConvertFloatToHalf(pArray[0]); + y = XMConvertFloatToHalf(pArray[1]); +} + +/**************************************************************************** + * + * XMSHORTN2 operators + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ + +inline XMSHORTN2::XMSHORTN2 +( + float _x, + float _y +) noexcept +{ + XMStoreShortN2(this, XMVectorSet(_x, _y, 0.0f, 0.0f)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMSHORTN2::XMSHORTN2(const float* pArray) noexcept +{ + XMStoreShortN2(this, XMLoadFloat2(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMSHORT2 operators + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ + +inline XMSHORT2::XMSHORT2 +( + float _x, + float _y +) noexcept +{ + XMStoreShort2(this, XMVectorSet(_x, _y, 0.0f, 0.0f)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMSHORT2::XMSHORT2(const float* pArray) noexcept +{ + XMStoreShort2(this, XMLoadFloat2(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMUSHORTN2 operators + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ + +inline XMUSHORTN2::XMUSHORTN2 +( + float _x, + float _y +) noexcept +{ + XMStoreUShortN2(this, XMVectorSet(_x, _y, 0.0f, 0.0f)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMUSHORTN2::XMUSHORTN2(const float* pArray) noexcept +{ + XMStoreUShortN2(this, XMLoadFloat2(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMUSHORT2 operators + * + 
****************************************************************************/ + + //------------------------------------------------------------------------------ + +inline XMUSHORT2::XMUSHORT2 +( + float _x, + float _y +) noexcept +{ + XMStoreUShort2(this, XMVectorSet(_x, _y, 0.0f, 0.0f)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMUSHORT2::XMUSHORT2(const float* pArray) noexcept +{ + XMStoreUShort2(this, XMLoadFloat2(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMBYTEN2 operators + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ + +inline XMBYTEN2::XMBYTEN2 +( + float _x, + float _y +) noexcept +{ + XMStoreByteN2(this, XMVectorSet(_x, _y, 0.0f, 0.0f)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMBYTEN2::XMBYTEN2(const float* pArray) noexcept +{ + XMStoreByteN2(this, XMLoadFloat2(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMBYTE2 operators + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ + +inline XMBYTE2::XMBYTE2 +( + float _x, + float _y +) noexcept +{ + XMStoreByte2(this, XMVectorSet(_x, _y, 0.0f, 0.0f)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMBYTE2::XMBYTE2(const float* pArray) noexcept +{ + XMStoreByte2(this, XMLoadFloat2(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMUBYTEN2 operators + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ + +inline XMUBYTEN2::XMUBYTEN2 +( + float _x, + float _y +) noexcept +{ + XMStoreUByteN2(this, XMVectorSet(_x, _y, 0.0f, 0.0f)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMUBYTEN2::XMUBYTEN2(const float* pArray) noexcept +{ + XMStoreUByteN2(this, XMLoadFloat2(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMUBYTE2 operators + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ + +inline XMUBYTE2::XMUBYTE2 +( + float _x, + float _y +) noexcept +{ + XMStoreUByte2(this, XMVectorSet(_x, _y, 0.0f, 0.0f)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMUBYTE2::XMUBYTE2(const float* pArray) noexcept +{ + XMStoreUByte2(this, XMLoadFloat2(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMU565 operators + * + ****************************************************************************/ + +inline XMU565::XMU565 +( + float _x, + float _y, + float _z +) noexcept +{ + XMStoreU565(this, XMVectorSet(_x, _y, _z, 0.0f)); +} + +_Use_decl_annotations_ +inline XMU565::XMU565(const float* pArray) noexcept +{ + XMStoreU565(this, 
XMLoadFloat3(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMFLOAT3PK operators + * + ****************************************************************************/ + +inline XMFLOAT3PK::XMFLOAT3PK +( + float _x, + float _y, + float _z +) noexcept +{ + XMStoreFloat3PK(this, XMVectorSet(_x, _y, _z, 0.0f)); +} + +_Use_decl_annotations_ +inline XMFLOAT3PK::XMFLOAT3PK(const float* pArray) noexcept +{ + XMStoreFloat3PK(this, XMLoadFloat3(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMFLOAT3SE operators + * + ****************************************************************************/ + +inline XMFLOAT3SE::XMFLOAT3SE +( + float _x, + float _y, + float _z +) noexcept +{ + XMStoreFloat3SE(this, XMVectorSet(_x, _y, _z, 0.0f)); +} + +_Use_decl_annotations_ +inline XMFLOAT3SE::XMFLOAT3SE(const float* pArray) noexcept +{ + XMStoreFloat3SE(this, XMLoadFloat3(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMHALF4 operators + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ + +inline XMHALF4::XMHALF4 +( + float _x, + float _y, + float _z, + float _w +) noexcept +{ + x = XMConvertFloatToHalf(_x); + y = XMConvertFloatToHalf(_y); + z = XMConvertFloatToHalf(_z); + w = XMConvertFloatToHalf(_w); +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline XMHALF4::XMHALF4(const float* pArray) noexcept +{ + XMConvertFloatToHalfStream(&x, sizeof(HALF), pArray, sizeof(float), 4); +} + +/**************************************************************************** + * + * XMSHORTN4 operators + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ + +inline XMSHORTN4::XMSHORTN4 +( + float _x, + float _y, + float _z, + float _w +) noexcept +{ + XMStoreShortN4(this, XMVectorSet(_x, _y, _z, _w)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMSHORTN4::XMSHORTN4(const float* pArray) noexcept +{ + XMStoreShortN4(this, XMLoadFloat4(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMSHORT4 operators + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ + +inline XMSHORT4::XMSHORT4 +( + float _x, + float _y, + float _z, + float _w +) noexcept +{ + XMStoreShort4(this, XMVectorSet(_x, _y, _z, _w)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMSHORT4::XMSHORT4(const float* pArray) noexcept +{ + XMStoreShort4(this, XMLoadFloat4(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMUSHORTN4 operators + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ + +inline XMUSHORTN4::XMUSHORTN4 +( + float _x, + float _y, + float _z, + float _w +) noexcept +{ + XMStoreUShortN4(this, XMVectorSet(_x, _y, _z, _w)); +} + 
+//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMUSHORTN4::XMUSHORTN4(const float* pArray) noexcept +{ + XMStoreUShortN4(this, XMLoadFloat4(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMUSHORT4 operators + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ + +inline XMUSHORT4::XMUSHORT4 +( + float _x, + float _y, + float _z, + float _w +) noexcept +{ + XMStoreUShort4(this, XMVectorSet(_x, _y, _z, _w)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMUSHORT4::XMUSHORT4(const float* pArray) noexcept +{ + XMStoreUShort4(this, XMLoadFloat4(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMXDECN4 operators + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ + +inline XMXDECN4::XMXDECN4 +( + float _x, + float _y, + float _z, + float _w +) noexcept +{ + XMStoreXDecN4(this, XMVectorSet(_x, _y, _z, _w)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMXDECN4::XMXDECN4(const float* pArray) noexcept +{ + XMStoreXDecN4(this, XMLoadFloat4(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMXDEC4 operators + * + ****************************************************************************/ +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4996) + // C4996: ignore deprecation warning +#endif + +#ifdef __GNUC__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#endif + + //------------------------------------------------------------------------------ + +inline XMXDEC4::XMXDEC4 +( + float _x, + float _y, + float _z, + float _w +) noexcept +{ + XMStoreXDec4(this, XMVectorSet(_x, _y, _z, _w)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMXDEC4::XMXDEC4(const float* pArray) noexcept +{ + XMStoreXDec4(this, XMLoadFloat4(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMDECN4 operators + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ + +inline XMDECN4::XMDECN4 +( + float _x, + float _y, + float _z, + float _w +) noexcept +{ + XMStoreDecN4(this, XMVectorSet(_x, _y, _z, _w)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMDECN4::XMDECN4(const float* pArray) noexcept +{ + XMStoreDecN4(this, XMLoadFloat4(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMDEC4 operators + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ + +inline XMDEC4::XMDEC4 +( + float _x, + float _y, + float _z, + float _w +) noexcept +{ + XMStoreDec4(this, XMVectorSet(_x, _y, _z, _w)); +} + 
+//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMDEC4::XMDEC4(const float* pArray) noexcept +{ + XMStoreDec4(this, XMLoadFloat4(reinterpret_cast(pArray))); +} + +#ifdef __GNUC__ +#pragma GCC diagnostic pop +#endif +#ifdef _MSC_VER +#pragma warning(pop) +#endif + +/**************************************************************************** + * + * XMUDECN4 operators + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ + +inline XMUDECN4::XMUDECN4 +( + float _x, + float _y, + float _z, + float _w +) noexcept +{ + XMStoreUDecN4(this, XMVectorSet(_x, _y, _z, _w)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMUDECN4::XMUDECN4(const float* pArray) noexcept +{ + XMStoreUDecN4(this, XMLoadFloat4(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMUDEC4 operators + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ + +inline XMUDEC4::XMUDEC4 +( + float _x, + float _y, + float _z, + float _w +) noexcept +{ + XMStoreUDec4(this, XMVectorSet(_x, _y, _z, _w)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMUDEC4::XMUDEC4(const float* pArray) noexcept +{ + XMStoreUDec4(this, XMLoadFloat4(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMBYTEN4 operators + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ + +inline XMBYTEN4::XMBYTEN4 +( + float _x, + float _y, + float _z, + float _w +) noexcept +{ + XMStoreByteN4(this, XMVectorSet(_x, _y, _z, _w)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMBYTEN4::XMBYTEN4(const float* pArray) noexcept +{ + XMStoreByteN4(this, XMLoadFloat4(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMBYTE4 operators + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ + +inline XMBYTE4::XMBYTE4 +( + float _x, + float _y, + float _z, + float _w +) noexcept +{ + XMStoreByte4(this, XMVectorSet(_x, _y, _z, _w)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMBYTE4::XMBYTE4(const float* pArray) noexcept +{ + XMStoreByte4(this, XMLoadFloat4(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMUBYTEN4 operators + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ + +inline XMUBYTEN4::XMUBYTEN4 +( + float _x, + float _y, + float _z, + float _w +) noexcept +{ + XMStoreUByteN4(this, XMVectorSet(_x, _y, _z, _w)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMUBYTEN4::XMUBYTEN4(const float* pArray) noexcept +{ 
+ XMStoreUByteN4(this, XMLoadFloat4(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMUBYTE4 operators + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ + +inline XMUBYTE4::XMUBYTE4 +( + float _x, + float _y, + float _z, + float _w +) noexcept +{ + XMStoreUByte4(this, XMVectorSet(_x, _y, _z, _w)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMUBYTE4::XMUBYTE4(const float* pArray) noexcept +{ + XMStoreUByte4(this, XMLoadFloat4(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMUNIBBLE4 operators + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ + +inline XMUNIBBLE4::XMUNIBBLE4 +( + float _x, + float _y, + float _z, + float _w +) noexcept +{ + XMStoreUNibble4(this, XMVectorSet(_x, _y, _z, _w)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMUNIBBLE4::XMUNIBBLE4(const float* pArray) noexcept +{ + XMStoreUNibble4(this, XMLoadFloat4(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMU555 operators + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ + +inline XMU555::XMU555 +( + float _x, + float _y, + float _z, + bool _w +) noexcept +{ + XMStoreU555(this, XMVectorSet(_x, _y, _z, ((_w) ? 1.0f : 0.0f))); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMU555::XMU555 +( + const float* pArray, + bool _w +) noexcept +{ + XMVECTOR V = XMLoadFloat3(reinterpret_cast(pArray)); + XMStoreU555(this, XMVectorSetW(V, ((_w) ? 1.0f : 0.0f))); +} + diff --git a/src/thirdparty/DirectXMath-dec2022/LICENSE b/src/thirdparty/DirectXMath-dec2022/LICENSE new file mode 100644 index 000000000..74ee33848 --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/LICENSE @@ -0,0 +1,21 @@ + The MIT License (MIT) + +Copyright (c) 2011-2022 Microsoft Corp + +Permission is hereby granted, free of charge, to any person obtaining a copy of this +software and associated documentation files (the "Software"), to deal in the Software +without restriction, including without limitation the rights to use, copy, modify, +merge, publish, distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice shall be included in all copies +or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, +INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF +CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE +OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
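For orientation, the packed-vector store routines and float constructors added above are normally driven through the matching XMStore*/XMLoad* pairs. The following is a minimal, hypothetical sketch (not part of the vendored files or of this patch; the `<DirectXPackedVector.h>` include path and the expected packed value in the comment are my assumptions, derived from the scale-and-mask logic shown above) exercising a few of the types whose constructors appear in this diff:

```cpp
#include <DirectXPackedVector.h>   // assumed include path for the vendored headers
#include <cstdio>

int main()
{
    using namespace DirectX;
    using namespace DirectX::PackedVector;

    // 8:8:8:8 and 10:10:10:2 packed formats, filled via the float constructors,
    // which route through XMStoreColor / XMStoreUByteN4 / XMStoreUDecN4 above.
    XMCOLOR   color(0.25f, 0.5f, 0.75f, 1.0f);   // 32-bit ARGB color
    XMUBYTEN4 bytes(0.0f, 0.5f, 1.0f, 1.0f);     // unsigned-normalized bytes
    XMUDECN4  dec(0.0f, 0.5f, 1.0f, 1.0f);       // should pack to 0xFFF7FC00
                                                 // (w=3, z=1023, y=511, x=0)

    // Loads expand the packed words back to an XMVECTOR for full-width SIMD math.
    XMFLOAT4 expanded;
    XMStoreFloat4(&expanded, XMLoadUDecN4(&dec));

    std::printf("color=0x%08X bytes=0x%08X dec=0x%08X -> %f %f %f %f\n",
                color.c, bytes.v, dec.v,
                expanded.x, expanded.y, expanded.z, expanded.w);
    return 0;
}
```

As the vendored README notes further below, these types mirror compressed GPU vertex and texture formats, so the usual pattern is to do math on full-width XMVECTORs and only store into the packed representation at the boundary with GPU data.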
+ diff --git a/src/thirdparty/DirectXMath-dec2022/MatrixStack/DirectXMatrixStack.h b/src/thirdparty/DirectXMath-dec2022/MatrixStack/DirectXMatrixStack.h new file mode 100644 index 000000000..46fe263ca --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/MatrixStack/DirectXMatrixStack.h @@ -0,0 +1,241 @@ +//------------------------------------------------------------------------------------- +// DirectXMatrixStack.h -- DirectXMath C++ Matrix Stack +// +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. +// +// http://go.microsoft.com/fwlink/?LinkID=615560 +//------------------------------------------------------------------------------------- + +#pragma once + +#include +#include +#include +#include +#include +#include + +#ifdef _WIN32 +#include +#endif + +#include + + +namespace DirectX +{ + class MatrixStack + { + public: + MatrixStack(size_t startSize = 16) noexcept(false) : + m_stackSize(0), + m_current(0), + m_stack(nullptr) + { + assert(startSize > 0); + Allocate(startSize); + LoadIdentity(); + } + + MatrixStack(MatrixStack&&) = default; + MatrixStack& operator= (MatrixStack&&) = default; + + MatrixStack(MatrixStack const&) = delete; + MatrixStack& operator= (MatrixStack const&) = delete; + + const XMMATRIX XM_CALLCONV Top() const noexcept { return m_stack[m_current]; } + const XMMATRIX* GetTop() const noexcept { return &m_stack[m_current]; } + + size_t Size() const noexcept { return (m_current + 1); } + + void Pop() + { + if (m_current > 0) + { + --m_current; + } + } + + void Push() + { + ++m_current; + + if (m_current >= m_stackSize) + { + Allocate(m_stackSize * 2); + } + + // Replicate the original top of the matrix stack. + m_stack[m_current] = m_stack[m_current - 1]; + } + + // Loads identity into the top of the matrix stack. + void LoadIdentity() noexcept + { + m_stack[m_current] = XMMatrixIdentity(); + } + + // Load a matrix into the top of the matrix stack. + void XM_CALLCONV LoadMatrix(FXMMATRIX matrix) noexcept + { + m_stack[m_current] = matrix; + } + + // Multiply a matrix by the top of the stack, store result in top. + void XM_CALLCONV MultiplyMatrix(FXMMATRIX matrix) noexcept + { + m_stack[m_current] = XMMatrixMultiply(m_stack[m_current], matrix); + } + + // Pre-multiplies a matrix by the top of the stack, store result in top. + void XM_CALLCONV MultiplyMatrixLocal(FXMMATRIX matrix) noexcept + { + m_stack[m_current] = XMMatrixMultiply(matrix, m_stack[m_current]); + } + + // Add a rotation about X to stack top. + void XM_CALLCONV RotateX(float angle) noexcept + { + XMMATRIX mat = XMMatrixRotationX(angle); + m_stack[m_current] = XMMatrixMultiply(m_stack[m_current], mat); + } + + void XM_CALLCONV RotateXLocal(float angle) noexcept + { + XMMATRIX mat = XMMatrixRotationX(angle); + m_stack[m_current] = XMMatrixMultiply(mat, m_stack[m_current]); + } + + // Add a rotation about Y to stack top. + void XM_CALLCONV RotateY(float angle) noexcept + { + XMMATRIX mat = XMMatrixRotationY(angle); + m_stack[m_current] = XMMatrixMultiply(m_stack[m_current], mat); + } + + void XM_CALLCONV RotateYLocal(float angle) noexcept + { + XMMATRIX mat = XMMatrixRotationY(angle); + m_stack[m_current] = XMMatrixMultiply(mat, m_stack[m_current]); + } + + // Add a rotation about Z to stack top. 
+ void XM_CALLCONV RotateZ(float angle) noexcept + { + XMMATRIX mat = XMMatrixRotationZ(angle); + m_stack[m_current] = XMMatrixMultiply(m_stack[m_current], mat); + } + + void XM_CALLCONV RotateZLocal(float angle) noexcept + { + XMMATRIX mat = XMMatrixRotationZ(angle); + m_stack[m_current] = XMMatrixMultiply(mat, m_stack[m_current]); + } + + // Add a rotation around an axis to stack top. + void XM_CALLCONV RotateAxis(FXMVECTOR axis, float angle) noexcept + { + XMMATRIX mat = XMMatrixRotationAxis(axis, angle); + m_stack[m_current] = XMMatrixMultiply(m_stack[m_current], mat); + } + + void XM_CALLCONV RotateAxisLocal(FXMVECTOR axis, float angle) noexcept + { + XMMATRIX mat = XMMatrixRotationAxis(axis, angle); + m_stack[m_current] = XMMatrixMultiply(mat, m_stack[m_current]); + } + + // Add a rotation by roll/pitch/yaw to the stack top. + void RotateRollPitchYaw(float pitch, float yaw, float roll) noexcept + { + XMMATRIX mat = XMMatrixRotationRollPitchYaw(pitch, yaw, roll); + m_stack[m_current] = XMMatrixMultiply(m_stack[m_current], mat); + } + + void RotateRollPitchYawLocal(float pitch, float yaw, float roll) noexcept + { + XMMATRIX mat = XMMatrixRotationRollPitchYaw(pitch, yaw, roll); + m_stack[m_current] = XMMatrixMultiply(mat, m_stack[m_current]); + } + + // Add a rotation by a quaternion stack top. + void XM_CALLCONV RotateByQuaternion(FXMVECTOR quat) noexcept + { + XMMATRIX mat = XMMatrixRotationQuaternion(quat); + m_stack[m_current] = XMMatrixMultiply(m_stack[m_current], mat); + } + + void XM_CALLCONV RotateByQuaternionLocal(FXMVECTOR quat) noexcept + { + XMMATRIX mat = XMMatrixRotationQuaternion(quat); + m_stack[m_current] = XMMatrixMultiply(mat, m_stack[m_current]); + } + + // Add a scale to the stack top. + void Scale(float x, float y, float z) noexcept + { + XMMATRIX mat = XMMatrixScaling(x, y, z); + m_stack[m_current] = XMMatrixMultiply(m_stack[m_current], mat); + } + + void ScaleLocal(float x, float y, float z) noexcept + { + XMMATRIX mat = XMMatrixScaling(x, y, z); + m_stack[m_current] = XMMatrixMultiply(mat, m_stack[m_current]); + } + + // Add a translation to the stack top. + void Translate(float x, float y, float z) noexcept + { + XMMATRIX mat = XMMatrixTranslation(x, y, z); + m_stack[m_current] = XMMatrixMultiply(m_stack[m_current], mat); + } + + void TranslateLocal(float x, float y, float z) noexcept + { + XMMATRIX mat = XMMatrixTranslation(x, y, z); + m_stack[m_current] = XMMatrixMultiply(mat, m_stack[m_current]); + } + + private: + + struct matrix_deleter + { + void operator()(void* p) noexcept + { +#ifdef _WIN32 + _aligned_free(p); +#else + free(p); +#endif + } + }; + + void Allocate(size_t newSize) + { +#ifdef _WIN32 + void* ptr = _aligned_malloc(newSize * sizeof(XMMATRIX), 16); +#else + // This C++17 Standard Library function is currently NOT + // implemented for the Microsoft Standard C++ Library. 
+ void* ptr = aligned_alloc(16, newSize * sizeof(XMMATRIX)); +#endif + if (!ptr) + throw std::bad_alloc(); + + if (m_stack) + { + assert(newSize >= m_stackSize); + memcpy(ptr, m_stack.get(), sizeof(XMMATRIX) * m_stackSize); + } + + m_stack.reset(reinterpret_cast(ptr)); + m_stackSize = newSize; + } + + size_t m_stackSize; + size_t m_current; + std::unique_ptr m_stack; + }; +} // namespace DirectX diff --git a/src/thirdparty/DirectXMath-dec2022/README.md b/src/thirdparty/DirectXMath-dec2022/README.md new file mode 100644 index 000000000..448640c56 --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/README.md @@ -0,0 +1,115 @@ +![DirectX Logo](https://raw.githubusercontent.com/wiki/Microsoft/DirectXMath/X_jpg.jpg) + +# DirectXMath + +https://github.com/Microsoft/DirectXMath + +Copyright (c) Microsoft Corporation. + +**December 2022** + +This package contains the DirectXMath library, an all inline SIMD C++ linear algebra library for use in games and graphics apps. + +This code is designed to build with Visual Studio 2019 (16.11), Visual Studio 2022, or clang/LLVM for Windows. It is recommended that you make use of the latest updates. + +These components are designed to work without requiring any content from the legacy DirectX SDK. For details, see [Where is the DirectX SDK?](https://aka.ms/dxsdk). + +## Directory Layout + +* ``Inc\`` + + + DirectXMath Files (in the DirectX C++ namespace) + + * DirectXMath.h - Core library + * DirectXPackedVector.h - Load/Store functions and types for working with various compressed GPU formats + * DirectXColors.h - .NET-style Color defines in sRGB and linear color space + * DirectXCollision.h - Bounding volume collision library + +* ``Extentions\`` + + + Advanced instruction set variants for guarded codepaths + + * DirectXMathSSE3.h - SSE3 + * DirectXMathBE.h - Supplemental SSE3 (SSSE3) + * DirectXMathSSE4.h - SSE4.1 + * DirectXMathAVX.h - Advanced Vector Extensions (AVX) + * DirectXMathAVX2.h - Advanced Vector Extensions 2 (AVX2) + * DirectXMathF16C.h - Half-precision conversions (F16C) + * DirectXMathFMA3.h - Fused multiply-accumulate (FMA3) + * DirectXMathFMA4.h - Fused multiply-accumulate (FMA4) + +* ``SHMath\`` + + + Spherical Harmonics math functions + + * DirectXSH.h - Header for SHMath functions + * DirectXSH.cpp, DirectXSHD3D11.cpp, DirectXSHD3D12.cpp - Implementation + +* ``XDSP\`` + + + XDSP.h - Digital Signal Processing helper functions + +* ``build\`` + + + Contains YAML files for the build pipelines along with some miscellaneous build files and scripts. + +## Documentation + +Documentation is available on the [Microsoft Docs](https://docs.microsoft.com/en-us/windows/desktop/dxmath/directxmath-portal). Additional information can be found on the [project wiki](https://github.com/microsoft/DirectXMath/wiki). + +## Compiler support + +Officially the library is supported with Microsoft Visual C++ 2019 or later, clang/LLVM v12 or later, and GCC 9 or later. It should also compile with the Intel C++ and MinGW compilers. + +When building with clang/LLVM or other GNU C compilers, the ``_XM_NO_XMVECTOR_OVERLOADS_`` control define is set because these compilers do not support creating operator overloads for the ``XMVECTOR`` type. You can choose to enable this preprocessor define explicitly to do the same thing with Visual C++ for improved portability. + +To build for non-Windows platforms, you need to provide a ``sal.h`` header in your include path. 
You can obtain an open source version from [GitHub](https://github.com/dotnet/corert/blob/master/src/Native/inc/unix/sal.h). + +With GCC, the SAL annotation preprocessor symbols can conflict with the GNU implementation of the Standard C++ Library. The workaround is to include the system headers before including DirectXMath: + +``` +#include +#include + +#include +``` + +## Notices + +All content and source code for this package are subject to the terms of the [MIT License](https://github.com/microsoft/DirectXMath/blob/main/LICENSE). + +For the latest version of DirectXMath, bug reports, etc. please visit the project site on [GitHub](https://github.com/microsoft/DirectXMath). + +## Support + +For questions, consider using [Stack Overflow](https://stackoverflow.com/questions/tagged/directxmath) with the *directxmath* tag, or the [DirectX Discord Server](https://discord.gg/directx) in the *dx12-developers* or *dx9-dx11-developers* channel. + +For bug reports and feature requests, please use GitHub [issues](https://github.com/microsoft/DirectXMath/issues) for this project. + +## Contributing + +This project welcomes contributions and suggestions. Most contributions require you to agree to a Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com. + +When you submit a pull request, a CLA bot will automatically determine whether you need to provide a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions provided by the bot. You will only need to do this once across all repos using our CLA. + +This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. + +## Trademarks + +This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft trademarks or logos is subject to and must follow [Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general). Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship. Any use of third-party trademarks or logos are subject to those third-party's policies. + +## Credits + +The xboxmath library was originated by Matt Bronder with contributions from Sakphong Chanbai and David Hefner for the Xbox 360. + +The xnamath library for the DirectX SDK and Xbox XDK was the work of Chuck Walbourn and Becky Heineman based on xboxmath, with contributions from Jeremy Gup, Dan Haffner, Matt Lee, Casey Meekhof, Rich Sauer, Jason Strayer, and Xiaoyue Zheng. + +The DirectXMath library for the Windows SDK and Xbox One XDK is the work of Chuck Walbourn based on xnamath, with contributions from Darren Anderson, Matt Lee, Aaron Rodriguez Hernandez, Yuichi Ito, Reza Nourai, Rich Sauer, and Jason Strayer. + +Thanks to Dave Eberly for his contributions particularly in improving the transcendental functions. + +Thanks to Bruce Dawson for his help with the rounding functions. + +Thanks to Andrew Farrier for the fixes to ``XMVerifyCPUSupport`` to properly support clang. 
+ +Thanks to Scott Matloff for his help in getting the library updated to use Intel SVML for VS 2019. diff --git a/src/thirdparty/DirectXMath-dec2022/SECURITY.md b/src/thirdparty/DirectXMath-dec2022/SECURITY.md new file mode 100644 index 000000000..f7b89984f --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/SECURITY.md @@ -0,0 +1,41 @@ + + +## Security + +Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). + +If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below. + +## Reporting Security Issues + +**Please do not report security vulnerabilities through public GitHub issues.** + +Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report). + +If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc). + +You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). + +Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: + + * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) + * Full paths of source file(s) related to the manifestation of the issue + * The location of the affected source code (tag/branch/commit or direct URL) + * Any special configuration required to reproduce the issue + * Step-by-step instructions to reproduce the issue + * Proof-of-concept or exploit code (if possible) + * Impact of the issue, including how an attacker might exploit the issue + +This information will help us triage your report more quickly. + +If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs. + +## Preferred Languages + +We prefer all communications to be in English. + +## Policy + +Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd). + + \ No newline at end of file diff --git a/src/thirdparty/DirectXMath-dec2022/SHMath/DirectXSH.cpp b/src/thirdparty/DirectXMath-dec2022/SHMath/DirectXSH.cpp new file mode 100644 index 000000000..a2c504bce --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/SHMath/DirectXSH.cpp @@ -0,0 +1,4908 @@ +//----------------------------------------------------------------------------------- +// DirectXSH.cpp -- C++ Spherical Harmonics Math Library +// +// Copyright (c) Microsoft Corporation. 
+// Licensed under the MIT License. +// +// http://go.microsoft.com/fwlink/p/?LinkId=262885 +//------------------------------------------------------------------------------------- + +#ifdef _MSC_VER +#pragma warning( disable : 4619 4456 5264) +// C4619 #pragma warning warnings +// C4456 declaration hides previous local declaration +// C5264 'const' variable is not used +#endif + +#ifdef __clang__ +#pragma clang diagnostic ignored "-Wold-style-cast" +#pragma clang diagnostic ignored "-Wshadow" +#pragma clang diagnostic ignored "-Wunused-const-variable" +#pragma clang diagnostic ignored "-Wunused-function" +#endif + +#include "DirectXSH.h" +#include + +using namespace DirectX; + +namespace +{ +#ifdef _PREFAST_ +#pragma prefast(disable:246, "generated code by maple (nested const variable definitions)") +#endif + + const float fExtraNormFac[XM_SH_MAXORDER] = { 2.0f*sqrtf(XM_PI), 2.0f / 3.0f*sqrtf(3.0f*XM_PI), 2.0f / 5.0f*sqrtf(5.0f*XM_PI), 2.0f / 7.0f*sqrtf(7.0f*XM_PI), 2.0f / 3.0f*sqrtf(XM_PI), 2.0f / 11.0f*sqrtf(11.0f*XM_PI) }; + + // computes the integral of a constant function over a solid angular + // extent. No error checking - only used internaly. This function + // only returns the Yl0 coefficients, since the rest are zero for + // circularly symmetric functions. + const float ComputeCapInt_t1 = sqrtf(0.3141593E1f); + const float ComputeCapInt_t5 = sqrtf(3.0f); + const float ComputeCapInt_t11 = sqrtf(5.0f); + const float ComputeCapInt_t18 = sqrtf(7.0f); + const float ComputeCapInt_t32 = sqrtf(11.0f); + + inline void ComputeCapInt(const size_t order, float angle, float *pR) + { + const float t2 = cosf(angle); + const float t3 = ComputeCapInt_t1*t2; + const float t7 = sinf(angle); + const float t8 = t7*t7; + + + pR[0] = -t3 + ComputeCapInt_t1; + pR[1] = ComputeCapInt_t5*ComputeCapInt_t1*t8 / 2.0f; + + if (order > 2) + { + const float t13 = t2*t2; + + pR[2] = -ComputeCapInt_t11*ComputeCapInt_t1*t2*(t13 - 1.0f) / 2.0f; + if (order > 3) + { + const float t19 = ComputeCapInt_t18*ComputeCapInt_t1; + const float t20 = t13*t13; + + pR[3] = -5.0f / 8.0f*t19*t20 + 3.0f / 4.0f*t19*t13 - t19 / 8.0f; + if (order > 4) + { + + + pR[4] = -3.0f / 8.0f*t3*(7.0f*t20 - 10.0f*t13 + 3.0f); + if (order > 5) + { + const float t33 = ComputeCapInt_t32*ComputeCapInt_t1; + pR[5] = -21.0f / 16.0f*t33*t20*t13 + 35.0f / 16.0f*t33*t20 - 15.0f / 16.0f*t33*t13 + t33 / 16.0f; + } + } + } + } + } + + // input pF only consists of Yl0 values, normalizes coefficients for directional + // lights. + inline float CosWtInt(const size_t order) + { + const float fCW0 = 0.25f; + const float fCW1 = 0.5f; + const float fCW2 = 5.0f / 16.0f; + //const float fCW3 = 0.0f; + const float fCW4 = -3.0f / 32.0f; + //const float fCW5 = 0.0f; + + // order has to be at least linear... + + float fRet = fCW0 + fCW1; + + if (order > 2) fRet += fCW2; + if (order > 4) fRet += fCW4; + + // odd degrees >= 3 evaluate to zero integrated against cosine... + + return fRet; + } + + const float SHEvalHemisphereLight_fSqrtPi = sqrtf(XM_PI); + const float SHEvalHemisphereLight_fSqrtPi3 = sqrtf(XM_PI / 3.0f); + + using REAL = float; +#define CONSTANT(x) (x ## f) + + // routine generated programmatically for evaluating SH basis for degree 1 + // inputs (x,y,z) are a point on the sphere (i.e., must be unit length) + // output is vector b with SH basis evaluated at (x,y,z). 
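+ // Note: each sh_eval_basis_N routine below fills b[0..(N+1)^2 - 1], storing the
+ // band-l, order-m coefficient at index l*(l+1) + m. The leading constants are the
+ // usual real spherical-harmonic normalizations, e.g. 0.2820947918 = 1/(2*sqrt(pi))
+ // for Y_0^0 and 0.4886025119 = sqrt(3/(4*pi)) for the z term of band 1.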
+ // + inline void sh_eval_basis_1(REAL x, REAL y, REAL z, REAL b[4]) + { + /* m=0 */ + + // l=0 + const REAL p_0_0 = CONSTANT(0.282094791773878140); + b[0] = p_0_0; // l=0,m=0 + // l=1 + const REAL p_1_0 = CONSTANT(0.488602511902919920)*z; + b[2] = p_1_0; // l=1,m=0 + + + /* m=1 */ + + const REAL s1 = y; + const REAL c1 = x; + + // l=1 + const REAL p_1_1 = CONSTANT(-0.488602511902919920); + b[1] = p_1_1*s1; // l=1,m=-1 + b[3] = p_1_1*c1; // l=1,m=+1 + } + + // routine generated programmatically for evaluating SH basis for degree 2 + // inputs (x,y,z) are a point on the sphere (i.e., must be unit length) + // output is vector b with SH basis evaluated at (x,y,z). + // + inline void sh_eval_basis_2(REAL x, REAL y, REAL z, REAL b[9]) + { + const REAL z2 = z*z; + + + /* m=0 */ + + // l=0 + const REAL p_0_0 = CONSTANT(0.282094791773878140); + b[0] = p_0_0; // l=0,m=0 + // l=1 + const REAL p_1_0 = CONSTANT(0.488602511902919920)*z; + b[2] = p_1_0; // l=1,m=0 + // l=2 + const REAL p_2_0 = CONSTANT(0.946174695757560080)*z2 + CONSTANT(-0.315391565252520050); + b[6] = p_2_0; // l=2,m=0 + + + /* m=1 */ + + const REAL s1 = y; + const REAL c1 = x; + + // l=1 + const REAL p_1_1 = CONSTANT(-0.488602511902919920); + b[1] = p_1_1*s1; // l=1,m=-1 + b[3] = p_1_1*c1; // l=1,m=+1 + // l=2 + const REAL p_2_1 = CONSTANT(-1.092548430592079200)*z; + b[5] = p_2_1*s1; // l=2,m=-1 + b[7] = p_2_1*c1; // l=2,m=+1 + + + /* m=2 */ + + const REAL s2 = x*s1 + y*c1; + const REAL c2 = x*c1 - y*s1; + + // l=2 + const REAL p_2_2 = CONSTANT(0.546274215296039590); + b[4] = p_2_2*s2; // l=2,m=-2 + b[8] = p_2_2*c2; // l=2,m=+2 + } + + // routine generated programmatically for evaluating SH basis for degree 3 + // inputs (x,y,z) are a point on the sphere (i.e., must be unit length) + // output is vector b with SH basis evaluated at (x,y,z). 
+ // + void sh_eval_basis_3(REAL x, REAL y, REAL z, REAL b[16]) + { + const REAL z2 = z*z; + + + /* m=0 */ + + // l=0 + const REAL p_0_0 = CONSTANT(0.282094791773878140); + b[0] = p_0_0; // l=0,m=0 + // l=1 + const REAL p_1_0 = CONSTANT(0.488602511902919920)*z; + b[2] = p_1_0; // l=1,m=0 + // l=2 + const REAL p_2_0 = CONSTANT(0.946174695757560080)*z2 + CONSTANT(-0.315391565252520050); + b[6] = p_2_0; // l=2,m=0 + // l=3 + const REAL p_3_0 = z*(CONSTANT(1.865881662950577000)*z2 + CONSTANT(-1.119528997770346200)); + b[12] = p_3_0; // l=3,m=0 + + + /* m=1 */ + + const REAL s1 = y; + const REAL c1 = x; + + // l=1 + const REAL p_1_1 = CONSTANT(-0.488602511902919920); + b[1] = p_1_1*s1; // l=1,m=-1 + b[3] = p_1_1*c1; // l=1,m=+1 + // l=2 + const REAL p_2_1 = CONSTANT(-1.092548430592079200)*z; + b[5] = p_2_1*s1; // l=2,m=-1 + b[7] = p_2_1*c1; // l=2,m=+1 + // l=3 + const REAL p_3_1 = CONSTANT(-2.285228997322328800)*z2 + CONSTANT(0.457045799464465770); + b[11] = p_3_1*s1; // l=3,m=-1 + b[13] = p_3_1*c1; // l=3,m=+1 + + + /* m=2 */ + + const REAL s2 = x*s1 + y*c1; + const REAL c2 = x*c1 - y*s1; + + // l=2 + const REAL p_2_2 = CONSTANT(0.546274215296039590); + b[4] = p_2_2*s2; // l=2,m=-2 + b[8] = p_2_2*c2; // l=2,m=+2 + // l=3 + const REAL p_3_2 = CONSTANT(1.445305721320277100)*z; + b[10] = p_3_2*s2; // l=3,m=-2 + b[14] = p_3_2*c2; // l=3,m=+2 + + + /* m=3 */ + + const REAL s3 = x*s2 + y*c2; + const REAL c3 = x*c2 - y*s2; + + // l=3 + const REAL p_3_3 = CONSTANT(-0.590043589926643520); + b[9] = p_3_3*s3; // l=3,m=-3 + b[15] = p_3_3*c3; // l=3,m=+3 + } + + // routine generated programmatically for evaluating SH basis for degree 4 + // inputs (x,y,z) are a point on the sphere (i.e., must be unit length) + // output is vector b with SH basis evaluated at (x,y,z). 
+ // + void sh_eval_basis_4(REAL x, REAL y, REAL z, REAL b[25]) + { + const REAL z2 = z*z; + + + /* m=0 */ + + // l=0 + const REAL p_0_0 = CONSTANT(0.282094791773878140); + b[0] = p_0_0; // l=0,m=0 + // l=1 + const REAL p_1_0 = CONSTANT(0.488602511902919920)*z; + b[2] = p_1_0; // l=1,m=0 + // l=2 + const REAL p_2_0 = CONSTANT(0.946174695757560080)*z2 + CONSTANT(-0.315391565252520050); + b[6] = p_2_0; // l=2,m=0 + // l=3 + const REAL p_3_0 = z*(CONSTANT(1.865881662950577000)*z2 + CONSTANT(-1.119528997770346200)); + b[12] = p_3_0; // l=3,m=0 + // l=4 + const REAL p_4_0 = CONSTANT(1.984313483298443000)*z*p_3_0 + CONSTANT(-1.006230589874905300)*p_2_0; + b[20] = p_4_0; // l=4,m=0 + + + /* m=1 */ + + const REAL s1 = y; + const REAL c1 = x; + + // l=1 + const REAL p_1_1 = CONSTANT(-0.488602511902919920); + b[1] = p_1_1*s1; // l=1,m=-1 + b[3] = p_1_1*c1; // l=1,m=+1 + // l=2 + const REAL p_2_1 = CONSTANT(-1.092548430592079200)*z; + b[5] = p_2_1*s1; // l=2,m=-1 + b[7] = p_2_1*c1; // l=2,m=+1 + // l=3 + const REAL p_3_1 = CONSTANT(-2.285228997322328800)*z2 + CONSTANT(0.457045799464465770); + b[11] = p_3_1*s1; // l=3,m=-1 + b[13] = p_3_1*c1; // l=3,m=+1 + // l=4 + const REAL p_4_1 = z*(CONSTANT(-4.683325804901024000)*z2 + CONSTANT(2.007139630671867200)); + b[19] = p_4_1*s1; // l=4,m=-1 + b[21] = p_4_1*c1; // l=4,m=+1 + + + /* m=2 */ + + const REAL s2 = x*s1 + y*c1; + const REAL c2 = x*c1 - y*s1; + + // l=2 + const REAL p_2_2 = CONSTANT(0.546274215296039590); + b[4] = p_2_2*s2; // l=2,m=-2 + b[8] = p_2_2*c2; // l=2,m=+2 + // l=3 + const REAL p_3_2 = CONSTANT(1.445305721320277100)*z; + b[10] = p_3_2*s2; // l=3,m=-2 + b[14] = p_3_2*c2; // l=3,m=+2 + // l=4 + const REAL p_4_2 = CONSTANT(3.311611435151459800)*z2 + CONSTANT(-0.473087347878779980); + b[18] = p_4_2*s2; // l=4,m=-2 + b[22] = p_4_2*c2; // l=4,m=+2 + + + /* m=3 */ + + const REAL s3 = x*s2 + y*c2; + const REAL c3 = x*c2 - y*s2; + + // l=3 + const REAL p_3_3 = CONSTANT(-0.590043589926643520); + b[9] = p_3_3*s3; // l=3,m=-3 + b[15] = p_3_3*c3; // l=3,m=+3 + // l=4 + const REAL p_4_3 = CONSTANT(-1.770130769779930200)*z; + b[17] = p_4_3*s3; // l=4,m=-3 + b[23] = p_4_3*c3; // l=4,m=+3 + + + /* m=4 */ + + const REAL s4 = x*s3 + y*c3; + const REAL c4 = x*c3 - y*s3; + + // l=4 + const REAL p_4_4 = CONSTANT(0.625835735449176030); + b[16] = p_4_4*s4; // l=4,m=-4 + b[24] = p_4_4*c4; // l=4,m=+4 + } + + // routine generated programmatically for evaluating SH basis for degree 5 + // inputs (x,y,z) are a point on the sphere (i.e., must be unit length) + // output is vector b with SH basis evaluated at (x,y,z). 
+ // + void sh_eval_basis_5(REAL x, REAL y, REAL z, REAL b[36]) + { + const REAL z2 = z*z; + + + /* m=0 */ + + // l=0 + const REAL p_0_0 = CONSTANT(0.282094791773878140); + b[0] = p_0_0; // l=0,m=0 + // l=1 + const REAL p_1_0 = CONSTANT(0.488602511902919920)*z; + b[2] = p_1_0; // l=1,m=0 + // l=2 + const REAL p_2_0 = CONSTANT(0.946174695757560080)*z2 + CONSTANT(-0.315391565252520050); + b[6] = p_2_0; // l=2,m=0 + // l=3 + const REAL p_3_0 = z*(CONSTANT(1.865881662950577000)*z2 + CONSTANT(-1.119528997770346200)); + b[12] = p_3_0; // l=3,m=0 + // l=4 + const REAL p_4_0 = CONSTANT(1.984313483298443000)*z*p_3_0 + CONSTANT(-1.006230589874905300)*p_2_0; + b[20] = p_4_0; // l=4,m=0 + // l=5 + const REAL p_5_0 = CONSTANT(1.989974874213239700)*z*p_4_0 + CONSTANT(-1.002853072844814000)*p_3_0; + b[30] = p_5_0; // l=5,m=0 + + + /* m=1 */ + + const REAL s1 = y; + const REAL c1 = x; + + // l=1 + const REAL p_1_1 = CONSTANT(-0.488602511902919920); + b[1] = p_1_1*s1; // l=1,m=-1 + b[3] = p_1_1*c1; // l=1,m=+1 + // l=2 + const REAL p_2_1 = CONSTANT(-1.092548430592079200)*z; + b[5] = p_2_1*s1; // l=2,m=-1 + b[7] = p_2_1*c1; // l=2,m=+1 + // l=3 + const REAL p_3_1 = CONSTANT(-2.285228997322328800)*z2 + CONSTANT(0.457045799464465770); + b[11] = p_3_1*s1; // l=3,m=-1 + b[13] = p_3_1*c1; // l=3,m=+1 + // l=4 + const REAL p_4_1 = z*(CONSTANT(-4.683325804901024000)*z2 + CONSTANT(2.007139630671867200)); + b[19] = p_4_1*s1; // l=4,m=-1 + b[21] = p_4_1*c1; // l=4,m=+1 + // l=5 + const REAL p_5_1 = CONSTANT(2.031009601158990200)*z*p_4_1 + CONSTANT(-0.991031208965114650)*p_3_1; + b[29] = p_5_1*s1; // l=5,m=-1 + b[31] = p_5_1*c1; // l=5,m=+1 + + + /* m=2 */ + + const REAL s2 = x*s1 + y*c1; + const REAL c2 = x*c1 - y*s1; + + // l=2 + const REAL p_2_2 = CONSTANT(0.546274215296039590); + b[4] = p_2_2*s2; // l=2,m=-2 + b[8] = p_2_2*c2; // l=2,m=+2 + // l=3 + const REAL p_3_2 = CONSTANT(1.445305721320277100)*z; + b[10] = p_3_2*s2; // l=3,m=-2 + b[14] = p_3_2*c2; // l=3,m=+2 + // l=4 + const REAL p_4_2 = CONSTANT(3.311611435151459800)*z2 + CONSTANT(-0.473087347878779980); + b[18] = p_4_2*s2; // l=4,m=-2 + b[22] = p_4_2*c2; // l=4,m=+2 + // l=5 + const REAL p_5_2 = z*(CONSTANT(7.190305177459987500)*z2 + CONSTANT(-2.396768392486662100)); + b[28] = p_5_2*s2; // l=5,m=-2 + b[32] = p_5_2*c2; // l=5,m=+2 + + + /* m=3 */ + + const REAL s3 = x*s2 + y*c2; + const REAL c3 = x*c2 - y*s2; + + // l=3 + const REAL p_3_3 = CONSTANT(-0.590043589926643520); + b[9] = p_3_3*s3; // l=3,m=-3 + b[15] = p_3_3*c3; // l=3,m=+3 + // l=4 + const REAL p_4_3 = CONSTANT(-1.770130769779930200)*z; + b[17] = p_4_3*s3; // l=4,m=-3 + b[23] = p_4_3*c3; // l=4,m=+3 + // l=5 + const REAL p_5_3 = CONSTANT(-4.403144694917253700)*z2 + CONSTANT(0.489238299435250430); + b[27] = p_5_3*s3; // l=5,m=-3 + b[33] = p_5_3*c3; // l=5,m=+3 + + + /* m=4 */ + + const REAL s4 = x*s3 + y*c3; + const REAL c4 = x*c3 - y*s3; + + // l=4 + const REAL p_4_4 = CONSTANT(0.625835735449176030); + b[16] = p_4_4*s4; // l=4,m=-4 + b[24] = p_4_4*c4; // l=4,m=+4 + // l=5 + const REAL p_5_4 = CONSTANT(2.075662314881041100)*z; + b[26] = p_5_4*s4; // l=5,m=-4 + b[34] = p_5_4*c4; // l=5,m=+4 + + + /* m=5 */ + + const REAL s5 = x*s4 + y*c4; + const REAL c5 = x*c4 - y*s4; + + // l=5 + const REAL p_5_5 = CONSTANT(-0.656382056840170150); + b[25] = p_5_5*s5; // l=5,m=-5 + b[35] = p_5_5*c5; // l=5,m=+5 + } + + const REAL M_PIjs = (REAL)(4.0*atan(1.0)); + const REAL maxang = (REAL)(M_PIjs / 2); + const int NSH0 = 1; + const int NSH1 = 4; + const int NSH2 = 9; + const int NSH3 = 16; + const int NSH4 = 25; 
+ const int NSH5 = 36; + const int NSH6 = 49; + const int NSH7 = 64; + const int NSH8 = 81; + const int NSH9 = 100; + const int NL0 = 1; + const int NL1 = 3; + const int NL2 = 5; + const int NL3 = 7; + const int NL4 = 9; + const int NL5 = 11; + const int NL6 = 13; + const int NL7 = 15; + const int NL8 = 17; + const int NL9 = 19; + + inline void rot(REAL ct, REAL st, REAL x, REAL y, REAL &xout, REAL &yout) + { + xout = x*ct - y*st; + yout = y*ct + x*st; + } + + inline void rot_inv(REAL ct, REAL st, REAL x, REAL y, REAL &xout, REAL &yout) + { + xout = x*ct + y*st; + yout = y*ct - x*st; + } + + inline void rot_1(REAL ct, REAL st, REAL ctm[1], REAL stm[1]) + { + ctm[0] = ct; + stm[0] = st; + } + + inline void rot_2(REAL ct, REAL st, REAL ctm[2], REAL stm[2]) + { + REAL ct2 = CONSTANT(2.0)*ct; + ctm[0] = ct; + stm[0] = st; + ctm[1] = ct2*ct - CONSTANT(1.0); + stm[1] = ct2*st; + } + + inline void rot_3(REAL ct, REAL st, REAL ctm[3], REAL stm[3]) + { + REAL ct2 = CONSTANT(2.0)*ct; + ctm[0] = ct; + stm[0] = st; + ctm[1] = ct2*ct - CONSTANT(1.0); + stm[1] = ct2*st; + ctm[2] = ct2*ctm[1] - ct; + stm[2] = ct2*stm[1] - st; + } + + inline void rot_4(REAL ct, REAL st, REAL ctm[4], REAL stm[4]) + { + REAL ct2 = CONSTANT(2.0)*ct; + ctm[0] = ct; + stm[0] = st; + ctm[1] = ct2*ct - CONSTANT(1.0); + stm[1] = ct2*st; + ctm[2] = ct2*ctm[1] - ct; + stm[2] = ct2*stm[1] - st; + ctm[3] = ct2*ctm[2] - ctm[1]; + stm[3] = ct2*stm[2] - stm[1]; + } + + inline void rot_5(REAL ct, REAL st, REAL ctm[5], REAL stm[5]) + { + REAL ct2 = CONSTANT(2.0)*ct; + ctm[0] = ct; + stm[0] = st; + ctm[1] = ct2*ct - CONSTANT(1.0); + stm[1] = ct2*st; + ctm[2] = ct2*ctm[1] - ct; + stm[2] = ct2*stm[1] - st; + ctm[3] = ct2*ctm[2] - ctm[1]; + stm[3] = ct2*stm[2] - stm[1]; + ctm[4] = ct2*ctm[3] - ctm[2]; + stm[4] = ct2*stm[3] - stm[2]; + } + + inline void sh_rotz_1(REAL ctm[1], REAL stm[1], REAL y[NL1], REAL yr[NL1]) + { + yr[1] = y[1]; + rot_inv(ctm[0], stm[0], y[0], y[2], yr[0], yr[2]); + } + + inline void sh_rotz_2(REAL ctm[2], REAL stm[2], REAL y[NL2], REAL yr[NL2]) + { + yr[2] = y[2]; + rot_inv(ctm[0], stm[0], y[1], y[3], yr[1], yr[3]); + rot_inv(ctm[1], stm[1], y[0], y[4], yr[0], yr[4]); + } + + inline void sh_rotz_3(REAL ctm[3], REAL stm[3], REAL y[NL3], REAL yr[NL3]) + { + yr[3] = y[3]; + rot_inv(ctm[0], stm[0], y[2], y[4], yr[2], yr[4]); + rot_inv(ctm[1], stm[1], y[1], y[5], yr[1], yr[5]); + rot_inv(ctm[2], stm[2], y[0], y[6], yr[0], yr[6]); + } + + inline void sh_rotz_4(REAL ctm[4], REAL stm[4], REAL y[NL4], REAL yr[NL4]) + { + yr[4] = y[4]; + rot_inv(ctm[0], stm[0], y[3], y[5], yr[3], yr[5]); + rot_inv(ctm[1], stm[1], y[2], y[6], yr[2], yr[6]); + rot_inv(ctm[2], stm[2], y[1], y[7], yr[1], yr[7]); + rot_inv(ctm[3], stm[3], y[0], y[8], yr[0], yr[8]); + } + + inline void sh_rotz_5(REAL ctm[5], REAL stm[5], REAL y[NL5], REAL yr[NL5]) + { + yr[5] = y[5]; + rot_inv(ctm[0], stm[0], y[4], y[6], yr[4], yr[6]); + rot_inv(ctm[1], stm[1], y[3], y[7], yr[3], yr[7]); + rot_inv(ctm[2], stm[2], y[2], y[8], yr[2], yr[8]); + rot_inv(ctm[3], stm[3], y[1], y[9], yr[1], yr[9]); + rot_inv(ctm[4], stm[4], y[0], y[10], yr[0], yr[10]); + } + + // rotation code generated programmatically by rotatex (2000x4000 samples, eps=1e-008) + + const REAL fx_1_001 = (REAL)(sqrt(1.0) / 1.0); // 1 + const REAL fx_1_002 = (REAL)(-sqrt(1.0) / 1.0); // -1.00000030843 + + inline void sh_rotx90_1(REAL y[], REAL yr[]) + { + yr[0] = fx_1_001*y[1]; + yr[1] = fx_1_002*y[0]; + yr[2] = fx_1_001*y[2]; + }; + + inline void sh_rotx90_inv_1(REAL y[], REAL yr[]) + { + yr[0] = 
fx_1_002*y[1]; + yr[1] = fx_1_001*y[0]; + yr[2] = fx_1_001*y[2]; + } + + const REAL fx_2_001 = (REAL)(sqrt(4.0) / 2.0); // 1 + const REAL fx_2_002 = (REAL)(-sqrt(4.0) / 2.0); // -1 + const REAL fx_2_003 = (REAL)(-sqrt(1.0) / 2.0); // -0.500000257021 + const REAL fx_2_004 = (REAL)(-sqrt(3.0) / 2.0); // -0.866025848959 + const REAL fx_2_005 = (REAL)(sqrt(1.0) / 2.0); // 0.5 + + inline void sh_rotx90_2(REAL y[], REAL yr[]) + { + yr[0] = fx_2_001*y[3]; + yr[1] = fx_2_002*y[1]; + yr[2] = fx_2_003*y[2] + fx_2_004*y[4]; + yr[3] = fx_2_002*y[0]; + yr[4] = fx_2_004*y[2] + fx_2_005*y[4]; + }; + + inline void sh_rotx90_inv_2(REAL y[], REAL yr[]) + { + yr[0] = fx_2_002*y[3]; + yr[1] = fx_2_002*y[1]; + yr[2] = fx_2_003*y[2] + fx_2_004*y[4]; + yr[3] = fx_2_001*y[0]; + yr[4] = fx_2_004*y[2] + fx_2_005*y[4]; + } + + const REAL fx_3_001 = (REAL)(-sqrt(10.0) / 4.0); // -0.790569415042 + const REAL fx_3_002 = (REAL)(sqrt(6.0) / 4.0); // 0.612372435696 + const REAL fx_3_003 = (REAL)(-sqrt(16.0) / 4.0); // -1 + const REAL fx_3_004 = (REAL)(-sqrt(6.0) / 4.0); // -0.612372435695 + const REAL fx_3_005 = (REAL)(-sqrt(1.0) / 4.0); // -0.25 + const REAL fx_3_006 = (REAL)(-sqrt(15.0) / 4.0); // -0.968245836551 + const REAL fx_3_007 = (REAL)(sqrt(1.0) / 4.0); // 0.25 + const REAL fx_3_008 = (REAL)(sqrt(10.0) / 4.0); // 0.790569983984 + + inline void sh_rotx90_3(REAL y[], REAL yr[]) + { + yr[0] = fx_3_001*y[3] + fx_3_002*y[5]; + yr[1] = fx_3_003*y[1]; + yr[2] = fx_3_004*y[3] + fx_3_001*y[5]; + yr[3] = fx_3_008*y[0] + fx_3_002*y[2]; + yr[4] = fx_3_005*y[4] + fx_3_006*y[6]; + yr[5] = fx_3_004*y[0] - fx_3_001*y[2]; + yr[6] = fx_3_006*y[4] + fx_3_007*y[6]; + }; + + inline void sh_rotx90_inv_3(REAL y[], REAL yr[]) + { + yr[0] = fx_3_008*y[3] + fx_3_004*y[5]; + yr[1] = fx_3_003*y[1]; + yr[2] = fx_3_002*y[3] - fx_3_001*y[5]; + yr[3] = fx_3_001*y[0] + fx_3_004*y[2]; + yr[4] = fx_3_005*y[4] + fx_3_006*y[6]; + yr[5] = fx_3_002*y[0] + fx_3_001*y[2]; + yr[6] = fx_3_006*y[4] + fx_3_007*y[6]; + } + + const REAL fx_4_001 = (REAL)(-sqrt(56.0) / 8.0); // -0.935414346694 + const REAL fx_4_002 = (REAL)(sqrt(8.0) / 8.0); // 0.353553390593 + const REAL fx_4_003 = (REAL)(-sqrt(36.0) / 8.0); // -0.75 + const REAL fx_4_004 = (REAL)(sqrt(28.0) / 8.0); // 0.661437827766 + const REAL fx_4_005 = (REAL)(-sqrt(8.0) / 8.0); // -0.353553390593 + const REAL fx_4_006 = (REAL)(sqrt(36.0) / 8.0); // 0.749999999999 + const REAL fx_4_007 = (REAL)(sqrt(9.0) / 8.0); // 0.37500034698 + const REAL fx_4_008 = (REAL)(sqrt(20.0) / 8.0); // 0.559017511622 + const REAL fx_4_009 = (REAL)(sqrt(35.0) / 8.0); // 0.739510657141 + const REAL fx_4_010 = (REAL)(sqrt(16.0) / 8.0); // 0.5 + const REAL fx_4_011 = (REAL)(-sqrt(28.0) / 8.0); // -0.661437827766 + const REAL fx_4_012 = (REAL)(sqrt(1.0) / 8.0); // 0.125 + const REAL fx_4_013 = (REAL)(sqrt(56.0) / 8.0); // 0.935414346692 + + inline void sh_rotx90_4(REAL y[], REAL yr[]) + { + yr[0] = fx_4_001*y[5] + fx_4_002*y[7]; + yr[1] = fx_4_003*y[1] + fx_4_004*y[3]; + yr[2] = fx_4_005*y[5] + fx_4_001*y[7]; + yr[3] = fx_4_004*y[1] + fx_4_006*y[3]; + yr[4] = fx_4_007*y[4] + fx_4_008*y[6] + fx_4_009*y[8]; + yr[5] = fx_4_013*y[0] + fx_4_002*y[2]; + yr[6] = fx_4_008*y[4] + fx_4_010*y[6] + fx_4_011*y[8]; + yr[7] = fx_4_005*y[0] - fx_4_001*y[2]; + yr[8] = fx_4_009*y[4] + fx_4_011*y[6] + fx_4_012*y[8]; + }; + + inline void sh_rotx90_inv_4(REAL y[], REAL yr[]) + { + yr[0] = fx_4_013*y[5] + fx_4_005*y[7]; + yr[1] = fx_4_003*y[1] + fx_4_004*y[3]; + yr[2] = fx_4_002*y[5] - fx_4_001*y[7]; + yr[3] = fx_4_004*y[1] + fx_4_006*y[3]; + yr[4] = 
fx_4_007*y[4] + fx_4_008*y[6] + fx_4_009*y[8]; + yr[5] = fx_4_001*y[0] + fx_4_005*y[2]; + yr[6] = fx_4_008*y[4] + fx_4_010*y[6] + fx_4_011*y[8]; + yr[7] = fx_4_002*y[0] + fx_4_001*y[2]; + yr[8] = fx_4_009*y[4] + fx_4_011*y[6] + fx_4_012*y[8]; + } + + const REAL fx_5_001 = (REAL)(sqrt(126.0) / 16.0); // 0.70156076002 + const REAL fx_5_002 = (REAL)(-sqrt(120.0) / 16.0); // -0.684653196882 + const REAL fx_5_003 = (REAL)(sqrt(10.0) / 16.0); // 0.197642353761 + const REAL fx_5_004 = (REAL)(-sqrt(64.0) / 16.0); // -0.5 + const REAL fx_5_005 = (REAL)(sqrt(192.0) / 16.0); // 0.866025403784 + const REAL fx_5_006 = (REAL)(sqrt(70.0) / 16.0); // 0.522912516584 + const REAL fx_5_007 = (REAL)(sqrt(24.0) / 16.0); // 0.306186217848 + const REAL fx_5_008 = (REAL)(-sqrt(162.0) / 16.0); // -0.795495128835 + const REAL fx_5_009 = (REAL)(sqrt(64.0) / 16.0); // 0.5 + const REAL fx_5_010 = (REAL)(sqrt(60.0) / 16.0); // 0.484122918274 + const REAL fx_5_011 = (REAL)(sqrt(112.0) / 16.0); // 0.661437827763 + const REAL fx_5_012 = (REAL)(sqrt(84.0) / 16.0); // 0.572821961867 + const REAL fx_5_013 = (REAL)(sqrt(4.0) / 16.0); // 0.125 + const REAL fx_5_014 = (REAL)(sqrt(42.0) / 16.0); // 0.405046293649 + const REAL fx_5_015 = (REAL)(sqrt(210.0) / 16.0); // 0.905711046633 + const REAL fx_5_016 = (REAL)(sqrt(169.0) / 16.0); // 0.8125 + const REAL fx_5_017 = (REAL)(-sqrt(45.0) / 16.0); // -0.419262745781 + const REAL fx_5_018 = (REAL)(sqrt(1.0) / 16.0); // 0.0625 + const REAL fx_5_019 = (REAL)(-sqrt(126.0) / 16.0); // -0.701561553415 + const REAL fx_5_020 = (REAL)(sqrt(120.0) / 16.0); // 0.684653196881 + const REAL fx_5_021 = (REAL)(-sqrt(10.0) / 16.0); // -0.197642353761 + const REAL fx_5_022 = (REAL)(-sqrt(70.0) / 16.0); // -0.522913107945 + const REAL fx_5_023 = (REAL)(-sqrt(60.0) / 16.0); // -0.48412346577 + + inline void sh_rotx90_5(REAL y[], REAL yr[]) + { + yr[0] = fx_5_001*y[5] + fx_5_002*y[7] + fx_5_003*y[9]; + yr[1] = fx_5_004*y[1] + fx_5_005*y[3]; + yr[2] = fx_5_006*y[5] + fx_5_007*y[7] + fx_5_008*y[9]; + yr[3] = fx_5_005*y[1] + fx_5_009*y[3]; + yr[4] = fx_5_010*y[5] + fx_5_011*y[7] + fx_5_012*y[9]; + yr[5] = fx_5_019*y[0] + fx_5_022*y[2] + fx_5_023*y[4]; + yr[6] = fx_5_013*y[6] + fx_5_014*y[8] + fx_5_015*y[10]; + yr[7] = fx_5_020*y[0] - fx_5_007*y[2] - fx_5_011*y[4]; + yr[8] = fx_5_014*y[6] + fx_5_016*y[8] + fx_5_017*y[10]; + yr[9] = fx_5_021*y[0] - fx_5_008*y[2] - fx_5_012*y[4]; + yr[10] = fx_5_015*y[6] + fx_5_017*y[8] + fx_5_018*y[10]; + }; + + inline void sh_rotx90_inv_5(REAL y[], REAL yr[]) + { + yr[0] = fx_5_019*y[5] + fx_5_020*y[7] + fx_5_021*y[9]; + yr[1] = fx_5_004*y[1] + fx_5_005*y[3]; + yr[2] = fx_5_022*y[5] - fx_5_007*y[7] - fx_5_008*y[9]; + yr[3] = fx_5_005*y[1] + fx_5_009*y[3]; + yr[4] = fx_5_023*y[5] - fx_5_011*y[7] - fx_5_012*y[9]; + yr[5] = fx_5_001*y[0] + fx_5_006*y[2] + fx_5_010*y[4]; + yr[6] = fx_5_013*y[6] + fx_5_014*y[8] + fx_5_015*y[10]; + yr[7] = fx_5_002*y[0] + fx_5_007*y[2] + fx_5_011*y[4]; + yr[8] = fx_5_014*y[6] + fx_5_016*y[8] + fx_5_017*y[10]; + yr[9] = fx_5_003*y[0] + fx_5_008*y[2] + fx_5_012*y[4]; + yr[10] = fx_5_015*y[6] + fx_5_017*y[8] + fx_5_018*y[10]; + } + + inline void sh_rot_1(REAL m[3 * 3], REAL y[NL1], REAL yr[NL1]) + { + REAL yr0 = m[4] * y[0] - m[5] * y[1] + m[3] * y[2]; + REAL yr1 = m[8] * y[1] - m[7] * y[0] - m[6] * y[2]; + REAL yr2 = m[1] * y[0] - m[2] * y[1] + m[0] * y[2]; + + yr[0] = yr0; + yr[1] = yr1; + yr[2] = yr2; + } + + inline void sh_roty_1(REAL ctm[1], REAL stm[1], REAL y[NL1], REAL yr[NL1]) + { + yr[0] = y[0]; + rot_inv(ctm[0], stm[0], y[1], y[2], yr[1], 
yr[2]); + } + + inline void sh_roty_2(REAL ctm[2], REAL stm[2], REAL y[NL2], REAL yr[NL2]) + { + REAL ytmp[NL2]; + sh_rotx90_2(y, yr); + sh_rotz_2(ctm, stm, yr, ytmp); + sh_rotx90_inv_2(ytmp, yr); + } + + inline void sh_roty_3(REAL ctm[3], REAL stm[3], REAL y[NL3], REAL yr[NL3]) + { + REAL ytmp[NL3]; + sh_rotx90_3(y, yr); + sh_rotz_3(ctm, stm, yr, ytmp); + sh_rotx90_inv_3(ytmp, yr); + } + + inline void sh_roty_4(REAL ctm[4], REAL stm[4], REAL y[NL4], REAL yr[NL4]) + { + REAL ytmp[NL4]; + sh_rotx90_4(y, yr); + sh_rotz_4(ctm, stm, yr, ytmp); + sh_rotx90_inv_4(ytmp, yr); + } + + inline void sh_roty_5(REAL ctm[5], REAL stm[5], REAL y[NL5], REAL yr[NL5]) + { + REAL ytmp[NL5]; + sh_rotx90_5(y, yr); + sh_rotz_5(ctm, stm, yr, ytmp); + sh_rotx90_inv_5(ytmp, yr); + } + +#define ROT_TOL CONSTANT(1e-4) + + /* + Finds cosine,sine pairs for zyz rotation (i.e. rotation R_z2 R_y R_z1 v). + The rotation is one which maps mx to (1,0,0) and mz to (0,0,1). + */ + inline void zyz(REAL m[3 * 3], REAL &zc1, REAL &zs1, REAL &yc, REAL &ys, REAL &zc2, REAL &zs2) + { + REAL cz = m[8]; + + // rotate so that (cx,cy,0) aligns to (1,0,0) + REAL cxylen = (REAL)sqrtf(1.0f - cz*cz); + if (cxylen >= ROT_TOL) + { + // if above is a NaN, will do the correct thing + yc = cz; + ys = cxylen; + REAL len67inv = 1.0f / sqrtf(m[6] * m[6] + m[7] * m[7]); + zc1 = -m[6] * len67inv; + zs1 = m[7] * len67inv; + REAL len25inv = 1.0f / sqrtf(m[2] * m[2] + m[5] * m[5]); + zc2 = m[2] * len25inv; + zs2 = m[5] * len25inv; + } + else { // m[6],m[7],m[8] already aligned to (0,0,1) + zc1 = 1.0; zs1 = 0.0; // identity + yc = cz; ys = 0.0; // identity + zc2 = m[0] * cz; zs2 = -m[1]; // align x axis (mx[0],mx[1],0) to (1,0,0) + } + } + + inline void sh_rotzyz_2(REAL zc1m[2], REAL zs1m[2], REAL ycm[2], REAL ysm[2], REAL zc2m[2], REAL zs2m[2], REAL y[NL2], REAL yr[NL2]) + { + REAL ytmp[NL2]; + sh_rotz_2(zc1m, zs1m, y, yr); + sh_roty_2(ycm, ysm, yr, ytmp); + sh_rotz_2(zc2m, zs2m, ytmp, yr); + } + + inline void sh_rotzyz_3(REAL zc1m[3], REAL zs1m[3], REAL ycm[3], REAL ysm[3], REAL zc2m[3], REAL zs2m[3], REAL y[NL3], REAL yr[NL3]) + { + REAL ytmp[NL3]; + sh_rotz_3(zc1m, zs1m, y, yr); + sh_roty_3(ycm, ysm, yr, ytmp); + sh_rotz_3(zc2m, zs2m, ytmp, yr); + } + + inline void sh_rotzyz_4(REAL zc1m[4], REAL zs1m[4], REAL ycm[4], REAL ysm[4], REAL zc2m[4], REAL zs2m[4], REAL y[NL4], REAL yr[NL4]) + { + REAL ytmp[NL4]; + sh_rotz_4(zc1m, zs1m, y, yr); + sh_roty_4(ycm, ysm, yr, ytmp); + sh_rotz_4(zc2m, zs2m, ytmp, yr); + } + + inline void sh_rotzyz_5(REAL zc1m[5], REAL zs1m[5], REAL ycm[5], REAL ysm[5], REAL zc2m[5], REAL zs2m[5], REAL y[NL5], REAL yr[NL5]) + { + REAL ytmp[NL5]; + sh_rotz_5(zc1m, zs1m, y, yr); + sh_roty_5(ycm, ysm, yr, ytmp); + sh_rotz_5(zc2m, zs2m, ytmp, yr); + } + + inline void sh3_rot(REAL m[3 * 3], REAL zc1, REAL zs1, REAL yc, REAL ys, REAL zc2, REAL zs2, REAL y[NSH3], REAL yr[NSH3]) + { + REAL zc1m[3], zs1m[3]; + rot_3(zc1, zs1, zc1m, zs1m); + REAL ycm[3], ysm[3]; + rot_3(yc, ys, ycm, ysm); + REAL zc2m[3], zs2m[3]; + rot_3(zc2, zs2, zc2m, zs2m); + + yr[0] = y[0]; + sh_rot_1(m, y + NSH0, yr + NSH0); + sh_rotzyz_2(zc1m, zs1m, ycm, ysm, zc2m, zs2m, y + NSH1, yr + NSH1); + sh_rotzyz_3(zc1m, zs1m, ycm, ysm, zc2m, zs2m, y + NSH2, yr + NSH2); + } + + inline void sh4_rot(REAL m[3 * 3], REAL zc1, REAL zs1, REAL yc, REAL ys, REAL zc2, REAL zs2, REAL y[NSH4], REAL yr[NSH4]) + { + REAL zc1m[4], zs1m[4]; + rot_4(zc1, zs1, zc1m, zs1m); + REAL ycm[4], ysm[4]; + rot_4(yc, ys, ycm, ysm); + REAL zc2m[4], zs2m[4]; + rot_4(zc2, zs2, zc2m, zs2m); + + yr[0] = y[0]; 
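+ // the l=0 term is rotation invariant; the l=1 band is rotated directly by m, and
+ // bands l=2..4 are rotated via the precomputed z-y-z factors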
+ sh_rot_1(m, y + NSH0, yr + NSH0); + sh_rotzyz_2(zc1m, zs1m, ycm, ysm, zc2m, zs2m, y + NSH1, yr + NSH1); + sh_rotzyz_3(zc1m, zs1m, ycm, ysm, zc2m, zs2m, y + NSH2, yr + NSH2); + sh_rotzyz_4(zc1m, zs1m, ycm, ysm, zc2m, zs2m, y + NSH3, yr + NSH3); + } + + inline void sh5_rot(REAL m[3 * 3], REAL zc1, REAL zs1, REAL yc, REAL ys, REAL zc2, REAL zs2, REAL y[NSH5], REAL yr[NSH5]) + { + REAL zc1m[5], zs1m[5]; + rot_5(zc1, zs1, zc1m, zs1m); + REAL ycm[5], ysm[5]; + rot_5(yc, ys, ycm, ysm); + REAL zc2m[5], zs2m[5]; + rot_5(zc2, zs2, zc2m, zs2m); + + yr[0] = y[0]; + sh_rot_1(m, y + NSH0, yr + NSH0); + sh_rotzyz_2(zc1m, zs1m, ycm, ysm, zc2m, zs2m, y + NSH1, yr + NSH1); + sh_rotzyz_3(zc1m, zs1m, ycm, ysm, zc2m, zs2m, y + NSH2, yr + NSH2); + sh_rotzyz_4(zc1m, zs1m, ycm, ysm, zc2m, zs2m, y + NSH3, yr + NSH3); + sh_rotzyz_5(zc1m, zs1m, ycm, ysm, zc2m, zs2m, y + NSH4, yr + NSH4); + } + + inline void sh1_rot(REAL m[3 * 3], REAL y[NSH1], REAL yr[NSH1]) + { + yr[0] = y[0]; + sh_rot_1(m, y + NSH0, yr + NSH0); + } + + inline void sh3_rot(REAL m[3 * 3], REAL y[NSH3], REAL yr[NSH3]) + { + REAL zc1, zs1, yc, ys, zc2, zs2; + zyz(m, zc1, zs1, yc, ys, zc2, zs2); + sh3_rot(m, zc1, zs1, yc, ys, zc2, zs2, y, yr); + } + + inline void sh4_rot(REAL m[3 * 3], REAL y[NSH4], REAL yr[NSH4]) + { + REAL zc1, zs1, yc, ys, zc2, zs2; + zyz(m, zc1, zs1, yc, ys, zc2, zs2); + sh4_rot(m, zc1, zs1, yc, ys, zc2, zs2, y, yr); + } + + inline void sh5_rot(REAL m[3 * 3], REAL y[NSH5], REAL yr[NSH5]) + { + REAL zc1, zs1, yc, ys, zc2, zs2; + zyz(m, zc1, zs1, yc, ys, zc2, zs2); + sh5_rot(m, zc1, zs1, yc, ys, zc2, zs2, y, yr); + } + + // simple matrix vector multiply for a square matrix (only used by ZRotation) + inline void SimpMatMul(size_t dim, const float *matrix, const float *input, float *result) + { + for (size_t iR = 0; iR < dim; ++iR) + { + result[iR + 0] = matrix[iR*dim + 0] * input[0]; + for (size_t iC = 1; iC < dim; ++iC) + { + result[iR] += matrix[iR*dim + iC] * input[iC]; + } + } + } + +}; // anonymous namespace + + +//------------------------------------------------------------------------------------- +// Evaluates the Spherical Harmonic basis functions +// +// http://msdn.microsoft.com/en-us/library/windows/desktop/bb205448.aspx +//------------------------------------------------------------------------------------- +_Use_decl_annotations_ +float* XM_CALLCONV DirectX::XMSHEvalDirection( + float *result, + size_t order, + FXMVECTOR dir) noexcept +{ + if (!result) + return nullptr; + + XMFLOAT4A dv; + XMStoreFloat4A(&dv, dir); + + const float fX = dv.x; + const float fY = dv.y; + const float fZ = dv.z; + + switch (order) + { + case 2: + sh_eval_basis_1(fX, fY, fZ, result); + break; + + case 3: + sh_eval_basis_2(fX, fY, fZ, result); + break; + + case 4: + sh_eval_basis_3(fX, fY, fZ, result); + break; + + case 5: + sh_eval_basis_4(fX, fY, fZ, result); + break; + + case 6: + sh_eval_basis_5(fX, fY, fZ, result); + break; + + default: + assert(order < XM_SH_MINORDER || order > XM_SH_MAXORDER); + return nullptr; + } + + return result; +} + + +//------------------------------------------------------------------------------------- +// Rotates SH vector by a rotation matrix +// +// http://msdn.microsoft.com/en-us/library/windows/desktop/bb204992.aspx +//------------------------------------------------------------------------------------- +_Use_decl_annotations_ +float* XM_CALLCONV DirectX::XMSHRotate( + float *result, + size_t order, + FXMMATRIX rotMatrix, + const float *input) noexcept +{ + if (!result || !input) + return nullptr; + + if 
(result == input) + return nullptr; + + XMFLOAT3X3 mat; + XMStoreFloat3x3(&mat, rotMatrix); + + float mRot[3 * 3]; + const float r00 = mRot[0 * 3 + 0] = mat._11; + const float r10 = mRot[1 * 3 + 0] = mat._12; + const float r20 = mRot[2 * 3 + 0] = mat._13; + + const float r01 = mRot[0 * 3 + 1] = mat._21; + const float r11 = mRot[1 * 3 + 1] = mat._22; + const float r21 = mRot[2 * 3 + 1] = mat._23; + + const float r02 = mRot[0 * 3 + 2] = mat._31; + const float r12 = mRot[1 * 3 + 2] = mat._32; + const float r22 = mRot[2 * 3 + 2] = mat._33; + + result[0] = input[0]; // rotate the constant term + + switch (order) + { + case 2: + { + // do linear by hand... + + result[1] = r11*input[1] - r12*input[2] + r10*input[3]; + result[2] = -r21*input[1] + r22*input[2] - r20*input[3]; + result[3] = r01*input[1] - r02*input[2] + r00*input[3]; + } + break; + + case 3: + { + float R[25]; + // do linear by hand... + + result[1] = r11*input[1] - r12*input[2] + r10*input[3]; + result[2] = -r21*input[1] + r22*input[2] - r20*input[3]; + result[3] = r01*input[1] - r02*input[2] + r00*input[3]; + + // direct code for quadratics is faster than ZYZ reccurence relations + + const float t41 = r01 * r00; + const float t43 = r11 * r10; + const float t48 = r11 * r12; + const float t50 = r01 * r02; + const float t55 = r02 * r02; + const float t57 = r22 * r22; + const float t58 = r12 * r12; + const float t61 = r00 * r02; + const float t63 = r10 * r12; + const float t68 = r10 * r10; + const float t70 = r01 * r01; + const float t72 = r11 * r11; + const float t74 = r00 * r00; + const float t76 = r21 * r21; + const float t78 = r20 * r20; + + const float v173 = 0.1732050808e1f; + const float v577 = 0.5773502693e0f; + const float v115 = 0.1154700539e1f; + const float v288 = 0.2886751347e0f; + const float v866 = 0.8660254040e0f; + + R[0] = r11 * r00 + r01 * r10; + R[1] = -r01 * r12 - r11 * r02; + R[2] = v173 * r02 * r12; + R[3] = -r10 * r02 - r00 * r12; + R[4] = r00 * r10 - r01 * r11; + R[5] = -r11 * r20 - r21 * r10; + R[6] = r11 * r22 + r21 * r12; + R[7] = -v173 * r22 * r12; + R[8] = r20 * r12 + r10 * r22; + R[9] = -r10 * r20 + r11 * r21; + R[10] = -v577* (t41 + t43) + v115 * r21 * r20; + R[11] = v577* (t48 + t50) - v115 * r21 * r22; + R[12] = -0.5000000000e0f * (t55 + t58) + t57; + R[13] = v577 * (t61 + t63) - v115 * r20 * r22; + R[14] = v288 * (t70 - t68 + t72 - t74) - v577 * (t76 - t78); + R[15] = -r01 * r20 - r21 * r00; + R[16] = r01 * r22 + r21 * r02; + R[17] = -v173 * r22 * r02; + R[18] = r00 * r22 + r20 * r02; + R[19] = -r00 * r20 + r01 * r21; + R[20] = t41 - t43; + R[21] = -t50 + t48; + R[22] = v866 * (t55 - t58); + R[23] = t63 - t61; + R[24] = 0.5000000000e0f *(t74 - t68 - t70 + t72); + + // blow the matrix multiply out by hand, looping is ineficient on a P4... 
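+ // R is the dense 5x5 rotation for the quadratic (l=2) band, stored row-major,
+ // so the loop computes result[4..8] = R * input[4..8]: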
+ for (unsigned int iR = 0; iR < 5; iR++) + { + const unsigned int uBase = iR * 5; + result[4 + iR] = R[uBase + 0] * input[4] + R[uBase + 1] * input[5] + R[uBase + 2] * input[6] + R[uBase + 3] * input[7] + R[uBase + 4] * input[8]; + } + } + break; + + case 4: + sh3_rot(mRot, const_cast(input), result); + break; + + case 5: + sh4_rot(mRot, const_cast(input), result); + break; + + case 6: + sh5_rot(mRot, const_cast(input), result); + break; + + default: + assert(order < XM_SH_MINORDER || order > XM_SH_MAXORDER); + return nullptr; + } + + return result; +} + + +//------------------------------------------------------------------------------------- +// Rotates the SH vector in the Z axis by an angle +// +// http://msdn.microsoft.com/en-us/library/windows/desktop/bb205461.aspx +//------------------------------------------------------------------------------------- +_Use_decl_annotations_ +float* DirectX::XMSHRotateZ( + float *result, + size_t order, + float angle, + const float *input) noexcept +{ + if (!result || !input) + return nullptr; + + if (result == input) + return nullptr; + + if (order < XM_SH_MINORDER || order > XM_SH_MAXORDER) + return nullptr; + + float R[(2 * (XM_SH_MAXORDER - 1) + 1)*(2 * (XM_SH_MAXORDER - 1) + 1)]; // used to store rotation matrices... + + // these are actually very sparse matrices, most of the entries are zero's... + + const float ca = cosf(angle); + const float sa = sinf(angle); + + const float t1 = ca; + const float t2 = sa; + R[0] = t1; + R[1] = 0.0f; + R[2] = t2; + R[3] = 0.0f; + R[4] = 1.0f; + R[5] = 0.0f; + R[6] = -t2; + R[7] = 0.0f; + R[8] = t1; + + result[0] = input[0]; + SimpMatMul(3, R, input + 1, result + 1); + + if (order > 2) + { + for (int j = 0; j < 5 * 5; j++) R[j] = 0.0f; + const float t1 = sa; + const float t2 = t1*t1; + const float t3 = ca; + const float t4 = t3*t3; + const float t5 = -t2 + t4; + const float t7 = 2.0f*t3*t1; + R[0] = t5; + R[4] = t7; + R[6] = t3; + R[8] = t1; + R[12] = 1.0f; + R[16] = -t1; + R[18] = t3; + R[20] = -t7; + R[24] = t5; + + SimpMatMul(5, R, input + 4, result + 4); // un-roll matrix/vector multiply + if (order > 3) + { + for (int j = 0; j < 7 * 7; j++) R[j] = 0.0f; + const float t1 = ca; + const float t2 = t1*t1; + const float t4 = sa; + const float t5 = t4*t4; + const float t8 = t2*t1 - 3.0f*t1*t5; + const float t12 = 3.0f*t4*t2 - t5*t4; + const float t13 = -t5 + t2; + const float t15 = 2.0f*t1*t4; + R[0] = t8; + R[6] = t12; + R[8] = t13; + R[12] = t15; + R[16] = t1; + R[18] = t4; + R[24] = 1.0f; + R[30] = -t4; + R[32] = t1; + R[36] = -t15; + R[40] = t13; + R[42] = -t12; + R[48] = t8; + SimpMatMul(7, R, input + 9, result + 9); + if (order > 4) + { + for (int j = 0; j <= 9 * 9; j++) R[j] = 0.0f; + const float t1 = ca; + const float t2 = t1*t1; + const float t3 = t2*t2; + const float t4 = sa; + const float t5 = t4*t4; + const float t6 = t5*t5; + const float t9 = t3 + t6 - 6.0f*t5*t2; + const float t10 = t5*t4; + const float t12 = t2*t1; + const float t14 = -t10*t1 + t4*t12; + const float t17 = t12 - 3.0f*t1*t5; + const float t20 = 3.0f*t4*t2 - t10; + const float t21 = -t5 + t2; + const float t23 = 2.0f*t1*t4; + R[0] = t9; + R[8] = 4.0f*t14; + R[10] = t17; + R[16] = t20; + R[20] = t21; + R[24] = t23; + R[30] = t1; + R[32] = t4; + R[40] = 1.0f; + R[48] = -t4; + R[50] = t1; + R[56] = -t23; + R[60] = t21; + R[64] = -t20; + R[70] = t17; + R[72] = -4.0f*t14; + R[80] = t9; + + SimpMatMul(9, R, input + 16, result + 16); + if (order > 5) + { + for (int j = 0; j < 11 * 11; j++) R[j] = 0.0f; + const float t1 = ca; + const 
float t2 = sa; + const float t3 = t2*t2; + const float t4 = t3*t3; + const float t7 = t1*t1; + const float t8 = t7*t1; + const float t11 = t7*t7; + const float t13 = 5.0f*t1*t4 - 10.0f*t3*t8 + t11*t1; + const float t14 = t3*t2; + const float t20 = -10.0f*t14*t7 + 5.0f*t2*t11 + t4*t2; + const float t23 = t11 + t4 - 6.0f*t3*t7; + const float t26 = -t14*t1 + t2*t8; + const float t29 = t8 - 3.0f*t1*t3; + const float t32 = 3.0f*t2*t7 - t14; + const float t33 = -t3 + t7; + const float t35 = 2.0f*t1*t2; + R[0] = t13; + R[10] = t20; + R[12] = t23; + R[20] = 4.0f*t26; + R[24] = t29; + R[30] = t32; + R[36] = t33; + R[40] = t35; + R[48] = t1; + R[50] = t2; + R[60] = 1.0f; + R[70] = -t2; + R[72] = t1; + R[80] = -t35; + R[84] = t33; + R[90] = -t32; + R[96] = t29; + R[100] = -4.0f*t26; + R[108] = t23; + R[110] = -t20; + R[120] = t13; + SimpMatMul(11, R, input + 25, result + 25); + } + } + } + } + + return result; +} + + +//------------------------------------------------------------------------------------- +// Adds two SH vectors, result[i] = inputA[i] + inputB[i]; +// +// http://msdn.microsoft.com/en-us/library/windows/desktop/bb205438.aspx +//------------------------------------------------------------------------------------- +_Use_decl_annotations_ +float* DirectX::XMSHAdd( + float *result, + size_t order, + const float *inputA, + const float *inputB) noexcept +{ + if (!result || !inputA || !inputB) + return nullptr; + + const size_t numcoeff = order*order; + + for (size_t i = 0; i < numcoeff; ++i) + { + result[i] = inputA[i] + inputB[i]; + } + + return result; +} + + +//------------------------------------------------------------------------------------- +// Scales a SH vector, result[i] = input[i] * scale; +// +// http://msdn.microsoft.com/en-us/library/windows/desktop/bb204994.aspx +//------------------------------------------------------------------------------------- +_Use_decl_annotations_ +float* DirectX::XMSHScale( + float *result, + size_t order, + const float *input, + float scale) noexcept +{ + if (!result || !input) + return nullptr; + + const size_t numcoeff = order*order; + + for (size_t i = 0; i < numcoeff; ++i) + { + result[i] = scale * input[i]; + } + + return result; +} + + +//------------------------------------------------------------------------------------- +// Computes the dot product of two SH vectors +// +// http://msdn.microsoft.com/en-us/library/windows/desktop/bb205446.aspx +//------------------------------------------------------------------------------------- +_Use_decl_annotations_ +float DirectX::XMSHDot( + size_t order, + const float *inputA, + const float *inputB) noexcept +{ + if (!inputA || !inputB) + return 0.f; + + float result = inputA[0] * inputB[0]; + + const size_t numcoeff = order*order; + + for (size_t i = 1; i < numcoeff; ++i) + { + result += inputA[i] * inputB[i]; + } + + return result; +} + + +//------------------------------------------------------------------------------------- +// Computes the product of two functions represented using SH (f and g), where: +// result[i] = int(y_i(s) * f(s) * g(s)), where y_i(s) is the ith SH basis +// function, f(s) and g(s) are SH functions (sum_i(y_i(s)*c_i)). The order O +// determines the lengths of the arrays, where there should always be O^2 +// coefficients. In general the product of two SH functions of order O generates +// and SH function of order 2*O - 1, but we truncate the result. This means +// that the product commutes (f*g == g*f) but doesn't associate +// (f*(g*h) != (f*g)*h. 
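+// Usage sketch (buffer names here are illustrative, not part of the library):
+// for two order-3 inputs f[9] and g[9] (order O uses O*O floats),
+//
+//     float fg[9];
+//     XMSHMultiply(fg, 3, f, g);   // equivalent to XMSHMultiply3(fg, f, g)
+//
+// computes the truncated product. The exact product would be order 2*3 - 1 = 5
+// (25 coefficients), and dropping those higher bands is why f*g == g*f holds
+// while (f*g)*h generally differs from f*(g*h).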
+//------------------------------------------------------------------------------------- +_Use_decl_annotations_ +float* DirectX::XMSHMultiply( + float *result, + size_t order, + const float *inputF, + const float *inputG) noexcept +{ + switch (order) + { + case 2: + return XMSHMultiply2(result, inputF, inputG); + + case 3: + return XMSHMultiply3(result, inputF, inputG); + + case 4: + return XMSHMultiply4(result, inputF, inputG); + + case 5: + return XMSHMultiply5(result, inputF, inputG); + + case 6: + return XMSHMultiply6(result, inputF, inputG); + + default: + assert(order < XM_SH_MINORDER || order > XM_SH_MAXORDER); + return nullptr; + } +} + + +//------------------------------------------------------------------------------------- +// http://msdn.microsoft.com/en-us/library/windows/desktop/bb205454.aspx +//------------------------------------------------------------------------------------- +_Use_decl_annotations_ +float* DirectX::XMSHMultiply2( + float *y, + const float *f, + const float *g) noexcept +{ + if (!y || !f || !g) + return nullptr; + + REAL tf, tg, t; + // [0,0]: 0, + y[0] = CONSTANT(0.282094792935999980)*f[0] * g[0]; + + // [1,1]: 0, + tf = CONSTANT(0.282094791773000010)*f[0]; + tg = CONSTANT(0.282094791773000010)*g[0]; + y[1] = tf*g[1] + tg*f[1]; + t = f[1] * g[1]; + y[0] += CONSTANT(0.282094791773000010)*t; + + // [2,2]: 0, + tf = CONSTANT(0.282094795249000000)*f[0]; + tg = CONSTANT(0.282094795249000000)*g[0]; + y[2] = tf*g[2] + tg*f[2]; + t = f[2] * g[2]; + y[0] += CONSTANT(0.282094795249000000)*t; + + // [3,3]: 0, + tf = CONSTANT(0.282094791773000010)*f[0]; + tg = CONSTANT(0.282094791773000010)*g[0]; + y[3] = tf*g[3] + tg*f[3]; + t = f[3] * g[3]; + y[0] += CONSTANT(0.282094791773000010)*t; + + // multiply count=20 + + return y; +} + + +//------------------------------------------------------------------------------------- +// http://msdn.microsoft.com/en-us/library/windows/desktop/bb232906.aspx +//------------------------------------------------------------------------------------- +_Use_decl_annotations_ +float* DirectX::XMSHMultiply3( + float *y, + const float *f, + const float *g) noexcept +{ + if (!y || !f || !g) + return nullptr; + + REAL tf, tg, t; + // [0,0]: 0, + y[0] = CONSTANT(0.282094792935999980)*f[0] * g[0]; + + // [1,1]: 0,6,8, + tf = CONSTANT(0.282094791773000010)*f[0] + CONSTANT(-0.126156626101000010)*f[6] + CONSTANT(-0.218509686119999990)*f[8]; + tg = CONSTANT(0.282094791773000010)*g[0] + CONSTANT(-0.126156626101000010)*g[6] + CONSTANT(-0.218509686119999990)*g[8]; + y[1] = tf*g[1] + tg*f[1]; + t = f[1] * g[1]; + y[0] += CONSTANT(0.282094791773000010)*t; + y[6] = CONSTANT(-0.126156626101000010)*t; + y[8] = CONSTANT(-0.218509686119999990)*t; + + // [1,2]: 5, + tf = CONSTANT(0.218509686118000010)*f[5]; + tg = CONSTANT(0.218509686118000010)*g[5]; + y[1] += tf*g[2] + tg*f[2]; + y[2] = tf*g[1] + tg*f[1]; + t = f[1] * g[2] + f[2] * g[1]; + y[5] = CONSTANT(0.218509686118000010)*t; + + // [1,3]: 4, + tf = CONSTANT(0.218509686114999990)*f[4]; + tg = CONSTANT(0.218509686114999990)*g[4]; + y[1] += tf*g[3] + tg*f[3]; + y[3] = tf*g[1] + tg*f[1]; + t = f[1] * g[3] + f[3] * g[1]; + y[4] = CONSTANT(0.218509686114999990)*t; + + // [2,2]: 0,6, + tf = CONSTANT(0.282094795249000000)*f[0] + CONSTANT(0.252313259986999990)*f[6]; + tg = CONSTANT(0.282094795249000000)*g[0] + CONSTANT(0.252313259986999990)*g[6]; + y[2] += tf*g[2] + tg*f[2]; + t = f[2] * g[2]; + y[0] += CONSTANT(0.282094795249000000)*t; + y[6] += CONSTANT(0.252313259986999990)*t; + + // [2,3]: 7, + tf = 
CONSTANT(0.218509686118000010)*f[7]; + tg = CONSTANT(0.218509686118000010)*g[7]; + y[2] += tf*g[3] + tg*f[3]; + y[3] += tf*g[2] + tg*f[2]; + t = f[2] * g[3] + f[3] * g[2]; + y[7] = CONSTANT(0.218509686118000010)*t; + + // [3,3]: 0,6,8, + tf = CONSTANT(0.282094791773000010)*f[0] + CONSTANT(-0.126156626101000010)*f[6] + CONSTANT(0.218509686119999990)*f[8]; + tg = CONSTANT(0.282094791773000010)*g[0] + CONSTANT(-0.126156626101000010)*g[6] + CONSTANT(0.218509686119999990)*g[8]; + y[3] += tf*g[3] + tg*f[3]; + t = f[3] * g[3]; + y[0] += CONSTANT(0.282094791773000010)*t; + y[6] += CONSTANT(-0.126156626101000010)*t; + y[8] += CONSTANT(0.218509686119999990)*t; + + // [4,4]: 0,6, + tf = CONSTANT(0.282094791770000020)*f[0] + CONSTANT(-0.180223751576000010)*f[6]; + tg = CONSTANT(0.282094791770000020)*g[0] + CONSTANT(-0.180223751576000010)*g[6]; + y[4] += tf*g[4] + tg*f[4]; + t = f[4] * g[4]; + y[0] += CONSTANT(0.282094791770000020)*t; + y[6] += CONSTANT(-0.180223751576000010)*t; + + // [4,5]: 7, + tf = CONSTANT(0.156078347226000000)*f[7]; + tg = CONSTANT(0.156078347226000000)*g[7]; + y[4] += tf*g[5] + tg*f[5]; + y[5] += tf*g[4] + tg*f[4]; + t = f[4] * g[5] + f[5] * g[4]; + y[7] += CONSTANT(0.156078347226000000)*t; + + // [5,5]: 0,6,8, + tf = CONSTANT(0.282094791773999990)*f[0] + CONSTANT(0.090111875786499998)*f[6] + CONSTANT(-0.156078347227999990)*f[8]; + tg = CONSTANT(0.282094791773999990)*g[0] + CONSTANT(0.090111875786499998)*g[6] + CONSTANT(-0.156078347227999990)*g[8]; + y[5] += tf*g[5] + tg*f[5]; + t = f[5] * g[5]; + y[0] += CONSTANT(0.282094791773999990)*t; + y[6] += CONSTANT(0.090111875786499998)*t; + y[8] += CONSTANT(-0.156078347227999990)*t; + + // [6,6]: 0,6, + tf = CONSTANT(0.282094797560000000)*f[0]; + tg = CONSTANT(0.282094797560000000)*g[0]; + y[6] += tf*g[6] + tg*f[6]; + t = f[6] * g[6]; + y[0] += CONSTANT(0.282094797560000000)*t; + y[6] += CONSTANT(0.180223764527000010)*t; + + // [7,7]: 0,6,8, + tf = CONSTANT(0.282094791773999990)*f[0] + CONSTANT(0.090111875786499998)*f[6] + CONSTANT(0.156078347227999990)*f[8]; + tg = CONSTANT(0.282094791773999990)*g[0] + CONSTANT(0.090111875786499998)*g[6] + CONSTANT(0.156078347227999990)*g[8]; + y[7] += tf*g[7] + tg*f[7]; + t = f[7] * g[7]; + y[0] += CONSTANT(0.282094791773999990)*t; + y[6] += CONSTANT(0.090111875786499998)*t; + y[8] += CONSTANT(0.156078347227999990)*t; + + // [8,8]: 0,6, + tf = CONSTANT(0.282094791770000020)*f[0] + CONSTANT(-0.180223751576000010)*f[6]; + tg = CONSTANT(0.282094791770000020)*g[0] + CONSTANT(-0.180223751576000010)*g[6]; + y[8] += tf*g[8] + tg*f[8]; + t = f[8] * g[8]; + y[0] += CONSTANT(0.282094791770000020)*t; + y[6] += CONSTANT(-0.180223751576000010)*t; + + // multiply count=120 + + return y; +} + + +//------------------------------------------------------------------------------------- +// http://msdn.microsoft.com/en-us/library/windows/desktop/bb232907.aspx +//------------------------------------------------------------------------------------- +_Use_decl_annotations_ +float* DirectX::XMSHMultiply4( + float *y, + const float *f, + const float *g) noexcept +{ + if (!y || !f || !g) + return nullptr; + + REAL tf, tg, t; + // [0,0]: 0, + y[0] = CONSTANT(0.282094792935999980)*f[0] * g[0]; + + // [1,1]: 0,6,8, + tf = CONSTANT(0.282094791773000010)*f[0] + CONSTANT(-0.126156626101000010)*f[6] + CONSTANT(-0.218509686119999990)*f[8]; + tg = CONSTANT(0.282094791773000010)*g[0] + CONSTANT(-0.126156626101000010)*g[6] + CONSTANT(-0.218509686119999990)*g[8]; + y[1] = tf*g[1] + tg*f[1]; + t = f[1] * g[1]; + y[0] += 
CONSTANT(0.282094791773000010)*t; + y[6] = CONSTANT(-0.126156626101000010)*t; + y[8] = CONSTANT(-0.218509686119999990)*t; + + // [1,4]: 3,13,15, + tf = CONSTANT(0.218509686114999990)*f[3] + CONSTANT(-0.058399170082300000)*f[13] + CONSTANT(-0.226179013157999990)*f[15]; + tg = CONSTANT(0.218509686114999990)*g[3] + CONSTANT(-0.058399170082300000)*g[13] + CONSTANT(-0.226179013157999990)*g[15]; + y[1] += tf*g[4] + tg*f[4]; + y[4] = tf*g[1] + tg*f[1]; + t = f[1] * g[4] + f[4] * g[1]; + y[3] = CONSTANT(0.218509686114999990)*t; + y[13] = CONSTANT(-0.058399170082300000)*t; + y[15] = CONSTANT(-0.226179013157999990)*t; + + // [1,5]: 2,12,14, + tf = CONSTANT(0.218509686118000010)*f[2] + CONSTANT(-0.143048168103000000)*f[12] + CONSTANT(-0.184674390923000000)*f[14]; + tg = CONSTANT(0.218509686118000010)*g[2] + CONSTANT(-0.143048168103000000)*g[12] + CONSTANT(-0.184674390923000000)*g[14]; + y[1] += tf*g[5] + tg*f[5]; + y[5] = tf*g[1] + tg*f[1]; + t = f[1] * g[5] + f[5] * g[1]; + y[2] = CONSTANT(0.218509686118000010)*t; + y[12] = CONSTANT(-0.143048168103000000)*t; + y[14] = CONSTANT(-0.184674390923000000)*t; + + // [1,6]: 11, + tf = CONSTANT(0.202300659402999990)*f[11]; + tg = CONSTANT(0.202300659402999990)*g[11]; + y[1] += tf*g[6] + tg*f[6]; + y[6] += tf*g[1] + tg*f[1]; + t = f[1] * g[6] + f[6] * g[1]; + y[11] = CONSTANT(0.202300659402999990)*t; + + // [1,8]: 9,11, + tf = CONSTANT(0.226179013155000000)*f[9] + CONSTANT(0.058399170081799998)*f[11]; + tg = CONSTANT(0.226179013155000000)*g[9] + CONSTANT(0.058399170081799998)*g[11]; + y[1] += tf*g[8] + tg*f[8]; + y[8] += tf*g[1] + tg*f[1]; + t = f[1] * g[8] + f[8] * g[1]; + y[9] = CONSTANT(0.226179013155000000)*t; + y[11] += CONSTANT(0.058399170081799998)*t; + + // [2,2]: 0,6, + tf = CONSTANT(0.282094795249000000)*f[0] + CONSTANT(0.252313259986999990)*f[6]; + tg = CONSTANT(0.282094795249000000)*g[0] + CONSTANT(0.252313259986999990)*g[6]; + y[2] += tf*g[2] + tg*f[2]; + t = f[2] * g[2]; + y[0] += CONSTANT(0.282094795249000000)*t; + y[6] += CONSTANT(0.252313259986999990)*t; + + // [2,6]: 12, + tf = CONSTANT(0.247766706973999990)*f[12]; + tg = CONSTANT(0.247766706973999990)*g[12]; + y[2] += tf*g[6] + tg*f[6]; + y[6] += tf*g[2] + tg*f[2]; + t = f[2] * g[6] + f[6] * g[2]; + y[12] += CONSTANT(0.247766706973999990)*t; + + // [3,3]: 0,6,8, + tf = CONSTANT(0.282094791773000010)*f[0] + CONSTANT(-0.126156626101000010)*f[6] + CONSTANT(0.218509686119999990)*f[8]; + tg = CONSTANT(0.282094791773000010)*g[0] + CONSTANT(-0.126156626101000010)*g[6] + CONSTANT(0.218509686119999990)*g[8]; + y[3] += tf*g[3] + tg*f[3]; + t = f[3] * g[3]; + y[0] += CONSTANT(0.282094791773000010)*t; + y[6] += CONSTANT(-0.126156626101000010)*t; + y[8] += CONSTANT(0.218509686119999990)*t; + + // [3,6]: 13, + tf = CONSTANT(0.202300659402999990)*f[13]; + tg = CONSTANT(0.202300659402999990)*g[13]; + y[3] += tf*g[6] + tg*f[6]; + y[6] += tf*g[3] + tg*f[3]; + t = f[3] * g[6] + f[6] * g[3]; + y[13] += CONSTANT(0.202300659402999990)*t; + + // [3,7]: 2,12,14, + tf = CONSTANT(0.218509686118000010)*f[2] + CONSTANT(-0.143048168103000000)*f[12] + CONSTANT(0.184674390923000000)*f[14]; + tg = CONSTANT(0.218509686118000010)*g[2] + CONSTANT(-0.143048168103000000)*g[12] + CONSTANT(0.184674390923000000)*g[14]; + y[3] += tf*g[7] + tg*f[7]; + y[7] = tf*g[3] + tg*f[3]; + t = f[3] * g[7] + f[7] * g[3]; + y[2] += CONSTANT(0.218509686118000010)*t; + y[12] += CONSTANT(-0.143048168103000000)*t; + y[14] += CONSTANT(0.184674390923000000)*t; + + // [3,8]: 13,15, + tf = CONSTANT(-0.058399170081799998)*f[13] + 
CONSTANT(0.226179013155000000)*f[15]; + tg = CONSTANT(-0.058399170081799998)*g[13] + CONSTANT(0.226179013155000000)*g[15]; + y[3] += tf*g[8] + tg*f[8]; + y[8] += tf*g[3] + tg*f[3]; + t = f[3] * g[8] + f[8] * g[3]; + y[13] += CONSTANT(-0.058399170081799998)*t; + y[15] += CONSTANT(0.226179013155000000)*t; + + // [4,4]: 0,6, + tf = CONSTANT(0.282094791770000020)*f[0] + CONSTANT(-0.180223751576000010)*f[6]; + tg = CONSTANT(0.282094791770000020)*g[0] + CONSTANT(-0.180223751576000010)*g[6]; + y[4] += tf*g[4] + tg*f[4]; + t = f[4] * g[4]; + y[0] += CONSTANT(0.282094791770000020)*t; + y[6] += CONSTANT(-0.180223751576000010)*t; + + // [4,5]: 7, + tf = CONSTANT(0.156078347226000000)*f[7]; + tg = CONSTANT(0.156078347226000000)*g[7]; + y[4] += tf*g[5] + tg*f[5]; + y[5] += tf*g[4] + tg*f[4]; + t = f[4] * g[5] + f[5] * g[4]; + y[7] += CONSTANT(0.156078347226000000)*t; + + // [4,9]: 3,13, + tf = CONSTANT(0.226179013157999990)*f[3] + CONSTANT(-0.094031597258400004)*f[13]; + tg = CONSTANT(0.226179013157999990)*g[3] + CONSTANT(-0.094031597258400004)*g[13]; + y[4] += tf*g[9] + tg*f[9]; + y[9] += tf*g[4] + tg*f[4]; + t = f[4] * g[9] + f[9] * g[4]; + y[3] += CONSTANT(0.226179013157999990)*t; + y[13] += CONSTANT(-0.094031597258400004)*t; + + // [4,10]: 2,12, + tf = CONSTANT(0.184674390919999990)*f[2] + CONSTANT(-0.188063194517999990)*f[12]; + tg = CONSTANT(0.184674390919999990)*g[2] + CONSTANT(-0.188063194517999990)*g[12]; + y[4] += tf*g[10] + tg*f[10]; + y[10] = tf*g[4] + tg*f[4]; + t = f[4] * g[10] + f[10] * g[4]; + y[2] += CONSTANT(0.184674390919999990)*t; + y[12] += CONSTANT(-0.188063194517999990)*t; + + // [4,11]: 3,13,15, + tf = CONSTANT(-0.058399170082300000)*f[3] + CONSTANT(0.145673124078000010)*f[13] + CONSTANT(0.094031597258400004)*f[15]; + tg = CONSTANT(-0.058399170082300000)*g[3] + CONSTANT(0.145673124078000010)*g[13] + CONSTANT(0.094031597258400004)*g[15]; + y[4] += tf*g[11] + tg*f[11]; + y[11] += tf*g[4] + tg*f[4]; + t = f[4] * g[11] + f[11] * g[4]; + y[3] += CONSTANT(-0.058399170082300000)*t; + y[13] += CONSTANT(0.145673124078000010)*t; + y[15] += CONSTANT(0.094031597258400004)*t; + + // [5,5]: 0,6,8, + tf = CONSTANT(0.282094791773999990)*f[0] + CONSTANT(0.090111875786499998)*f[6] + CONSTANT(-0.156078347227999990)*f[8]; + tg = CONSTANT(0.282094791773999990)*g[0] + CONSTANT(0.090111875786499998)*g[6] + CONSTANT(-0.156078347227999990)*g[8]; + y[5] += tf*g[5] + tg*f[5]; + t = f[5] * g[5]; + y[0] += CONSTANT(0.282094791773999990)*t; + y[6] += CONSTANT(0.090111875786499998)*t; + y[8] += CONSTANT(-0.156078347227999990)*t; + + // [5,9]: 14, + tf = CONSTANT(0.148677009677999990)*f[14]; + tg = CONSTANT(0.148677009677999990)*g[14]; + y[5] += tf*g[9] + tg*f[9]; + y[9] += tf*g[5] + tg*f[5]; + t = f[5] * g[9] + f[9] * g[5]; + y[14] += CONSTANT(0.148677009677999990)*t; + + // [5,10]: 3,13,15, + tf = CONSTANT(0.184674390919999990)*f[3] + CONSTANT(0.115164716490000000)*f[13] + CONSTANT(-0.148677009678999990)*f[15]; + tg = CONSTANT(0.184674390919999990)*g[3] + CONSTANT(0.115164716490000000)*g[13] + CONSTANT(-0.148677009678999990)*g[15]; + y[5] += tf*g[10] + tg*f[10]; + y[10] += tf*g[5] + tg*f[5]; + t = f[5] * g[10] + f[10] * g[5]; + y[3] += CONSTANT(0.184674390919999990)*t; + y[13] += CONSTANT(0.115164716490000000)*t; + y[15] += CONSTANT(-0.148677009678999990)*t; + + // [5,11]: 2,12,14, + tf = CONSTANT(0.233596680327000010)*f[2] + CONSTANT(0.059470803871800003)*f[12] + CONSTANT(-0.115164716491000000)*f[14]; + tg = CONSTANT(0.233596680327000010)*g[2] + CONSTANT(0.059470803871800003)*g[12] + 
CONSTANT(-0.115164716491000000)*g[14]; + y[5] += tf*g[11] + tg*f[11]; + y[11] += tf*g[5] + tg*f[5]; + t = f[5] * g[11] + f[11] * g[5]; + y[2] += CONSTANT(0.233596680327000010)*t; + y[12] += CONSTANT(0.059470803871800003)*t; + y[14] += CONSTANT(-0.115164716491000000)*t; + + // [6,6]: 0,6, + tf = CONSTANT(0.282094797560000000)*f[0]; + tg = CONSTANT(0.282094797560000000)*g[0]; + y[6] += tf*g[6] + tg*f[6]; + t = f[6] * g[6]; + y[0] += CONSTANT(0.282094797560000000)*t; + y[6] += CONSTANT(0.180223764527000010)*t; + + // [7,7]: 6,0,8, + tf = CONSTANT(0.090111875786499998)*f[6] + CONSTANT(0.282094791773999990)*f[0] + CONSTANT(0.156078347227999990)*f[8]; + tg = CONSTANT(0.090111875786499998)*g[6] + CONSTANT(0.282094791773999990)*g[0] + CONSTANT(0.156078347227999990)*g[8]; + y[7] += tf*g[7] + tg*f[7]; + t = f[7] * g[7]; + y[6] += CONSTANT(0.090111875786499998)*t; + y[0] += CONSTANT(0.282094791773999990)*t; + y[8] += CONSTANT(0.156078347227999990)*t; + + // [7,10]: 9,1,11, + tf = CONSTANT(0.148677009678999990)*f[9] + CONSTANT(0.184674390919999990)*f[1] + CONSTANT(0.115164716490000000)*f[11]; + tg = CONSTANT(0.148677009678999990)*g[9] + CONSTANT(0.184674390919999990)*g[1] + CONSTANT(0.115164716490000000)*g[11]; + y[7] += tf*g[10] + tg*f[10]; + y[10] += tf*g[7] + tg*f[7]; + t = f[7] * g[10] + f[10] * g[7]; + y[9] += CONSTANT(0.148677009678999990)*t; + y[1] += CONSTANT(0.184674390919999990)*t; + y[11] += CONSTANT(0.115164716490000000)*t; + + // [7,13]: 12,2,14, + tf = CONSTANT(0.059470803871800003)*f[12] + CONSTANT(0.233596680327000010)*f[2] + CONSTANT(0.115164716491000000)*f[14]; + tg = CONSTANT(0.059470803871800003)*g[12] + CONSTANT(0.233596680327000010)*g[2] + CONSTANT(0.115164716491000000)*g[14]; + y[7] += tf*g[13] + tg*f[13]; + y[13] += tf*g[7] + tg*f[7]; + t = f[7] * g[13] + f[13] * g[7]; + y[12] += CONSTANT(0.059470803871800003)*t; + y[2] += CONSTANT(0.233596680327000010)*t; + y[14] += CONSTANT(0.115164716491000000)*t; + + // [7,14]: 15, + tf = CONSTANT(0.148677009677999990)*f[15]; + tg = CONSTANT(0.148677009677999990)*g[15]; + y[7] += tf*g[14] + tg*f[14]; + y[14] += tf*g[7] + tg*f[7]; + t = f[7] * g[14] + f[14] * g[7]; + y[15] += CONSTANT(0.148677009677999990)*t; + + // [8,8]: 0,6, + tf = CONSTANT(0.282094791770000020)*f[0] + CONSTANT(-0.180223751576000010)*f[6]; + tg = CONSTANT(0.282094791770000020)*g[0] + CONSTANT(-0.180223751576000010)*g[6]; + y[8] += tf*g[8] + tg*f[8]; + t = f[8] * g[8]; + y[0] += CONSTANT(0.282094791770000020)*t; + y[6] += CONSTANT(-0.180223751576000010)*t; + + // [8,9]: 11, + tf = CONSTANT(-0.094031597259499999)*f[11]; + tg = CONSTANT(-0.094031597259499999)*g[11]; + y[8] += tf*g[9] + tg*f[9]; + y[9] += tf*g[8] + tg*f[8]; + t = f[8] * g[9] + f[9] * g[8]; + y[11] += CONSTANT(-0.094031597259499999)*t; + + // [8,13]: 15, + tf = CONSTANT(-0.094031597259499999)*f[15]; + tg = CONSTANT(-0.094031597259499999)*g[15]; + y[8] += tf*g[13] + tg*f[13]; + y[13] += tf*g[8] + tg*f[8]; + t = f[8] * g[13] + f[13] * g[8]; + y[15] += CONSTANT(-0.094031597259499999)*t; + + // [8,14]: 2,12, + tf = CONSTANT(0.184674390919999990)*f[2] + CONSTANT(-0.188063194517999990)*f[12]; + tg = CONSTANT(0.184674390919999990)*g[2] + CONSTANT(-0.188063194517999990)*g[12]; + y[8] += tf*g[14] + tg*f[14]; + y[14] += tf*g[8] + tg*f[8]; + t = f[8] * g[14] + f[14] * g[8]; + y[2] += CONSTANT(0.184674390919999990)*t; + y[12] += CONSTANT(-0.188063194517999990)*t; + + // [9,9]: 6,0, + tf = CONSTANT(-0.210261043508000010)*f[6] + CONSTANT(0.282094791766999970)*f[0]; + tg = CONSTANT(-0.210261043508000010)*g[6] + 
CONSTANT(0.282094791766999970)*g[0]; + y[9] += tf*g[9] + tg*f[9]; + t = f[9] * g[9]; + y[6] += CONSTANT(-0.210261043508000010)*t; + y[0] += CONSTANT(0.282094791766999970)*t; + + // [10,10]: 0, + tf = CONSTANT(0.282094791771999980)*f[0]; + tg = CONSTANT(0.282094791771999980)*g[0]; + y[10] += tf*g[10] + tg*f[10]; + t = f[10] * g[10]; + y[0] += CONSTANT(0.282094791771999980)*t; + + // [11,11]: 0,6,8, + tf = CONSTANT(0.282094791773999990)*f[0] + CONSTANT(0.126156626101000010)*f[6] + CONSTANT(-0.145673124078999990)*f[8]; + tg = CONSTANT(0.282094791773999990)*g[0] + CONSTANT(0.126156626101000010)*g[6] + CONSTANT(-0.145673124078999990)*g[8]; + y[11] += tf*g[11] + tg*f[11]; + t = f[11] * g[11]; + y[0] += CONSTANT(0.282094791773999990)*t; + y[6] += CONSTANT(0.126156626101000010)*t; + y[8] += CONSTANT(-0.145673124078999990)*t; + + // [12,12]: 0,6, + tf = CONSTANT(0.282094799871999980)*f[0] + CONSTANT(0.168208852954000010)*f[6]; + tg = CONSTANT(0.282094799871999980)*g[0] + CONSTANT(0.168208852954000010)*g[6]; + y[12] += tf*g[12] + tg*f[12]; + t = f[12] * g[12]; + y[0] += CONSTANT(0.282094799871999980)*t; + y[6] += CONSTANT(0.168208852954000010)*t; + + // [13,13]: 0,8,6, + tf = CONSTANT(0.282094791773999990)*f[0] + CONSTANT(0.145673124078999990)*f[8] + CONSTANT(0.126156626101000010)*f[6]; + tg = CONSTANT(0.282094791773999990)*g[0] + CONSTANT(0.145673124078999990)*g[8] + CONSTANT(0.126156626101000010)*g[6]; + y[13] += tf*g[13] + tg*f[13]; + t = f[13] * g[13]; + y[0] += CONSTANT(0.282094791773999990)*t; + y[8] += CONSTANT(0.145673124078999990)*t; + y[6] += CONSTANT(0.126156626101000010)*t; + + // [14,14]: 0, + tf = CONSTANT(0.282094791771999980)*f[0]; + tg = CONSTANT(0.282094791771999980)*g[0]; + y[14] += tf*g[14] + tg*f[14]; + t = f[14] * g[14]; + y[0] += CONSTANT(0.282094791771999980)*t; + + // [15,15]: 0,6, + tf = CONSTANT(0.282094791766999970)*f[0] + CONSTANT(-0.210261043508000010)*f[6]; + tg = CONSTANT(0.282094791766999970)*g[0] + CONSTANT(-0.210261043508000010)*g[6]; + y[15] += tf*g[15] + tg*f[15]; + t = f[15] * g[15]; + y[0] += CONSTANT(0.282094791766999970)*t; + y[6] += CONSTANT(-0.210261043508000010)*t; + + // multiply count=399 + + return y; +} + + +//------------------------------------------------------------------------------------- +// http://msdn.microsoft.com/en-us/library/windows/desktop/bb232908.aspx +//------------------------------------------------------------------------------------- +_Use_decl_annotations_ +float* DirectX::XMSHMultiply5( + float *y, + const float *f, + const float *g) noexcept +{ + if (!y || !f || !g) + return nullptr; + + REAL tf, tg, t; + // [0,0]: 0, + y[0] = CONSTANT(0.282094792935999980)*f[0] * g[0]; + + // [1,1]: 0,6,8, + tf = CONSTANT(0.282094791773000010)*f[0] + CONSTANT(-0.126156626101000010)*f[6] + CONSTANT(-0.218509686119999990)*f[8]; + tg = CONSTANT(0.282094791773000010)*g[0] + CONSTANT(-0.126156626101000010)*g[6] + CONSTANT(-0.218509686119999990)*g[8]; + y[1] = tf*g[1] + tg*f[1]; + t = f[1] * g[1]; + y[0] += CONSTANT(0.282094791773000010)*t; + y[6] = CONSTANT(-0.126156626101000010)*t; + y[8] = CONSTANT(-0.218509686119999990)*t; + + // [1,4]: 3,13,15, + tf = CONSTANT(0.218509686114999990)*f[3] + CONSTANT(-0.058399170082300000)*f[13] + CONSTANT(-0.226179013157999990)*f[15]; + tg = CONSTANT(0.218509686114999990)*g[3] + CONSTANT(-0.058399170082300000)*g[13] + CONSTANT(-0.226179013157999990)*g[15]; + y[1] += tf*g[4] + tg*f[4]; + y[4] = tf*g[1] + tg*f[1]; + t = f[1] * g[4] + f[4] * g[1]; + y[3] = CONSTANT(0.218509686114999990)*t; + y[13] = 
CONSTANT(-0.058399170082300000)*t; + y[15] = CONSTANT(-0.226179013157999990)*t; + + // [1,5]: 2,12,14, + tf = CONSTANT(0.218509686118000010)*f[2] + CONSTANT(-0.143048168103000000)*f[12] + CONSTANT(-0.184674390923000000)*f[14]; + tg = CONSTANT(0.218509686118000010)*g[2] + CONSTANT(-0.143048168103000000)*g[12] + CONSTANT(-0.184674390923000000)*g[14]; + y[1] += tf*g[5] + tg*f[5]; + y[5] = tf*g[1] + tg*f[1]; + t = f[1] * g[5] + f[5] * g[1]; + y[2] = CONSTANT(0.218509686118000010)*t; + y[12] = CONSTANT(-0.143048168103000000)*t; + y[14] = CONSTANT(-0.184674390923000000)*t; + + // [1,9]: 8,22,24, + tf = CONSTANT(0.226179013155000000)*f[8] + CONSTANT(-0.043528171378199997)*f[22] + CONSTANT(-0.230329432978999990)*f[24]; + tg = CONSTANT(0.226179013155000000)*g[8] + CONSTANT(-0.043528171378199997)*g[22] + CONSTANT(-0.230329432978999990)*g[24]; + y[1] += tf*g[9] + tg*f[9]; + y[9] = tf*g[1] + tg*f[1]; + t = f[1] * g[9] + f[9] * g[1]; + y[8] += CONSTANT(0.226179013155000000)*t; + y[22] = CONSTANT(-0.043528171378199997)*t; + y[24] = CONSTANT(-0.230329432978999990)*t; + + // [1,10]: 7,21,23, + tf = CONSTANT(0.184674390919999990)*f[7] + CONSTANT(-0.075393004386799994)*f[21] + CONSTANT(-0.199471140200000010)*f[23]; + tg = CONSTANT(0.184674390919999990)*g[7] + CONSTANT(-0.075393004386799994)*g[21] + CONSTANT(-0.199471140200000010)*g[23]; + y[1] += tf*g[10] + tg*f[10]; + y[10] = tf*g[1] + tg*f[1]; + t = f[1] * g[10] + f[10] * g[1]; + y[7] = CONSTANT(0.184674390919999990)*t; + y[21] = CONSTANT(-0.075393004386799994)*t; + y[23] = CONSTANT(-0.199471140200000010)*t; + + // [1,11]: 6,8,20,22, + tf = CONSTANT(0.202300659402999990)*f[6] + CONSTANT(0.058399170081799998)*f[8] + CONSTANT(-0.150786008773000000)*f[20] + CONSTANT(-0.168583882836999990)*f[22]; + tg = CONSTANT(0.202300659402999990)*g[6] + CONSTANT(0.058399170081799998)*g[8] + CONSTANT(-0.150786008773000000)*g[20] + CONSTANT(-0.168583882836999990)*g[22]; + y[1] += tf*g[11] + tg*f[11]; + y[11] = tf*g[1] + tg*f[1]; + t = f[1] * g[11] + f[11] * g[1]; + y[6] += CONSTANT(0.202300659402999990)*t; + y[8] += CONSTANT(0.058399170081799998)*t; + y[20] = CONSTANT(-0.150786008773000000)*t; + y[22] += CONSTANT(-0.168583882836999990)*t; + + // [1,12]: 19, + tf = CONSTANT(0.194663900273000010)*f[19]; + tg = CONSTANT(0.194663900273000010)*g[19]; + y[1] += tf*g[12] + tg*f[12]; + y[12] += tf*g[1] + tg*f[1]; + t = f[1] * g[12] + f[12] * g[1]; + y[19] = CONSTANT(0.194663900273000010)*t; + + // [1,13]: 18, + tf = CONSTANT(0.168583882834000000)*f[18]; + tg = CONSTANT(0.168583882834000000)*g[18]; + y[1] += tf*g[13] + tg*f[13]; + y[13] += tf*g[1] + tg*f[1]; + t = f[1] * g[13] + f[13] * g[1]; + y[18] = CONSTANT(0.168583882834000000)*t; + + // [1,14]: 17,19, + tf = CONSTANT(0.199471140196999990)*f[17] + CONSTANT(0.075393004386399995)*f[19]; + tg = CONSTANT(0.199471140196999990)*g[17] + CONSTANT(0.075393004386399995)*g[19]; + y[1] += tf*g[14] + tg*f[14]; + y[14] += tf*g[1] + tg*f[1]; + t = f[1] * g[14] + f[14] * g[1]; + y[17] = CONSTANT(0.199471140196999990)*t; + y[19] += CONSTANT(0.075393004386399995)*t; + + // [1,15]: 16,18, + tf = CONSTANT(0.230329432973999990)*f[16] + CONSTANT(0.043528171377799997)*f[18]; + tg = CONSTANT(0.230329432973999990)*g[16] + CONSTANT(0.043528171377799997)*g[18]; + y[1] += tf*g[15] + tg*f[15]; + y[15] += tf*g[1] + tg*f[1]; + t = f[1] * g[15] + f[15] * g[1]; + y[16] = CONSTANT(0.230329432973999990)*t; + y[18] += CONSTANT(0.043528171377799997)*t; + + // [2,2]: 0,6, + tf = CONSTANT(0.282094795249000000)*f[0] + CONSTANT(0.252313259986999990)*f[6]; + tg = 
CONSTANT(0.282094795249000000)*g[0] + CONSTANT(0.252313259986999990)*g[6]; + y[2] += tf*g[2] + tg*f[2]; + t = f[2] * g[2]; + y[0] += CONSTANT(0.282094795249000000)*t; + y[6] += CONSTANT(0.252313259986999990)*t; + + // [2,10]: 4,18, + tf = CONSTANT(0.184674390919999990)*f[4] + CONSTANT(0.213243618621000000)*f[18]; + tg = CONSTANT(0.184674390919999990)*g[4] + CONSTANT(0.213243618621000000)*g[18]; + y[2] += tf*g[10] + tg*f[10]; + y[10] += tf*g[2] + tg*f[2]; + t = f[2] * g[10] + f[10] * g[2]; + y[4] += CONSTANT(0.184674390919999990)*t; + y[18] += CONSTANT(0.213243618621000000)*t; + + // [2,12]: 6,20, + tf = CONSTANT(0.247766706973999990)*f[6] + CONSTANT(0.246232537174000010)*f[20]; + tg = CONSTANT(0.247766706973999990)*g[6] + CONSTANT(0.246232537174000010)*g[20]; + y[2] += tf*g[12] + tg*f[12]; + y[12] += tf*g[2] + tg*f[2]; + t = f[2] * g[12] + f[12] * g[2]; + y[6] += CONSTANT(0.247766706973999990)*t; + y[20] += CONSTANT(0.246232537174000010)*t; + + // [2,14]: 8,22, + tf = CONSTANT(0.184674390919999990)*f[8] + CONSTANT(0.213243618621000000)*f[22]; + tg = CONSTANT(0.184674390919999990)*g[8] + CONSTANT(0.213243618621000000)*g[22]; + y[2] += tf*g[14] + tg*f[14]; + y[14] += tf*g[2] + tg*f[2]; + t = f[2] * g[14] + f[14] * g[2]; + y[8] += CONSTANT(0.184674390919999990)*t; + y[22] += CONSTANT(0.213243618621000000)*t; + + // [3,3]: 0,6,8, + tf = CONSTANT(0.282094791773000010)*f[0] + CONSTANT(-0.126156626101000010)*f[6] + CONSTANT(0.218509686119999990)*f[8]; + tg = CONSTANT(0.282094791773000010)*g[0] + CONSTANT(-0.126156626101000010)*g[6] + CONSTANT(0.218509686119999990)*g[8]; + y[3] += tf*g[3] + tg*f[3]; + t = f[3] * g[3]; + y[0] += CONSTANT(0.282094791773000010)*t; + y[6] += CONSTANT(-0.126156626101000010)*t; + y[8] += CONSTANT(0.218509686119999990)*t; + + // [3,7]: 2,12,14, + tf = CONSTANT(0.218509686118000010)*f[2] + CONSTANT(-0.143048168103000000)*f[12] + CONSTANT(0.184674390923000000)*f[14]; + tg = CONSTANT(0.218509686118000010)*g[2] + CONSTANT(-0.143048168103000000)*g[12] + CONSTANT(0.184674390923000000)*g[14]; + y[3] += tf*g[7] + tg*f[7]; + y[7] += tf*g[3] + tg*f[3]; + t = f[3] * g[7] + f[7] * g[3]; + y[2] += CONSTANT(0.218509686118000010)*t; + y[12] += CONSTANT(-0.143048168103000000)*t; + y[14] += CONSTANT(0.184674390923000000)*t; + + // [3,9]: 4,16,18, + tf = CONSTANT(0.226179013157999990)*f[4] + CONSTANT(0.230329432973999990)*f[16] + CONSTANT(-0.043528171377799997)*f[18]; + tg = CONSTANT(0.226179013157999990)*g[4] + CONSTANT(0.230329432973999990)*g[16] + CONSTANT(-0.043528171377799997)*g[18]; + y[3] += tf*g[9] + tg*f[9]; + y[9] += tf*g[3] + tg*f[3]; + t = f[3] * g[9] + f[9] * g[3]; + y[4] += CONSTANT(0.226179013157999990)*t; + y[16] += CONSTANT(0.230329432973999990)*t; + y[18] += CONSTANT(-0.043528171377799997)*t; + + // [3,10]: 5,17,19, + tf = CONSTANT(0.184674390919999990)*f[5] + CONSTANT(0.199471140200000010)*f[17] + CONSTANT(-0.075393004386799994)*f[19]; + tg = CONSTANT(0.184674390919999990)*g[5] + CONSTANT(0.199471140200000010)*g[17] + CONSTANT(-0.075393004386799994)*g[19]; + y[3] += tf*g[10] + tg*f[10]; + y[10] += tf*g[3] + tg*f[3]; + t = f[3] * g[10] + f[10] * g[3]; + y[5] += CONSTANT(0.184674390919999990)*t; + y[17] += CONSTANT(0.199471140200000010)*t; + y[19] += CONSTANT(-0.075393004386799994)*t; + + // [3,12]: 21, + tf = CONSTANT(0.194663900273000010)*f[21]; + tg = CONSTANT(0.194663900273000010)*g[21]; + y[3] += tf*g[12] + tg*f[12]; + y[12] += tf*g[3] + tg*f[3]; + t = f[3] * g[12] + f[12] * g[3]; + y[21] += CONSTANT(0.194663900273000010)*t; + + // [3,13]: 8,6,20,22, + tf = 
CONSTANT(-0.058399170081799998)*f[8] + CONSTANT(0.202300659402999990)*f[6] + CONSTANT(-0.150786008773000000)*f[20] + CONSTANT(0.168583882836999990)*f[22]; + tg = CONSTANT(-0.058399170081799998)*g[8] + CONSTANT(0.202300659402999990)*g[6] + CONSTANT(-0.150786008773000000)*g[20] + CONSTANT(0.168583882836999990)*g[22]; + y[3] += tf*g[13] + tg*f[13]; + y[13] += tf*g[3] + tg*f[3]; + t = f[3] * g[13] + f[13] * g[3]; + y[8] += CONSTANT(-0.058399170081799998)*t; + y[6] += CONSTANT(0.202300659402999990)*t; + y[20] += CONSTANT(-0.150786008773000000)*t; + y[22] += CONSTANT(0.168583882836999990)*t; + + // [3,14]: 21,23, + tf = CONSTANT(-0.075393004386399995)*f[21] + CONSTANT(0.199471140196999990)*f[23]; + tg = CONSTANT(-0.075393004386399995)*g[21] + CONSTANT(0.199471140196999990)*g[23]; + y[3] += tf*g[14] + tg*f[14]; + y[14] += tf*g[3] + tg*f[3]; + t = f[3] * g[14] + f[14] * g[3]; + y[21] += CONSTANT(-0.075393004386399995)*t; + y[23] += CONSTANT(0.199471140196999990)*t; + + // [3,15]: 8,22,24, + tf = CONSTANT(0.226179013155000000)*f[8] + CONSTANT(-0.043528171378199997)*f[22] + CONSTANT(0.230329432978999990)*f[24]; + tg = CONSTANT(0.226179013155000000)*g[8] + CONSTANT(-0.043528171378199997)*g[22] + CONSTANT(0.230329432978999990)*g[24]; + y[3] += tf*g[15] + tg*f[15]; + y[15] += tf*g[3] + tg*f[3]; + t = f[3] * g[15] + f[15] * g[3]; + y[8] += CONSTANT(0.226179013155000000)*t; + y[22] += CONSTANT(-0.043528171378199997)*t; + y[24] += CONSTANT(0.230329432978999990)*t; + + // [4,4]: 0,6,20,24, + tf = CONSTANT(0.282094791770000020)*f[0] + CONSTANT(-0.180223751576000010)*f[6] + CONSTANT(0.040299255967500003)*f[20] + CONSTANT(-0.238413613505999990)*f[24]; + tg = CONSTANT(0.282094791770000020)*g[0] + CONSTANT(-0.180223751576000010)*g[6] + CONSTANT(0.040299255967500003)*g[20] + CONSTANT(-0.238413613505999990)*g[24]; + y[4] += tf*g[4] + tg*f[4]; + t = f[4] * g[4]; + y[0] += CONSTANT(0.282094791770000020)*t; + y[6] += CONSTANT(-0.180223751576000010)*t; + y[20] += CONSTANT(0.040299255967500003)*t; + y[24] += CONSTANT(-0.238413613505999990)*t; + + // [4,5]: 7,21,23, + tf = CONSTANT(0.156078347226000000)*f[7] + CONSTANT(-0.063718718434399996)*f[21] + CONSTANT(-0.168583882835000000)*f[23]; + tg = CONSTANT(0.156078347226000000)*g[7] + CONSTANT(-0.063718718434399996)*g[21] + CONSTANT(-0.168583882835000000)*g[23]; + y[4] += tf*g[5] + tg*f[5]; + y[5] += tf*g[4] + tg*f[4]; + t = f[4] * g[5] + f[5] * g[4]; + y[7] += CONSTANT(0.156078347226000000)*t; + y[21] += CONSTANT(-0.063718718434399996)*t; + y[23] += CONSTANT(-0.168583882835000000)*t; + + // [4,11]: 3,13,15, + tf = CONSTANT(-0.058399170082300000)*f[3] + CONSTANT(0.145673124078000010)*f[13] + CONSTANT(0.094031597258400004)*f[15]; + tg = CONSTANT(-0.058399170082300000)*g[3] + CONSTANT(0.145673124078000010)*g[13] + CONSTANT(0.094031597258400004)*g[15]; + y[4] += tf*g[11] + tg*f[11]; + y[11] += tf*g[4] + tg*f[4]; + t = f[4] * g[11] + f[11] * g[4]; + y[3] += CONSTANT(-0.058399170082300000)*t; + y[13] += CONSTANT(0.145673124078000010)*t; + y[15] += CONSTANT(0.094031597258400004)*t; + + // [4,16]: 8,22, + tf = CONSTANT(0.238413613494000000)*f[8] + CONSTANT(-0.075080816693699995)*f[22]; + tg = CONSTANT(0.238413613494000000)*g[8] + CONSTANT(-0.075080816693699995)*g[22]; + y[4] += tf*g[16] + tg*f[16]; + y[16] += tf*g[4] + tg*f[4]; + t = f[4] * g[16] + f[16] * g[4]; + y[8] += CONSTANT(0.238413613494000000)*t; + y[22] += CONSTANT(-0.075080816693699995)*t; + + // [4,18]: 6,20,24, + tf = CONSTANT(0.156078347226000000)*f[6] + CONSTANT(-0.190364615029000010)*f[20] + 
CONSTANT(0.075080816691500005)*f[24]; + tg = CONSTANT(0.156078347226000000)*g[6] + CONSTANT(-0.190364615029000010)*g[20] + CONSTANT(0.075080816691500005)*g[24]; + y[4] += tf*g[18] + tg*f[18]; + y[18] += tf*g[4] + tg*f[4]; + t = f[4] * g[18] + f[18] * g[4]; + y[6] += CONSTANT(0.156078347226000000)*t; + y[20] += CONSTANT(-0.190364615029000010)*t; + y[24] += CONSTANT(0.075080816691500005)*t; + + // [4,19]: 7,21,23, + tf = CONSTANT(-0.063718718434399996)*f[7] + CONSTANT(0.141889406569999990)*f[21] + CONSTANT(0.112621225039000000)*f[23]; + tg = CONSTANT(-0.063718718434399996)*g[7] + CONSTANT(0.141889406569999990)*g[21] + CONSTANT(0.112621225039000000)*g[23]; + y[4] += tf*g[19] + tg*f[19]; + y[19] += tf*g[4] + tg*f[4]; + t = f[4] * g[19] + f[19] * g[4]; + y[7] += CONSTANT(-0.063718718434399996)*t; + y[21] += CONSTANT(0.141889406569999990)*t; + y[23] += CONSTANT(0.112621225039000000)*t; + + // [5,5]: 0,6,8,20,22, + tf = CONSTANT(0.282094791773999990)*f[0] + CONSTANT(0.090111875786499998)*f[6] + CONSTANT(-0.156078347227999990)*f[8] + CONSTANT(-0.161197023870999990)*f[20] + CONSTANT(-0.180223751574000000)*f[22]; + tg = CONSTANT(0.282094791773999990)*g[0] + CONSTANT(0.090111875786499998)*g[6] + CONSTANT(-0.156078347227999990)*g[8] + CONSTANT(-0.161197023870999990)*g[20] + CONSTANT(-0.180223751574000000)*g[22]; + y[5] += tf*g[5] + tg*f[5]; + t = f[5] * g[5]; + y[0] += CONSTANT(0.282094791773999990)*t; + y[6] += CONSTANT(0.090111875786499998)*t; + y[8] += CONSTANT(-0.156078347227999990)*t; + y[20] += CONSTANT(-0.161197023870999990)*t; + y[22] += CONSTANT(-0.180223751574000000)*t; + + // [5,11]: 2,12,14, + tf = CONSTANT(0.233596680327000010)*f[2] + CONSTANT(0.059470803871800003)*f[12] + CONSTANT(-0.115164716491000000)*f[14]; + tg = CONSTANT(0.233596680327000010)*g[2] + CONSTANT(0.059470803871800003)*g[12] + CONSTANT(-0.115164716491000000)*g[14]; + y[5] += tf*g[11] + tg*f[11]; + y[11] += tf*g[5] + tg*f[5]; + t = f[5] * g[11] + f[11] * g[5]; + y[2] += CONSTANT(0.233596680327000010)*t; + y[12] += CONSTANT(0.059470803871800003)*t; + y[14] += CONSTANT(-0.115164716491000000)*t; + + // [5,17]: 8,22,24, + tf = CONSTANT(0.168583882832999990)*f[8] + CONSTANT(0.132725386548000010)*f[22] + CONSTANT(-0.140463346189000000)*f[24]; + tg = CONSTANT(0.168583882832999990)*g[8] + CONSTANT(0.132725386548000010)*g[22] + CONSTANT(-0.140463346189000000)*g[24]; + y[5] += tf*g[17] + tg*f[17]; + y[17] += tf*g[5] + tg*f[5]; + t = f[5] * g[17] + f[17] * g[5]; + y[8] += CONSTANT(0.168583882832999990)*t; + y[22] += CONSTANT(0.132725386548000010)*t; + y[24] += CONSTANT(-0.140463346189000000)*t; + + // [5,18]: 7,21,23, + tf = CONSTANT(0.180223751571000010)*f[7] + CONSTANT(0.090297865407399994)*f[21] + CONSTANT(-0.132725386549000010)*f[23]; + tg = CONSTANT(0.180223751571000010)*g[7] + CONSTANT(0.090297865407399994)*g[21] + CONSTANT(-0.132725386549000010)*g[23]; + y[5] += tf*g[18] + tg*f[18]; + y[18] += tf*g[5] + tg*f[5]; + t = f[5] * g[18] + f[18] * g[5]; + y[7] += CONSTANT(0.180223751571000010)*t; + y[21] += CONSTANT(0.090297865407399994)*t; + y[23] += CONSTANT(-0.132725386549000010)*t; + + // [5,19]: 6,8,20,22, + tf = CONSTANT(0.220728115440999990)*f[6] + CONSTANT(0.063718718433900007)*f[8] + CONSTANT(0.044869370061299998)*f[20] + CONSTANT(-0.090297865408399999)*f[22]; + tg = CONSTANT(0.220728115440999990)*g[6] + CONSTANT(0.063718718433900007)*g[8] + CONSTANT(0.044869370061299998)*g[20] + CONSTANT(-0.090297865408399999)*g[22]; + y[5] += tf*g[19] + tg*f[19]; + y[19] += tf*g[5] + tg*f[5]; + t = f[5] * g[19] + f[19] * g[5]; + y[6] += 
CONSTANT(0.220728115440999990)*t; + y[8] += CONSTANT(0.063718718433900007)*t; + y[20] += CONSTANT(0.044869370061299998)*t; + y[22] += CONSTANT(-0.090297865408399999)*t; + + // [6,6]: 0,6,20, + tf = CONSTANT(0.282094797560000000)*f[0] + CONSTANT(0.241795553185999990)*f[20]; + tg = CONSTANT(0.282094797560000000)*g[0] + CONSTANT(0.241795553185999990)*g[20]; + y[6] += tf*g[6] + tg*f[6]; + t = f[6] * g[6]; + y[0] += CONSTANT(0.282094797560000000)*t; + y[6] += CONSTANT(0.180223764527000010)*t; + y[20] += CONSTANT(0.241795553185999990)*t; + + // [7,7]: 6,0,8,20,22, + tf = CONSTANT(0.090111875786499998)*f[6] + CONSTANT(0.282094791773999990)*f[0] + CONSTANT(0.156078347227999990)*f[8] + CONSTANT(-0.161197023870999990)*f[20] + CONSTANT(0.180223751574000000)*f[22]; + tg = CONSTANT(0.090111875786499998)*g[6] + CONSTANT(0.282094791773999990)*g[0] + CONSTANT(0.156078347227999990)*g[8] + CONSTANT(-0.161197023870999990)*g[20] + CONSTANT(0.180223751574000000)*g[22]; + y[7] += tf*g[7] + tg*f[7]; + t = f[7] * g[7]; + y[6] += CONSTANT(0.090111875786499998)*t; + y[0] += CONSTANT(0.282094791773999990)*t; + y[8] += CONSTANT(0.156078347227999990)*t; + y[20] += CONSTANT(-0.161197023870999990)*t; + y[22] += CONSTANT(0.180223751574000000)*t; + + // [7,13]: 12,2,14, + tf = CONSTANT(0.059470803871800003)*f[12] + CONSTANT(0.233596680327000010)*f[2] + CONSTANT(0.115164716491000000)*f[14]; + tg = CONSTANT(0.059470803871800003)*g[12] + CONSTANT(0.233596680327000010)*g[2] + CONSTANT(0.115164716491000000)*g[14]; + y[7] += tf*g[13] + tg*f[13]; + y[13] += tf*g[7] + tg*f[7]; + t = f[7] * g[13] + f[13] * g[7]; + y[12] += CONSTANT(0.059470803871800003)*t; + y[2] += CONSTANT(0.233596680327000010)*t; + y[14] += CONSTANT(0.115164716491000000)*t; + + // [7,17]: 16,4,18, + tf = CONSTANT(0.140463346187999990)*f[16] + CONSTANT(0.168583882835000000)*f[4] + CONSTANT(0.132725386549000010)*f[18]; + tg = CONSTANT(0.140463346187999990)*g[16] + CONSTANT(0.168583882835000000)*g[4] + CONSTANT(0.132725386549000010)*g[18]; + y[7] += tf*g[17] + tg*f[17]; + y[17] += tf*g[7] + tg*f[7]; + t = f[7] * g[17] + f[17] * g[7]; + y[16] += CONSTANT(0.140463346187999990)*t; + y[4] += CONSTANT(0.168583882835000000)*t; + y[18] += CONSTANT(0.132725386549000010)*t; + + // [7,21]: 8,20,6,22, + tf = CONSTANT(-0.063718718433900007)*f[8] + CONSTANT(0.044869370061299998)*f[20] + CONSTANT(0.220728115440999990)*f[6] + CONSTANT(0.090297865408399999)*f[22]; + tg = CONSTANT(-0.063718718433900007)*g[8] + CONSTANT(0.044869370061299998)*g[20] + CONSTANT(0.220728115440999990)*g[6] + CONSTANT(0.090297865408399999)*g[22]; + y[7] += tf*g[21] + tg*f[21]; + y[21] += tf*g[7] + tg*f[7]; + t = f[7] * g[21] + f[21] * g[7]; + y[8] += CONSTANT(-0.063718718433900007)*t; + y[20] += CONSTANT(0.044869370061299998)*t; + y[6] += CONSTANT(0.220728115440999990)*t; + y[22] += CONSTANT(0.090297865408399999)*t; + + // [7,23]: 8,22,24, + tf = CONSTANT(0.168583882832999990)*f[8] + CONSTANT(0.132725386548000010)*f[22] + CONSTANT(0.140463346189000000)*f[24]; + tg = CONSTANT(0.168583882832999990)*g[8] + CONSTANT(0.132725386548000010)*g[22] + CONSTANT(0.140463346189000000)*g[24]; + y[7] += tf*g[23] + tg*f[23]; + y[23] += tf*g[7] + tg*f[7]; + t = f[7] * g[23] + f[23] * g[7]; + y[8] += CONSTANT(0.168583882832999990)*t; + y[22] += CONSTANT(0.132725386548000010)*t; + y[24] += CONSTANT(0.140463346189000000)*t; + + // [8,8]: 0,6,20,24, + tf = CONSTANT(0.282094791770000020)*f[0] + CONSTANT(-0.180223751576000010)*f[6] + CONSTANT(0.040299255967500003)*f[20] + CONSTANT(0.238413613505999990)*f[24]; + tg = 
CONSTANT(0.282094791770000020)*g[0] + CONSTANT(-0.180223751576000010)*g[6] + CONSTANT(0.040299255967500003)*g[20] + CONSTANT(0.238413613505999990)*g[24]; + y[8] += tf*g[8] + tg*f[8]; + t = f[8] * g[8]; + y[0] += CONSTANT(0.282094791770000020)*t; + y[6] += CONSTANT(-0.180223751576000010)*t; + y[20] += CONSTANT(0.040299255967500003)*t; + y[24] += CONSTANT(0.238413613505999990)*t; + + // [8,22]: 6,20,24, + tf = CONSTANT(0.156078347226000000)*f[6] + CONSTANT(-0.190364615029000010)*f[20] + CONSTANT(-0.075080816691500005)*f[24]; + tg = CONSTANT(0.156078347226000000)*g[6] + CONSTANT(-0.190364615029000010)*g[20] + CONSTANT(-0.075080816691500005)*g[24]; + y[8] += tf*g[22] + tg*f[22]; + y[22] += tf*g[8] + tg*f[8]; + t = f[8] * g[22] + f[22] * g[8]; + y[6] += CONSTANT(0.156078347226000000)*t; + y[20] += CONSTANT(-0.190364615029000010)*t; + y[24] += CONSTANT(-0.075080816691500005)*t; + + // [9,9]: 6,0,20, + tf = CONSTANT(-0.210261043508000010)*f[6] + CONSTANT(0.282094791766999970)*f[0] + CONSTANT(0.076934943209800002)*f[20]; + tg = CONSTANT(-0.210261043508000010)*g[6] + CONSTANT(0.282094791766999970)*g[0] + CONSTANT(0.076934943209800002)*g[20]; + y[9] += tf*g[9] + tg*f[9]; + t = f[9] * g[9]; + y[6] += CONSTANT(-0.210261043508000010)*t; + y[0] += CONSTANT(0.282094791766999970)*t; + y[20] += CONSTANT(0.076934943209800002)*t; + + // [9,10]: 7,21, + tf = CONSTANT(0.148677009678999990)*f[7] + CONSTANT(-0.099322584599600000)*f[21]; + tg = CONSTANT(0.148677009678999990)*g[7] + CONSTANT(-0.099322584599600000)*g[21]; + y[9] += tf*g[10] + tg*f[10]; + y[10] += tf*g[9] + tg*f[9]; + t = f[9] * g[10] + f[10] * g[9]; + y[7] += CONSTANT(0.148677009678999990)*t; + y[21] += CONSTANT(-0.099322584599600000)*t; + + // [9,11]: 8,22,24, + tf = CONSTANT(-0.094031597259499999)*f[8] + CONSTANT(0.133255230518000010)*f[22] + CONSTANT(0.117520066950999990)*f[24]; + tg = CONSTANT(-0.094031597259499999)*g[8] + CONSTANT(0.133255230518000010)*g[22] + CONSTANT(0.117520066950999990)*g[24]; + y[9] += tf*g[11] + tg*f[11]; + y[11] += tf*g[9] + tg*f[9]; + t = f[9] * g[11] + f[11] * g[9]; + y[8] += CONSTANT(-0.094031597259499999)*t; + y[22] += CONSTANT(0.133255230518000010)*t; + y[24] += CONSTANT(0.117520066950999990)*t; + + // [9,13]: 4,16,18, + tf = CONSTANT(-0.094031597258400004)*f[4] + CONSTANT(-0.117520066953000000)*f[16] + CONSTANT(0.133255230519000010)*f[18]; + tg = CONSTANT(-0.094031597258400004)*g[4] + CONSTANT(-0.117520066953000000)*g[16] + CONSTANT(0.133255230519000010)*g[18]; + y[9] += tf*g[13] + tg*f[13]; + y[13] += tf*g[9] + tg*f[9]; + t = f[9] * g[13] + f[13] * g[9]; + y[4] += CONSTANT(-0.094031597258400004)*t; + y[16] += CONSTANT(-0.117520066953000000)*t; + y[18] += CONSTANT(0.133255230519000010)*t; + + // [9,14]: 5,19, + tf = CONSTANT(0.148677009677999990)*f[5] + CONSTANT(-0.099322584600699995)*f[19]; + tg = CONSTANT(0.148677009677999990)*g[5] + CONSTANT(-0.099322584600699995)*g[19]; + y[9] += tf*g[14] + tg*f[14]; + y[14] += tf*g[9] + tg*f[9]; + t = f[9] * g[14] + f[14] * g[9]; + y[5] += CONSTANT(0.148677009677999990)*t; + y[19] += CONSTANT(-0.099322584600699995)*t; + + // [9,17]: 2,12, + tf = CONSTANT(0.162867503964999990)*f[2] + CONSTANT(-0.203550726872999990)*f[12]; + tg = CONSTANT(0.162867503964999990)*g[2] + CONSTANT(-0.203550726872999990)*g[12]; + y[9] += tf*g[17] + tg*f[17]; + y[17] += tf*g[9] + tg*f[9]; + t = f[9] * g[17] + f[17] * g[9]; + y[2] += CONSTANT(0.162867503964999990)*t; + y[12] += CONSTANT(-0.203550726872999990)*t; + + // [10,10]: 0,20,24, + tf = CONSTANT(0.282094791771999980)*f[0] + 
CONSTANT(-0.179514867494000000)*f[20] + CONSTANT(-0.151717754049000010)*f[24]; + tg = CONSTANT(0.282094791771999980)*g[0] + CONSTANT(-0.179514867494000000)*g[20] + CONSTANT(-0.151717754049000010)*g[24]; + y[10] += tf*g[10] + tg*f[10]; + t = f[10] * g[10]; + y[0] += CONSTANT(0.282094791771999980)*t; + y[20] += CONSTANT(-0.179514867494000000)*t; + y[24] += CONSTANT(-0.151717754049000010)*t; + + // [10,11]: 7,21,23, + tf = CONSTANT(0.115164716490000000)*f[7] + CONSTANT(0.102579924281000000)*f[21] + CONSTANT(-0.067850242288900006)*f[23]; + tg = CONSTANT(0.115164716490000000)*g[7] + CONSTANT(0.102579924281000000)*g[21] + CONSTANT(-0.067850242288900006)*g[23]; + y[10] += tf*g[11] + tg*f[11]; + y[11] += tf*g[10] + tg*f[10]; + t = f[10] * g[11] + f[11] * g[10]; + y[7] += CONSTANT(0.115164716490000000)*t; + y[21] += CONSTANT(0.102579924281000000)*t; + y[23] += CONSTANT(-0.067850242288900006)*t; + + // [10,12]: 4,18, + tf = CONSTANT(-0.188063194517999990)*f[4] + CONSTANT(-0.044418410173299998)*f[18]; + tg = CONSTANT(-0.188063194517999990)*g[4] + CONSTANT(-0.044418410173299998)*g[18]; + y[10] += tf*g[12] + tg*f[12]; + y[12] += tf*g[10] + tg*f[10]; + t = f[10] * g[12] + f[12] * g[10]; + y[4] += CONSTANT(-0.188063194517999990)*t; + y[18] += CONSTANT(-0.044418410173299998)*t; + + // [10,13]: 5,17,19, + tf = CONSTANT(0.115164716490000000)*f[5] + CONSTANT(0.067850242288900006)*f[17] + CONSTANT(0.102579924281000000)*f[19]; + tg = CONSTANT(0.115164716490000000)*g[5] + CONSTANT(0.067850242288900006)*g[17] + CONSTANT(0.102579924281000000)*g[19]; + y[10] += tf*g[13] + tg*f[13]; + y[13] += tf*g[10] + tg*f[10]; + t = f[10] * g[13] + f[13] * g[10]; + y[5] += CONSTANT(0.115164716490000000)*t; + y[17] += CONSTANT(0.067850242288900006)*t; + y[19] += CONSTANT(0.102579924281000000)*t; + + // [10,14]: 16, + tf = CONSTANT(0.151717754044999990)*f[16]; + tg = CONSTANT(0.151717754044999990)*g[16]; + y[10] += tf*g[14] + tg*f[14]; + y[14] += tf*g[10] + tg*f[10]; + t = f[10] * g[14] + f[14] * g[10]; + y[16] += CONSTANT(0.151717754044999990)*t; + + // [10,15]: 5,19, + tf = CONSTANT(-0.148677009678999990)*f[5] + CONSTANT(0.099322584599600000)*f[19]; + tg = CONSTANT(-0.148677009678999990)*g[5] + CONSTANT(0.099322584599600000)*g[19]; + y[10] += tf*g[15] + tg*f[15]; + y[15] += tf*g[10] + tg*f[10]; + t = f[10] * g[15] + f[15] * g[10]; + y[5] += CONSTANT(-0.148677009678999990)*t; + y[19] += CONSTANT(0.099322584599600000)*t; + + // [11,11]: 0,6,8,20,22, + tf = CONSTANT(0.282094791773999990)*f[0] + CONSTANT(0.126156626101000010)*f[6] + CONSTANT(-0.145673124078999990)*f[8] + CONSTANT(0.025644981070299999)*f[20] + CONSTANT(-0.114687841910000000)*f[22]; + tg = CONSTANT(0.282094791773999990)*g[0] + CONSTANT(0.126156626101000010)*g[6] + CONSTANT(-0.145673124078999990)*g[8] + CONSTANT(0.025644981070299999)*g[20] + CONSTANT(-0.114687841910000000)*g[22]; + y[11] += tf*g[11] + tg*f[11]; + t = f[11] * g[11]; + y[0] += CONSTANT(0.282094791773999990)*t; + y[6] += CONSTANT(0.126156626101000010)*t; + y[8] += CONSTANT(-0.145673124078999990)*t; + y[20] += CONSTANT(0.025644981070299999)*t; + y[22] += CONSTANT(-0.114687841910000000)*t; + + // [11,14]: 17, + tf = CONSTANT(0.067850242288500007)*f[17]; + tg = CONSTANT(0.067850242288500007)*g[17]; + y[11] += tf*g[14] + tg*f[14]; + y[14] += tf*g[11] + tg*f[11]; + t = f[11] * g[14] + f[14] * g[11]; + y[17] += CONSTANT(0.067850242288500007)*t; + + // [11,15]: 16, + tf = CONSTANT(-0.117520066953000000)*f[16]; + tg = CONSTANT(-0.117520066953000000)*g[16]; + y[11] += tf*g[15] + tg*f[15]; + y[15] += tf*g[11] + 
tg*f[11]; + t = f[11] * g[15] + f[15] * g[11]; + y[16] += CONSTANT(-0.117520066953000000)*t; + + // [11,18]: 3,13,15, + tf = CONSTANT(0.168583882834000000)*f[3] + CONSTANT(0.114687841909000000)*f[13] + CONSTANT(-0.133255230519000010)*f[15]; + tg = CONSTANT(0.168583882834000000)*g[3] + CONSTANT(0.114687841909000000)*g[13] + CONSTANT(-0.133255230519000010)*g[15]; + y[11] += tf*g[18] + tg*f[18]; + y[18] += tf*g[11] + tg*f[11]; + t = f[11] * g[18] + f[18] * g[11]; + y[3] += CONSTANT(0.168583882834000000)*t; + y[13] += CONSTANT(0.114687841909000000)*t; + y[15] += CONSTANT(-0.133255230519000010)*t; + + // [11,19]: 2,14,12, + tf = CONSTANT(0.238413613504000000)*f[2] + CONSTANT(-0.102579924282000000)*f[14] + CONSTANT(0.099322584599300004)*f[12]; + tg = CONSTANT(0.238413613504000000)*g[2] + CONSTANT(-0.102579924282000000)*g[14] + CONSTANT(0.099322584599300004)*g[12]; + y[11] += tf*g[19] + tg*f[19]; + y[19] += tf*g[11] + tg*f[11]; + t = f[11] * g[19] + f[19] * g[11]; + y[2] += CONSTANT(0.238413613504000000)*t; + y[14] += CONSTANT(-0.102579924282000000)*t; + y[12] += CONSTANT(0.099322584599300004)*t; + + // [12,12]: 0,6,20, + tf = CONSTANT(0.282094799871999980)*f[0] + CONSTANT(0.168208852954000010)*f[6] + CONSTANT(0.153869910786000010)*f[20]; + tg = CONSTANT(0.282094799871999980)*g[0] + CONSTANT(0.168208852954000010)*g[6] + CONSTANT(0.153869910786000010)*g[20]; + y[12] += tf*g[12] + tg*f[12]; + t = f[12] * g[12]; + y[0] += CONSTANT(0.282094799871999980)*t; + y[6] += CONSTANT(0.168208852954000010)*t; + y[20] += CONSTANT(0.153869910786000010)*t; + + // [12,14]: 8,22, + tf = CONSTANT(-0.188063194517999990)*f[8] + CONSTANT(-0.044418410173299998)*f[22]; + tg = CONSTANT(-0.188063194517999990)*g[8] + CONSTANT(-0.044418410173299998)*g[22]; + y[12] += tf*g[14] + tg*f[14]; + y[14] += tf*g[12] + tg*f[12]; + t = f[12] * g[14] + f[14] * g[12]; + y[8] += CONSTANT(-0.188063194517999990)*t; + y[22] += CONSTANT(-0.044418410173299998)*t; + + // [13,13]: 0,8,6,20,22, + tf = CONSTANT(0.282094791773999990)*f[0] + CONSTANT(0.145673124078999990)*f[8] + CONSTANT(0.126156626101000010)*f[6] + CONSTANT(0.025644981070299999)*f[20] + CONSTANT(0.114687841910000000)*f[22]; + tg = CONSTANT(0.282094791773999990)*g[0] + CONSTANT(0.145673124078999990)*g[8] + CONSTANT(0.126156626101000010)*g[6] + CONSTANT(0.025644981070299999)*g[20] + CONSTANT(0.114687841910000000)*g[22]; + y[13] += tf*g[13] + tg*f[13]; + t = f[13] * g[13]; + y[0] += CONSTANT(0.282094791773999990)*t; + y[8] += CONSTANT(0.145673124078999990)*t; + y[6] += CONSTANT(0.126156626101000010)*t; + y[20] += CONSTANT(0.025644981070299999)*t; + y[22] += CONSTANT(0.114687841910000000)*t; + + // [13,14]: 23, + tf = CONSTANT(0.067850242288500007)*f[23]; + tg = CONSTANT(0.067850242288500007)*g[23]; + y[13] += tf*g[14] + tg*f[14]; + y[14] += tf*g[13] + tg*f[13]; + t = f[13] * g[14] + f[14] * g[13]; + y[23] += CONSTANT(0.067850242288500007)*t; + + // [13,15]: 8,22,24, + tf = CONSTANT(-0.094031597259499999)*f[8] + CONSTANT(0.133255230518000010)*f[22] + CONSTANT(-0.117520066950999990)*f[24]; + tg = CONSTANT(-0.094031597259499999)*g[8] + CONSTANT(0.133255230518000010)*g[22] + CONSTANT(-0.117520066950999990)*g[24]; + y[13] += tf*g[15] + tg*f[15]; + y[15] += tf*g[13] + tg*f[13]; + t = f[13] * g[15] + f[15] * g[13]; + y[8] += CONSTANT(-0.094031597259499999)*t; + y[22] += CONSTANT(0.133255230518000010)*t; + y[24] += CONSTANT(-0.117520066950999990)*t; + + // [13,21]: 2,12,14, + tf = CONSTANT(0.238413613504000000)*f[2] + CONSTANT(0.099322584599300004)*f[12] + 
CONSTANT(0.102579924282000000)*f[14]; + tg = CONSTANT(0.238413613504000000)*g[2] + CONSTANT(0.099322584599300004)*g[12] + CONSTANT(0.102579924282000000)*g[14]; + y[13] += tf*g[21] + tg*f[21]; + y[21] += tf*g[13] + tg*f[13]; + t = f[13] * g[21] + f[21] * g[13]; + y[2] += CONSTANT(0.238413613504000000)*t; + y[12] += CONSTANT(0.099322584599300004)*t; + y[14] += CONSTANT(0.102579924282000000)*t; + + // [14,14]: 0,20,24, + tf = CONSTANT(0.282094791771999980)*f[0] + CONSTANT(-0.179514867494000000)*f[20] + CONSTANT(0.151717754049000010)*f[24]; + tg = CONSTANT(0.282094791771999980)*g[0] + CONSTANT(-0.179514867494000000)*g[20] + CONSTANT(0.151717754049000010)*g[24]; + y[14] += tf*g[14] + tg*f[14]; + t = f[14] * g[14]; + y[0] += CONSTANT(0.282094791771999980)*t; + y[20] += CONSTANT(-0.179514867494000000)*t; + y[24] += CONSTANT(0.151717754049000010)*t; + + // [14,15]: 7,21, + tf = CONSTANT(0.148677009677999990)*f[7] + CONSTANT(-0.099322584600699995)*f[21]; + tg = CONSTANT(0.148677009677999990)*g[7] + CONSTANT(-0.099322584600699995)*g[21]; + y[14] += tf*g[15] + tg*f[15]; + y[15] += tf*g[14] + tg*f[14]; + t = f[14] * g[15] + f[15] * g[14]; + y[7] += CONSTANT(0.148677009677999990)*t; + y[21] += CONSTANT(-0.099322584600699995)*t; + + // [15,15]: 0,6,20, + tf = CONSTANT(0.282094791766999970)*f[0] + CONSTANT(-0.210261043508000010)*f[6] + CONSTANT(0.076934943209800002)*f[20]; + tg = CONSTANT(0.282094791766999970)*g[0] + CONSTANT(-0.210261043508000010)*g[6] + CONSTANT(0.076934943209800002)*g[20]; + y[15] += tf*g[15] + tg*f[15]; + t = f[15] * g[15]; + y[0] += CONSTANT(0.282094791766999970)*t; + y[6] += CONSTANT(-0.210261043508000010)*t; + y[20] += CONSTANT(0.076934943209800002)*t; + + // [15,23]: 12,2, + tf = CONSTANT(-0.203550726872999990)*f[12] + CONSTANT(0.162867503964999990)*f[2]; + tg = CONSTANT(-0.203550726872999990)*g[12] + CONSTANT(0.162867503964999990)*g[2]; + y[15] += tf*g[23] + tg*f[23]; + y[23] += tf*g[15] + tg*f[15]; + t = f[15] * g[23] + f[23] * g[15]; + y[12] += CONSTANT(-0.203550726872999990)*t; + y[2] += CONSTANT(0.162867503964999990)*t; + + // [16,16]: 0,6,20, + tf = CONSTANT(0.282094791763999990)*f[0] + CONSTANT(-0.229375683829000000)*f[6] + CONSTANT(0.106525305981000000)*f[20]; + tg = CONSTANT(0.282094791763999990)*g[0] + CONSTANT(-0.229375683829000000)*g[6] + CONSTANT(0.106525305981000000)*g[20]; + y[16] += tf*g[16] + tg*f[16]; + t = f[16] * g[16]; + y[0] += CONSTANT(0.282094791763999990)*t; + y[6] += CONSTANT(-0.229375683829000000)*t; + y[20] += CONSTANT(0.106525305981000000)*t; + + // [16,18]: 8,22, + tf = CONSTANT(-0.075080816693699995)*f[8] + CONSTANT(0.135045473380000000)*f[22]; + tg = CONSTANT(-0.075080816693699995)*g[8] + CONSTANT(0.135045473380000000)*g[22]; + y[16] += tf*g[18] + tg*f[18]; + y[18] += tf*g[16] + tg*f[16]; + t = f[16] * g[18] + f[18] * g[16]; + y[8] += CONSTANT(-0.075080816693699995)*t; + y[22] += CONSTANT(0.135045473380000000)*t; + + // [16,23]: 19,5, + tf = CONSTANT(-0.119098912754999990)*f[19] + CONSTANT(0.140463346187999990)*f[5]; + tg = CONSTANT(-0.119098912754999990)*g[19] + CONSTANT(0.140463346187999990)*g[5]; + y[16] += tf*g[23] + tg*f[23]; + y[23] += tf*g[16] + tg*f[16]; + t = f[16] * g[23] + f[23] * g[16]; + y[19] += CONSTANT(-0.119098912754999990)*t; + y[5] += CONSTANT(0.140463346187999990)*t; + + // [17,17]: 0,6,20, + tf = CONSTANT(0.282094791768999990)*f[0] + CONSTANT(-0.057343920955899998)*f[6] + CONSTANT(-0.159787958979000000)*f[20]; + tg = CONSTANT(0.282094791768999990)*g[0] + CONSTANT(-0.057343920955899998)*g[6] + 
CONSTANT(-0.159787958979000000)*g[20]; + y[17] += tf*g[17] + tg*f[17]; + t = f[17] * g[17]; + y[0] += CONSTANT(0.282094791768999990)*t; + y[6] += CONSTANT(-0.057343920955899998)*t; + y[20] += CONSTANT(-0.159787958979000000)*t; + + // [17,19]: 8,22,24, + tf = CONSTANT(-0.112621225039000000)*f[8] + CONSTANT(0.045015157794100001)*f[22] + CONSTANT(0.119098912753000000)*f[24]; + tg = CONSTANT(-0.112621225039000000)*g[8] + CONSTANT(0.045015157794100001)*g[22] + CONSTANT(0.119098912753000000)*g[24]; + y[17] += tf*g[19] + tg*f[19]; + y[19] += tf*g[17] + tg*f[17]; + t = f[17] * g[19] + f[19] * g[17]; + y[8] += CONSTANT(-0.112621225039000000)*t; + y[22] += CONSTANT(0.045015157794100001)*t; + y[24] += CONSTANT(0.119098912753000000)*t; + + // [17,21]: 16,4,18, + tf = CONSTANT(-0.119098912754999990)*f[16] + CONSTANT(-0.112621225039000000)*f[4] + CONSTANT(0.045015157794399997)*f[18]; + tg = CONSTANT(-0.119098912754999990)*g[16] + CONSTANT(-0.112621225039000000)*g[4] + CONSTANT(0.045015157794399997)*g[18]; + y[17] += tf*g[21] + tg*f[21]; + y[21] += tf*g[17] + tg*f[17]; + t = f[17] * g[21] + f[21] * g[17]; + y[16] += CONSTANT(-0.119098912754999990)*t; + y[4] += CONSTANT(-0.112621225039000000)*t; + y[18] += CONSTANT(0.045015157794399997)*t; + + // [18,18]: 6,0,20,24, + tf = CONSTANT(0.065535909662600006)*f[6] + CONSTANT(0.282094791771999980)*f[0] + CONSTANT(-0.083698454702400005)*f[20] + CONSTANT(-0.135045473384000000)*f[24]; + tg = CONSTANT(0.065535909662600006)*g[6] + CONSTANT(0.282094791771999980)*g[0] + CONSTANT(-0.083698454702400005)*g[20] + CONSTANT(-0.135045473384000000)*g[24]; + y[18] += tf*g[18] + tg*f[18]; + t = f[18] * g[18]; + y[6] += CONSTANT(0.065535909662600006)*t; + y[0] += CONSTANT(0.282094791771999980)*t; + y[20] += CONSTANT(-0.083698454702400005)*t; + y[24] += CONSTANT(-0.135045473384000000)*t; + + // [18,19]: 7,21,23, + tf = CONSTANT(0.090297865407399994)*f[7] + CONSTANT(0.102084782359000000)*f[21] + CONSTANT(-0.045015157794399997)*f[23]; + tg = CONSTANT(0.090297865407399994)*g[7] + CONSTANT(0.102084782359000000)*g[21] + CONSTANT(-0.045015157794399997)*g[23]; + y[18] += tf*g[19] + tg*f[19]; + y[19] += tf*g[18] + tg*f[18]; + t = f[18] * g[19] + f[19] * g[18]; + y[7] += CONSTANT(0.090297865407399994)*t; + y[21] += CONSTANT(0.102084782359000000)*t; + y[23] += CONSTANT(-0.045015157794399997)*t; + + // [19,19]: 6,8,0,20,22, + tf = CONSTANT(0.139263808033999990)*f[6] + CONSTANT(-0.141889406570999990)*f[8] + CONSTANT(0.282094791773999990)*f[0] + CONSTANT(0.068480553847200004)*f[20] + CONSTANT(-0.102084782360000000)*f[22]; + tg = CONSTANT(0.139263808033999990)*g[6] + CONSTANT(-0.141889406570999990)*g[8] + CONSTANT(0.282094791773999990)*g[0] + CONSTANT(0.068480553847200004)*g[20] + CONSTANT(-0.102084782360000000)*g[22]; + y[19] += tf*g[19] + tg*f[19]; + t = f[19] * g[19]; + y[6] += CONSTANT(0.139263808033999990)*t; + y[8] += CONSTANT(-0.141889406570999990)*t; + y[0] += CONSTANT(0.282094791773999990)*t; + y[20] += CONSTANT(0.068480553847200004)*t; + y[22] += CONSTANT(-0.102084782360000000)*t; + + // [20,20]: 6,0,20, + tf = CONSTANT(0.163839797503000010)*f[6] + CONSTANT(0.282094802232000010)*f[0]; + tg = CONSTANT(0.163839797503000010)*g[6] + CONSTANT(0.282094802232000010)*g[0]; + y[20] += tf*g[20] + tg*f[20]; + t = f[20] * g[20]; + y[6] += CONSTANT(0.163839797503000010)*t; + y[0] += CONSTANT(0.282094802232000010)*t; + y[20] += CONSTANT(0.136961139005999990)*t; + + // [21,21]: 6,20,0,8,22, + tf = CONSTANT(0.139263808033999990)*f[6] + CONSTANT(0.068480553847200004)*f[20] + 
CONSTANT(0.282094791773999990)*f[0] + CONSTANT(0.141889406570999990)*f[8] + CONSTANT(0.102084782360000000)*f[22]; + tg = CONSTANT(0.139263808033999990)*g[6] + CONSTANT(0.068480553847200004)*g[20] + CONSTANT(0.282094791773999990)*g[0] + CONSTANT(0.141889406570999990)*g[8] + CONSTANT(0.102084782360000000)*g[22]; + y[21] += tf*g[21] + tg*f[21]; + t = f[21] * g[21]; + y[6] += CONSTANT(0.139263808033999990)*t; + y[20] += CONSTANT(0.068480553847200004)*t; + y[0] += CONSTANT(0.282094791773999990)*t; + y[8] += CONSTANT(0.141889406570999990)*t; + y[22] += CONSTANT(0.102084782360000000)*t; + + // [21,23]: 8,22,24, + tf = CONSTANT(-0.112621225039000000)*f[8] + CONSTANT(0.045015157794100001)*f[22] + CONSTANT(-0.119098912753000000)*f[24]; + tg = CONSTANT(-0.112621225039000000)*g[8] + CONSTANT(0.045015157794100001)*g[22] + CONSTANT(-0.119098912753000000)*g[24]; + y[21] += tf*g[23] + tg*f[23]; + y[23] += tf*g[21] + tg*f[21]; + t = f[21] * g[23] + f[23] * g[21]; + y[8] += CONSTANT(-0.112621225039000000)*t; + y[22] += CONSTANT(0.045015157794100001)*t; + y[24] += CONSTANT(-0.119098912753000000)*t; + + // [22,22]: 6,20,0,24, + tf = CONSTANT(0.065535909662600006)*f[6] + CONSTANT(-0.083698454702400005)*f[20] + CONSTANT(0.282094791771999980)*f[0] + CONSTANT(0.135045473384000000)*f[24]; + tg = CONSTANT(0.065535909662600006)*g[6] + CONSTANT(-0.083698454702400005)*g[20] + CONSTANT(0.282094791771999980)*g[0] + CONSTANT(0.135045473384000000)*g[24]; + y[22] += tf*g[22] + tg*f[22]; + t = f[22] * g[22]; + y[6] += CONSTANT(0.065535909662600006)*t; + y[20] += CONSTANT(-0.083698454702400005)*t; + y[0] += CONSTANT(0.282094791771999980)*t; + y[24] += CONSTANT(0.135045473384000000)*t; + + // [23,23]: 6,20,0, + tf = CONSTANT(-0.057343920955899998)*f[6] + CONSTANT(-0.159787958979000000)*f[20] + CONSTANT(0.282094791768999990)*f[0]; + tg = CONSTANT(-0.057343920955899998)*g[6] + CONSTANT(-0.159787958979000000)*g[20] + CONSTANT(0.282094791768999990)*g[0]; + y[23] += tf*g[23] + tg*f[23]; + t = f[23] * g[23]; + y[6] += CONSTANT(-0.057343920955899998)*t; + y[20] += CONSTANT(-0.159787958979000000)*t; + y[0] += CONSTANT(0.282094791768999990)*t; + + // [24,24]: 6,0,20, + tf = CONSTANT(-0.229375683829000000)*f[6] + CONSTANT(0.282094791763999990)*f[0] + CONSTANT(0.106525305981000000)*f[20]; + tg = CONSTANT(-0.229375683829000000)*g[6] + CONSTANT(0.282094791763999990)*g[0] + CONSTANT(0.106525305981000000)*g[20]; + y[24] += tf*g[24] + tg*f[24]; + t = f[24] * g[24]; + y[6] += CONSTANT(-0.229375683829000000)*t; + y[0] += CONSTANT(0.282094791763999990)*t; + y[20] += CONSTANT(0.106525305981000000)*t; + + // multiply count=1135 + + return y; +} + + +//------------------------------------------------------------------------------------- +// http://msdn.microsoft.com/en-us/library/windows/desktop/bb232909.aspx +//------------------------------------------------------------------------------------- +_Use_decl_annotations_ +float* DirectX::XMSHMultiply6( + float *y, + const float *f, + const float *g) noexcept +{ + if (!y || !f || !g) + return nullptr; + + REAL tf, tg, t; + // [0,0]: 0, + y[0] = CONSTANT(0.282094792935999980)*f[0] * g[0]; + + // [1,1]: 0,6,8, + tf = CONSTANT(0.282094791773000010)*f[0] + CONSTANT(-0.126156626101000010)*f[6] + CONSTANT(-0.218509686119999990)*f[8]; + tg = CONSTANT(0.282094791773000010)*g[0] + CONSTANT(-0.126156626101000010)*g[6] + CONSTANT(-0.218509686119999990)*g[8]; + y[1] = tf*g[1] + tg*f[1]; + t = f[1] * g[1]; + y[0] += CONSTANT(0.282094791773000010)*t; + y[6] = CONSTANT(-0.126156626101000010)*t; + y[8] = 
CONSTANT(-0.218509686119999990)*t; + + // [1,4]: 3,13,15, + tf = CONSTANT(0.218509686114999990)*f[3] + CONSTANT(-0.058399170082300000)*f[13] + CONSTANT(-0.226179013157999990)*f[15]; + tg = CONSTANT(0.218509686114999990)*g[3] + CONSTANT(-0.058399170082300000)*g[13] + CONSTANT(-0.226179013157999990)*g[15]; + y[1] += tf*g[4] + tg*f[4]; + y[4] = tf*g[1] + tg*f[1]; + t = f[1] * g[4] + f[4] * g[1]; + y[3] = CONSTANT(0.218509686114999990)*t; + y[13] = CONSTANT(-0.058399170082300000)*t; + y[15] = CONSTANT(-0.226179013157999990)*t; + + // [1,5]: 2,12, + tf = CONSTANT(0.218509686118000010)*f[2] + CONSTANT(-0.143048168103000000)*f[12]; + tg = CONSTANT(0.218509686118000010)*g[2] + CONSTANT(-0.143048168103000000)*g[12]; + y[1] += tf*g[5] + tg*f[5]; + y[5] = tf*g[1] + tg*f[1]; + t = f[1] * g[5] + f[5] * g[1]; + y[2] = CONSTANT(0.218509686118000010)*t; + y[12] = CONSTANT(-0.143048168103000000)*t; + + // [1,11]: 6,8,20,22, + tf = CONSTANT(0.202300659402999990)*f[6] + CONSTANT(0.058399170081799998)*f[8] + CONSTANT(-0.150786008773000000)*f[20] + CONSTANT(-0.168583882836999990)*f[22]; + tg = CONSTANT(0.202300659402999990)*g[6] + CONSTANT(0.058399170081799998)*g[8] + CONSTANT(-0.150786008773000000)*g[20] + CONSTANT(-0.168583882836999990)*g[22]; + y[1] += tf*g[11] + tg*f[11]; + y[11] = tf*g[1] + tg*f[1]; + t = f[1] * g[11] + f[11] * g[1]; + y[6] += CONSTANT(0.202300659402999990)*t; + y[8] += CONSTANT(0.058399170081799998)*t; + y[20] = CONSTANT(-0.150786008773000000)*t; + y[22] = CONSTANT(-0.168583882836999990)*t; + + // [1,16]: 15,33,35, + tf = CONSTANT(0.230329432973999990)*f[15] + CONSTANT(-0.034723468517399998)*f[33] + CONSTANT(-0.232932108051999990)*f[35]; + tg = CONSTANT(0.230329432973999990)*g[15] + CONSTANT(-0.034723468517399998)*g[33] + CONSTANT(-0.232932108051999990)*g[35]; + y[1] += tf*g[16] + tg*f[16]; + y[16] = tf*g[1] + tg*f[1]; + t = f[1] * g[16] + f[16] * g[1]; + y[15] += CONSTANT(0.230329432973999990)*t; + y[33] = CONSTANT(-0.034723468517399998)*t; + y[35] = CONSTANT(-0.232932108051999990)*t; + + // [1,18]: 15,13,31,33, + tf = CONSTANT(0.043528171377799997)*f[15] + CONSTANT(0.168583882834000000)*f[13] + CONSTANT(-0.085054779966799998)*f[31] + CONSTANT(-0.183739324705999990)*f[33]; + tg = CONSTANT(0.043528171377799997)*g[15] + CONSTANT(0.168583882834000000)*g[13] + CONSTANT(-0.085054779966799998)*g[31] + CONSTANT(-0.183739324705999990)*g[33]; + y[1] += tf*g[18] + tg*f[18]; + y[18] = tf*g[1] + tg*f[1]; + t = f[1] * g[18] + f[18] * g[1]; + y[15] += CONSTANT(0.043528171377799997)*t; + y[13] += CONSTANT(0.168583882834000000)*t; + y[31] = CONSTANT(-0.085054779966799998)*t; + y[33] += CONSTANT(-0.183739324705999990)*t; + + // [1,19]: 14,12,30,32, + tf = CONSTANT(0.075393004386399995)*f[14] + CONSTANT(0.194663900273000010)*f[12] + CONSTANT(-0.155288072037000010)*f[30] + CONSTANT(-0.159122922869999990)*f[32]; + tg = CONSTANT(0.075393004386399995)*g[14] + CONSTANT(0.194663900273000010)*g[12] + CONSTANT(-0.155288072037000010)*g[30] + CONSTANT(-0.159122922869999990)*g[32]; + y[1] += tf*g[19] + tg*f[19]; + y[19] = tf*g[1] + tg*f[1]; + t = f[1] * g[19] + f[19] * g[1]; + y[14] = CONSTANT(0.075393004386399995)*t; + y[12] += CONSTANT(0.194663900273000010)*t; + y[30] = CONSTANT(-0.155288072037000010)*t; + y[32] = CONSTANT(-0.159122922869999990)*t; + + // [1,24]: 9,25,27, + tf = CONSTANT(-0.230329432978999990)*f[9] + CONSTANT(0.232932108049000000)*f[25] + CONSTANT(0.034723468517100002)*f[27]; + tg = CONSTANT(-0.230329432978999990)*g[9] + CONSTANT(0.232932108049000000)*g[25] + CONSTANT(0.034723468517100002)*g[27]; 
+ y[1] += tf*g[24] + tg*f[24]; + y[24] = tf*g[1] + tg*f[1]; + t = f[1] * g[24] + f[24] * g[1]; + y[9] = CONSTANT(-0.230329432978999990)*t; + y[25] = CONSTANT(0.232932108049000000)*t; + y[27] = CONSTANT(0.034723468517100002)*t; + + // [1,29]: 22,20, + tf = CONSTANT(0.085054779965999999)*f[22] + CONSTANT(0.190188269815000010)*f[20]; + tg = CONSTANT(0.085054779965999999)*g[22] + CONSTANT(0.190188269815000010)*g[20]; + y[1] += tf*g[29] + tg*f[29]; + y[29] = tf*g[1] + tg*f[1]; + t = f[1] * g[29] + f[29] * g[1]; + y[22] += CONSTANT(0.085054779965999999)*t; + y[20] += CONSTANT(0.190188269815000010)*t; + + // [2,2]: 0,6, + tf = CONSTANT(0.282094795249000000)*f[0] + CONSTANT(0.252313259986999990)*f[6]; + tg = CONSTANT(0.282094795249000000)*g[0] + CONSTANT(0.252313259986999990)*g[6]; + y[2] += tf*g[2] + tg*f[2]; + t = f[2] * g[2]; + y[0] += CONSTANT(0.282094795249000000)*t; + y[6] += CONSTANT(0.252313259986999990)*t; + + // [2,12]: 6,20, + tf = CONSTANT(0.247766706973999990)*f[6] + CONSTANT(0.246232537174000010)*f[20]; + tg = CONSTANT(0.247766706973999990)*g[6] + CONSTANT(0.246232537174000010)*g[20]; + y[2] += tf*g[12] + tg*f[12]; + y[12] += tf*g[2] + tg*f[2]; + t = f[2] * g[12] + f[12] * g[2]; + y[6] += CONSTANT(0.247766706973999990)*t; + y[20] += CONSTANT(0.246232537174000010)*t; + + // [2,20]: 30, + tf = CONSTANT(0.245532020560000010)*f[30]; + tg = CONSTANT(0.245532020560000010)*g[30]; + y[2] += tf*g[20] + tg*f[20]; + y[20] += tf*g[2] + tg*f[2]; + t = f[2] * g[20] + f[20] * g[2]; + y[30] += CONSTANT(0.245532020560000010)*t; + + // [3,3]: 0,6,8, + tf = CONSTANT(0.282094791773000010)*f[0] + CONSTANT(-0.126156626101000010)*f[6] + CONSTANT(0.218509686119999990)*f[8]; + tg = CONSTANT(0.282094791773000010)*g[0] + CONSTANT(-0.126156626101000010)*g[6] + CONSTANT(0.218509686119999990)*g[8]; + y[3] += tf*g[3] + tg*f[3]; + t = f[3] * g[3]; + y[0] += CONSTANT(0.282094791773000010)*t; + y[6] += CONSTANT(-0.126156626101000010)*t; + y[8] += CONSTANT(0.218509686119999990)*t; + + // [3,7]: 2,12, + tf = CONSTANT(0.218509686118000010)*f[2] + CONSTANT(-0.143048168103000000)*f[12]; + tg = CONSTANT(0.218509686118000010)*g[2] + CONSTANT(-0.143048168103000000)*g[12]; + y[3] += tf*g[7] + tg*f[7]; + y[7] = tf*g[3] + tg*f[3]; + t = f[3] * g[7] + f[7] * g[3]; + y[2] += CONSTANT(0.218509686118000010)*t; + y[12] += CONSTANT(-0.143048168103000000)*t; + + // [3,13]: 8,6,20,22, + tf = CONSTANT(-0.058399170081799998)*f[8] + CONSTANT(0.202300659402999990)*f[6] + CONSTANT(-0.150786008773000000)*f[20] + CONSTANT(0.168583882836999990)*f[22]; + tg = CONSTANT(-0.058399170081799998)*g[8] + CONSTANT(0.202300659402999990)*g[6] + CONSTANT(-0.150786008773000000)*g[20] + CONSTANT(0.168583882836999990)*g[22]; + y[3] += tf*g[13] + tg*f[13]; + y[13] += tf*g[3] + tg*f[3]; + t = f[3] * g[13] + f[13] * g[3]; + y[8] += CONSTANT(-0.058399170081799998)*t; + y[6] += CONSTANT(0.202300659402999990)*t; + y[20] += CONSTANT(-0.150786008773000000)*t; + y[22] += CONSTANT(0.168583882836999990)*t; + + // [3,16]: 9,25,27, + tf = CONSTANT(0.230329432973999990)*f[9] + CONSTANT(0.232932108051999990)*f[25] + CONSTANT(-0.034723468517399998)*f[27]; + tg = CONSTANT(0.230329432973999990)*g[9] + CONSTANT(0.232932108051999990)*g[25] + CONSTANT(-0.034723468517399998)*g[27]; + y[3] += tf*g[16] + tg*f[16]; + y[16] += tf*g[3] + tg*f[3]; + t = f[3] * g[16] + f[16] * g[3]; + y[9] += CONSTANT(0.230329432973999990)*t; + y[25] += CONSTANT(0.232932108051999990)*t; + y[27] += CONSTANT(-0.034723468517399998)*t; + + // [3,21]: 12,14,30,32, + tf = CONSTANT(0.194663900273000010)*f[12] 
+ CONSTANT(-0.075393004386399995)*f[14] + CONSTANT(-0.155288072037000010)*f[30] + CONSTANT(0.159122922869999990)*f[32]; + tg = CONSTANT(0.194663900273000010)*g[12] + CONSTANT(-0.075393004386399995)*g[14] + CONSTANT(-0.155288072037000010)*g[30] + CONSTANT(0.159122922869999990)*g[32]; + y[3] += tf*g[21] + tg*f[21]; + y[21] = tf*g[3] + tg*f[3]; + t = f[3] * g[21] + f[21] * g[3]; + y[12] += CONSTANT(0.194663900273000010)*t; + y[14] += CONSTANT(-0.075393004386399995)*t; + y[30] += CONSTANT(-0.155288072037000010)*t; + y[32] += CONSTANT(0.159122922869999990)*t; + + // [3,24]: 15,33,35, + tf = CONSTANT(0.230329432978999990)*f[15] + CONSTANT(-0.034723468517100002)*f[33] + CONSTANT(0.232932108049000000)*f[35]; + tg = CONSTANT(0.230329432978999990)*g[15] + CONSTANT(-0.034723468517100002)*g[33] + CONSTANT(0.232932108049000000)*g[35]; + y[3] += tf*g[24] + tg*f[24]; + y[24] += tf*g[3] + tg*f[3]; + t = f[3] * g[24] + f[24] * g[3]; + y[15] += CONSTANT(0.230329432978999990)*t; + y[33] += CONSTANT(-0.034723468517100002)*t; + y[35] += CONSTANT(0.232932108049000000)*t; + + // [3,31]: 20,22, + tf = CONSTANT(0.190188269815000010)*f[20] + CONSTANT(-0.085054779965999999)*f[22]; + tg = CONSTANT(0.190188269815000010)*g[20] + CONSTANT(-0.085054779965999999)*g[22]; + y[3] += tf*g[31] + tg*f[31]; + y[31] += tf*g[3] + tg*f[3]; + t = f[3] * g[31] + f[31] * g[3]; + y[20] += CONSTANT(0.190188269815000010)*t; + y[22] += CONSTANT(-0.085054779965999999)*t; + + // [4,4]: 0,6,20,24, + tf = CONSTANT(0.282094791770000020)*f[0] + CONSTANT(-0.180223751576000010)*f[6] + CONSTANT(0.040299255967500003)*f[20] + CONSTANT(-0.238413613505999990)*f[24]; + tg = CONSTANT(0.282094791770000020)*g[0] + CONSTANT(-0.180223751576000010)*g[6] + CONSTANT(0.040299255967500003)*g[20] + CONSTANT(-0.238413613505999990)*g[24]; + y[4] += tf*g[4] + tg*f[4]; + t = f[4] * g[4]; + y[0] += CONSTANT(0.282094791770000020)*t; + y[6] += CONSTANT(-0.180223751576000010)*t; + y[20] += CONSTANT(0.040299255967500003)*t; + y[24] += CONSTANT(-0.238413613505999990)*t; + + // [4,5]: 7,21,23, + tf = CONSTANT(0.156078347226000000)*f[7] + CONSTANT(-0.063718718434399996)*f[21] + CONSTANT(-0.168583882835000000)*f[23]; + tg = CONSTANT(0.156078347226000000)*g[7] + CONSTANT(-0.063718718434399996)*g[21] + CONSTANT(-0.168583882835000000)*g[23]; + y[4] += tf*g[5] + tg*f[5]; + y[5] += tf*g[4] + tg*f[4]; + t = f[4] * g[5] + f[5] * g[4]; + y[7] += CONSTANT(0.156078347226000000)*t; + y[21] += CONSTANT(-0.063718718434399996)*t; + y[23] = CONSTANT(-0.168583882835000000)*t; + + // [4,9]: 3,13,31,35, + tf = CONSTANT(0.226179013157999990)*f[3] + CONSTANT(-0.094031597258400004)*f[13] + CONSTANT(0.016943317729299998)*f[31] + CONSTANT(-0.245532000542000000)*f[35]; + tg = CONSTANT(0.226179013157999990)*g[3] + CONSTANT(-0.094031597258400004)*g[13] + CONSTANT(0.016943317729299998)*g[31] + CONSTANT(-0.245532000542000000)*g[35]; + y[4] += tf*g[9] + tg*f[9]; + y[9] += tf*g[4] + tg*f[4]; + t = f[4] * g[9] + f[9] * g[4]; + y[3] += CONSTANT(0.226179013157999990)*t; + y[13] += CONSTANT(-0.094031597258400004)*t; + y[31] += CONSTANT(0.016943317729299998)*t; + y[35] += CONSTANT(-0.245532000542000000)*t; + + // [4,10]: 2,12,30,34, + tf = CONSTANT(0.184674390919999990)*f[2] + CONSTANT(-0.188063194517999990)*f[12] + CONSTANT(0.053579475144400000)*f[30] + CONSTANT(-0.190188269816000010)*f[34]; + tg = CONSTANT(0.184674390919999990)*g[2] + CONSTANT(-0.188063194517999990)*g[12] + CONSTANT(0.053579475144400000)*g[30] + CONSTANT(-0.190188269816000010)*g[34]; + y[4] += tf*g[10] + tg*f[10]; + y[10] = tf*g[4] + 
tg*f[4]; + t = f[4] * g[10] + f[10] * g[4]; + y[2] += CONSTANT(0.184674390919999990)*t; + y[12] += CONSTANT(-0.188063194517999990)*t; + y[30] += CONSTANT(0.053579475144400000)*t; + y[34] = CONSTANT(-0.190188269816000010)*t; + + // [4,11]: 3,13,15,31,33, + tf = CONSTANT(-0.058399170082300000)*f[3] + CONSTANT(0.145673124078000010)*f[13] + CONSTANT(0.094031597258400004)*f[15] + CONSTANT(-0.065621187395699998)*f[31] + CONSTANT(-0.141757966610000010)*f[33]; + tg = CONSTANT(-0.058399170082300000)*g[3] + CONSTANT(0.145673124078000010)*g[13] + CONSTANT(0.094031597258400004)*g[15] + CONSTANT(-0.065621187395699998)*g[31] + CONSTANT(-0.141757966610000010)*g[33]; + y[4] += tf*g[11] + tg*f[11]; + y[11] += tf*g[4] + tg*f[4]; + t = f[4] * g[11] + f[11] * g[4]; + y[3] += CONSTANT(-0.058399170082300000)*t; + y[13] += CONSTANT(0.145673124078000010)*t; + y[15] += CONSTANT(0.094031597258400004)*t; + y[31] += CONSTANT(-0.065621187395699998)*t; + y[33] += CONSTANT(-0.141757966610000010)*t; + + // [4,16]: 8,22, + tf = CONSTANT(0.238413613494000000)*f[8] + CONSTANT(-0.075080816693699995)*f[22]; + tg = CONSTANT(0.238413613494000000)*g[8] + CONSTANT(-0.075080816693699995)*g[22]; + y[4] += tf*g[16] + tg*f[16]; + y[16] += tf*g[4] + tg*f[4]; + t = f[4] * g[16] + f[16] * g[4]; + y[8] += CONSTANT(0.238413613494000000)*t; + y[22] += CONSTANT(-0.075080816693699995)*t; + + // [4,18]: 6,20,24, + tf = CONSTANT(0.156078347226000000)*f[6] + CONSTANT(-0.190364615029000010)*f[20] + CONSTANT(0.075080816691500005)*f[24]; + tg = CONSTANT(0.156078347226000000)*g[6] + CONSTANT(-0.190364615029000010)*g[20] + CONSTANT(0.075080816691500005)*g[24]; + y[4] += tf*g[18] + tg*f[18]; + y[18] += tf*g[4] + tg*f[4]; + t = f[4] * g[18] + f[18] * g[4]; + y[6] += CONSTANT(0.156078347226000000)*t; + y[20] += CONSTANT(-0.190364615029000010)*t; + y[24] += CONSTANT(0.075080816691500005)*t; + + // [4,19]: 7,21,23, + tf = CONSTANT(-0.063718718434399996)*f[7] + CONSTANT(0.141889406569999990)*f[21] + CONSTANT(0.112621225039000000)*f[23]; + tg = CONSTANT(-0.063718718434399996)*g[7] + CONSTANT(0.141889406569999990)*g[21] + CONSTANT(0.112621225039000000)*g[23]; + y[4] += tf*g[19] + tg*f[19]; + y[19] += tf*g[4] + tg*f[4]; + t = f[4] * g[19] + f[19] * g[4]; + y[7] += CONSTANT(-0.063718718434399996)*t; + y[21] += CONSTANT(0.141889406569999990)*t; + y[23] += CONSTANT(0.112621225039000000)*t; + + // [4,25]: 15,33, + tf = CONSTANT(0.245532000542000000)*f[15] + CONSTANT(-0.062641347680800000)*f[33]; + tg = CONSTANT(0.245532000542000000)*g[15] + CONSTANT(-0.062641347680800000)*g[33]; + y[4] += tf*g[25] + tg*f[25]; + y[25] += tf*g[4] + tg*f[4]; + t = f[4] * g[25] + f[25] * g[4]; + y[15] += CONSTANT(0.245532000542000000)*t; + y[33] += CONSTANT(-0.062641347680800000)*t; + + // [4,26]: 14,32, + tf = CONSTANT(0.190188269806999990)*f[14] + CONSTANT(-0.097043558542400002)*f[32]; + tg = CONSTANT(0.190188269806999990)*g[14] + CONSTANT(-0.097043558542400002)*g[32]; + y[4] += tf*g[26] + tg*f[26]; + y[26] = tf*g[4] + tg*f[4]; + t = f[4] * g[26] + f[26] * g[4]; + y[14] += CONSTANT(0.190188269806999990)*t; + y[32] += CONSTANT(-0.097043558542400002)*t; + + // [4,27]: 13,31,35, + tf = CONSTANT(0.141757966610000010)*f[13] + CONSTANT(-0.121034582549000000)*f[31] + CONSTANT(0.062641347680800000)*f[35]; + tg = CONSTANT(0.141757966610000010)*g[13] + CONSTANT(-0.121034582549000000)*g[31] + CONSTANT(0.062641347680800000)*g[35]; + y[4] += tf*g[27] + tg*f[27]; + y[27] += tf*g[4] + tg*f[4]; + t = f[4] * g[27] + f[27] * g[4]; + y[13] += CONSTANT(0.141757966610000010)*t; + y[31] += 
CONSTANT(-0.121034582549000000)*t; + y[35] += CONSTANT(0.062641347680800000)*t; + + // [4,28]: 12,30,34, + tf = CONSTANT(0.141757966609000000)*f[12] + CONSTANT(-0.191372478254000000)*f[30] + CONSTANT(0.097043558538899996)*f[34]; + tg = CONSTANT(0.141757966609000000)*g[12] + CONSTANT(-0.191372478254000000)*g[30] + CONSTANT(0.097043558538899996)*g[34]; + y[4] += tf*g[28] + tg*f[28]; + y[28] = tf*g[4] + tg*f[4]; + t = f[4] * g[28] + f[28] * g[4]; + y[12] += CONSTANT(0.141757966609000000)*t; + y[30] += CONSTANT(-0.191372478254000000)*t; + y[34] += CONSTANT(0.097043558538899996)*t; + + // [4,29]: 13,15,31,33, + tf = CONSTANT(-0.065621187395699998)*f[13] + CONSTANT(-0.016943317729299998)*f[15] + CONSTANT(0.140070311613999990)*f[31] + CONSTANT(0.121034582549000000)*f[33]; + tg = CONSTANT(-0.065621187395699998)*g[13] + CONSTANT(-0.016943317729299998)*g[15] + CONSTANT(0.140070311613999990)*g[31] + CONSTANT(0.121034582549000000)*g[33]; + y[4] += tf*g[29] + tg*f[29]; + y[29] += tf*g[4] + tg*f[4]; + t = f[4] * g[29] + f[29] * g[4]; + y[13] += CONSTANT(-0.065621187395699998)*t; + y[15] += CONSTANT(-0.016943317729299998)*t; + y[31] += CONSTANT(0.140070311613999990)*t; + y[33] += CONSTANT(0.121034582549000000)*t; + + // [5,5]: 0,6,8,20,22, + tf = CONSTANT(0.282094791773999990)*f[0] + CONSTANT(0.090111875786499998)*f[6] + CONSTANT(-0.156078347227999990)*f[8] + CONSTANT(-0.161197023870999990)*f[20] + CONSTANT(-0.180223751574000000)*f[22]; + tg = CONSTANT(0.282094791773999990)*g[0] + CONSTANT(0.090111875786499998)*g[6] + CONSTANT(-0.156078347227999990)*g[8] + CONSTANT(-0.161197023870999990)*g[20] + CONSTANT(-0.180223751574000000)*g[22]; + y[5] += tf*g[5] + tg*f[5]; + t = f[5] * g[5]; + y[0] += CONSTANT(0.282094791773999990)*t; + y[6] += CONSTANT(0.090111875786499998)*t; + y[8] += CONSTANT(-0.156078347227999990)*t; + y[20] += CONSTANT(-0.161197023870999990)*t; + y[22] += CONSTANT(-0.180223751574000000)*t; + + // [5,10]: 3,13,15,31,33, + tf = CONSTANT(0.184674390919999990)*f[3] + CONSTANT(0.115164716490000000)*f[13] + CONSTANT(-0.148677009678999990)*f[15] + CONSTANT(-0.083004965974099995)*f[31] + CONSTANT(-0.179311220383999990)*f[33]; + tg = CONSTANT(0.184674390919999990)*g[3] + CONSTANT(0.115164716490000000)*g[13] + CONSTANT(-0.148677009678999990)*g[15] + CONSTANT(-0.083004965974099995)*g[31] + CONSTANT(-0.179311220383999990)*g[33]; + y[5] += tf*g[10] + tg*f[10]; + y[10] += tf*g[5] + tg*f[5]; + t = f[5] * g[10] + f[10] * g[5]; + y[3] += CONSTANT(0.184674390919999990)*t; + y[13] += CONSTANT(0.115164716490000000)*t; + y[15] += CONSTANT(-0.148677009678999990)*t; + y[31] += CONSTANT(-0.083004965974099995)*t; + y[33] += CONSTANT(-0.179311220383999990)*t; + + // [5,11]: 2,12,14,30,32, + tf = CONSTANT(0.233596680327000010)*f[2] + CONSTANT(0.059470803871800003)*f[12] + CONSTANT(-0.115164716491000000)*f[14] + CONSTANT(-0.169433177294000010)*f[30] + CONSTANT(-0.173617342585000000)*f[32]; + tg = CONSTANT(0.233596680327000010)*g[2] + CONSTANT(0.059470803871800003)*g[12] + CONSTANT(-0.115164716491000000)*g[14] + CONSTANT(-0.169433177294000010)*g[30] + CONSTANT(-0.173617342585000000)*g[32]; + y[5] += tf*g[11] + tg*f[11]; + y[11] += tf*g[5] + tg*f[5]; + t = f[5] * g[11] + f[11] * g[5]; + y[2] += CONSTANT(0.233596680327000010)*t; + y[12] += CONSTANT(0.059470803871800003)*t; + y[14] += CONSTANT(-0.115164716491000000)*t; + y[30] += CONSTANT(-0.169433177294000010)*t; + y[32] += CONSTANT(-0.173617342585000000)*t; + + // [5,14]: 9,1,27,29, + tf = CONSTANT(0.148677009677999990)*f[9] + CONSTANT(-0.184674390923000000)*f[1] + 
CONSTANT(0.179311220382000010)*f[27] + CONSTANT(0.083004965973399999)*f[29]; + tg = CONSTANT(0.148677009677999990)*g[9] + CONSTANT(-0.184674390923000000)*g[1] + CONSTANT(0.179311220382000010)*g[27] + CONSTANT(0.083004965973399999)*g[29]; + y[5] += tf*g[14] + tg*f[14]; + y[14] += tf*g[5] + tg*f[5]; + t = f[5] * g[14] + f[14] * g[5]; + y[9] += CONSTANT(0.148677009677999990)*t; + y[1] += CONSTANT(-0.184674390923000000)*t; + y[27] += CONSTANT(0.179311220382000010)*t; + y[29] += CONSTANT(0.083004965973399999)*t; + + // [5,17]: 8,22,24, + tf = CONSTANT(0.168583882832999990)*f[8] + CONSTANT(0.132725386548000010)*f[22] + CONSTANT(-0.140463346189000000)*f[24]; + tg = CONSTANT(0.168583882832999990)*g[8] + CONSTANT(0.132725386548000010)*g[22] + CONSTANT(-0.140463346189000000)*g[24]; + y[5] += tf*g[17] + tg*f[17]; + y[17] = tf*g[5] + tg*f[5]; + t = f[5] * g[17] + f[17] * g[5]; + y[8] += CONSTANT(0.168583882832999990)*t; + y[22] += CONSTANT(0.132725386548000010)*t; + y[24] += CONSTANT(-0.140463346189000000)*t; + + // [5,18]: 7,21,23, + tf = CONSTANT(0.180223751571000010)*f[7] + CONSTANT(0.090297865407399994)*f[21] + CONSTANT(-0.132725386549000010)*f[23]; + tg = CONSTANT(0.180223751571000010)*g[7] + CONSTANT(0.090297865407399994)*g[21] + CONSTANT(-0.132725386549000010)*g[23]; + y[5] += tf*g[18] + tg*f[18]; + y[18] += tf*g[5] + tg*f[5]; + t = f[5] * g[18] + f[18] * g[5]; + y[7] += CONSTANT(0.180223751571000010)*t; + y[21] += CONSTANT(0.090297865407399994)*t; + y[23] += CONSTANT(-0.132725386549000010)*t; + + // [5,19]: 6,8,20,22, + tf = CONSTANT(0.220728115440999990)*f[6] + CONSTANT(0.063718718433900007)*f[8] + CONSTANT(0.044869370061299998)*f[20] + CONSTANT(-0.090297865408399999)*f[22]; + tg = CONSTANT(0.220728115440999990)*g[6] + CONSTANT(0.063718718433900007)*g[8] + CONSTANT(0.044869370061299998)*g[20] + CONSTANT(-0.090297865408399999)*g[22]; + y[5] += tf*g[19] + tg*f[19]; + y[19] += tf*g[5] + tg*f[5]; + t = f[5] * g[19] + f[19] * g[5]; + y[6] += CONSTANT(0.220728115440999990)*t; + y[8] += CONSTANT(0.063718718433900007)*t; + y[20] += CONSTANT(0.044869370061299998)*t; + y[22] += CONSTANT(-0.090297865408399999)*t; + + // [5,26]: 15,33,35, + tf = CONSTANT(0.155288072035000000)*f[15] + CONSTANT(0.138662534056999990)*f[33] + CONSTANT(-0.132882365179999990)*f[35]; + tg = CONSTANT(0.155288072035000000)*g[15] + CONSTANT(0.138662534056999990)*g[33] + CONSTANT(-0.132882365179999990)*g[35]; + y[5] += tf*g[26] + tg*f[26]; + y[26] += tf*g[5] + tg*f[5]; + t = f[5] * g[26] + f[26] * g[5]; + y[15] += CONSTANT(0.155288072035000000)*t; + y[33] += CONSTANT(0.138662534056999990)*t; + y[35] += CONSTANT(-0.132882365179999990)*t; + + // [5,28]: 15,13,31,33, + tf = CONSTANT(0.044827805096399997)*f[15] + CONSTANT(0.173617342584000000)*f[13] + CONSTANT(0.074118242118699995)*f[31] + CONSTANT(-0.114366930522000000)*f[33]; + tg = CONSTANT(0.044827805096399997)*g[15] + CONSTANT(0.173617342584000000)*g[13] + CONSTANT(0.074118242118699995)*g[31] + CONSTANT(-0.114366930522000000)*g[33]; + y[5] += tf*g[28] + tg*f[28]; + y[28] += tf*g[5] + tg*f[5]; + t = f[5] * g[28] + f[28] * g[5]; + y[15] += CONSTANT(0.044827805096399997)*t; + y[13] += CONSTANT(0.173617342584000000)*t; + y[31] += CONSTANT(0.074118242118699995)*t; + y[33] += CONSTANT(-0.114366930522000000)*t; + + // [5,29]: 12,30,32, + tf = CONSTANT(0.214317900578999990)*f[12] + CONSTANT(0.036165998945399999)*f[30] + CONSTANT(-0.074118242119099995)*f[32]; + tg = CONSTANT(0.214317900578999990)*g[12] + CONSTANT(0.036165998945399999)*g[30] + CONSTANT(-0.074118242119099995)*g[32]; + y[5] 
+= tf*g[29] + tg*f[29]; + y[29] += tf*g[5] + tg*f[5]; + t = f[5] * g[29] + f[29] * g[5]; + y[12] += CONSTANT(0.214317900578999990)*t; + y[30] += CONSTANT(0.036165998945399999)*t; + y[32] += CONSTANT(-0.074118242119099995)*t; + + // [5,32]: 9,27, + tf = CONSTANT(-0.044827805096799997)*f[9] + CONSTANT(0.114366930522000000)*f[27]; + tg = CONSTANT(-0.044827805096799997)*g[9] + CONSTANT(0.114366930522000000)*g[27]; + y[5] += tf*g[32] + tg*f[32]; + y[32] += tf*g[5] + tg*f[5]; + t = f[5] * g[32] + f[32] * g[5]; + y[9] += CONSTANT(-0.044827805096799997)*t; + y[27] += CONSTANT(0.114366930522000000)*t; + + // [5,34]: 9,27,25, + tf = CONSTANT(-0.155288072036000010)*f[9] + CONSTANT(-0.138662534059000000)*f[27] + CONSTANT(0.132882365179000010)*f[25]; + tg = CONSTANT(-0.155288072036000010)*g[9] + CONSTANT(-0.138662534059000000)*g[27] + CONSTANT(0.132882365179000010)*g[25]; + y[5] += tf*g[34] + tg*f[34]; + y[34] += tf*g[5] + tg*f[5]; + t = f[5] * g[34] + f[34] * g[5]; + y[9] += CONSTANT(-0.155288072036000010)*t; + y[27] += CONSTANT(-0.138662534059000000)*t; + y[25] += CONSTANT(0.132882365179000010)*t; + + // [6,6]: 0,6,20, + tf = CONSTANT(0.282094797560000000)*f[0] + CONSTANT(0.241795553185999990)*f[20]; + tg = CONSTANT(0.282094797560000000)*g[0] + CONSTANT(0.241795553185999990)*g[20]; + y[6] += tf*g[6] + tg*f[6]; + t = f[6] * g[6]; + y[0] += CONSTANT(0.282094797560000000)*t; + y[6] += CONSTANT(0.180223764527000010)*t; + y[20] += CONSTANT(0.241795553185999990)*t; + + // [7,7]: 6,0,8,20,22, + tf = CONSTANT(0.090111875786499998)*f[6] + CONSTANT(0.282094791773999990)*f[0] + CONSTANT(0.156078347227999990)*f[8] + CONSTANT(-0.161197023870999990)*f[20] + CONSTANT(0.180223751574000000)*f[22]; + tg = CONSTANT(0.090111875786499998)*g[6] + CONSTANT(0.282094791773999990)*g[0] + CONSTANT(0.156078347227999990)*g[8] + CONSTANT(-0.161197023870999990)*g[20] + CONSTANT(0.180223751574000000)*g[22]; + y[7] += tf*g[7] + tg*f[7]; + t = f[7] * g[7]; + y[6] += CONSTANT(0.090111875786499998)*t; + y[0] += CONSTANT(0.282094791773999990)*t; + y[8] += CONSTANT(0.156078347227999990)*t; + y[20] += CONSTANT(-0.161197023870999990)*t; + y[22] += CONSTANT(0.180223751574000000)*t; + + // [7,10]: 9,1,11,27,29, + tf = CONSTANT(0.148677009678999990)*f[9] + CONSTANT(0.184674390919999990)*f[1] + CONSTANT(0.115164716490000000)*f[11] + CONSTANT(0.179311220383999990)*f[27] + CONSTANT(-0.083004965974099995)*f[29]; + tg = CONSTANT(0.148677009678999990)*g[9] + CONSTANT(0.184674390919999990)*g[1] + CONSTANT(0.115164716490000000)*g[11] + CONSTANT(0.179311220383999990)*g[27] + CONSTANT(-0.083004965974099995)*g[29]; + y[7] += tf*g[10] + tg*f[10]; + y[10] += tf*g[7] + tg*f[7]; + t = f[7] * g[10] + f[10] * g[7]; + y[9] += CONSTANT(0.148677009678999990)*t; + y[1] += CONSTANT(0.184674390919999990)*t; + y[11] += CONSTANT(0.115164716490000000)*t; + y[27] += CONSTANT(0.179311220383999990)*t; + y[29] += CONSTANT(-0.083004965974099995)*t; + + // [7,13]: 12,2,14,30,32, + tf = CONSTANT(0.059470803871800003)*f[12] + CONSTANT(0.233596680327000010)*f[2] + CONSTANT(0.115164716491000000)*f[14] + CONSTANT(-0.169433177294000010)*f[30] + CONSTANT(0.173617342585000000)*f[32]; + tg = CONSTANT(0.059470803871800003)*g[12] + CONSTANT(0.233596680327000010)*g[2] + CONSTANT(0.115164716491000000)*g[14] + CONSTANT(-0.169433177294000010)*g[30] + CONSTANT(0.173617342585000000)*g[32]; + y[7] += tf*g[13] + tg*f[13]; + y[13] += tf*g[7] + tg*f[7]; + t = f[7] * g[13] + f[13] * g[7]; + y[12] += CONSTANT(0.059470803871800003)*t; + y[2] += CONSTANT(0.233596680327000010)*t; + y[14] += 
CONSTANT(0.115164716491000000)*t; + y[30] += CONSTANT(-0.169433177294000010)*t; + y[32] += CONSTANT(0.173617342585000000)*t; + + // [7,14]: 3,15,31,33, + tf = CONSTANT(0.184674390923000000)*f[3] + CONSTANT(0.148677009677999990)*f[15] + CONSTANT(-0.083004965973399999)*f[31] + CONSTANT(0.179311220382000010)*f[33]; + tg = CONSTANT(0.184674390923000000)*g[3] + CONSTANT(0.148677009677999990)*g[15] + CONSTANT(-0.083004965973399999)*g[31] + CONSTANT(0.179311220382000010)*g[33]; + y[7] += tf*g[14] + tg*f[14]; + y[14] += tf*g[7] + tg*f[7]; + t = f[7] * g[14] + f[14] * g[7]; + y[3] += CONSTANT(0.184674390923000000)*t; + y[15] += CONSTANT(0.148677009677999990)*t; + y[31] += CONSTANT(-0.083004965973399999)*t; + y[33] += CONSTANT(0.179311220382000010)*t; + + // [7,17]: 16,4,18, + tf = CONSTANT(0.140463346187999990)*f[16] + CONSTANT(0.168583882835000000)*f[4] + CONSTANT(0.132725386549000010)*f[18]; + tg = CONSTANT(0.140463346187999990)*g[16] + CONSTANT(0.168583882835000000)*g[4] + CONSTANT(0.132725386549000010)*g[18]; + y[7] += tf*g[17] + tg*f[17]; + y[17] += tf*g[7] + tg*f[7]; + t = f[7] * g[17] + f[17] * g[7]; + y[16] += CONSTANT(0.140463346187999990)*t; + y[4] += CONSTANT(0.168583882835000000)*t; + y[18] += CONSTANT(0.132725386549000010)*t; + + // [7,21]: 8,20,6,22, + tf = CONSTANT(-0.063718718433900007)*f[8] + CONSTANT(0.044869370061299998)*f[20] + CONSTANT(0.220728115440999990)*f[6] + CONSTANT(0.090297865408399999)*f[22]; + tg = CONSTANT(-0.063718718433900007)*g[8] + CONSTANT(0.044869370061299998)*g[20] + CONSTANT(0.220728115440999990)*g[6] + CONSTANT(0.090297865408399999)*g[22]; + y[7] += tf*g[21] + tg*f[21]; + y[21] += tf*g[7] + tg*f[7]; + t = f[7] * g[21] + f[21] * g[7]; + y[8] += CONSTANT(-0.063718718433900007)*t; + y[20] += CONSTANT(0.044869370061299998)*t; + y[6] += CONSTANT(0.220728115440999990)*t; + y[22] += CONSTANT(0.090297865408399999)*t; + + // [7,23]: 8,22,24, + tf = CONSTANT(0.168583882832999990)*f[8] + CONSTANT(0.132725386548000010)*f[22] + CONSTANT(0.140463346189000000)*f[24]; + tg = CONSTANT(0.168583882832999990)*g[8] + CONSTANT(0.132725386548000010)*g[22] + CONSTANT(0.140463346189000000)*g[24]; + y[7] += tf*g[23] + tg*f[23]; + y[23] += tf*g[7] + tg*f[7]; + t = f[7] * g[23] + f[23] * g[7]; + y[8] += CONSTANT(0.168583882832999990)*t; + y[22] += CONSTANT(0.132725386548000010)*t; + y[24] += CONSTANT(0.140463346189000000)*t; + + // [7,26]: 9,25,27, + tf = CONSTANT(0.155288072035000000)*f[9] + CONSTANT(0.132882365179999990)*f[25] + CONSTANT(0.138662534056999990)*f[27]; + tg = CONSTANT(0.155288072035000000)*g[9] + CONSTANT(0.132882365179999990)*g[25] + CONSTANT(0.138662534056999990)*g[27]; + y[7] += tf*g[26] + tg*f[26]; + y[26] += tf*g[7] + tg*f[7]; + t = f[7] * g[26] + f[26] * g[7]; + y[9] += CONSTANT(0.155288072035000000)*t; + y[25] += CONSTANT(0.132882365179999990)*t; + y[27] += CONSTANT(0.138662534056999990)*t; + + // [7,28]: 27,11,9,29, + tf = CONSTANT(0.114366930522000000)*f[27] + CONSTANT(0.173617342584000000)*f[11] + CONSTANT(-0.044827805096399997)*f[9] + CONSTANT(0.074118242118699995)*f[29]; + tg = CONSTANT(0.114366930522000000)*g[27] + CONSTANT(0.173617342584000000)*g[11] + CONSTANT(-0.044827805096399997)*g[9] + CONSTANT(0.074118242118699995)*g[29]; + y[7] += tf*g[28] + tg*f[28]; + y[28] += tf*g[7] + tg*f[7]; + t = f[7] * g[28] + f[28] * g[7]; + y[27] += CONSTANT(0.114366930522000000)*t; + y[11] += CONSTANT(0.173617342584000000)*t; + y[9] += CONSTANT(-0.044827805096399997)*t; + y[29] += CONSTANT(0.074118242118699995)*t; + + // [7,31]: 30,12,32, + tf = 
CONSTANT(0.036165998945399999)*f[30] + CONSTANT(0.214317900578999990)*f[12] + CONSTANT(0.074118242119099995)*f[32]; + tg = CONSTANT(0.036165998945399999)*g[30] + CONSTANT(0.214317900578999990)*g[12] + CONSTANT(0.074118242119099995)*g[32]; + y[7] += tf*g[31] + tg*f[31]; + y[31] += tf*g[7] + tg*f[7]; + t = f[7] * g[31] + f[31] * g[7]; + y[30] += CONSTANT(0.036165998945399999)*t; + y[12] += CONSTANT(0.214317900578999990)*t; + y[32] += CONSTANT(0.074118242119099995)*t; + + // [7,32]: 15,33, + tf = CONSTANT(-0.044827805096799997)*f[15] + CONSTANT(0.114366930522000000)*f[33]; + tg = CONSTANT(-0.044827805096799997)*g[15] + CONSTANT(0.114366930522000000)*g[33]; + y[7] += tf*g[32] + tg*f[32]; + y[32] += tf*g[7] + tg*f[7]; + t = f[7] * g[32] + f[32] * g[7]; + y[15] += CONSTANT(-0.044827805096799997)*t; + y[33] += CONSTANT(0.114366930522000000)*t; + + // [7,34]: 15,33,35, + tf = CONSTANT(0.155288072036000010)*f[15] + CONSTANT(0.138662534059000000)*f[33] + CONSTANT(0.132882365179000010)*f[35]; + tg = CONSTANT(0.155288072036000010)*g[15] + CONSTANT(0.138662534059000000)*g[33] + CONSTANT(0.132882365179000010)*g[35]; + y[7] += tf*g[34] + tg*f[34]; + y[34] += tf*g[7] + tg*f[7]; + t = f[7] * g[34] + f[34] * g[7]; + y[15] += CONSTANT(0.155288072036000010)*t; + y[33] += CONSTANT(0.138662534059000000)*t; + y[35] += CONSTANT(0.132882365179000010)*t; + + // [8,8]: 0,6,20,24, + tf = CONSTANT(0.282094791770000020)*f[0] + CONSTANT(-0.180223751576000010)*f[6] + CONSTANT(0.040299255967500003)*f[20] + CONSTANT(0.238413613505999990)*f[24]; + tg = CONSTANT(0.282094791770000020)*g[0] + CONSTANT(-0.180223751576000010)*g[6] + CONSTANT(0.040299255967500003)*g[20] + CONSTANT(0.238413613505999990)*g[24]; + y[8] += tf*g[8] + tg*f[8]; + t = f[8] * g[8]; + y[0] += CONSTANT(0.282094791770000020)*t; + y[6] += CONSTANT(-0.180223751576000010)*t; + y[20] += CONSTANT(0.040299255967500003)*t; + y[24] += CONSTANT(0.238413613505999990)*t; + + // [8,9]: 1,11,25,29, + tf = CONSTANT(0.226179013155000000)*f[1] + CONSTANT(-0.094031597259499999)*f[11] + CONSTANT(0.245532000541000000)*f[25] + CONSTANT(0.016943317729199998)*f[29]; + tg = CONSTANT(0.226179013155000000)*g[1] + CONSTANT(-0.094031597259499999)*g[11] + CONSTANT(0.245532000541000000)*g[25] + CONSTANT(0.016943317729199998)*g[29]; + y[8] += tf*g[9] + tg*f[9]; + y[9] += tf*g[8] + tg*f[8]; + t = f[8] * g[9] + f[9] * g[8]; + y[1] += CONSTANT(0.226179013155000000)*t; + y[11] += CONSTANT(-0.094031597259499999)*t; + y[25] += CONSTANT(0.245532000541000000)*t; + y[29] += CONSTANT(0.016943317729199998)*t; + + // [8,14]: 2,12,30,34, + tf = CONSTANT(0.184674390919999990)*f[2] + CONSTANT(-0.188063194517999990)*f[12] + CONSTANT(0.053579475144400000)*f[30] + CONSTANT(0.190188269816000010)*f[34]; + tg = CONSTANT(0.184674390919999990)*g[2] + CONSTANT(-0.188063194517999990)*g[12] + CONSTANT(0.053579475144400000)*g[30] + CONSTANT(0.190188269816000010)*g[34]; + y[8] += tf*g[14] + tg*f[14]; + y[14] += tf*g[8] + tg*f[8]; + t = f[8] * g[14] + f[14] * g[8]; + y[2] += CONSTANT(0.184674390919999990)*t; + y[12] += CONSTANT(-0.188063194517999990)*t; + y[30] += CONSTANT(0.053579475144400000)*t; + y[34] += CONSTANT(0.190188269816000010)*t; + + // [8,15]: 13,3,31,35, + tf = CONSTANT(-0.094031597259499999)*f[13] + CONSTANT(0.226179013155000000)*f[3] + CONSTANT(0.016943317729199998)*f[31] + CONSTANT(0.245532000541000000)*f[35]; + tg = CONSTANT(-0.094031597259499999)*g[13] + CONSTANT(0.226179013155000000)*g[3] + CONSTANT(0.016943317729199998)*g[31] + CONSTANT(0.245532000541000000)*g[35]; + y[8] += tf*g[15] + tg*f[15]; 
+ y[15] += tf*g[8] + tg*f[8]; + t = f[8] * g[15] + f[15] * g[8]; + y[13] += CONSTANT(-0.094031597259499999)*t; + y[3] += CONSTANT(0.226179013155000000)*t; + y[31] += CONSTANT(0.016943317729199998)*t; + y[35] += CONSTANT(0.245532000541000000)*t; + + // [8,22]: 6,20,24, + tf = CONSTANT(0.156078347226000000)*f[6] + CONSTANT(-0.190364615029000010)*f[20] + CONSTANT(-0.075080816691500005)*f[24]; + tg = CONSTANT(0.156078347226000000)*g[6] + CONSTANT(-0.190364615029000010)*g[20] + CONSTANT(-0.075080816691500005)*g[24]; + y[8] += tf*g[22] + tg*f[22]; + y[22] += tf*g[8] + tg*f[8]; + t = f[8] * g[22] + f[22] * g[8]; + y[6] += CONSTANT(0.156078347226000000)*t; + y[20] += CONSTANT(-0.190364615029000010)*t; + y[24] += CONSTANT(-0.075080816691500005)*t; + + // [8,26]: 10,28, + tf = CONSTANT(0.190188269806999990)*f[10] + CONSTANT(-0.097043558542400002)*f[28]; + tg = CONSTANT(0.190188269806999990)*g[10] + CONSTANT(-0.097043558542400002)*g[28]; + y[8] += tf*g[26] + tg*f[26]; + y[26] += tf*g[8] + tg*f[8]; + t = f[8] * g[26] + f[26] * g[8]; + y[10] += CONSTANT(0.190188269806999990)*t; + y[28] += CONSTANT(-0.097043558542400002)*t; + + // [8,27]: 25,11,29, + tf = CONSTANT(-0.062641347680800000)*f[25] + CONSTANT(0.141757966609000000)*f[11] + CONSTANT(-0.121034582550000010)*f[29]; + tg = CONSTANT(-0.062641347680800000)*g[25] + CONSTANT(0.141757966609000000)*g[11] + CONSTANT(-0.121034582550000010)*g[29]; + y[8] += tf*g[27] + tg*f[27]; + y[27] += tf*g[8] + tg*f[8]; + t = f[8] * g[27] + f[27] * g[8]; + y[25] += CONSTANT(-0.062641347680800000)*t; + y[11] += CONSTANT(0.141757966609000000)*t; + y[29] += CONSTANT(-0.121034582550000010)*t; + + // [8,32]: 30,12,34, + tf = CONSTANT(-0.191372478254000000)*f[30] + CONSTANT(0.141757966609000000)*f[12] + CONSTANT(-0.097043558538899996)*f[34]; + tg = CONSTANT(-0.191372478254000000)*g[30] + CONSTANT(0.141757966609000000)*g[12] + CONSTANT(-0.097043558538899996)*g[34]; + y[8] += tf*g[32] + tg*f[32]; + y[32] += tf*g[8] + tg*f[8]; + t = f[8] * g[32] + f[32] * g[8]; + y[30] += CONSTANT(-0.191372478254000000)*t; + y[12] += CONSTANT(0.141757966609000000)*t; + y[34] += CONSTANT(-0.097043558538899996)*t; + + // [8,33]: 13,31,35, + tf = CONSTANT(0.141757966609000000)*f[13] + CONSTANT(-0.121034582550000010)*f[31] + CONSTANT(-0.062641347680800000)*f[35]; + tg = CONSTANT(0.141757966609000000)*g[13] + CONSTANT(-0.121034582550000010)*g[31] + CONSTANT(-0.062641347680800000)*g[35]; + y[8] += tf*g[33] + tg*f[33]; + y[33] += tf*g[8] + tg*f[8]; + t = f[8] * g[33] + f[33] * g[8]; + y[13] += CONSTANT(0.141757966609000000)*t; + y[31] += CONSTANT(-0.121034582550000010)*t; + y[35] += CONSTANT(-0.062641347680800000)*t; + + // [9,9]: 6,0,20, + tf = CONSTANT(-0.210261043508000010)*f[6] + CONSTANT(0.282094791766999970)*f[0] + CONSTANT(0.076934943209800002)*f[20]; + tg = CONSTANT(-0.210261043508000010)*g[6] + CONSTANT(0.282094791766999970)*g[0] + CONSTANT(0.076934943209800002)*g[20]; + y[9] += tf*g[9] + tg*f[9]; + t = f[9] * g[9]; + y[6] += CONSTANT(-0.210261043508000010)*t; + y[0] += CONSTANT(0.282094791766999970)*t; + y[20] += CONSTANT(0.076934943209800002)*t; + + // [9,17]: 2,12,30, + tf = CONSTANT(0.162867503964999990)*f[2] + CONSTANT(-0.203550726872999990)*f[12] + CONSTANT(0.098140130728100003)*f[30]; + tg = CONSTANT(0.162867503964999990)*g[2] + CONSTANT(-0.203550726872999990)*g[12] + CONSTANT(0.098140130728100003)*g[30]; + y[9] += tf*g[17] + tg*f[17]; + y[17] += tf*g[9] + tg*f[9]; + t = f[9] * g[17] + f[17] * g[9]; + y[2] += CONSTANT(0.162867503964999990)*t; + y[12] += 
CONSTANT(-0.203550726872999990)*t; + y[30] += CONSTANT(0.098140130728100003)*t; + + // [9,18]: 3,13,31,35, + tf = CONSTANT(-0.043528171377799997)*f[3] + CONSTANT(0.133255230519000010)*f[13] + CONSTANT(-0.101584686310000010)*f[31] + CONSTANT(0.098140130731999994)*f[35]; + tg = CONSTANT(-0.043528171377799997)*g[3] + CONSTANT(0.133255230519000010)*g[13] + CONSTANT(-0.101584686310000010)*g[31] + CONSTANT(0.098140130731999994)*g[35]; + y[9] += tf*g[18] + tg*f[18]; + y[18] += tf*g[9] + tg*f[9]; + t = f[9] * g[18] + f[18] * g[9]; + y[3] += CONSTANT(-0.043528171377799997)*t; + y[13] += CONSTANT(0.133255230519000010)*t; + y[31] += CONSTANT(-0.101584686310000010)*t; + y[35] += CONSTANT(0.098140130731999994)*t; + + // [9,19]: 14,32,34, + tf = CONSTANT(-0.099322584600699995)*f[14] + CONSTANT(0.126698363970000010)*f[32] + CONSTANT(0.131668802180999990)*f[34]; + tg = CONSTANT(-0.099322584600699995)*g[14] + CONSTANT(0.126698363970000010)*g[32] + CONSTANT(0.131668802180999990)*g[34]; + y[9] += tf*g[19] + tg*f[19]; + y[19] += tf*g[9] + tg*f[9]; + t = f[9] * g[19] + f[19] * g[9]; + y[14] += CONSTANT(-0.099322584600699995)*t; + y[32] += CONSTANT(0.126698363970000010)*t; + y[34] += CONSTANT(0.131668802180999990)*t; + + // [9,22]: 1,11,25,29, + tf = CONSTANT(-0.043528171378199997)*f[1] + CONSTANT(0.133255230518000010)*f[11] + CONSTANT(-0.098140130732499997)*f[25] + CONSTANT(-0.101584686311000000)*f[29]; + tg = CONSTANT(-0.043528171378199997)*g[1] + CONSTANT(0.133255230518000010)*g[11] + CONSTANT(-0.098140130732499997)*g[25] + CONSTANT(-0.101584686311000000)*g[29]; + y[9] += tf*g[22] + tg*f[22]; + y[22] += tf*g[9] + tg*f[9]; + t = f[9] * g[22] + f[22] * g[9]; + y[1] += CONSTANT(-0.043528171378199997)*t; + y[11] += CONSTANT(0.133255230518000010)*t; + y[25] += CONSTANT(-0.098140130732499997)*t; + y[29] += CONSTANT(-0.101584686311000000)*t; + + // [9,27]: 6,20, + tf = CONSTANT(0.126792179874999990)*f[6] + CONSTANT(-0.196280261464999990)*f[20]; + tg = CONSTANT(0.126792179874999990)*g[6] + CONSTANT(-0.196280261464999990)*g[20]; + y[9] += tf*g[27] + tg*f[27]; + y[27] += tf*g[9] + tg*f[9]; + t = f[9] * g[27] + f[27] * g[9]; + y[6] += CONSTANT(0.126792179874999990)*t; + y[20] += CONSTANT(-0.196280261464999990)*t; + + // [10,10]: 0,20,24, + tf = CONSTANT(0.282094791771999980)*f[0] + CONSTANT(-0.179514867494000000)*f[20] + CONSTANT(-0.151717754049000010)*f[24]; + tg = CONSTANT(0.282094791771999980)*g[0] + CONSTANT(-0.179514867494000000)*g[20] + CONSTANT(-0.151717754049000010)*g[24]; + y[10] += tf*g[10] + tg*f[10]; + t = f[10] * g[10]; + y[0] += CONSTANT(0.282094791771999980)*t; + y[20] += CONSTANT(-0.179514867494000000)*t; + y[24] += CONSTANT(-0.151717754049000010)*t; + + // [10,16]: 14,32, + tf = CONSTANT(0.151717754044999990)*f[14] + CONSTANT(-0.077413979111300005)*f[32]; + tg = CONSTANT(0.151717754044999990)*g[14] + CONSTANT(-0.077413979111300005)*g[32]; + y[10] += tf*g[16] + tg*f[16]; + y[16] += tf*g[10] + tg*f[10]; + t = f[10] * g[16] + f[16] * g[10]; + y[14] += CONSTANT(0.151717754044999990)*t; + y[32] += CONSTANT(-0.077413979111300005)*t; + + // [10,17]: 13,3,31,35, + tf = CONSTANT(0.067850242288900006)*f[13] + CONSTANT(0.199471140200000010)*f[3] + CONSTANT(-0.113793659091000000)*f[31] + CONSTANT(-0.149911525925999990)*f[35]; + tg = CONSTANT(0.067850242288900006)*g[13] + CONSTANT(0.199471140200000010)*g[3] + CONSTANT(-0.113793659091000000)*g[31] + CONSTANT(-0.149911525925999990)*g[35]; + y[10] += tf*g[17] + tg*f[17]; + y[17] += tf*g[10] + tg*f[10]; + t = f[10] * g[17] + f[17] * g[10]; + y[13] += 
CONSTANT(0.067850242288900006)*t; + y[3] += CONSTANT(0.199471140200000010)*t; + y[31] += CONSTANT(-0.113793659091000000)*t; + y[35] += CONSTANT(-0.149911525925999990)*t; + + // [10,18]: 12,2,30,34, + tf = CONSTANT(-0.044418410173299998)*f[12] + CONSTANT(0.213243618621000000)*f[2] + CONSTANT(-0.171327458205000000)*f[30] + CONSTANT(-0.101358691177000000)*f[34]; + tg = CONSTANT(-0.044418410173299998)*g[12] + CONSTANT(0.213243618621000000)*g[2] + CONSTANT(-0.171327458205000000)*g[30] + CONSTANT(-0.101358691177000000)*g[34]; + y[10] += tf*g[18] + tg*f[18]; + y[18] += tf*g[10] + tg*f[10]; + t = f[10] * g[18] + f[18] * g[10]; + y[12] += CONSTANT(-0.044418410173299998)*t; + y[2] += CONSTANT(0.213243618621000000)*t; + y[30] += CONSTANT(-0.171327458205000000)*t; + y[34] += CONSTANT(-0.101358691177000000)*t; + + // [10,19]: 3,15,13,31,33, + tf = CONSTANT(-0.075393004386799994)*f[3] + CONSTANT(0.099322584599600000)*f[15] + CONSTANT(0.102579924281000000)*f[13] + CONSTANT(0.097749909976500002)*f[31] + CONSTANT(-0.025339672794100002)*f[33]; + tg = CONSTANT(-0.075393004386799994)*g[3] + CONSTANT(0.099322584599600000)*g[15] + CONSTANT(0.102579924281000000)*g[13] + CONSTANT(0.097749909976500002)*g[31] + CONSTANT(-0.025339672794100002)*g[33]; + y[10] += tf*g[19] + tg*f[19]; + y[19] += tf*g[10] + tg*f[10]; + t = f[10] * g[19] + f[19] * g[10]; + y[3] += CONSTANT(-0.075393004386799994)*t; + y[15] += CONSTANT(0.099322584599600000)*t; + y[13] += CONSTANT(0.102579924281000000)*t; + y[31] += CONSTANT(0.097749909976500002)*t; + y[33] += CONSTANT(-0.025339672794100002)*t; + + // [10,21]: 11,1,9,27,29, + tf = CONSTANT(0.102579924281000000)*f[11] + CONSTANT(-0.075393004386799994)*f[1] + CONSTANT(-0.099322584599600000)*f[9] + CONSTANT(0.025339672794100002)*f[27] + CONSTANT(0.097749909976500002)*f[29]; + tg = CONSTANT(0.102579924281000000)*g[11] + CONSTANT(-0.075393004386799994)*g[1] + CONSTANT(-0.099322584599600000)*g[9] + CONSTANT(0.025339672794100002)*g[27] + CONSTANT(0.097749909976500002)*g[29]; + y[10] += tf*g[21] + tg*f[21]; + y[21] += tf*g[10] + tg*f[10]; + t = f[10] * g[21] + f[21] * g[10]; + y[11] += CONSTANT(0.102579924281000000)*t; + y[1] += CONSTANT(-0.075393004386799994)*t; + y[9] += CONSTANT(-0.099322584599600000)*t; + y[27] += CONSTANT(0.025339672794100002)*t; + y[29] += CONSTANT(0.097749909976500002)*t; + + // [10,23]: 11,1,25,29, + tf = CONSTANT(-0.067850242288900006)*f[11] + CONSTANT(-0.199471140200000010)*f[1] + CONSTANT(0.149911525925999990)*f[25] + CONSTANT(0.113793659091000000)*f[29]; + tg = CONSTANT(-0.067850242288900006)*g[11] + CONSTANT(-0.199471140200000010)*g[1] + CONSTANT(0.149911525925999990)*g[25] + CONSTANT(0.113793659091000000)*g[29]; + y[10] += tf*g[23] + tg*f[23]; + y[23] += tf*g[10] + tg*f[10]; + t = f[10] * g[23] + f[23] * g[10]; + y[11] += CONSTANT(-0.067850242288900006)*t; + y[1] += CONSTANT(-0.199471140200000010)*t; + y[25] += CONSTANT(0.149911525925999990)*t; + y[29] += CONSTANT(0.113793659091000000)*t; + + // [10,28]: 6,20,24, + tf = CONSTANT(0.190188269814000000)*f[6] + CONSTANT(-0.065426753820500005)*f[20] + CONSTANT(0.077413979109600004)*f[24]; + tg = CONSTANT(0.190188269814000000)*g[6] + CONSTANT(-0.065426753820500005)*g[20] + CONSTANT(0.077413979109600004)*g[24]; + y[10] += tf*g[28] + tg*f[28]; + y[28] += tf*g[10] + tg*f[10]; + t = f[10] * g[28] + f[28] * g[10]; + y[6] += CONSTANT(0.190188269814000000)*t; + y[20] += CONSTANT(-0.065426753820500005)*t; + y[24] += CONSTANT(0.077413979109600004)*t; + + // [11,11]: 0,6,8,20,22, + tf = CONSTANT(0.282094791773999990)*f[0] + 
CONSTANT(0.126156626101000010)*f[6] + CONSTANT(-0.145673124078999990)*f[8] + CONSTANT(0.025644981070299999)*f[20] + CONSTANT(-0.114687841910000000)*f[22]; + tg = CONSTANT(0.282094791773999990)*g[0] + CONSTANT(0.126156626101000010)*g[6] + CONSTANT(-0.145673124078999990)*g[8] + CONSTANT(0.025644981070299999)*g[20] + CONSTANT(-0.114687841910000000)*g[22]; + y[11] += tf*g[11] + tg*f[11]; + t = f[11] * g[11]; + y[0] += CONSTANT(0.282094791773999990)*t; + y[6] += CONSTANT(0.126156626101000010)*t; + y[8] += CONSTANT(-0.145673124078999990)*t; + y[20] += CONSTANT(0.025644981070299999)*t; + y[22] += CONSTANT(-0.114687841910000000)*t; + + // [11,16]: 15,33,35, + tf = CONSTANT(-0.117520066953000000)*f[15] + CONSTANT(0.119929220739999990)*f[33] + CONSTANT(0.134084945035999990)*f[35]; + tg = CONSTANT(-0.117520066953000000)*g[15] + CONSTANT(0.119929220739999990)*g[33] + CONSTANT(0.134084945035999990)*g[35]; + y[11] += tf*g[16] + tg*f[16]; + y[16] += tf*g[11] + tg*f[11]; + t = f[11] * g[16] + f[16] * g[11]; + y[15] += CONSTANT(-0.117520066953000000)*t; + y[33] += CONSTANT(0.119929220739999990)*t; + y[35] += CONSTANT(0.134084945035999990)*t; + + // [11,18]: 3,13,15,31,33, + tf = CONSTANT(0.168583882834000000)*f[3] + CONSTANT(0.114687841909000000)*f[13] + CONSTANT(-0.133255230519000010)*f[15] + CONSTANT(0.075189952564900006)*f[31] + CONSTANT(-0.101990215611000000)*f[33]; + tg = CONSTANT(0.168583882834000000)*g[3] + CONSTANT(0.114687841909000000)*g[13] + CONSTANT(-0.133255230519000010)*g[15] + CONSTANT(0.075189952564900006)*g[31] + CONSTANT(-0.101990215611000000)*g[33]; + y[11] += tf*g[18] + tg*f[18]; + y[18] += tf*g[11] + tg*f[11]; + t = f[11] * g[18] + f[18] * g[11]; + y[3] += CONSTANT(0.168583882834000000)*t; + y[13] += CONSTANT(0.114687841909000000)*t; + y[15] += CONSTANT(-0.133255230519000010)*t; + y[31] += CONSTANT(0.075189952564900006)*t; + y[33] += CONSTANT(-0.101990215611000000)*t; + + // [11,19]: 2,14,12,30,32, + tf = CONSTANT(0.238413613504000000)*f[2] + CONSTANT(-0.102579924282000000)*f[14] + CONSTANT(0.099322584599300004)*f[12] + CONSTANT(0.009577496073830001)*f[30] + CONSTANT(-0.104682806112000000)*f[32]; + tg = CONSTANT(0.238413613504000000)*g[2] + CONSTANT(-0.102579924282000000)*g[14] + CONSTANT(0.099322584599300004)*g[12] + CONSTANT(0.009577496073830001)*g[30] + CONSTANT(-0.104682806112000000)*g[32]; + y[11] += tf*g[19] + tg*f[19]; + y[19] += tf*g[11] + tg*f[11]; + t = f[11] * g[19] + f[19] * g[11]; + y[2] += CONSTANT(0.238413613504000000)*t; + y[14] += CONSTANT(-0.102579924282000000)*t; + y[12] += CONSTANT(0.099322584599300004)*t; + y[30] += CONSTANT(0.009577496073830001)*t; + y[32] += CONSTANT(-0.104682806112000000)*t; + + // [11,24]: 9,25,27, + tf = CONSTANT(0.117520066950999990)*f[9] + CONSTANT(-0.134084945037000000)*f[25] + CONSTANT(-0.119929220742000010)*f[27]; + tg = CONSTANT(0.117520066950999990)*g[9] + CONSTANT(-0.134084945037000000)*g[25] + CONSTANT(-0.119929220742000010)*g[27]; + y[11] += tf*g[24] + tg*f[24]; + y[24] += tf*g[11] + tg*f[11]; + t = f[11] * g[24] + f[24] * g[11]; + y[9] += CONSTANT(0.117520066950999990)*t; + y[25] += CONSTANT(-0.134084945037000000)*t; + y[27] += CONSTANT(-0.119929220742000010)*t; + + // [11,29]: 6,20,22,8, + tf = CONSTANT(0.227318461243000010)*f[6] + CONSTANT(0.086019920779800002)*f[20] + CONSTANT(-0.075189952565200002)*f[22] + CONSTANT(0.065621187395299999)*f[8]; + tg = CONSTANT(0.227318461243000010)*g[6] + CONSTANT(0.086019920779800002)*g[20] + CONSTANT(-0.075189952565200002)*g[22] + CONSTANT(0.065621187395299999)*g[8]; + y[11] += tf*g[29] + 
tg*f[29]; + y[29] += tf*g[11] + tg*f[11]; + t = f[11] * g[29] + f[29] * g[11]; + y[6] += CONSTANT(0.227318461243000010)*t; + y[20] += CONSTANT(0.086019920779800002)*t; + y[22] += CONSTANT(-0.075189952565200002)*t; + y[8] += CONSTANT(0.065621187395299999)*t; + + // [12,12]: 0,6,20, + tf = CONSTANT(0.282094799871999980)*f[0] + CONSTANT(0.168208852954000010)*f[6] + CONSTANT(0.153869910786000010)*f[20]; + tg = CONSTANT(0.282094799871999980)*g[0] + CONSTANT(0.168208852954000010)*g[6] + CONSTANT(0.153869910786000010)*g[20]; + y[12] += tf*g[12] + tg*f[12]; + t = f[12] * g[12]; + y[0] += CONSTANT(0.282094799871999980)*t; + y[6] += CONSTANT(0.168208852954000010)*t; + y[20] += CONSTANT(0.153869910786000010)*t; + + // [12,30]: 20,6, + tf = CONSTANT(0.148373961712999990)*f[20] + CONSTANT(0.239614719999000000)*f[6]; + tg = CONSTANT(0.148373961712999990)*g[20] + CONSTANT(0.239614719999000000)*g[6]; + y[12] += tf*g[30] + tg*f[30]; + y[30] += tf*g[12] + tg*f[12]; + t = f[12] * g[30] + f[30] * g[12]; + y[20] += CONSTANT(0.148373961712999990)*t; + y[6] += CONSTANT(0.239614719999000000)*t; + + // [13,13]: 0,8,6,20,22, + tf = CONSTANT(0.282094791773999990)*f[0] + CONSTANT(0.145673124078999990)*f[8] + CONSTANT(0.126156626101000010)*f[6] + CONSTANT(0.025644981070299999)*f[20] + CONSTANT(0.114687841910000000)*f[22]; + tg = CONSTANT(0.282094791773999990)*g[0] + CONSTANT(0.145673124078999990)*g[8] + CONSTANT(0.126156626101000010)*g[6] + CONSTANT(0.025644981070299999)*g[20] + CONSTANT(0.114687841910000000)*g[22]; + y[13] += tf*g[13] + tg*f[13]; + t = f[13] * g[13]; + y[0] += CONSTANT(0.282094791773999990)*t; + y[8] += CONSTANT(0.145673124078999990)*t; + y[6] += CONSTANT(0.126156626101000010)*t; + y[20] += CONSTANT(0.025644981070299999)*t; + y[22] += CONSTANT(0.114687841910000000)*t; + + // [13,16]: 9,25,27, + tf = CONSTANT(-0.117520066953000000)*f[9] + CONSTANT(-0.134084945035999990)*f[25] + CONSTANT(0.119929220739999990)*f[27]; + tg = CONSTANT(-0.117520066953000000)*g[9] + CONSTANT(-0.134084945035999990)*g[25] + CONSTANT(0.119929220739999990)*g[27]; + y[13] += tf*g[16] + tg*f[16]; + y[16] += tf*g[13] + tg*f[13]; + t = f[13] * g[16] + f[16] * g[13]; + y[9] += CONSTANT(-0.117520066953000000)*t; + y[25] += CONSTANT(-0.134084945035999990)*t; + y[27] += CONSTANT(0.119929220739999990)*t; + + // [13,21]: 2,12,14,30,32, + tf = CONSTANT(0.238413613504000000)*f[2] + CONSTANT(0.099322584599300004)*f[12] + CONSTANT(0.102579924282000000)*f[14] + CONSTANT(0.009577496073830001)*f[30] + CONSTANT(0.104682806112000000)*f[32]; + tg = CONSTANT(0.238413613504000000)*g[2] + CONSTANT(0.099322584599300004)*g[12] + CONSTANT(0.102579924282000000)*g[14] + CONSTANT(0.009577496073830001)*g[30] + CONSTANT(0.104682806112000000)*g[32]; + y[13] += tf*g[21] + tg*f[21]; + y[21] += tf*g[13] + tg*f[13]; + t = f[13] * g[21] + f[21] * g[13]; + y[2] += CONSTANT(0.238413613504000000)*t; + y[12] += CONSTANT(0.099322584599300004)*t; + y[14] += CONSTANT(0.102579924282000000)*t; + y[30] += CONSTANT(0.009577496073830001)*t; + y[32] += CONSTANT(0.104682806112000000)*t; + + // [13,24]: 15,33,35, + tf = CONSTANT(-0.117520066950999990)*f[15] + CONSTANT(0.119929220742000010)*f[33] + CONSTANT(-0.134084945037000000)*f[35]; + tg = CONSTANT(-0.117520066950999990)*g[15] + CONSTANT(0.119929220742000010)*g[33] + CONSTANT(-0.134084945037000000)*g[35]; + y[13] += tf*g[24] + tg*f[24]; + y[24] += tf*g[13] + tg*f[13]; + t = f[13] * g[24] + f[24] * g[13]; + y[15] += CONSTANT(-0.117520066950999990)*t; + y[33] += CONSTANT(0.119929220742000010)*t; + y[35] += 
CONSTANT(-0.134084945037000000)*t; + + // [13,31]: 6,22,20,8, + tf = CONSTANT(0.227318461243000010)*f[6] + CONSTANT(0.075189952565200002)*f[22] + CONSTANT(0.086019920779800002)*f[20] + CONSTANT(-0.065621187395299999)*f[8]; + tg = CONSTANT(0.227318461243000010)*g[6] + CONSTANT(0.075189952565200002)*g[22] + CONSTANT(0.086019920779800002)*g[20] + CONSTANT(-0.065621187395299999)*g[8]; + y[13] += tf*g[31] + tg*f[31]; + y[31] += tf*g[13] + tg*f[13]; + t = f[13] * g[31] + f[31] * g[13]; + y[6] += CONSTANT(0.227318461243000010)*t; + y[22] += CONSTANT(0.075189952565200002)*t; + y[20] += CONSTANT(0.086019920779800002)*t; + y[8] += CONSTANT(-0.065621187395299999)*t; + + // [14,14]: 0,20,24, + tf = CONSTANT(0.282094791771999980)*f[0] + CONSTANT(-0.179514867494000000)*f[20] + CONSTANT(0.151717754049000010)*f[24]; + tg = CONSTANT(0.282094791771999980)*g[0] + CONSTANT(-0.179514867494000000)*g[20] + CONSTANT(0.151717754049000010)*g[24]; + y[14] += tf*g[14] + tg*f[14]; + t = f[14] * g[14]; + y[0] += CONSTANT(0.282094791771999980)*t; + y[20] += CONSTANT(-0.179514867494000000)*t; + y[24] += CONSTANT(0.151717754049000010)*t; + + // [14,17]: 11,1,25,29, + tf = CONSTANT(0.067850242288500007)*f[11] + CONSTANT(0.199471140196999990)*f[1] + CONSTANT(0.149911525925999990)*f[25] + CONSTANT(-0.113793659092000000)*f[29]; + tg = CONSTANT(0.067850242288500007)*g[11] + CONSTANT(0.199471140196999990)*g[1] + CONSTANT(0.149911525925999990)*g[25] + CONSTANT(-0.113793659092000000)*g[29]; + y[14] += tf*g[17] + tg*f[17]; + y[17] += tf*g[14] + tg*f[14]; + t = f[14] * g[17] + f[17] * g[14]; + y[11] += CONSTANT(0.067850242288500007)*t; + y[1] += CONSTANT(0.199471140196999990)*t; + y[25] += CONSTANT(0.149911525925999990)*t; + y[29] += CONSTANT(-0.113793659092000000)*t; + + // [14,22]: 12,2,30,34, + tf = CONSTANT(-0.044418410173299998)*f[12] + CONSTANT(0.213243618621000000)*f[2] + CONSTANT(-0.171327458205000000)*f[30] + CONSTANT(0.101358691177000000)*f[34]; + tg = CONSTANT(-0.044418410173299998)*g[12] + CONSTANT(0.213243618621000000)*g[2] + CONSTANT(-0.171327458205000000)*g[30] + CONSTANT(0.101358691177000000)*g[34]; + y[14] += tf*g[22] + tg*f[22]; + y[22] += tf*g[14] + tg*f[14]; + t = f[14] * g[22] + f[22] * g[14]; + y[12] += CONSTANT(-0.044418410173299998)*t; + y[2] += CONSTANT(0.213243618621000000)*t; + y[30] += CONSTANT(-0.171327458205000000)*t; + y[34] += CONSTANT(0.101358691177000000)*t; + + // [14,23]: 13,3,31,35, + tf = CONSTANT(0.067850242288500007)*f[13] + CONSTANT(0.199471140196999990)*f[3] + CONSTANT(-0.113793659092000000)*f[31] + CONSTANT(0.149911525925999990)*f[35]; + tg = CONSTANT(0.067850242288500007)*g[13] + CONSTANT(0.199471140196999990)*g[3] + CONSTANT(-0.113793659092000000)*g[31] + CONSTANT(0.149911525925999990)*g[35]; + y[14] += tf*g[23] + tg*f[23]; + y[23] += tf*g[14] + tg*f[14]; + t = f[14] * g[23] + f[23] * g[14]; + y[13] += CONSTANT(0.067850242288500007)*t; + y[3] += CONSTANT(0.199471140196999990)*t; + y[31] += CONSTANT(-0.113793659092000000)*t; + y[35] += CONSTANT(0.149911525925999990)*t; + + // [14,32]: 20,6,24, + tf = CONSTANT(-0.065426753820500005)*f[20] + CONSTANT(0.190188269814000000)*f[6] + CONSTANT(-0.077413979109600004)*f[24]; + tg = CONSTANT(-0.065426753820500005)*g[20] + CONSTANT(0.190188269814000000)*g[6] + CONSTANT(-0.077413979109600004)*g[24]; + y[14] += tf*g[32] + tg*f[32]; + y[32] += tf*g[14] + tg*f[14]; + t = f[14] * g[32] + f[32] * g[14]; + y[20] += CONSTANT(-0.065426753820500005)*t; + y[6] += CONSTANT(0.190188269814000000)*t; + y[24] += CONSTANT(-0.077413979109600004)*t; + + // [15,15]: 
0,6,20, + tf = CONSTANT(0.282094791766999970)*f[0] + CONSTANT(-0.210261043508000010)*f[6] + CONSTANT(0.076934943209800002)*f[20]; + tg = CONSTANT(0.282094791766999970)*g[0] + CONSTANT(-0.210261043508000010)*g[6] + CONSTANT(0.076934943209800002)*g[20]; + y[15] += tf*g[15] + tg*f[15]; + t = f[15] * g[15]; + y[0] += CONSTANT(0.282094791766999970)*t; + y[6] += CONSTANT(-0.210261043508000010)*t; + y[20] += CONSTANT(0.076934943209800002)*t; + + // [15,21]: 14,32,34, + tf = CONSTANT(-0.099322584600699995)*f[14] + CONSTANT(0.126698363970000010)*f[32] + CONSTANT(-0.131668802180999990)*f[34]; + tg = CONSTANT(-0.099322584600699995)*g[14] + CONSTANT(0.126698363970000010)*g[32] + CONSTANT(-0.131668802180999990)*g[34]; + y[15] += tf*g[21] + tg*f[21]; + y[21] += tf*g[15] + tg*f[15]; + t = f[15] * g[21] + f[21] * g[15]; + y[14] += CONSTANT(-0.099322584600699995)*t; + y[32] += CONSTANT(0.126698363970000010)*t; + y[34] += CONSTANT(-0.131668802180999990)*t; + + // [15,22]: 13,3,31,35, + tf = CONSTANT(0.133255230518000010)*f[13] + CONSTANT(-0.043528171378199997)*f[3] + CONSTANT(-0.101584686311000000)*f[31] + CONSTANT(-0.098140130732499997)*f[35]; + tg = CONSTANT(0.133255230518000010)*g[13] + CONSTANT(-0.043528171378199997)*g[3] + CONSTANT(-0.101584686311000000)*g[31] + CONSTANT(-0.098140130732499997)*g[35]; + y[15] += tf*g[22] + tg*f[22]; + y[22] += tf*g[15] + tg*f[15]; + t = f[15] * g[22] + f[22] * g[15]; + y[13] += CONSTANT(0.133255230518000010)*t; + y[3] += CONSTANT(-0.043528171378199997)*t; + y[31] += CONSTANT(-0.101584686311000000)*t; + y[35] += CONSTANT(-0.098140130732499997)*t; + + // [15,23]: 12,2,30, + tf = CONSTANT(-0.203550726872999990)*f[12] + CONSTANT(0.162867503964999990)*f[2] + CONSTANT(0.098140130728100003)*f[30]; + tg = CONSTANT(-0.203550726872999990)*g[12] + CONSTANT(0.162867503964999990)*g[2] + CONSTANT(0.098140130728100003)*g[30]; + y[15] += tf*g[23] + tg*f[23]; + y[23] += tf*g[15] + tg*f[15]; + t = f[15] * g[23] + f[23] * g[15]; + y[12] += CONSTANT(-0.203550726872999990)*t; + y[2] += CONSTANT(0.162867503964999990)*t; + y[30] += CONSTANT(0.098140130728100003)*t; + + // [15,33]: 6,20, + tf = CONSTANT(0.126792179874999990)*f[6] + CONSTANT(-0.196280261464999990)*f[20]; + tg = CONSTANT(0.126792179874999990)*g[6] + CONSTANT(-0.196280261464999990)*g[20]; + y[15] += tf*g[33] + tg*f[33]; + y[33] += tf*g[15] + tg*f[15]; + t = f[15] * g[33] + f[33] * g[15]; + y[6] += CONSTANT(0.126792179874999990)*t; + y[20] += CONSTANT(-0.196280261464999990)*t; + + // [16,16]: 0,6,20, + tf = CONSTANT(0.282094791763999990)*f[0] + CONSTANT(-0.229375683829000000)*f[6] + CONSTANT(0.106525305981000000)*f[20]; + tg = CONSTANT(0.282094791763999990)*g[0] + CONSTANT(-0.229375683829000000)*g[6] + CONSTANT(0.106525305981000000)*g[20]; + y[16] += tf*g[16] + tg*f[16]; + t = f[16] * g[16]; + y[0] += CONSTANT(0.282094791763999990)*t; + y[6] += CONSTANT(-0.229375683829000000)*t; + y[20] += CONSTANT(0.106525305981000000)*t; + + // [16,18]: 8,22, + tf = CONSTANT(-0.075080816693699995)*f[8] + CONSTANT(0.135045473380000000)*f[22]; + tg = CONSTANT(-0.075080816693699995)*g[8] + CONSTANT(0.135045473380000000)*g[22]; + y[16] += tf*g[18] + tg*f[18]; + y[18] += tf*g[16] + tg*f[16]; + t = f[16] * g[18] + f[18] * g[16]; + y[8] += CONSTANT(-0.075080816693699995)*t; + y[22] += CONSTANT(0.135045473380000000)*t; + + // [16,23]: 19,5, + tf = CONSTANT(-0.119098912754999990)*f[19] + CONSTANT(0.140463346187999990)*f[5]; + tg = CONSTANT(-0.119098912754999990)*g[19] + CONSTANT(0.140463346187999990)*g[5]; + y[16] += tf*g[23] + tg*f[23]; + y[23] += 
tf*g[16] + tg*f[16]; + t = f[16] * g[23] + f[23] * g[16]; + y[19] += CONSTANT(-0.119098912754999990)*t; + y[5] += CONSTANT(0.140463346187999990)*t; + + // [16,26]: 12,2,30, + tf = CONSTANT(-0.207723503645000000)*f[12] + CONSTANT(0.147319200325000010)*f[2] + CONSTANT(0.130197596199999990)*f[30]; + tg = CONSTANT(-0.207723503645000000)*g[12] + CONSTANT(0.147319200325000010)*g[2] + CONSTANT(0.130197596199999990)*g[30]; + y[16] += tf*g[26] + tg*f[26]; + y[26] += tf*g[16] + tg*f[16]; + t = f[16] * g[26] + f[26] * g[16]; + y[12] += CONSTANT(-0.207723503645000000)*t; + y[2] += CONSTANT(0.147319200325000010)*t; + y[30] += CONSTANT(0.130197596199999990)*t; + + // [16,28]: 14,32, + tf = CONSTANT(-0.077413979111300005)*f[14] + CONSTANT(0.128376561115000010)*f[32]; + tg = CONSTANT(-0.077413979111300005)*g[14] + CONSTANT(0.128376561115000010)*g[32]; + y[16] += tf*g[28] + tg*f[28]; + y[28] += tf*g[16] + tg*f[16]; + t = f[16] * g[28] + f[28] * g[16]; + y[14] += CONSTANT(-0.077413979111300005)*t; + y[32] += CONSTANT(0.128376561115000010)*t; + + // [16,29]: 15,33,35, + tf = CONSTANT(0.035835708931099997)*f[15] + CONSTANT(-0.118853600623999990)*f[33] + CONSTANT(-0.053152946071899999)*f[35]; + tg = CONSTANT(0.035835708931099997)*g[15] + CONSTANT(-0.118853600623999990)*g[33] + CONSTANT(-0.053152946071899999)*g[35]; + y[16] += tf*g[29] + tg*f[29]; + y[29] += tf*g[16] + tg*f[16]; + t = f[16] * g[29] + f[29] * g[16]; + y[15] += CONSTANT(0.035835708931099997)*t; + y[33] += CONSTANT(-0.118853600623999990)*t; + y[35] += CONSTANT(-0.053152946071899999)*t; + + // [16,31]: 27,9,25, + tf = CONSTANT(-0.118853600623999990)*f[27] + CONSTANT(0.035835708931099997)*f[9] + CONSTANT(0.053152946071899999)*f[25]; + tg = CONSTANT(-0.118853600623999990)*g[27] + CONSTANT(0.035835708931099997)*g[9] + CONSTANT(0.053152946071899999)*g[25]; + y[16] += tf*g[31] + tg*f[31]; + y[31] += tf*g[16] + tg*f[16]; + t = f[16] * g[31] + f[31] * g[16]; + y[27] += CONSTANT(-0.118853600623999990)*t; + y[9] += CONSTANT(0.035835708931099997)*t; + y[25] += CONSTANT(0.053152946071899999)*t; + + // [17,17]: 0,6,20, + tf = CONSTANT(0.282094791768999990)*f[0] + CONSTANT(-0.057343920955899998)*f[6] + CONSTANT(-0.159787958979000000)*f[20]; + tg = CONSTANT(0.282094791768999990)*g[0] + CONSTANT(-0.057343920955899998)*g[6] + CONSTANT(-0.159787958979000000)*g[20]; + y[17] += tf*g[17] + tg*f[17]; + t = f[17] * g[17]; + y[0] += CONSTANT(0.282094791768999990)*t; + y[6] += CONSTANT(-0.057343920955899998)*t; + y[20] += CONSTANT(-0.159787958979000000)*t; + + // [17,19]: 8,22,24, + tf = CONSTANT(-0.112621225039000000)*f[8] + CONSTANT(0.045015157794100001)*f[22] + CONSTANT(0.119098912753000000)*f[24]; + tg = CONSTANT(-0.112621225039000000)*g[8] + CONSTANT(0.045015157794100001)*g[22] + CONSTANT(0.119098912753000000)*g[24]; + y[17] += tf*g[19] + tg*f[19]; + y[19] += tf*g[17] + tg*f[17]; + t = f[17] * g[19] + f[19] * g[17]; + y[8] += CONSTANT(-0.112621225039000000)*t; + y[22] += CONSTANT(0.045015157794100001)*t; + y[24] += CONSTANT(0.119098912753000000)*t; + + // [17,21]: 16,4,18, + tf = CONSTANT(-0.119098912754999990)*f[16] + CONSTANT(-0.112621225039000000)*f[4] + CONSTANT(0.045015157794399997)*f[18]; + tg = CONSTANT(-0.119098912754999990)*g[16] + CONSTANT(-0.112621225039000000)*g[4] + CONSTANT(0.045015157794399997)*g[18]; + y[17] += tf*g[21] + tg*f[21]; + y[21] += tf*g[17] + tg*f[17]; + t = f[17] * g[21] + f[21] * g[17]; + y[16] += CONSTANT(-0.119098912754999990)*t; + y[4] += CONSTANT(-0.112621225039000000)*t; + y[18] += CONSTANT(0.045015157794399997)*t; + + // [17,26]: 
3,13,31, + tf = CONSTANT(0.208340811096000000)*f[3] + CONSTANT(0.029982305185199998)*f[13] + CONSTANT(-0.118853600623999990)*f[31]; + tg = CONSTANT(0.208340811096000000)*g[3] + CONSTANT(0.029982305185199998)*g[13] + CONSTANT(-0.118853600623999990)*g[31]; + y[17] += tf*g[26] + tg*f[26]; + y[26] += tf*g[17] + tg*f[17]; + t = f[17] * g[26] + f[26] * g[17]; + y[3] += CONSTANT(0.208340811096000000)*t; + y[13] += CONSTANT(0.029982305185199998)*t; + y[31] += CONSTANT(-0.118853600623999990)*t; + + // [17,27]: 12,2,30, + tf = CONSTANT(-0.103861751821000010)*f[12] + CONSTANT(0.196425600433000000)*f[2] + CONSTANT(-0.130197596204999990)*f[30]; + tg = CONSTANT(-0.103861751821000010)*g[12] + CONSTANT(0.196425600433000000)*g[2] + CONSTANT(-0.130197596204999990)*g[30]; + y[17] += tf*g[27] + tg*f[27]; + y[27] += tf*g[17] + tg*f[17]; + t = f[17] * g[27] + f[27] * g[17]; + y[12] += CONSTANT(-0.103861751821000010)*t; + y[2] += CONSTANT(0.196425600433000000)*t; + y[30] += CONSTANT(-0.130197596204999990)*t; + + // [17,28]: 13,3,31,35, + tf = CONSTANT(0.121172043789000000)*f[13] + CONSTANT(-0.060142811686500000)*f[3] + CONSTANT(0.034310079156700000)*f[31] + CONSTANT(0.099440056652200001)*f[35]; + tg = CONSTANT(0.121172043789000000)*g[13] + CONSTANT(-0.060142811686500000)*g[3] + CONSTANT(0.034310079156700000)*g[31] + CONSTANT(0.099440056652200001)*g[35]; + y[17] += tf*g[28] + tg*f[28]; + y[28] += tf*g[17] + tg*f[17]; + t = f[17] * g[28] + f[28] * g[17]; + y[13] += CONSTANT(0.121172043789000000)*t; + y[3] += CONSTANT(-0.060142811686500000)*t; + y[31] += CONSTANT(0.034310079156700000)*t; + y[35] += CONSTANT(0.099440056652200001)*t; + + // [17,32]: 11,1,25,29, + tf = CONSTANT(0.121172043788000010)*f[11] + CONSTANT(-0.060142811686900000)*f[1] + CONSTANT(-0.099440056652700004)*f[25] + CONSTANT(0.034310079156599997)*f[29]; + tg = CONSTANT(0.121172043788000010)*g[11] + CONSTANT(-0.060142811686900000)*g[1] + CONSTANT(-0.099440056652700004)*g[25] + CONSTANT(0.034310079156599997)*g[29]; + y[17] += tf*g[32] + tg*f[32]; + y[32] += tf*g[17] + tg*f[17]; + t = f[17] * g[32] + f[32] * g[17]; + y[11] += CONSTANT(0.121172043788000010)*t; + y[1] += CONSTANT(-0.060142811686900000)*t; + y[25] += CONSTANT(-0.099440056652700004)*t; + y[29] += CONSTANT(0.034310079156599997)*t; + + // [17,34]: 29,11,1, + tf = CONSTANT(0.118853600623000000)*f[29] + CONSTANT(-0.029982305185400002)*f[11] + CONSTANT(-0.208340811100000000)*f[1]; + tg = CONSTANT(0.118853600623000000)*g[29] + CONSTANT(-0.029982305185400002)*g[11] + CONSTANT(-0.208340811100000000)*g[1]; + y[17] += tf*g[34] + tg*f[34]; + y[34] += tf*g[17] + tg*f[17]; + t = f[17] * g[34] + f[34] * g[17]; + y[29] += CONSTANT(0.118853600623000000)*t; + y[11] += CONSTANT(-0.029982305185400002)*t; + y[1] += CONSTANT(-0.208340811100000000)*t; + + // [18,18]: 6,0,20,24, + tf = CONSTANT(0.065535909662600006)*f[6] + CONSTANT(0.282094791771999980)*f[0] + CONSTANT(-0.083698454702400005)*f[20] + CONSTANT(-0.135045473384000000)*f[24]; + tg = CONSTANT(0.065535909662600006)*g[6] + CONSTANT(0.282094791771999980)*g[0] + CONSTANT(-0.083698454702400005)*g[20] + CONSTANT(-0.135045473384000000)*g[24]; + y[18] += tf*g[18] + tg*f[18]; + t = f[18] * g[18]; + y[6] += CONSTANT(0.065535909662600006)*t; + y[0] += CONSTANT(0.282094791771999980)*t; + y[20] += CONSTANT(-0.083698454702400005)*t; + y[24] += CONSTANT(-0.135045473384000000)*t; + + // [18,19]: 7,21,23, + tf = CONSTANT(0.090297865407399994)*f[7] + CONSTANT(0.102084782359000000)*f[21] + CONSTANT(-0.045015157794399997)*f[23]; + tg = CONSTANT(0.090297865407399994)*g[7] 
+ CONSTANT(0.102084782359000000)*g[21] + CONSTANT(-0.045015157794399997)*g[23]; + y[18] += tf*g[19] + tg*f[19]; + y[19] += tf*g[18] + tg*f[18]; + t = f[18] * g[19] + f[19] * g[18]; + y[7] += CONSTANT(0.090297865407399994)*t; + y[21] += CONSTANT(0.102084782359000000)*t; + y[23] += CONSTANT(-0.045015157794399997)*t; + + // [18,25]: 15,33, + tf = CONSTANT(-0.098140130731999994)*f[15] + CONSTANT(0.130197596202000000)*f[33]; + tg = CONSTANT(-0.098140130731999994)*g[15] + CONSTANT(0.130197596202000000)*g[33]; + y[18] += tf*g[25] + tg*f[25]; + y[25] += tf*g[18] + tg*f[18]; + t = f[18] * g[25] + f[25] * g[18]; + y[15] += CONSTANT(-0.098140130731999994)*t; + y[33] += CONSTANT(0.130197596202000000)*t; + + // [18,26]: 14,32, + tf = CONSTANT(0.101358691174000000)*f[14] + CONSTANT(0.084042186965900004)*f[32]; + tg = CONSTANT(0.101358691174000000)*g[14] + CONSTANT(0.084042186965900004)*g[32]; + y[18] += tf*g[26] + tg*f[26]; + y[26] += tf*g[18] + tg*f[18]; + t = f[18] * g[26] + f[26] * g[18]; + y[14] += CONSTANT(0.101358691174000000)*t; + y[32] += CONSTANT(0.084042186965900004)*t; + + // [18,27]: 13,3,35, + tf = CONSTANT(0.101990215611000000)*f[13] + CONSTANT(0.183739324705999990)*f[3] + CONSTANT(-0.130197596202000000)*f[35]; + tg = CONSTANT(0.101990215611000000)*g[13] + CONSTANT(0.183739324705999990)*g[3] + CONSTANT(-0.130197596202000000)*g[35]; + y[18] += tf*g[27] + tg*f[27]; + y[27] += tf*g[18] + tg*f[18]; + t = f[18] * g[27] + f[27] * g[18]; + y[13] += CONSTANT(0.101990215611000000)*t; + y[3] += CONSTANT(0.183739324705999990)*t; + y[35] += CONSTANT(-0.130197596202000000)*t; + + // [18,28]: 2,12,30,34, + tf = CONSTANT(0.225033795606000010)*f[2] + CONSTANT(0.022664492358099999)*f[12] + CONSTANT(-0.099440056651100006)*f[30] + CONSTANT(-0.084042186968800003)*f[34]; + tg = CONSTANT(0.225033795606000010)*g[2] + CONSTANT(0.022664492358099999)*g[12] + CONSTANT(-0.099440056651100006)*g[30] + CONSTANT(-0.084042186968800003)*g[34]; + y[18] += tf*g[28] + tg*f[28]; + y[28] += tf*g[18] + tg*f[18]; + t = f[18] * g[28] + f[28] * g[18]; + y[2] += CONSTANT(0.225033795606000010)*t; + y[12] += CONSTANT(0.022664492358099999)*t; + y[30] += CONSTANT(-0.099440056651100006)*t; + y[34] += CONSTANT(-0.084042186968800003)*t; + + // [18,29]: 3,13,15,31, + tf = CONSTANT(-0.085054779966799998)*f[3] + CONSTANT(0.075189952564900006)*f[13] + CONSTANT(0.101584686310000010)*f[15] + CONSTANT(0.097043558538999999)*f[31]; + tg = CONSTANT(-0.085054779966799998)*g[3] + CONSTANT(0.075189952564900006)*g[13] + CONSTANT(0.101584686310000010)*g[15] + CONSTANT(0.097043558538999999)*g[31]; + y[18] += tf*g[29] + tg*f[29]; + y[29] += tf*g[18] + tg*f[18]; + t = f[18] * g[29] + f[29] * g[18]; + y[3] += CONSTANT(-0.085054779966799998)*t; + y[13] += CONSTANT(0.075189952564900006)*t; + y[15] += CONSTANT(0.101584686310000010)*t; + y[31] += CONSTANT(0.097043558538999999)*t; + + // [19,19]: 6,8,0,20,22, + tf = CONSTANT(0.139263808033999990)*f[6] + CONSTANT(-0.141889406570999990)*f[8] + CONSTANT(0.282094791773999990)*f[0] + CONSTANT(0.068480553847200004)*f[20] + CONSTANT(-0.102084782360000000)*f[22]; + tg = CONSTANT(0.139263808033999990)*g[6] + CONSTANT(-0.141889406570999990)*g[8] + CONSTANT(0.282094791773999990)*g[0] + CONSTANT(0.068480553847200004)*g[20] + CONSTANT(-0.102084782360000000)*g[22]; + y[19] += tf*g[19] + tg*f[19]; + t = f[19] * g[19]; + y[6] += CONSTANT(0.139263808033999990)*t; + y[8] += CONSTANT(-0.141889406570999990)*t; + y[0] += CONSTANT(0.282094791773999990)*t; + y[20] += CONSTANT(0.068480553847200004)*t; + y[22] += 
CONSTANT(-0.102084782360000000)*t; + + // [19,25]: 34, + tf = CONSTANT(-0.130197596205999990)*f[34]; + tg = CONSTANT(-0.130197596205999990)*g[34]; + y[19] += tf*g[25] + tg*f[25]; + y[25] += tf*g[19] + tg*f[19]; + t = f[19] * g[25] + f[25] * g[19]; + y[34] += CONSTANT(-0.130197596205999990)*t; + + // [19,26]: 15,35, + tf = CONSTANT(-0.131668802182000000)*f[15] + CONSTANT(0.130197596204999990)*f[35]; + tg = CONSTANT(-0.131668802182000000)*g[15] + CONSTANT(0.130197596204999990)*g[35]; + y[19] += tf*g[26] + tg*f[26]; + y[26] += tf*g[19] + tg*f[19]; + t = f[19] * g[26] + f[26] * g[19]; + y[15] += CONSTANT(-0.131668802182000000)*t; + y[35] += CONSTANT(0.130197596204999990)*t; + + // [19,27]: 14,32, + tf = CONSTANT(0.025339672793899998)*f[14] + CONSTANT(0.084042186967699994)*f[32]; + tg = CONSTANT(0.025339672793899998)*g[14] + CONSTANT(0.084042186967699994)*g[32]; + y[19] += tf*g[27] + tg*f[27]; + y[27] += tf*g[19] + tg*f[19]; + t = f[19] * g[27] + f[27] * g[19]; + y[14] += CONSTANT(0.025339672793899998)*t; + y[32] += CONSTANT(0.084042186967699994)*t; + + // [19,28]: 13,3,15,31,33, + tf = CONSTANT(0.104682806111000000)*f[13] + CONSTANT(0.159122922869999990)*f[3] + CONSTANT(-0.126698363970000010)*f[15] + CONSTANT(0.090775936911399999)*f[31] + CONSTANT(-0.084042186968400004)*f[33]; + tg = CONSTANT(0.104682806111000000)*g[13] + CONSTANT(0.159122922869999990)*g[3] + CONSTANT(-0.126698363970000010)*g[15] + CONSTANT(0.090775936911399999)*g[31] + CONSTANT(-0.084042186968400004)*g[33]; + y[19] += tf*g[28] + tg*f[28]; + y[28] += tf*g[19] + tg*f[19]; + t = f[19] * g[28] + f[28] * g[19]; + y[13] += CONSTANT(0.104682806111000000)*t; + y[3] += CONSTANT(0.159122922869999990)*t; + y[15] += CONSTANT(-0.126698363970000010)*t; + y[31] += CONSTANT(0.090775936911399999)*t; + y[33] += CONSTANT(-0.084042186968400004)*t; + + // [19,29]: 12,14,2,30,32, + tf = CONSTANT(0.115089467124000010)*f[12] + CONSTANT(-0.097749909977199997)*f[14] + CONSTANT(0.240571246744999990)*f[2] + CONSTANT(0.053152946072499999)*f[30] + CONSTANT(-0.090775936912099994)*f[32]; + tg = CONSTANT(0.115089467124000010)*g[12] + CONSTANT(-0.097749909977199997)*g[14] + CONSTANT(0.240571246744999990)*g[2] + CONSTANT(0.053152946072499999)*g[30] + CONSTANT(-0.090775936912099994)*g[32]; + y[19] += tf*g[29] + tg*f[29]; + y[29] += tf*g[19] + tg*f[19]; + t = f[19] * g[29] + f[29] * g[19]; + y[12] += CONSTANT(0.115089467124000010)*t; + y[14] += CONSTANT(-0.097749909977199997)*t; + y[2] += CONSTANT(0.240571246744999990)*t; + y[30] += CONSTANT(0.053152946072499999)*t; + y[32] += CONSTANT(-0.090775936912099994)*t; + + // [20,20]: 6,0,20, + tf = CONSTANT(0.163839797503000010)*f[6] + CONSTANT(0.282094802232000010)*f[0]; + tg = CONSTANT(0.163839797503000010)*g[6] + CONSTANT(0.282094802232000010)*g[0]; + y[20] += tf*g[20] + tg*f[20]; + t = f[20] * g[20]; + y[6] += CONSTANT(0.163839797503000010)*t; + y[0] += CONSTANT(0.282094802232000010)*t; + y[20] += CONSTANT(0.136961139005999990)*t; + + // [21,21]: 6,20,0,8,22, + tf = CONSTANT(0.139263808033999990)*f[6] + CONSTANT(0.068480553847200004)*f[20] + CONSTANT(0.282094791773999990)*f[0] + CONSTANT(0.141889406570999990)*f[8] + CONSTANT(0.102084782360000000)*f[22]; + tg = CONSTANT(0.139263808033999990)*g[6] + CONSTANT(0.068480553847200004)*g[20] + CONSTANT(0.282094791773999990)*g[0] + CONSTANT(0.141889406570999990)*g[8] + CONSTANT(0.102084782360000000)*g[22]; + y[21] += tf*g[21] + tg*f[21]; + t = f[21] * g[21]; + y[6] += CONSTANT(0.139263808033999990)*t; + y[20] += CONSTANT(0.068480553847200004)*t; + y[0] += 
CONSTANT(0.282094791773999990)*t; + y[8] += CONSTANT(0.141889406570999990)*t; + y[22] += CONSTANT(0.102084782360000000)*t; + + // [21,23]: 8,22,24, + tf = CONSTANT(-0.112621225039000000)*f[8] + CONSTANT(0.045015157794100001)*f[22] + CONSTANT(-0.119098912753000000)*f[24]; + tg = CONSTANT(-0.112621225039000000)*g[8] + CONSTANT(0.045015157794100001)*g[22] + CONSTANT(-0.119098912753000000)*g[24]; + y[21] += tf*g[23] + tg*f[23]; + y[23] += tf*g[21] + tg*f[21]; + t = f[21] * g[23] + f[23] * g[21]; + y[8] += CONSTANT(-0.112621225039000000)*t; + y[22] += CONSTANT(0.045015157794100001)*t; + y[24] += CONSTANT(-0.119098912753000000)*t; + + // [21,26]: 9,25, + tf = CONSTANT(-0.131668802182000000)*f[9] + CONSTANT(-0.130197596204999990)*f[25]; + tg = CONSTANT(-0.131668802182000000)*g[9] + CONSTANT(-0.130197596204999990)*g[25]; + y[21] += tf*g[26] + tg*f[26]; + y[26] += tf*g[21] + tg*f[21]; + t = f[21] * g[26] + f[26] * g[21]; + y[9] += CONSTANT(-0.131668802182000000)*t; + y[25] += CONSTANT(-0.130197596204999990)*t; + + // [21,28]: 27,1,11,9,29, + tf = CONSTANT(0.084042186968400004)*f[27] + CONSTANT(0.159122922869999990)*f[1] + CONSTANT(0.104682806111000000)*f[11] + CONSTANT(0.126698363970000010)*f[9] + CONSTANT(0.090775936911399999)*f[29]; + tg = CONSTANT(0.084042186968400004)*g[27] + CONSTANT(0.159122922869999990)*g[1] + CONSTANT(0.104682806111000000)*g[11] + CONSTANT(0.126698363970000010)*g[9] + CONSTANT(0.090775936911399999)*g[29]; + y[21] += tf*g[28] + tg*f[28]; + y[28] += tf*g[21] + tg*f[21]; + t = f[21] * g[28] + f[28] * g[21]; + y[27] += CONSTANT(0.084042186968400004)*t; + y[1] += CONSTANT(0.159122922869999990)*t; + y[11] += CONSTANT(0.104682806111000000)*t; + y[9] += CONSTANT(0.126698363970000010)*t; + y[29] += CONSTANT(0.090775936911399999)*t; + + // [21,31]: 14,2,30,12,32, + tf = CONSTANT(0.097749909977199997)*f[14] + CONSTANT(0.240571246744999990)*f[2] + CONSTANT(0.053152946072499999)*f[30] + CONSTANT(0.115089467124000010)*f[12] + CONSTANT(0.090775936912099994)*f[32]; + tg = CONSTANT(0.097749909977199997)*g[14] + CONSTANT(0.240571246744999990)*g[2] + CONSTANT(0.053152946072499999)*g[30] + CONSTANT(0.115089467124000010)*g[12] + CONSTANT(0.090775936912099994)*g[32]; + y[21] += tf*g[31] + tg*f[31]; + y[31] += tf*g[21] + tg*f[21]; + t = f[21] * g[31] + f[31] * g[21]; + y[14] += CONSTANT(0.097749909977199997)*t; + y[2] += CONSTANT(0.240571246744999990)*t; + y[30] += CONSTANT(0.053152946072499999)*t; + y[12] += CONSTANT(0.115089467124000010)*t; + y[32] += CONSTANT(0.090775936912099994)*t; + + // [21,33]: 32,14, + tf = CONSTANT(0.084042186967699994)*f[32] + CONSTANT(0.025339672793899998)*f[14]; + tg = CONSTANT(0.084042186967699994)*g[32] + CONSTANT(0.025339672793899998)*g[14]; + y[21] += tf*g[33] + tg*f[33]; + y[33] += tf*g[21] + tg*f[21]; + t = f[21] * g[33] + f[33] * g[21]; + y[32] += CONSTANT(0.084042186967699994)*t; + y[14] += CONSTANT(0.025339672793899998)*t; + + // [21,34]: 35, + tf = CONSTANT(-0.130197596205999990)*f[35]; + tg = CONSTANT(-0.130197596205999990)*g[35]; + y[21] += tf*g[34] + tg*f[34]; + y[34] += tf*g[21] + tg*f[21]; + t = f[21] * g[34] + f[34] * g[21]; + y[35] += CONSTANT(-0.130197596205999990)*t; + + // [22,22]: 6,20,0,24, + tf = CONSTANT(0.065535909662600006)*f[6] + CONSTANT(-0.083698454702400005)*f[20] + CONSTANT(0.282094791771999980)*f[0] + CONSTANT(0.135045473384000000)*f[24]; + tg = CONSTANT(0.065535909662600006)*g[6] + CONSTANT(-0.083698454702400005)*g[20] + CONSTANT(0.282094791771999980)*g[0] + CONSTANT(0.135045473384000000)*g[24]; + y[22] += tf*g[22] + tg*f[22]; + t = 
f[22] * g[22]; + y[6] += CONSTANT(0.065535909662600006)*t; + y[20] += CONSTANT(-0.083698454702400005)*t; + y[0] += CONSTANT(0.282094791771999980)*t; + y[24] += CONSTANT(0.135045473384000000)*t; + + // [22,26]: 10,28, + tf = CONSTANT(0.101358691174000000)*f[10] + CONSTANT(0.084042186965900004)*f[28]; + tg = CONSTANT(0.101358691174000000)*g[10] + CONSTANT(0.084042186965900004)*g[28]; + y[22] += tf*g[26] + tg*f[26]; + y[26] += tf*g[22] + tg*f[22]; + t = f[22] * g[26] + f[26] * g[22]; + y[10] += CONSTANT(0.101358691174000000)*t; + y[28] += CONSTANT(0.084042186965900004)*t; + + // [22,27]: 1,11,25, + tf = CONSTANT(0.183739324704000010)*f[1] + CONSTANT(0.101990215611000000)*f[11] + CONSTANT(0.130197596200999990)*f[25]; + tg = CONSTANT(0.183739324704000010)*g[1] + CONSTANT(0.101990215611000000)*g[11] + CONSTANT(0.130197596200999990)*g[25]; + y[22] += tf*g[27] + tg*f[27]; + y[27] += tf*g[22] + tg*f[22]; + t = f[22] * g[27] + f[27] * g[22]; + y[1] += CONSTANT(0.183739324704000010)*t; + y[11] += CONSTANT(0.101990215611000000)*t; + y[25] += CONSTANT(0.130197596200999990)*t; + + // [22,32]: 2,30,12,34, + tf = CONSTANT(0.225033795606000010)*f[2] + CONSTANT(-0.099440056651100006)*f[30] + CONSTANT(0.022664492358099999)*f[12] + CONSTANT(0.084042186968800003)*f[34]; + tg = CONSTANT(0.225033795606000010)*g[2] + CONSTANT(-0.099440056651100006)*g[30] + CONSTANT(0.022664492358099999)*g[12] + CONSTANT(0.084042186968800003)*g[34]; + y[22] += tf*g[32] + tg*f[32]; + y[32] += tf*g[22] + tg*f[22]; + t = f[22] * g[32] + f[32] * g[22]; + y[2] += CONSTANT(0.225033795606000010)*t; + y[30] += CONSTANT(-0.099440056651100006)*t; + y[12] += CONSTANT(0.022664492358099999)*t; + y[34] += CONSTANT(0.084042186968800003)*t; + + // [22,33]: 3,13,35, + tf = CONSTANT(0.183739324704000010)*f[3] + CONSTANT(0.101990215611000000)*f[13] + CONSTANT(0.130197596200999990)*f[35]; + tg = CONSTANT(0.183739324704000010)*g[3] + CONSTANT(0.101990215611000000)*g[13] + CONSTANT(0.130197596200999990)*g[35]; + y[22] += tf*g[33] + tg*f[33]; + y[33] += tf*g[22] + tg*f[22]; + t = f[22] * g[33] + f[33] * g[22]; + y[3] += CONSTANT(0.183739324704000010)*t; + y[13] += CONSTANT(0.101990215611000000)*t; + y[35] += CONSTANT(0.130197596200999990)*t; + + // [23,23]: 6,20,0, + tf = CONSTANT(-0.057343920955899998)*f[6] + CONSTANT(-0.159787958979000000)*f[20] + CONSTANT(0.282094791768999990)*f[0]; + tg = CONSTANT(-0.057343920955899998)*g[6] + CONSTANT(-0.159787958979000000)*g[20] + CONSTANT(0.282094791768999990)*g[0]; + y[23] += tf*g[23] + tg*f[23]; + t = f[23] * g[23]; + y[6] += CONSTANT(-0.057343920955899998)*t; + y[20] += CONSTANT(-0.159787958979000000)*t; + y[0] += CONSTANT(0.282094791768999990)*t; + + // [23,26]: 1,11,29, + tf = CONSTANT(0.208340811096000000)*f[1] + CONSTANT(0.029982305185199998)*f[11] + CONSTANT(-0.118853600623999990)*f[29]; + tg = CONSTANT(0.208340811096000000)*g[1] + CONSTANT(0.029982305185199998)*g[11] + CONSTANT(-0.118853600623999990)*g[29]; + y[23] += tf*g[26] + tg*f[26]; + y[26] += tf*g[23] + tg*f[23]; + t = f[23] * g[26] + f[26] * g[23]; + y[1] += CONSTANT(0.208340811096000000)*t; + y[11] += CONSTANT(0.029982305185199998)*t; + y[29] += CONSTANT(-0.118853600623999990)*t; + + // [23,28]: 25,11,1,29, + tf = CONSTANT(-0.099440056652200001)*f[25] + CONSTANT(-0.121172043789000000)*f[11] + CONSTANT(0.060142811686500000)*f[1] + CONSTANT(-0.034310079156700000)*f[29]; + tg = CONSTANT(-0.099440056652200001)*g[25] + CONSTANT(-0.121172043789000000)*g[11] + CONSTANT(0.060142811686500000)*g[1] + CONSTANT(-0.034310079156700000)*g[29]; + y[23] += 
tf*g[28] + tg*f[28]; + y[28] += tf*g[23] + tg*f[23]; + t = f[23] * g[28] + f[28] * g[23]; + y[25] += CONSTANT(-0.099440056652200001)*t; + y[11] += CONSTANT(-0.121172043789000000)*t; + y[1] += CONSTANT(0.060142811686500000)*t; + y[29] += CONSTANT(-0.034310079156700000)*t; + + // [23,32]: 31,13,3,35, + tf = CONSTANT(0.034310079156599997)*f[31] + CONSTANT(0.121172043788000010)*f[13] + CONSTANT(-0.060142811686900000)*f[3] + CONSTANT(-0.099440056652700004)*f[35]; + tg = CONSTANT(0.034310079156599997)*g[31] + CONSTANT(0.121172043788000010)*g[13] + CONSTANT(-0.060142811686900000)*g[3] + CONSTANT(-0.099440056652700004)*g[35]; + y[23] += tf*g[32] + tg*f[32]; + y[32] += tf*g[23] + tg*f[23]; + t = f[23] * g[32] + f[32] * g[23]; + y[31] += CONSTANT(0.034310079156599997)*t; + y[13] += CONSTANT(0.121172043788000010)*t; + y[3] += CONSTANT(-0.060142811686900000)*t; + y[35] += CONSTANT(-0.099440056652700004)*t; + + // [23,33]: 2,30,12, + tf = CONSTANT(0.196425600433000000)*f[2] + CONSTANT(-0.130197596204999990)*f[30] + CONSTANT(-0.103861751821000010)*f[12]; + tg = CONSTANT(0.196425600433000000)*g[2] + CONSTANT(-0.130197596204999990)*g[30] + CONSTANT(-0.103861751821000010)*g[12]; + y[23] += tf*g[33] + tg*f[33]; + y[33] += tf*g[23] + tg*f[23]; + t = f[23] * g[33] + f[33] * g[23]; + y[2] += CONSTANT(0.196425600433000000)*t; + y[30] += CONSTANT(-0.130197596204999990)*t; + y[12] += CONSTANT(-0.103861751821000010)*t; + + // [23,34]: 3,13,31, + tf = CONSTANT(0.208340811100000000)*f[3] + CONSTANT(0.029982305185400002)*f[13] + CONSTANT(-0.118853600623000000)*f[31]; + tg = CONSTANT(0.208340811100000000)*g[3] + CONSTANT(0.029982305185400002)*g[13] + CONSTANT(-0.118853600623000000)*g[31]; + y[23] += tf*g[34] + tg*f[34]; + y[34] += tf*g[23] + tg*f[23]; + t = f[23] * g[34] + f[34] * g[23]; + y[3] += CONSTANT(0.208340811100000000)*t; + y[13] += CONSTANT(0.029982305185400002)*t; + y[31] += CONSTANT(-0.118853600623000000)*t; + + // [24,24]: 6,0,20, + tf = CONSTANT(-0.229375683829000000)*f[6] + CONSTANT(0.282094791763999990)*f[0] + CONSTANT(0.106525305981000000)*f[20]; + tg = CONSTANT(-0.229375683829000000)*g[6] + CONSTANT(0.282094791763999990)*g[0] + CONSTANT(0.106525305981000000)*g[20]; + y[24] += tf*g[24] + tg*f[24]; + t = f[24] * g[24]; + y[6] += CONSTANT(-0.229375683829000000)*t; + y[0] += CONSTANT(0.282094791763999990)*t; + y[20] += CONSTANT(0.106525305981000000)*t; + + // [24,29]: 9,27,25, + tf = CONSTANT(-0.035835708931400000)*f[9] + CONSTANT(0.118853600623000000)*f[27] + CONSTANT(0.053152946071199997)*f[25]; + tg = CONSTANT(-0.035835708931400000)*g[9] + CONSTANT(0.118853600623000000)*g[27] + CONSTANT(0.053152946071199997)*g[25]; + y[24] += tf*g[29] + tg*f[29]; + y[29] += tf*g[24] + tg*f[24]; + t = f[24] * g[29] + f[29] * g[24]; + y[9] += CONSTANT(-0.035835708931400000)*t; + y[27] += CONSTANT(0.118853600623000000)*t; + y[25] += CONSTANT(0.053152946071199997)*t; + + // [24,31]: 15,33,35, + tf = CONSTANT(0.035835708931400000)*f[15] + CONSTANT(-0.118853600623000000)*f[33] + CONSTANT(0.053152946071199997)*f[35]; + tg = CONSTANT(0.035835708931400000)*g[15] + CONSTANT(-0.118853600623000000)*g[33] + CONSTANT(0.053152946071199997)*g[35]; + y[24] += tf*g[31] + tg*f[31]; + y[31] += tf*g[24] + tg*f[24]; + t = f[24] * g[31] + f[31] * g[24]; + y[15] += CONSTANT(0.035835708931400000)*t; + y[33] += CONSTANT(-0.118853600623000000)*t; + y[35] += CONSTANT(0.053152946071199997)*t; + + // [24,34]: 12,30,2, + tf = CONSTANT(-0.207723503645000000)*f[12] + CONSTANT(0.130197596199999990)*f[30] + CONSTANT(0.147319200325000010)*f[2]; + tg = 
CONSTANT(-0.207723503645000000)*g[12] + CONSTANT(0.130197596199999990)*g[30] + CONSTANT(0.147319200325000010)*g[2]; + y[24] += tf*g[34] + tg*f[34]; + y[34] += tf*g[24] + tg*f[24]; + t = f[24] * g[34] + f[34] * g[24]; + y[12] += CONSTANT(-0.207723503645000000)*t; + y[30] += CONSTANT(0.130197596199999990)*t; + y[2] += CONSTANT(0.147319200325000010)*t; + + // [25,25]: 0,6,20, + tf = CONSTANT(0.282094791761999970)*f[0] + CONSTANT(-0.242608896358999990)*f[6] + CONSTANT(0.130197596198000000)*f[20]; + tg = CONSTANT(0.282094791761999970)*g[0] + CONSTANT(-0.242608896358999990)*g[6] + CONSTANT(0.130197596198000000)*g[20]; + y[25] += tf*g[25] + tg*f[25]; + t = f[25] * g[25]; + y[0] += CONSTANT(0.282094791761999970)*t; + y[6] += CONSTANT(-0.242608896358999990)*t; + y[20] += CONSTANT(0.130197596198000000)*t; + + // [26,26]: 6,20,0, + tf = CONSTANT(-0.097043558542400002)*f[6] + CONSTANT(-0.130197596207000000)*f[20] + CONSTANT(0.282094791766000000)*f[0]; + tg = CONSTANT(-0.097043558542400002)*g[6] + CONSTANT(-0.130197596207000000)*g[20] + CONSTANT(0.282094791766000000)*g[0]; + y[26] += tf*g[26] + tg*f[26]; + t = f[26] * g[26]; + y[6] += CONSTANT(-0.097043558542400002)*t; + y[20] += CONSTANT(-0.130197596207000000)*t; + y[0] += CONSTANT(0.282094791766000000)*t; + + // [27,27]: 0,20,6, + tf = CONSTANT(0.282094791770000020)*f[0] + CONSTANT(-0.130197596204999990)*f[20] + CONSTANT(0.016173926423100001)*f[6]; + tg = CONSTANT(0.282094791770000020)*g[0] + CONSTANT(-0.130197596204999990)*g[20] + CONSTANT(0.016173926423100001)*g[6]; + y[27] += tf*g[27] + tg*f[27]; + t = f[27] * g[27]; + y[0] += CONSTANT(0.282094791770000020)*t; + y[20] += CONSTANT(-0.130197596204999990)*t; + y[6] += CONSTANT(0.016173926423100001)*t; + + // [28,28]: 6,0,20,24, + tf = CONSTANT(0.097043558538800007)*f[6] + CONSTANT(0.282094791771999980)*f[0] + CONSTANT(-0.021699599367299999)*f[20] + CONSTANT(-0.128376561118000000)*f[24]; + tg = CONSTANT(0.097043558538800007)*g[6] + CONSTANT(0.282094791771999980)*g[0] + CONSTANT(-0.021699599367299999)*g[20] + CONSTANT(-0.128376561118000000)*g[24]; + y[28] += tf*g[28] + tg*f[28]; + t = f[28] * g[28]; + y[6] += CONSTANT(0.097043558538800007)*t; + y[0] += CONSTANT(0.282094791771999980)*t; + y[20] += CONSTANT(-0.021699599367299999)*t; + y[24] += CONSTANT(-0.128376561118000000)*t; + + // [29,29]: 20,6,0,22,8, + tf = CONSTANT(0.086798397468799998)*f[20] + CONSTANT(0.145565337808999990)*f[6] + CONSTANT(0.282094791773999990)*f[0] + CONSTANT(-0.097043558539500002)*f[22] + CONSTANT(-0.140070311615000000)*f[8]; + tg = CONSTANT(0.086798397468799998)*g[20] + CONSTANT(0.145565337808999990)*g[6] + CONSTANT(0.282094791773999990)*g[0] + CONSTANT(-0.097043558539500002)*g[22] + CONSTANT(-0.140070311615000000)*g[8]; + y[29] += tf*g[29] + tg*f[29]; + t = f[29] * g[29]; + y[20] += CONSTANT(0.086798397468799998)*t; + y[6] += CONSTANT(0.145565337808999990)*t; + y[0] += CONSTANT(0.282094791773999990)*t; + y[22] += CONSTANT(-0.097043558539500002)*t; + y[8] += CONSTANT(-0.140070311615000000)*t; + + // [30,30]: 0,20,6, + tf = CONSTANT(0.282094804531000000)*f[0] + CONSTANT(0.130197634486000000)*f[20] + CONSTANT(0.161739292769000010)*f[6]; + tg = CONSTANT(0.282094804531000000)*g[0] + CONSTANT(0.130197634486000000)*g[20] + CONSTANT(0.161739292769000010)*g[6]; + y[30] += tf*g[30] + tg*f[30]; + t = f[30] * g[30]; + y[0] += CONSTANT(0.282094804531000000)*t; + y[20] += CONSTANT(0.130197634486000000)*t; + y[6] += CONSTANT(0.161739292769000010)*t; + + // [31,31]: 6,8,20,22,0, + tf = CONSTANT(0.145565337808999990)*f[6] + 
CONSTANT(0.140070311615000000)*f[8] + CONSTANT(0.086798397468799998)*f[20] + CONSTANT(0.097043558539500002)*f[22] + CONSTANT(0.282094791773999990)*f[0]; + tg = CONSTANT(0.145565337808999990)*g[6] + CONSTANT(0.140070311615000000)*g[8] + CONSTANT(0.086798397468799998)*g[20] + CONSTANT(0.097043558539500002)*g[22] + CONSTANT(0.282094791773999990)*g[0]; + y[31] += tf*g[31] + tg*f[31]; + t = f[31] * g[31]; + y[6] += CONSTANT(0.145565337808999990)*t; + y[8] += CONSTANT(0.140070311615000000)*t; + y[20] += CONSTANT(0.086798397468799998)*t; + y[22] += CONSTANT(0.097043558539500002)*t; + y[0] += CONSTANT(0.282094791773999990)*t; + + // [32,32]: 0,24,20,6, + tf = CONSTANT(0.282094791771999980)*f[0] + CONSTANT(0.128376561118000000)*f[24] + CONSTANT(-0.021699599367299999)*f[20] + CONSTANT(0.097043558538800007)*f[6]; + tg = CONSTANT(0.282094791771999980)*g[0] + CONSTANT(0.128376561118000000)*g[24] + CONSTANT(-0.021699599367299999)*g[20] + CONSTANT(0.097043558538800007)*g[6]; + y[32] += tf*g[32] + tg*f[32]; + t = f[32] * g[32]; + y[0] += CONSTANT(0.282094791771999980)*t; + y[24] += CONSTANT(0.128376561118000000)*t; + y[20] += CONSTANT(-0.021699599367299999)*t; + y[6] += CONSTANT(0.097043558538800007)*t; + + // [33,33]: 6,20,0, + tf = CONSTANT(0.016173926423100001)*f[6] + CONSTANT(-0.130197596204999990)*f[20] + CONSTANT(0.282094791770000020)*f[0]; + tg = CONSTANT(0.016173926423100001)*g[6] + CONSTANT(-0.130197596204999990)*g[20] + CONSTANT(0.282094791770000020)*g[0]; + y[33] += tf*g[33] + tg*f[33]; + t = f[33] * g[33]; + y[6] += CONSTANT(0.016173926423100001)*t; + y[20] += CONSTANT(-0.130197596204999990)*t; + y[0] += CONSTANT(0.282094791770000020)*t; + + // [34,34]: 20,6,0, + tf = CONSTANT(-0.130197596207000000)*f[20] + CONSTANT(-0.097043558542400002)*f[6] + CONSTANT(0.282094791766000000)*f[0]; + tg = CONSTANT(-0.130197596207000000)*g[20] + CONSTANT(-0.097043558542400002)*g[6] + CONSTANT(0.282094791766000000)*g[0]; + y[34] += tf*g[34] + tg*f[34]; + t = f[34] * g[34]; + y[20] += CONSTANT(-0.130197596207000000)*t; + y[6] += CONSTANT(-0.097043558542400002)*t; + y[0] += CONSTANT(0.282094791766000000)*t; + + // [35,35]: 6,0,20, + tf = CONSTANT(-0.242608896358999990)*f[6] + CONSTANT(0.282094791761999970)*f[0] + CONSTANT(0.130197596198000000)*f[20]; + tg = CONSTANT(-0.242608896358999990)*g[6] + CONSTANT(0.282094791761999970)*g[0] + CONSTANT(0.130197596198000000)*g[20]; + y[35] += tf*g[35] + tg*f[35]; + t = f[35] * g[35]; + y[6] += CONSTANT(-0.242608896358999990)*t; + y[0] += CONSTANT(0.282094791761999970)*t; + y[20] += CONSTANT(0.130197596198000000)*t; + + // multiply count=2527 + + return y; +} + + +//------------------------------------------------------------------------------------- +// Evaluates a directional light and returns spectral SH data. The output +// vector is computed so that if the intensity of R/G/B is unit the resulting +// exit radiance of a point directly under the light on a diffuse object with +// an albedo of 1 would be 1.0. This will compute 3 spectral samples, resultR +// has to be specified, while resultG and resultB are optional. 
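A minimal usage sketch of the directional-light evaluator described above (illustration only, not part of the patch; assumes DirectXSH.h is included and `using namespace DirectX;` is in effect):

    float shR[9], shG[9], shB[9];                               // order*order coefficients per channel (order 3)
    XMVECTOR lightDir = XMVector3Normalize(XMVectorSet(0.3f, -1.0f, 0.2f, 0.0f));
    XMVECTOR lightClr = XMVectorSet(1.0f, 1.0f, 1.0f, 0.0f);    // unit-intensity white
    if (XMSHEvalDirectionalLight(3, lightDir, lightClr, shR, shG, shB))
    {
        // Each array now holds 9 SH coefficients, scaled so that a unit-albedo
        // diffuse surface directly under the light exits radiance 1.0 per channel.
    }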
+// +// http://msdn.microsoft.com/en-us/library/windows/desktop/bb204988.aspx +//------------------------------------------------------------------------------------- +_Use_decl_annotations_ +bool XM_CALLCONV DirectX::XMSHEvalDirectionalLight( + size_t order, + FXMVECTOR dir, + FXMVECTOR color, + float *resultR, + float *resultG, + float *resultB) noexcept +{ + if (!resultR) + return false; + + if (order < XM_SH_MINORDER || order > XM_SH_MAXORDER) + return false; + + XMFLOAT3A clr; + XMStoreFloat3A(&clr, color); + + float fTmp[XM_SH_MAXORDER * XM_SH_MAXORDER]; + + XMSHEvalDirection(fTmp, order, dir); // evaluate the BF in this direction... + + // now compute "normalization" and scale vector for each valid spectral band + const float fNorm = XM_PI / CosWtInt(order); + + const size_t numcoeff = order*order; + + const float fRScale = fNorm * clr.x; + + for (size_t i = 0; i < numcoeff; ++i) + { + resultR[i] = fTmp[i] * fRScale; + } + + if (resultG) + { + const float fGScale = fNorm * clr.y; + + for (size_t i = 0; i < numcoeff; ++i) + { + resultG[i] = fTmp[i] * fGScale; + } + } + + if (resultB) + { + const float fBScale = fNorm * clr.z; + + for (size_t i = 0; i < numcoeff; ++i) + { + resultB[i] = fTmp[i] * fBScale; + } + } + + return true; +} + + +//------------------------------------------------------------------------------------ +// Evaluates a spherical light and returns spectral SH data. There is no +// normalization of the intensity of the light like there is for directional +// lights, care has to be taken when specifiying the intensities. This will +// compute 3 spectral samples, resultR has to be specified, while resultG and +// resultB are optional. +// +// http://msdn.microsoft.com/en-us/library/windows/desktop/bb205451.aspx +//------------------------------------------------------------------------------------- +_Use_decl_annotations_ +bool XM_CALLCONV DirectX::XMSHEvalSphericalLight( + size_t order, + FXMVECTOR pos, + float radius, + FXMVECTOR color, + float *resultR, + float *resultG, + float *resultB) noexcept +{ + if (!resultR) + return false; + + if (radius < 0.f) + return false; + + const float fDist = XMVectorGetX(XMVector3Length(pos)); + + // WARNING: fDist should not be < radius - otherwise light contains origin + + //const float fSinConeAngle = (fDist <= radius) ? 0.99999f : radius/fDist; + const float fConeAngle = (fDist <= radius) ? (XM_PIDIV2) : asinf(radius / fDist); + + XMVECTOR dir = XMVector3Normalize(pos); + + float fTmpDir[XM_SH_MAXORDER* XM_SH_MAXORDER]; // rotation "vector" + float fTmpL0[XM_SH_MAXORDER]; + + // + // Sphere at distance fDist, the cone angle is determined by looking at the + // right triangle with one side (the hypotenuse) beind the vector from the + // origin to the center of the sphere, another side is from the origin to + // a point on the sphere whose normal is perpendicular to the given side (this + // is one of the points on the cone that is defined by the projection of the sphere + // through the origin - we want to find the angle of this cone) and the final + // side being from the center of the sphere to the point of tagency (the two + // sides conected to this are at a right angle by construction.) + // From trig we know that sin(theta) = ||opposite||/||hypotenuse||, where + // ||opposite|| = Radius, ||hypotenuse|| = fDist + // theta is the angle of the cone that subtends the sphere from the origin + // + + // no default normalization is done for this case, have to be careful how + // you represent the coefficients... 
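To make the trigonometry above concrete: sin(theta) = radius / fDist, so fConeAngle = asinf(radius / fDist). For example, a sphere of radius 1 at distance fDist = 2 subtends a half-angle of asinf(0.5), roughly 0.5236 rad (30 degrees); when fDist <= radius the origin lies inside or on the sphere and the code clamps the angle to XM_PIDIV2.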
+ + const float fNewNorm = 1.0f;///(fSinConeAngle*fSinConeAngle); + + ComputeCapInt(order, fConeAngle, fTmpL0); + + XMFLOAT3A vd; + XMStoreFloat3(&vd, dir); + + const float fX = vd.x; + const float fY = vd.y; + const float fZ = vd.z; + + switch (order) + { + case 2: + sh_eval_basis_1(fX, fY, fZ, fTmpDir); + break; + + case 3: + sh_eval_basis_2(fX, fY, fZ, fTmpDir); + break; + + case 4: + sh_eval_basis_3(fX, fY, fZ, fTmpDir); + break; + + case 5: + sh_eval_basis_4(fX, fY, fZ, fTmpDir); + break; + + case 6: + sh_eval_basis_5(fX, fY, fZ, fTmpDir); + break; + + default: + assert(order < XM_SH_MINORDER || order > XM_SH_MAXORDER); + return false; + } + + XMFLOAT3A clr; + XMStoreFloat3A(&clr, color); + + for (size_t i = 0; i < order; ++i) + { + const size_t cNumCoefs = 2 * i + 1; + const size_t cStart = i*i; + const float fValUse = fTmpL0[i] * clr.x*fNewNorm*fExtraNormFac[i]; + for (size_t j = 0; j < cNumCoefs; ++j) resultR[cStart + j] = fTmpDir[cStart + j] * fValUse; + } + + if (resultG) + { + for (size_t i = 0; i < order; ++i) + { + const size_t cNumCoefs = 2 * i + 1; + const size_t cStart = i*i; + const float fValUse = fTmpL0[i] * clr.y*fNewNorm*fExtraNormFac[i]; + for (size_t j = 0; j < cNumCoefs; ++j) resultG[cStart + j] = fTmpDir[cStart + j] * fValUse; + } + } + + if (resultB) + { + for (size_t i = 0; i < order; ++i) + { + const size_t cNumCoefs = 2 * i + 1; + const size_t cStart = i*i; + const float fValUse = fTmpL0[i] * clr.z*fNewNorm*fExtraNormFac[i]; + for (size_t j = 0; j < cNumCoefs; ++j) resultB[cStart + j] = fTmpDir[cStart + j] * fValUse; + } + } + + return true; +} + + +//------------------------------------------------------------------------------------- +// Evaluates a light that is a cone of constant intensity and returns spectral +// SH data. The output vector is computed so that if the intensity of R/G/B is +// unit the resulting exit radiance of a point directly under the light oriented +// in the cone direction on a diffuse object with an albedo of 1 would be 1.0. +// This will compute 3 spectral samples, resultR has to be specified, while resultG +// and resultB are optional. +// +// http://msdn.microsoft.com/en-us/library/windows/desktop/bb204986.aspx +//------------------------------------------------------------------------------------- +_Use_decl_annotations_ +bool XM_CALLCONV DirectX::XMSHEvalConeLight( + size_t order, + FXMVECTOR dir, + float radius, + FXMVECTOR color, + float *resultR, + float *resultG, + float *resultB) noexcept +{ + if (!resultR) + return false; + + if (radius < 0.f || radius >(XM_PI*1.00001f)) + return false; + + if (radius < 0.0001f) + { + // turn it into a pure directional light... + return XMSHEvalDirectionalLight(order, dir, color, resultR, resultG, resultB); + } + else + { + float fTmpL0[XM_SH_MAXORDER]; + float fTmpDir[XM_SH_MAXORDER * XM_SH_MAXORDER]; + + const float fConeAngle = radius; + const float fAngCheck = (fConeAngle > XM_PIDIV2) ? 
(XM_PIDIV2) : fConeAngle; + + const float fNewNorm = 1.0f / (sinf(fAngCheck)*sinf(fAngCheck)); + + ComputeCapInt(order, fConeAngle, fTmpL0); + + XMFLOAT3A vd; + XMStoreFloat3(&vd, dir); + + const float fX = vd.x; + const float fY = vd.y; + const float fZ = vd.z; + + switch (order) + { + case 2: + sh_eval_basis_1(fX, fY, fZ, fTmpDir); + break; + + case 3: + sh_eval_basis_2(fX, fY, fZ, fTmpDir); + break; + + case 4: + sh_eval_basis_3(fX, fY, fZ, fTmpDir); + break; + + case 5: + sh_eval_basis_4(fX, fY, fZ, fTmpDir); + break; + + case 6: + sh_eval_basis_5(fX, fY, fZ, fTmpDir); + break; + + default: + assert(order < XM_SH_MINORDER || order > XM_SH_MAXORDER); + return false; + } + + XMFLOAT3A clr; + XMStoreFloat3A(&clr, color); + + for (size_t i = 0; i < order; ++i) + { + const size_t cNumCoefs = 2 * i + 1; + const size_t cStart = i*i; + const float fValUse = fTmpL0[i] * clr.x*fNewNorm*fExtraNormFac[i]; + for (size_t j = 0; j < cNumCoefs; ++j) + resultR[cStart + j] = fTmpDir[cStart + j] * fValUse; + } + + if (resultG) + { + for (size_t i = 0; i < order; ++i) + { + const size_t cNumCoefs = 2 * i + 1; + const size_t cStart = i*i; + const float fValUse = fTmpL0[i] * clr.y*fNewNorm*fExtraNormFac[i]; + for (size_t j = 0; j < cNumCoefs; ++j) + resultG[cStart + j] = fTmpDir[cStart + j] * fValUse; + } + } + + if (resultB) + { + for (size_t i = 0; i < order; ++i) + { + const size_t cNumCoefs = 2 * i + 1; + const size_t cStart = i*i; + const float fValUse = fTmpL0[i] * clr.z*fNewNorm*fExtraNormFac[i]; + for (size_t j = 0; j < cNumCoefs; ++j) + resultB[cStart + j] = fTmpDir[cStart + j] * fValUse; + } + } + } + + return true; +} + + +//------------------------------------------------------------------------------------ +// Evaluates a light that is a linear interpolant between two colors over the +// sphere. The interpolant is linear along the axis of the two points, not +// over the surface of the sphere (ie: if the axis was (0,0,1) it is linear in +// Z, not in the azimuthal angle.) The resulting spherical lighting function +// is normalized so that a point on a perfectly diffuse surface with no +// shadowing and a normal pointed in the direction pDir would result in exit +// radiance with a value of 1 if the top color was white and the bottom color +// was black. This is a very simple model where topColor represents the intensity +// of the "sky" and bottomColor represents the intensity of the "ground". +// +// http://msdn.microsoft.com/en-us/library/windows/desktop/bb204989.aspx +//------------------------------------------------------------------------------------- +_Use_decl_annotations_ +bool XM_CALLCONV DirectX::XMSHEvalHemisphereLight( + size_t order, + FXMVECTOR dir, + FXMVECTOR topColor, + FXMVECTOR bottomColor, + float *resultR, + float *resultG, + float *resultB) noexcept +{ + if (!resultR) + return false; + + if (order < XM_SH_MINORDER || order > XM_SH_MAXORDER) + return false; + + // seperate "R/G/B colors... + + float fTmpDir[XM_SH_MAXORDER * XM_SH_MAXORDER]; // rotation "vector" + float fTmpL0[XM_SH_MAXORDER]; + + const float fNewNorm = 3.0f / 2.0f; // normalizes things for 1 sky color, 0 ground color... 
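A brief usage sketch of the hemisphere model (illustration only, not part of the patch; assumes DirectXSH.h and `using namespace DirectX;`), matching the white-sky/black-ground case that the comment above calibrates to exit radiance 1.0:

    float shR[9], shG[9], shB[9];
    XMVECTOR up     = XMVectorSet(0.0f, 1.0f, 0.0f, 0.0f);
    XMVECTOR sky    = XMVectorSet(1.0f, 1.0f, 1.0f, 0.0f);     // topColor
    XMVECTOR ground = XMVectorSet(0.0f, 0.0f, 0.0f, 0.0f);     // bottomColor
    XMSHEvalHemisphereLight(3, up, sky, ground, shR, shG, shB);
    // Only the first two SH bands are populated; the remaining coefficients are zeroed.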
+ + XMFLOAT3A vd; + XMStoreFloat3(&vd, dir); + + const float fX = vd.x; + const float fY = vd.y; + const float fZ = vd.z; + + sh_eval_basis_1(fX, fY, fZ, fTmpDir); + + XMFLOAT3A clrTop; + XMStoreFloat3A(&clrTop, topColor); + + XMFLOAT3A clrBottom; + XMStoreFloat3A(&clrBottom, bottomColor); + + float fA = clrTop.x; + float fAvrg = (clrTop.x + clrBottom.x)*0.5f; + + fTmpL0[0] = fAvrg*2.0f*SHEvalHemisphereLight_fSqrtPi; + fTmpL0[1] = (fA - fAvrg)*2.0f*SHEvalHemisphereLight_fSqrtPi3; + + size_t i = 0; + for (; i < 2; ++i) + { + _Analysis_assume_(i < order); + const size_t cNumCoefs = 2 * i + 1; + const size_t cStart = i*i; + const float fValUse = fTmpL0[i] * fNewNorm*fExtraNormFac[i]; + for (size_t j = 0; j < cNumCoefs; ++j) resultR[cStart + j] = fTmpDir[cStart + j] * fValUse; + } + + for (; i < order; ++i) + { + const size_t cNumCoefs = 2 * i + 1; + const size_t cStart = i*i; + for (size_t j = 0; j < cNumCoefs; ++j) resultR[cStart + j] = 0.0f; + } + + if (resultG) + { + fA = clrTop.y; + fAvrg = (clrTop.y + clrBottom.y)*0.5f; + + fTmpL0[0] = fAvrg*2.0f*SHEvalHemisphereLight_fSqrtPi; + fTmpL0[1] = (fA - fAvrg)*2.0f*SHEvalHemisphereLight_fSqrtPi3; + + for (i = 0; i < 2; ++i) + { + _Analysis_assume_(i < order); + const size_t cNumCoefs = 2 * i + 1; + const size_t cStart = i*i; + const float fValUse = fTmpL0[i] * fNewNorm*fExtraNormFac[i]; + for (size_t j = 0; j < cNumCoefs; ++j) resultG[cStart + j] = fTmpDir[cStart + j] * fValUse; + } + + for (; i < order; ++i) + { + const size_t cNumCoefs = 2 * i + 1; + const size_t cStart = i*i; + for (size_t j = 0; j < cNumCoefs; ++j) resultG[cStart + j] = 0.0f; + } + } + + if (resultB) + { + fA = clrTop.z; + fAvrg = (clrTop.z + clrBottom.z)*0.5f; + + fTmpL0[0] = fAvrg*2.0f*SHEvalHemisphereLight_fSqrtPi; + fTmpL0[1] = (fA - fAvrg)*2.0f*SHEvalHemisphereLight_fSqrtPi3; + + for (i = 0; i < 2; ++i) + { + _Analysis_assume_(i < order); + const size_t cNumCoefs = 2 * i + 1; + const size_t cStart = i*i; + const float fValUse = fTmpL0[i] * fNewNorm*fExtraNormFac[i]; + for (size_t j = 0; j < cNumCoefs; ++j) resultB[cStart + j] = fTmpDir[cStart + j] * fValUse; + } + + for (; i < order; ++i) + { + const size_t cNumCoefs = 2 * i + 1; + const size_t cStart = i*i; + for (size_t j = 0; j < cNumCoefs; ++j) resultB[cStart + j] = 0.0f; + } + } + + return true; +} diff --git a/src/thirdparty/DirectXMath-dec2022/SHMath/DirectXSH.h b/src/thirdparty/DirectXMath-dec2022/SHMath/DirectXSH.h new file mode 100644 index 000000000..9f5183553 --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/SHMath/DirectXSH.h @@ -0,0 +1,72 @@ +//------------------------------------------------------------------------------------- +// DirectXSH.h -- C++ Spherical Harmonics Math Library +// +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
+// +// http://go.microsoft.com/fwlink/p/?LinkId=262885 +//------------------------------------------------------------------------------------- + +#pragma once + +#define DIRECTX_SHMATH_VERSION 106 + +#include + +namespace DirectX +{ + constexpr size_t XM_SH_MINORDER = 2; + constexpr size_t XM_SH_MAXORDER = 6; + + float* XM_CALLCONV XMSHEvalDirection(_Out_writes_(order*order) float *result, _In_ size_t order, _In_ FXMVECTOR dir) noexcept; + + float* XM_CALLCONV XMSHRotate(_Out_writes_(order*order) float *result, _In_ size_t order, _In_ FXMMATRIX rotMatrix, _In_reads_(order*order) const float *input) noexcept; + + float* XMSHRotateZ(_Out_writes_(order*order) float *result, _In_ size_t order, _In_ float angle, _In_reads_(order*order) const float *input) noexcept; + + float* XMSHAdd(_Out_writes_(order*order) float *result, _In_ size_t order, _In_reads_(order*order) const float *inputA, _In_reads_(order*order) const float *inputB) noexcept; + + float* XMSHScale(_Out_writes_(order*order) float *result, _In_ size_t order, _In_reads_(order*order) const float *input, _In_ float scale) noexcept; + + float XMSHDot(_In_ size_t order, _In_reads_(order*order) const float *inputA, _In_reads_(order*order) const float *inputB) noexcept; + + float* XMSHMultiply(_Out_writes_(order*order) float *result, _In_ size_t order, _In_reads_(order*order) const float *inputF, _In_reads_(order*order) const float *inputG) noexcept; + + float* XMSHMultiply2(_Out_writes_(4) float *result, _In_reads_(4) const float *inputF, _In_reads_(4) const float *inputG) noexcept; + + float* XMSHMultiply3(_Out_writes_(9) float *result, _In_reads_(9) const float *inputF, _In_reads_(9) const float *inputG) noexcept; + + float* XMSHMultiply4(_Out_writes_(16) float *result, _In_reads_(16) const float *inputF, _In_reads_(16) const float *inputG) noexcept; + + float* XMSHMultiply5(_Out_writes_(25) float *result, _In_reads_(25) const float *inputF, _In_reads_(25) const float *inputG) noexcept; + + float* XMSHMultiply6(_Out_writes_(36) float *result, _In_reads_(36) const float *inputF, _In_reads_(36) const float *inputG) noexcept; + + bool XM_CALLCONV XMSHEvalDirectionalLight( + _In_ size_t order, _In_ FXMVECTOR dir, _In_ FXMVECTOR color, + _Out_writes_(order*order) float *resultR, _Out_writes_opt_(order*order) float *resultG, _Out_writes_opt_(order*order) float *resultB) noexcept; + + bool XM_CALLCONV XMSHEvalSphericalLight( + _In_ size_t order, _In_ FXMVECTOR pos, _In_ float radius, _In_ FXMVECTOR color, + _Out_writes_(order*order) float *resultR, _Out_writes_opt_(order*order) float *resultG, _Out_writes_opt_(order*order) float *resultB) noexcept; + + bool XM_CALLCONV XMSHEvalConeLight( + _In_ size_t order, _In_ FXMVECTOR dir, _In_ float radius, _In_ FXMVECTOR color, + _Out_writes_(order*order) float *resultR, _Out_writes_opt_(order*order) float *resultG, _Out_writes_opt_(order*order) float *resultB) noexcept; + + bool XM_CALLCONV XMSHEvalHemisphereLight( + _In_ size_t order, _In_ FXMVECTOR dir, _In_ FXMVECTOR topColor, _In_ FXMVECTOR bottomColor, + _Out_writes_(order*order) float *resultR, _Out_writes_opt_(order*order) float *resultG, _Out_writes_opt_(order*order) float *resultB) noexcept; + + #if defined(__d3d11_h__) || defined(__d3d11_x_h__) + HRESULT SHProjectCubeMap( + _In_ ID3D11DeviceContext *context, _In_ size_t order, _In_ ID3D11Texture2D *cubeMap, + _Out_writes_opt_(order*order) float *resultR, _Out_writes_opt_(order*order) float *resultG, _Out_writes_opt_(order*order) float *resultB) noexcept; + #endif + + #if 
defined(__d3d12_h__) || defined(__d3d12_x_h__) || defined(__XBOX_D3D12_X__) + HRESULT SHProjectCubeMap( + _In_ size_t order, _In_ const D3D12_RESOURCE_DESC& desc, _In_ const D3D12_SUBRESOURCE_DATA cubeMap[6], + _Out_writes_opt_(order*order) float *resultR, _Out_writes_opt_(order*order) float *resultG, _Out_writes_opt_(order*order) float *resultB) noexcept; + #endif +} // namespace DirectX diff --git a/src/thirdparty/DirectXMath-dec2022/SHMath/DirectXSHD3D11.cpp b/src/thirdparty/DirectXMath-dec2022/SHMath/DirectXSHD3D11.cpp new file mode 100644 index 000000000..a2e4e0bba --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/SHMath/DirectXSHD3D11.cpp @@ -0,0 +1,383 @@ +//------------------------------------------------------------------------------------- +// DirectXSHD3D11.cpp -- C++ Spherical Harmonics Math Library +// +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. +// +// http://go.microsoft.com/fwlink/p/?LinkId=262885 +//------------------------------------------------------------------------------------- + +#ifdef _MSC_VER +#pragma warning( disable : 4616 4619 4061 4265 4626 5039 ) +// C4616/C4619 #pragma warning warnings +// C4061 numerator 'identifier' in switch of enum 'enumeration' is not explicitly handled by a case label +// C4265 class has virtual functions, but destructor is not virtual +// C4626 assignment operator was implicitly defined as deleted +// C5039 pointer or reference to potentially throwing function passed to extern C function under - EHc + +#pragma warning(push) +#pragma warning(disable: 4365) +#endif +#include +#ifdef _MSC_VER +#pragma warning(pop) +#endif + +#include "DirectXSH.h" + +#include + +#include +#include +#include + +#include + +#ifdef __clang__ +#pragma clang diagnostic ignored "-Wcovered-switch-default" +#pragma clang diagnostic ignored "-Wswitch-enum" +#endif + +using namespace DirectX; + +using Microsoft::WRL::ComPtr; + +namespace +{ + struct aligned_deleter { void operator()(void* p) { _aligned_free(p); } }; + + using ScopedAlignedArrayXMVECTOR = std::unique_ptr; + + //------------------------------------------------------------------------------------- + // This code is lifted from DirectXTex http://go.microsoft.com/fwlink/?LinkId=248926 + // If you need additional DXGI format support, see DirectXTexConvert.cpp + //------------------------------------------------------------------------------------- +#define LOAD_SCANLINE( type, func )\ + if ( size >= sizeof(type) )\ + {\ + const type * __restrict sPtr = reinterpret_cast(pSource);\ + for( size_t icount = 0; icount < ( size - sizeof(type) + 1 ); icount += sizeof(type) )\ + {\ + if ( dPtr >= ePtr ) break;\ + *(dPtr++) = func( sPtr++ );\ + }\ + return true;\ + }\ + return false; + +#define LOAD_SCANLINE3( type, func, defvec )\ + if ( size >= sizeof(type) )\ + {\ + const type * __restrict sPtr = reinterpret_cast(pSource);\ + for( size_t icount = 0; icount < ( size - sizeof(type) + 1 ); icount += sizeof(type) )\ + {\ + XMVECTOR v = func( sPtr++ );\ + if ( dPtr >= ePtr ) break;\ + *(dPtr++) = XMVectorSelect( defvec, v, g_XMSelect1110 );\ + }\ + return true;\ + }\ + return false; + +#define LOAD_SCANLINE2( type, func, defvec )\ + if ( size >= sizeof(type) )\ + {\ + const type * __restrict sPtr = reinterpret_cast(pSource);\ + for( size_t icount = 0; icount < ( size - sizeof(type) + 1 ); icount += sizeof(type) )\ + {\ + XMVECTOR v = func( sPtr++ );\ + if ( dPtr >= ePtr ) break;\ + *(dPtr++) = XMVectorSelect( defvec, v, g_XMSelect1100 );\ + }\ + return true;\ + }\ + return 
false; + +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 6101) +#endif + _Success_(return) + bool LoadScanline( + _Out_writes_(count) DirectX::XMVECTOR* pDestination, + size_t count, + _In_reads_bytes_(size) LPCVOID pSource, + size_t size, + DXGI_FORMAT format) + { + assert(pDestination && count > 0 && ((reinterpret_cast(pDestination) & 0xF) == 0)); + assert(pSource && size > 0); + + using namespace DirectX::PackedVector; + + XMVECTOR* __restrict dPtr = pDestination; + if (!dPtr) + return false; + + const XMVECTOR* ePtr = pDestination + count; + + switch (format) + { + case DXGI_FORMAT_R32G32B32A32_FLOAT: + { + size_t msize = (size > (sizeof(XMVECTOR)*count)) ? (sizeof(XMVECTOR)*count) : size; + memcpy_s(dPtr, sizeof(XMVECTOR)*count, pSource, msize); + } + return true; + + case DXGI_FORMAT_R32G32B32_FLOAT: + LOAD_SCANLINE3(XMFLOAT3, XMLoadFloat3, g_XMIdentityR3) + + case DXGI_FORMAT_R16G16B16A16_FLOAT: + LOAD_SCANLINE(XMHALF4, XMLoadHalf4) + + case DXGI_FORMAT_R32G32_FLOAT: + LOAD_SCANLINE2(XMFLOAT2, XMLoadFloat2, g_XMIdentityR3) + + case DXGI_FORMAT_R11G11B10_FLOAT: + LOAD_SCANLINE3(XMFLOAT3PK, XMLoadFloat3PK, g_XMIdentityR3) + + case DXGI_FORMAT_R16G16_FLOAT: + LOAD_SCANLINE2(XMHALF2, XMLoadHalf2, g_XMIdentityR3) + + case DXGI_FORMAT_R32_FLOAT: + if (size >= sizeof(float)) + { + const float* __restrict sPtr = reinterpret_cast(pSource); + for (size_t icount = 0; icount < size; icount += sizeof(float)) + { + XMVECTOR v = XMLoadFloat(sPtr++); + if (dPtr >= ePtr) break; + *(dPtr++) = XMVectorSelect(g_XMIdentityR3, v, g_XMSelect1000); + } + return true; + } + return false; + + case DXGI_FORMAT_R16_FLOAT: + if (size >= sizeof(HALF)) + { + const HALF * __restrict sPtr = reinterpret_cast(pSource); + for (size_t icount = 0; icount < size; icount += sizeof(HALF)) + { + if (dPtr >= ePtr) break; + *(dPtr++) = XMVectorSet(XMConvertHalfToFloat(*sPtr++), 0.f, 0.f, 1.f); + } + return true; + } + return false; + + default: + return false; + } + } +#ifdef _MSC_VER +#pragma warning(pop) +#endif +} // namespace anonymous + +//------------------------------------------------------------------------------------- +// Projects a function represented in a cube map into spherical harmonics. 
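In outline (editorial sketch of the routine below, not additional patch content): every texel of the six faces is mapped to a direction on the cube, its color is weighted by an approximate differential solid angle

    fDiffSolid = 4.0f / ((1.0f + u*u + v*v) * sqrtf(1.0f + u*u + v*v));

the weighted sample is accumulated into the per-channel coefficient arrays via XMSHEvalDirection, XMSHScale and XMSHAdd, and the totals are finally rescaled by 4*pi divided by the sum of all the weights so that the discrete weights integrate to the full sphere.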
+// +// http://msdn.microsoft.com/en-us/library/windows/desktop/ff476300.aspx +//------------------------------------------------------------------------------------- +_Use_decl_annotations_ +HRESULT DirectX::SHProjectCubeMap( + ID3D11DeviceContext *context, + size_t order, + ID3D11Texture2D *cubeMap, + float *resultR, + float *resultG, + float* resultB) noexcept +{ + if (!context || !cubeMap) + return E_INVALIDARG; + + if (order < XM_SH_MINORDER || order > XM_SH_MAXORDER) + return E_INVALIDARG; + + D3D11_TEXTURE2D_DESC desc; + cubeMap->GetDesc(&desc); + + if ((desc.ArraySize != 6) + || (desc.Width != desc.Height) + || (desc.SampleDesc.Count > 1)) + return E_FAIL; + + switch (desc.Format) + { + case DXGI_FORMAT_R32G32B32A32_FLOAT: + case DXGI_FORMAT_R32G32B32_FLOAT: + case DXGI_FORMAT_R16G16B16A16_FLOAT: + case DXGI_FORMAT_R32G32_FLOAT: + case DXGI_FORMAT_R11G11B10_FLOAT: + case DXGI_FORMAT_R16G16_FLOAT: + case DXGI_FORMAT_R32_FLOAT: + case DXGI_FORMAT_R16_FLOAT: + // See LoadScanline to support more pixel formats + break; + + default: + return E_FAIL; + } + + //--- Create a staging resource copy (if needed) to be able to read data + ID3D11Texture2D* texture = nullptr; + + ComPtr staging; + if (!(desc.CPUAccessFlags & D3D11_CPU_ACCESS_READ)) + { + D3D11_TEXTURE2D_DESC sdesc = desc; + sdesc.BindFlags = 0; + sdesc.CPUAccessFlags = D3D11_CPU_ACCESS_READ; + sdesc.Usage = D3D11_USAGE_STAGING; + + ComPtr device; + context->GetDevice(&device); + + HRESULT hr = device->CreateTexture2D(&sdesc, nullptr, &staging); + if (FAILED(hr)) + return hr; + + context->CopyResource(staging.Get(), cubeMap); + + texture = staging.Get(); + } + else + texture = cubeMap; + + assert(texture != nullptr); + + //--- Setup for SH projection + ScopedAlignedArrayXMVECTOR scanline(reinterpret_cast(_aligned_malloc(sizeof(XMVECTOR)*desc.Width, 16))); + if (!scanline) + return E_OUTOFMEMORY; + + assert(desc.Width > 0); + float fSize = static_cast(desc.Width); + float fPicSize = 1.0f / fSize; + + // index from [0,W-1], f(0) maps to -1 + 1/W, f(W-1) maps to 1 - 1/w + // linear function x*S +B, 1st constraint means B is (-1+1/W), plug into + // second and solve for S: S = 2*(1-1/W)/(W-1). The old code that did + // this was incorrect - but only for computing the differential solid + // angle, where the final value was 1.0 instead of 1-1/w... + + float fB = -1.0f + 1.0f / fSize; + float fS = (desc.Width > 1) ? 
(2.0f*(1.0f - 1.0f / fSize) / (fSize - 1.0f)) : 0.f; + + // clear out accumulation variables + float fWt = 0.0f; + + if (resultR) + memset(resultR, 0, sizeof(float)*order*order); + if (resultG) + memset(resultG, 0, sizeof(float)*order*order); + if (resultB) + memset(resultB, 0, sizeof(float)*order*order); + + float shBuff[XM_SH_MAXORDER*XM_SH_MAXORDER] = {}; + float shBuffB[XM_SH_MAXORDER*XM_SH_MAXORDER] = {}; + + //--- Process each face of the cubemap + for (UINT face = 0; face < 6; ++face) + { + UINT dindex = D3D11CalcSubresource(0, face, desc.MipLevels); + + D3D11_MAPPED_SUBRESOURCE mapped; + HRESULT hr = context->Map(texture, dindex, D3D11_MAP_READ, 0, &mapped); + if (FAILED(hr)) + return hr; + + const uint8_t *pSrc = reinterpret_cast(mapped.pData); + for (UINT y = 0; y < desc.Height; ++y) + { + XMVECTOR* ptr = scanline.get(); + if (!LoadScanline(ptr, desc.Width, pSrc, mapped.RowPitch, desc.Format)) + { + context->Unmap(texture, dindex); + return E_FAIL; + } + + const float v = float(y) * fS + fB; + + XMVECTOR* pixel = ptr; + for (UINT x = 0; x < desc.Width; ++x, ++pixel) + { + const float u = float(x) * fS + fB; + + float ix, iy, iz; + switch (face) + { + case 0: // Positive X + iz = 1.0f - (2.0f * float(x) + 1.0f) * fPicSize; + iy = 1.0f - (2.0f * float(y) + 1.0f) * fPicSize; + ix = 1.0f; + break; + + case 1: // Negative X + iz = -1.0f + (2.0f * float(x) + 1.0f) * fPicSize; + iy = 1.0f - (2.0f * float(y) + 1.0f) * fPicSize; + ix = -1; + break; + + case 2: // Positive Y + iz = -1.0f + (2.0f * float(y) + 1.0f) * fPicSize; + iy = 1.0f; + ix = -1.0f + (2.0f * float(x) + 1.0f) * fPicSize; + break; + + case 3: // Negative Y + iz = 1.0f - (2.0f * float(y) + 1.0f) * fPicSize; + iy = -1.0f; + ix = -1.0f + (2.0f * float(x) + 1.0f) * fPicSize; + break; + + case 4: // Positive Z + iz = 1.0f; + iy = 1.0f - (2.0f * float(y) + 1.0f) * fPicSize; + ix = -1.0f + (2.0f * float(x) + 1.0f) * fPicSize; + break; + + case 5: // Negative Z + iz = -1.0f; + iy = 1.0f - (2.0f * float(y) + 1.0f) * fPicSize; + ix = 1.0f - (2.0f * float(x) + 1.0f) * fPicSize; + break; + + default: + ix = iy = iz = 0.f; + assert(false); + break; + } + + XMVECTOR dir = XMVectorSet(ix, iy, iz, 0); + dir = XMVector3Normalize(dir); + + const float fDiffSolid = 4.0f / ((1.0f + u * u + v * v)*sqrtf(1.0f + u * u + v * v)); + fWt += fDiffSolid; + + XMSHEvalDirection(shBuff, order, dir); + + XMFLOAT3A clr; + XMStoreFloat3A(&clr, *pixel); + + if (resultR) XMSHAdd(resultR, order, resultR, XMSHScale(shBuffB, order, shBuff, clr.x*fDiffSolid)); + if (resultG) XMSHAdd(resultG, order, resultG, XMSHScale(shBuffB, order, shBuff, clr.y*fDiffSolid)); + if (resultB) XMSHAdd(resultB, order, resultB, XMSHScale(shBuffB, order, shBuff, clr.z*fDiffSolid)); + } + + pSrc += mapped.RowPitch; + } + + context->Unmap(texture, dindex); + } + + const float fNormProj = (4.0f*XM_PI) / fWt; + + if (resultR) XMSHScale(resultR, order, resultR, fNormProj); + if (resultG) XMSHScale(resultG, order, resultG, fNormProj); + if (resultB) XMSHScale(resultB, order, resultB, fNormProj); + + return S_OK; +} diff --git a/src/thirdparty/DirectXMath-dec2022/SHMath/DirectXSHD3D12.cpp b/src/thirdparty/DirectXMath-dec2022/SHMath/DirectXSHD3D12.cpp new file mode 100644 index 000000000..5ef93527d --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/SHMath/DirectXSHD3D12.cpp @@ -0,0 +1,339 @@ +//------------------------------------------------------------------------------------- +// DirectXSHD3D12.cpp -- C++ Spherical Harmonics Math Library +// +// Copyright (c) Microsoft Corporation. 
+// Licensed under the MIT License. +// +// http://go.microsoft.com/fwlink/p/?LinkId=262885 +//------------------------------------------------------------------------------------- + +#ifdef _MSC_VER +#pragma warning( disable : 4616 4619 4061 4265 4626 5039 ) +// C4616/C4619 #pragma warning warnings +// C4061 numerator 'identifier' in switch of enum 'enumeration' is not explicitly handled by a case label +// C4265 class has virtual functions, but destructor is not virtual +// C4626 assignment operator was implicitly defined as deleted +// C5039 pointer or reference to potentially throwing function passed to extern C function under - EHc +#endif + +#include + +#include "DirectXSH.h" + +#include + +#include +#include +#include + +#include + +#ifdef __clang__ +#pragma clang diagnostic ignored "-Wcovered-switch-default" +#pragma clang diagnostic ignored "-Wswitch-enum" +#endif + +using namespace DirectX; + +using Microsoft::WRL::ComPtr; + +namespace +{ + struct aligned_deleter { void operator()(void* p) { _aligned_free(p); } }; + + using ScopedAlignedArrayXMVECTOR = std::unique_ptr; + + //------------------------------------------------------------------------------------- + // This code is lifted from DirectXTex http://go.microsoft.com/fwlink/?LinkId=248926 + // If you need additional DXGI format support, see DirectXTexConvert.cpp + //------------------------------------------------------------------------------------- +#define LOAD_SCANLINE( type, func )\ + if ( size >= sizeof(type) )\ + {\ + const type * __restrict sPtr = reinterpret_cast(pSource);\ + for( size_t icount = 0; icount < ( size - sizeof(type) + 1 ); icount += sizeof(type) )\ + {\ + if ( dPtr >= ePtr ) break;\ + *(dPtr++) = func( sPtr++ );\ + }\ + return true;\ + }\ + return false; + +#define LOAD_SCANLINE3( type, func, defvec )\ + if ( size >= sizeof(type) )\ + {\ + const type * __restrict sPtr = reinterpret_cast(pSource);\ + for( size_t icount = 0; icount < ( size - sizeof(type) + 1 ); icount += sizeof(type) )\ + {\ + XMVECTOR v = func( sPtr++ );\ + if ( dPtr >= ePtr ) break;\ + *(dPtr++) = XMVectorSelect( defvec, v, g_XMSelect1110 );\ + }\ + return true;\ + }\ + return false; + +#define LOAD_SCANLINE2( type, func, defvec )\ + if ( size >= sizeof(type) )\ + {\ + const type * __restrict sPtr = reinterpret_cast(pSource);\ + for( size_t icount = 0; icount < ( size - sizeof(type) + 1 ); icount += sizeof(type) )\ + {\ + XMVECTOR v = func( sPtr++ );\ + if ( dPtr >= ePtr ) break;\ + *(dPtr++) = XMVectorSelect( defvec, v, g_XMSelect1100 );\ + }\ + return true;\ + }\ + return false; + +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 6101) +#endif + _Success_(return) + bool LoadScanline( + _Out_writes_(count) DirectX::XMVECTOR* pDestination, + size_t count, + _In_reads_bytes_(size) LPCVOID pSource, + size_t size, + DXGI_FORMAT format) + { + assert(pDestination && count > 0 && ((reinterpret_cast(pDestination) & 0xF) == 0)); + assert(pSource && size > 0); + + using namespace DirectX::PackedVector; + + XMVECTOR* __restrict dPtr = pDestination; + if (!dPtr) + return false; + + const XMVECTOR* ePtr = pDestination + count; + + switch (format) + { + case DXGI_FORMAT_R32G32B32A32_FLOAT: + { + size_t msize = (size > (sizeof(XMVECTOR)*count)) ? 
(sizeof(XMVECTOR)*count) : size; + memcpy_s(dPtr, sizeof(XMVECTOR)*count, pSource, msize); + } + return true; + + case DXGI_FORMAT_R32G32B32_FLOAT: + LOAD_SCANLINE3(XMFLOAT3, XMLoadFloat3, g_XMIdentityR3) + + case DXGI_FORMAT_R16G16B16A16_FLOAT: + LOAD_SCANLINE(XMHALF4, XMLoadHalf4) + + case DXGI_FORMAT_R32G32_FLOAT: + LOAD_SCANLINE2(XMFLOAT2, XMLoadFloat2, g_XMIdentityR3) + + case DXGI_FORMAT_R11G11B10_FLOAT: + LOAD_SCANLINE3(XMFLOAT3PK, XMLoadFloat3PK, g_XMIdentityR3) + + case DXGI_FORMAT_R16G16_FLOAT: + LOAD_SCANLINE2(XMHALF2, XMLoadHalf2, g_XMIdentityR3) + + case DXGI_FORMAT_R32_FLOAT: + if (size >= sizeof(float)) + { + const float* __restrict sPtr = reinterpret_cast(pSource); + for (size_t icount = 0; icount < size; icount += sizeof(float)) + { + XMVECTOR v = XMLoadFloat(sPtr++); + if (dPtr >= ePtr) break; + *(dPtr++) = XMVectorSelect(g_XMIdentityR3, v, g_XMSelect1000); + } + return true; + } + return false; + + case DXGI_FORMAT_R16_FLOAT: + if (size >= sizeof(HALF)) + { + const HALF * __restrict sPtr = reinterpret_cast(pSource); + for (size_t icount = 0; icount < size; icount += sizeof(HALF)) + { + if (dPtr >= ePtr) break; + *(dPtr++) = XMVectorSet(XMConvertHalfToFloat(*sPtr++), 0.f, 0.f, 1.f); + } + return true; + } + return false; + + default: + return false; + } + } +#ifdef _MSC_VER +#pragma warning(pop) +#endif +} // namespace anonymous + +//------------------------------------------------------------------------------------- +// Projects a function represented in a cube map into spherical harmonics. +// +// http://msdn.microsoft.com/en-us/library/windows/desktop/ff476300.aspx +//------------------------------------------------------------------------------------- +_Use_decl_annotations_ +HRESULT DirectX::SHProjectCubeMap( + size_t order, + const D3D12_RESOURCE_DESC& desc, + const D3D12_SUBRESOURCE_DATA cubeMap[6], + float *resultR, + float *resultG, + float *resultB) noexcept +{ + if (order < XM_SH_MINORDER || order > XM_SH_MAXORDER) + return E_INVALIDARG; + + if (desc.Dimension != D3D12_RESOURCE_DIMENSION_TEXTURE2D + || (desc.DepthOrArraySize != 6) + || (desc.Width != desc.Height) + || (desc.SampleDesc.Count > 1)) + return E_FAIL; + + switch (desc.Format) + { + case DXGI_FORMAT_R32G32B32A32_FLOAT: + case DXGI_FORMAT_R32G32B32_FLOAT: + case DXGI_FORMAT_R16G16B16A16_FLOAT: + case DXGI_FORMAT_R32G32_FLOAT: + case DXGI_FORMAT_R11G11B10_FLOAT: + case DXGI_FORMAT_R16G16_FLOAT: + case DXGI_FORMAT_R32_FLOAT: + case DXGI_FORMAT_R16_FLOAT: + // See LoadScanline to support more pixel formats + break; + + default: + return E_FAIL; + } + + //--- Setup for SH projection + ScopedAlignedArrayXMVECTOR scanline(reinterpret_cast(_aligned_malloc(static_cast(sizeof(XMVECTOR)*desc.Width), 16))); + if (!scanline) + return E_OUTOFMEMORY; + + assert(desc.Width > 0); + float fSize = static_cast(desc.Width); + float fPicSize = 1.0f / fSize; + + // index from [0,W-1], f(0) maps to -1 + 1/W, f(W-1) maps to 1 - 1/w + // linear function x*S +B, 1st constraint means B is (-1+1/W), plug into + // second and solve for S: S = 2*(1-1/W)/(W-1). The old code that did + // this was incorrect - but only for computing the differential solid + // angle, where the final value was 1.0 instead of 1-1/w... + + float fB = -1.0f + 1.0f / fSize; + float fS = (desc.Width > 1) ? 
(2.0f*(1.0f - 1.0f / fSize) / (fSize - 1.0f)) : 0.f; + + // clear out accumulation variables + float fWt = 0.0f; + + if (resultR) + memset(resultR, 0, sizeof(float)*order*order); + if (resultG) + memset(resultG, 0, sizeof(float)*order*order); + if (resultB) + memset(resultB, 0, sizeof(float)*order*order); + + float shBuff[XM_SH_MAXORDER*XM_SH_MAXORDER] = {}; + float shBuffB[XM_SH_MAXORDER*XM_SH_MAXORDER] = {}; + + //--- Process each face of the cubemap + for (UINT face = 0; face < 6; ++face) + { + if (!cubeMap[face].pData) + return E_POINTER; + + const uint8_t *pSrc = reinterpret_cast(cubeMap[face].pData); + for (UINT y = 0; y < desc.Height; ++y) + { + XMVECTOR* ptr = scanline.get(); + if (!LoadScanline(ptr, static_cast(desc.Width), pSrc, static_cast(cubeMap[face].RowPitch), desc.Format)) + { + return E_FAIL; + } + + const float v = float(y) * fS + fB; + + XMVECTOR* pixel = ptr; + for (UINT x = 0; x < desc.Width; ++x, ++pixel) + { + const float u = float(x) * fS + fB; + + float ix, iy, iz; + switch (face) + { + case 0: // Positive X + iz = 1.0f - (2.0f * float(x) + 1.0f) * fPicSize; + iy = 1.0f - (2.0f * float(y) + 1.0f) * fPicSize; + ix = 1.0f; + break; + + case 1: // Negative X + iz = -1.0f + (2.0f * float(x) + 1.0f) * fPicSize; + iy = 1.0f - (2.0f * float(y) + 1.0f) * fPicSize; + ix = -1; + break; + + case 2: // Positive Y + iz = -1.0f + (2.0f * float(y) + 1.0f) * fPicSize; + iy = 1.0f; + ix = -1.0f + (2.0f * float(x) + 1.0f) * fPicSize; + break; + + case 3: // Negative Y + iz = 1.0f - (2.0f * float(y) + 1.0f) * fPicSize; + iy = -1.0f; + ix = -1.0f + (2.0f * float(x) + 1.0f) * fPicSize; + break; + + case 4: // Positive Z + iz = 1.0f; + iy = 1.0f - (2.0f * float(y) + 1.0f) * fPicSize; + ix = -1.0f + (2.0f * float(x) + 1.0f) * fPicSize; + break; + + case 5: // Negative Z + iz = -1.0f; + iy = 1.0f - (2.0f * float(y) + 1.0f) * fPicSize; + ix = 1.0f - (2.0f * float(x) + 1.0f) * fPicSize; + break; + + default: + ix = iy = iz = 0.f; + assert(false); + break; + } + + XMVECTOR dir = XMVectorSet(ix, iy, iz, 0); + dir = XMVector3Normalize(dir); + + const float fDiffSolid = 4.0f / ((1.0f + u * u + v * v)*sqrtf(1.0f + u * u + v * v)); + fWt += fDiffSolid; + + XMSHEvalDirection(shBuff, order, dir); + + XMFLOAT3A clr; + XMStoreFloat3A(&clr, *pixel); + + if (resultR) XMSHAdd(resultR, order, resultR, XMSHScale(shBuffB, order, shBuff, clr.x*fDiffSolid)); + if (resultG) XMSHAdd(resultG, order, resultG, XMSHScale(shBuffB, order, shBuff, clr.y*fDiffSolid)); + if (resultB) XMSHAdd(resultB, order, resultB, XMSHScale(shBuffB, order, shBuff, clr.z*fDiffSolid)); + } + + pSrc += cubeMap[face].RowPitch; + } + } + + const float fNormProj = (4.0f*XM_PI) / fWt; + + if (resultR) XMSHScale(resultR, order, resultR, fNormProj); + if (resultG) XMSHScale(resultG, order, resultG, fNormProj); + if (resultB) XMSHScale(resultB, order, resultB, fNormProj); + + return S_OK; +} diff --git a/src/thirdparty/DirectXMath-dec2022/Stereo3D/Stereo3DMatrixHelper.cpp b/src/thirdparty/DirectXMath-dec2022/Stereo3D/Stereo3DMatrixHelper.cpp new file mode 100644 index 000000000..6e49b6cad --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/Stereo3D/Stereo3DMatrixHelper.cpp @@ -0,0 +1,257 @@ +//------------------------------------------------------------------------------------- +// Stereo3DMatrixHelper.cpp -- SIMD C++ Math helper for Stereo 3D matricies +// +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
+//------------------------------------------------------------------------------------- + +#include "Stereo3DMatrixHelper.h" + +using namespace DirectX; + +namespace +{ + inline bool StereoProjectionHelper + ( + const STEREO_PARAMETERS& stereoParameters, + _Out_ float* fVirtualProjection, + _Out_ float* zNearWidth, + _Out_ float* zNearHeight, + float FovAngleY, + float AspectRatio, + float NearZ + ) + { + // note that most people have difficulty fusing images into 3D + // if the separation equals even just the human average. by + // reducing the separation (interocular distance) by 1/2, we + // guarantee a larger subset of people will see full 3D + + // the conservative setting should always be used. the only problem + // with the conservative setting is that the 3D effect will be less + // impressive on smaller screens (which makes sense, since your eye + // cannot be tricked as easily based on the smaller fov). to simulate + // the effect of a larger screen, use the liberal settings (debug only) + + // Conservative Settings: * max acuity angle: 0.8f degrees * interoc distance: 1.25 inches + + // Liberal Settings: * max acuity angle: 1.6f degrees * interoc distance: 2.5f inches + + // maximum visual accuity angle allowed is 3.2 degrees for + // a physical scene, and 1.6 degrees for a virtual one. + // thus we cannot allow an object to appear any closer to + // the viewer than 1.6 degrees (divided by two for most + // half-angle calculations) + + static const float fMaxStereoDistance = 780; // inches (should be between 10 and 20m) + static const float fMaxVisualAcuityAngle = 1.6f * (XM_PI / 180.0f); // radians + static const float fInterocularDistance = 1.25f; // inches + + float fDisplayHeight = stereoParameters.fDisplaySizeInches / sqrtf(AspectRatio * AspectRatio + 1.0f); + float fDisplayWidth = fDisplayHeight * AspectRatio; + float fHalfInterocular = 0.5f * fInterocularDistance * stereoParameters.fStereoExaggerationFactor; + float fHalfPixelWidth = fDisplayWidth / stereoParameters.fPixelResolutionWidth * 0.5f; + float fHalfMaximumAcuityAngle = fMaxVisualAcuityAngle * 0.5f * stereoParameters.fStereoExaggerationFactor; + // float fHalfWidth = fDisplayWidth * 0.5f; + + float fMaxSeparationAcuityAngle = atanf(fHalfInterocular / fMaxStereoDistance); + float fMaxSeparationDistance = fHalfPixelWidth / tanf(fMaxSeparationAcuityAngle); + float fRefinedMaxStereoDistance = fMaxStereoDistance - fMaxSeparationDistance; + float fFovHalfAngle = FovAngleY / 2.0f; + + bool ComfortableResult = true; + if (fRefinedMaxStereoDistance < 0.0f || fMaxSeparationDistance > 0.1f * fMaxStereoDistance) + { + // Pixel resolution is too low to offer a comfortable stereo experience + ComfortableResult = false; + } + + float fRefinedMaxSeparationAcuityAngle = atanf(fHalfInterocular / (fRefinedMaxStereoDistance)); + float fPhysicalZNearDistance = fHalfInterocular / tanf(fHalfMaximumAcuityAngle); + // float fScalingFactor = fHalfMaximumAcuityAngle / atanf(fHalfInterocular / stereoParameters.fViewerDistanceInches); + + float fNearZSeparation = tanf(fRefinedMaxSeparationAcuityAngle) * (fRefinedMaxStereoDistance - fPhysicalZNearDistance); + // float fNearZSeparation2 = fHalfInterocular * (fRefinedMaxStereoDistance - fPhysicalZNearDistance) / fRefinedMaxStereoDistance; + + (*zNearHeight) = cosf(fFovHalfAngle) / sinf(fFovHalfAngle); + (*zNearWidth) = (*zNearHeight) / AspectRatio; + (*fVirtualProjection) = (fNearZSeparation * NearZ * (*zNearWidth * 4.0f)) / (2.0f * NearZ); + + return ComfortableResult; + } +} + 
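A usage sketch for the public helpers defined below (illustration only, not part of the patch); passing nullptr for the stereo parameters falls back to StereoCreateDefaultParameters:

    XMMATRIX projL = StereoProjectionFovLH(nullptr, STEREO_CHANNEL_LEFT,
                                           XM_PIDIV4, 16.0f / 9.0f, 0.1f, 100.0f);
    XMMATRIX projR = StereoProjectionFovLH(nullptr, STEREO_CHANNEL_RIGHT,
                                           XM_PIDIV4, 16.0f / 9.0f, 0.1f, 100.0f);
    // Render the scene once with each matrix to produce the left-eye and
    // right-eye images; STEREO_MODE_NORMAL is the default final argument.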
+//------------------------------------------------------------------------------ + +void DirectX::StereoCreateDefaultParameters +( + STEREO_PARAMETERS& stereoParameters +) +{ + // Default assumption is 1920x1200 resolution, a 22" LCD monitor, and a 2' viewing distance + stereoParameters.fViewerDistanceInches = 24.0f; + stereoParameters.fPixelResolutionWidth = 1920.0f; + stereoParameters.fPixelResolutionHeight = 1200.0f; + stereoParameters.fDisplaySizeInches = 22.0f; + + stereoParameters.fStereoSeparationFactor = 1.0f; + stereoParameters.fStereoExaggerationFactor = 1.0f; +} + +//------------------------------------------------------------------------------ + +XMMATRIX DirectX::StereoProjectionFovLH +( + _In_opt_ const STEREO_PARAMETERS* pStereoParameters, + STEREO_CHANNEL Channel, + float FovAngleY, + float AspectRatio, + float NearZ, + float FarZ, + STEREO_MODE StereoMode +) +{ + assert(Channel == STEREO_CHANNEL_LEFT || Channel == STEREO_CHANNEL_RIGHT); + assert(StereoMode == STEREO_MODE_NORMAL || StereoMode == STEREO_MODE_INVERTED); + assert(!XMScalarNearEqual(FovAngleY, 0.0f, 0.00001f * 2.0f)); + assert(!XMScalarNearEqual(AspectRatio, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + + STEREO_PARAMETERS DefaultParameters = {}; + if (pStereoParameters == nullptr) + { + StereoCreateDefaultParameters(DefaultParameters); + pStereoParameters = &DefaultParameters; + } + + assert(pStereoParameters->fStereoSeparationFactor >= 0.0f && pStereoParameters->fStereoSeparationFactor <= 1.0f); + assert(pStereoParameters->fStereoExaggerationFactor >= 1.0f && pStereoParameters->fStereoExaggerationFactor <= 2.0f); + + float fVirtualProjection = 0.0f; + float zNearWidth = 0.0f; + float zNearHeight = 0.0f; + StereoProjectionHelper(*pStereoParameters, &fVirtualProjection, &zNearWidth, &zNearHeight, FovAngleY, AspectRatio, NearZ); + + fVirtualProjection *= pStereoParameters->fStereoSeparationFactor; // incorporate developer defined bias + + // + // By applying a translation, we are forcing our cameras to be parallel + // + + float fInvertedAngle = atanf(fVirtualProjection / (2.0f * NearZ)); + + XMMATRIX proj = XMMatrixPerspectiveFovLH(FovAngleY, AspectRatio, NearZ, FarZ); + + XMMATRIX patchedProjection; + if (Channel == STEREO_CHANNEL_LEFT) + { + if (StereoMode > STEREO_MODE_NORMAL) + { + XMMATRIX rots = XMMatrixRotationY(fInvertedAngle); + XMMATRIX trans = XMMatrixTranslation(-fVirtualProjection, 0, 0); + patchedProjection = XMMatrixMultiply(XMMatrixMultiply(rots, trans), proj); + } + else + { + XMMATRIX trans = XMMatrixTranslation(-fVirtualProjection, 0, 0); + patchedProjection = XMMatrixMultiply(trans, proj); + } + } + else + { + if (StereoMode > STEREO_MODE_NORMAL) + { + XMMATRIX rots = XMMatrixRotationY(-fInvertedAngle); + XMMATRIX trans = XMMatrixTranslation(fVirtualProjection, 0, 0); + patchedProjection = XMMatrixMultiply(XMMatrixMultiply(rots, trans), proj); + } + else + { + XMMATRIX trans = XMMatrixTranslation(fVirtualProjection, 0, 0); + patchedProjection = XMMatrixMultiply(trans, proj); + } + } + + return patchedProjection; +} + +//------------------------------------------------------------------------------ + +XMMATRIX DirectX::StereoProjectionFovRH +( + _In_opt_ const STEREO_PARAMETERS* pStereoParameters, + STEREO_CHANNEL Channel, + float FovAngleY, + float AspectRatio, + float NearZ, + float FarZ, + STEREO_MODE StereoMode +) +{ + assert(Channel == STEREO_CHANNEL_LEFT || Channel == STEREO_CHANNEL_RIGHT); + assert(StereoMode == STEREO_MODE_NORMAL || StereoMode == 
STEREO_MODE_INVERTED); + assert(!XMScalarNearEqual(FovAngleY, 0.0f, 0.00001f * 2.0f)); + assert(!XMScalarNearEqual(AspectRatio, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + + STEREO_PARAMETERS DefaultParameters = {}; + if (pStereoParameters == nullptr) + { + StereoCreateDefaultParameters(DefaultParameters); + pStereoParameters = &DefaultParameters; + } + + assert(pStereoParameters->fStereoSeparationFactor >= 0.0f && pStereoParameters->fStereoSeparationFactor <= 1.0f); + assert(pStereoParameters->fStereoExaggerationFactor >= 1.0f && pStereoParameters->fStereoExaggerationFactor <= 2.0f); + + float fVirtualProjection = 0.0f; + float zNearWidth = 0.0f; + float zNearHeight = 0.0f; + StereoProjectionHelper(*pStereoParameters, &fVirtualProjection, &zNearWidth, &zNearHeight, FovAngleY, AspectRatio, NearZ); + + fVirtualProjection *= pStereoParameters->fStereoSeparationFactor; // incorporate developer defined bias + + // + // By applying a translation, we are forcing our cameras to be parallel + // + + float fInvertedAngle = atanf(fVirtualProjection / (2.0f * NearZ)); + + XMMATRIX proj = XMMatrixPerspectiveFovRH(FovAngleY, AspectRatio, NearZ, FarZ); + + // + // By applying a translation, we are forcing our cameras to be parallel + // + + XMMATRIX patchedProjection; + if (Channel == STEREO_CHANNEL_LEFT) + { + if (StereoMode > STEREO_MODE_NORMAL) + { + XMMATRIX rots = XMMatrixRotationY(fInvertedAngle); + XMMATRIX trans = XMMatrixTranslation(-fVirtualProjection, 0, 0); + patchedProjection = XMMatrixMultiply(XMMatrixMultiply(rots, trans), proj); + } + else + { + XMMATRIX trans = XMMatrixTranslation(-fVirtualProjection, 0, 0); + patchedProjection = XMMatrixMultiply(trans, proj); + } + } + else + { + if (StereoMode > STEREO_MODE_NORMAL) + { + XMMATRIX rots = XMMatrixRotationY(-fInvertedAngle); + XMMATRIX trans = XMMatrixTranslation(fVirtualProjection, 0, 0); + patchedProjection = XMMatrixMultiply(XMMatrixMultiply(rots, trans), proj); + } + else + { + XMMATRIX trans = XMMatrixTranslation(fVirtualProjection, 0, 0); + patchedProjection = XMMatrixMultiply(trans, proj); + } + } + + return patchedProjection; +} diff --git a/src/thirdparty/DirectXMath-dec2022/Stereo3D/Stereo3DMatrixHelper.h b/src/thirdparty/DirectXMath-dec2022/Stereo3D/Stereo3DMatrixHelper.h new file mode 100644 index 000000000..412d0350a --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/Stereo3D/Stereo3DMatrixHelper.h @@ -0,0 +1,64 @@ +//------------------------------------------------------------------------------------- +// Stereo3DMatrixHelper.h -- SIMD C++ Math helper for Stereo 3D matrices +// +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. +//------------------------------------------------------------------------------------- + +#pragma once + +#include "DirectXMath.h" + +namespace DirectX +{ + // Enumeration for stereo channels (left and right). + enum STEREO_CHANNEL + { + STEREO_CHANNEL_LEFT = 0, + STEREO_CHANNEL_RIGHT + }; + + // Enumeration for stereo mode (normal or inverted). + enum STEREO_MODE + { + STEREO_MODE_NORMAL = 0, + STEREO_MODE_INVERTED, + }; + + //------------------------------------------------------------------------------ + // + // Stereo calibration settings + // + // * Viewer distance to the display + // * Physical display size + // * Render resolution + // + // The stereo separation factor indicates how much separation is between the left and right + // eyes. 0 is no separation, 1 is full separation. It defaults to 1.0. 
+    //
+    // The debug stereo exaggeration factor indicates how much to increase the interocular spacing and
+    // maximum acuity angle from comfortable defaults. For retail builds, this value should always
+    // be 1.0, but during development, on small screens, this value can be raised to up to 2.0 in
+    // order to exaggerate the 3D effect. Values over 1.0 may cause discomfort on normal sized
+    // displays. It defaults to 1.0.
+    //
+    struct STEREO_PARAMETERS
+    {
+        float fViewerDistanceInches;
+        float fDisplaySizeInches;
+        float fPixelResolutionWidth;
+        float fPixelResolutionHeight;
+        float fStereoSeparationFactor;
+        float fStereoExaggerationFactor;
+    };
+
+    void StereoCreateDefaultParameters(STEREO_PARAMETERS& stereoParameters);
+
+    XMMATRIX StereoProjectionFovLH(_In_opt_ const STEREO_PARAMETERS* pStereoParameters,
+        STEREO_CHANNEL Channel, float FovAngleY, float AspectRatio, float NearZ, float FarZ,
+        STEREO_MODE StereoMode = STEREO_MODE_NORMAL);
+
+    XMMATRIX StereoProjectionFovRH(_In_opt_ const STEREO_PARAMETERS* pStereoParameters,
+        STEREO_CHANNEL Channel, float FovAngleY, float AspectRatio, float NearZ, float FarZ,
+        STEREO_MODE StereoMode = STEREO_MODE_NORMAL);
+}
\ No newline at end of file
diff --git a/src/thirdparty/DirectXMath-dec2022/XDSP/XDSP.h b/src/thirdparty/DirectXMath-dec2022/XDSP/XDSP.h
new file mode 100644
index 000000000..eabd64ab0
--- /dev/null
+++ b/src/thirdparty/DirectXMath-dec2022/XDSP/XDSP.h
@@ -0,0 +1,871 @@
+//--------------------------------------------------------------------------------------
+// File: XDSP.h
+//
+// DirectXMath based Digital Signal Processing (DSP) functions for audio,
+// primarily Fast Fourier Transform (FFT)
+//
+// All buffer parameters must be 16-byte aligned
+//
+// All FFT functions support only single-precision floating-point audio
+//
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+//
+// http://go.microsoft.com/fwlink/?LinkID=615557
+//--------------------------------------------------------------------------------------
+
+#pragma once
+
+#include <cassert>
+#include <cstdint>
+
+#include <DirectXMath.h>
+#include <DirectXPackedVector.h>
+
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable: 6001 6262)
+#endif
+
+namespace XDSP
+{
+    using XMVECTOR = DirectX::XMVECTOR;
+    using FXMVECTOR = DirectX::FXMVECTOR;
+    using GXMVECTOR = DirectX::GXMVECTOR;
+    using CXMVECTOR = DirectX::CXMVECTOR;
+    using XMFLOAT4A = DirectX::XMFLOAT4A;
+
+    inline bool ISPOWEROF2(size_t n) { return (((n)&((n)-1)) == 0 && (n) != 0); }
+
+    // Parallel multiplication of four complex numbers, assuming real and imaginary values are stored in separate vectors.
+ inline void XM_CALLCONV vmulComplex( + _Out_ XMVECTOR& rResult, _Out_ XMVECTOR& iResult, + _In_ FXMVECTOR r1, _In_ FXMVECTOR i1, _In_ FXMVECTOR r2, _In_ GXMVECTOR i2) noexcept + { + using namespace DirectX; + // (r1, i1) * (r2, i2) = (r1r2 - i1i2, r1i2 + r2i1) + const XMVECTOR vr1r2 = XMVectorMultiply(r1, r2); + const XMVECTOR vr1i2 = XMVectorMultiply(r1, i2); + rResult = XMVectorNegativeMultiplySubtract(i1, i2, vr1r2); // real: (r1*r2 - i1*i2) + iResult = XMVectorMultiplyAdd(r2, i1, vr1i2); // imaginary: (r1*i2 + r2*i1) + } + + inline void XM_CALLCONV vmulComplex( + _Inout_ XMVECTOR& r1, _Inout_ XMVECTOR& i1, _In_ FXMVECTOR r2, _In_ FXMVECTOR i2) noexcept + { + using namespace DirectX; + // (r1, i1) * (r2, i2) = (r1r2 - i1i2, r1i2 + r2i1) + const XMVECTOR vr1r2 = XMVectorMultiply(r1, r2); + const XMVECTOR vr1i2 = XMVectorMultiply(r1, i2); + r1 = XMVectorNegativeMultiplySubtract(i1, i2, vr1r2); // real: (r1*r2 - i1*i2) + i1 = XMVectorMultiplyAdd(r2, i1, vr1i2); // imaginary: (r1*i2 + r2*i1) + } + + //---------------------------------------------------------------------------------- + // Radix-4 decimation-in-time FFT butterfly. + // This version assumes that all four elements of the butterfly are + // adjacent in a single vector. + // + // Compute the product of the complex input vector and the + // 4-element DFT matrix: + // | 1 1 1 1 | | (r1X,i1X) | + // | 1 -j -1 j | | (r1Y,i1Y) | + // | 1 -1 1 -1 | | (r1Z,i1Z) | + // | 1 j -1 -j | | (r1W,i1W) | + // + // This matrix can be decomposed into two simpler ones to reduce the + // number of additions needed. The decomposed matrices look like this: + // | 1 0 1 0 | | 1 0 1 0 | + // | 0 1 0 -j | | 1 0 -1 0 | + // | 1 0 -1 0 | | 0 1 0 1 | + // | 0 1 0 j | | 0 1 0 -1 | + // + // Combine as follows: + // | 1 0 1 0 | | (r1X,i1X) | | (r1X + r1Z, i1X + i1Z) | + // Temp = | 1 0 -1 0 | * | (r1Y,i1Y) | = | (r1X - r1Z, i1X - i1Z) | + // | 0 1 0 1 | | (r1Z,i1Z) | | (r1Y + r1W, i1Y + i1W) | + // | 0 1 0 -1 | | (r1W,i1W) | | (r1Y - r1W, i1Y - i1W) | + // + // | 1 0 1 0 | | (rTempX,iTempX) | | (rTempX + rTempZ, iTempX + iTempZ) | + // Result = | 0 1 0 -j | * | (rTempY,iTempY) | = | (rTempY + iTempW, iTempY - rTempW) | + // | 1 0 -1 0 | | (rTempZ,iTempZ) | | (rTempX - rTempZ, iTempX - iTempZ) | + // | 0 1 0 j | | (rTempW,iTempW) | | (rTempY - iTempW, iTempY + rTempW) | + //---------------------------------------------------------------------------------- + inline void ButterflyDIT4_1 (_Inout_ XMVECTOR& r1, _Inout_ XMVECTOR& i1) noexcept + { + using namespace DirectX; + + // sign constants for radix-4 butterflies + static const XMVECTORF32 vDFT4SignBits1 = { { { 1.0f, -1.0f, 1.0f, -1.0f } } }; + static const XMVECTORF32 vDFT4SignBits2 = { { { 1.0f, 1.0f, -1.0f, -1.0f } } }; + static const XMVECTORF32 vDFT4SignBits3 = { { { 1.0f, -1.0f, -1.0f, 1.0f } } }; + + // calculating Temp + // [r1X| r1X|r1Y| r1Y] + [r1Z|-r1Z|r1W|-r1W] + // [i1X| i1X|i1Y| i1Y] + [i1Z|-i1Z|i1W|-i1W] + const XMVECTOR r1L = XMVectorSwizzle<0, 0, 1, 1>(r1); + const XMVECTOR r1H = XMVectorSwizzle<2, 2, 3, 3>(r1); + + const XMVECTOR i1L = XMVectorSwizzle<0, 0, 1, 1>(i1); + const XMVECTOR i1H = XMVectorSwizzle<2, 2, 3, 3>(i1); + + const XMVECTOR rTemp = XMVectorMultiplyAdd(r1H, vDFT4SignBits1, r1L); + const XMVECTOR iTemp = XMVectorMultiplyAdd(i1H, vDFT4SignBits1, i1L); + + // calculating Result + const XMVECTOR rZrWiZiW = XMVectorPermute<2, 3, 6, 7>(rTemp, iTemp); // [rTempZ|rTempW|iTempZ|iTempW] + const XMVECTOR rZiWrZiW = XMVectorSwizzle<0, 3, 0, 3>(rZrWiZiW); // 
[rTempZ|iTempW|rTempZ|iTempW] + const XMVECTOR iZrWiZrW = XMVectorSwizzle<2, 1, 2, 1>(rZrWiZiW); // [rTempZ|iTempW|rTempZ|iTempW] + + // [rTempX| rTempY| rTempX| rTempY] + [rTempZ| iTempW|-rTempZ|-iTempW] + // [iTempX| iTempY| iTempX| iTempY] + // [iTempZ|-rTempW|-iTempZ| rTempW] + const XMVECTOR rTempL = XMVectorSwizzle<0, 1, 0, 1>(rTemp); + const XMVECTOR iTempL = XMVectorSwizzle<0, 1, 0, 1>(iTemp); + + r1 = XMVectorMultiplyAdd(rZiWrZiW, vDFT4SignBits2, rTempL); + i1 = XMVectorMultiplyAdd(iZrWiZrW, vDFT4SignBits3, iTempL); + } + + //---------------------------------------------------------------------------------- + // Radix-4 decimation-in-time FFT butterfly. + // This version assumes that elements of the butterfly are + // in different vectors, so that each vector in the input + // contains elements from four different butterflies. + // The four separate butterflies are processed in parallel. + // + // The calculations here are the same as the ones in the single-vector + // radix-4 DFT, but instead of being done on a single vector (X,Y,Z,W) + // they are done in parallel on sixteen independent complex values. + // There is no interdependence between the vector elements: + // | 1 0 1 0 | | (rIn0,iIn0) | | (rIn0 + rIn2, iIn0 + iIn2) | + // | 1 0 -1 0 | * | (rIn1,iIn1) | = Temp = | (rIn0 - rIn2, iIn0 - iIn2) | + // | 0 1 0 1 | | (rIn2,iIn2) | | (rIn1 + rIn3, iIn1 + iIn3) | + // | 0 1 0 -1 | | (rIn3,iIn3) | | (rIn1 - rIn3, iIn1 - iIn3) | + // + // | 1 0 1 0 | | (rTemp0,iTemp0) | | (rTemp0 + rTemp2, iTemp0 + iTemp2) | + // Result = | 0 1 0 -j | * | (rTemp1,iTemp1) | = | (rTemp1 + iTemp3, iTemp1 - rTemp3) | + // | 1 0 -1 0 | | (rTemp2,iTemp2) | | (rTemp0 - rTemp2, iTemp0 - iTemp2) | + // | 0 1 0 j | | (rTemp3,iTemp3) | | (rTemp1 - iTemp3, iTemp1 + rTemp3) | + //---------------------------------------------------------------------------------- + inline void ButterflyDIT4_4( + _Inout_ XMVECTOR& r0, + _Inout_ XMVECTOR& r1, + _Inout_ XMVECTOR& r2, + _Inout_ XMVECTOR& r3, + _Inout_ XMVECTOR& i0, + _Inout_ XMVECTOR& i1, + _Inout_ XMVECTOR& i2, + _Inout_ XMVECTOR& i3, + _In_reads_(uStride * 4) const XMVECTOR* __restrict pUnityTableReal, + _In_reads_(uStride * 4) const XMVECTOR* __restrict pUnityTableImaginary, + _In_ size_t uStride, + _In_ const bool fLast) noexcept + { + using namespace DirectX; + + assert(pUnityTableReal); + assert(pUnityTableImaginary); + assert(reinterpret_cast(pUnityTableReal) % 16 == 0); + assert(reinterpret_cast(pUnityTableImaginary) % 16 == 0); + assert(ISPOWEROF2(uStride)); + + // calculating Temp + const XMVECTOR rTemp0 = XMVectorAdd(r0, r2); + const XMVECTOR iTemp0 = XMVectorAdd(i0, i2); + + const XMVECTOR rTemp2 = XMVectorAdd(r1, r3); + const XMVECTOR iTemp2 = XMVectorAdd(i1, i3); + + const XMVECTOR rTemp1 = XMVectorSubtract(r0, r2); + const XMVECTOR iTemp1 = XMVectorSubtract(i0, i2); + + const XMVECTOR rTemp3 = XMVectorSubtract(r1, r3); + const XMVECTOR iTemp3 = XMVectorSubtract(i1, i3); + + XMVECTOR rTemp4 = XMVectorAdd(rTemp0, rTemp2); + XMVECTOR iTemp4 = XMVectorAdd(iTemp0, iTemp2); + + XMVECTOR rTemp5 = XMVectorAdd(rTemp1, iTemp3); + XMVECTOR iTemp5 = XMVectorSubtract(iTemp1, rTemp3); + + XMVECTOR rTemp6 = XMVectorSubtract(rTemp0, rTemp2); + XMVECTOR iTemp6 = XMVectorSubtract(iTemp0, iTemp2); + + XMVECTOR rTemp7 = XMVectorSubtract(rTemp1, iTemp3); + XMVECTOR iTemp7 = XMVectorAdd(iTemp1, rTemp3); + + // calculating Result + // vmulComplex(rTemp0, iTemp0, rTemp0, iTemp0, pUnityTableReal[0], pUnityTableImaginary[0]); // first one is always trivial + 
vmulComplex(rTemp5, iTemp5, pUnityTableReal[uStride], pUnityTableImaginary[uStride]); + vmulComplex(rTemp6, iTemp6, pUnityTableReal[uStride * 2], pUnityTableImaginary[uStride * 2]); + vmulComplex(rTemp7, iTemp7, pUnityTableReal[uStride * 3], pUnityTableImaginary[uStride * 3]); + + if (fLast) + { + ButterflyDIT4_1(rTemp4, iTemp4); + ButterflyDIT4_1(rTemp5, iTemp5); + ButterflyDIT4_1(rTemp6, iTemp6); + ButterflyDIT4_1(rTemp7, iTemp7); + } + + r0 = rTemp4; i0 = iTemp4; + r1 = rTemp5; i1 = iTemp5; + r2 = rTemp6; i2 = iTemp6; + r3 = rTemp7; i3 = iTemp7; + } + + //================================================================================== + // F-U-N-C-T-I-O-N-S + //================================================================================== + + //---------------------------------------------------------------------------------- + // DESCRIPTION: + // 4-sample FFT. + // + // PARAMETERS: + // pReal - [inout] real components, must have at least uCount elements + // pImaginary - [inout] imaginary components, must have at least uCount elements + // uCount - [in] number of FFT iterations + //---------------------------------------------------------------------------------- + inline void FFT4( + _Inout_updates_(uCount) XMVECTOR* __restrict pReal, + _Inout_updates_(uCount) XMVECTOR* __restrict pImaginary, + const size_t uCount = 1) noexcept + { + assert(pReal); + assert(pImaginary); + assert(reinterpret_cast(pReal) % 16 == 0); + assert(reinterpret_cast(pImaginary) % 16 == 0); + assert(ISPOWEROF2(uCount)); + + for (size_t uIndex = 0; uIndex < uCount; ++uIndex) + { + ButterflyDIT4_1(pReal[uIndex], pImaginary[uIndex]); + } + } + + //---------------------------------------------------------------------------------- + // DESCRIPTION: + // 8-sample FFT. + // + // PARAMETERS: + // pReal - [inout] real components, must have at least uCount*2 elements + // pImaginary - [inout] imaginary components, must have at least uCount*2 elements + // uCount - [in] number of FFT iterations + //---------------------------------------------------------------------------------- + inline void FFT8( + _Inout_updates_(uCount * 2) XMVECTOR* __restrict pReal, + _Inout_updates_(uCount * 2) XMVECTOR* __restrict pImaginary, + _In_ const size_t uCount = 1) noexcept + { + using namespace DirectX; + + assert(pReal); + assert(pImaginary); + assert(reinterpret_cast(pReal) % 16 == 0); + assert(reinterpret_cast(pImaginary) % 16 == 0); + assert(ISPOWEROF2(uCount)); + + static const XMVECTORF32 wr1 = { { { 1.0f, 0.70710677f, 0.0f, -0.70710677f } } }; + static const XMVECTORF32 wi1 = { { { 0.0f, -0.70710677f, -1.0f, -0.70710677f } } }; + static const XMVECTORF32 wr2 = { { { -1.0f, -0.70710677f, 0.0f, 0.70710677f } } }; + static const XMVECTORF32 wi2 = { { { 0.0f, 0.70710677f, 1.0f, 0.70710677f } } }; + + for (size_t uIndex = 0; uIndex < uCount; ++uIndex) + { + XMVECTOR* __restrict pR = pReal + uIndex * 2; + XMVECTOR* __restrict pI = pImaginary + uIndex * 2; + + XMVECTOR oddsR = XMVectorPermute<1, 3, 5, 7>(pR[0], pR[1]); + XMVECTOR evensR = XMVectorPermute<0, 2, 4, 6>(pR[0], pR[1]); + XMVECTOR oddsI = XMVectorPermute<1, 3, 5, 7>(pI[0], pI[1]); + XMVECTOR evensI = XMVectorPermute<0, 2, 4, 6>(pI[0], pI[1]); + ButterflyDIT4_1(oddsR, oddsI); + ButterflyDIT4_1(evensR, evensI); + + XMVECTOR r, i; + vmulComplex(r, i, oddsR, oddsI, wr1, wi1); + pR[0] = XMVectorAdd(evensR, r); + pI[0] = XMVectorAdd(evensI, i); + + vmulComplex(r, i, oddsR, oddsI, wr2, wi2); + pR[1] = XMVectorAdd(evensR, r); + pI[1] = XMVectorAdd(evensI, i); + } + } + + 
//---------------------------------------------------------------------------------- + // DESCRIPTION: + // 16-sample FFT. + // + // PARAMETERS: + // pReal - [inout] real components, must have at least uCount*4 elements + // pImaginary - [inout] imaginary components, must have at least uCount*4 elements + // uCount - [in] number of FFT iterations + //---------------------------------------------------------------------------------- + inline void FFT16( + _Inout_updates_(uCount * 4) XMVECTOR* __restrict pReal, + _Inout_updates_(uCount * 4) XMVECTOR* __restrict pImaginary, + _In_ const size_t uCount = 1) noexcept + { + using namespace DirectX; + + assert(pReal); + assert(pImaginary); + assert(reinterpret_cast(pReal) % 16 == 0); + assert(reinterpret_cast(pImaginary) % 16 == 0); + assert(ISPOWEROF2(uCount)); + + static const XMVECTORF32 aUnityTableReal[4] = { + { { { 1.0f, 1.0f, 1.0f, 1.0f } } }, + { { { 1.0f, 0.92387950f, 0.70710677f, 0.38268343f } } }, + { { { 1.0f, 0.70710677f, -4.3711388e-008f, -0.70710677f } } }, + { { { 1.0f, 0.38268343f, -0.70710677f, -0.92387950f } } } + }; + static const XMVECTORF32 aUnityTableImaginary[4] = + { + { { { -0.0f, -0.0f, -0.0f, -0.0f } } }, + { { { -0.0f, -0.38268343f, -0.70710677f, -0.92387950f } } }, + { { { -0.0f, -0.70710677f, -1.0f, -0.70710677f } } }, + { { { -0.0f, -0.92387950f, -0.70710677f, 0.38268343f } } } + }; + + for (size_t uIndex = 0; uIndex < uCount; ++uIndex) + { + ButterflyDIT4_4(pReal[uIndex * 4], + pReal[uIndex * 4 + 1], + pReal[uIndex * 4 + 2], + pReal[uIndex * 4 + 3], + pImaginary[uIndex * 4], + pImaginary[uIndex * 4 + 1], + pImaginary[uIndex * 4 + 2], + pImaginary[uIndex * 4 + 3], + reinterpret_cast(aUnityTableReal), + reinterpret_cast(aUnityTableImaginary), + 1, true); + } + } + + //---------------------------------------------------------------------------------- + // DESCRIPTION: + // 2^N-sample FFT. + // + // REMARKS: + // For FFTs length 16 and below, call FFT16(), FFT8(), or FFT4(). 
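+ //  The routine performs one radix-4 stage across every butterfly in the buffer, then recurses on
+ //  sub-transforms of length uLength/4 (with uCount*4 iterations) until the hard-coded 4/8/16-point
+ //  kernels above finish the job.
+ //  A minimal 64-point usage sketch (single iteration; all buffers 16-byte aligned, and unswizzle
+ //  output must not overlap its input):
+ //      XMVECTOR unity[64];
+ //      XDSP::FFTInitializeUnityTable(unity, 64);
+ //      XMVECTOR re[16], im[16] = {};            // 64 real samples, imaginary part zeroed
+ //      // ...fill re with time-domain data...
+ //      XDSP::FFT(re, im, unity, 64);
+ //      XMVECTOR reOrdered[16], imOrdered[16];
+ //      XDSP::FFTUnswizzle(reOrdered, re, 6);    // 6 == log2(64)
+ //      XDSP::FFTUnswizzle(imOrdered, im, 6);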
+ // + // PARAMETERS: + // pReal - [inout] real components, must have at least (uLength*uCount)/4 elements + // pImaginary - [inout] imaginary components, must have at least (uLength*uCount)/4 elements + // pUnityTable - [in] unity table, must have at least uLength*uCount elements, see FFTInitializeUnityTable() + // uLength - [in] FFT length in samples, must be a power of 2 > 16 + // uCount - [in] number of FFT iterations + //---------------------------------------------------------------------------------- + inline void FFT ( + _Inout_updates_((uLength * uCount) / 4) XMVECTOR* __restrict pReal, + _Inout_updates_((uLength * uCount) / 4) XMVECTOR* __restrict pImaginary, + _In_reads_(uLength * uCount) const XMVECTOR* __restrict pUnityTable, + _In_ const size_t uLength, + _In_ const size_t uCount = 1) noexcept + { + assert(pReal); + assert(pImaginary); + assert(pUnityTable); + assert(reinterpret_cast(pReal) % 16 == 0); + assert(reinterpret_cast(pImaginary) % 16 == 0); + assert(reinterpret_cast(pUnityTable) % 16 == 0); + assert(uLength > 16); + _Analysis_assume_(uLength > 16); + assert(ISPOWEROF2(uLength)); + assert(ISPOWEROF2(uCount)); + + const XMVECTOR* __restrict pUnityTableReal = pUnityTable; + const XMVECTOR* __restrict pUnityTableImaginary = pUnityTable + (uLength >> 2); + const size_t uTotal = uCount * uLength; + const size_t uTotal_vectors = uTotal >> 2; + const size_t uStage_vectors = uLength >> 2; + const size_t uStage_vectors_mask = uStage_vectors - 1; + const size_t uStride = uLength >> 4; // stride between butterfly elements + const size_t uStrideMask = uStride - 1; + const size_t uStride2 = uStride * 2; + const size_t uStride3 = uStride * 3; + const size_t uStrideInvMask = ~uStrideMask; + + for (size_t uIndex=0; uIndex < (uTotal_vectors >> 2); ++uIndex) + { + const size_t n = ((uIndex & uStrideInvMask) << 2) + (uIndex & uStrideMask); + ButterflyDIT4_4(pReal[n], + pReal[n + uStride], + pReal[n + uStride2], + pReal[n + uStride3], + pImaginary[n ], + pImaginary[n + uStride], + pImaginary[n + uStride2], + pImaginary[n + uStride3], + pUnityTableReal + (n & uStage_vectors_mask), + pUnityTableImaginary + (n & uStage_vectors_mask), + uStride, false); + } + + if (uLength > 16 * 4) + { + FFT(pReal, pImaginary, pUnityTable + (uLength >> 1), uLength >> 2, uCount * 4); + } + else if (uLength == 16 * 4) + { + FFT16(pReal, pImaginary, uCount * 4); + } + else if (uLength == 8 * 4) + { + FFT8(pReal, pImaginary, uCount * 4); + } + else if (uLength == 4 * 4) + { + FFT4(pReal, pImaginary, uCount * 4); + } + } + + //---------------------------------------------------------------------------------- + // DESCRIPTION: + // Initializes unity roots lookup table used by FFT functions. + // Once initialized, the table need not be initialized again unless a + // different FFT length is desired. + // + // REMARKS: + // The unity tables of FFT length 16 and below are hard coded into the + // respective FFT functions and so need not be initialized. 
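+ //  For each recursion level of sample length L the table stores L/4 vectors of real twiddle
+ //  factors followed by L/4 vectors of imaginary ones; the sub-table for the next level (length
+ //  L/4) follows immediately afterwards, which is what FFT() indexes via pUnityTable + (uLength >> 1).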
+ // + // PARAMETERS: + // pUnityTable - [out] unity table, receives unity roots lookup table, must have at least uLength elements + // uLength - [in] FFT length in frames, must be a power of 2 > 16 + //---------------------------------------------------------------------------------- + inline void FFTInitializeUnityTable (_Out_writes_(uLength) XMVECTOR* __restrict pUnityTable, _In_ size_t uLength) noexcept + { + using namespace DirectX; + + assert(pUnityTable); + assert(uLength > 16); + _Analysis_assume_(uLength > 16); + assert(ISPOWEROF2(uLength)); + + // initialize unity table for recursive FFT lengths: uLength, uLength/4, uLength/16... > 16 + // pUnityTable[0 to uLength*4-1] contains real components for current FFT length + // pUnityTable[uLength*4 to uLength*8-1] contains imaginary components for current FFT length + static const XMVECTORF32 vXM0123 = { { { 0.0f, 1.0f, 2.0f, 3.0f } } }; + uLength >>= 2; + XMVECTOR vlStep = XMVectorReplicate(XM_PIDIV2 / float(uLength)); + do + { + uLength >>= 2; + XMVECTOR vJP = vXM0123; + for (size_t j = 0; j < uLength; ++j) + { + XMVECTOR vSin, vCos; + XMVECTOR viJP, vlS; + + pUnityTable[j] = g_XMOne; + pUnityTable[j + uLength * 4] = XMVectorZero(); + + vlS = XMVectorMultiply(vJP, vlStep); + XMVectorSinCos(&vSin, &vCos, vlS); + pUnityTable[j + uLength] = vCos; + pUnityTable[j + uLength * 5] = XMVectorMultiply(vSin, g_XMNegativeOne); + + viJP = XMVectorAdd(vJP, vJP); + vlS = XMVectorMultiply(viJP, vlStep); + XMVectorSinCos(&vSin, &vCos, vlS); + pUnityTable[j + uLength * 2] = vCos; + pUnityTable[j + uLength * 6] = XMVectorMultiply(vSin, g_XMNegativeOne); + + viJP = XMVectorAdd(viJP, vJP); + vlS = XMVectorMultiply(viJP, vlStep); + XMVectorSinCos(&vSin, &vCos, vlS); + pUnityTable[j + uLength * 3] = vCos; + pUnityTable[j + uLength * 7] = XMVectorMultiply(vSin, g_XMNegativeOne); + + vJP = XMVectorAdd(vJP, g_XMFour); + } + vlStep = XMVectorMultiply(vlStep, g_XMFour); + pUnityTable += uLength * 8; + } while (uLength > 4); + } + + //---------------------------------------------------------------------------------- + // DESCRIPTION: + // The FFT functions generate output in bit reversed order. + // Use this function to re-arrange them into order of increasing frequency. + // + // REMARKS: + // Exponential values and bits correspond, so the reversed upper index can be omitted depending on the number of exponents. 
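+ //  Concretely, each output index is the uLog2Length-bit reversal of the input index; the
+ //  byte-wide lookup table below performs that reversal, with separate paths for even and odd
+ //  uLog2Length.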
+ // + // PARAMETERS: + // pOutput - [out] output buffer, receives samples in order of increasing frequency, cannot overlap pInput, must have at least (1<= 2 + //---------------------------------------------------------------------------------- + inline void FFTUnswizzle ( + _Out_writes_((1 << uLog2Length) / 4) XMVECTOR* __restrict pOutput, + _In_reads_((1 << uLog2Length) / 4) const XMVECTOR* __restrict pInput, + _In_ const size_t uLog2Length) noexcept + { + assert(pOutput); + assert(pInput); + assert(uLog2Length >= 2); + _Analysis_assume_(uLog2Length >= 2); + + float* __restrict pfOutput = reinterpret_cast(pOutput); + const size_t uLength = size_t(1) << (uLog2Length - 2); + + static const unsigned char cSwizzleTable[256] = { + 0x00, 0x40, 0x80, 0xC0, 0x10, 0x50, 0x90, 0xD0, 0x20, 0x60, 0xA0, 0xE0, 0x30, 0x70, 0xB0, 0xF0, + 0x04, 0x44, 0x84, 0xC4, 0x14, 0x54, 0x94, 0xD4, 0x24, 0x64, 0xA4, 0xE4, 0x34, 0x74, 0xB4, 0xF4, + 0x08, 0x48, 0x88, 0xC8, 0x18, 0x58, 0x98, 0xD8, 0x28, 0x68, 0xA8, 0xE8, 0x38, 0x78, 0xB8, 0xF8, + 0x0C, 0x4C, 0x8C, 0xCC, 0x1C, 0x5C, 0x9C, 0xDC, 0x2C, 0x6C, 0xAC, 0xEC, 0x3C, 0x7C, 0xBC, 0xFC, + 0x01, 0x41, 0x81, 0xC1, 0x11, 0x51, 0x91, 0xD1, 0x21, 0x61, 0xA1, 0xE1, 0x31, 0x71, 0xB1, 0xF1, + 0x05, 0x45, 0x85, 0xC5, 0x15, 0x55, 0x95, 0xD5, 0x25, 0x65, 0xA5, 0xE5, 0x35, 0x75, 0xB5, 0xF5, + 0x09, 0x49, 0x89, 0xC9, 0x19, 0x59, 0x99, 0xD9, 0x29, 0x69, 0xA9, 0xE9, 0x39, 0x79, 0xB9, 0xF9, + 0x0D, 0x4D, 0x8D, 0xCD, 0x1D, 0x5D, 0x9D, 0xDD, 0x2D, 0x6D, 0xAD, 0xED, 0x3D, 0x7D, 0xBD, 0xFD, + 0x02, 0x42, 0x82, 0xC2, 0x12, 0x52, 0x92, 0xD2, 0x22, 0x62, 0xA2, 0xE2, 0x32, 0x72, 0xB2, 0xF2, + 0x06, 0x46, 0x86, 0xC6, 0x16, 0x56, 0x96, 0xD6, 0x26, 0x66, 0xA6, 0xE6, 0x36, 0x76, 0xB6, 0xF6, + 0x0A, 0x4A, 0x8A, 0xCA, 0x1A, 0x5A, 0x9A, 0xDA, 0x2A, 0x6A, 0xAA, 0xEA, 0x3A, 0x7A, 0xBA, 0xFA, + 0x0E, 0x4E, 0x8E, 0xCE, 0x1E, 0x5E, 0x9E, 0xDE, 0x2E, 0x6E, 0xAE, 0xEE, 0x3E, 0x7E, 0xBE, 0xFE, + 0x03, 0x43, 0x83, 0xC3, 0x13, 0x53, 0x93, 0xD3, 0x23, 0x63, 0xA3, 0xE3, 0x33, 0x73, 0xB3, 0xF3, + 0x07, 0x47, 0x87, 0xC7, 0x17, 0x57, 0x97, 0xD7, 0x27, 0x67, 0xA7, 0xE7, 0x37, 0x77, 0xB7, 0xF7, + 0x0B, 0x4B, 0x8B, 0xCB, 0x1B, 0x5B, 0x9B, 0xDB, 0x2B, 0x6B, 0xAB, 0xEB, 0x3B, 0x7B, 0xBB, 0xFB, + 0x0F, 0x4F, 0x8F, 0xCF, 0x1F, 0x5F, 0x9F, 0xDF, 0x2F, 0x6F, 0xAF, 0xEF, 0x3F, 0x7F, 0xBF, 0xFF + }; + if ((uLog2Length & 1) == 0) + { + // even powers of two + const size_t uRev32 = 32 - uLog2Length; + for (size_t uIndex = 0; uIndex < uLength; ++uIndex) + { + XMFLOAT4A f4a; + XMStoreFloat4A(&f4a, pInput[uIndex]); + const size_t n = uIndex * 4; + const size_t uAddr = (static_cast(cSwizzleTable[n & 0xff]) << 24) | + (static_cast(cSwizzleTable[(n >> 8) & 0xff]) << 16) | + (static_cast(cSwizzleTable[(n >> 16) & 0xff]) << 8) | + (static_cast(cSwizzleTable[(n >> 24)])); + pfOutput[uAddr >> uRev32] = f4a.x; + pfOutput[(0x40000000 | uAddr) >> uRev32] = f4a.y; + pfOutput[(0x80000000 | uAddr) >> uRev32] = f4a.z; + pfOutput[(0xC0000000 | uAddr) >> uRev32] = f4a.w; + } + } + else + { + // odd powers of two + const size_t uRev7 = size_t(1) << (uLog2Length - 3); + const size_t uRev32 = 32 - (uLog2Length - 3); + for (size_t uIndex = 0; uIndex < uLength; ++uIndex) + { + XMFLOAT4A f4a; + XMStoreFloat4A(&f4a, pInput[uIndex]); + const size_t n = (uIndex >> 1); + size_t uAddr = (((static_cast(cSwizzleTable[n & 0xff]) << 24) | + (static_cast(cSwizzleTable[(n >> 8) & 0xff]) << 16) | + (static_cast(cSwizzleTable[(n >> 16) & 0xff]) << 8) | + (static_cast(cSwizzleTable[(n >> 24)]))) >> uRev32) | + ((uIndex & 1) * uRev7 * 4); + pfOutput[uAddr] = 
f4a.x; + uAddr += uRev7; + pfOutput[uAddr] = f4a.y; + uAddr += uRev7; + pfOutput[uAddr] = f4a.z; + uAddr += uRev7; + pfOutput[uAddr] = f4a.w; + } + } + } + + //---------------------------------------------------------------------------------- + // DESCRIPTION: + // Convert complex components to polar form. + // + // PARAMETERS: + // pOutput - [out] output buffer, receives samples in polar form, must have at least uLength/4 elements + // pInputReal - [in] input buffer (real components), must have at least uLength/4 elements + // pInputImaginary - [in] input buffer (imaginary components), must have at least uLength/4 elements + // uLength - [in] FFT length in samples, must be a power of 2 >= 4 + //---------------------------------------------------------------------------------- +#ifdef _MSC_VER +#pragma warning(suppress: 6101) +#endif + inline void FFTPolar( + _Out_writes_(uLength / 4) XMVECTOR* __restrict pOutput, + _In_reads_(uLength / 4) const XMVECTOR* __restrict pInputReal, + _In_reads_(uLength / 4) const XMVECTOR* __restrict pInputImaginary, + _In_ const size_t uLength) noexcept + { + using namespace DirectX; + + assert(pOutput); + assert(pInputReal); + assert(pInputImaginary); + assert(uLength >= 4); + _Analysis_assume_(uLength >= 4); + assert(ISPOWEROF2(uLength)); + + const float flOneOverLength = 1.0f / float(uLength); + + // result = sqrtf((real/uLength)^2 + (imaginary/uLength)^2) * 2 + const XMVECTOR vOneOverLength = XMVectorReplicate(flOneOverLength); + + for (size_t uIndex = 0; uIndex < (uLength >> 2); ++uIndex) + { + XMVECTOR vReal = XMVectorMultiply(pInputReal[uIndex], vOneOverLength); + XMVECTOR vImaginary = XMVectorMultiply(pInputImaginary[uIndex], vOneOverLength); + XMVECTOR vRR = XMVectorMultiply(vReal, vReal); + XMVECTOR vII = XMVectorMultiply(vImaginary, vImaginary); + XMVECTOR vRRplusII = XMVectorAdd(vRR, vII); + XMVECTOR vTotal = XMVectorSqrt(vRRplusII); + pOutput[uIndex] = XMVectorAdd(vTotal, vTotal); + } + } + + //---------------------------------------------------------------------------------- + // DESCRIPTION: + // Deinterleaves audio samples + // + // REMARKS: + // For example, audio of the form [LRLRLR] becomes [LLLRRR]. 
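+ //  Channel c of the output occupies frames [c*uFrameCount, (c+1)*uFrameCount).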
+ // + // PARAMETERS: + // pOutput - [out] output buffer, receives samples in deinterleaved form, cannot overlap pInput, must have at least (uChannelCount*uFrameCount)/4 elements + // pInput - [in] input buffer, cannot overlap pOutput, must have at least (uChannelCount*uFrameCount)/4 elements + // uChannelCount - [in] number of channels, must be > 1 + // uFrameCount - [in] number of frames of valid data, must be > 0 + //---------------------------------------------------------------------------------- + inline void Deinterleave ( + _Out_writes_((uChannelCount * uFrameCount) / 4) XMVECTOR* __restrict pOutput, + _In_reads_((uChannelCount * uFrameCount) / 4) const XMVECTOR* __restrict pInput, + _In_ const size_t uChannelCount, + _In_ const size_t uFrameCount) noexcept + { + assert(pOutput); + assert(pInput); + assert(uChannelCount > 1); + assert(uFrameCount > 0); + + float* __restrict pfOutput = reinterpret_cast(pOutput); + const float* __restrict pfInput = reinterpret_cast(pInput); + + for (size_t uChannel = 0; uChannel < uChannelCount; ++uChannel) + { + for (size_t uFrame = 0; uFrame < uFrameCount; ++uFrame) + { + pfOutput[uChannel * uFrameCount + uFrame] = pfInput[uFrame * uChannelCount + uChannel]; + } + } + } + + //---------------------------------------------------------------------------------- + // DESCRIPTION: + // Interleaves audio samples + // + // REMARKS: + // For example, audio of the form [LLLRRR] becomes [LRLRLR]. + // + // PARAMETERS: + // pOutput - [out] output buffer, receives samples in interleaved form, cannot overlap pInput, must have at least (uChannelCount*uFrameCount)/4 elements + // pInput - [in] input buffer, cannot overlap pOutput, must have at least (uChannelCount*uFrameCount)/4 elements + // uChannelCount - [in] number of channels, must be > 1 + // uFrameCount - [in] number of frames of valid data, must be > 0 + //---------------------------------------------------------------------------------- + inline void Interleave( + _Out_writes_((uChannelCount * uFrameCount) / 4) XMVECTOR* __restrict pOutput, + _In_reads_((uChannelCount * uFrameCount) / 4) const XMVECTOR* __restrict pInput, + _In_ const size_t uChannelCount, + _In_ const size_t uFrameCount) noexcept + { + assert(pOutput); + assert(pInput); + assert(uChannelCount > 1); + assert(uFrameCount > 0); + + float* __restrict pfOutput = reinterpret_cast(pOutput); + const float* __restrict pfInput = reinterpret_cast(pInput); + + for (size_t uChannel = 0; uChannel < uChannelCount; ++uChannel) + { + for (size_t uFrame = 0; uFrame < uFrameCount; ++uFrame) + { + pfOutput[uFrame * uChannelCount + uChannel] = pfInput[uChannel * uFrameCount + uFrame]; + } + } + } + + //---------------------------------------------------------------------------------- + // DESCRIPTION: + // This function applies a 2^N-sample FFT and unswizzles the result such + // that the samples are in order of increasing frequency. + // Audio is first deinterleaved if multichannel. 
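+ //  The 768-vector scratch buffers used internally are sized for the maximum supported case of
+ //  6 channels at 2^9 samples per channel.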
+    //
+    // PARAMETERS:
+    //  pReal - [inout] real components, must have at least (1<<uLog2Length)*uChannelCount/4 elements
+    //  pImaginary - [out] imaginary components, must have at least (1<<uLog2Length)*uChannelCount/4 elements
+    //  pUnityTable - [in] unity table, must have at least (1<<uLog2Length) elements, see FFTInitializeUnityTable()
+    //  uChannelCount - [in] number of channels, must be > 0
+    //  uLog2Length - [in] LOG (base 2) of FFT length in frames, must within [2, 9]
+    //----------------------------------------------------------------------------------
+    inline void FFTInterleaved(
+        _Inout_updates_(((1 << uLog2Length) * uChannelCount) / 4) XMVECTOR* __restrict pReal,
+        _Out_writes_(((1 << uLog2Length) * uChannelCount) / 4) XMVECTOR* __restrict pImaginary,
+        _In_reads_(1 << uLog2Length) const XMVECTOR* __restrict pUnityTable,
+        _In_ const size_t uChannelCount,
+        _In_ const size_t uLog2Length) noexcept
+    {
+        using namespace DirectX;
+
+        assert(pReal);
+        assert(pImaginary);
+        assert(pUnityTable);
+        assert(reinterpret_cast<uintptr_t>(pReal) % 16 == 0);
+        assert(reinterpret_cast<uintptr_t>(pImaginary) % 16 == 0);
+        assert(reinterpret_cast<uintptr_t>(pUnityTable) % 16 == 0);
+        assert(uChannelCount > 0 && uChannelCount <= 6);
+        assert(uLog2Length >= 2 && uLog2Length <= 9);
+
+        XMVECTOR vRealTemp[768];
+        XMVECTOR vImaginaryTemp[768];
+        const size_t uLength = size_t(1) << uLog2Length;
+
+        if (uChannelCount > 1)
+        {
+            Deinterleave(vRealTemp, pReal, uChannelCount, uLength);
+        }
+        else
+        {
+            memcpy_s(vRealTemp, sizeof(vRealTemp), pReal, (uLength >> 2) * sizeof(XMVECTOR));
+        }
+
+        memset(vImaginaryTemp, 0, (uChannelCount * (uLength >> 2)) * sizeof(XMVECTOR));
+
+        if (uLength > 16)
+        {
+            for (size_t uChannel = 0; uChannel < uChannelCount; ++uChannel)
+            {
+                FFT(&vRealTemp[uChannel * (uLength >> 2)], &vImaginaryTemp[uChannel * (uLength >> 2)], pUnityTable, uLength);
+            }
+        }
+        else if (uLength == 16)
+        {
+            for (size_t uChannel = 0; uChannel < uChannelCount; ++uChannel)
+            {
+                FFT16(&vRealTemp[uChannel * (uLength >> 2)], &vImaginaryTemp[uChannel * (uLength >> 2)]);
+            }
+        }
+        else if (uLength == 8)
+        {
+            for (size_t uChannel = 0; uChannel < uChannelCount; ++uChannel)
+            {
+                FFT8(&vRealTemp[uChannel * (uLength >> 2)], &vImaginaryTemp[uChannel * (uLength >> 2)]);
+            }
+        }
+        else if (uLength == 4)
+        {
+            for (size_t uChannel = 0; uChannel < uChannelCount; ++uChannel)
+            {
+                FFT4(&vRealTemp[uChannel * (uLength >> 2)], &vImaginaryTemp[uChannel * (uLength >> 2)]);
+            }
+        }
+
+        for (size_t uChannel = 0; uChannel < uChannelCount; ++uChannel)
+        {
+            FFTUnswizzle(&pReal[uChannel * (uLength >> 2)], &vRealTemp[uChannel * (uLength >> 2)], uLog2Length);
+            FFTUnswizzle(&pImaginary[uChannel * (uLength >> 2)], &vImaginaryTemp[uChannel * (uLength >> 2)], uLog2Length);
+        }
+    }
+
+    //----------------------------------------------------------------------------------
+    // DESCRIPTION:
+    //  This function applies a 2^N-sample inverse FFT.
+    //  Audio is interleaved if multichannel.
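+ //  The inverse is computed by scaling the input by 1/N, negating the imaginary part (a complex
+ //  conjugate), and reusing the forward FFT and unswizzle kernels; only the real result is kept.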
+ // + // PARAMETERS: + // pReal - [inout] real components, must have at least (1< 0 + // uLog2Length - [in] LOG (base 2) of FFT length in frames, must within [2, 9] + //---------------------------------------------------------------------------------- + inline void IFFTDeinterleaved( + _Inout_updates_(((1 << uLog2Length) * uChannelCount) / 4) XMVECTOR* __restrict pReal, + _In_reads_(((1 << uLog2Length) * uChannelCount) / 4) const XMVECTOR* __restrict pImaginary, + _In_reads_(1 << uLog2Length) const XMVECTOR* __restrict pUnityTable, + _In_ const size_t uChannelCount, + _In_ const size_t uLog2Length) noexcept + { + using namespace DirectX; + + assert(pReal); + assert(pImaginary); + assert(pUnityTable); + assert(reinterpret_cast(pReal) % 16 == 0); + assert(reinterpret_cast(pImaginary) % 16 == 0); + assert(reinterpret_cast(pUnityTable) % 16 == 0); + assert(uChannelCount > 0 && uChannelCount <= 6); + _Analysis_assume_(uChannelCount > 0 && uChannelCount <= 6); + assert(uLog2Length >= 2 && uLog2Length <= 9); + _Analysis_assume_(uLog2Length >= 2 && uLog2Length <= 9); + + XMVECTOR vRealTemp[768] = {}; + XMVECTOR vImaginaryTemp[768] = {}; + + const size_t uLength = size_t(1) << uLog2Length; + + const XMVECTOR vRnp = XMVectorReplicate(1.0f / float(uLength)); + const XMVECTOR vRnm = XMVectorReplicate(-1.0f / float(uLength)); + for (size_t u = 0; u < uChannelCount * (uLength >> 2); u++) + { + vRealTemp[u] = XMVectorMultiply(pReal[u], vRnp); + vImaginaryTemp[u] = XMVectorMultiply(pImaginary[u], vRnm); + } + + if (uLength > 16) + { + for (size_t uChannel = 0; uChannel < uChannelCount; ++uChannel) + { + FFT(&vRealTemp[uChannel * (uLength >> 2)], &vImaginaryTemp[uChannel * (uLength >> 2)], pUnityTable, uLength); + } + } + else if (uLength == 16) + { + for (size_t uChannel = 0; uChannel < uChannelCount; ++uChannel) + { + FFT16(&vRealTemp[uChannel * (uLength >> 2)], &vImaginaryTemp[uChannel * (uLength >> 2)]); + } + } + else if (uLength == 8) + { + for (size_t uChannel = 0; uChannel < uChannelCount; ++uChannel) + { + FFT8(&vRealTemp[uChannel * (uLength >> 2)], &vImaginaryTemp[uChannel * (uLength >> 2)]); + } + } + else if (uLength == 4) + { + for (size_t uChannel = 0; uChannel < uChannelCount; ++uChannel) + { + FFT4(&vRealTemp[uChannel * (uLength >> 2)], &vImaginaryTemp[uChannel * (uLength >> 2)]); + } + } + + for (size_t uChannel = 0; uChannel < uChannelCount; ++uChannel) + { + FFTUnswizzle(&vImaginaryTemp[uChannel * (uLength >> 2)], &vRealTemp[uChannel * (uLength >> 2)], uLog2Length); + } + + if (uChannelCount > 1) + { + Interleave(pReal, vImaginaryTemp, uChannelCount, uLength); + } + else + { + memcpy_s(pReal, uLength * uChannelCount * sizeof(float), vImaginaryTemp, (uLength >> 2) * sizeof(XMVECTOR)); + } + } + +} // namespace XDSP + +#ifdef _MSC_VER +#pragma warning(pop) +#endif diff --git a/src/thirdparty/DirectXMath-dec2022/build/DirectXMath-GitHub-CMake-Dev17.yml b/src/thirdparty/DirectXMath-dec2022/build/DirectXMath-GitHub-CMake-Dev17.yml new file mode 100644 index 000000000..4cff817cf --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/build/DirectXMath-GitHub-CMake-Dev17.yml @@ -0,0 +1,119 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# +# http://go.microsoft.com/fwlink/?LinkID=615560 + +# Builds the library and test suite using CMake. 
+ +schedules: +- cron: "0 0 * * *" + displayName: 'Nightly build' + branches: + include: + - main + +resources: + repositories: + - repository: self + type: git + ref: refs/heads/main + trigger: none + +name: $(Year:yyyy).$(Month).$(DayOfMonth)$(Rev:.r) + +variables: + VS_GENERATOR: 'Visual Studio 17 2022' + WIN10_SDK: '10.0.19041.0' + WIN11_SDK: '10.0.22000.0' + GITHUB_PAT: $(GITHUBPUBLICTOKEN) + +pool: + vmImage: windows-2022 + +jobs: +- job: CMAKE_BUILD + displayName: CMake using VS Generator BUILD_TESTING=ON + cancelTimeoutInMinutes: 1 + steps: + - checkout: self + clean: true + fetchTags: false + - task: CmdLine@2 + displayName: Fetch Tests + inputs: + script: git clone --quiet https://%GITHUB_PAT%@github.com/walbourn/directxmathtest.git Tests + - task: CMake@1 + displayName: 'CMake (MSVC): Config x64' + inputs: + cwd: '$(Build.SourcesDirectory)' + cmakeArgs: '-G "$(VS_GENERATOR)" -A x64 -B out -DCMAKE_SYSTEM_VERSION=$(WIN10_SDK)' + - task: CMake@1 + displayName: 'CMake (MSVC): Build x64 Debug' + inputs: + cwd: '$(Build.SourcesDirectory)' + cmakeArgs: --build out -v --config Debug + - task: CMake@1 + displayName: 'CMake (MSVC): Build x64 Release' + inputs: + cwd: '$(Build.SourcesDirectory)' + cmakeArgs: --build out -v --config RelWithDebInfo + - task: CMake@1 + displayName: 'CMake (MSVC): Config x86' + inputs: + cwd: '$(Build.SourcesDirectory)' + cmakeArgs: '-G "$(VS_GENERATOR)" -A Win32 -B out2 -DCMAKE_SYSTEM_VERSION=$(WIN10_SDK)' + - task: CMake@1 + displayName: 'CMake (MSVC): Build x86 Debug' + inputs: + cwd: '$(Build.SourcesDirectory)' + cmakeArgs: --build out2 -v --config Debug + - task: CMake@1 + displayName: 'CMake (MSVC): Build x86 Release' + inputs: + cwd: '$(Build.SourcesDirectory)' + cmakeArgs: --build out2 -v --config RelWithDebInfo + - task: CMake@1 + displayName: 'CMake (MSVC): Config ARM64' + inputs: + cwd: '$(Build.SourcesDirectory)' + cmakeArgs: '-G "$(VS_GENERATOR)" -A ARM64 -B out3 -DCMAKE_SYSTEM_VERSION=$(WIN10_SDK)' + - task: CMake@1 + displayName: 'CMake (MSVC): Build ARM64 Debug' + inputs: + cwd: '$(Build.SourcesDirectory)' + cmakeArgs: --build out3 -v --config Debug + - task: CMake@1 + displayName: 'CMake (MSVC): Build ARM64 Release' + inputs: + cwd: '$(Build.SourcesDirectory)' + cmakeArgs: --build out3 -v --config RelWithDebInfo + - task: CMake@1 + displayName: 'CMake (ClangCl): Config x64' + inputs: + cwd: '$(Build.SourcesDirectory)' + cmakeArgs: '-G "$(VS_GENERATOR)" -A x64 -T clangcl -B out4 -DCMAKE_SYSTEM_VERSION=$(WIN10_SDK)' + - task: CMake@1 + displayName: 'CMake (ClangCl): Build x64 Debug' + inputs: + cwd: '$(Build.SourcesDirectory)' + cmakeArgs: --build out4 -v --config Debug + - task: CMake@1 + displayName: 'CMake (ClangCl): Build x64 Release' + inputs: + cwd: '$(Build.SourcesDirectory)' + cmakeArgs: --build out4 -v --config RelWithDebInfo + - task: CMake@1 + displayName: 'CMake (ClangCl): Config ARM64' + inputs: + cwd: '$(Build.SourcesDirectory)' + cmakeArgs: '-G "$(VS_GENERATOR)" -A ARM64 -T clangcl -B out5 -DCMAKE_SYSTEM_VERSION=$(WIN11_SDK)' + - task: CMake@1 + displayName: 'CMake (ClangCl): Build ARM64 Debug' + inputs: + cwd: '$(Build.SourcesDirectory)' + cmakeArgs: --build out5 -v --config Debug + - task: CMake@1 + displayName: 'CMake (ClangCl): Build ARM64 Release' + inputs: + cwd: '$(Build.SourcesDirectory)' + cmakeArgs: --build out5 -v --config RelWithDebInfo diff --git a/src/thirdparty/DirectXMath-dec2022/build/DirectXMath-GitHub-CMake.yml b/src/thirdparty/DirectXMath-dec2022/build/DirectXMath-GitHub-CMake.yml new file mode 100644 index 
000000000..1c4e4cd43 --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/build/DirectXMath-GitHub-CMake.yml @@ -0,0 +1,103 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# +# http://go.microsoft.com/fwlink/?LinkID=615560 + +# Builds the library and test suite using CMake. + +schedules: +- cron: "0 0 * * *" + displayName: 'Nightly build' + branches: + include: + - main + +resources: + repositories: + - repository: self + type: git + ref: refs/heads/main + trigger: none + +name: $(Year:yyyy).$(Month).$(DayOfMonth)$(Rev:.r) + +variables: + VS_GENERATOR: 'Visual Studio 16 2019' + WIN10_SDK: '10.0.19041.0' + GITHUB_PAT: $(GITHUBPUBLICTOKEN) + +pool: + vmImage: windows-2019 + +jobs: +- job: CMAKE_BUILD + displayName: CMake using VS Generator + cancelTimeoutInMinutes: 1 + steps: + - checkout: self + clean: true + fetchTags: false + - task: CmdLine@2 + displayName: Fetch Tests + inputs: + script: git clone --quiet https://%GITHUB_PAT%@github.com/walbourn/directxmathtest.git Tests + - task: CMake@1 + displayName: CMake (MSVC x64) + inputs: + cwd: '$(Build.SourcesDirectory)' + cmakeArgs: '-G "$(VS_GENERATOR)" -A x64 -B out -DCMAKE_SYSTEM_VERSION=$(WIN10_SDK)' + - task: CMake@1 + displayName: CMake (Build x64) + inputs: + cwd: '$(Build.SourcesDirectory)' + cmakeArgs: --build out -v + - task: CMake@1 + displayName: CMake Test (MSVC x64) + inputs: + cwd: Tests + cmakeArgs: '-G "$(VS_GENERATOR)" -A x64 -B out -DCMAKE_SYSTEM_VERSION=$(WIN10_SDK)' + - task: CMake@1 + displayName: CMake Test (Build x64) + inputs: + cwd: Tests + cmakeArgs: --build out -v + - task: CMake@1 + displayName: CMake (MSVC ARM64) + inputs: + cwd: '$(Build.SourcesDirectory)' + cmakeArgs: '-G "$(VS_GENERATOR)" -A ARM64 -B out2 -DCMAKE_SYSTEM_VERSION=$(WIN10_SDK)' + - task: CMake@1 + displayName: CMake (Build ARM64) + inputs: + cwd: '$(Build.SourcesDirectory)' + cmakeArgs: --build out2 -v + - task: CMake@1 + displayName: CMake Test (MSVC ARM64) + inputs: + cwd: Tests + cmakeArgs: '-G "$(VS_GENERATOR)" -A ARM64 -B out2 -DCMAKE_SYSTEM_VERSION=$(WIN10_SDK)' + - task: CMake@1 + displayName: CMake Test (Build ARM64) + inputs: + cwd: Tests + cmakeArgs: --build out2 -v + - task: CMake@1 + displayName: CMake (ClangCl) + inputs: + cwd: '$(Build.SourcesDirectory)' + cmakeArgs: '-G "$(VS_GENERATOR)" -A x64 -T clangcl -B out3 -DCMAKE_SYSTEM_VERSION=$(WIN10_SDK)' + - task: CMake@1 + displayName: CMake (Build) + inputs: + cwd: '$(Build.SourcesDirectory)' + cmakeArgs: --build out3 -v + - task: CMake@1 + displayName: CMake Test (ClangCL) + inputs: + cwd: Tests + cmakeArgs: '-G "$(VS_GENERATOR)" -A x64 -T clangcl -B out3 -DCMAKE_SYSTEM_VERSION=$(WIN10_SDK)' + - task: CMake@1 + displayName: CMake Test (Build) + inputs: + cwd: Tests + cmakeArgs: --build out3 -v diff --git a/src/thirdparty/DirectXMath-dec2022/build/DirectXMath-GitHub-Dev17.yml b/src/thirdparty/DirectXMath-dec2022/build/DirectXMath-GitHub-Dev17.yml new file mode 100644 index 000000000..ae7ea23d2 --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/build/DirectXMath-GitHub-Dev17.yml @@ -0,0 +1,296 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# +# http://go.microsoft.com/fwlink/?LinkID=615560 + +# Builds the math3 test suite for DirectXMath. 
+ +schedules: +- cron: "0 0 * * *" + displayName: 'Nightly build' + branches: + include: + - main + +resources: + repositories: + - repository: self + type: git + ref: refs/heads/main + trigger: + branches: + include: + - main + paths: + exclude: + - README.md + - HISTORY.md + - SECURITY.md + +name: $(Year:yyyy).$(Month).$(DayOfMonth)$(Rev:.r) + +pool: + vmImage: windows-2022 + +variables: + GITHUB_PAT: $(GITHUBPUBLICTOKEN) + +jobs: +- job: BUILD_DEV17 + displayName: 'Visual Studio 2022 (v143)' + cancelTimeoutInMinutes: 1 + steps: + - checkout: self + clean: true + fetchTags: false + - task: DeleteFiles@1 + displayName: Delete files from Tests + inputs: + SourceFolder: Tests + Contents: '**' + RemoveSourceFolder: true + RemoveDotFiles: true + - task: CmdLine@2 + displayName: Fetch Tests + inputs: + script: git clone --quiet https://%GITHUB_PAT%@github.com/walbourn/directxmathtest.git Tests + - task: VSBuild@1 + displayName: Build solution math3_2022.sln x86dbg + inputs: + solution: Tests/math3/math3_2022.sln + vsVersion: 17.0 + platform: x86 + configuration: Debug + msbuildArchitecture: x64 + - task: VSBuild@1 + displayName: Build solution math3_2022.sln x86rel + inputs: + solution: Tests/math3/math3_2022.sln + vsVersion: 17.0 + platform: x86 + configuration: Release + msbuildArchitecture: x64 + - task: VSBuild@1 + displayName: Build solution math3_2022.sln x64dbg + inputs: + solution: Tests/math3/math3_2022.sln + vsVersion: 17.0 + platform: x64 + configuration: Debug + msbuildArchitecture: x64 + - task: VSBuild@1 + displayName: Build solution math3_2022.sln x64rel + inputs: + solution: Tests/math3/math3_2022.sln + vsVersion: 17.0 + platform: x64 + configuration: Release + msbuildArchitecture: x64 + - task: VSBuild@1 + displayName: Build solution math3_2022.sln arm64dbg + inputs: + solution: Tests/math3/math3_2022.sln + vsVersion: 17.0 + platform: ARM64 + configuration: Debug + msbuildArchitecture: x64 + - task: VSBuild@1 + displayName: Build solution math3_2022.sln arm64rel + inputs: + solution: Tests/math3/math3_2022.sln + vsVersion: 17.0 + platform: ARM64 + configuration: Release + msbuildArchitecture: x64 + - task: VSBuild@1 + displayName: Build solution math3_2022.sln x86dbg sse3 + inputs: + solution: Tests/math3/math3_2022.sln + vsVersion: 17.0 + platform: x86 + configuration: SSE3 Debug + msbuildArchitecture: x64 + - task: VSBuild@1 + displayName: Build solution math3_2022.sln x86rel sse3 + inputs: + solution: Tests/math3/math3_2022.sln + vsVersion: 17.0 + platform: x86 + configuration: SSE3 Release + msbuildArchitecture: x64 + - task: VSBuild@1 + displayName: Build solution math3_2022.sln x64dbg sse3 + inputs: + solution: Tests/math3/math3_2022.sln + vsVersion: 17.0 + platform: x64 + configuration: SSE3 Debug + msbuildArchitecture: x64 + - task: VSBuild@1 + displayName: Build solution math3_2022.sln x64rel sse3 + inputs: + solution: Tests/math3/math3_2022.sln + vsVersion: 17.0 + platform: x64 + configuration: SSE3 Release + msbuildArchitecture: x64 + - task: VSBuild@1 + displayName: Build solution math3_2022.sln x86dbg sse4 + inputs: + solution: Tests/math3/math3_2022.sln + vsVersion: 17.0 + platform: x86 + configuration: SSE4 Debug + msbuildArchitecture: x64 + - task: VSBuild@1 + displayName: Build solution math3_2022.sln x86rel sse4 + inputs: + solution: Tests/math3/math3_2022.sln + vsVersion: 17.0 + platform: x86 + configuration: SSE4 Release + msbuildArchitecture: x64 + - task: VSBuild@1 + displayName: Build solution math3_2022.sln x64dbg sse4 + inputs: + solution: 
Tests/math3/math3_2022.sln + vsVersion: 17.0 + platform: x64 + configuration: SSE4 Debug + msbuildArchitecture: x64 + - task: VSBuild@1 + displayName: Build solution math3_2022.sln x64rel sse4 + inputs: + solution: Tests/math3/math3_2022.sln + vsVersion: 17.0 + platform: x64 + configuration: SSE4 Release + msbuildArchitecture: x64 + - task: VSBuild@1 + displayName: Build solution math3_2022.sln x86dbg avx + inputs: + solution: Tests/math3/math3_2022.sln + vsVersion: 17.0 + platform: x86 + configuration: AVX Debug + msbuildArchitecture: x64 + - task: VSBuild@1 + displayName: Build solution math3_2022.sln x86rel avx + inputs: + solution: Tests/math3/math3_2022.sln + vsVersion: 17.0 + platform: x86 + configuration: AVX Release + msbuildArchitecture: x64 + - task: VSBuild@1 + displayName: Build solution math3_2022.sln x64dbg avx + inputs: + solution: Tests/math3/math3_2022.sln + vsVersion: 17.0 + platform: x64 + configuration: AVX Debug + msbuildArchitecture: x64 + - task: VSBuild@1 + displayName: Build solution math3_2022.sln x64rel avx + inputs: + solution: Tests/math3/math3_2022.sln + vsVersion: 17.0 + platform: x64 + configuration: AVX Release + msbuildArchitecture: x64 + - task: VSBuild@1 + displayName: Build solution math3_2022.sln x86dbg avx2 + inputs: + solution: Tests/math3/math3_2022.sln + vsVersion: 17.0 + platform: x86 + configuration: AVX2 Debug + msbuildArchitecture: x64 + - task: VSBuild@1 + displayName: Build solution math3_2022.sln x86rel avx2 + inputs: + solution: Tests/math3/math3_2022.sln + vsVersion: 17.0 + platform: x86 + configuration: AVX2 Release + msbuildArchitecture: x64 + - task: VSBuild@1 + displayName: Build solution math3_2022.sln x64dbg avx2 + inputs: + solution: Tests/math3/math3_2022.sln + vsVersion: 17.0 + platform: x64 + configuration: AVX2 Debug + msbuildArchitecture: x64 + - task: VSBuild@1 + displayName: Build solution math3_2022.sln x64rel avx2 + inputs: + solution: Tests/math3/math3_2022.sln + vsVersion: 17.0 + platform: x64 + configuration: AVX2 Release + msbuildArchitecture: x64 + - task: VSBuild@1 + displayName: Build solution math3_2022.sln x86dbg nointrinsics + inputs: + solution: Tests/math3/math3_2022.sln + vsVersion: 17.0 + platform: x86 + configuration: NI Debug + msbuildArchitecture: x64 + - task: VSBuild@1 + displayName: Build solution math3_2022.sln x86rel nointrinsics + inputs: + solution: Tests/math3/math3_2022.sln + vsVersion: 17.0 + platform: x86 + configuration: NI Release + msbuildArchitecture: x64 + - task: VSBuild@1 + displayName: Build solution math3_2022.sln x64dbg nointrinsics + inputs: + solution: Tests/math3/math3_2022.sln + vsVersion: 17.0 + platform: x64 + configuration: NI Debug + msbuildArchitecture: x64 + - task: VSBuild@1 + displayName: Build solution math3_2022.sln x64rel nointrinsics + inputs: + solution: Tests/math3/math3_2022.sln + vsVersion: 17.0 + platform: x64 + configuration: NI Release + msbuildArchitecture: x64 + - task: VSBuild@1 + displayName: Build solution math3_2022.sln arm64dbg nointrinsics + inputs: + solution: Tests/math3/math3_2022.sln + vsVersion: 17.0 + platform: ARM64 + configuration: NI Debug + msbuildArchitecture: x64 + - task: VSBuild@1 + displayName: Build solution math3_2022.sln arm86rel nointrinsics + inputs: + solution: Tests/math3/math3_2022.sln + vsVersion: 17.0 + platform: ARM64 + configuration: NI Release + msbuildArchitecture: x64 + - task: VSBuild@1 + displayName: Build solution math3_2022.sln x86dbg x87 + inputs: + solution: Tests/math3/math3_2022.sln + vsVersion: 17.0 + platform: x86 + 
configuration: x87 Debug + msbuildArchitecture: x64 + - task: VSBuild@1 + displayName: Build solution math3_2022.sln x86rel x87 + inputs: + solution: Tests/math3/math3_2022.sln + vsVersion: 17.0 + platform: x86 + configuration: x87 Release + msbuildArchitecture: x64 diff --git a/src/thirdparty/DirectXMath-dec2022/build/DirectXMath-GitHub-MinGW.yml b/src/thirdparty/DirectXMath-dec2022/build/DirectXMath-GitHub-MinGW.yml new file mode 100644 index 000000000..309e8d76c --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/build/DirectXMath-GitHub-MinGW.yml @@ -0,0 +1,170 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# +# http://go.microsoft.com/fwlink/?LinkID=615560 + +# Builds the library and test suite using the MinGW compiler. + +schedules: +- cron: "0 0 * * *" + displayName: 'Nightly build' + branches: + include: + - main + +resources: + repositories: + - repository: self + type: git + ref: refs/heads/main + trigger: none + +name: $(Year:yyyy).$(Month).$(DayOfMonth)$(Rev:.r) + +pool: + vmImage: windows-2022 + +variables: + GITHUB_PAT: $(GITHUBPUBLICTOKEN) + URL_MINGW32: https://github.com/brechtsanders/winlibs_mingw/releases/download/12.2.0-14.0.6-10.0.0-ucrt-r2/winlibs-i686-posix-dwarf-gcc-12.2.0-llvm-14.0.6-mingw-w64ucrt-10.0.0-r2.zip + HASH_MINGW32: 'fcd1e11b896190da01c83d5b5fb0d37b7c61585e53446c2dab0009debc3915e757213882c35e35396329338de6f0222ba012e23a5af86932db45186a225d1272' + URL_MINGW64: https://github.com/brechtsanders/winlibs_mingw/releases/download/12.2.0-14.0.6-10.0.0-ucrt-r2/winlibs-x86_64-posix-seh-gcc-12.2.0-llvm-14.0.6-mingw-w64ucrt-10.0.0-r2.zip + HASH_MINGW64: '6694e552d73195b57f283645ab78cb0180f4d957b5501a83e6b4f2679dfad13a8e85e1df6f7b061ea4431fbd2bb0c8f2ac3a1dd810489c1a8d1665b226df8092' + +jobs: +- job: MINGW32_BUILD + displayName: 'Minimalist GNU for Windows (MinGW32)' + steps: + - checkout: self + clean: true + fetchTags: false + - task: CmdLine@2 + displayName: Fetch Tests + inputs: + script: git clone --quiet https://%GITHUB_PAT%@github.com/walbourn/directxmathtest.git Tests + workingDirectory: $(Build.SourcesDirectory) + - task: PowerShell@2 + # We install GCC 12.2 as the MS Hosted only offers 11.2 + displayName: Install MinGW32 + inputs: + targetType: inline + script: | + $ProgressPreference = 'SilentlyContinue' + Write-Host "Downloading winlibs..." + Invoke-WebRequest -Uri "$(URL_MINGW32)" -OutFile "gw32.zip" + Write-Host "Downloaded." + $fileHash = Get-FileHash -Algorithm SHA512 gw32.zip | ForEach { $_.Hash} | Out-String + $filehash = $fileHash.Trim() + Write-Host "##[debug]SHA512: " $fileHash + if ($fileHash -ne '$(HASH_MINGW32)') { + Write-Error -Message "##[error]Computed hash does not match!" -ErrorAction Stop + } + Write-Host "Extracting winlibs..." + Expand-Archive -LiteralPath 'gw32.zip' + Write-Host "Extracted." 
+ Write-Host "Added to path: $env:BUILD_SOURCESDIRECTORY\gw32\mingw32\bin" + Write-Host "##vso[task.prependpath]$env:BUILD_SOURCESDIRECTORY\gw32\mingw32\bin" + + workingDirectory: $(Build.SourcesDirectory) + - task: CmdLine@2 + displayName: GCC version + inputs: + script: g++ --version + - task: CMake@1 + displayName: CMake (MinGW32) Dbg + inputs: + cwd: Tests + cmakeArgs: -B out -DCMAKE_BUILD_TYPE="Debug" -DDXMATH_ARCHITECTURE=x86 -DCMAKE_CXX_COMPILER="g++.exe" -G "MinGW Makefiles" + - task: CMake@1 + displayName: CMake (MinGW32) Build Dbg + inputs: + cwd: Tests + cmakeArgs: --build out + - task: CMake@1 + displayName: CMake (MinGW32) Rel + inputs: + cwd: Tests + cmakeArgs: -B out2 -DCMAKE_BUILD_TYPE="RelWithDebInfo" -DDXMATH_ARCHITECTURE=x86 -DCMAKE_CXX_COMPILER="g++.exe" -G "MinGW Makefiles" + - task: CMake@1 + displayName: CMake (MinGW32) Build Rel + inputs: + cwd: Tests + cmakeArgs: --build out2 + - task: CMake@1 + displayName: CMake (MinGW32) Dbg NI + inputs: + cwd: Tests + cmakeArgs: -B out3 -DCMAKE_BUILD_TYPE="Debug" -DBUILD_NO_INTRINSICS=ON -DDXMATH_ARCHITECTURE=x86 -DCMAKE_CXX_COMPILER="g++.exe" -G "MinGW Makefiles" + - task: CMake@1 + displayName: CMake (MinGW32) Build Dbg NI + inputs: + cwd: Tests + cmakeArgs: --build out3 + +- job: MINGW64_BUILD + displayName: 'Minimalist GNU for Windows (MinGW-W64) BUILD_TESTING=ON' + steps: + - checkout: self + clean: true + fetchTags: false + - task: CmdLine@2 + displayName: Fetch Tests + inputs: + script: git clone --quiet https://%GITHUB_PAT%@github.com/walbourn/directxmathtest.git Tests + workingDirectory: $(Build.SourcesDirectory) + - task: PowerShell@2 + displayName: Install MinGW-W64 + inputs: + targetType: inline + script: | + $ProgressPreference = 'SilentlyContinue' + Write-Host "Downloading winlibs..." + Invoke-WebRequest -Uri "$(URL_MINGW64)" -OutFile "gw64.zip" + Write-Host "Downloaded." + $fileHash = Get-FileHash -Algorithm SHA512 gw64.zip | ForEach { $_.Hash} | Out-String + $filehash = $fileHash.Trim() + Write-Host "##[debug]SHA512: " $fileHash + if ($fileHash -ne '$(HASH_MINGW64)') { + Write-Error -Message "##[error]Computed hash does not match!" -ErrorAction Stop + } + Write-Host "Extracting winlibs..." + Expand-Archive -LiteralPath 'gw64.zip' + Write-Host "Extracted." 
+ Write-Host "Added to path: $env:BUILD_SOURCESDIRECTORY\gw64\mingw64\bin" + Write-Host "##vso[task.prependpath]$env:BUILD_SOURCESDIRECTORY\gw64\mingw64\bin" + + workingDirectory: $(Build.SourcesDirectory) + - task: CmdLine@2 + displayName: GCC version + inputs: + script: g++ --version + - task: CMake@1 + displayName: CMake (MinGW-W64) Dbg + inputs: + cwd: Tests + cmakeArgs: -B out -DCMAKE_BUILD_TYPE="Debug" -DDXMATH_ARCHITECTURE=x64 -DCMAKE_CXX_COMPILER="g++.exe" -G "MinGW Makefiles" + - task: CMake@1 + displayName: CMake (MinGW-W64) Build Dbg + inputs: + cwd: Tests + cmakeArgs: --build out + - task: CMake@1 + displayName: CMake (MinGW-W64) Rel + inputs: + cwd: Tests + cmakeArgs: -B out2 -DCMAKE_BUILD_TYPE="RelWithDebInfo" -DDXMATH_ARCHITECTURE=x64 -DCMAKE_CXX_COMPILER="g++.exe" -G "MinGW Makefiles" + - task: CMake@1 + displayName: CMake (MinGW-W64) Build Rel + inputs: + cwd: Tests + cmakeArgs: --build out2 + - task: CMake@1 + displayName: CMake (MinGW-W64) Dbg NI + inputs: + cwd: Tests + cmakeArgs: -B out3 -DCMAKE_BUILD_TYPE="Debug" -DBUILD_NO_INTRINSICS=ON -DDXMATH_ARCHITECTURE=x64 -DCMAKE_CXX_COMPILER="g++.exe" -G "MinGW Makefiles" + - task: CMake@1 + displayName: CMake (MinGW-W64) Build Dbg NI + inputs: + cwd: Tests + cmakeArgs: --build out3 diff --git a/src/thirdparty/DirectXMath-dec2022/build/DirectXMath-GitHub-WSL-11.yml b/src/thirdparty/DirectXMath-dec2022/build/DirectXMath-GitHub-WSL-11.yml new file mode 100644 index 000000000..c8f7b3c6f --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/build/DirectXMath-GitHub-WSL-11.yml @@ -0,0 +1,64 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# +# http://go.microsoft.com/fwlink/?LinkID=615560 + +# Builds the math3 test suite for Windows Subsystem for Linux (WSL) + +schedules: +- cron: "0 3 * * *" + displayName: 'Nightly build' + branches: + include: + - main + +resources: + repositories: + - repository: self + type: git + ref: refs/heads/main + trigger: none + +name: $(Year:yyyy).$(Month).$(DayOfMonth)$(Rev:.r) + +pool: + vmImage: ubuntu-22.04 + +variables: + GITHUB_PAT: $(GITHUBPUBLICTOKEN) + +jobs: +- job: BUILD_WSL + displayName: 'Windows Subsystem for Linux (WSL)' + steps: + - checkout: self + clean: true + fetchTags: false + - task: CmdLine@2 + displayName: Fetch tests + inputs: + script: git clone --quiet https://%GITHUB_PAT%@github.com/walbourn/directxmathtest.git Tests + - task: CMake@1 + displayName: DirectXMath Tests + inputs: + cwd: Tests + cmakeArgs: . + - task: PowerShell@2 + displayName: Fetch SAL.H + inputs: + targetType: inline + script: | + $ProgressPreference = 'SilentlyContinue' + Invoke-WebRequest -Uri https://raw.githubusercontent.com/dotnet/corert/master/src/Native/inc/unix/sal.h -o $(Build.SourcesDirectory)/Inc/sal.h + $fileHash = Get-FileHash -Algorithm SHA512 $(Build.SourcesDirectory)/Inc/sal.h | ForEach { $_.Hash} | Out-String + $filehash = $fileHash.Trim() + Write-Host "##[debug]SHA512: " $filehash + if ($fileHash -ne "1643571673195d9eb892d2f2ac76eac7113ef7aa0ca116d79f3e4d3dc9df8a31600a9668b7e7678dfbe5a76906f9e0734ef8d6db0903ccc68fc742dd8238d8b0") { + Write-Error -Message "##[error]Computed hash does not match!" -ErrorAction Stop + } + + - task: CMake@1 + displayName: DirectXMath Tests Build + inputs: + cwd: Tests + cmakeArgs: --build . 
-v diff --git a/src/thirdparty/DirectXMath-dec2022/build/DirectXMath-GitHub-WSL.yml b/src/thirdparty/DirectXMath-dec2022/build/DirectXMath-GitHub-WSL.yml new file mode 100644 index 000000000..05d6c1117 --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/build/DirectXMath-GitHub-WSL.yml @@ -0,0 +1,64 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# +# http://go.microsoft.com/fwlink/?LinkID=615560 + +# Builds the math3 test suite for Windows Subsystem for Linux (WSL) + +schedules: +- cron: "0 3 * * *" + displayName: 'Nightly build' + branches: + include: + - main + +resources: + repositories: + - repository: self + type: git + ref: refs/heads/main + trigger: none + +name: $(Year:yyyy).$(Month).$(DayOfMonth)$(Rev:.r) + +pool: + vmImage: ubuntu-20.04 + +variables: + GITHUB_PAT: $(GITHUBPUBLICTOKEN) + +jobs: +- job: BUILD_WSL + displayName: 'Windows Subsystem for Linux (WSL)' + steps: + - checkout: self + clean: true + fetchTags: false + - task: CmdLine@2 + displayName: Fetch tests + inputs: + script: git clone --quiet https://%GITHUB_PAT%@github.com/walbourn/directxmathtest.git Tests + - task: CMake@1 + displayName: DirectXMath Tests + inputs: + cwd: Tests + cmakeArgs: . + - task: PowerShell@2 + displayName: Fetch SAL.H + inputs: + targetType: inline + script: | + $ProgressPreference = 'SilentlyContinue' + Invoke-WebRequest -Uri https://raw.githubusercontent.com/dotnet/corert/master/src/Native/inc/unix/sal.h -o $(Build.SourcesDirectory)/Inc/sal.h + $fileHash = Get-FileHash -Algorithm SHA512 $(Build.SourcesDirectory)/Inc/sal.h | ForEach { $_.Hash} | Out-String + $filehash = $fileHash.Trim() + Write-Host "##[debug]SHA512: " $filehash + if ($fileHash -ne "1643571673195d9eb892d2f2ac76eac7113ef7aa0ca116d79f3e4d3dc9df8a31600a9668b7e7678dfbe5a76906f9e0734ef8d6db0903ccc68fc742dd8238d8b0") { + Write-Error -Message "##[error]Computed hash does not match!" -ErrorAction Stop + } + + - task: CMake@1 + displayName: DirectXMath Tests Build + inputs: + cwd: Tests + cmakeArgs: --build . -v diff --git a/src/thirdparty/DirectXMath-dec2022/build/DirectXMath-GitHub.yml b/src/thirdparty/DirectXMath-dec2022/build/DirectXMath-GitHub.yml new file mode 100644 index 000000000..393762bf0 --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/build/DirectXMath-GitHub.yml @@ -0,0 +1,543 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# +# http://go.microsoft.com/fwlink/?LinkID=615560 + +# Builds the math3 test suite for DirectXMath. 
+ +schedules: +- cron: "0 0 * * *" + displayName: 'Nightly build' + branches: + include: + - main + +resources: + repositories: + - repository: self + type: git + ref: refs/heads/main + trigger: + branches: + include: + - main + paths: + exclude: + - README.md + - HISTORY.md + - SECURITY.md + +name: $(Year:yyyy).$(Month).$(DayOfMonth)$(Rev:.r) + +pool: + vmImage: windows-2019 + +variables: + GITHUB_PAT: $(GITHUBPUBLICTOKEN) + +jobs: +- job: BUILD_DEV16 + displayName: 'Visual Studio 2019 (v142)' + cancelTimeoutInMinutes: 1 + steps: + - checkout: self + clean: true + fetchTags: false + - task: DeleteFiles@1 + displayName: Delete files from Tests + inputs: + SourceFolder: Tests + Contents: '**' + RemoveSourceFolder: true + RemoveDotFiles: true + - task: CmdLine@2 + displayName: Fetch Tests + inputs: + script: git clone --quiet https://%GITHUB_PAT%@github.com/walbourn/directxmathtest.git Tests + - task: VSBuild@1 + displayName: Build solution math3_2019.sln x86dbg + inputs: + solution: Tests/math3/math3_2019.sln + vsVersion: 16.0 + platform: x86 + configuration: Debug + - task: VSBuild@1 + displayName: Build solution math3_2019.sln x86rel + inputs: + solution: Tests/math3/math3_2019.sln + vsVersion: 16.0 + platform: x86 + configuration: Release + - task: VSBuild@1 + displayName: Build solution math3_2019.sln x64dbg + inputs: + solution: Tests/math3/math3_2019.sln + vsVersion: 16.0 + platform: x64 + configuration: Debug + - task: VSBuild@1 + displayName: Build solution math3_2019.sln x64rel + inputs: + solution: Tests/math3/math3_2019.sln + vsVersion: 16.0 + platform: x64 + configuration: Release + - task: VSBuild@1 + displayName: Build solution math3_2019.sln arm64dbg + inputs: + solution: Tests/math3/math3_2019.sln + vsVersion: 16.0 + platform: ARM64 + configuration: Debug + - task: VSBuild@1 + displayName: Build solution math3_2019.sln arm64rel + inputs: + solution: Tests/math3/math3_2019.sln + vsVersion: 16.0 + platform: ARM64 + configuration: Release + - task: VSBuild@1 + displayName: Build solution math3_2019.sln x86dbg sse3 + inputs: + solution: Tests/math3/math3_2019.sln + vsVersion: 16.0 + platform: x86 + configuration: SSE3 Debug + - task: VSBuild@1 + displayName: Build solution math3_2019.sln x86rel sse3 + inputs: + solution: Tests/math3/math3_2019.sln + vsVersion: 16.0 + platform: x86 + configuration: SSE3 Release + - task: VSBuild@1 + displayName: Build solution math3_2019.sln x64dbg sse3 + inputs: + solution: Tests/math3/math3_2019.sln + vsVersion: 16.0 + platform: x64 + configuration: SSE3 Debug + - task: VSBuild@1 + displayName: Build solution math3_2019.sln x64rel sse3 + inputs: + solution: Tests/math3/math3_2019.sln + vsVersion: 16.0 + platform: x64 + configuration: SSE3 Release + - task: VSBuild@1 + displayName: Build solution math3_2019.sln x86dbg sse4 + inputs: + solution: Tests/math3/math3_2019.sln + vsVersion: 16.0 + platform: x86 + configuration: SSE4 Debug + - task: VSBuild@1 + displayName: Build solution math3_2019.sln x86rel sse4 + inputs: + solution: Tests/math3/math3_2019.sln + vsVersion: 16.0 + platform: x86 + configuration: SSE4 Release + - task: VSBuild@1 + displayName: Build solution math3_2019.sln x64dbg sse4 + inputs: + solution: Tests/math3/math3_2019.sln + vsVersion: 16.0 + platform: x64 + configuration: SSE4 Debug + - task: VSBuild@1 + displayName: Build solution math3_2019.sln x64rel sse4 + inputs: + solution: Tests/math3/math3_2019.sln + vsVersion: 16.0 + platform: x64 + configuration: SSE4 Release + - task: VSBuild@1 + displayName: Build solution 
math3_2019.sln x86dbg avx + inputs: + solution: Tests/math3/math3_2019.sln + vsVersion: 16.0 + platform: x86 + configuration: AVX Debug + - task: VSBuild@1 + displayName: Build solution math3_2019.sln x86rel avx + inputs: + solution: Tests/math3/math3_2019.sln + vsVersion: 16.0 + platform: x86 + configuration: AVX Release + - task: VSBuild@1 + displayName: Build solution math3_2019.sln x64dbg avx + inputs: + solution: Tests/math3/math3_2019.sln + vsVersion: 16.0 + platform: x64 + configuration: AVX Debug + - task: VSBuild@1 + displayName: Build solution math3_2019.sln x64rel avx + inputs: + solution: Tests/math3/math3_2019.sln + vsVersion: 16.0 + platform: x64 + configuration: AVX Release + - task: VSBuild@1 + displayName: Build solution math3_2019.sln x86dbg avx2 + inputs: + solution: Tests/math3/math3_2019.sln + vsVersion: 16.0 + platform: x86 + configuration: AVX2 Debug + - task: VSBuild@1 + displayName: Build solution math3_2019.sln x86rel avx2 + inputs: + solution: Tests/math3/math3_2019.sln + vsVersion: 16.0 + platform: x86 + configuration: AVX2 Release + - task: VSBuild@1 + displayName: Build solution math3_2019.sln x64dbg avx2 + inputs: + solution: Tests/math3/math3_2019.sln + vsVersion: 16.0 + platform: x64 + configuration: AVX2 Debug + - task: VSBuild@1 + displayName: Build solution math3_2019.sln x64rel avx2 + inputs: + solution: Tests/math3/math3_2019.sln + vsVersion: 16.0 + platform: x64 + configuration: AVX2 Release + - task: VSBuild@1 + displayName: Build solution math3_2019.sln x86dbg nointrinsics + inputs: + solution: Tests/math3/math3_2019.sln + vsVersion: 16.0 + platform: x86 + configuration: NI Debug + - task: VSBuild@1 + displayName: Build solution math3_2019.sln x86rel nointrinsics + inputs: + solution: Tests/math3/math3_2019.sln + vsVersion: 16.0 + platform: x86 + configuration: NI Release + - task: VSBuild@1 + displayName: Build solution math3_2019.sln x64dbg nointrinsics + inputs: + solution: Tests/math3/math3_2019.sln + vsVersion: 16.0 + platform: x64 + configuration: NI Debug + - task: VSBuild@1 + displayName: Build solution math3_2019.sln x64rel nointrinsics + inputs: + solution: Tests/math3/math3_2019.sln + vsVersion: 16.0 + platform: x64 + configuration: NI Release + - task: VSBuild@1 + displayName: Build solution math3_2019.sln arm64dbg nointrinsics + inputs: + solution: Tests/math3/math3_2019.sln + vsVersion: 16.0 + platform: ARM64 + configuration: NI Debug + - task: VSBuild@1 + displayName: Build solution math3_2019.sln arm86rel nointrinsics + inputs: + solution: Tests/math3/math3_2019.sln + vsVersion: 16.0 + platform: ARM64 + configuration: NI Release + - task: VSBuild@1 + displayName: Build solution math3_2019.sln x86dbg x87 + inputs: + solution: Tests/math3/math3_2019.sln + vsVersion: 16.0 + platform: x86 + configuration: x87 Debug + - task: VSBuild@1 + displayName: Build solution math3_2019.sln x86rel x87 + inputs: + solution: Tests/math3/math3_2019.sln + vsVersion: 16.0 + platform: x86 + configuration: x87 Release + - task: VSBuild@1 + displayName: Build solution shmath_2019.sln x64dbg + inputs: + solution: Tests/shmath/shmath_2019.sln + vsVersion: 16.0 + platform: x64 + configuration: Debug + - task: VSBuild@1 + displayName: Build solution shmath_2019.sln x64rel + inputs: + solution: Tests/shmath/shmath_2019.sln + vsVersion: 16.0 + platform: x64 + configuration: Release + - task: VSBuild@1 + displayName: Build solution shmath_2019.sln arm64dbg + inputs: + solution: Tests/shmath/shmath_2019.sln + vsVersion: 16.0 + platform: ARM64 + configuration: Debug 
+ - task: VSBuild@1 + displayName: Build solution shmath_2019.sln arm64rel + inputs: + solution: Tests/shmath/shmath_2019.sln + vsVersion: 16.0 + platform: ARM64 + configuration: Release + - task: VSBuild@1 + displayName: Build solution XDSPTest_2019 x64dbg + inputs: + solution: Tests/xdsp/XDSPTest_2019.sln + vsVersion: 16.0 + platform: x64 + configuration: Debug + - task: VSBuild@1 + displayName: Build solution XDSPTest_2019 x64rel + inputs: + solution: Tests/xdsp/XDSPTest_2019.sln + vsVersion: 16.0 + platform: x64 + configuration: Release + - task: VSBuild@1 + displayName: Build solution XDSPTest_2019 arm64dbg + inputs: + solution: Tests/xdsp/XDSPTest_2019.sln + vsVersion: 16.0 + platform: ARM64 + configuration: Debug + - task: VSBuild@1 + displayName: Build solution XDSPTest_2019 arm64rel + inputs: + solution: Tests/xdsp/XDSPTest_2019.sln + vsVersion: 16.0 + platform: ARM64 + configuration: Release + +- job: BUILD_DEV15 + displayName: 'Visual Studio 2019 (v141)' + steps: + - checkout: self + clean: true + fetchTags: false + - task: CmdLine@2 + displayName: Fetch Tests + inputs: + script: git clone --quiet https://%GITHUB_PAT%@github.com/walbourn/directxmathtest.git Tests + - task: VSBuild@1 + displayName: Build solution math3_2017.sln x86dbg + inputs: + solution: Tests/math3/math3_2017.sln + vsVersion: 16.0 + platform: x86 + configuration: Debug + - task: VSBuild@1 + displayName: Build solution math3_2017.sln x86rel + inputs: + solution: Tests/math3/math3_2017.sln + vsVersion: 16.0 + platform: x86 + configuration: Release + - task: VSBuild@1 + displayName: Build solution math3_2017.sln x64dbg + inputs: + solution: Tests/math3/math3_2017.sln + vsVersion: 16.0 + platform: x64 + configuration: Debug + - task: VSBuild@1 + displayName: Build solution math3_2017.sln x64rel + inputs: + solution: Tests/math3/math3_2017.sln + vsVersion: 16.0 + platform: x64 + configuration: Release + - task: VSBuild@1 + displayName: Build solution math3_2017.sln x86dbg sse3 + inputs: + solution: Tests/math3/math3_2017.sln + vsVersion: 16.0 + platform: x86 + configuration: SSE3 Debug + - task: VSBuild@1 + displayName: Build solution math3_2017.sln x86rel sse3 + inputs: + solution: Tests/math3/math3_2017.sln + vsVersion: 16.0 + platform: x86 + configuration: SSE3 Release + - task: VSBuild@1 + displayName: Build solution math3_2017.sln x64dbg sse3 + inputs: + solution: Tests/math3/math3_2017.sln + vsVersion: 16.0 + platform: x64 + configuration: SSE3 Debug + - task: VSBuild@1 + displayName: Build solution math3_2017.sln x64rel sse3 + inputs: + solution: Tests/math3/math3_2017.sln + vsVersion: 16.0 + platform: x64 + configuration: SSE3 Release + - task: VSBuild@1 + displayName: Build solution math3_2017.sln x86dbg sse4 + inputs: + solution: Tests/math3/math3_2017.sln + vsVersion: 16.0 + platform: x86 + configuration: SSE4 Debug + - task: VSBuild@1 + displayName: Build solution math3_2017.sln x86rel sse4 + inputs: + solution: Tests/math3/math3_2017.sln + vsVersion: 16.0 + platform: x86 + configuration: SSE4 Release + - task: VSBuild@1 + displayName: Build solution math3_2017.sln x64dbg sse4 + inputs: + solution: Tests/math3/math3_2017.sln + vsVersion: 16.0 + platform: x64 + configuration: SSE4 Debug + - task: VSBuild@1 + displayName: Build solution math3_2017.sln x64rel sse4 + inputs: + solution: Tests/math3/math3_2017.sln + vsVersion: 16.0 + platform: x64 + configuration: SSE4 Release + - task: VSBuild@1 + displayName: Build solution math3_2017.sln x86dbg avx + inputs: + solution: Tests/math3/math3_2017.sln + 
vsVersion: 16.0 + platform: x86 + configuration: AVX Debug + - task: VSBuild@1 + displayName: Build solution math3_2017.sln x86rel avx + inputs: + solution: Tests/math3/math3_2017.sln + vsVersion: 16.0 + platform: x86 + configuration: AVX Release + - task: VSBuild@1 + displayName: Build solution math3_2017.sln x64dbg avx + inputs: + solution: Tests/math3/math3_2017.sln + vsVersion: 16.0 + platform: x64 + configuration: AVX Debug + - task: VSBuild@1 + displayName: Build solution math3_2017.sln x64rel avx + inputs: + solution: Tests/math3/math3_2017.sln + vsVersion: 16.0 + platform: x64 + configuration: AVX Release + - task: VSBuild@1 + displayName: Build solution math3_2017.sln x86dbg avx2 + inputs: + solution: Tests/math3/math3_2017.sln + vsVersion: 16.0 + platform: x86 + configuration: AVX2 Debug + - task: VSBuild@1 + displayName: Build solution math3_2017.sln x86rel avx2 + inputs: + solution: Tests/math3/math3_2017.sln + vsVersion: 16.0 + platform: x86 + configuration: AVX2 Release + - task: VSBuild@1 + displayName: Build solution math3_2017.sln x64dbg avx2 + inputs: + solution: Tests/math3/math3_2017.sln + vsVersion: 16.0 + platform: x64 + configuration: AVX2 Debug + - task: VSBuild@1 + displayName: Build solution math3_2017.sln x64rel avx2 + inputs: + solution: Tests/math3/math3_2017.sln + vsVersion: 16.0 + platform: x64 + configuration: AVX2 Release + - task: VSBuild@1 + displayName: Build solution math3_2017.sln x86dbg nointrinsics + inputs: + solution: Tests/math3/math3_2017.sln + vsVersion: 16.0 + platform: x86 + configuration: NI Debug + - task: VSBuild@1 + displayName: Build solution math3_2017.sln x86rel nointrinsics + inputs: + solution: Tests/math3/math3_2017.sln + vsVersion: 16.0 + platform: x86 + configuration: NI Release + - task: VSBuild@1 + displayName: Build solution math3_2017.sln x64dbg nointrinsics + inputs: + solution: Tests/math3/math3_2017.sln + vsVersion: 16.0 + platform: x64 + configuration: NI Debug + - task: VSBuild@1 + displayName: Build solution math3_2017.sln x64rel nointrinsics + inputs: + solution: Tests/math3/math3_2017.sln + vsVersion: 16.0 + platform: x64 + configuration: NI Release + - task: VSBuild@1 + displayName: Build solution math3_2017.sln x86dbg x87 + inputs: + solution: Tests/math3/math3_2017.sln + vsVersion: 16.0 + platform: x86 + configuration: x87 Debug + - task: VSBuild@1 + displayName: Build solution math3_2017.sln x86rel x87 + inputs: + solution: Tests/math3/math3_2017.sln + vsVersion: 16.0 + platform: x86 + configuration: x87 Release + - task: VSBuild@1 + displayName: Build solution shmath_2017.sln x64dbg + inputs: + solution: Tests/shmath/shmath_2017.sln + vsVersion: 16.0 + platform: x64 + configuration: Debug + - task: VSBuild@1 + displayName: Build solution shmath_2017.sln x64rel + inputs: + solution: Tests/shmath/shmath_2017.sln + vsVersion: 16.0 + platform: x64 + configuration: Release + - task: VSBuild@1 + displayName: Build solution XDSPTest_2017 x64dbg + inputs: + solution: Tests/xdsp/XDSPTest_2017.sln + vsVersion: 16.0 + platform: x64 + configuration: Debug + - task: VSBuild@1 + displayName: Build solution XDSPTest_2017 x64rel + inputs: + solution: Tests/xdsp/XDSPTest_2017.sln + vsVersion: 16.0 + platform: x64 + configuration: Release diff --git a/src/thirdparty/DirectXMath-dec2022/build/DirectXMath-config.cmake.in b/src/thirdparty/DirectXMath-dec2022/build/DirectXMath-config.cmake.in new file mode 100644 index 000000000..2a485225c --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/build/DirectXMath-config.cmake.in @@ -0,0 
+1,5 @@ +@PACKAGE_INIT@ + +include(${CMAKE_CURRENT_LIST_DIR}/@PROJECT_NAME@-targets.cmake) + +check_required_components("@PROJECT_NAME@") diff --git a/src/thirdparty/dotnetrt/sal.h b/src/thirdparty/dotnetrt/sal.h new file mode 100644 index 000000000..2e0457140 --- /dev/null +++ b/src/thirdparty/dotnetrt/sal.h @@ -0,0 +1,2953 @@ +// VALVE EDIT: +// taken from https://github.com/dotnet/runtime/blob/main/src/coreclr/pal/inc/rt/sal.h +// used for DirectXMath compatibly on POSIX + +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +/*** +*sal.h - markers for documenting the semantics of APIs +* + +* +*Purpose: +* sal.h provides a set of annotations to describe how a function uses its +* parameters - the assumptions it makes about them, and the guarantees it makes +* upon finishing. +****/ +#pragma once + +/*========================================================================== + + The comments in this file are intended to give basic understanding of + the usage of SAL, the Microsoft Source Code Annotation Language. + For more details, please see http://go.microsoft.com/fwlink/?LinkID=242134 + + The macros are defined in 3 layers, plus the structural set: + + _In_/_Out_/_Ret_ Layer: + ---------------------- + This layer provides the highest abstraction and its macros should be used + in most cases. These macros typically start with: + _In_ : input parameter to a function, unmodified by called function + _Out_ : output parameter, written to by called function, pointed-to + location not expected to be initialized prior to call + _Outptr_ : like _Out_ when returned variable is a pointer type + (so param is pointer-to-pointer type). Called function + provides/allocated space. + _Outref_ : like _Outptr_, except param is reference-to-pointer type. + _Inout_ : inout parameter, read from and potentially modified by + called function. + _Ret_ : for return values + _Field_ : class/struct field invariants + For common usage, this class of SAL provides the most concise annotations. + Note that _In_/_Out_/_Inout_/_Outptr_ annotations are designed to be used + with a parameter target. Using them with _At_ to specify non-parameter + targets may yield unexpected results. + + This layer also includes a number of other properties that can be specified + to extend the ability of code analysis, most notably: + -- Designating parameters as format strings for printf/scanf/scanf_s + -- Requesting stricter type checking for C enum parameters + + _Pre_/_Post_ Layer: + ------------------ + The macros of this layer only should be used when there is no suitable macro + in the _In_/_Out_ layer. Its macros start with _Pre_ or _Post_. + This layer provides the most flexibility for annotations. + + Implementation Abstraction Layer: + -------------------------------- + Macros from this layer should never be used directly. The layer only exists + to hide the implementation of the annotation macros. + + Structural Layer: + ---------------- + These annotations, like _At_ and _When_, are used with annotations from + any of the other layers as modifiers, indicating exactly when and where + the annotations apply. + + + Common syntactic conventions: + ---------------------------- + + Usage: + ----- + _In_, _Out_, _Inout_, _Pre_, _Post_, are for formal parameters. + _Ret_, _Deref_ret_ must be used for return values. + + Nullness: + -------- + If the parameter can be NULL as a precondition to the function, the + annotation contains _opt. 
If the macro does not contain '_opt' the + parameter cannot be NULL. + + If an out/inout parameter returns a null pointer as a postcondition, this is + indicated by _Ret_maybenull_ or _result_maybenull_. If the macro is not + of this form, then the result will not be NULL as a postcondition. + _Outptr_ - output value is not NULL + _Outptr_result_maybenull_ - output value might be NULL + + String Type: + ----------- + _z: NullTerminated string + for _In_ parameters the buffer must have the specified stringtype before the call + for _Out_ parameters the buffer must have the specified stringtype after the call + for _Inout_ parameters both conditions apply + + Extent Syntax: + ------------- + Buffer sizes are expressed as element counts, unless the macro explicitly + contains _byte_ or _bytes_. Some annotations specify two buffer sizes, in + which case the second is used to indicate how much of the buffer is valid + as a postcondition. This table outlines the precondition buffer allocation + size, precondition number of valid elements, postcondition allocation size, + and postcondition number of valid elements for representative buffer size + annotations: + Pre | Pre | Post | Post + alloc | valid | alloc | valid + Annotation elems | elems | elems | elems + ---------- ------------------------------------ + _In_reads_(s) s | s | s | s + _Inout_updates_(s) s | s | s | s + _Inout_updates_to_(s,c) s | s | s | c + _Out_writes_(s) s | 0 | s | s + _Out_writes_to_(s,c) s | 0 | s | c + _Outptr_result_buffer_(s) ? | ? | s | s + _Outptr_result_buffer_to_(s,c) ? | ? | s | c + + For the _Outptr_ annotations, the buffer in question is at one level of + dereference. The called function is responsible for supplying the buffer. + + Success and failure: + ------------------- + The SAL concept of success allows functions to define expressions that can + be tested by the caller, which if it evaluates to non-zero, indicates the + function succeeded, which means that its postconditions are guaranteed to + hold. Otherwise, if the expression evaluates to zero, the function is + considered to have failed, and the postconditions are not guaranteed. + + The success criteria can be specified with the _Success_(expr) annotation: + _Success_(return != FALSE) BOOL + PathCanonicalizeA(_Out_writes_(MAX_PATH) LPSTR pszBuf, LPCSTR pszPath) : + pszBuf is only guaranteed to be NULL-terminated when TRUE is returned, + and FALSE indicates failure. In common practice, callers check for zero + vs. non-zero returns, so it is preferable to express the success + criteria in terms of zero/non-zero, not checked for exactly TRUE. + + Functions can specify that some postconditions will still hold, even when + the function fails, using _On_failure_(anno-list), or postconditions that + hold regardless of success or failure using _Always_(anno-list). + + The annotation _Return_type_success_(expr) may be used with a typedef to + give a default _Success_ criteria to all functions returning that type. + This is the case for common Windows API status types, including + HRESULT and NTSTATUS. This may be overridden on a per-function basis by + specifying a _Success_ annotation locally. 
+ +============================================================================*/ + +#define __ATTR_SAL + +#ifndef _SAL_VERSION /*IFSTRIP=IGN*/ +#define _SAL_VERSION 20 +#endif + +#ifdef _PREFAST_ // [ + +// choose attribute or __declspec implementation +#ifndef _USE_DECLSPECS_FOR_SAL // [ +#define _USE_DECLSPECS_FOR_SAL 1 +#endif // ] + +#if _USE_DECLSPECS_FOR_SAL // [ +#undef _USE_ATTRIBUTES_FOR_SAL +#define _USE_ATTRIBUTES_FOR_SAL 0 +#elif !defined(_USE_ATTRIBUTES_FOR_SAL) // ][ +#if _MSC_VER >= 1400 /*IFSTRIP=IGN*/ // [ +#define _USE_ATTRIBUTES_FOR_SAL 1 +#else // ][ +#define _USE_ATTRIBUTES_FOR_SAL 0 +#endif // ] +#endif // ] + + +#if !_USE_DECLSPECS_FOR_SAL // [ +#if !_USE_ATTRIBUTES_FOR_SAL // [ +#if _MSC_VER >= 1400 /*IFSTRIP=IGN*/ // [ +#undef _USE_ATTRIBUTES_FOR_SAL +#define _USE_ATTRIBUTES_FOR_SAL 1 +#else // ][ +#undef _USE_DECLSPECS_FOR_SAL +#define _USE_DECLSPECS_FOR_SAL 1 +#endif // ] +#endif // ] +#endif // ] + +#else + +// Disable expansion of SAL macros in non-Prefast mode to +// improve compiler throughput. +#ifndef _USE_DECLSPECS_FOR_SAL // [ +#define _USE_DECLSPECS_FOR_SAL 0 +#endif // ] +#ifndef _USE_ATTRIBUTES_FOR_SAL // [ +#define _USE_ATTRIBUTES_FOR_SAL 0 +#endif // ] + +#endif // ] + +// safeguard for MIDL and RC builds +#if _USE_DECLSPECS_FOR_SAL && ( defined( MIDL_PASS ) || defined(__midl) || defined(RC_INVOKED) || !defined(_PREFAST_) ) /*IFSTRIP=IGN*/ // [ +#undef _USE_DECLSPECS_FOR_SAL +#define _USE_DECLSPECS_FOR_SAL 0 +#endif // ] +#if _USE_ATTRIBUTES_FOR_SAL && ( !defined(_MSC_EXTENSIONS) || defined( MIDL_PASS ) || defined(__midl) || defined(RC_INVOKED) ) /*IFSTRIP=IGN*/ // [ +#undef _USE_ATTRIBUTES_FOR_SAL +#define _USE_ATTRIBUTES_FOR_SAL 0 +#endif // ] + +#if _USE_DECLSPECS_FOR_SAL || _USE_ATTRIBUTES_FOR_SAL + +// Special enum type for Y/N/M +enum __SAL_YesNo {_SAL_notpresent, _SAL_no, _SAL_maybe, _SAL_yes, _SAL_default}; + +#endif + +#if defined(BUILD_WINDOWS) && !_USE_ATTRIBUTES_FOR_SAL /*IFSTRIP=IGN*/ +#define _SAL1_Source_(Name, args, annotes) _SA_annotes3(SAL_name, #Name, "", "1") _GrouP_(annotes _SAL_nop_impl_) +#define _SAL1_1_Source_(Name, args, annotes) _SA_annotes3(SAL_name, #Name, "", "1.1") _GrouP_(annotes _SAL_nop_impl_) +#define _SAL1_2_Source_(Name, args, annotes) _SA_annotes3(SAL_name, #Name, "", "1.2") _GrouP_(annotes _SAL_nop_impl_) +#define _SAL2_Source_(Name, args, annotes) _SA_annotes3(SAL_name, #Name, "", "2") _GrouP_(annotes _SAL_nop_impl_) +#else +#define _SAL1_Source_(Name, args, annotes) _SA_annotes3(SAL_name, #Name, "", "1") _Group_(annotes _SAL_nop_impl_) +#define _SAL1_1_Source_(Name, args, annotes) _SA_annotes3(SAL_name, #Name, "", "1.1") _Group_(annotes _SAL_nop_impl_) +#define _SAL1_2_Source_(Name, args, annotes) _SA_annotes3(SAL_name, #Name, "", "1.2") _Group_(annotes _SAL_nop_impl_) +#define _SAL2_Source_(Name, args, annotes) _SA_annotes3(SAL_name, #Name, "", "2") _Group_(annotes _SAL_nop_impl_) +#endif + +//============================================================================ +// Structural SAL: +// These annotations modify the use of other annotations. They may +// express the annotation target (i.e. what parameter/field the annotation +// applies to) or the condition under which the annotation is applicable. +//============================================================================ + +// _At_(target, annos) specifies that the annotations listed in 'annos' is to +// be applied to 'target' rather than to the identifier which is the current +// lexical target. 
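+// EDITOR'S NOTE (illustrative sketch only, not part of the upstream sal.h):
+// _At_ retargets an annotation onto an expression. Assuming a hypothetical
+// helper ResetCount(), the postcondition below applies to *pCount rather than
+// to the pointer parameter itself:
+//   void ResetCount(_At_(*pCount, _Post_equal_to_(0)) int* pCount);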
+#define _At_(target, annos) _At_impl_(target, annos _SAL_nop_impl_) + +// _At_buffer_(target, iter, bound, annos) is similar to _At_, except that +// target names a buffer, and each annotation in annos is applied to each +// element of target up to bound, with the variable named in iter usable +// by the annotations to refer to relevant offsets within target. +#define _At_buffer_(target, iter, bound, annos) _At_buffer_impl_(target, iter, bound, annos _SAL_nop_impl_) + +// _When_(expr, annos) specifies that the annotations listed in 'annos' only +// apply when 'expr' evaluates to non-zero. +#define _When_(expr, annos) _When_impl_(expr, annos _SAL_nop_impl_) +#define _Group_(annos) _Group_impl_(annos _SAL_nop_impl_) +#define _GrouP_(annos) _GrouP_impl_(annos _SAL_nop_impl_) + +// indicates whether normal post conditions apply to a function +#define _Success_(expr) _SAL2_Source_(_Success_, (expr), _Success_impl_(expr)) + +// indicates whether post conditions apply to a function returning +// the type that this annotation is applied to +#define _Return_type_success_(expr) _SAL2_Source_(_Return_type_success_, (expr), _Success_impl_(expr)) + +// Establish postconditions that apply only if the function does not succeed +#define _On_failure_(annos) _On_failure_impl_(annos _SAL_nop_impl_) + +// Establish postconditions that apply in both success and failure cases. +// Only applicable with functions that have _Success_ or _Return_type_succss_. +#define _Always_(annos) _Always_impl_(annos _SAL_nop_impl_) + +// Usable on a function definition. Asserts that a function declaration is +// in scope, and its annotations are to be used. There are no other annotations +// allowed on the function definition. +#define _Use_decl_annotations_ _Use_decl_anno_impl_ + +// _Notref_ may precede a _Deref_ or "real" annotation, and removes one +// level of dereference if the parameter is a C++ reference (&). If the +// net deref on a "real" annotation is negative, it is simply discarded. +#define _Notref_ _Notref_impl_ + +// Annotations for defensive programming styles. +#define _Pre_defensive_ _SA_annotes0(SAL_pre_defensive) +#define _Post_defensive_ _SA_annotes0(SAL_post_defensive) + +#define _In_defensive_(annotes) _Pre_defensive_ _Group_(annotes) +#define _Out_defensive_(annotes) _Post_defensive_ _Group_(annotes) +#define _Inout_defensive_(annotes) _Pre_defensive_ _Post_defensive_ _Group_(annotes) + +//============================================================================ +// _In_\_Out_ Layer: +//============================================================================ + +// Reserved pointer parameters, must always be NULL. +#define _Reserved_ _SAL2_Source_(_Reserved_, (), _Pre1_impl_(__null_impl)) + +// _Const_ allows specification that any namable memory location is considered +// readonly for a given call. +#define _Const_ _SAL2_Source_(_Const_, (), _Pre1_impl_(__readaccess_impl_notref)) + + +// Input parameters -------------------------- + +// _In_ - Annotations for parameters where data is passed into the function, but not modified. +// _In_ by itself can be used with non-pointer types (although it is redundant). + +// e.g. void SetPoint( _In_ const POINT* pPT ); +#define _In_ _SAL2_Source_(_In_, (), _Pre1_impl_(__notnull_impl_notref) _Pre_valid_impl_ _Deref_pre1_impl_(__readaccess_impl_notref)) +#define _In_opt_ _SAL2_Source_(_In_opt_, (), _Pre1_impl_(__maybenull_impl_notref) _Pre_valid_impl_ _Deref_pre_readonly_) + +// nullterminated 'in' parameters. +// e.g. 
void CopyStr( _In_z_ const char* szFrom, _Out_z_cap_(cchTo) char* szTo, size_t cchTo ); +#define _In_z_ _SAL2_Source_(_In_z_, (), _In_ _Pre1_impl_(__zterm_impl)) +#define _In_opt_z_ _SAL2_Source_(_In_opt_z_, (), _In_opt_ _Pre1_impl_(__zterm_impl)) + + +// 'input' buffers with given size + +#define _In_reads_(size) _SAL2_Source_(_In_reads_, (size), _Pre_count_(size) _Deref_pre_readonly_) +#define _In_reads_opt_(size) _SAL2_Source_(_In_reads_opt_, (size), _Pre_opt_count_(size) _Deref_pre_readonly_) +#define _In_reads_bytes_(size) _SAL2_Source_(_In_reads_bytes_, (size), _Pre_bytecount_(size) _Deref_pre_readonly_) +#define _In_reads_bytes_opt_(size) _SAL2_Source_(_In_reads_bytes_opt_, (size), _Pre_opt_bytecount_(size) _Deref_pre_readonly_) +#define _In_reads_z_(size) _SAL2_Source_(_In_reads_z_, (size), _In_reads_(size) _Pre_z_) +#define _In_reads_opt_z_(size) _SAL2_Source_(_In_reads_opt_z_, (size), _Pre_opt_count_(size) _Deref_pre_readonly_ _Pre_opt_z_) +#define _In_reads_or_z_(size) _SAL2_Source_(_In_reads_or_z_, (size), _In_ _When_(_String_length_(_Curr_) < (size), _Pre_z_) _When_(_String_length_(_Curr_) >= (size), _Pre1_impl_(__count_impl(size)))) +#define _In_reads_or_z_opt_(size) _SAL2_Source_(_In_reads_or_z_opt_, (size), _In_opt_ _When_(_String_length_(_Curr_) < (size), _Pre_z_) _When_(_String_length_(_Curr_) >= (size), _Pre1_impl_(__count_impl(size)))) + + +// 'input' buffers valid to the given end pointer + +#define _In_reads_to_ptr_(ptr) _SAL2_Source_(_In_reads_to_ptr_, (ptr), _Pre_ptrdiff_count_(ptr) _Deref_pre_readonly_) +#define _In_reads_to_ptr_opt_(ptr) _SAL2_Source_(_In_reads_to_ptr_opt_, (ptr), _Pre_opt_ptrdiff_count_(ptr) _Deref_pre_readonly_) +#define _In_reads_to_ptr_z_(ptr) _SAL2_Source_(_In_reads_to_ptr_z_, (ptr), _In_reads_to_ptr_(ptr) _Pre_z_) +#define _In_reads_to_ptr_opt_z_(ptr) _SAL2_Source_(_In_reads_to_ptr_opt_z_, (ptr), _Pre_opt_ptrdiff_count_(ptr) _Deref_pre_readonly_ _Pre_opt_z_) + + + +// Output parameters -------------------------- + +// _Out_ - Annotations for pointer or reference parameters where data passed back to the caller. +// These are mostly used where the pointer/reference is to a non-pointer type. +// _Outptr_/_Outref) (see below) are typically used to return pointers via parameters. + +// e.g. 
void GetPoint( _Out_ POINT* pPT ); +#define _Out_ _SAL2_Source_(_Out_, (), _Out_impl_) +#define _Out_opt_ _SAL2_Source_(_Out_opt_, (), _Out_opt_impl_) + +#define _Out_writes_(size) _SAL2_Source_(_Out_writes_, (size), _Pre_cap_(size) _Post_valid_impl_) +#define _Out_writes_opt_(size) _SAL2_Source_(_Out_writes_opt_, (size), _Pre_opt_cap_(size) _Post_valid_impl_) +#define _Out_writes_bytes_(size) _SAL2_Source_(_Out_writes_bytes_, (size), _Pre_bytecap_(size) _Post_valid_impl_) +#define _Out_writes_bytes_opt_(size) _SAL2_Source_(_Out_writes_bytes_opt_, (size), _Pre_opt_bytecap_(size) _Post_valid_impl_) +#define _Out_writes_z_(size) _SAL2_Source_(_Out_writes_z_, (size), _Pre_cap_(size) _Post_valid_impl_ _Post_z_) +#define _Out_writes_opt_z_(size) _SAL2_Source_(_Out_writes_opt_z_, (size), _Pre_opt_cap_(size) _Post_valid_impl_ _Post_z_) + +#define _Out_writes_to_(size,count) _SAL2_Source_(_Out_writes_to_, (size,count), _Pre_cap_(size) _Post_valid_impl_ _Post_count_(count)) +#define _Out_writes_to_opt_(size,count) _SAL2_Source_(_Out_writes_to_opt_, (size,count), _Pre_opt_cap_(size) _Post_valid_impl_ _Post_count_(count)) +#define _Out_writes_all_(size) _SAL2_Source_(_Out_writes_all_, (size), _Out_writes_to_(_Old_(size), _Old_(size))) +#define _Out_writes_all_opt_(size) _SAL2_Source_(_Out_writes_all_opt_, (size), _Out_writes_to_opt_(_Old_(size), _Old_(size))) + +#define _Out_writes_bytes_to_(size,count) _SAL2_Source_(_Out_writes_bytes_to_, (size,count), _Pre_bytecap_(size) _Post_valid_impl_ _Post_bytecount_(count)) +#define _Out_writes_bytes_to_opt_(size,count) _SAL2_Source_(_Out_writes_bytes_to_opt_, (size,count), _Pre_opt_bytecap_(size) _Post_valid_impl_ _Post_bytecount_(count)) +#define _Out_writes_bytes_all_(size) _SAL2_Source_(_Out_writes_bytes_all_, (size), _Out_writes_bytes_to_(_Old_(size), _Old_(size))) +#define _Out_writes_bytes_all_opt_(size) _SAL2_Source_(_Out_writes_bytes_all_opt_, (size), _Out_writes_bytes_to_opt_(_Old_(size), _Old_(size))) + +#define _Out_writes_to_ptr_(ptr) _SAL2_Source_(_Out_writes_to_ptr_, (ptr), _Pre_ptrdiff_cap_(ptr) _Post_valid_impl_) +#define _Out_writes_to_ptr_opt_(ptr) _SAL2_Source_(_Out_writes_to_ptr_opt_, (ptr), _Pre_opt_ptrdiff_cap_(ptr) _Post_valid_impl_) +#define _Out_writes_to_ptr_z_(ptr) _SAL2_Source_(_Out_writes_to_ptr_z_, (ptr), _Pre_ptrdiff_cap_(ptr) _Post_valid_impl_ Post_z_) +#define _Out_writes_to_ptr_opt_z_(ptr) _SAL2_Source_(_Out_writes_to_ptr_opt_z_, (ptr), _Pre_opt_ptrdiff_cap_(ptr) _Post_valid_impl_ Post_z_) + + +// Inout parameters ---------------------------- + +// _Inout_ - Annotations for pointer or reference parameters where data is passed in and +// potentially modified. 
+// void ModifyPoint( _Inout_ POINT* pPT ); +// void ModifyPointByRef( _Inout_ POINT& pPT ); + +#define _Inout_ _SAL2_Source_(_Inout_, (), _Prepost_valid_) +#define _Inout_opt_ _SAL2_Source_(_Inout_opt_, (), _Prepost_opt_valid_) + +// For modifying string buffers +// void toupper( _Inout_z_ char* sz ); +#define _Inout_z_ _SAL2_Source_(_Inout_z_, (), _Prepost_z_) +#define _Inout_opt_z_ _SAL2_Source_(_Inout_opt_z_, (), _Prepost_opt_z_) + +// For modifying buffers with explicit element size +#define _Inout_updates_(size) _SAL2_Source_(_Inout_updates_, (size), _Pre_cap_(size) _Pre_valid_impl_ _Post_valid_impl_) +#define _Inout_updates_opt_(size) _SAL2_Source_(_Inout_updates_opt_, (size), _Pre_opt_cap_(size) _Pre_valid_impl_ _Post_valid_impl_) +#define _Inout_updates_z_(size) _SAL2_Source_(_Inout_updates_z_, (size), _Pre_cap_(size) _Pre_valid_impl_ _Post_valid_impl_ _Pre1_impl_(__zterm_impl) _Post1_impl_(__zterm_impl)) +#define _Inout_updates_opt_z_(size) _SAL2_Source_(_Inout_updates_opt_z_, (size), _Pre_opt_cap_(size) _Pre_valid_impl_ _Post_valid_impl_ _Pre1_impl_(__zterm_impl) _Post1_impl_(__zterm_impl)) + +#define _Inout_updates_to_(size,count) _SAL2_Source_(_Inout_updates_to_, (size,count), _Out_writes_to_(size,count) _Pre_valid_impl_ _Pre1_impl_(__count_impl(count))) +#define _Inout_updates_to_opt_(size,count) _SAL2_Source_(_Inout_updates_to_opt_, (size,count), _Out_writes_to_opt_(size,count) _Pre_valid_impl_ _Pre1_impl_(__count_impl(count))) + +#define _Inout_updates_all_(size) _SAL2_Source_(_Inout_updates_all_, (size), _Inout_updates_to_(_Old_(size), _Old_(size))) +#define _Inout_updates_all_opt_(size) _SAL2_Source_(_Inout_updates_all_opt_, (size), _Inout_updates_to_opt_(_Old_(size), _Old_(size))) + +// For modifying buffers with explicit byte size +#define _Inout_updates_bytes_(size) _SAL2_Source_(_Inout_updates_bytes_, (size), _Pre_bytecap_(size) _Pre_valid_impl_ _Post_valid_impl_) +#define _Inout_updates_bytes_opt_(size) _SAL2_Source_(_Inout_updates_bytes_opt_, (size), _Pre_opt_bytecap_(size) _Pre_valid_impl_ _Post_valid_impl_) + +#define _Inout_updates_bytes_to_(size,count) _SAL2_Source_(_Inout_updates_bytes_to_, (size,count), _Out_writes_bytes_to_(size,count) _Pre_valid_impl_ _Pre1_impl_(__bytecount_impl(count))) +#define _Inout_updates_bytes_to_opt_(size,count) _SAL2_Source_(_Inout_updates_bytes_to_opt_, (size,count), _Out_writes_bytes_to_opt_(size,count) _Pre_valid_impl_ _Pre1_impl_(__bytecount_impl(count))) + +#define _Inout_updates_bytes_all_(size) _SAL2_Source_(_Inout_updates_bytes_all_, (size), _Inout_updates_bytes_to_(_Old_(size), _Old_(size))) +#define _Inout_updates_bytes_all_opt_(size) _SAL2_Source_(_Inout_updates_bytes_all_opt_, (size), _Inout_updates_bytes_to_opt_(_Old_(size), _Old_(size))) + + +// Pointer to pointer parameters ------------------------- + +// _Outptr_ - Annotations for output params returning pointers +// These describe parameters where the called function provides the buffer: +// HRESULT SHStrDupW(_In_ LPCWSTR psz, _Outptr_ LPWSTR *ppwsz); +// The caller passes the address of an LPWSTR variable as ppwsz, and SHStrDupW allocates +// and initializes memory and returns the pointer to the new LPWSTR in *ppwsz. +// +// _Outptr_opt_ - describes parameters that are allowed to be NULL. +// _Outptr_*_result_maybenull_ - describes parameters where the called function might return NULL to the caller. 
+// +// Example: +// void MyFunc(_Outptr_opt_ int **ppData1, _Outptr_result_maybenull_ int **ppData2); +// Callers: +// MyFunc(NULL, NULL); // error: parameter 2, ppData2, should not be NULL +// MyFunc(&pData1, &pData2); // ok: both non-NULL +// if (*pData1 == *pData2) ... // error: pData2 might be NULL after call + +#define _Outptr_ _SAL2_Source_(_Outptr_, (), _Out_impl_ _Deref_post2_impl_(__notnull_impl_notref, __count_impl(1))) +#define _Outptr_result_maybenull_ _SAL2_Source_(_Outptr_result_maybenull_, (), _Out_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __count_impl(1))) +#define _Outptr_opt_ _SAL2_Source_(_Outptr_opt_, (), _Out_opt_impl_ _Deref_post2_impl_(__notnull_impl_notref, __count_impl(1))) +#define _Outptr_opt_result_maybenull_ _SAL2_Source_(_Outptr_opt_result_maybenull_, (), _Out_opt_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __count_impl(1))) + +// Annotations for _Outptr_ parameters returning pointers to null terminated strings. + +#define _Outptr_result_z_ _SAL2_Source_(_Outptr_result_z_, (), _Out_impl_ _Deref_post_z_) +#define _Outptr_opt_result_z_ _SAL2_Source_(_Outptr_opt_result_z_, (), _Out_opt_impl_ _Deref_post_z_) +#define _Outptr_result_maybenull_z_ _SAL2_Source_(_Outptr_result_maybenull_z_, (), _Out_impl_ _Deref_post_opt_z_) +#define _Outptr_opt_result_maybenull_z_ _SAL2_Source_(_Outptr_opt_result_maybenull_z_, (), _Out_opt_impl_ _Deref_post_opt_z_) + +// Annotations for _Outptr_ parameters where the output pointer is set to NULL if the function fails. + +#define _Outptr_result_nullonfailure_ _SAL2_Source_(_Outptr_result_nullonfailure_, (), _Outptr_ _On_failure_(_Deref_post_null_)) +#define _Outptr_opt_result_nullonfailure_ _SAL2_Source_(_Outptr_opt_result_nullonfailure_, (), _Outptr_opt_ _On_failure_(_Deref_post_null_)) + +// Annotations for _Outptr_ parameters which return a pointer to a ref-counted COM object, +// following the COM convention of setting the output to NULL on failure. +// The current implementation is identical to _Outptr_result_nullonfailure_. +// For pointers to types that are not COM objects, _Outptr_result_nullonfailure_ is preferred. 
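+// EDITOR'S NOTE (illustrative sketch only, not part of the upstream sal.h):
+// a hypothetical COM-style factory annotated with _COM_Outptr_. On success
+// *ppWidget receives a non-NULL interface pointer; on failure it is set to
+// NULL, matching the COM convention described above. IWidget is a placeholder type.
+//   HRESULT CreateWidget(_COM_Outptr_ IWidget** ppWidget);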
+ +#define _COM_Outptr_ _SAL2_Source_(_COM_Outptr_, (), _Outptr_ _On_failure_(_Deref_post_null_)) +#define _COM_Outptr_result_maybenull_ _SAL2_Source_(_COM_Outptr_result_maybenull_, (), _Outptr_result_maybenull_ _On_failure_(_Deref_post_null_)) +#define _COM_Outptr_opt_ _SAL2_Source_(_COM_Outptr_opt_, (), _Outptr_opt_ _On_failure_(_Deref_post_null_)) +#define _COM_Outptr_opt_result_maybenull_ _SAL2_Source_(_COM_Outptr_opt_result_maybenull_, (), _Outptr_opt_result_maybenull_ _On_failure_(_Deref_post_null_)) + +// Annotations for _Outptr_ parameters returning a pointer to buffer with a specified number of elements/bytes + +#define _Outptr_result_buffer_(size) _SAL2_Source_(_Outptr_result_buffer_, (size), _Out_impl_ _Deref_post2_impl_(__notnull_impl_notref, __cap_impl(size))) +#define _Outptr_opt_result_buffer_(size) _SAL2_Source_(_Outptr_opt_result_buffer_, (size), _Out_opt_impl_ _Deref_post2_impl_(__notnull_impl_notref, __cap_impl(size))) +#define _Outptr_result_buffer_to_(size, count) _SAL2_Source_(_Outptr_result_buffer_to_, (size, count), _Out_impl_ _Deref_post3_impl_(__notnull_impl_notref, __cap_impl(size), __count_impl(count))) +#define _Outptr_opt_result_buffer_to_(size, count) _SAL2_Source_(_Outptr_opt_result_buffer_to_, (size, count), _Out_opt_impl_ _Deref_post3_impl_(__notnull_impl_notref, __cap_impl(size), __count_impl(count))) + +#define _Outptr_result_buffer_all_(size) _SAL2_Source_(_Outptr_result_buffer_all_, (size), _Out_impl_ _Deref_post2_impl_(__notnull_impl_notref, __count_impl(size))) +#define _Outptr_opt_result_buffer_all_(size) _SAL2_Source_(_Outptr_opt_result_buffer_all_, (size), _Out_opt_impl_ _Deref_post2_impl_(__notnull_impl_notref, __count_impl(size))) + +#define _Outptr_result_buffer_maybenull_(size) _SAL2_Source_(_Outptr_result_buffer_maybenull_, (size), _Out_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __cap_impl(size))) +#define _Outptr_opt_result_buffer_maybenull_(size) _SAL2_Source_(_Outptr_opt_result_buffer_maybenull_, (size), _Out_opt_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __cap_impl(size))) +#define _Outptr_result_buffer_to_maybenull_(size, count) _SAL2_Source_(_Outptr_result_buffer_to_maybenull_, (size, count), _Out_impl_ _Deref_post3_impl_(__maybenull_impl_notref, __cap_impl(size), __count_impl(count))) +#define _Outptr_opt_result_buffer_to_maybenull_(size, count) _SAL2_Source_(_Outptr_opt_result_buffer_to_maybenull_, (size, count), _Out_opt_impl_ _Deref_post3_impl_(__maybenull_impl_notref, __cap_impl(size), __count_impl(count))) + +#define _Outptr_result_buffer_all_maybenull_(size) _SAL2_Source_(_Outptr_result_buffer_all_maybenull_, (size), _Out_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __count_impl(size))) +#define _Outptr_opt_result_buffer_all_maybenull_(size) _SAL2_Source_(_Outptr_opt_result_buffer_all_maybenull_, (size), _Out_opt_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __count_impl(size))) + +#define _Outptr_result_bytebuffer_(size) _SAL2_Source_(_Outptr_result_bytebuffer_, (size), _Out_impl_ _Deref_post2_impl_(__notnull_impl_notref, __bytecap_impl(size))) +#define _Outptr_opt_result_bytebuffer_(size) _SAL2_Source_(_Outptr_opt_result_bytebuffer_, (size), _Out_opt_impl_ _Deref_post2_impl_(__notnull_impl_notref, __bytecap_impl(size))) +#define _Outptr_result_bytebuffer_to_(size, count) _SAL2_Source_(_Outptr_result_bytebuffer_to_, (size, count), _Out_impl_ _Deref_post3_impl_(__notnull_impl_notref, __bytecap_impl(size), __bytecount_impl(count))) +#define _Outptr_opt_result_bytebuffer_to_(size, count) 
_SAL2_Source_(_Outptr_opt_result_bytebuffer_to_, (size, count), _Out_opt_impl_ _Deref_post3_impl_(__notnull_impl_notref, __bytecap_impl(size), __bytecount_impl(count))) + +#define _Outptr_result_bytebuffer_all_(size) _SAL2_Source_(_Outptr_result_bytebuffer_all_, (size), _Out_impl_ _Deref_post2_impl_(__notnull_impl_notref, __bytecount_impl(size))) +#define _Outptr_opt_result_bytebuffer_all_(size) _SAL2_Source_(_Outptr_opt_result_bytebuffer_all_, (size), _Out_opt_impl_ _Deref_post2_impl_(__notnull_impl_notref, __bytecount_impl(size))) + +#define _Outptr_result_bytebuffer_maybenull_(size) _SAL2_Source_(_Outptr_result_bytebuffer_maybenull_, (size), _Out_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __bytecap_impl(size))) +#define _Outptr_opt_result_bytebuffer_maybenull_(size) _SAL2_Source_(_Outptr_opt_result_bytebuffer_maybenull_, (size), _Out_opt_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __bytecap_impl(size))) +#define _Outptr_result_bytebuffer_to_maybenull_(size, count) _SAL2_Source_(_Outptr_result_bytebuffer_to_maybenull_, (size, count), _Out_impl_ _Deref_post3_impl_(__maybenull_impl_notref, __bytecap_impl(size), __bytecount_impl(count))) +#define _Outptr_opt_result_bytebuffer_to_maybenull_(size, count) _SAL2_Source_(_Outptr_opt_result_bytebuffer_to_maybenull_, (size, count), _Out_opt_impl_ _Deref_post3_impl_(__maybenull_impl_notref, __bytecap_impl(size), __bytecount_impl(count))) + +#define _Outptr_result_bytebuffer_all_maybenull_(size) _SAL2_Source_(_Outptr_result_bytebuffer_all_maybenull_, (size), _Out_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __bytecount_impl(size))) +#define _Outptr_opt_result_bytebuffer_all_maybenull_(size) _SAL2_Source_(_Outptr_opt_result_bytebuffer_all_maybenull_, (size), _Out_opt_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __bytecount_impl(size))) + +// Annotations for output reference to pointer parameters. 
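+// EDITOR'S NOTE (illustrative sketch only, not part of the upstream sal.h):
+// a hypothetical accessor that hands back a writable buffer through a C++
+// reference-to-pointer parameter, annotated with _Outref_result_buffer_ so the
+// analyzer knows pBuffer points to cb writable elements after the call:
+//   void GetScratch(_Outref_result_buffer_(cb) unsigned char*& pBuffer, size_t cb);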
+ +#define _Outref_ _SAL2_Source_(_Outref_, (), _Out_impl_ _Post_notnull_) +#define _Outref_result_maybenull_ _SAL2_Source_(_Outref_result_maybenull_, (), _Pre2_impl_(__notnull_impl_notref, __cap_c_one_notref_impl) _Post_maybenull_ _Post_valid_impl_) + +#define _Outref_result_buffer_(size) _SAL2_Source_(_Outref_result_buffer_, (size), _Outref_ _Post1_impl_(__cap_impl(size))) +#define _Outref_result_bytebuffer_(size) _SAL2_Source_(_Outref_result_bytebuffer_, (size), _Outref_ _Post1_impl_(__bytecap_impl(size))) +#define _Outref_result_buffer_to_(size, count) _SAL2_Source_(_Outref_result_buffer_to_, (size, count), _Outref_result_buffer_(size) _Post1_impl_(__count_impl(count))) +#define _Outref_result_bytebuffer_to_(size, count) _SAL2_Source_(_Outref_result_bytebuffer_to_, (size, count), _Outref_result_bytebuffer_(size) _Post1_impl_(__bytecount_impl(count))) +#define _Outref_result_buffer_all_(size) _SAL2_Source_(_Outref_result_buffer_all_, (size), _Outref_result_buffer_to_(size, _Old_(size))) +#define _Outref_result_bytebuffer_all_(size) _SAL2_Source_(_Outref_result_bytebuffer_all_, (size), _Outref_result_bytebuffer_to_(size, _Old_(size))) + +#define _Outref_result_buffer_maybenull_(size) _SAL2_Source_(_Outref_result_buffer_maybenull_, (size), _Outref_result_maybenull_ _Post1_impl_(__cap_impl(size))) +#define _Outref_result_bytebuffer_maybenull_(size) _SAL2_Source_(_Outref_result_bytebuffer_maybenull_, (size), _Outref_result_maybenull_ _Post1_impl_(__bytecap_impl(size))) +#define _Outref_result_buffer_to_maybenull_(size, count) _SAL2_Source_(_Outref_result_buffer_to_maybenull_, (size, count), _Outref_result_buffer_maybenull_(size) _Post1_impl_(__count_impl(count))) +#define _Outref_result_bytebuffer_to_maybenull_(size, count) _SAL2_Source_(_Outref_result_bytebuffer_to_maybenull_, (size, count), _Outref_result_bytebuffer_maybenull_(size) _Post1_impl_(__bytecount_impl(count))) +#define _Outref_result_buffer_all_maybenull_(size) _SAL2_Source_(_Outref_result_buffer_all_maybenull_, (size), _Outref_result_buffer_to_maybenull_(size, _Old_(size))) +#define _Outref_result_bytebuffer_all_maybenull_(size) _SAL2_Source_(_Outref_result_bytebuffer_all_maybenull_, (size), _Outref_result_bytebuffer_to_maybenull_(size, _Old_(size))) + +// Annotations for output reference to pointer parameters that guarantee +// that the pointer is set to NULL on failure. +#define _Outref_result_nullonfailure_ _SAL2_Source_(_Outref_result_nullonfailure_, (), _Outref_ _On_failure_(_Post_null_)) + +// Generic annotations to set output value of a by-pointer or by-reference parameter to null/zero on failure. +#define _Result_nullonfailure_ _SAL2_Source_(_Result_nullonfailure_, (), _On_failure_(_Notref_impl_ _Deref_impl_ _Post_null_)) +#define _Result_zeroonfailure_ _SAL2_Source_(_Result_zeroonfailure_, (), _On_failure_(_Notref_impl_ _Deref_impl_ _Out_range_(==, 0))) + + +// return values ------------------------------- + +// +// _Ret_ annotations +// +// describing conditions that hold for return values after the call + +// e.g. 
_Ret_z_ CString::operator const WCHAR*() const throw(); +#define _Ret_z_ _SAL2_Source_(_Ret_z_, (), _Ret2_impl_(__notnull_impl, __zterm_impl) _Ret_valid_impl_) +#define _Ret_maybenull_z_ _SAL2_Source_(_Ret_maybenull_z_, (), _Ret2_impl_(__maybenull_impl,__zterm_impl) _Ret_valid_impl_) + +// used with allocated but not yet initialized objects +#define _Ret_notnull_ _SAL2_Source_(_Ret_notnull_, (), _Ret1_impl_(__notnull_impl)) +#define _Ret_maybenull_ _SAL2_Source_(_Ret_maybenull_, (), _Ret1_impl_(__maybenull_impl)) +#define _Ret_null_ _SAL2_Source_(_Ret_null_, (), _Ret1_impl_(__null_impl)) + +// used with allocated and initialized objects +// returns single valid object +#define _Ret_valid_ _SAL2_Source_(_Ret_valid_, (), _Ret1_impl_(__notnull_impl_notref) _Ret_valid_impl_) + +// returns pointer to initialized buffer of specified size +#define _Ret_writes_(size) _SAL2_Source_(_Ret_writes_, (size), _Ret2_impl_(__notnull_impl, __count_impl(size)) _Ret_valid_impl_) +#define _Ret_writes_z_(size) _SAL2_Source_(_Ret_writes_z_, (size), _Ret3_impl_(__notnull_impl, __count_impl(size), __zterm_impl) _Ret_valid_impl_) +#define _Ret_writes_bytes_(size) _SAL2_Source_(_Ret_writes_bytes_, (size), _Ret2_impl_(__notnull_impl, __bytecount_impl(size)) _Ret_valid_impl_) +#define _Ret_writes_maybenull_(size) _SAL2_Source_(_Ret_writes_maybenull_, (size), _Ret2_impl_(__maybenull_impl,__count_impl(size)) _Ret_valid_impl_) +#define _Ret_writes_maybenull_z_(size) _SAL2_Source_(_Ret_writes_maybenull_z_, (size), _Ret3_impl_(__maybenull_impl,__count_impl(size),__zterm_impl) _Ret_valid_impl_) +#define _Ret_writes_bytes_maybenull_(size) _SAL2_Source_(_Ret_writes_bytes_maybenull_, (size), _Ret2_impl_(__maybenull_impl,__bytecount_impl(size)) _Ret_valid_impl_) + +// returns pointer to partially initialized buffer, with total size 'size' and initialized size 'count' +#define _Ret_writes_to_(size,count) _SAL2_Source_(_Ret_writes_to_, (size,count), _Ret3_impl_(__notnull_impl, __cap_impl(size), __count_impl(count)) _Ret_valid_impl_) +#define _Ret_writes_bytes_to_(size,count) _SAL2_Source_(_Ret_writes_bytes_to_, (size,count), _Ret3_impl_(__notnull_impl, __bytecap_impl(size), __bytecount_impl(count)) _Ret_valid_impl_) +#define _Ret_writes_to_maybenull_(size,count) _SAL2_Source_(_Ret_writes_to_maybenull_, (size,count), _Ret3_impl_(__maybenull_impl, __cap_impl(size), __count_impl(count)) _Ret_valid_impl_) +#define _Ret_writes_bytes_to_maybenull_(size,count) _SAL2_Source_(_Ret_writes_bytes_to_maybenull_, (size,count), _Ret3_impl_(__maybenull_impl, __bytecap_impl(size), __bytecount_impl(count)) _Ret_valid_impl_) + + +// Annotations for strict type checking +#define _Points_to_data_ _SAL2_Source_(_Points_to_data_, (), _Pre_ _Points_to_data_impl_) +#define _Literal_ _SAL2_Source_(_Literal_, (), _Pre_ _Literal_impl_) +#define _Notliteral_ _SAL2_Source_(_Notliteral_, (), _Pre_ _Notliteral_impl_) + +// Check the return value of a function e.g. _Check_return_ ErrorCode Foo(); +#define _Check_return_ _SAL2_Source_(_Check_return_, (), _Check_return_impl_) +#define _Must_inspect_result_ _SAL2_Source_(_Must_inspect_result_, (), _Must_inspect_impl_ _Check_return_impl_) + +// e.g. MyPrintF( _Printf_format_string_ const WCHAR* wzFormat, ... 
); +#define _Printf_format_string_ _SAL2_Source_(_Printf_format_string_, (), _Printf_format_string_impl_) +#define _Scanf_format_string_ _SAL2_Source_(_Scanf_format_string_, (), _Scanf_format_string_impl_) +#define _Scanf_s_format_string_ _SAL2_Source_(_Scanf_s_format_string_, (), _Scanf_s_format_string_impl_) + +#define _Format_string_impl_(kind,where) _SA_annotes2(SAL_IsFormatString2, kind, where) +#define _Printf_format_string_params_(x) _SAL2_Source_(_Printf_format_string_params_, (x), _Format_string_impl_("printf", x)) +#define _Scanf_format_string_params_(x) _SAL2_Source_(_Scanf_format_string_params_, (x), _Format_string_impl_("scanf", x)) +#define _Scanf_s_format_string_params_(x) _SAL2_Source_(_Scanf_s_format_string_params_, (x), _Format_string_impl_("scanf_s", x)) + +// annotations to express value of integral or pointer parameter +#define _In_range_(lb,ub) _SAL2_Source_(_In_range_, (lb,ub), _In_range_impl_(lb,ub)) +#define _Out_range_(lb,ub) _SAL2_Source_(_Out_range_, (lb,ub), _Out_range_impl_(lb,ub)) +#define _Ret_range_(lb,ub) _SAL2_Source_(_Ret_range_, (lb,ub), _Ret_range_impl_(lb,ub)) +#define _Deref_in_range_(lb,ub) _SAL2_Source_(_Deref_in_range_, (lb,ub), _Deref_in_range_impl_(lb,ub)) +#define _Deref_out_range_(lb,ub) _SAL2_Source_(_Deref_out_range_, (lb,ub), _Deref_out_range_impl_(lb,ub)) +#define _Deref_ret_range_(lb,ub) _SAL2_Source_(_Deref_ret_range_, (lb,ub), _Deref_ret_range_impl_(lb,ub)) +#define _Pre_equal_to_(expr) _SAL2_Source_(_Pre_equal_to_, (expr), _In_range_(==, expr)) +#define _Post_equal_to_(expr) _SAL2_Source_(_Post_equal_to_, (expr), _Out_range_(==, expr)) + +// annotation to express that a value (usually a field of a mutable class) +// is not changed by a function call +#define _Unchanged_(e) _SAL2_Source_(_Unchanged_, (e), _At_(e, _Post_equal_to_(_Old_(e)) _Const_)) + +// Annotations to allow expressing generalized pre and post conditions. +// 'cond' may be any valid SAL expression that is considered to be true as a precondition +// or postcondition (respsectively). 
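+// EDITOR'S NOTE (illustrative sketch only, not part of the upstream sal.h):
+// a hypothetical writer whose byte count must be a multiple of four, expressed
+// as a generalized precondition the analyzer may assume holds at the call site:
+//   void WriteWords(_Pre_satisfies_(cbBuffer % 4 == 0) size_t cbBuffer);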
+#define _Pre_satisfies_(cond) _SAL2_Source_(_Pre_satisfies_, (cond), _Pre_satisfies_impl_(cond)) +#define _Post_satisfies_(cond) _SAL2_Source_(_Post_satisfies_, (cond), _Post_satisfies_impl_(cond)) + +// Annotations to express struct, class and field invariants +#define _Struct_size_bytes_(size) _SAL2_Source_(_Struct_size_bytes_, (size), _Writable_bytes_(size)) + +#define _Field_size_(size) _SAL2_Source_(_Field_size_, (size), _Notnull_ _Writable_elements_(size)) +#define _Field_size_opt_(size) _SAL2_Source_(_Field_size_opt_, (size), _Maybenull_ _Writable_elements_(size)) +#define _Field_size_part_(size, count) _SAL2_Source_(_Field_size_part_, (size, count), _Notnull_ _Writable_elements_(size) _Readable_elements_(count)) +#define _Field_size_part_opt_(size, count) _SAL2_Source_(_Field_size_part_opt_, (size, count), _Maybenull_ _Writable_elements_(size) _Readable_elements_(count)) +#define _Field_size_full_(size) _SAL2_Source_(_Field_size_full_, (size), _Field_size_part_(size, size)) +#define _Field_size_full_opt_(size) _SAL2_Source_(_Field_size_full_opt_, (size), _Field_size_part_opt_(size, size)) + +#define _Field_size_bytes_(size) _SAL2_Source_(_Field_size_bytes_, (size), _Notnull_ _Writable_bytes_(size)) +#define _Field_size_bytes_opt_(size) _SAL2_Source_(_Field_size_bytes_opt_, (size), _Maybenull_ _Writable_bytes_(size)) +#define _Field_size_bytes_part_(size, count) _SAL2_Source_(_Field_size_bytes_part_, (size, count), _Notnull_ _Writable_bytes_(size) _Readable_bytes_(count)) +#define _Field_size_bytes_part_opt_(size, count) _SAL2_Source_(_Field_size_bytes_part_opt_, (size, count), _Maybenull_ _Writable_bytes_(size) _Readable_bytes_(count)) +#define _Field_size_bytes_full_(size) _SAL2_Source_(_Field_size_bytes_full_, (size), _Field_size_bytes_part_(size, size)) +#define _Field_size_bytes_full_opt_(size) _SAL2_Source_(_Field_size_bytes_full_opt_, (size), _Field_size_bytes_part_opt_(size, size)) + +#define _Field_z_ _SAL2_Source_(_Field_z_, (), _Null_terminated_) + +#define _Field_range_(min,max) _SAL2_Source_(_Field_range_, (min,max), _Field_range_impl_(min,max)) + +//============================================================================ +// _Pre_\_Post_ Layer: +//============================================================================ + +// +// Raw Pre/Post for declaring custom pre/post conditions +// + +#define _Pre_ _Pre_impl_ +#define _Post_ _Post_impl_ + +// +// Validity property +// + +#define _Valid_ _Valid_impl_ +#define _Notvalid_ _Notvalid_impl_ +#define _Maybevalid_ _Maybevalid_impl_ + +// +// Buffer size properties +// + +// Expressing buffer sizes without specifying pre or post condition +#define _Readable_bytes_(size) _SAL2_Source_(_Readable_bytes_, (size), _Readable_bytes_impl_(size)) +#define _Readable_elements_(size) _SAL2_Source_(_Readable_elements_, (size), _Readable_elements_impl_(size)) +#define _Writable_bytes_(size) _SAL2_Source_(_Writable_bytes_, (size), _Writable_bytes_impl_(size)) +#define _Writable_elements_(size) _SAL2_Source_(_Writable_elements_, (size), _Writable_elements_impl_(size)) + +#define _Null_terminated_ _SAL2_Source_(_Null_terminated_, (), _Null_terminated_impl_) +#define _NullNull_terminated_ _SAL2_Source_(_NullNull_terminated_, (), _NullNull_terminated_impl_) + +// Expressing buffer size as pre or post condition +#define _Pre_readable_size_(size) _SAL2_Source_(_Pre_readable_size_, (size), _Pre1_impl_(__count_impl(size)) _Pre_valid_impl_) +#define _Pre_writable_size_(size) _SAL2_Source_(_Pre_writable_size_, (size), 
_Pre1_impl_(__cap_impl(size))) +#define _Pre_readable_byte_size_(size) _SAL2_Source_(_Pre_readable_byte_size_, (size), _Pre1_impl_(__bytecount_impl(size)) _Pre_valid_impl_) +#define _Pre_writable_byte_size_(size) _SAL2_Source_(_Pre_writable_byte_size_, (size), _Pre1_impl_(__bytecap_impl(size))) + +#define _Post_readable_size_(size) _SAL2_Source_(_Post_readable_size_, (size), _Post1_impl_(__count_impl(size)) _Post_valid_impl_) +#define _Post_writable_size_(size) _SAL2_Source_(_Post_writable_size_, (size), _Post1_impl_(__cap_impl(size))) +#define _Post_readable_byte_size_(size) _SAL2_Source_(_Post_readable_byte_size_, (size), _Post1_impl_(__bytecount_impl(size)) _Post_valid_impl_) +#define _Post_writable_byte_size_(size) _SAL2_Source_(_Post_writable_byte_size_, (size), _Post1_impl_(__bytecap_impl(size))) + +// +// Pointer null-ness properties +// +#define _Null_ _Null_impl_ +#define _Notnull_ _Notnull_impl_ +#define _Maybenull_ _Maybenull_impl_ + +// +// _Pre_ annotations --- +// +// describing conditions that must be met before the call of the function + +// e.g. int strlen( _Pre_z_ const char* sz ); +// buffer is a zero terminated string +#define _Pre_z_ _SAL2_Source_(_Pre_z_, (), _Pre1_impl_(__zterm_impl) _Pre_valid_impl_) + +// valid size unknown or indicated by type (e.g.:LPSTR) +#define _Pre_valid_ _SAL2_Source_(_Pre_valid_, (), _Pre1_impl_(__notnull_impl_notref) _Pre_valid_impl_) +#define _Pre_opt_valid_ _SAL2_Source_(_Pre_opt_valid_, (), _Pre1_impl_(__maybenull_impl_notref) _Pre_valid_impl_) + +#define _Pre_invalid_ _SAL2_Source_(_Pre_invalid_, (), _Deref_pre1_impl_(__notvalid_impl)) + +// Overrides recursive valid when some field is not yet initialized when using _Inout_ +#define _Pre_unknown_ _SAL2_Source_(_Pre_unknown_, (), _Pre1_impl_(__maybevalid_impl)) + +// used with allocated but not yet initialized objects +#define _Pre_notnull_ _SAL2_Source_(_Pre_notnull_, (), _Pre1_impl_(__notnull_impl_notref)) +#define _Pre_maybenull_ _SAL2_Source_(_Pre_maybenull_, (), _Pre1_impl_(__maybenull_impl_notref)) +#define _Pre_null_ _SAL2_Source_(_Pre_null_, (), _Pre1_impl_(__null_impl_notref)) + +// +// _Post_ annotations --- +// +// describing conditions that hold after the function call + +// void CopyStr( _In_z_ const char* szFrom, _Pre_cap_(cch) _Post_z_ char* szFrom, size_t cchFrom ); +// buffer will be a zero-terminated string after the call +#define _Post_z_ _SAL2_Source_(_Post_z_, (), _Post1_impl_(__zterm_impl) _Post_valid_impl_) + +// e.g. HRESULT InitStruct( _Post_valid_ Struct* pobj ); +#define _Post_valid_ _SAL2_Source_(_Post_valid_, (), _Post_valid_impl_) +#define _Post_invalid_ _SAL2_Source_(_Post_invalid_, (), _Deref_post1_impl_(__notvalid_impl)) + +// e.g. void free( _Post_ptr_invalid_ void* pv ); +#define _Post_ptr_invalid_ _SAL2_Source_(_Post_ptr_invalid_, (), _Post1_impl_(__notvalid_impl)) + +// e.g. void ThrowExceptionIfNull( _Post_notnull_ const void* pv ); +#define _Post_notnull_ _SAL2_Source_(_Post_notnull_, (), _Post1_impl_(__notnull_impl)) + +// e.g. 
HRESULT GetObject(_Outptr_ _On_failure_(_At_(*p, _Post_null_)) T **p); +#define _Post_null_ _SAL2_Source_(_Post_null_, (), _Post1_impl_(__null_impl)) + +#define _Post_maybenull_ _SAL2_Source_(_Post_maybenull_, (), _Post1_impl_(__maybenull_impl)) + +#define _Prepost_z_ _SAL2_Source_(_Prepost_z_, (), _Pre_z_ _Post_z_) + + +// #pragma region Input Buffer SAL 1 compatibility macros + +/*========================================================================== + + This section contains definitions for macros defined for VS2010 and earlier. + Usage of these macros is still supported, but the SAL 2 macros defined above + are recommended instead. This comment block is retained to assist in + understanding SAL that still uses the older syntax. + + The macros are defined in 3 layers: + + _In_\_Out_ Layer: + ---------------- + This layer provides the highest abstraction and its macros should be used + in most cases. Its macros start with _In_, _Out_ or _Inout_. For the + typical case they provide the most concise annotations. + + _Pre_\_Post_ Layer: + ------------------ + The macros of this layer only should be used when there is no suitable macro + in the _In_\_Out_ layer. Its macros start with _Pre_, _Post_, _Ret_, + _Deref_pre_ _Deref_post_ and _Deref_ret_. This layer provides the most + flexibility for annotations. + + Implementation Abstraction Layer: + -------------------------------- + Macros from this layer should never be used directly. The layer only exists + to hide the implementation of the annotation macros. + + + Annotation Syntax: + |--------------|----------|----------------|-----------------------------| + | Usage | Nullness | ZeroTerminated | Extent | + |--------------|----------|----------------|-----------------------------| + | _In_ | <> | <> | <> | + | _Out_ | opt_ | z_ | [byte]cap_[c_|x_]( size ) | + | _Inout_ | | | [byte]count_[c_|x_]( size ) | + | _Deref_out_ | | | ptrdiff_cap_( ptr ) | + |--------------| | | ptrdiff_count_( ptr ) | + | _Ret_ | | | | + | _Deref_ret_ | | | | + |--------------| | | | + | _Pre_ | | | | + | _Post_ | | | | + | _Deref_pre_ | | | | + | _Deref_post_ | | | | + |--------------|----------|----------------|-----------------------------| + + Usage: + ----- + _In_, _Out_, _Inout_, _Pre_, _Post_, _Deref_pre_, _Deref_post_ are for + formal parameters. + _Ret_, _Deref_ret_ must be used for return values. + + Nullness: + -------- + If the pointer can be NULL the annotation contains _opt. If the macro + does not contain '_opt' the pointer may not be NULL. + + String Type: + ----------- + _z: NullTerminated string + for _In_ parameters the buffer must have the specified stringtype before the call + for _Out_ parameters the buffer must have the specified stringtype after the call + for _Inout_ parameters both conditions apply + + Extent Syntax: + |------|---------------|---------------| + | Unit | Writ\Readable | Argument Type | + |------|---------------|---------------| + | <> | cap_ | <> | + | byte | count_ | c_ | + | | | x_ | + |------|---------------|---------------| + + 'cap' (capacity) describes the writable size of the buffer and is typically used + with _Out_. The default unit is elements. Use 'bytecap' if the size is given in bytes + 'count' describes the readable size of the buffer and is typically used with _In_. + The default unit is elements. Use 'bytecount' if the size is given in bytes. + + Argument syntax for cap_, bytecap_, count_, bytecount_: + (|return)[+n] e.g. 
cch, return, cb+2 + + If the buffer size is a constant expression use the c_ postfix. + E.g. cap_c_(20), count_c_(MAX_PATH), bytecount_c_(16) + + If the buffer size is given by a limiting pointer use the ptrdiff_ versions + of the macros. + + If the buffer size is neither a parameter nor a constant expression use the x_ + postfix. e.g. bytecount_x_(num*size) x_ annotations accept any arbitrary string. + No analysis can be done for x_ annotations but they at least tell the tool that + the buffer has some sort of extent description. x_ annotations might be supported + by future compiler versions. + +============================================================================*/ + +// e.g. void SetCharRange( _In_count_(cch) const char* rgch, size_t cch ) +// valid buffer extent described by another parameter +#define _In_count_(size) _SAL1_1_Source_(_In_count_, (size), _Pre_count_(size) _Deref_pre_readonly_) +#define _In_opt_count_(size) _SAL1_1_Source_(_In_opt_count_, (size), _Pre_opt_count_(size) _Deref_pre_readonly_) +#define _In_bytecount_(size) _SAL1_1_Source_(_In_bytecount_, (size), _Pre_bytecount_(size) _Deref_pre_readonly_) +#define _In_opt_bytecount_(size) _SAL1_1_Source_(_In_opt_bytecount_, (size), _Pre_opt_bytecount_(size) _Deref_pre_readonly_) + +// valid buffer extent described by a constant extression +#define _In_count_c_(size) _SAL1_1_Source_(_In_count_c_, (size), _Pre_count_c_(size) _Deref_pre_readonly_) +#define _In_opt_count_c_(size) _SAL1_1_Source_(_In_opt_count_c_, (size), _Pre_opt_count_c_(size) _Deref_pre_readonly_) +#define _In_bytecount_c_(size) _SAL1_1_Source_(_In_bytecount_c_, (size), _Pre_bytecount_c_(size) _Deref_pre_readonly_) +#define _In_opt_bytecount_c_(size) _SAL1_1_Source_(_In_opt_bytecount_c_, (size), _Pre_opt_bytecount_c_(size) _Deref_pre_readonly_) + +// nullterminated 'input' buffers with given size + +// e.g. void SetCharRange( _In_count_(cch) const char* rgch, size_t cch ) +// nullterminated valid buffer extent described by another parameter +#define _In_z_count_(size) _SAL1_1_Source_(_In_z_count_, (size), _Pre_z_ _Pre_count_(size) _Deref_pre_readonly_) +#define _In_opt_z_count_(size) _SAL1_1_Source_(_In_opt_z_count_, (size), _Pre_opt_z_ _Pre_opt_count_(size) _Deref_pre_readonly_) +#define _In_z_bytecount_(size) _SAL1_1_Source_(_In_z_bytecount_, (size), _Pre_z_ _Pre_bytecount_(size) _Deref_pre_readonly_) +#define _In_opt_z_bytecount_(size) _SAL1_1_Source_(_In_opt_z_bytecount_, (size), _Pre_opt_z_ _Pre_opt_bytecount_(size) _Deref_pre_readonly_) + +// nullterminated valid buffer extent described by a constant extression +#define _In_z_count_c_(size) _SAL1_1_Source_(_In_z_count_c_, (size), _Pre_z_ _Pre_count_c_(size) _Deref_pre_readonly_) +#define _In_opt_z_count_c_(size) _SAL1_1_Source_(_In_opt_z_count_c_, (size), _Pre_opt_z_ _Pre_opt_count_c_(size) _Deref_pre_readonly_) +#define _In_z_bytecount_c_(size) _SAL1_1_Source_(_In_z_bytecount_c_, (size), _Pre_z_ _Pre_bytecount_c_(size) _Deref_pre_readonly_) +#define _In_opt_z_bytecount_c_(size) _SAL1_1_Source_(_In_opt_z_bytecount_c_, (size), _Pre_opt_z_ _Pre_opt_bytecount_c_(size) _Deref_pre_readonly_) + +// buffer capacity is described by another pointer +// e.g. 
void Foo( _In_ptrdiff_count_(pchMax) const char* pch, const char* pchMax ) { while pch < pchMax ) pch++; } +#define _In_ptrdiff_count_(size) _SAL1_1_Source_(_In_ptrdiff_count_, (size), _Pre_ptrdiff_count_(size) _Deref_pre_readonly_) +#define _In_opt_ptrdiff_count_(size) _SAL1_1_Source_(_In_opt_ptrdiff_count_, (size), _Pre_opt_ptrdiff_count_(size) _Deref_pre_readonly_) + +// 'x' version for complex expressions that are not supported by the current compiler version +// e.g. void Set3ColMatrix( _In_count_x_(3*cRows) const Elem* matrix, int cRows ); +#define _In_count_x_(size) _SAL1_1_Source_(_In_count_x_, (size), _Pre_count_x_(size) _Deref_pre_readonly_) +#define _In_opt_count_x_(size) _SAL1_1_Source_(_In_opt_count_x_, (size), _Pre_opt_count_x_(size) _Deref_pre_readonly_) +#define _In_bytecount_x_(size) _SAL1_1_Source_(_In_bytecount_x_, (size), _Pre_bytecount_x_(size) _Deref_pre_readonly_) +#define _In_opt_bytecount_x_(size) _SAL1_1_Source_(_In_opt_bytecount_x_, (size), _Pre_opt_bytecount_x_(size) _Deref_pre_readonly_) + + +// 'out' with buffer size +// e.g. void GetIndices( _Out_cap_(cIndices) int* rgIndices, size_t cIndices ); +// buffer capacity is described by another parameter +#define _Out_cap_(size) _SAL1_1_Source_(_Out_cap_, (size), _Pre_cap_(size) _Post_valid_impl_) +#define _Out_opt_cap_(size) _SAL1_1_Source_(_Out_opt_cap_, (size), _Pre_opt_cap_(size) _Post_valid_impl_) +#define _Out_bytecap_(size) _SAL1_1_Source_(_Out_bytecap_, (size), _Pre_bytecap_(size) _Post_valid_impl_) +#define _Out_opt_bytecap_(size) _SAL1_1_Source_(_Out_opt_bytecap_, (size), _Pre_opt_bytecap_(size) _Post_valid_impl_) + +// buffer capacity is described by a constant expression +#define _Out_cap_c_(size) _SAL1_1_Source_(_Out_cap_c_, (size), _Pre_cap_c_(size) _Post_valid_impl_) +#define _Out_opt_cap_c_(size) _SAL1_1_Source_(_Out_opt_cap_c_, (size), _Pre_opt_cap_c_(size) _Post_valid_impl_) +#define _Out_bytecap_c_(size) _SAL1_1_Source_(_Out_bytecap_c_, (size), _Pre_bytecap_c_(size) _Post_valid_impl_) +#define _Out_opt_bytecap_c_(size) _SAL1_1_Source_(_Out_opt_bytecap_c_, (size), _Pre_opt_bytecap_c_(size) _Post_valid_impl_) + +// buffer capacity is described by another parameter multiplied by a constant expression +#define _Out_cap_m_(mult,size) _SAL1_1_Source_(_Out_cap_m_, (mult,size), _Pre_cap_m_(mult,size) _Post_valid_impl_) +#define _Out_opt_cap_m_(mult,size) _SAL1_1_Source_(_Out_opt_cap_m_, (mult,size), _Pre_opt_cap_m_(mult,size) _Post_valid_impl_) +#define _Out_z_cap_m_(mult,size) _SAL1_1_Source_(_Out_z_cap_m_, (mult,size), _Pre_cap_m_(mult,size) _Post_valid_impl_ _Post_z_) +#define _Out_opt_z_cap_m_(mult,size) _SAL1_1_Source_(_Out_opt_z_cap_m_, (mult,size), _Pre_opt_cap_m_(mult,size) _Post_valid_impl_ _Post_z_) + +// buffer capacity is described by another pointer +// e.g. 
void Foo( _Out_ptrdiff_cap_(pchMax) char* pch, const char* pchMax ) { while pch < pchMax ) pch++; } +#define _Out_ptrdiff_cap_(size) _SAL1_1_Source_(_Out_ptrdiff_cap_, (size), _Pre_ptrdiff_cap_(size) _Post_valid_impl_) +#define _Out_opt_ptrdiff_cap_(size) _SAL1_1_Source_(_Out_opt_ptrdiff_cap_, (size), _Pre_opt_ptrdiff_cap_(size) _Post_valid_impl_) + +// buffer capacity is described by a complex expression +#define _Out_cap_x_(size) _SAL1_1_Source_(_Out_cap_x_, (size), _Pre_cap_x_(size) _Post_valid_impl_) +#define _Out_opt_cap_x_(size) _SAL1_1_Source_(_Out_opt_cap_x_, (size), _Pre_opt_cap_x_(size) _Post_valid_impl_) +#define _Out_bytecap_x_(size) _SAL1_1_Source_(_Out_bytecap_x_, (size), _Pre_bytecap_x_(size) _Post_valid_impl_) +#define _Out_opt_bytecap_x_(size) _SAL1_1_Source_(_Out_opt_bytecap_x_, (size), _Pre_opt_bytecap_x_(size) _Post_valid_impl_) + +// a zero terminated string is filled into a buffer of given capacity +// e.g. void CopyStr( _In_z_ const char* szFrom, _Out_z_cap_(cchTo) char* szTo, size_t cchTo ); +// buffer capacity is described by another parameter +#define _Out_z_cap_(size) _SAL1_1_Source_(_Out_z_cap_, (size), _Pre_cap_(size) _Post_valid_impl_ _Post_z_) +#define _Out_opt_z_cap_(size) _SAL1_1_Source_(_Out_opt_z_cap_, (size), _Pre_opt_cap_(size) _Post_valid_impl_ _Post_z_) +#define _Out_z_bytecap_(size) _SAL1_1_Source_(_Out_z_bytecap_, (size), _Pre_bytecap_(size) _Post_valid_impl_ _Post_z_) +#define _Out_opt_z_bytecap_(size) _SAL1_1_Source_(_Out_opt_z_bytecap_, (size), _Pre_opt_bytecap_(size) _Post_valid_impl_ _Post_z_) + +// buffer capacity is described by a constant expression +#define _Out_z_cap_c_(size) _SAL1_1_Source_(_Out_z_cap_c_, (size), _Pre_cap_c_(size) _Post_valid_impl_ _Post_z_) +#define _Out_opt_z_cap_c_(size) _SAL1_1_Source_(_Out_opt_z_cap_c_, (size), _Pre_opt_cap_c_(size) _Post_valid_impl_ _Post_z_) +#define _Out_z_bytecap_c_(size) _SAL1_1_Source_(_Out_z_bytecap_c_, (size), _Pre_bytecap_c_(size) _Post_valid_impl_ _Post_z_) +#define _Out_opt_z_bytecap_c_(size) _SAL1_1_Source_(_Out_opt_z_bytecap_c_, (size), _Pre_opt_bytecap_c_(size) _Post_valid_impl_ _Post_z_) + +// buffer capacity is described by a complex expression +#define _Out_z_cap_x_(size) _SAL1_1_Source_(_Out_z_cap_x_, (size), _Pre_cap_x_(size) _Post_valid_impl_ _Post_z_) +#define _Out_opt_z_cap_x_(size) _SAL1_1_Source_(_Out_opt_z_cap_x_, (size), _Pre_opt_cap_x_(size) _Post_valid_impl_ _Post_z_) +#define _Out_z_bytecap_x_(size) _SAL1_1_Source_(_Out_z_bytecap_x_, (size), _Pre_bytecap_x_(size) _Post_valid_impl_ _Post_z_) +#define _Out_opt_z_bytecap_x_(size) _SAL1_1_Source_(_Out_opt_z_bytecap_x_, (size), _Pre_opt_bytecap_x_(size) _Post_valid_impl_ _Post_z_) + +// a zero terminated string is filled into a buffer of given capacity +// e.g. 
size_t CopyCharRange( _In_count_(cchFrom) const char* rgchFrom, size_t cchFrom, _Out_cap_post_count_(cchTo,return)) char* rgchTo, size_t cchTo ); +#define _Out_cap_post_count_(cap,count) _SAL1_1_Source_(_Out_cap_post_count_, (cap,count), _Pre_cap_(cap) _Post_valid_impl_ _Post_count_(count)) +#define _Out_opt_cap_post_count_(cap,count) _SAL1_1_Source_(_Out_opt_cap_post_count_, (cap,count), _Pre_opt_cap_(cap) _Post_valid_impl_ _Post_count_(count)) +#define _Out_bytecap_post_bytecount_(cap,count) _SAL1_1_Source_(_Out_bytecap_post_bytecount_, (cap,count), _Pre_bytecap_(cap) _Post_valid_impl_ _Post_bytecount_(count)) +#define _Out_opt_bytecap_post_bytecount_(cap,count) _SAL1_1_Source_(_Out_opt_bytecap_post_bytecount_, (cap,count), _Pre_opt_bytecap_(cap) _Post_valid_impl_ _Post_bytecount_(count)) + +// a zero terminated string is filled into a buffer of given capacity +// e.g. size_t CopyStr( _In_z_ const char* szFrom, _Out_z_cap_post_count_(cchTo,return+1) char* szTo, size_t cchTo ); +#define _Out_z_cap_post_count_(cap,count) _SAL1_1_Source_(_Out_z_cap_post_count_, (cap,count), _Pre_cap_(cap) _Post_valid_impl_ _Post_z_count_(count)) +#define _Out_opt_z_cap_post_count_(cap,count) _SAL1_1_Source_(_Out_opt_z_cap_post_count_, (cap,count), _Pre_opt_cap_(cap) _Post_valid_impl_ _Post_z_count_(count)) +#define _Out_z_bytecap_post_bytecount_(cap,count) _SAL1_1_Source_(_Out_z_bytecap_post_bytecount_, (cap,count), _Pre_bytecap_(cap) _Post_valid_impl_ _Post_z_bytecount_(count)) +#define _Out_opt_z_bytecap_post_bytecount_(cap,count) _SAL1_1_Source_(_Out_opt_z_bytecap_post_bytecount_, (cap,count), _Pre_opt_bytecap_(cap) _Post_valid_impl_ _Post_z_bytecount_(count)) + +// only use with dereferenced arguments e.g. '*pcch' +#define _Out_capcount_(capcount) _SAL1_1_Source_(_Out_capcount_, (capcount), _Pre_cap_(capcount) _Post_valid_impl_ _Post_count_(capcount)) +#define _Out_opt_capcount_(capcount) _SAL1_1_Source_(_Out_opt_capcount_, (capcount), _Pre_opt_cap_(capcount) _Post_valid_impl_ _Post_count_(capcount)) +#define _Out_bytecapcount_(capcount) _SAL1_1_Source_(_Out_bytecapcount_, (capcount), _Pre_bytecap_(capcount) _Post_valid_impl_ _Post_bytecount_(capcount)) +#define _Out_opt_bytecapcount_(capcount) _SAL1_1_Source_(_Out_opt_bytecapcount_, (capcount), _Pre_opt_bytecap_(capcount) _Post_valid_impl_ _Post_bytecount_(capcount)) + +#define _Out_capcount_x_(capcount) _SAL1_1_Source_(_Out_capcount_x_, (capcount), _Pre_cap_x_(capcount) _Post_valid_impl_ _Post_count_x_(capcount)) +#define _Out_opt_capcount_x_(capcount) _SAL1_1_Source_(_Out_opt_capcount_x_, (capcount), _Pre_opt_cap_x_(capcount) _Post_valid_impl_ _Post_count_x_(capcount)) +#define _Out_bytecapcount_x_(capcount) _SAL1_1_Source_(_Out_bytecapcount_x_, (capcount), _Pre_bytecap_x_(capcount) _Post_valid_impl_ _Post_bytecount_x_(capcount)) +#define _Out_opt_bytecapcount_x_(capcount) _SAL1_1_Source_(_Out_opt_bytecapcount_x_, (capcount), _Pre_opt_bytecap_x_(capcount) _Post_valid_impl_ _Post_bytecount_x_(capcount)) + +// e.g. 
GetString( _Out_z_capcount_(*pLen+1) char* sz, size_t* pLen ); +#define _Out_z_capcount_(capcount) _SAL1_1_Source_(_Out_z_capcount_, (capcount), _Pre_cap_(capcount) _Post_valid_impl_ _Post_z_count_(capcount)) +#define _Out_opt_z_capcount_(capcount) _SAL1_1_Source_(_Out_opt_z_capcount_, (capcount), _Pre_opt_cap_(capcount) _Post_valid_impl_ _Post_z_count_(capcount)) +#define _Out_z_bytecapcount_(capcount) _SAL1_1_Source_(_Out_z_bytecapcount_, (capcount), _Pre_bytecap_(capcount) _Post_valid_impl_ _Post_z_bytecount_(capcount)) +#define _Out_opt_z_bytecapcount_(capcount) _SAL1_1_Source_(_Out_opt_z_bytecapcount_, (capcount), _Pre_opt_bytecap_(capcount) _Post_valid_impl_ _Post_z_bytecount_(capcount)) + + +// 'inout' buffers with initialized elements before and after the call +// e.g. void ModifyIndices( _Inout_count_(cIndices) int* rgIndices, size_t cIndices ); +#define _Inout_count_(size) _SAL1_1_Source_(_Inout_count_, (size), _Prepost_count_(size)) +#define _Inout_opt_count_(size) _SAL1_1_Source_(_Inout_opt_count_, (size), _Prepost_opt_count_(size)) +#define _Inout_bytecount_(size) _SAL1_1_Source_(_Inout_bytecount_, (size), _Prepost_bytecount_(size)) +#define _Inout_opt_bytecount_(size) _SAL1_1_Source_(_Inout_opt_bytecount_, (size), _Prepost_opt_bytecount_(size)) + +#define _Inout_count_c_(size) _SAL1_1_Source_(_Inout_count_c_, (size), _Prepost_count_c_(size)) +#define _Inout_opt_count_c_(size) _SAL1_1_Source_(_Inout_opt_count_c_, (size), _Prepost_opt_count_c_(size)) +#define _Inout_bytecount_c_(size) _SAL1_1_Source_(_Inout_bytecount_c_, (size), _Prepost_bytecount_c_(size)) +#define _Inout_opt_bytecount_c_(size) _SAL1_1_Source_(_Inout_opt_bytecount_c_, (size), _Prepost_opt_bytecount_c_(size)) + +// nullterminated 'inout' buffers with initialized elements before and after the call +// e.g. 
void ModifyIndices( _Inout_count_(cIndices) int* rgIndices, size_t cIndices ); +#define _Inout_z_count_(size) _SAL1_1_Source_(_Inout_z_count_, (size), _Prepost_z_ _Prepost_count_(size)) +#define _Inout_opt_z_count_(size) _SAL1_1_Source_(_Inout_opt_z_count_, (size), _Prepost_z_ _Prepost_opt_count_(size)) +#define _Inout_z_bytecount_(size) _SAL1_1_Source_(_Inout_z_bytecount_, (size), _Prepost_z_ _Prepost_bytecount_(size)) +#define _Inout_opt_z_bytecount_(size) _SAL1_1_Source_(_Inout_opt_z_bytecount_, (size), _Prepost_z_ _Prepost_opt_bytecount_(size)) + +#define _Inout_z_count_c_(size) _SAL1_1_Source_(_Inout_z_count_c_, (size), _Prepost_z_ _Prepost_count_c_(size)) +#define _Inout_opt_z_count_c_(size) _SAL1_1_Source_(_Inout_opt_z_count_c_, (size), _Prepost_z_ _Prepost_opt_count_c_(size)) +#define _Inout_z_bytecount_c_(size) _SAL1_1_Source_(_Inout_z_bytecount_c_, (size), _Prepost_z_ _Prepost_bytecount_c_(size)) +#define _Inout_opt_z_bytecount_c_(size) _SAL1_1_Source_(_Inout_opt_z_bytecount_c_, (size), _Prepost_z_ _Prepost_opt_bytecount_c_(size)) + +#define _Inout_ptrdiff_count_(size) _SAL1_1_Source_(_Inout_ptrdiff_count_, (size), _Pre_ptrdiff_count_(size)) +#define _Inout_opt_ptrdiff_count_(size) _SAL1_1_Source_(_Inout_opt_ptrdiff_count_, (size), _Pre_opt_ptrdiff_count_(size)) + +#define _Inout_count_x_(size) _SAL1_1_Source_(_Inout_count_x_, (size), _Prepost_count_x_(size)) +#define _Inout_opt_count_x_(size) _SAL1_1_Source_(_Inout_opt_count_x_, (size), _Prepost_opt_count_x_(size)) +#define _Inout_bytecount_x_(size) _SAL1_1_Source_(_Inout_bytecount_x_, (size), _Prepost_bytecount_x_(size)) +#define _Inout_opt_bytecount_x_(size) _SAL1_1_Source_(_Inout_opt_bytecount_x_, (size), _Prepost_opt_bytecount_x_(size)) + +// e.g. void AppendToLPSTR( _In_ LPCSTR szFrom, _Inout_cap_(cchTo) LPSTR* szTo, size_t cchTo ); +#define _Inout_cap_(size) _SAL1_1_Source_(_Inout_cap_, (size), _Pre_valid_cap_(size) _Post_valid_) +#define _Inout_opt_cap_(size) _SAL1_1_Source_(_Inout_opt_cap_, (size), _Pre_opt_valid_cap_(size) _Post_valid_) +#define _Inout_bytecap_(size) _SAL1_1_Source_(_Inout_bytecap_, (size), _Pre_valid_bytecap_(size) _Post_valid_) +#define _Inout_opt_bytecap_(size) _SAL1_1_Source_(_Inout_opt_bytecap_, (size), _Pre_opt_valid_bytecap_(size) _Post_valid_) + +#define _Inout_cap_c_(size) _SAL1_1_Source_(_Inout_cap_c_, (size), _Pre_valid_cap_c_(size) _Post_valid_) +#define _Inout_opt_cap_c_(size) _SAL1_1_Source_(_Inout_opt_cap_c_, (size), _Pre_opt_valid_cap_c_(size) _Post_valid_) +#define _Inout_bytecap_c_(size) _SAL1_1_Source_(_Inout_bytecap_c_, (size), _Pre_valid_bytecap_c_(size) _Post_valid_) +#define _Inout_opt_bytecap_c_(size) _SAL1_1_Source_(_Inout_opt_bytecap_c_, (size), _Pre_opt_valid_bytecap_c_(size) _Post_valid_) + +#define _Inout_cap_x_(size) _SAL1_1_Source_(_Inout_cap_x_, (size), _Pre_valid_cap_x_(size) _Post_valid_) +#define _Inout_opt_cap_x_(size) _SAL1_1_Source_(_Inout_opt_cap_x_, (size), _Pre_opt_valid_cap_x_(size) _Post_valid_) +#define _Inout_bytecap_x_(size) _SAL1_1_Source_(_Inout_bytecap_x_, (size), _Pre_valid_bytecap_x_(size) _Post_valid_) +#define _Inout_opt_bytecap_x_(size) _SAL1_1_Source_(_Inout_opt_bytecap_x_, (size), _Pre_opt_valid_bytecap_x_(size) _Post_valid_) + +// inout string buffers with writable size +// e.g. 
void AppendStr( _In_z_ const char* szFrom, _Inout_z_cap_(cchTo) char* szTo, size_t cchTo ); +#define _Inout_z_cap_(size) _SAL1_1_Source_(_Inout_z_cap_, (size), _Pre_z_cap_(size) _Post_z_) +#define _Inout_opt_z_cap_(size) _SAL1_1_Source_(_Inout_opt_z_cap_, (size), _Pre_opt_z_cap_(size) _Post_z_) +#define _Inout_z_bytecap_(size) _SAL1_1_Source_(_Inout_z_bytecap_, (size), _Pre_z_bytecap_(size) _Post_z_) +#define _Inout_opt_z_bytecap_(size) _SAL1_1_Source_(_Inout_opt_z_bytecap_, (size), _Pre_opt_z_bytecap_(size) _Post_z_) + +#define _Inout_z_cap_c_(size) _SAL1_1_Source_(_Inout_z_cap_c_, (size), _Pre_z_cap_c_(size) _Post_z_) +#define _Inout_opt_z_cap_c_(size) _SAL1_1_Source_(_Inout_opt_z_cap_c_, (size), _Pre_opt_z_cap_c_(size) _Post_z_) +#define _Inout_z_bytecap_c_(size) _SAL1_1_Source_(_Inout_z_bytecap_c_, (size), _Pre_z_bytecap_c_(size) _Post_z_) +#define _Inout_opt_z_bytecap_c_(size) _SAL1_1_Source_(_Inout_opt_z_bytecap_c_, (size), _Pre_opt_z_bytecap_c_(size) _Post_z_) + +#define _Inout_z_cap_x_(size) _SAL1_1_Source_(_Inout_z_cap_x_, (size), _Pre_z_cap_x_(size) _Post_z_) +#define _Inout_opt_z_cap_x_(size) _SAL1_1_Source_(_Inout_opt_z_cap_x_, (size), _Pre_opt_z_cap_x_(size) _Post_z_) +#define _Inout_z_bytecap_x_(size) _SAL1_1_Source_(_Inout_z_bytecap_x_, (size), _Pre_z_bytecap_x_(size) _Post_z_) +#define _Inout_opt_z_bytecap_x_(size) _SAL1_1_Source_(_Inout_opt_z_bytecap_x_, (size), _Pre_opt_z_bytecap_x_(size) _Post_z_) + + +// returning pointers to valid objects +#define _Ret_ _SAL1_1_Source_(_Ret_, (), _Ret_valid_) +#define _Ret_opt_ _SAL1_1_Source_(_Ret_opt_, (), _Ret_opt_valid_) + +// annotations to express 'boundedness' of integral value parameter +#define _In_bound_ _SAL1_1_Source_(_In_bound_, (), _In_bound_impl_) +#define _Out_bound_ _SAL1_1_Source_(_Out_bound_, (), _Out_bound_impl_) +#define _Ret_bound_ _SAL1_1_Source_(_Ret_bound_, (), _Ret_bound_impl_) +#define _Deref_in_bound_ _SAL1_1_Source_(_Deref_in_bound_, (), _Deref_in_bound_impl_) +#define _Deref_out_bound_ _SAL1_1_Source_(_Deref_out_bound_, (), _Deref_out_bound_impl_) +#define _Deref_inout_bound_ _SAL1_1_Source_(_Deref_inout_bound_, (), _Deref_in_bound_ _Deref_out_bound_) +#define _Deref_ret_bound_ _SAL1_1_Source_(_Deref_ret_bound_, (), _Deref_ret_bound_impl_) + +// e.g. HRESULT HrCreatePoint( _Deref_out_opt_ POINT** ppPT ); +#define _Deref_out_ _SAL1_1_Source_(_Deref_out_, (), _Out_ _Deref_post_valid_) +#define _Deref_out_opt_ _SAL1_1_Source_(_Deref_out_opt_, (), _Out_ _Deref_post_opt_valid_) +#define _Deref_opt_out_ _SAL1_1_Source_(_Deref_opt_out_, (), _Out_opt_ _Deref_post_valid_) +#define _Deref_opt_out_opt_ _SAL1_1_Source_(_Deref_opt_out_opt_, (), _Out_opt_ _Deref_post_opt_valid_) + +// e.g. void CloneString( _In_z_ const WCHAR* wzFrom, _Deref_out_z_ WCHAR** pWzTo ); +#define _Deref_out_z_ _SAL1_1_Source_(_Deref_out_z_, (), _Out_ _Deref_post_z_) +#define _Deref_out_opt_z_ _SAL1_1_Source_(_Deref_out_opt_z_, (), _Out_ _Deref_post_opt_z_) +#define _Deref_opt_out_z_ _SAL1_1_Source_(_Deref_opt_out_z_, (), _Out_opt_ _Deref_post_z_) +#define _Deref_opt_out_opt_z_ _SAL1_1_Source_(_Deref_opt_out_opt_z_, (), _Out_opt_ _Deref_post_opt_z_) + +// +// _Deref_pre_ --- +// +// describing conditions for array elements of dereferenced pointer parameters that must be met before the call + +// e.g. 
void SaveStringArray( _In_count_(cStrings) _Deref_pre_z_ const WCHAR* const rgpwch[] ); +#define _Deref_pre_z_ _SAL1_1_Source_(_Deref_pre_z_, (), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__zterm_impl) _Pre_valid_impl_) +#define _Deref_pre_opt_z_ _SAL1_1_Source_(_Deref_pre_opt_z_, (), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__zterm_impl) _Pre_valid_impl_) + +// e.g. void FillInArrayOfStr32( _In_count_(cStrings) _Deref_pre_cap_c_(32) _Deref_post_z_ WCHAR* const rgpwch[] ); +// buffer capacity is described by another parameter +#define _Deref_pre_cap_(size) _SAL1_1_Source_(_Deref_pre_cap_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__cap_impl(size))) +#define _Deref_pre_opt_cap_(size) _SAL1_1_Source_(_Deref_pre_opt_cap_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__cap_impl(size))) +#define _Deref_pre_bytecap_(size) _SAL1_1_Source_(_Deref_pre_bytecap_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__bytecap_impl(size))) +#define _Deref_pre_opt_bytecap_(size) _SAL1_1_Source_(_Deref_pre_opt_bytecap_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__bytecap_impl(size))) + +// buffer capacity is described by a constant expression +#define _Deref_pre_cap_c_(size) _SAL1_1_Source_(_Deref_pre_cap_c_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__cap_c_impl(size))) +#define _Deref_pre_opt_cap_c_(size) _SAL1_1_Source_(_Deref_pre_opt_cap_c_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__cap_c_impl(size))) +#define _Deref_pre_bytecap_c_(size) _SAL1_1_Source_(_Deref_pre_bytecap_c_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__bytecap_c_impl(size))) +#define _Deref_pre_opt_bytecap_c_(size) _SAL1_1_Source_(_Deref_pre_opt_bytecap_c_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__bytecap_c_impl(size))) + +// buffer capacity is described by a complex condition +#define _Deref_pre_cap_x_(size) _SAL1_1_Source_(_Deref_pre_cap_x_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__cap_x_impl(size))) +#define _Deref_pre_opt_cap_x_(size) _SAL1_1_Source_(_Deref_pre_opt_cap_x_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__cap_x_impl(size))) +#define _Deref_pre_bytecap_x_(size) _SAL1_1_Source_(_Deref_pre_bytecap_x_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__bytecap_x_impl(size))) +#define _Deref_pre_opt_bytecap_x_(size) _SAL1_1_Source_(_Deref_pre_opt_bytecap_x_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__bytecap_x_impl(size))) + +// convenience macros for nullterminated buffers with given capacity +#define _Deref_pre_z_cap_(size) _SAL1_1_Source_(_Deref_pre_z_cap_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__cap_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_opt_z_cap_(size) _SAL1_1_Source_(_Deref_pre_opt_z_cap_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__cap_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_z_bytecap_(size) _SAL1_1_Source_(_Deref_pre_z_bytecap_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__bytecap_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_opt_z_bytecap_(size) _SAL1_1_Source_(_Deref_pre_opt_z_bytecap_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__bytecap_impl(size)) _Pre_valid_impl_) + +#define 
_Deref_pre_z_cap_c_(size) _SAL1_1_Source_(_Deref_pre_z_cap_c_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__cap_c_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_opt_z_cap_c_(size) _SAL1_1_Source_(_Deref_pre_opt_z_cap_c_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__cap_c_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_z_bytecap_c_(size) _SAL1_1_Source_(_Deref_pre_z_bytecap_c_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__bytecap_c_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_opt_z_bytecap_c_(size) _SAL1_1_Source_(_Deref_pre_opt_z_bytecap_c_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__bytecap_c_impl(size)) _Pre_valid_impl_) + +#define _Deref_pre_z_cap_x_(size) _SAL1_1_Source_(_Deref_pre_z_cap_x_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__cap_x_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_opt_z_cap_x_(size) _SAL1_1_Source_(_Deref_pre_opt_z_cap_x_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__cap_x_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_z_bytecap_x_(size) _SAL1_1_Source_(_Deref_pre_z_bytecap_x_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__bytecap_x_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_opt_z_bytecap_x_(size) _SAL1_1_Source_(_Deref_pre_opt_z_bytecap_x_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__bytecap_x_impl(size)) _Pre_valid_impl_) + +// known capacity and valid but unknown readable extent +#define _Deref_pre_valid_cap_(size) _SAL1_1_Source_(_Deref_pre_valid_cap_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__cap_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_opt_valid_cap_(size) _SAL1_1_Source_(_Deref_pre_opt_valid_cap_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__cap_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_valid_bytecap_(size) _SAL1_1_Source_(_Deref_pre_valid_bytecap_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__bytecap_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_opt_valid_bytecap_(size) _SAL1_1_Source_(_Deref_pre_opt_valid_bytecap_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__bytecap_impl(size)) _Pre_valid_impl_) + +#define _Deref_pre_valid_cap_c_(size) _SAL1_1_Source_(_Deref_pre_valid_cap_c_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__cap_c_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_opt_valid_cap_c_(size) _SAL1_1_Source_(_Deref_pre_opt_valid_cap_c_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__cap_c_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_valid_bytecap_c_(size) _SAL1_1_Source_(_Deref_pre_valid_bytecap_c_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__bytecap_c_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_opt_valid_bytecap_c_(size) _SAL1_1_Source_(_Deref_pre_opt_valid_bytecap_c_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__bytecap_c_impl(size)) _Pre_valid_impl_) + +#define _Deref_pre_valid_cap_x_(size) _SAL1_1_Source_(_Deref_pre_valid_cap_x_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__cap_x_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_opt_valid_cap_x_(size) _SAL1_1_Source_(_Deref_pre_opt_valid_cap_x_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) 
_Deref_pre1_impl_(__cap_x_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_valid_bytecap_x_(size) _SAL1_1_Source_(_Deref_pre_valid_bytecap_x_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__bytecap_x_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_opt_valid_bytecap_x_(size) _SAL1_1_Source_(_Deref_pre_opt_valid_bytecap_x_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__bytecap_x_impl(size)) _Pre_valid_impl_) + +// e.g. void SaveMatrix( _In_count_(n) _Deref_pre_count_(n) const Elem** matrix, size_t n ); +// valid buffer extent is described by another parameter +#define _Deref_pre_count_(size) _SAL1_1_Source_(_Deref_pre_count_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__count_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_opt_count_(size) _SAL1_1_Source_(_Deref_pre_opt_count_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__count_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_bytecount_(size) _SAL1_1_Source_(_Deref_pre_bytecount_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__bytecount_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_opt_bytecount_(size) _SAL1_1_Source_(_Deref_pre_opt_bytecount_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__bytecount_impl(size)) _Pre_valid_impl_) + +// valid buffer extent is described by a constant expression +#define _Deref_pre_count_c_(size) _SAL1_1_Source_(_Deref_pre_count_c_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__count_c_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_opt_count_c_(size) _SAL1_1_Source_(_Deref_pre_opt_count_c_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__count_c_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_bytecount_c_(size) _SAL1_1_Source_(_Deref_pre_bytecount_c_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__bytecount_c_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_opt_bytecount_c_(size) _SAL1_1_Source_(_Deref_pre_opt_bytecount_c_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__bytecount_c_impl(size)) _Pre_valid_impl_) + +// valid buffer extent is described by a complex expression +#define _Deref_pre_count_x_(size) _SAL1_1_Source_(_Deref_pre_count_x_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__count_x_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_opt_count_x_(size) _SAL1_1_Source_(_Deref_pre_opt_count_x_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__count_x_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_bytecount_x_(size) _SAL1_1_Source_(_Deref_pre_bytecount_x_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__bytecount_x_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_opt_bytecount_x_(size) _SAL1_1_Source_(_Deref_pre_opt_bytecount_x_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__bytecount_x_impl(size)) _Pre_valid_impl_) + +// e.g. 
void PrintStringArray( _In_count_(cElems) _Deref_pre_valid_ LPCSTR rgStr[], size_t cElems ); +#define _Deref_pre_valid_ _SAL1_1_Source_(_Deref_pre_valid_, (), _Deref_pre1_impl_(__notnull_impl_notref) _Pre_valid_impl_) +#define _Deref_pre_opt_valid_ _SAL1_1_Source_(_Deref_pre_opt_valid_, (), _Deref_pre1_impl_(__maybenull_impl_notref) _Pre_valid_impl_) +#define _Deref_pre_invalid_ _SAL1_1_Source_(_Deref_pre_invalid_, (), _Deref_pre1_impl_(__notvalid_impl)) + +#define _Deref_pre_notnull_ _SAL1_1_Source_(_Deref_pre_notnull_, (), _Deref_pre1_impl_(__notnull_impl_notref)) +#define _Deref_pre_maybenull_ _SAL1_1_Source_(_Deref_pre_maybenull_, (), _Deref_pre1_impl_(__maybenull_impl_notref)) +#define _Deref_pre_null_ _SAL1_1_Source_(_Deref_pre_null_, (), _Deref_pre1_impl_(__null_impl_notref)) + +// restrict access rights +#define _Deref_pre_readonly_ _SAL1_1_Source_(_Deref_pre_readonly_, (), _Deref_pre1_impl_(__readaccess_impl_notref)) +#define _Deref_pre_writeonly_ _SAL1_1_Source_(_Deref_pre_writeonly_, (), _Deref_pre1_impl_(__writeaccess_impl_notref)) + +// +// _Deref_post_ --- +// +// describing conditions for array elements or dereferenced pointer parameters that hold after the call + +// e.g. void CloneString( _In_z_ const Wchar_t* wzIn _Out_ _Deref_post_z_ WCHAR** pWzOut ); +#define _Deref_post_z_ _SAL1_1_Source_(_Deref_post_z_, (), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__zterm_impl) _Post_valid_impl_) +#define _Deref_post_opt_z_ _SAL1_1_Source_(_Deref_post_opt_z_, (), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__zterm_impl) _Post_valid_impl_) + +// e.g. HRESULT HrAllocateMemory( size_t cb, _Out_ _Deref_post_bytecap_(cb) void** ppv ); +// buffer capacity is described by another parameter +#define _Deref_post_cap_(size) _SAL1_1_Source_(_Deref_post_cap_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__cap_impl(size))) +#define _Deref_post_opt_cap_(size) _SAL1_1_Source_(_Deref_post_opt_cap_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__cap_impl(size))) +#define _Deref_post_bytecap_(size) _SAL1_1_Source_(_Deref_post_bytecap_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__bytecap_impl(size))) +#define _Deref_post_opt_bytecap_(size) _SAL1_1_Source_(_Deref_post_opt_bytecap_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__bytecap_impl(size))) + +// buffer capacity is described by a constant expression +#define _Deref_post_cap_c_(size) _SAL1_1_Source_(_Deref_post_cap_c_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__cap_c_impl(size))) +#define _Deref_post_opt_cap_c_(size) _SAL1_1_Source_(_Deref_post_opt_cap_c_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__cap_c_impl(size))) +#define _Deref_post_bytecap_c_(size) _SAL1_1_Source_(_Deref_post_bytecap_c_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__bytecap_c_impl(size))) +#define _Deref_post_opt_bytecap_c_(size) _SAL1_1_Source_(_Deref_post_opt_bytecap_c_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__bytecap_c_impl(size))) + +// buffer capacity is described by a complex expression +#define _Deref_post_cap_x_(size) _SAL1_1_Source_(_Deref_post_cap_x_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__cap_x_impl(size))) +#define _Deref_post_opt_cap_x_(size) _SAL1_1_Source_(_Deref_post_opt_cap_x_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__cap_x_impl(size))) 
+#define _Deref_post_bytecap_x_(size) _SAL1_1_Source_(_Deref_post_bytecap_x_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__bytecap_x_impl(size))) +#define _Deref_post_opt_bytecap_x_(size) _SAL1_1_Source_(_Deref_post_opt_bytecap_x_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__bytecap_x_impl(size))) + +// convenience macros for nullterminated buffers with given capacity +#define _Deref_post_z_cap_(size) _SAL1_1_Source_(_Deref_post_z_cap_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post2_impl_(__zterm_impl,__cap_impl(size)) _Post_valid_impl_) +#define _Deref_post_opt_z_cap_(size) _SAL1_1_Source_(_Deref_post_opt_z_cap_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post2_impl_(__zterm_impl,__cap_impl(size)) _Post_valid_impl_) +#define _Deref_post_z_bytecap_(size) _SAL1_1_Source_(_Deref_post_z_bytecap_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post2_impl_(__zterm_impl,__bytecap_impl(size)) _Post_valid_impl_) +#define _Deref_post_opt_z_bytecap_(size) _SAL1_1_Source_(_Deref_post_opt_z_bytecap_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post2_impl_(__zterm_impl,__bytecap_impl(size)) _Post_valid_impl_) + +#define _Deref_post_z_cap_c_(size) _SAL1_1_Source_(_Deref_post_z_cap_c_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post2_impl_(__zterm_impl,__cap_c_impl(size)) _Post_valid_impl_) +#define _Deref_post_opt_z_cap_c_(size) _SAL1_1_Source_(_Deref_post_opt_z_cap_c_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post2_impl_(__zterm_impl,__cap_c_impl(size)) _Post_valid_impl_) +#define _Deref_post_z_bytecap_c_(size) _SAL1_1_Source_(_Deref_post_z_bytecap_c_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post2_impl_(__zterm_impl,__bytecap_c_impl(size)) _Post_valid_impl_) +#define _Deref_post_opt_z_bytecap_c_(size) _SAL1_1_Source_(_Deref_post_opt_z_bytecap_c_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post2_impl_(__zterm_impl,__bytecap_c_impl(size)) _Post_valid_impl_) + +#define _Deref_post_z_cap_x_(size) _SAL1_1_Source_(_Deref_post_z_cap_x_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post2_impl_(__zterm_impl,__cap_x_impl(size)) _Post_valid_impl_) +#define _Deref_post_opt_z_cap_x_(size) _SAL1_1_Source_(_Deref_post_opt_z_cap_x_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post2_impl_(__zterm_impl,__cap_x_impl(size)) _Post_valid_impl_) +#define _Deref_post_z_bytecap_x_(size) _SAL1_1_Source_(_Deref_post_z_bytecap_x_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post2_impl_(__zterm_impl,__bytecap_x_impl(size)) _Post_valid_impl_) +#define _Deref_post_opt_z_bytecap_x_(size) _SAL1_1_Source_(_Deref_post_opt_z_bytecap_x_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post2_impl_(__zterm_impl,__bytecap_x_impl(size)) _Post_valid_impl_) + +// known capacity and valid but unknown readable extent +#define _Deref_post_valid_cap_(size) _SAL1_1_Source_(_Deref_post_valid_cap_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__cap_impl(size)) _Post_valid_impl_) +#define _Deref_post_opt_valid_cap_(size) _SAL1_1_Source_(_Deref_post_opt_valid_cap_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__cap_impl(size)) _Post_valid_impl_) +#define _Deref_post_valid_bytecap_(size) _SAL1_1_Source_(_Deref_post_valid_bytecap_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__bytecap_impl(size)) _Post_valid_impl_) +#define 
_Deref_post_opt_valid_bytecap_(size) _SAL1_1_Source_(_Deref_post_opt_valid_bytecap_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__bytecap_impl(size)) _Post_valid_impl_) + +#define _Deref_post_valid_cap_c_(size) _SAL1_1_Source_(_Deref_post_valid_cap_c_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__cap_c_impl(size)) _Post_valid_impl_) +#define _Deref_post_opt_valid_cap_c_(size) _SAL1_1_Source_(_Deref_post_opt_valid_cap_c_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__cap_c_impl(size)) _Post_valid_impl_) +#define _Deref_post_valid_bytecap_c_(size) _SAL1_1_Source_(_Deref_post_valid_bytecap_c_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__bytecap_c_impl(size)) _Post_valid_impl_) +#define _Deref_post_opt_valid_bytecap_c_(size) _SAL1_1_Source_(_Deref_post_opt_valid_bytecap_c_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__bytecap_c_impl(size)) _Post_valid_impl_) + +#define _Deref_post_valid_cap_x_(size) _SAL1_1_Source_(_Deref_post_valid_cap_x_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__cap_x_impl(size)) _Post_valid_impl_) +#define _Deref_post_opt_valid_cap_x_(size) _SAL1_1_Source_(_Deref_post_opt_valid_cap_x_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__cap_x_impl(size)) _Post_valid_impl_) +#define _Deref_post_valid_bytecap_x_(size) _SAL1_1_Source_(_Deref_post_valid_bytecap_x_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__bytecap_x_impl(size)) _Post_valid_impl_) +#define _Deref_post_opt_valid_bytecap_x_(size) _SAL1_1_Source_(_Deref_post_opt_valid_bytecap_x_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__bytecap_x_impl(size)) _Post_valid_impl_) + +// e.g. 
HRESULT HrAllocateZeroInitializedMemory( size_t cb, _Out_ _Deref_post_bytecount_(cb) void** ppv ); +// valid buffer extent is described by another parameter +#define _Deref_post_count_(size) _SAL1_1_Source_(_Deref_post_count_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__count_impl(size)) _Post_valid_impl_) +#define _Deref_post_opt_count_(size) _SAL1_1_Source_(_Deref_post_opt_count_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__count_impl(size)) _Post_valid_impl_) +#define _Deref_post_bytecount_(size) _SAL1_1_Source_(_Deref_post_bytecount_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__bytecount_impl(size)) _Post_valid_impl_) +#define _Deref_post_opt_bytecount_(size) _SAL1_1_Source_(_Deref_post_opt_bytecount_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__bytecount_impl(size)) _Post_valid_impl_) + +// buffer capacity is described by a constant expression +#define _Deref_post_count_c_(size) _SAL1_1_Source_(_Deref_post_count_c_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__count_c_impl(size)) _Post_valid_impl_) +#define _Deref_post_opt_count_c_(size) _SAL1_1_Source_(_Deref_post_opt_count_c_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__count_c_impl(size)) _Post_valid_impl_) +#define _Deref_post_bytecount_c_(size) _SAL1_1_Source_(_Deref_post_bytecount_c_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__bytecount_c_impl(size)) _Post_valid_impl_) +#define _Deref_post_opt_bytecount_c_(size) _SAL1_1_Source_(_Deref_post_opt_bytecount_c_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__bytecount_c_impl(size)) _Post_valid_impl_) + +// buffer capacity is described by a complex expression +#define _Deref_post_count_x_(size) _SAL1_1_Source_(_Deref_post_count_x_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__count_x_impl(size)) _Post_valid_impl_) +#define _Deref_post_opt_count_x_(size) _SAL1_1_Source_(_Deref_post_opt_count_x_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__count_x_impl(size)) _Post_valid_impl_) +#define _Deref_post_bytecount_x_(size) _SAL1_1_Source_(_Deref_post_bytecount_x_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__bytecount_x_impl(size)) _Post_valid_impl_) +#define _Deref_post_opt_bytecount_x_(size) _SAL1_1_Source_(_Deref_post_opt_bytecount_x_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__bytecount_x_impl(size)) _Post_valid_impl_) + +// e.g. 
void GetStrings( _Out_count_(cElems) _Deref_post_valid_ LPSTR const rgStr[], size_t cElems ); +#define _Deref_post_valid_ _SAL1_1_Source_(_Deref_post_valid_, (), _Deref_post1_impl_(__notnull_impl_notref) _Post_valid_impl_) +#define _Deref_post_opt_valid_ _SAL1_1_Source_(_Deref_post_opt_valid_, (), _Deref_post1_impl_(__maybenull_impl_notref) _Post_valid_impl_) + +#define _Deref_post_notnull_ _SAL1_1_Source_(_Deref_post_notnull_, (), _Deref_post1_impl_(__notnull_impl_notref)) +#define _Deref_post_maybenull_ _SAL1_1_Source_(_Deref_post_maybenull_, (), _Deref_post1_impl_(__maybenull_impl_notref)) +#define _Deref_post_null_ _SAL1_1_Source_(_Deref_post_null_, (), _Deref_post1_impl_(__null_impl_notref)) + +// +// _Deref_ret_ --- +// + +#define _Deref_ret_z_ _SAL1_1_Source_(_Deref_ret_z_, (), _Deref_ret1_impl_(__notnull_impl_notref) _Deref_ret1_impl_(__zterm_impl)) +#define _Deref_ret_opt_z_ _SAL1_1_Source_(_Deref_ret_opt_z_, (), _Deref_ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__zterm_impl)) + +// +// special _Deref_ --- +// +#define _Deref2_pre_readonly_ _SAL1_1_Source_(_Deref2_pre_readonly_, (), _Deref2_pre1_impl_(__readaccess_impl_notref)) + +// +// _Ret_ --- +// + +// e.g. _Ret_opt_valid_ LPSTR void* CloneSTR( _Pre_valid_ LPSTR src ); +#define _Ret_opt_valid_ _SAL1_1_Source_(_Ret_opt_valid_, (), _Ret1_impl_(__maybenull_impl_notref) _Ret_valid_impl_) +#define _Ret_opt_z_ _SAL1_1_Source_(_Ret_opt_z_, (), _Ret2_impl_(__maybenull_impl,__zterm_impl) _Ret_valid_impl_) + +// e.g. _Ret_opt_bytecap_(cb) void* AllocateMemory( size_t cb ); +// Buffer capacity is described by another parameter +#define _Ret_cap_(size) _SAL1_1_Source_(_Ret_cap_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__cap_impl(size))) +#define _Ret_opt_cap_(size) _SAL1_1_Source_(_Ret_opt_cap_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__cap_impl(size))) +#define _Ret_bytecap_(size) _SAL1_1_Source_(_Ret_bytecap_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__bytecap_impl(size))) +#define _Ret_opt_bytecap_(size) _SAL1_1_Source_(_Ret_opt_bytecap_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__bytecap_impl(size))) + +// Buffer capacity is described by a constant expression +#define _Ret_cap_c_(size) _SAL1_1_Source_(_Ret_cap_c_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__cap_c_impl(size))) +#define _Ret_opt_cap_c_(size) _SAL1_1_Source_(_Ret_opt_cap_c_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__cap_c_impl(size))) +#define _Ret_bytecap_c_(size) _SAL1_1_Source_(_Ret_bytecap_c_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__bytecap_c_impl(size))) +#define _Ret_opt_bytecap_c_(size) _SAL1_1_Source_(_Ret_opt_bytecap_c_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__bytecap_c_impl(size))) + +// Buffer capacity is described by a complex condition +#define _Ret_cap_x_(size) _SAL1_1_Source_(_Ret_cap_x_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__cap_x_impl(size))) +#define _Ret_opt_cap_x_(size) _SAL1_1_Source_(_Ret_opt_cap_x_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__cap_x_impl(size))) +#define _Ret_bytecap_x_(size) _SAL1_1_Source_(_Ret_bytecap_x_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__bytecap_x_impl(size))) +#define _Ret_opt_bytecap_x_(size) _SAL1_1_Source_(_Ret_opt_bytecap_x_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__bytecap_x_impl(size))) + +// return value is nullterminated and capacity is given by another parameter +#define _Ret_z_cap_(size) 
_SAL1_1_Source_(_Ret_z_cap_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret2_impl_(__zterm_impl,__cap_impl(size)) _Ret_valid_impl_) +#define _Ret_opt_z_cap_(size) _SAL1_1_Source_(_Ret_opt_z_cap_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret2_impl_(__zterm_impl,__cap_impl(size)) _Ret_valid_impl_) +#define _Ret_z_bytecap_(size) _SAL1_1_Source_(_Ret_z_bytecap_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret2_impl_(__zterm_impl,__bytecap_impl(size)) _Ret_valid_impl_) +#define _Ret_opt_z_bytecap_(size) _SAL1_1_Source_(_Ret_opt_z_bytecap_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret2_impl_(__zterm_impl,__bytecap_impl(size)) _Ret_valid_impl_) + +// e.g. _Ret_opt_bytecount_(cb) void* AllocateZeroInitializedMemory( size_t cb ); +// Valid Buffer extent is described by another parameter +#define _Ret_count_(size) _SAL1_1_Source_(_Ret_count_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__count_impl(size)) _Ret_valid_impl_) +#define _Ret_opt_count_(size) _SAL1_1_Source_(_Ret_opt_count_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__count_impl(size)) _Ret_valid_impl_) +#define _Ret_bytecount_(size) _SAL1_1_Source_(_Ret_bytecount_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__bytecount_impl(size)) _Ret_valid_impl_) +#define _Ret_opt_bytecount_(size) _SAL1_1_Source_(_Ret_opt_bytecount_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__bytecount_impl(size)) _Ret_valid_impl_) + +// Valid Buffer extent is described by a constant expression +#define _Ret_count_c_(size) _SAL1_1_Source_(_Ret_count_c_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__count_c_impl(size)) _Ret_valid_impl_) +#define _Ret_opt_count_c_(size) _SAL1_1_Source_(_Ret_opt_count_c_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__count_c_impl(size)) _Ret_valid_impl_) +#define _Ret_bytecount_c_(size) _SAL1_1_Source_(_Ret_bytecount_c_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__bytecount_c_impl(size)) _Ret_valid_impl_) +#define _Ret_opt_bytecount_c_(size) _SAL1_1_Source_(_Ret_opt_bytecount_c_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__bytecount_c_impl(size)) _Ret_valid_impl_) + +// Valid Buffer extent is described by a complex expression +#define _Ret_count_x_(size) _SAL1_1_Source_(_Ret_count_x_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__count_x_impl(size)) _Ret_valid_impl_) +#define _Ret_opt_count_x_(size) _SAL1_1_Source_(_Ret_opt_count_x_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__count_x_impl(size)) _Ret_valid_impl_) +#define _Ret_bytecount_x_(size) _SAL1_1_Source_(_Ret_bytecount_x_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__bytecount_x_impl(size)) _Ret_valid_impl_) +#define _Ret_opt_bytecount_x_(size) _SAL1_1_Source_(_Ret_opt_bytecount_x_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__bytecount_x_impl(size)) _Ret_valid_impl_) + +// return value is nullterminated and length is given by another parameter +#define _Ret_z_count_(size) _SAL1_1_Source_(_Ret_z_count_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret2_impl_(__zterm_impl,__count_impl(size)) _Ret_valid_impl_) +#define _Ret_opt_z_count_(size) _SAL1_1_Source_(_Ret_opt_z_count_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret2_impl_(__zterm_impl,__count_impl(size)) _Ret_valid_impl_) +#define _Ret_z_bytecount_(size) _SAL1_1_Source_(_Ret_z_bytecount_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret2_impl_(__zterm_impl,__bytecount_impl(size)) _Ret_valid_impl_) +#define _Ret_opt_z_bytecount_(size) 
_SAL1_1_Source_(_Ret_opt_z_bytecount_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret2_impl_(__zterm_impl,__bytecount_impl(size)) _Ret_valid_impl_) + + +// _Pre_ annotations --- +#define _Pre_opt_z_ _SAL1_1_Source_(_Pre_opt_z_, (), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__zterm_impl) _Pre_valid_impl_) + +// restrict access rights +#define _Pre_readonly_ _SAL1_1_Source_(_Pre_readonly_, (), _Pre1_impl_(__readaccess_impl_notref)) +#define _Pre_writeonly_ _SAL1_1_Source_(_Pre_writeonly_, (), _Pre1_impl_(__writeaccess_impl_notref)) + +// e.g. void FreeMemory( _Pre_bytecap_(cb) _Post_ptr_invalid_ void* pv, size_t cb ); +// buffer capacity described by another parameter +#define _Pre_cap_(size) _SAL1_1_Source_(_Pre_cap_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_impl(size))) +#define _Pre_opt_cap_(size) _SAL1_1_Source_(_Pre_opt_cap_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_impl(size))) +#define _Pre_bytecap_(size) _SAL1_1_Source_(_Pre_bytecap_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__bytecap_impl(size))) +#define _Pre_opt_bytecap_(size) _SAL1_1_Source_(_Pre_opt_bytecap_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__bytecap_impl(size))) + +// buffer capacity described by a constant expression +#define _Pre_cap_c_(size) _SAL1_1_Source_(_Pre_cap_c_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_c_impl(size))) +#define _Pre_opt_cap_c_(size) _SAL1_1_Source_(_Pre_opt_cap_c_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_c_impl(size))) +#define _Pre_bytecap_c_(size) _SAL1_1_Source_(_Pre_bytecap_c_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__bytecap_c_impl(size))) +#define _Pre_opt_bytecap_c_(size) _SAL1_1_Source_(_Pre_opt_bytecap_c_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__bytecap_c_impl(size))) +#define _Pre_cap_c_one_ _SAL1_1_Source_(_Pre_cap_c_one_, (), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_c_one_notref_impl)) +#define _Pre_opt_cap_c_one_ _SAL1_1_Source_(_Pre_opt_cap_c_one_, (), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_c_one_notref_impl)) + +// buffer capacity is described by another parameter multiplied by a constant expression +#define _Pre_cap_m_(mult,size) _SAL1_1_Source_(_Pre_cap_m_, (mult,size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__mult_impl(mult,size))) +#define _Pre_opt_cap_m_(mult,size) _SAL1_1_Source_(_Pre_opt_cap_m_, (mult,size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__mult_impl(mult,size))) + +// buffer capacity described by size of other buffer, only used by dangerous legacy APIs +// e.g. 
int strcpy(_Pre_cap_for_(src) char* dst, const char* src); +#define _Pre_cap_for_(param) _SAL1_1_Source_(_Pre_cap_for_, (param), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_for_impl(param))) +#define _Pre_opt_cap_for_(param) _SAL1_1_Source_(_Pre_opt_cap_for_, (param), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_for_impl(param))) + +// buffer capacity described by a complex condition +#define _Pre_cap_x_(size) _SAL1_1_Source_(_Pre_cap_x_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_x_impl(size))) +#define _Pre_opt_cap_x_(size) _SAL1_1_Source_(_Pre_opt_cap_x_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_x_impl(size))) +#define _Pre_bytecap_x_(size) _SAL1_1_Source_(_Pre_bytecap_x_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__bytecap_x_impl(size))) +#define _Pre_opt_bytecap_x_(size) _SAL1_1_Source_(_Pre_opt_bytecap_x_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__bytecap_x_impl(size))) + +// buffer capacity described by the difference to another pointer parameter +#define _Pre_ptrdiff_cap_(ptr) _SAL1_1_Source_(_Pre_ptrdiff_cap_, (ptr), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_x_impl(__ptrdiff(ptr)))) +#define _Pre_opt_ptrdiff_cap_(ptr) _SAL1_1_Source_(_Pre_opt_ptrdiff_cap_, (ptr), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_x_impl(__ptrdiff(ptr)))) + +// e.g. void AppendStr( _Pre_z_ const char* szFrom, _Pre_z_cap_(cchTo) _Post_z_ char* szTo, size_t cchTo ); +#define _Pre_z_cap_(size) _SAL1_1_Source_(_Pre_z_cap_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre2_impl_(__zterm_impl,__cap_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_z_cap_(size) _SAL1_1_Source_(_Pre_opt_z_cap_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre2_impl_(__zterm_impl,__cap_impl(size)) _Pre_valid_impl_) +#define _Pre_z_bytecap_(size) _SAL1_1_Source_(_Pre_z_bytecap_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre2_impl_(__zterm_impl,__bytecap_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_z_bytecap_(size) _SAL1_1_Source_(_Pre_opt_z_bytecap_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre2_impl_(__zterm_impl,__bytecap_impl(size)) _Pre_valid_impl_) + +#define _Pre_z_cap_c_(size) _SAL1_1_Source_(_Pre_z_cap_c_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre2_impl_(__zterm_impl,__cap_c_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_z_cap_c_(size) _SAL1_1_Source_(_Pre_opt_z_cap_c_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre2_impl_(__zterm_impl,__cap_c_impl(size)) _Pre_valid_impl_) +#define _Pre_z_bytecap_c_(size) _SAL1_1_Source_(_Pre_z_bytecap_c_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre2_impl_(__zterm_impl,__bytecap_c_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_z_bytecap_c_(size) _SAL1_1_Source_(_Pre_opt_z_bytecap_c_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre2_impl_(__zterm_impl,__bytecap_c_impl(size)) _Pre_valid_impl_) + +#define _Pre_z_cap_x_(size) _SAL1_1_Source_(_Pre_z_cap_x_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre2_impl_(__zterm_impl,__cap_x_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_z_cap_x_(size) _SAL1_1_Source_(_Pre_opt_z_cap_x_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre2_impl_(__zterm_impl,__cap_x_impl(size)) _Pre_valid_impl_) +#define _Pre_z_bytecap_x_(size) _SAL1_1_Source_(_Pre_z_bytecap_x_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre2_impl_(__zterm_impl,__bytecap_x_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_z_bytecap_x_(size) _SAL1_1_Source_(_Pre_opt_z_bytecap_x_, (size), _Pre1_impl_(__maybenull_impl_notref) 
_Pre2_impl_(__zterm_impl,__bytecap_x_impl(size)) _Pre_valid_impl_) + +// known capacity and valid but unknown readable extent +#define _Pre_valid_cap_(size) _SAL1_1_Source_(_Pre_valid_cap_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_valid_cap_(size) _SAL1_1_Source_(_Pre_opt_valid_cap_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_impl(size)) _Pre_valid_impl_) +#define _Pre_valid_bytecap_(size) _SAL1_1_Source_(_Pre_valid_bytecap_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__bytecap_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_valid_bytecap_(size) _SAL1_1_Source_(_Pre_opt_valid_bytecap_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__bytecap_impl(size)) _Pre_valid_impl_) + +#define _Pre_valid_cap_c_(size) _SAL1_1_Source_(_Pre_valid_cap_c_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_c_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_valid_cap_c_(size) _SAL1_1_Source_(_Pre_opt_valid_cap_c_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_c_impl(size)) _Pre_valid_impl_) +#define _Pre_valid_bytecap_c_(size) _SAL1_1_Source_(_Pre_valid_bytecap_c_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__bytecap_c_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_valid_bytecap_c_(size) _SAL1_1_Source_(_Pre_opt_valid_bytecap_c_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__bytecap_c_impl(size)) _Pre_valid_impl_) + +#define _Pre_valid_cap_x_(size) _SAL1_1_Source_(_Pre_valid_cap_x_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_x_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_valid_cap_x_(size) _SAL1_1_Source_(_Pre_opt_valid_cap_x_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_x_impl(size)) _Pre_valid_impl_) +#define _Pre_valid_bytecap_x_(size) _SAL1_1_Source_(_Pre_valid_bytecap_x_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__bytecap_x_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_valid_bytecap_x_(size) _SAL1_1_Source_(_Pre_opt_valid_bytecap_x_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__bytecap_x_impl(size)) _Pre_valid_impl_) + +// e.g. 
void AppendCharRange( _Pre_count_(cchFrom) const char* rgFrom, size_t cchFrom, _Out_z_cap_(cchTo) char* szTo, size_t cchTo ); +// Valid buffer extent described by another parameter +#define _Pre_count_(size) _SAL1_1_Source_(_Pre_count_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__count_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_count_(size) _SAL1_1_Source_(_Pre_opt_count_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__count_impl(size)) _Pre_valid_impl_) +#define _Pre_bytecount_(size) _SAL1_1_Source_(_Pre_bytecount_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__bytecount_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_bytecount_(size) _SAL1_1_Source_(_Pre_opt_bytecount_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__bytecount_impl(size)) _Pre_valid_impl_) + +// Valid buffer extent described by a constant expression +#define _Pre_count_c_(size) _SAL1_1_Source_(_Pre_count_c_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__count_c_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_count_c_(size) _SAL1_1_Source_(_Pre_opt_count_c_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__count_c_impl(size)) _Pre_valid_impl_) +#define _Pre_bytecount_c_(size) _SAL1_1_Source_(_Pre_bytecount_c_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__bytecount_c_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_bytecount_c_(size) _SAL1_1_Source_(_Pre_opt_bytecount_c_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__bytecount_c_impl(size)) _Pre_valid_impl_) + +// Valid buffer extent described by a complex expression +#define _Pre_count_x_(size) _SAL1_1_Source_(_Pre_count_x_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__count_x_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_count_x_(size) _SAL1_1_Source_(_Pre_opt_count_x_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__count_x_impl(size)) _Pre_valid_impl_) +#define _Pre_bytecount_x_(size) _SAL1_1_Source_(_Pre_bytecount_x_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__bytecount_x_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_bytecount_x_(size) _SAL1_1_Source_(_Pre_opt_bytecount_x_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__bytecount_x_impl(size)) _Pre_valid_impl_) + +// Valid buffer extent described by the difference to another pointer parameter +#define _Pre_ptrdiff_count_(ptr) _SAL1_1_Source_(_Pre_ptrdiff_count_, (ptr), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__count_x_impl(__ptrdiff(ptr))) _Pre_valid_impl_) +#define _Pre_opt_ptrdiff_count_(ptr) _SAL1_1_Source_(_Pre_opt_ptrdiff_count_, (ptr), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__count_x_impl(__ptrdiff(ptr))) _Pre_valid_impl_) + + +// char * strncpy(_Out_cap_(_Count) _Post_maybez_ char * _Dest, _In_z_ const char * _Source, _In_ size_t _Count) +// buffer maybe zero-terminated after the call +#define _Post_maybez_ _SAL1_1_Source_(_Post_maybez_, (), _Post1_impl_(__maybezterm_impl)) + +// e.g. SIZE_T HeapSize( _In_ HANDLE hHeap, DWORD dwFlags, _Pre_notnull_ _Post_bytecap_(return) LPCVOID lpMem ); +#define _Post_cap_(size) _SAL1_1_Source_(_Post_cap_, (size), _Post1_impl_(__cap_impl(size))) +#define _Post_bytecap_(size) _SAL1_1_Source_(_Post_bytecap_, (size), _Post1_impl_(__bytecap_impl(size))) + +// e.g. 
int strlen( _In_z_ _Post_count_(return+1) const char* sz ); +#define _Post_count_(size) _SAL1_1_Source_(_Post_count_, (size), _Post1_impl_(__count_impl(size)) _Post_valid_impl_) +#define _Post_bytecount_(size) _SAL1_1_Source_(_Post_bytecount_, (size), _Post1_impl_(__bytecount_impl(size)) _Post_valid_impl_) +#define _Post_count_c_(size) _SAL1_1_Source_(_Post_count_c_, (size), _Post1_impl_(__count_c_impl(size)) _Post_valid_impl_) +#define _Post_bytecount_c_(size) _SAL1_1_Source_(_Post_bytecount_c_, (size), _Post1_impl_(__bytecount_c_impl(size)) _Post_valid_impl_) +#define _Post_count_x_(size) _SAL1_1_Source_(_Post_count_x_, (size), _Post1_impl_(__count_x_impl(size)) _Post_valid_impl_) +#define _Post_bytecount_x_(size) _SAL1_1_Source_(_Post_bytecount_x_, (size), _Post1_impl_(__bytecount_x_impl(size)) _Post_valid_impl_) + +// e.g. size_t CopyStr( _In_z_ const char* szFrom, _Pre_cap_(cch) _Post_z_count_(return+1) char* szFrom, size_t cchFrom ); +#define _Post_z_count_(size) _SAL1_1_Source_(_Post_z_count_, (size), _Post2_impl_(__zterm_impl,__count_impl(size)) _Post_valid_impl_) +#define _Post_z_bytecount_(size) _SAL1_1_Source_(_Post_z_bytecount_, (size), _Post2_impl_(__zterm_impl,__bytecount_impl(size)) _Post_valid_impl_) +#define _Post_z_count_c_(size) _SAL1_1_Source_(_Post_z_count_c_, (size), _Post2_impl_(__zterm_impl,__count_c_impl(size)) _Post_valid_impl_) +#define _Post_z_bytecount_c_(size) _SAL1_1_Source_(_Post_z_bytecount_c_, (size), _Post2_impl_(__zterm_impl,__bytecount_c_impl(size)) _Post_valid_impl_) +#define _Post_z_count_x_(size) _SAL1_1_Source_(_Post_z_count_x_, (size), _Post2_impl_(__zterm_impl,__count_x_impl(size)) _Post_valid_impl_) +#define _Post_z_bytecount_x_(size) _SAL1_1_Source_(_Post_z_bytecount_x_, (size), _Post2_impl_(__zterm_impl,__bytecount_x_impl(size)) _Post_valid_impl_) + +// +// _Prepost_ --- +// +// describing conditions that hold before and after the function call + +#define _Prepost_opt_z_ _SAL1_1_Source_(_Prepost_opt_z_, (), _Pre_opt_z_ _Post_z_) + +#define _Prepost_count_(size) _SAL1_1_Source_(_Prepost_count_, (size), _Pre_count_(size) _Post_count_(size)) +#define _Prepost_opt_count_(size) _SAL1_1_Source_(_Prepost_opt_count_, (size), _Pre_opt_count_(size) _Post_count_(size)) +#define _Prepost_bytecount_(size) _SAL1_1_Source_(_Prepost_bytecount_, (size), _Pre_bytecount_(size) _Post_bytecount_(size)) +#define _Prepost_opt_bytecount_(size) _SAL1_1_Source_(_Prepost_opt_bytecount_, (size), _Pre_opt_bytecount_(size) _Post_bytecount_(size)) +#define _Prepost_count_c_(size) _SAL1_1_Source_(_Prepost_count_c_, (size), _Pre_count_c_(size) _Post_count_c_(size)) +#define _Prepost_opt_count_c_(size) _SAL1_1_Source_(_Prepost_opt_count_c_, (size), _Pre_opt_count_c_(size) _Post_count_c_(size)) +#define _Prepost_bytecount_c_(size) _SAL1_1_Source_(_Prepost_bytecount_c_, (size), _Pre_bytecount_c_(size) _Post_bytecount_c_(size)) +#define _Prepost_opt_bytecount_c_(size) _SAL1_1_Source_(_Prepost_opt_bytecount_c_, (size), _Pre_opt_bytecount_c_(size) _Post_bytecount_c_(size)) +#define _Prepost_count_x_(size) _SAL1_1_Source_(_Prepost_count_x_, (size), _Pre_count_x_(size) _Post_count_x_(size)) +#define _Prepost_opt_count_x_(size) _SAL1_1_Source_(_Prepost_opt_count_x_, (size), _Pre_opt_count_x_(size) _Post_count_x_(size)) +#define _Prepost_bytecount_x_(size) _SAL1_1_Source_(_Prepost_bytecount_x_, (size), _Pre_bytecount_x_(size) _Post_bytecount_x_(size)) +#define _Prepost_opt_bytecount_x_(size) _SAL1_1_Source_(_Prepost_opt_bytecount_x_, (size), _Pre_opt_bytecount_x_(size) 
_Post_bytecount_x_(size)) + +#define _Prepost_valid_ _SAL1_1_Source_(_Prepost_valid_, (), _Pre_valid_ _Post_valid_) +#define _Prepost_opt_valid_ _SAL1_1_Source_(_Prepost_opt_valid_, (), _Pre_opt_valid_ _Post_valid_) + +// +// _Deref_ --- +// +// short version for _Deref_pre_ _Deref_post_ +// describing conditions for array elements or dereferenced pointer parameters that hold before and after the call + +#define _Deref_prepost_z_ _SAL1_1_Source_(_Deref_prepost_z_, (), _Deref_pre_z_ _Deref_post_z_) +#define _Deref_prepost_opt_z_ _SAL1_1_Source_(_Deref_prepost_opt_z_, (), _Deref_pre_opt_z_ _Deref_post_opt_z_) + +#define _Deref_prepost_cap_(size) _SAL1_1_Source_(_Deref_prepost_cap_, (size), _Deref_pre_cap_(size) _Deref_post_cap_(size)) +#define _Deref_prepost_opt_cap_(size) _SAL1_1_Source_(_Deref_prepost_opt_cap_, (size), _Deref_pre_opt_cap_(size) _Deref_post_opt_cap_(size)) +#define _Deref_prepost_bytecap_(size) _SAL1_1_Source_(_Deref_prepost_bytecap_, (size), _Deref_pre_bytecap_(size) _Deref_post_bytecap_(size)) +#define _Deref_prepost_opt_bytecap_(size) _SAL1_1_Source_(_Deref_prepost_opt_bytecap_, (size), _Deref_pre_opt_bytecap_(size) _Deref_post_opt_bytecap_(size)) + +#define _Deref_prepost_cap_x_(size) _SAL1_1_Source_(_Deref_prepost_cap_x_, (size), _Deref_pre_cap_x_(size) _Deref_post_cap_x_(size)) +#define _Deref_prepost_opt_cap_x_(size) _SAL1_1_Source_(_Deref_prepost_opt_cap_x_, (size), _Deref_pre_opt_cap_x_(size) _Deref_post_opt_cap_x_(size)) +#define _Deref_prepost_bytecap_x_(size) _SAL1_1_Source_(_Deref_prepost_bytecap_x_, (size), _Deref_pre_bytecap_x_(size) _Deref_post_bytecap_x_(size)) +#define _Deref_prepost_opt_bytecap_x_(size) _SAL1_1_Source_(_Deref_prepost_opt_bytecap_x_, (size), _Deref_pre_opt_bytecap_x_(size) _Deref_post_opt_bytecap_x_(size)) + +#define _Deref_prepost_z_cap_(size) _SAL1_1_Source_(_Deref_prepost_z_cap_, (size), _Deref_pre_z_cap_(size) _Deref_post_z_cap_(size)) +#define _Deref_prepost_opt_z_cap_(size) _SAL1_1_Source_(_Deref_prepost_opt_z_cap_, (size), _Deref_pre_opt_z_cap_(size) _Deref_post_opt_z_cap_(size)) +#define _Deref_prepost_z_bytecap_(size) _SAL1_1_Source_(_Deref_prepost_z_bytecap_, (size), _Deref_pre_z_bytecap_(size) _Deref_post_z_bytecap_(size)) +#define _Deref_prepost_opt_z_bytecap_(size) _SAL1_1_Source_(_Deref_prepost_opt_z_bytecap_, (size), _Deref_pre_opt_z_bytecap_(size) _Deref_post_opt_z_bytecap_(size)) + +#define _Deref_prepost_valid_cap_(size) _SAL1_1_Source_(_Deref_prepost_valid_cap_, (size), _Deref_pre_valid_cap_(size) _Deref_post_valid_cap_(size)) +#define _Deref_prepost_opt_valid_cap_(size) _SAL1_1_Source_(_Deref_prepost_opt_valid_cap_, (size), _Deref_pre_opt_valid_cap_(size) _Deref_post_opt_valid_cap_(size)) +#define _Deref_prepost_valid_bytecap_(size) _SAL1_1_Source_(_Deref_prepost_valid_bytecap_, (size), _Deref_pre_valid_bytecap_(size) _Deref_post_valid_bytecap_(size)) +#define _Deref_prepost_opt_valid_bytecap_(size) _SAL1_1_Source_(_Deref_prepost_opt_valid_bytecap_, (size), _Deref_pre_opt_valid_bytecap_(size) _Deref_post_opt_valid_bytecap_(size)) + +#define _Deref_prepost_valid_cap_x_(size) _SAL1_1_Source_(_Deref_prepost_valid_cap_x_, (size), _Deref_pre_valid_cap_x_(size) _Deref_post_valid_cap_x_(size)) +#define _Deref_prepost_opt_valid_cap_x_(size) _SAL1_1_Source_(_Deref_prepost_opt_valid_cap_x_, (size), _Deref_pre_opt_valid_cap_x_(size) _Deref_post_opt_valid_cap_x_(size)) +#define _Deref_prepost_valid_bytecap_x_(size) _SAL1_1_Source_(_Deref_prepost_valid_bytecap_x_, (size), _Deref_pre_valid_bytecap_x_(size) 
_Deref_post_valid_bytecap_x_(size)) +#define _Deref_prepost_opt_valid_bytecap_x_(size) _SAL1_1_Source_(_Deref_prepost_opt_valid_bytecap_x_, (size), _Deref_pre_opt_valid_bytecap_x_(size) _Deref_post_opt_valid_bytecap_x_(size)) + +#define _Deref_prepost_count_(size) _SAL1_1_Source_(_Deref_prepost_count_, (size), _Deref_pre_count_(size) _Deref_post_count_(size)) +#define _Deref_prepost_opt_count_(size) _SAL1_1_Source_(_Deref_prepost_opt_count_, (size), _Deref_pre_opt_count_(size) _Deref_post_opt_count_(size)) +#define _Deref_prepost_bytecount_(size) _SAL1_1_Source_(_Deref_prepost_bytecount_, (size), _Deref_pre_bytecount_(size) _Deref_post_bytecount_(size)) +#define _Deref_prepost_opt_bytecount_(size) _SAL1_1_Source_(_Deref_prepost_opt_bytecount_, (size), _Deref_pre_opt_bytecount_(size) _Deref_post_opt_bytecount_(size)) + +#define _Deref_prepost_count_x_(size) _SAL1_1_Source_(_Deref_prepost_count_x_, (size), _Deref_pre_count_x_(size) _Deref_post_count_x_(size)) +#define _Deref_prepost_opt_count_x_(size) _SAL1_1_Source_(_Deref_prepost_opt_count_x_, (size), _Deref_pre_opt_count_x_(size) _Deref_post_opt_count_x_(size)) +#define _Deref_prepost_bytecount_x_(size) _SAL1_1_Source_(_Deref_prepost_bytecount_x_, (size), _Deref_pre_bytecount_x_(size) _Deref_post_bytecount_x_(size)) +#define _Deref_prepost_opt_bytecount_x_(size) _SAL1_1_Source_(_Deref_prepost_opt_bytecount_x_, (size), _Deref_pre_opt_bytecount_x_(size) _Deref_post_opt_bytecount_x_(size)) + +#define _Deref_prepost_valid_ _SAL1_1_Source_(_Deref_prepost_valid_, (), _Deref_pre_valid_ _Deref_post_valid_) +#define _Deref_prepost_opt_valid_ _SAL1_1_Source_(_Deref_prepost_opt_valid_, (), _Deref_pre_opt_valid_ _Deref_post_opt_valid_) + +// +// _Deref_ +// +// used with references to arrays + +#define _Deref_out_z_cap_c_(size) _SAL1_1_Source_(_Deref_out_z_cap_c_, (size), _Deref_pre_cap_c_(size) _Deref_post_z_) +#define _Deref_inout_z_cap_c_(size) _SAL1_1_Source_(_Deref_inout_z_cap_c_, (size), _Deref_pre_z_cap_c_(size) _Deref_post_z_) +#define _Deref_out_z_bytecap_c_(size) _SAL1_1_Source_(_Deref_out_z_bytecap_c_, (size), _Deref_pre_bytecap_c_(size) _Deref_post_z_) +#define _Deref_inout_z_bytecap_c_(size) _SAL1_1_Source_(_Deref_inout_z_bytecap_c_, (size), _Deref_pre_z_bytecap_c_(size) _Deref_post_z_) +#define _Deref_inout_z_ _SAL1_1_Source_(_Deref_inout_z_, (), _Deref_prepost_z_) + +// #pragma endregion Input Buffer SAL 1 compatibility macros + + +//============================================================================ +// Implementation Layer: +//============================================================================ + + +// Naming conventions: +// A symbol the begins with _SA_ is for the machinery of creating any +// annotations; many of those come from sourceannotations.h in the case +// of attributes. + +// A symbol that ends with _impl is the very lowest level macro. It is +// not required to be a legal standalone annotation, and in the case +// of attribute annotations, usually is not. (In the case of some declspec +// annotations, it might be, but it should not be assumed so.) Those +// symols will be used in the _PreN..., _PostN... and _RetN... annotations +// to build up more complete annotations. + +// A symbol ending in _impl_ is reserved to the implementation as well, +// but it does form a complete annotation; usually they are used to build +// up even higher level annotations. 
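+
+// Illustrative sketch (editor's example, not part of the upstream sal.h header;
+// a rough composition only): under _USE_ATTRIBUTES_FOR_SAL with _MSC_VER >= 1610,
+// a public annotation such as
+//
+//     void FillBuffer( _Pre_cap_(cch) wchar_t* pBuf, size_t cch );
+//
+// layers together, omitting the _SAL1_1_Source_ bookkeeping wrapper, roughly as
+//
+//     _Pre_cap_(cch)
+//       -> _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_impl(cch))
+//       -> [SA_Pre(Null=SA_No,Notref=1)] [SA_Pre(WritableElements="\n" "cch")]
+//
+// i.e. the *_impl_ machinery defined in this layer supplies the attribute
+// plumbing, while the public macro names the caller-visible contract.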
+ + +#if _USE_ATTRIBUTES_FOR_SAL || _USE_DECLSPECS_FOR_SAL // [ +// Sharable "_impl" macros: these can be shared between the various annotation +// forms but are part of the implementation of the macros. These are collected +// here to assure that only necessary differences in the annotations +// exist. + +#define _Always_impl_(annos) _Group_(annos _SAL_nop_impl_) _On_failure_impl_(annos _SAL_nop_impl_) +#define _Bound_impl_ _SA_annotes0(SAL_bound) +#define _Field_range_impl_(min,max) _Range_impl_(min,max) +#define _Literal_impl_ _SA_annotes1(SAL_constant, __yes) +#define _Maybenull_impl_ _SA_annotes1(SAL_null, __maybe) +#define _Maybevalid_impl_ _SA_annotes1(SAL_valid, __maybe) +#define _Must_inspect_impl_ _Post_impl_ _SA_annotes0(SAL_mustInspect) +#define _Notliteral_impl_ _SA_annotes1(SAL_constant, __no) +#define _Notnull_impl_ _SA_annotes1(SAL_null, __no) +#define _Notvalid_impl_ _SA_annotes1(SAL_valid, __no) +#define _NullNull_terminated_impl_ _Group_(_SA_annotes1(SAL_nullTerminated, __yes) _SA_annotes1(SAL_readableTo,inexpressibleCount("NullNull terminated string"))) +#define _Null_impl_ _SA_annotes1(SAL_null, __yes) +#define _Null_terminated_impl_ _SA_annotes1(SAL_nullTerminated, __yes) +#define _Out_impl_ _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_c_one_notref_impl) _Post_valid_impl_ +#define _Out_opt_impl_ _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_c_one_notref_impl) _Post_valid_impl_ +#define _Points_to_data_impl_ _At_(*_Curr_, _SA_annotes1(SAL_mayBePointer, __no)) +#define _Post_satisfies_impl_(cond) _Post_impl_ _Satisfies_impl_(cond) +#define _Post_valid_impl_ _Post1_impl_(__valid_impl) +#define _Pre_satisfies_impl_(cond) _Pre_impl_ _Satisfies_impl_(cond) +#define _Pre_valid_impl_ _Pre1_impl_(__valid_impl) +#define _Range_impl_(min,max) _SA_annotes2(SAL_range, min, max) +#define _Readable_bytes_impl_(size) _SA_annotes1(SAL_readableTo, byteCount(size)) +#define _Readable_elements_impl_(size) _SA_annotes1(SAL_readableTo, elementCount(size)) +#define _Ret_valid_impl_ _Ret1_impl_(__valid_impl) +#define _Satisfies_impl_(cond) _SA_annotes1(SAL_satisfies, cond) +#define _Valid_impl_ _SA_annotes1(SAL_valid, __yes) +#define _Writable_bytes_impl_(size) _SA_annotes1(SAL_writableTo, byteCount(size)) +#define _Writable_elements_impl_(size) _SA_annotes1(SAL_writableTo, elementCount(size)) + +#define _In_range_impl_(min,max) _Pre_impl_ _Range_impl_(min,max) +#define _Out_range_impl_(min,max) _Post_impl_ _Range_impl_(min,max) +#define _Ret_range_impl_(min,max) _Post_impl_ _Range_impl_(min,max) +#define _Deref_in_range_impl_(min,max) _Deref_pre_impl_ _Range_impl_(min,max) +#define _Deref_out_range_impl_(min,max) _Deref_post_impl_ _Range_impl_(min,max) +#define _Deref_ret_range_impl_(min,max) _Deref_post_impl_ _Range_impl_(min,max) + +#define _Deref_pre_impl_ _Pre_impl_ _Notref_impl_ _Deref_impl_ +#define _Deref_post_impl_ _Post_impl_ _Notref_impl_ _Deref_impl_ + +// The following are for the implementation machinery, and are not +// suitable for annotating general code. +// We're tying to phase this out, someday. The parser quotes the param. +#define __AuToQuOtE _SA_annotes0(SAL_AuToQuOtE) + +// Normally the parser does some simple type checking of annotation params, +// defer that check to the plugin. 
+#define __deferTypecheck _SA_annotes0(SAL_deferTypecheck) + +#define _SA_SPECSTRIZE( x ) #x +#define _SAL_nop_impl_ /* nothing */ +#define __nop_impl(x) x +#endif + + +#if _USE_ATTRIBUTES_FOR_SAL // [ + +// Using attributes for sal + +#include "codeanalysis\sourceannotations.h" + + +#define _SA_annotes0(n) [SAL_annotes(Name=#n)] +#define _SA_annotes1(n,pp1) [SAL_annotes(Name=#n, p1=_SA_SPECSTRIZE(pp1))] +#define _SA_annotes2(n,pp1,pp2) [SAL_annotes(Name=#n, p1=_SA_SPECSTRIZE(pp1), p2=_SA_SPECSTRIZE(pp2))] +#define _SA_annotes3(n,pp1,pp2,pp3) [SAL_annotes(Name=#n, p1=_SA_SPECSTRIZE(pp1), p2=_SA_SPECSTRIZE(pp2), p3=_SA_SPECSTRIZE(pp3))] + +#define _Pre_impl_ [SAL_pre] +#define _Post_impl_ [SAL_post] +#define _Deref_impl_ [SAL_deref] +#define _Notref_impl_ [SAL_notref] + + +// Declare a function to be an annotation or primop (respectively). +// Done this way so that they don't appear in the regular compiler's +// namespace. +#define __ANNOTATION(fun) _SA_annotes0(SAL_annotation) void __SA_##fun; +#define __PRIMOP(type, fun) _SA_annotes0(SAL_primop) type __SA_##fun; +#define __QUALIFIER(fun) _SA_annotes0(SAL_qualifier) void __SA_##fun; + +// Benign declspec needed here for WindowsPREfast +#define __In_impl_ [SA_Pre(Valid=SA_Yes)] [SA_Pre(Deref=1, Notref=1, Access=SA_Read)] __declspec("SAL_pre SAL_valid") + +#elif _USE_DECLSPECS_FOR_SAL // ][ + +// Using declspecs for sal + +#define _SA_annotes0(n) __declspec(#n) +#define _SA_annotes1(n,pp1) __declspec(#n "(" _SA_SPECSTRIZE(pp1) ")" ) +#define _SA_annotes2(n,pp1,pp2) __declspec(#n "(" _SA_SPECSTRIZE(pp1) "," _SA_SPECSTRIZE(pp2) ")") +#define _SA_annotes3(n,pp1,pp2,pp3) __declspec(#n "(" _SA_SPECSTRIZE(pp1) "," _SA_SPECSTRIZE(pp2) "," _SA_SPECSTRIZE(pp3) ")") + +#define _Pre_impl_ _SA_annotes0(SAL_pre) +#define _Post_impl_ _SA_annotes0(SAL_post) +#define _Deref_impl_ _SA_annotes0(SAL_deref) +#define _Notref_impl_ _SA_annotes0(SAL_notref) + +// Declare a function to be an annotation or primop (respectively). +// Done this way so that they don't appear in the regular compiler's +// namespace. +#define __ANNOTATION(fun) _SA_annotes0(SAL_annotation) void __SA_##fun + +#define __PRIMOP(type, fun) _SA_annotes0(SAL_primop) type __SA_##fun + +#define __QUALIFIER(fun) _SA_annotes0(SAL_qualifier) void __SA_##fun; + +#define __In_impl_ _Pre_impl_ _SA_annotes0(SAL_valid) _Pre_impl_ _Deref_impl_ _Notref_impl_ _SA_annotes0(SAL_readonly) + +#else // ][ + +// Using "nothing" for sal + +#define _SA_annotes0(n) +#define _SA_annotes1(n,pp1) +#define _SA_annotes2(n,pp1,pp2) +#define _SA_annotes3(n,pp1,pp2,pp3) + +#define __ANNOTATION(fun) +#define __PRIMOP(type, fun) +#define __QUALIFIER(type, fun) + +#endif // ] + +#if _USE_ATTRIBUTES_FOR_SAL || _USE_DECLSPECS_FOR_SAL // [ + +// Declare annotations that need to be declared. +__ANNOTATION(SAL_useHeader(void)); +__ANNOTATION(SAL_bound(void)); +__ANNOTATION(SAL_allocator(void)); //??? 
resolve with PFD +__ANNOTATION(SAL_file_parser(__AuToQuOtE __In_impl_ char *, __In_impl_ char *)); +__ANNOTATION(SAL_source_code_content(__In_impl_ char *)); +__ANNOTATION(SAL_analysisHint(__AuToQuOtE __In_impl_ char *)); +__ANNOTATION(SAL_untrusted_data_source(__AuToQuOtE __In_impl_ char *)); +__ANNOTATION(SAL_untrusted_data_source_this(__AuToQuOtE __In_impl_ char *)); +__ANNOTATION(SAL_validated(__AuToQuOtE __In_impl_ char *)); +__ANNOTATION(SAL_validated_this(__AuToQuOtE __In_impl_ char *)); +__ANNOTATION(SAL_encoded(void)); +__ANNOTATION(SAL_adt(__AuToQuOtE __In_impl_ char *, __AuToQuOtE __In_impl_ char *)); +__ANNOTATION(SAL_add_adt_property(__AuToQuOtE __In_impl_ char *, __AuToQuOtE __In_impl_ char *)); +__ANNOTATION(SAL_remove_adt_property(__AuToQuOtE __In_impl_ char *, __AuToQuOtE __In_impl_ char *)); +__ANNOTATION(SAL_transfer_adt_property_from(__AuToQuOtE __In_impl_ char *)); +__ANNOTATION(SAL_post_type(__AuToQuOtE __In_impl_ char *)); +__ANNOTATION(SAL_volatile(void)); +__ANNOTATION(SAL_nonvolatile(void)); +__ANNOTATION(SAL_entrypoint(__AuToQuOtE __In_impl_ char *, __AuToQuOtE __In_impl_ char *)); +__ANNOTATION(SAL_blocksOn(__In_impl_ void*)); +__ANNOTATION(SAL_mustInspect(void)); + +// Only appears in model files, but needs to be declared. +__ANNOTATION(SAL_TypeName(__AuToQuOtE __In_impl_ char *)); + +// To be declared well-known soon. +__ANNOTATION(SAL_interlocked(void);) + +#pragma warning (suppress: 28227 28241) +__ANNOTATION(SAL_name(__In_impl_ char *, __In_impl_ char *, __In_impl_ char *);) + +__PRIMOP(char *, _Macro_value_(__In_impl_ char *)); +__PRIMOP(int, _Macro_defined_(__In_impl_ char *)); +__PRIMOP(char *, _Strstr_(__In_impl_ char *, __In_impl_ char *)); + +#endif // ] + +#if _USE_ATTRIBUTES_FOR_SAL // [ + +#define _Check_return_impl_ [SA_Post(MustCheck=SA_Yes)] + +#define _Success_impl_(expr) [SA_Success(Condition=#expr)] +#define _On_failure_impl_(annos) [SAL_context(p1="SAL_failed")] _Group_(_Post_impl_ _Group_(annos _SAL_nop_impl_)) + +#define _Printf_format_string_impl_ [SA_FormatString(Style="printf")] +#define _Scanf_format_string_impl_ [SA_FormatString(Style="scanf")] +#define _Scanf_s_format_string_impl_ [SA_FormatString(Style="scanf_s")] + +#define _In_bound_impl_ [SA_PreBound(Deref=0)] +#define _Out_bound_impl_ [SA_PostBound(Deref=0)] +#define _Ret_bound_impl_ [SA_PostBound(Deref=0)] +#define _Deref_in_bound_impl_ [SA_PreBound(Deref=1)] +#define _Deref_out_bound_impl_ [SA_PostBound(Deref=1)] +#define _Deref_ret_bound_impl_ [SA_PostBound(Deref=1)] + +#define __valid_impl Valid=SA_Yes +#define __maybevalid_impl Valid=SA_Maybe +#define __notvalid_impl Valid=SA_No + +#define __null_impl Null=SA_Yes +#define __maybenull_impl Null=SA_Maybe +#define __notnull_impl Null=SA_No + +#define __null_impl_notref Null=SA_Yes,Notref=1 +#define __maybenull_impl_notref Null=SA_Maybe,Notref=1 +#define __notnull_impl_notref Null=SA_No,Notref=1 + +#define __zterm_impl NullTerminated=SA_Yes +#define __maybezterm_impl NullTerminated=SA_Maybe +#define __maybzterm_impl NullTerminated=SA_Maybe +#define __notzterm_impl NullTerminated=SA_No + +#define __readaccess_impl Access=SA_Read +#define __writeaccess_impl Access=SA_Write +#define __allaccess_impl Access=SA_ReadWrite + +#define __readaccess_impl_notref Access=SA_Read,Notref=1 +#define __writeaccess_impl_notref Access=SA_Write,Notref=1 +#define __allaccess_impl_notref Access=SA_ReadWrite,Notref=1 + +#if _MSC_VER >= 1610 /*IFSTRIP=IGN*/ // [ + +// For SAL2, we need to expect general expressions. 
+ +#define __cap_impl(size) WritableElements="\n"#size +#define __bytecap_impl(size) WritableBytes="\n"#size +#define __bytecount_impl(size) ValidBytes="\n"#size +#define __count_impl(size) ValidElements="\n"#size + +#else // ][ + +#define __cap_impl(size) WritableElements=#size +#define __bytecap_impl(size) WritableBytes=#size +#define __bytecount_impl(size) ValidBytes=#size +#define __count_impl(size) ValidElements=#size + +#endif // ] + +#define __cap_c_impl(size) WritableElementsConst=size +#define __cap_c_one_notref_impl WritableElementsConst=1,Notref=1 +#define __cap_for_impl(param) WritableElementsLength=#param +#define __cap_x_impl(size) WritableElements="\n@"#size + +#define __bytecap_c_impl(size) WritableBytesConst=size +#define __bytecap_x_impl(size) WritableBytes="\n@"#size + +#define __mult_impl(mult,size) __cap_impl((mult)*(size)) + +#define __count_c_impl(size) ValidElementsConst=size +#define __count_x_impl(size) ValidElements="\n@"#size + +#define __bytecount_c_impl(size) ValidBytesConst=size +#define __bytecount_x_impl(size) ValidBytes="\n@"#size + + +#define _At_impl_(target, annos) [SAL_at(p1=#target)] _Group_(annos) +#define _At_buffer_impl_(target, iter, bound, annos) [SAL_at_buffer(p1=#target, p2=#iter, p3=#bound)] _Group_(annos) +#define _When_impl_(expr, annos) [SAL_when(p1=#expr)] _Group_(annos) + +#define _Group_impl_(annos) [SAL_begin] annos [SAL_end] +#define _GrouP_impl_(annos) [SAL_BEGIN] annos [SAL_END] + +#define _Use_decl_anno_impl_ _SA_annotes0(SAL_useHeader) // this is a special case! + +#define _Pre1_impl_(p1) [SA_Pre(p1)] +#define _Pre2_impl_(p1,p2) [SA_Pre(p1,p2)] +#define _Pre3_impl_(p1,p2,p3) [SA_Pre(p1,p2,p3)] + +#define _Post1_impl_(p1) [SA_Post(p1)] +#define _Post2_impl_(p1,p2) [SA_Post(p1,p2)] +#define _Post3_impl_(p1,p2,p3) [SA_Post(p1,p2,p3)] + +#define _Ret1_impl_(p1) [SA_Post(p1)] +#define _Ret2_impl_(p1,p2) [SA_Post(p1,p2)] +#define _Ret3_impl_(p1,p2,p3) [SA_Post(p1,p2,p3)] + +#define _Deref_pre1_impl_(p1) [SA_Pre(Deref=1,p1)] +#define _Deref_pre2_impl_(p1,p2) [SA_Pre(Deref=1,p1,p2)] +#define _Deref_pre3_impl_(p1,p2,p3) [SA_Pre(Deref=1,p1,p2,p3)] + + +#define _Deref_post1_impl_(p1) [SA_Post(Deref=1,p1)] +#define _Deref_post2_impl_(p1,p2) [SA_Post(Deref=1,p1,p2)] +#define _Deref_post3_impl_(p1,p2,p3) [SA_Post(Deref=1,p1,p2,p3)] + +#define _Deref_ret1_impl_(p1) [SA_Post(Deref=1,p1)] +#define _Deref_ret2_impl_(p1,p2) [SA_Post(Deref=1,p1,p2)] +#define _Deref_ret3_impl_(p1,p2,p3) [SA_Post(Deref=1,p1,p2,p3)] + +#define _Deref2_pre1_impl_(p1) [SA_Pre(Deref=2,Notref=1,p1)] +#define _Deref2_post1_impl_(p1) [SA_Post(Deref=2,Notref=1,p1)] +#define _Deref2_ret1_impl_(p1) [SA_Post(Deref=2,Notref=1,p1)] + +// Obsolete -- may be needed for transition to attributes. 
+#define __inner_typefix(ctype) [SAL_typefix(p1=_SA_SPECSTRIZE(ctype))] +#define __inner_exceptthat [SAL_except] + + +#elif _USE_DECLSPECS_FOR_SAL // ][ + +#define _Check_return_impl_ __post _SA_annotes0(SAL_checkReturn) + +#define _Success_impl_(expr) _SA_annotes1(SAL_success, expr) +#define _On_failure_impl_(annos) _SA_annotes1(SAL_context, SAL_failed) _Group_(_Post_impl_ _Group_(_SAL_nop_impl_ annos)) + +#define _Printf_format_string_impl_ _SA_annotes1(SAL_IsFormatString, "printf") +#define _Scanf_format_string_impl_ _SA_annotes1(SAL_IsFormatString, "scanf") +#define _Scanf_s_format_string_impl_ _SA_annotes1(SAL_IsFormatString, "scanf_s") + +#define _In_bound_impl_ _Pre_impl_ _Bound_impl_ +#define _Out_bound_impl_ _Post_impl_ _Bound_impl_ +#define _Ret_bound_impl_ _Post_impl_ _Bound_impl_ +#define _Deref_in_bound_impl_ _Deref_pre_impl_ _Bound_impl_ +#define _Deref_out_bound_impl_ _Deref_post_impl_ _Bound_impl_ +#define _Deref_ret_bound_impl_ _Deref_post_impl_ _Bound_impl_ + + +#define __null_impl _SA_annotes0(SAL_null) // _SA_annotes1(SAL_null, __yes) +#define __notnull_impl _SA_annotes0(SAL_notnull) // _SA_annotes1(SAL_null, __no) +#define __maybenull_impl _SA_annotes0(SAL_maybenull) // _SA_annotes1(SAL_null, __maybe) + +#define __valid_impl _SA_annotes0(SAL_valid) // _SA_annotes1(SAL_valid, __yes) +#define __notvalid_impl _SA_annotes0(SAL_notvalid) // _SA_annotes1(SAL_valid, __no) +#define __maybevalid_impl _SA_annotes0(SAL_maybevalid) // _SA_annotes1(SAL_valid, __maybe) + +#define __null_impl_notref _Notref_ _Null_impl_ +#define __maybenull_impl_notref _Notref_ _Maybenull_impl_ +#define __notnull_impl_notref _Notref_ _Notnull_impl_ + +#define __zterm_impl _SA_annotes1(SAL_nullTerminated, __yes) +#define __maybezterm_impl _SA_annotes1(SAL_nullTerminated, __maybe) +#define __maybzterm_impl _SA_annotes1(SAL_nullTerminated, __maybe) +#define __notzterm_impl _SA_annotes1(SAL_nullTerminated, __no) + +#define __readaccess_impl _SA_annotes1(SAL_access, 0x1) +#define __writeaccess_impl _SA_annotes1(SAL_access, 0x2) +#define __allaccess_impl _SA_annotes1(SAL_access, 0x3) + +#define __readaccess_impl_notref _Notref_ _SA_annotes1(SAL_access, 0x1) +#define __writeaccess_impl_notref _Notref_ _SA_annotes1(SAL_access, 0x2) +#define __allaccess_impl_notref _Notref_ _SA_annotes1(SAL_access, 0x3) + +#define __cap_impl(size) _SA_annotes1(SAL_writableTo,elementCount(size)) +#define __cap_c_impl(size) _SA_annotes1(SAL_writableTo,elementCount(size)) +#define __cap_c_one_notref_impl _Notref_ _SA_annotes1(SAL_writableTo,elementCount(1)) +#define __cap_for_impl(param) _SA_annotes1(SAL_writableTo,inexpressibleCount(sizeof(param))) +#define __cap_x_impl(size) _SA_annotes1(SAL_writableTo,inexpressibleCount(#size)) + +#define __bytecap_impl(size) _SA_annotes1(SAL_writableTo,byteCount(size)) +#define __bytecap_c_impl(size) _SA_annotes1(SAL_writableTo,byteCount(size)) +#define __bytecap_x_impl(size) _SA_annotes1(SAL_writableTo,inexpressibleCount(#size)) + +#define __mult_impl(mult,size) _SA_annotes1(SAL_writableTo,(mult)*(size)) + +#define __count_impl(size) _SA_annotes1(SAL_readableTo,elementCount(size)) +#define __count_c_impl(size) _SA_annotes1(SAL_readableTo,elementCount(size)) +#define __count_x_impl(size) _SA_annotes1(SAL_readableTo,inexpressibleCount(#size)) + +#define __bytecount_impl(size) _SA_annotes1(SAL_readableTo,byteCount(size)) +#define __bytecount_c_impl(size) _SA_annotes1(SAL_readableTo,byteCount(size)) +#define __bytecount_x_impl(size) _SA_annotes1(SAL_readableTo,inexpressibleCount(#size)) + 
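+// Illustrative example (editor's sketch, not from the original header): in the
+// declspec form, an extent macro such as
+//
+//     __count_impl(cch)
+//
+// expands through _SA_annotes1 into adjacent string literals,
+//
+//     __declspec("SAL_readableTo" "(" "elementCount(cch)" ")")
+//
+// which the compiler concatenates to __declspec("SAL_readableTo(elementCount(cch))"),
+// so the whole annotation travels as a single declspec string.
+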
+#define _At_impl_(target, annos) _SA_annotes0(SAL_at(target)) _Group_(annos) +#define _At_buffer_impl_(target, iter, bound, annos) _SA_annotes3(SAL_at_buffer, target, iter, bound) _Group_(annos) +#define _Group_impl_(annos) _SA_annotes0(SAL_begin) annos _SA_annotes0(SAL_end) +#define _GrouP_impl_(annos) _SA_annotes0(SAL_BEGIN) annos _SA_annotes0(SAL_END) +#define _When_impl_(expr, annos) _SA_annotes0(SAL_when(expr)) _Group_(annos) + +#define _Use_decl_anno_impl_ __declspec("SAL_useHeader()") // this is a special case! + +#define _Pre1_impl_(p1) _Pre_impl_ p1 +#define _Pre2_impl_(p1,p2) _Pre_impl_ p1 _Pre_impl_ p2 +#define _Pre3_impl_(p1,p2,p3) _Pre_impl_ p1 _Pre_impl_ p2 _Pre_impl_ p3 + +#define _Post1_impl_(p1) _Post_impl_ p1 +#define _Post2_impl_(p1,p2) _Post_impl_ p1 _Post_impl_ p2 +#define _Post3_impl_(p1,p2,p3) _Post_impl_ p1 _Post_impl_ p2 _Post_impl_ p3 + +#define _Ret1_impl_(p1) _Post_impl_ p1 +#define _Ret2_impl_(p1,p2) _Post_impl_ p1 _Post_impl_ p2 +#define _Ret3_impl_(p1,p2,p3) _Post_impl_ p1 _Post_impl_ p2 _Post_impl_ p3 + +#define _Deref_pre1_impl_(p1) _Deref_pre_impl_ p1 +#define _Deref_pre2_impl_(p1,p2) _Deref_pre_impl_ p1 _Deref_pre_impl_ p2 +#define _Deref_pre3_impl_(p1,p2,p3) _Deref_pre_impl_ p1 _Deref_pre_impl_ p2 _Deref_pre_impl_ p3 + +#define _Deref_post1_impl_(p1) _Deref_post_impl_ p1 +#define _Deref_post2_impl_(p1,p2) _Deref_post_impl_ p1 _Deref_post_impl_ p2 +#define _Deref_post3_impl_(p1,p2,p3) _Deref_post_impl_ p1 _Deref_post_impl_ p2 _Deref_post_impl_ p3 + +#define _Deref_ret1_impl_(p1) _Deref_post_impl_ p1 +#define _Deref_ret2_impl_(p1,p2) _Deref_post_impl_ p1 _Deref_post_impl_ p2 +#define _Deref_ret3_impl_(p1,p2,p3) _Deref_post_impl_ p1 _Deref_post_impl_ p2 _Deref_post_impl_ p3 + +#define _Deref2_pre1_impl_(p1) _Deref_pre_impl_ _Notref_impl_ _Deref_impl_ p1 +#define _Deref2_post1_impl_(p1) _Deref_post_impl_ _Notref_impl_ _Deref_impl_ p1 +#define _Deref2_ret1_impl_(p1) _Deref_post_impl_ _Notref_impl_ _Deref_impl_ p1 + +#define __inner_typefix(ctype) _SA_annotes1(SAL_typefix, ctype) +#define __inner_exceptthat _SA_annotes0(SAL_except) + +#elif defined(_MSC_EXTENSIONS) && !defined( MIDL_PASS ) && !defined(__midl) && !defined(RC_INVOKED) && defined(_PFT_VER) && _MSC_VER >= 1400 /*IFSTRIP=IGN*/ // ][ + +// minimum attribute expansion for foreground build + +#pragma push_macro( "SA" ) +#pragma push_macro( "REPEATABLE" ) + +#ifdef __cplusplus // [ +#define SA( id ) id +#define REPEATABLE [repeatable] +#else // !__cplusplus // ][ +#define SA( id ) SA_##id +#define REPEATABLE +#endif // !__cplusplus // ] + +REPEATABLE +[source_annotation_attribute( SA( Parameter ) )] +struct __P_impl +{ +#ifdef __cplusplus // [ + __P_impl(); +#endif // ] + int __d_; +}; +typedef struct __P_impl __P_impl; + +REPEATABLE +[source_annotation_attribute( SA( ReturnValue ) )] +struct __R_impl +{ +#ifdef __cplusplus // [ + __R_impl(); +#endif // ] + int __d_; +}; +typedef struct __R_impl __R_impl; + +[source_annotation_attribute( SA( Method ) )] +struct __M_ +{ +#ifdef __cplusplus // [ + __M_(); +#endif // ] + int __d_; +}; +typedef struct __M_ __M_; + +[source_annotation_attribute( SA( All ) )] +struct __A_ +{ +#ifdef __cplusplus // [ + __A_(); +#endif // ] + int __d_; +}; +typedef struct __A_ __A_; + +[source_annotation_attribute( SA( Field ) )] +struct __F_ +{ +#ifdef __cplusplus // [ + __F_(); +#endif // ] + int __d_; +}; +typedef struct __F_ __F_; + +#pragma pop_macro( "REPEATABLE" ) +#pragma pop_macro( "SA" ) + + +#define _SAL_nop_impl_ + +#define _At_impl_(target, annos) [__A_(__d_=0)] 
+#define _At_buffer_impl_(target, iter, bound, annos) [__A_(__d_=0)] +#define _When_impl_(expr, annos) annos +#define _Group_impl_(annos) annos +#define _GrouP_impl_(annos) annos +#define _Use_decl_anno_impl_ [__M_(__d_=0)] + +#define _Points_to_data_impl_ [__P_impl(__d_=0)] +#define _Literal_impl_ [__P_impl(__d_=0)] +#define _Notliteral_impl_ [__P_impl(__d_=0)] + +#define _Pre_valid_impl_ [__P_impl(__d_=0)] +#define _Post_valid_impl_ [__P_impl(__d_=0)] +#define _Ret_valid_impl_ [__R_impl(__d_=0)] + +#define _Check_return_impl_ [__R_impl(__d_=0)] +#define _Must_inspect_impl_ [__R_impl(__d_=0)] + +#define _Success_impl_(expr) [__M_(__d_=0)] +#define _On_failure_impl_(expr) [__M_(__d_=0)] +#define _Always_impl_(expr) [__M_(__d_=0)] + +#define _Printf_format_string_impl_ [__P_impl(__d_=0)] +#define _Scanf_format_string_impl_ [__P_impl(__d_=0)] +#define _Scanf_s_format_string_impl_ [__P_impl(__d_=0)] + +#define _Raises_SEH_exception_impl_ [__M_(__d_=0)] +#define _Maybe_raises_SEH_exception_impl_ [__M_(__d_=0)] + +#define _In_bound_impl_ [__P_impl(__d_=0)] +#define _Out_bound_impl_ [__P_impl(__d_=0)] +#define _Ret_bound_impl_ [__R_impl(__d_=0)] +#define _Deref_in_bound_impl_ [__P_impl(__d_=0)] +#define _Deref_out_bound_impl_ [__P_impl(__d_=0)] +#define _Deref_ret_bound_impl_ [__R_impl(__d_=0)] + +#define _Range_impl_(min,max) [__P_impl(__d_=0)] +#define _In_range_impl_(min,max) [__P_impl(__d_=0)] +#define _Out_range_impl_(min,max) [__P_impl(__d_=0)] +#define _Ret_range_impl_(min,max) [__R_impl(__d_=0)] +#define _Deref_in_range_impl_(min,max) [__P_impl(__d_=0)] +#define _Deref_out_range_impl_(min,max) [__P_impl(__d_=0)] +#define _Deref_ret_range_impl_(min,max) [__R_impl(__d_=0)] + +#define _Field_range_impl_(min,max) [__F_(__d_=0)] + +#define _Pre_satisfies_impl_(cond) [__A_(__d_=0)] +#define _Post_satisfies_impl_(cond) [__A_(__d_=0)] +#define _Satisfies_impl_(cond) [__A_(__d_=0)] + +#define _Null_impl_ [__A_(__d_=0)] +#define _Notnull_impl_ [__A_(__d_=0)] +#define _Maybenull_impl_ [__A_(__d_=0)] + +#define _Valid_impl_ [__A_(__d_=0)] +#define _Notvalid_impl_ [__A_(__d_=0)] +#define _Maybevalid_impl_ [__A_(__d_=0)] + +#define _Readable_bytes_impl_(size) [__A_(__d_=0)] +#define _Readable_elements_impl_(size) [__A_(__d_=0)] +#define _Writable_bytes_impl_(size) [__A_(__d_=0)] +#define _Writable_elements_impl_(size) [__A_(__d_=0)] + +#define _Null_terminated_impl_ [__A_(__d_=0)] +#define _NullNull_terminated_impl_ [__A_(__d_=0)] + +#define _Pre_impl_ [__P_impl(__d_=0)] +#define _Pre1_impl_(p1) [__P_impl(__d_=0)] +#define _Pre2_impl_(p1,p2) [__P_impl(__d_=0)] +#define _Pre3_impl_(p1,p2,p3) [__P_impl(__d_=0)] + +#define _Post_impl_ [__P_impl(__d_=0)] +#define _Post1_impl_(p1) [__P_impl(__d_=0)] +#define _Post2_impl_(p1,p2) [__P_impl(__d_=0)] +#define _Post3_impl_(p1,p2,p3) [__P_impl(__d_=0)] + +#define _Ret1_impl_(p1) [__R_impl(__d_=0)] +#define _Ret2_impl_(p1,p2) [__R_impl(__d_=0)] +#define _Ret3_impl_(p1,p2,p3) [__R_impl(__d_=0)] + +#define _Deref_pre1_impl_(p1) [__P_impl(__d_=0)] +#define _Deref_pre2_impl_(p1,p2) [__P_impl(__d_=0)] +#define _Deref_pre3_impl_(p1,p2,p3) [__P_impl(__d_=0)] + +#define _Deref_post1_impl_(p1) [__P_impl(__d_=0)] +#define _Deref_post2_impl_(p1,p2) [__P_impl(__d_=0)] +#define _Deref_post3_impl_(p1,p2,p3) [__P_impl(__d_=0)] + +#define _Deref_ret1_impl_(p1) [__R_impl(__d_=0)] +#define _Deref_ret2_impl_(p1,p2) [__R_impl(__d_=0)] +#define _Deref_ret3_impl_(p1,p2,p3) [__R_impl(__d_=0)] + +#define _Deref2_pre1_impl_(p1) //[__P_impl(__d_=0)] +#define _Deref2_post1_impl_(p1) 
//[__P_impl(__d_=0)] +#define _Deref2_ret1_impl_(p1) //[__P_impl(__d_=0)] + +#else // ][ + + +#define _SAL_nop_impl_ X + +#define _At_impl_(target, annos) +#define _When_impl_(expr, annos) +#define _Group_impl_(annos) +#define _GrouP_impl_(annos) +#define _At_buffer_impl_(target, iter, bound, annos) +#define _Use_decl_anno_impl_ +#define _Points_to_data_impl_ +#define _Literal_impl_ +#define _Notliteral_impl_ +#define _Notref_impl_ + +#define _Pre_valid_impl_ +#define _Post_valid_impl_ +#define _Ret_valid_impl_ + +#define _Check_return_impl_ +#define _Must_inspect_impl_ + +#define _Success_impl_(expr) +#define _On_failure_impl_(annos) +#define _Always_impl_(annos) + +#define _Printf_format_string_impl_ +#define _Scanf_format_string_impl_ +#define _Scanf_s_format_string_impl_ + +#define _In_bound_impl_ +#define _Out_bound_impl_ +#define _Ret_bound_impl_ +#define _Deref_in_bound_impl_ +#define _Deref_out_bound_impl_ +#define _Deref_ret_bound_impl_ + +#define _Range_impl_(min,max) +#define _In_range_impl_(min,max) +#define _Out_range_impl_(min,max) +#define _Ret_range_impl_(min,max) +#define _Deref_in_range_impl_(min,max) +#define _Deref_out_range_impl_(min,max) +#define _Deref_ret_range_impl_(min,max) + +#define _Satisfies_impl_(expr) +#define _Pre_satisfies_impl_(expr) +#define _Post_satisfies_impl_(expr) + +#define _Null_impl_ +#define _Notnull_impl_ +#define _Maybenull_impl_ + +#define _Valid_impl_ +#define _Notvalid_impl_ +#define _Maybevalid_impl_ + +#define _Field_range_impl_(min,max) + +#define _Pre_impl_ +#define _Pre1_impl_(p1) +#define _Pre2_impl_(p1,p2) +#define _Pre3_impl_(p1,p2,p3) + +#define _Post_impl_ +#define _Post1_impl_(p1) +#define _Post2_impl_(p1,p2) +#define _Post3_impl_(p1,p2,p3) + +#define _Ret1_impl_(p1) +#define _Ret2_impl_(p1,p2) +#define _Ret3_impl_(p1,p2,p3) + +#define _Deref_pre1_impl_(p1) +#define _Deref_pre2_impl_(p1,p2) +#define _Deref_pre3_impl_(p1,p2,p3) + +#define _Deref_post1_impl_(p1) +#define _Deref_post2_impl_(p1,p2) +#define _Deref_post3_impl_(p1,p2,p3) + +#define _Deref_ret1_impl_(p1) +#define _Deref_ret2_impl_(p1,p2) +#define _Deref_ret3_impl_(p1,p2,p3) + +#define _Deref2_pre1_impl_(p1) +#define _Deref2_post1_impl_(p1) +#define _Deref2_ret1_impl_(p1) + +#define _Readable_bytes_impl_(size) +#define _Readable_elements_impl_(size) +#define _Writable_bytes_impl_(size) +#define _Writable_elements_impl_(size) + +#define _Null_terminated_impl_ +#define _NullNull_terminated_impl_ + +// Obsolete -- may be needed for transition to attributes. +#define __inner_typefix(ctype) +#define __inner_exceptthat + +#endif // ] + +// This section contains the deprecated annotations + +/* + ------------------------------------------------------------------------------- + Introduction + + sal.h provides a set of annotations to describe how a function uses its + parameters - the assumptions it makes about them, and the guarantees it makes + upon finishing. + + Annotations may be placed before either a function parameter's type or its return + type, and describe the function's behavior regarding the parameter or return value. + There are two classes of annotations: buffer annotations and advanced annotations. + Buffer annotations describe how functions use their pointer parameters, and + advanced annotations either describe complex/unusual buffer behavior, or provide + additional information about a parameter that is not otherwise expressible. 
+ + ------------------------------------------------------------------------------- + Buffer Annotations + + The most important annotations in sal.h provide a consistent way to annotate + buffer parameters or return values for a function. Each of these annotations describes + a single buffer (which could be a string, a fixed-length or variable-length array, + or just a pointer) that the function interacts with: where it is, how large it is, + how much is initialized, and what the function does with it. + + The appropriate macro for a given buffer can be constructed using the table below. + Just pick the appropriate values from each category, and combine them together + with a leading underscore. Some combinations of values do not make sense as buffer + annotations. Only meaningful annotations can be added to your code; for a list of + these, see the buffer annotation definitions section. + + Only a single buffer annotation should be used for each parameter. + + |------------|------------|---------|--------|----------|----------|---------------| + | Level | Usage | Size | Output | NullTerm | Optional | Parameters | + |------------|------------|---------|--------|----------|----------|---------------| + | <> | <> | <> | <> | _z | <> | <> | + | _deref | _in | _ecount | _full | _nz | _opt | (size) | + | _deref_opt | _out | _bcount | _part | | | (size,length) | + | | _inout | | | | | | + | | | | | | | | + |------------|------------|---------|--------|----------|----------|---------------| + + Level: Describes the buffer pointer's level of indirection from the parameter or + return value 'p'. + + <> : p is the buffer pointer. + _deref : *p is the buffer pointer. p must not be NULL. + _deref_opt : *p may be the buffer pointer. p may be NULL, in which case the rest of + the annotation is ignored. + + Usage: Describes how the function uses the buffer. + + <> : The buffer is not accessed. If used on the return value or with _deref, the + function will provide the buffer, and it will be uninitialized at exit. + Otherwise, the caller must provide the buffer. This should only be used + for alloc and free functions. + _in : The function will only read from the buffer. The caller must provide the + buffer and initialize it. Cannot be used with _deref. + _out : The function will only write to the buffer. If used on the return value or + with _deref, the function will provide the buffer and initialize it. + Otherwise, the caller must provide the buffer, and the function will + initialize it. + _inout : The function may freely read from and write to the buffer. The caller must + provide the buffer and initialize it. If used with _deref, the buffer may + be reallocated by the function. + + Size: Describes the total size of the buffer. This may be less than the space actually + allocated for the buffer, in which case it describes the accessible amount. + + <> : No buffer size is given. If the type specifies the buffer size (such as + with LPSTR and LPWSTR), that amount is used. Otherwise, the buffer is one + element long. Must be used with _in, _out, or _inout. + _ecount : The buffer size is an explicit element count. + _bcount : The buffer size is an explicit byte count. + + Output: Describes how much of the buffer will be initialized by the function. For + _inout buffers, this also describes how much is initialized at entry. Omit this + category for _in buffers; they must be fully initialized by the caller. + + <> : The type specifies how much is initialized. 
For instance, a function initializing + an LPWSTR must NULL-terminate the string. + _full : The function initializes the entire buffer. + _part : The function initializes part of the buffer, and explicitly indicates how much. + + NullTerm: States if the present of a '\0' marks the end of valid elements in the buffer. + _z : A '\0' indicated the end of the buffer + _nz : The buffer may not be null terminated and a '\0' does not indicate the end of the + buffer. + Optional: Describes if the buffer itself is optional. + + <> : The pointer to the buffer must not be NULL. + _opt : The pointer to the buffer might be NULL. It will be checked before being dereferenced. + + Parameters: Gives explicit counts for the size and length of the buffer. + + <> : There is no explicit count. Use when neither _ecount nor _bcount is used. + (size) : Only the buffer's total size is given. Use with _ecount or _bcount but not _part. + (size,length) : The buffer's total size and initialized length are given. Use with _ecount_part + and _bcount_part. + + ------------------------------------------------------------------------------- + Buffer Annotation Examples + + LWSTDAPI_(BOOL) StrToIntExA( + __in LPCSTR pszString, + DWORD dwFlags, + __out int *piRet -- A pointer whose dereference will be filled in. + ); + + void MyPaintingFunction( + __in HWND hwndControl, -- An initialized read-only parameter. + __in_opt HDC hdcOptional, -- An initialized read-only parameter that might be NULL. + __inout IPropertyStore *ppsStore -- An initialized parameter that may be freely used + -- and modified. + ); + + LWSTDAPI_(BOOL) PathCompactPathExA( + __out_ecount(cchMax) LPSTR pszOut, -- A string buffer with cch elements that will + -- be NULL terminated on exit. + __in LPCSTR pszSrc, + UINT cchMax, + DWORD dwFlags + ); + + HRESULT SHLocalAllocBytes( + size_t cb, + __deref_bcount(cb) T **ppv -- A pointer whose dereference will be set to an + -- uninitialized buffer with cb bytes. + ); + + __inout_bcount_full(cb) : A buffer with cb elements that is fully initialized at + entry and exit, and may be written to by this function. + + __out_ecount_part(count, *countOut) : A buffer with count elements that will be + partially initialized by this function. The function indicates how much it + initialized by setting *countOut. + + ------------------------------------------------------------------------------- + Advanced Annotations + + Advanced annotations describe behavior that is not expressible with the regular + buffer macros. These may be used either to annotate buffer parameters that involve + complex or conditional behavior, or to enrich existing annotations with additional + information. + + __success(expr) f : + indicates whether function f succeeded or not. If is true at exit, + all the function's guarantees (as given by other annotations) must hold. If + is false at exit, the caller should not expect any of the function's guarantees + to hold. If not used, the function must always satisfy its guarantees. Added + automatically to functions that indicate success in standard ways, such as by + returning an HRESULT. + + __nullterminated p : + Pointer p is a buffer that may be read or written up to and including the first + NULL character or pointer. May be used on typedefs, which marks valid (properly + initialized) instances of that type as being NULL-terminated. + + __nullnullterminated p : + Pointer p is a buffer that may be read or written up to and including the first + sequence of two NULL characters or pointers. 
May be used on typedefs, which marks + valid instances of that type as being double-NULL terminated. + + __reserved v : + Value v must be 0/NULL, reserved for future use. + + __checkReturn v : + Return value v must not be ignored by callers of this function. + + __typefix(ctype) v : + Value v should be treated as an instance of ctype, rather than its declared type. + + __override f : + Specify C#-style 'override' behaviour for overriding virtual methods. + + __callback f : + Function f can be used as a function pointer. + + __format_string p : + Pointer p is a string that contains % markers in the style of printf. + + __blocksOn(resource) f : + Function f blocks on the resource 'resource'. + + FALLTHROUGH : + Annotates switch statement labels where fall-through is desired, to distinguish + from forgotten break statements. + + ------------------------------------------------------------------------------- + Advanced Annotation Examples + + __success(return != FALSE) LWSTDAPI_(BOOL) + PathCanonicalizeA(__out_ecount(MAX_PATH) LPSTR pszBuf, LPCSTR pszPath) : + pszBuf is only guaranteed to be NULL-terminated when TRUE is returned. + + typedef __nullterminated WCHAR* LPWSTR : Initialized LPWSTRs are NULL-terminated strings. + + __out_ecount(cch) __typefix(LPWSTR) void *psz : psz is a buffer parameter which will be + a NULL-terminated WCHAR string at exit, and which initially contains cch WCHARs. + + ------------------------------------------------------------------------------- +*/ + +#define __specstrings + +#ifdef __cplusplus // [ +#ifndef __nothrow // [ +# define __nothrow NOTHROW_DECL +#endif // ] +extern "C" { +#else // ][ +#ifndef __nothrow // [ +# define __nothrow +#endif // ] +#endif /* #ifdef __cplusplus */ // ] + + +/* + ------------------------------------------------------------------------------- + Helper Macro Definitions + + These express behavior common to many of the high-level annotations. + DO NOT USE THESE IN YOUR CODE. + ------------------------------------------------------------------------------- +*/ + +/* + The helper annotations are only understood by the compiler version used by + various defect detection tools. When the regular compiler is running, they + are defined into nothing, and do not affect the compiled code. +*/ + +#if !defined(__midl) && defined(_PREFAST_) // [ + + /* + In the primitive "SAL_*" annotations "SAL" stands for Standard + Annotation Language. These "SAL_*" annotations are the + primitives the compiler understands and high-level MACROs + will decompose into these primivates. + */ + + #define _SA_SPECSTRIZE( x ) #x + + /* + __null p + __notnull p + __maybenull p + + Annotates a pointer p. States that pointer p is null. Commonly used + in the negated form __notnull or the possibly null form __maybenull. + */ + +#ifndef PAL_STDCPP_COMPAT + #define __null _Null_impl_ + #define __notnull _Notnull_impl_ + #define __maybenull _Maybenull_impl_ +#endif // !PAL_STDCPP_COMPAT + + /* + __readonly l + __notreadonly l + __maybereadonly l + + Annotates a location l. States that location l is not modified after + this point. If the annotation is placed on the precondition state of + a function, the restriction only applies until the postcondition state + of the function. __maybereadonly states that the annotated location + may be modified, whereas __notreadonly states that a location must be + modified. 
+ */ + + #define __readonly _Pre1_impl_(__readaccess_impl) + #define __notreadonly _Pre1_impl_(__allaccess_impl) + #define __maybereadonly _Pre1_impl_(__readaccess_impl) + + /* + __valid v + __notvalid v + __maybevalid v + + Annotates any value v. States that the value satisfies all properties of + valid values of its type. For example, for a string buffer, valid means + that the buffer pointer is either NULL or points to a NULL-terminated string. + */ + + #define __valid _Valid_impl_ + #define __notvalid _Notvalid_impl_ + #define __maybevalid _Maybevalid_impl_ + + /* + __readableTo(extent) p + + Annotates a buffer pointer p. If the buffer can be read, extent describes + how much of the buffer is readable. For a reader of the buffer, this is + an explicit permission to read up to that amount, rather than a restriction to + read only up to it. + */ + + #define __readableTo(extent) _SA_annotes1(SAL_readableTo, extent) + + /* + + __elem_readableTo(size) + + Annotates a buffer pointer p as being readable to size elements. + */ + + #define __elem_readableTo(size) _SA_annotes1(SAL_readableTo, elementCount( size )) + + /* + __byte_readableTo(size) + + Annotates a buffer pointer p as being readable to size bytes. + */ + #define __byte_readableTo(size) _SA_annotes1(SAL_readableTo, byteCount(size)) + + /* + __writableTo(extent) p + + Annotates a buffer pointer p. If the buffer can be modified, extent + describes how much of the buffer is writable (usually the allocation + size). For a writer of the buffer, this is an explicit permission to + write up to that amount, rather than a restriction to write only up to it. + */ + #define __writableTo(size) _SA_annotes1(SAL_writableTo, size) + + /* + __elem_writableTo(size) + + Annotates a buffer pointer p as being writable to size elements. + */ + #define __elem_writableTo(size) _SA_annotes1(SAL_writableTo, elementCount( size )) + + /* + __byte_writableTo(size) + + Annotates a buffer pointer p as being writable to size bytes. + */ + #define __byte_writableTo(size) _SA_annotes1(SAL_writableTo, byteCount( size)) + + /* + __deref p + + Annotates a pointer p. The next annotation applies one dereference down + in the type. If readableTo(p, size) then the next annotation applies to + all elements *(p+i) for which i satisfies the size. If p is a pointer + to a struct, the next annotation applies to all fields of the struct. + */ + #define __deref _Deref_impl_ + + /* + __pre __next_annotation + + The next annotation applies in the precondition state + */ + #define __pre _Pre_impl_ + + /* + __post __next_annotation + + The next annotation applies in the postcondition state + */ + #define __post _Post_impl_ + + /* + __precond() + + When is true, the next annotation applies in the precondition state + (currently not enabled) + */ + #define __precond(expr) __pre + + /* + __postcond() + + When is true, the next annotation applies in the postcondition state + (currently not enabled) + */ + #define __postcond(expr) __post + + /* + __exceptthat + + Given a set of annotations Q containing __exceptthat maybeP, the effect of + the except clause is to erase any P or notP annotations (explicit or + implied) within Q at the same level of dereferencing that the except + clause appears, and to replace it with maybeP. + + Example 1: __valid __pre_except_maybenull on a pointer p means that the + pointer may be null, and is otherwise valid, thus overriding + the implicit notnull annotation implied by __valid on + pointers. 
+ + Example 2: __valid __deref __pre_except_maybenull on an int **p means + that p is not null (implied by valid), but the elements + pointed to by p could be null, and are otherwise valid. + */ + #define __exceptthat __inner_exceptthat + + /* + _refparam + + Added to all out parameter macros to indicate that they are all reference + parameters. + */ + #define __refparam _Notref_ __deref __notreadonly + + /* + __inner_* + + Helper macros that directly correspond to certain high-level annotations. + + */ + + /* + Macros to classify the entrypoints and indicate their category. + + Pre-defined control point categories include: RPC, LPC, DeviceDriver, UserToKernel, ISAPI, COM. + + */ + #define __inner_control_entrypoint(category) _SA_annotes2(SAL_entrypoint, controlEntry, category) + + + /* + Pre-defined data entry point categories include: Registry, File, Network. + */ + #define __inner_data_entrypoint(category) _SA_annotes2(SAL_entrypoint, dataEntry, category) + + #define __inner_override _SA_annotes0(__override) + #define __inner_callback _SA_annotes0(__callback) + #define __inner_blocksOn(resource) _SA_annotes1(SAL_blocksOn, resource) + + #define __post_except_maybenull __post __inner_exceptthat _Maybenull_impl_ + #define __pre_except_maybenull __pre __inner_exceptthat _Maybenull_impl_ + + #define __post_deref_except_maybenull __post __deref __inner_exceptthat _Maybenull_impl_ + #define __pre_deref_except_maybenull __pre __deref __inner_exceptthat _Maybenull_impl_ + + #define __inexpressible_readableTo(size) _Readable_elements_impl_(_Inexpressible_(size)) + #define __inexpressible_writableTo(size) _Writable_elements_impl_(_Inexpressible_(size)) + + +#else // ][ +#ifndef PAL_STDCPP_COMPAT + #define __null + #define __notnull + #define __deref +#endif // !PAL_STDCPP_COMPAT + #define __maybenull + #define __readonly + #define __notreadonly + #define __maybereadonly + #define __valid + #define __notvalid + #define __maybevalid + #define __readableTo(extent) + #define __elem_readableTo(size) + #define __byte_readableTo(size) + #define __writableTo(size) + #define __elem_writableTo(size) + #define __byte_writableTo(size) + #define __pre + #define __post + #define __precond(expr) + #define __postcond(expr) + #define __exceptthat + #define __inner_override + #define __inner_callback + #define __inner_blocksOn(resource) + #define __refparam + #define __inner_control_entrypoint(category) + #define __inner_data_entrypoint(category) + + #define __post_except_maybenull + #define __pre_except_maybenull + #define __post_deref_except_maybenull + #define __pre_deref_except_maybenull + + #define __inexpressible_readableTo(size) + #define __inexpressible_writableTo(size) + +#endif /* #if !defined(__midl) && defined(_PREFAST_) */ // ] + +/* +------------------------------------------------------------------------------- +Buffer Annotation Definitions + +Any of these may be used to directly annotate functions, but only one should +be used for each parameter. To determine which annotation to use for a given +buffer, use the table in the buffer annotations section. 
+------------------------------------------------------------------------------- +*/ + +#define __ecount(size) _SAL1_Source_(__ecount, (size), __notnull __elem_writableTo(size)) +#define __bcount(size) _SAL1_Source_(__bcount, (size), __notnull __byte_writableTo(size)) +#define __in_ecount(size) _SAL1_Source_(__in_ecount, (size), _In_reads_(size)) +#define __in_bcount(size) _SAL1_Source_(__in_bcount, (size), _In_reads_bytes_(size)) +#define __in_z _SAL1_Source_(__in_z, (), _In_z_) +#define __in_ecount_z(size) _SAL1_Source_(__in_ecount_z, (size), _In_reads_z_(size)) +#define __in_bcount_z(size) _SAL1_Source_(__in_bcount_z, (size), __in_bcount(size) __pre __nullterminated) +#define __in_nz _SAL1_Source_(__in_nz, (), __in) +#define __in_ecount_nz(size) _SAL1_Source_(__in_ecount_nz, (size), __in_ecount(size)) +#define __in_bcount_nz(size) _SAL1_Source_(__in_bcount_nz, (size), __in_bcount(size)) +#define __out_ecount(size) _SAL1_Source_(__out_ecount, (size), _Out_writes_(size)) +#define __out_bcount(size) _SAL1_Source_(__out_bcount, (size), _Out_writes_bytes_(size)) +#define __out_ecount_part(size,length) _SAL1_Source_(__out_ecount_part, (size,length), _Out_writes_to_(size,length)) +#define __out_bcount_part(size,length) _SAL1_Source_(__out_bcount_part, (size,length), _Out_writes_bytes_to_(size,length)) +#define __out_ecount_full(size) _SAL1_Source_(__out_ecount_full, (size), _Out_writes_all_(size)) +#define __out_bcount_full(size) _SAL1_Source_(__out_bcount_full, (size), _Out_writes_bytes_all_(size)) +#define __out_z _SAL1_Source_(__out_z, (), __post __valid __refparam __post __nullterminated) +#define __out_z_opt _SAL1_Source_(__out_z_opt, (), __post __valid __refparam __post __nullterminated __pre_except_maybenull) +#define __out_ecount_z(size) _SAL1_Source_(__out_ecount_z, (size), __ecount(size) __post __valid __refparam __post __nullterminated) +#define __out_bcount_z(size) _SAL1_Source_(__out_bcount_z, (size), __bcount(size) __post __valid __refparam __post __nullterminated) +#define __out_ecount_part_z(size,length) _SAL1_Source_(__out_ecount_part_z, (size,length), __out_ecount_part(size,length) __post __nullterminated) +#define __out_bcount_part_z(size,length) _SAL1_Source_(__out_bcount_part_z, (size,length), __out_bcount_part(size,length) __post __nullterminated) +#define __out_ecount_full_z(size) _SAL1_Source_(__out_ecount_full_z, (size), __out_ecount_full(size) __post __nullterminated) +#define __out_bcount_full_z(size) _SAL1_Source_(__out_bcount_full_z, (size), __out_bcount_full(size) __post __nullterminated) +#define __out_nz _SAL1_Source_(__out_nz, (), __post __valid __refparam) +#define __out_nz_opt _SAL1_Source_(__out_nz_opt, (), __post __valid __refparam __post_except_maybenull_) +#define __out_ecount_nz(size) _SAL1_Source_(__out_ecount_nz, (size), __ecount(size) __post __valid __refparam) +#define __out_bcount_nz(size) _SAL1_Source_(__out_bcount_nz, (size), __bcount(size) __post __valid __refparam) +#define __inout _SAL1_Source_(__inout, (), _Inout_) +#define __inout_ecount(size) _SAL1_Source_(__inout_ecount, (size), _Inout_updates_(size)) +#define __inout_bcount(size) _SAL1_Source_(__inout_bcount, (size), _Inout_updates_bytes_(size)) +#define __inout_ecount_part(size,length) _SAL1_Source_(__inout_ecount_part, (size,length), _Inout_updates_to_(size,length)) +#define __inout_bcount_part(size,length) _SAL1_Source_(__inout_bcount_part, (size,length), _Inout_updates_bytes_to_(size,length)) +#define __inout_ecount_full(size) _SAL1_Source_(__inout_ecount_full, (size), 
_Inout_updates_all_(size)) +#define __inout_bcount_full(size) _SAL1_Source_(__inout_bcount_full, (size), _Inout_updates_bytes_all_(size)) +#define __inout_z _SAL1_Source_(__inout_z, (), _Inout_z_) +#define __inout_ecount_z(size) _SAL1_Source_(__inout_ecount_z, (size), _Inout_updates_z_(size)) +#define __inout_bcount_z(size) _SAL1_Source_(__inout_bcount_z, (size), __inout_bcount(size) __pre __nullterminated __post __nullterminated) +#define __inout_nz _SAL1_Source_(__inout_nz, (), __inout) +#define __inout_ecount_nz(size) _SAL1_Source_(__inout_ecount_nz, (size), __inout_ecount(size)) +#define __inout_bcount_nz(size) _SAL1_Source_(__inout_bcount_nz, (size), __inout_bcount(size)) +#define __ecount_opt(size) _SAL1_Source_(__ecount_opt, (size), __ecount(size) __pre_except_maybenull) +#define __bcount_opt(size) _SAL1_Source_(__bcount_opt, (size), __bcount(size) __pre_except_maybenull) +#define __in_opt _SAL1_Source_(__in_opt, (), _In_opt_) +#define __in_ecount_opt(size) _SAL1_Source_(__in_ecount_opt, (size), _In_reads_opt_(size)) +#define __in_bcount_opt(size) _SAL1_Source_(__in_bcount_opt, (size), _In_reads_bytes_opt_(size)) +#define __in_z_opt _SAL1_Source_(__in_z_opt, (), _In_opt_z_) +#define __in_ecount_z_opt(size) _SAL1_Source_(__in_ecount_z_opt, (size), __in_ecount_opt(size) __pre __nullterminated) +#define __in_bcount_z_opt(size) _SAL1_Source_(__in_bcount_z_opt, (size), __in_bcount_opt(size) __pre __nullterminated) +#define __in_nz_opt _SAL1_Source_(__in_nz_opt, (), __in_opt) +#define __in_ecount_nz_opt(size) _SAL1_Source_(__in_ecount_nz_opt, (size), __in_ecount_opt(size)) +#define __in_bcount_nz_opt(size) _SAL1_Source_(__in_bcount_nz_opt, (size), __in_bcount_opt(size)) +#define __out_opt _SAL1_Source_(__out_opt, (), _Out_opt_) +#define __out_ecount_opt(size) _SAL1_Source_(__out_ecount_opt, (size), _Out_writes_opt_(size)) +#define __out_bcount_opt(size) _SAL1_Source_(__out_bcount_opt, (size), _Out_writes_bytes_opt_(size)) +#define __out_ecount_part_opt(size,length) _SAL1_Source_(__out_ecount_part_opt, (size,length), __out_ecount_part(size,length) __pre_except_maybenull) +#define __out_bcount_part_opt(size,length) _SAL1_Source_(__out_bcount_part_opt, (size,length), __out_bcount_part(size,length) __pre_except_maybenull) +#define __out_ecount_full_opt(size) _SAL1_Source_(__out_ecount_full_opt, (size), __out_ecount_full(size) __pre_except_maybenull) +#define __out_bcount_full_opt(size) _SAL1_Source_(__out_bcount_full_opt, (size), __out_bcount_full(size) __pre_except_maybenull) +#define __out_ecount_z_opt(size) _SAL1_Source_(__out_ecount_z_opt, (size), __out_ecount_opt(size) __post __nullterminated) +#define __out_bcount_z_opt(size) _SAL1_Source_(__out_bcount_z_opt, (size), __out_bcount_opt(size) __post __nullterminated) +#define __out_ecount_part_z_opt(size,length) _SAL1_Source_(__out_ecount_part_z_opt, (size,length), __out_ecount_part_opt(size,length) __post __nullterminated) +#define __out_bcount_part_z_opt(size,length) _SAL1_Source_(__out_bcount_part_z_opt, (size,length), __out_bcount_part_opt(size,length) __post __nullterminated) +#define __out_ecount_full_z_opt(size) _SAL1_Source_(__out_ecount_full_z_opt, (size), __out_ecount_full_opt(size) __post __nullterminated) +#define __out_bcount_full_z_opt(size) _SAL1_Source_(__out_bcount_full_z_opt, (size), __out_bcount_full_opt(size) __post __nullterminated) +#define __out_ecount_nz_opt(size) _SAL1_Source_(__out_ecount_nz_opt, (size), __out_ecount_opt(size) __post __nullterminated) +#define __out_bcount_nz_opt(size) 
_SAL1_Source_(__out_bcount_nz_opt, (size), __out_bcount_opt(size) __post __nullterminated) +#define __inout_opt _SAL1_Source_(__inout_opt, (), _Inout_opt_) +#define __inout_ecount_opt(size) _SAL1_Source_(__inout_ecount_opt, (size), __inout_ecount(size) __pre_except_maybenull) +#define __inout_bcount_opt(size) _SAL1_Source_(__inout_bcount_opt, (size), __inout_bcount(size) __pre_except_maybenull) +#define __inout_ecount_part_opt(size,length) _SAL1_Source_(__inout_ecount_part_opt, (size,length), __inout_ecount_part(size,length) __pre_except_maybenull) +#define __inout_bcount_part_opt(size,length) _SAL1_Source_(__inout_bcount_part_opt, (size,length), __inout_bcount_part(size,length) __pre_except_maybenull) +#define __inout_ecount_full_opt(size) _SAL1_Source_(__inout_ecount_full_opt, (size), __inout_ecount_full(size) __pre_except_maybenull) +#define __inout_bcount_full_opt(size) _SAL1_Source_(__inout_bcount_full_opt, (size), __inout_bcount_full(size) __pre_except_maybenull) +#define __inout_z_opt _SAL1_Source_(__inout_z_opt, (), __inout_opt __pre __nullterminated __post __nullterminated) +#define __inout_ecount_z_opt(size) _SAL1_Source_(__inout_ecount_z_opt, (size), __inout_ecount_opt(size) __pre __nullterminated __post __nullterminated) +#define __inout_ecount_z_opt(size) _SAL1_Source_(__inout_ecount_z_opt, (size), __inout_ecount_opt(size) __pre __nullterminated __post __nullterminated) +#define __inout_bcount_z_opt(size) _SAL1_Source_(__inout_bcount_z_opt, (size), __inout_bcount_opt(size)) +#define __inout_nz_opt _SAL1_Source_(__inout_nz_opt, (), __inout_opt) +#define __inout_ecount_nz_opt(size) _SAL1_Source_(__inout_ecount_nz_opt, (size), __inout_ecount_opt(size)) +#define __inout_bcount_nz_opt(size) _SAL1_Source_(__inout_bcount_nz_opt, (size), __inout_bcount_opt(size)) +#define __deref_ecount(size) _SAL1_Source_(__deref_ecount, (size), _Notref_ __ecount(1) __post _Notref_ __elem_readableTo(1) __post _Notref_ __deref _Notref_ __notnull __post __deref __elem_writableTo(size)) +#define __deref_bcount(size) _SAL1_Source_(__deref_bcount, (size), _Notref_ __ecount(1) __post _Notref_ __elem_readableTo(1) __post _Notref_ __deref _Notref_ __notnull __post __deref __byte_writableTo(size)) +#define __deref_out _SAL1_Source_(__deref_out, (), _Outptr_) +#define __deref_out_ecount(size) _SAL1_Source_(__deref_out_ecount, (size), _Outptr_result_buffer_(size)) +#define __deref_out_bcount(size) _SAL1_Source_(__deref_out_bcount, (size), _Outptr_result_bytebuffer_(size)) +#define __deref_out_ecount_part(size,length) _SAL1_Source_(__deref_out_ecount_part, (size,length), _Outptr_result_buffer_to_(size,length)) +#define __deref_out_bcount_part(size,length) _SAL1_Source_(__deref_out_bcount_part, (size,length), _Outptr_result_bytebuffer_to_(size,length)) +#define __deref_out_ecount_full(size) _SAL1_Source_(__deref_out_ecount_full, (size), __deref_out_ecount_part(size,size)) +#define __deref_out_bcount_full(size) _SAL1_Source_(__deref_out_bcount_full, (size), __deref_out_bcount_part(size,size)) +#define __deref_out_z _SAL1_Source_(__deref_out_z, (), _Outptr_result_z_) +#define __deref_out_ecount_z(size) _SAL1_Source_(__deref_out_ecount_z, (size), __deref_out_ecount(size) __post __deref __nullterminated) +#define __deref_out_bcount_z(size) _SAL1_Source_(__deref_out_bcount_z, (size), __deref_out_bcount(size) __post __deref __nullterminated) +#define __deref_out_nz _SAL1_Source_(__deref_out_nz, (), __deref_out) +#define __deref_out_ecount_nz(size) _SAL1_Source_(__deref_out_ecount_nz, (size), __deref_out_ecount(size)) 
+#define __deref_out_bcount_nz(size) _SAL1_Source_(__deref_out_bcount_nz, (size), __deref_out_ecount(size)) +#define __deref_inout _SAL1_Source_(__deref_inout, (), _Notref_ __notnull _Notref_ __elem_readableTo(1) __pre __deref __valid __post _Notref_ __deref __valid __refparam) +#define __deref_inout_z _SAL1_Source_(__deref_inout_z, (), __deref_inout __pre __deref __nullterminated __post _Notref_ __deref __nullterminated) +#define __deref_inout_ecount(size) _SAL1_Source_(__deref_inout_ecount, (size), __deref_inout __pre __deref __elem_writableTo(size) __post _Notref_ __deref __elem_writableTo(size)) +#define __deref_inout_bcount(size) _SAL1_Source_(__deref_inout_bcount, (size), __deref_inout __pre __deref __byte_writableTo(size) __post _Notref_ __deref __byte_writableTo(size)) +#define __deref_inout_ecount_part(size,length) _SAL1_Source_(__deref_inout_ecount_part, (size,length), __deref_inout_ecount(size) __pre __deref __elem_readableTo(length) __post __deref __elem_readableTo(length)) +#define __deref_inout_bcount_part(size,length) _SAL1_Source_(__deref_inout_bcount_part, (size,length), __deref_inout_bcount(size) __pre __deref __byte_readableTo(length) __post __deref __byte_readableTo(length)) +#define __deref_inout_ecount_full(size) _SAL1_Source_(__deref_inout_ecount_full, (size), __deref_inout_ecount_part(size,size)) +#define __deref_inout_bcount_full(size) _SAL1_Source_(__deref_inout_bcount_full, (size), __deref_inout_bcount_part(size,size)) +#define __deref_inout_ecount_z(size) _SAL1_Source_(__deref_inout_ecount_z, (size), __deref_inout_ecount(size) __pre __deref __nullterminated __post __deref __nullterminated) +#define __deref_inout_bcount_z(size) _SAL1_Source_(__deref_inout_bcount_z, (size), __deref_inout_bcount(size) __pre __deref __nullterminated __post __deref __nullterminated) +#define __deref_inout_nz _SAL1_Source_(__deref_inout_nz, (), __deref_inout) +#define __deref_inout_ecount_nz(size) _SAL1_Source_(__deref_inout_ecount_nz, (size), __deref_inout_ecount(size)) +#define __deref_inout_bcount_nz(size) _SAL1_Source_(__deref_inout_bcount_nz, (size), __deref_inout_ecount(size)) +#define __deref_ecount_opt(size) _SAL1_Source_(__deref_ecount_opt, (size), __deref_ecount(size) __post_deref_except_maybenull) +#define __deref_bcount_opt(size) _SAL1_Source_(__deref_bcount_opt, (size), __deref_bcount(size) __post_deref_except_maybenull) +#define __deref_out_opt _SAL1_Source_(__deref_out_opt, (), __deref_out __post_deref_except_maybenull) +#define __deref_out_ecount_opt(size) _SAL1_Source_(__deref_out_ecount_opt, (size), __deref_out_ecount(size) __post_deref_except_maybenull) +#define __deref_out_bcount_opt(size) _SAL1_Source_(__deref_out_bcount_opt, (size), __deref_out_bcount(size) __post_deref_except_maybenull) +#define __deref_out_ecount_part_opt(size,length) _SAL1_Source_(__deref_out_ecount_part_opt, (size,length), __deref_out_ecount_part(size,length) __post_deref_except_maybenull) +#define __deref_out_bcount_part_opt(size,length) _SAL1_Source_(__deref_out_bcount_part_opt, (size,length), __deref_out_bcount_part(size,length) __post_deref_except_maybenull) +#define __deref_out_ecount_full_opt(size) _SAL1_Source_(__deref_out_ecount_full_opt, (size), __deref_out_ecount_full(size) __post_deref_except_maybenull) +#define __deref_out_bcount_full_opt(size) _SAL1_Source_(__deref_out_bcount_full_opt, (size), __deref_out_bcount_full(size) __post_deref_except_maybenull) +#define __deref_out_z_opt _SAL1_Source_(__deref_out_z_opt, (), _Outptr_result_maybenull_z_) +#define 
__deref_out_ecount_z_opt(size) _SAL1_Source_(__deref_out_ecount_z_opt, (size), __deref_out_ecount_opt(size) __post __deref __nullterminated) +#define __deref_out_bcount_z_opt(size) _SAL1_Source_(__deref_out_bcount_z_opt, (size), __deref_out_bcount_opt(size) __post __deref __nullterminated) +#define __deref_out_nz_opt _SAL1_Source_(__deref_out_nz_opt, (), __deref_out_opt) +#define __deref_out_ecount_nz_opt(size) _SAL1_Source_(__deref_out_ecount_nz_opt, (size), __deref_out_ecount_opt(size)) +#define __deref_out_bcount_nz_opt(size) _SAL1_Source_(__deref_out_bcount_nz_opt, (size), __deref_out_bcount_opt(size)) +#define __deref_inout_opt _SAL1_Source_(__deref_inout_opt, (), __deref_inout __pre_deref_except_maybenull __post_deref_except_maybenull) +#define __deref_inout_ecount_opt(size) _SAL1_Source_(__deref_inout_ecount_opt, (size), __deref_inout_ecount(size) __pre_deref_except_maybenull __post_deref_except_maybenull) +#define __deref_inout_bcount_opt(size) _SAL1_Source_(__deref_inout_bcount_opt, (size), __deref_inout_bcount(size) __pre_deref_except_maybenull __post_deref_except_maybenull) +#define __deref_inout_ecount_part_opt(size,length) _SAL1_Source_(__deref_inout_ecount_part_opt, (size,length), __deref_inout_ecount_part(size,length) __pre_deref_except_maybenull __post_deref_except_maybenull) +#define __deref_inout_bcount_part_opt(size,length) _SAL1_Source_(__deref_inout_bcount_part_opt, (size,length), __deref_inout_bcount_part(size,length) __pre_deref_except_maybenull __post_deref_except_maybenull) +#define __deref_inout_ecount_full_opt(size) _SAL1_Source_(__deref_inout_ecount_full_opt, (size), __deref_inout_ecount_full(size) __pre_deref_except_maybenull __post_deref_except_maybenull) +#define __deref_inout_bcount_full_opt(size) _SAL1_Source_(__deref_inout_bcount_full_opt, (size), __deref_inout_bcount_full(size) __pre_deref_except_maybenull __post_deref_except_maybenull) +#define __deref_inout_z_opt _SAL1_Source_(__deref_inout_z_opt, (), __deref_inout_opt __pre __deref __nullterminated __post __deref __nullterminated) +#define __deref_inout_ecount_z_opt(size) _SAL1_Source_(__deref_inout_ecount_z_opt, (size), __deref_inout_ecount_opt(size) __pre __deref __nullterminated __post __deref __nullterminated) +#define __deref_inout_bcount_z_opt(size) _SAL1_Source_(__deref_inout_bcount_z_opt, (size), __deref_inout_bcount_opt(size) __pre __deref __nullterminated __post __deref __nullterminated) +#define __deref_inout_nz_opt _SAL1_Source_(__deref_inout_nz_opt, (), __deref_inout_opt) +#define __deref_inout_ecount_nz_opt(size) _SAL1_Source_(__deref_inout_ecount_nz_opt, (size), __deref_inout_ecount_opt(size)) +#define __deref_inout_bcount_nz_opt(size) _SAL1_Source_(__deref_inout_bcount_nz_opt, (size), __deref_inout_bcount_opt(size)) +#define __deref_opt_ecount(size) _SAL1_Source_(__deref_opt_ecount, (size), __deref_ecount(size) __pre_except_maybenull) +#define __deref_opt_bcount(size) _SAL1_Source_(__deref_opt_bcount, (size), __deref_bcount(size) __pre_except_maybenull) +#define __deref_opt_out _SAL1_Source_(__deref_opt_out, (), _Outptr_opt_) +#define __deref_opt_out_z _SAL1_Source_(__deref_opt_out_z, (), _Outptr_opt_result_z_) +#define __deref_opt_out_ecount(size) _SAL1_Source_(__deref_opt_out_ecount, (size), __deref_out_ecount(size) __pre_except_maybenull) +#define __deref_opt_out_bcount(size) _SAL1_Source_(__deref_opt_out_bcount, (size), __deref_out_bcount(size) __pre_except_maybenull) +#define __deref_opt_out_ecount_part(size,length) _SAL1_Source_(__deref_opt_out_ecount_part, (size,length), 
__deref_out_ecount_part(size,length) __pre_except_maybenull) +#define __deref_opt_out_bcount_part(size,length) _SAL1_Source_(__deref_opt_out_bcount_part, (size,length), __deref_out_bcount_part(size,length) __pre_except_maybenull) +#define __deref_opt_out_ecount_full(size) _SAL1_Source_(__deref_opt_out_ecount_full, (size), __deref_out_ecount_full(size) __pre_except_maybenull) +#define __deref_opt_out_bcount_full(size) _SAL1_Source_(__deref_opt_out_bcount_full, (size), __deref_out_bcount_full(size) __pre_except_maybenull) +#define __deref_opt_inout _SAL1_Source_(__deref_opt_inout, (), _Inout_opt_) +#define __deref_opt_inout_ecount(size) _SAL1_Source_(__deref_opt_inout_ecount, (size), __deref_inout_ecount(size) __pre_except_maybenull) +#define __deref_opt_inout_bcount(size) _SAL1_Source_(__deref_opt_inout_bcount, (size), __deref_inout_bcount(size) __pre_except_maybenull) +#define __deref_opt_inout_ecount_part(size,length) _SAL1_Source_(__deref_opt_inout_ecount_part, (size,length), __deref_inout_ecount_part(size,length) __pre_except_maybenull) +#define __deref_opt_inout_bcount_part(size,length) _SAL1_Source_(__deref_opt_inout_bcount_part, (size,length), __deref_inout_bcount_part(size,length) __pre_except_maybenull) +#define __deref_opt_inout_ecount_full(size) _SAL1_Source_(__deref_opt_inout_ecount_full, (size), __deref_inout_ecount_full(size) __pre_except_maybenull) +#define __deref_opt_inout_bcount_full(size) _SAL1_Source_(__deref_opt_inout_bcount_full, (size), __deref_inout_bcount_full(size) __pre_except_maybenull) +#define __deref_opt_inout_z _SAL1_Source_(__deref_opt_inout_z, (), __deref_opt_inout __pre __deref __nullterminated __post __deref __nullterminated) +#define __deref_opt_inout_ecount_z(size) _SAL1_Source_(__deref_opt_inout_ecount_z, (size), __deref_opt_inout_ecount(size) __pre __deref __nullterminated __post __deref __nullterminated) +#define __deref_opt_inout_bcount_z(size) _SAL1_Source_(__deref_opt_inout_bcount_z, (size), __deref_opt_inout_bcount(size) __pre __deref __nullterminated __post __deref __nullterminated) +#define __deref_opt_inout_nz _SAL1_Source_(__deref_opt_inout_nz, (), __deref_opt_inout) +#define __deref_opt_inout_ecount_nz(size) _SAL1_Source_(__deref_opt_inout_ecount_nz, (size), __deref_opt_inout_ecount(size)) +#define __deref_opt_inout_bcount_nz(size) _SAL1_Source_(__deref_opt_inout_bcount_nz, (size), __deref_opt_inout_bcount(size)) +#define __deref_opt_ecount_opt(size) _SAL1_Source_(__deref_opt_ecount_opt, (size), __deref_ecount_opt(size) __pre_except_maybenull) +#define __deref_opt_bcount_opt(size) _SAL1_Source_(__deref_opt_bcount_opt, (size), __deref_bcount_opt(size) __pre_except_maybenull) +#define __deref_opt_out_opt _SAL1_Source_(__deref_opt_out_opt, (), _Outptr_opt_result_maybenull_) +#define __deref_opt_out_ecount_opt(size) _SAL1_Source_(__deref_opt_out_ecount_opt, (size), __deref_out_ecount_opt(size) __pre_except_maybenull) +#define __deref_opt_out_bcount_opt(size) _SAL1_Source_(__deref_opt_out_bcount_opt, (size), __deref_out_bcount_opt(size) __pre_except_maybenull) +#define __deref_opt_out_ecount_part_opt(size,length) _SAL1_Source_(__deref_opt_out_ecount_part_opt, (size,length), __deref_out_ecount_part_opt(size,length) __pre_except_maybenull) +#define __deref_opt_out_bcount_part_opt(size,length) _SAL1_Source_(__deref_opt_out_bcount_part_opt, (size,length), __deref_out_bcount_part_opt(size,length) __pre_except_maybenull) +#define __deref_opt_out_ecount_full_opt(size) _SAL1_Source_(__deref_opt_out_ecount_full_opt, (size), 
__deref_out_ecount_full_opt(size) __pre_except_maybenull) +#define __deref_opt_out_bcount_full_opt(size) _SAL1_Source_(__deref_opt_out_bcount_full_opt, (size), __deref_out_bcount_full_opt(size) __pre_except_maybenull) +#define __deref_opt_out_z_opt _SAL1_Source_(__deref_opt_out_z_opt, (), __post __deref __valid __refparam __pre_except_maybenull __pre_deref_except_maybenull __post_deref_except_maybenull __post __deref __nullterminated) +#define __deref_opt_out_ecount_z_opt(size) _SAL1_Source_(__deref_opt_out_ecount_z_opt, (size), __deref_opt_out_ecount_opt(size) __post __deref __nullterminated) +#define __deref_opt_out_bcount_z_opt(size) _SAL1_Source_(__deref_opt_out_bcount_z_opt, (size), __deref_opt_out_bcount_opt(size) __post __deref __nullterminated) +#define __deref_opt_out_nz_opt _SAL1_Source_(__deref_opt_out_nz_opt, (), __deref_opt_out_opt) +#define __deref_opt_out_ecount_nz_opt(size) _SAL1_Source_(__deref_opt_out_ecount_nz_opt, (size), __deref_opt_out_ecount_opt(size)) +#define __deref_opt_out_bcount_nz_opt(size) _SAL1_Source_(__deref_opt_out_bcount_nz_opt, (size), __deref_opt_out_bcount_opt(size)) +#define __deref_opt_inout_opt _SAL1_Source_(__deref_opt_inout_opt, (), __deref_inout_opt __pre_except_maybenull) +#define __deref_opt_inout_ecount_opt(size) _SAL1_Source_(__deref_opt_inout_ecount_opt, (size), __deref_inout_ecount_opt(size) __pre_except_maybenull) +#define __deref_opt_inout_bcount_opt(size) _SAL1_Source_(__deref_opt_inout_bcount_opt, (size), __deref_inout_bcount_opt(size) __pre_except_maybenull) +#define __deref_opt_inout_ecount_part_opt(size,length) _SAL1_Source_(__deref_opt_inout_ecount_part_opt, (size,length), __deref_inout_ecount_part_opt(size,length) __pre_except_maybenull) +#define __deref_opt_inout_bcount_part_opt(size,length) _SAL1_Source_(__deref_opt_inout_bcount_part_opt, (size,length), __deref_inout_bcount_part_opt(size,length) __pre_except_maybenull) +#define __deref_opt_inout_ecount_full_opt(size) _SAL1_Source_(__deref_opt_inout_ecount_full_opt, (size), __deref_inout_ecount_full_opt(size) __pre_except_maybenull) +#define __deref_opt_inout_bcount_full_opt(size) _SAL1_Source_(__deref_opt_inout_bcount_full_opt, (size), __deref_inout_bcount_full_opt(size) __pre_except_maybenull) +#define __deref_opt_inout_z_opt _SAL1_Source_(__deref_opt_inout_z_opt, (), __deref_opt_inout_opt __pre __deref __nullterminated __post __deref __nullterminated) +#define __deref_opt_inout_ecount_z_opt(size) _SAL1_Source_(__deref_opt_inout_ecount_z_opt, (size), __deref_opt_inout_ecount_opt(size) __pre __deref __nullterminated __post __deref __nullterminated) +#define __deref_opt_inout_bcount_z_opt(size) _SAL1_Source_(__deref_opt_inout_bcount_z_opt, (size), __deref_opt_inout_bcount_opt(size) __pre __deref __nullterminated __post __deref __nullterminated) +#define __deref_opt_inout_nz_opt _SAL1_Source_(__deref_opt_inout_nz_opt, (), __deref_opt_inout_opt) +#define __deref_opt_inout_ecount_nz_opt(size) _SAL1_Source_(__deref_opt_inout_ecount_nz_opt, (size), __deref_opt_inout_ecount_opt(size)) +#define __deref_opt_inout_bcount_nz_opt(size) _SAL1_Source_(__deref_opt_inout_bcount_nz_opt, (size), __deref_opt_inout_bcount_opt(size)) + +/* +------------------------------------------------------------------------------- +Advanced Annotation Definitions + +Any of these may be used to directly annotate functions, and may be used in +combination with each other or with regular buffer macros. For an explanation +of each annotation, see the advanced annotations section. 
+------------------------------------------------------------------------------- +*/ + +#define __success(expr) _Success_(expr) +#define __nullterminated _Null_terminated_ +#define __nullnullterminated +#define __clr_reserved _SAL1_Source_(__reserved, (), _Reserved_) +#define __checkReturn _SAL1_Source_(__checkReturn, (), _Check_return_) +#define __typefix(ctype) _SAL1_Source_(__typefix, (ctype), __inner_typefix(ctype)) +#define __override __inner_override +#define __callback __inner_callback +#define __format_string _Printf_format_string_ +#define __blocksOn(resource) __inner_blocksOn(resource) +#define __control_entrypoint(category) __inner_control_entrypoint(category) +#define __data_entrypoint(category) __inner_data_entrypoint(category) +#define __useHeader _Use_decl_anno_impl_ +#define __on_failure(annotes) _On_failure_impl_(annotes _SAL_nop_impl_) + +#ifndef __has_cpp_attribute +#define __has_cpp_attribute(x) (0) +#endif + +#ifndef __fallthrough // [ +#if __has_cpp_attribute(fallthrough) +#define __fallthrough [[fallthrough]] +#else +#define __fallthrough +#endif +#endif // ] + +#ifndef __analysis_assume // [ +#ifdef _PREFAST_ // [ +#define __analysis_assume(expr) __assume(expr) +#else // ][ +#define __analysis_assume(expr) +#endif // ] +#endif // ] + +#ifndef _Analysis_assume_ // [ +#ifdef _PREFAST_ // [ +#define _Analysis_assume_(expr) __assume(expr) +#else // ][ +#define _Analysis_assume_(expr) +#endif // ] +#endif // ] + +#define _Analysis_noreturn_ _SAL2_Source_(_Analysis_noreturn_, (), _SA_annotes0(SAL_terminates)) + +#ifdef _PREFAST_ // [ +__inline __nothrow +void __AnalysisAssumeNullterminated(_Post_ __nullterminated void *p); + +#define _Analysis_assume_nullterminated_(x) __AnalysisAssumeNullterminated(x) +#else // ][ +#define _Analysis_assume_nullterminated_(x) +#endif // ] + +// +// Set the analysis mode (global flags to analysis). +// They take effect at the point of declaration; use at global scope +// as a declaration. +// + +// Synthesize a unique symbol. +#define ___MKID(x, y) x ## y +#define __MKID(x, y) ___MKID(x, y) +#define __GENSYM(x) __MKID(x, __COUNTER__) + +__ANNOTATION(SAL_analysisMode(__AuToQuOtE __In_impl_ char *mode);) + +#define _Analysis_mode_impl_(mode) _SA_annotes1(SAL_analysisMode, #mode) + +#define _Analysis_mode_(mode) \ + typedef _Analysis_mode_impl_(mode) int \ + __GENSYM(__prefast_analysis_mode_flag); + +// The following are predefined: +// _Analysis_operator_new_throw_ (operator new throws) +// _Analysis_operator_new_null_ (operator new returns null) +// _Analysis_operator_new_never_fails_ (operator new never fails) +// + +// Function class annotations. 
+__ANNOTATION(SAL_functionClassNew(__In_impl_ char*);) +__PRIMOP(int, _In_function_class_(__In_impl_ char*);) +#define _In_function_class_(x) _In_function_class_(#x) + +#define _Function_class_(x) _SA_annotes1(SAL_functionClassNew, #x) + +/* + * interlocked operand used in interlocked instructions + */ +//#define _Interlocked_operand_ _Pre_ _SA_annotes0(SAL_interlocked) + +#define _Enum_is_bitflag_ _SA_annotes0(SAL_enumIsBitflag) +#define _Strict_type_match_ _SA_annotes0(SAL_strictType2) + +#define _Maybe_raises_SEH_exception_ _Pre_ _SA_annotes1(SAL_inTry,__yes) +#define _Raises_SEH_exception_ _Group_(_Maybe_raises_SEH_exception_ _Analysis_noreturn_) + +#ifdef __cplusplus // [ +} +#endif // ] From f76d081c8282c4b3471d9681c7a95b85028bac7c Mon Sep 17 00:00:00 2001 From: mastercoms Date: Sat, 4 Mar 2023 17:32:14 -0500 Subject: [PATCH 04/42] perf: fix code bugs with SSE2 compiler with vectorization, more math optimizations, and alignment, the compiler is more sensitive to some forms of code bugs --- src/engine/gl_lightmap.cpp | 8 ++++---- src/mathlib/mathlib_base.cpp | 2 +- src/public/mathlib/mathlib.h | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/engine/gl_lightmap.cpp b/src/engine/gl_lightmap.cpp index 5619120cd..4ee8b0a63 100644 --- a/src/engine/gl_lightmap.cpp +++ b/src/engine/gl_lightmap.cpp @@ -1339,7 +1339,7 @@ static void SortSurfacesByLightmapID( SurfaceHandle_t *pToSort, int iSurfaceCoun SurfaceHandle_t *pSortTemp = (SurfaceHandle_t *)stackalloc( sizeof( SurfaceHandle_t ) * iSurfaceCount ); //radix sort - for( int radix = 0; radix != 4; ++radix ) + for( int radix = 0; radix < 4; ++radix ) { //swap the inputs for the next pass { @@ -1350,7 +1350,7 @@ static void SortSurfacesByLightmapID( SurfaceHandle_t *pToSort, int iSurfaceCoun int iCounts[256] = { 0 }; int iBitOffset = radix * 8; - for( int i = 0; i != iSurfaceCount; ++i ) + for( int i = 0; i < iSurfaceCount; ++i ) { uint8 val = (materialSortInfoArray[MSurf_MaterialSortID( pSortTemp[i] )].lightmapPageID >> iBitOffset) & 0xFF; ++iCounts[val]; @@ -1358,12 +1358,12 @@ static void SortSurfacesByLightmapID( SurfaceHandle_t *pToSort, int iSurfaceCoun int iOffsetTable[256]; iOffsetTable[0] = 0; - for( int i = 0; i != 255; ++i ) + for( int i = 0; i < 255; ++i ) { iOffsetTable[i + 1] = iOffsetTable[i] + iCounts[i]; } - for( int i = 0; i != iSurfaceCount; ++i ) + for( int i = 0; i < iSurfaceCount; ++i ) { uint8 val = (materialSortInfoArray[MSurf_MaterialSortID( pSortTemp[i] )].lightmapPageID >> iBitOffset) & 0xFF; int iWriteIndex = iOffsetTable[val]; diff --git a/src/mathlib/mathlib_base.cpp b/src/mathlib/mathlib_base.cpp index 64069a7be..6503072cb 100644 --- a/src/mathlib/mathlib_base.cpp +++ b/src/mathlib/mathlib_base.cpp @@ -1488,7 +1488,7 @@ float SmoothCurve( float x ) inline float MovePeak( float x, float flPeakPos ) { // Todo: make this higher-order? 
- if( x < flPeakPos ) + if ( (x < flPeakPos || flPeakPos == 1) && flPeakPos != 0 ) return x * 0.5f / flPeakPos; else return 0.5 + 0.5 * (x - flPeakPos) / (1 - flPeakPos); diff --git a/src/public/mathlib/mathlib.h b/src/public/mathlib/mathlib.h index a6d302ff1..42317632b 100644 --- a/src/public/mathlib/mathlib.h +++ b/src/public/mathlib/mathlib.h @@ -1004,7 +1004,7 @@ void BuildGammaTable( float gamma, float texGamma, float brightness, int overbri // convert texture to linear 0..1 value inline float TexLightToLinear( int c, int exponent ) { - extern float power2_n[256]; + extern ALIGN128 float power2_n[256]; Assert( exponent >= -128 && exponent <= 127 ); return ( float )c * power2_n[exponent+128]; } From 0bd3752e3b26519eb2c40a577db96d0eefa62292 Mon Sep 17 00:00:00 2001 From: mastercoms Date: Sat, 4 Mar 2023 23:49:45 -0500 Subject: [PATCH 05/42] perf: implement SSE2 math using DXMath and CSGO backports * remove redirecting fast math calls as SSE2 is now guaranteed this is a pretty destructive change and makes the diff a bit ugly, but I'm sure it's for the best. * replace SinCos with the DirectXMath minimax polynomial approximation This could have regressions, since it's not as accurate as sin and cos calls, but I found it to be sufficient in accuracy from experience. * remove SSE2_SinCos (Extended precision modular arithmetic). It seems to share lineage with sse_mathfun (and thus cephes), and I don't see a reason to use it over the much simpler minimax polynomial approximation however, we could reimplement it using sse_mathfun and finally introduce that code! * implement new fast math calls I profiled these and found them to be faster, particularly for the animation code which was a big win, especially in large fights. * remove SinCos table since it was only being used by a single effect no point in using memory, potentially slowing down cache for a single effect's lookup table for cos, which probably doesn't even save time * remove SSE clamp, it generates heavier assembly * implement various mathlib calls with DXMath some of these aren't used, but I figured I'd implement them anyway in case some more backports/other introduce usage of them. * use SIMD versions of Angle functions (except for AngleVectors, I couldn't really find a perf win on that one.) * optimize SSE rounding functions More efficient way to get around the round-to-nearest-even on 0.5 also, one of them was missing mitigation for this behavior. * enable SIMD Quaternion (faster on SSE2) I implemented some functions with DirectXMath. Works well with hlmv/other tools, in-game. * backport FourQuaternions from CSGO * backport SlerpBonesSpeedy from CSGO (needed to adjust QuaternionAligned usage for this one) this is another big win in large fights, especially with the new math calls. 
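
For reference, this is roughly the shape of the minimax-polynomial SinCos this
change switches to. It is a minimal scalar sketch of the approach only; the
actual code routes through DirectXMath, and the helper name (FastSinCosSketch)
and exact constants here are illustrative rather than the engine's routine.

    #include <cmath>

    // Sketch: range-reduce to [-pi, pi], fold into [-pi/2, pi/2],
    // then evaluate short minimax polynomials for sin and cos.
    inline void FastSinCosSketch( float x, float *s, float *c )
    {
        const float kPi       = 3.14159265f;
        const float kTwoPi    = 6.28318531f;
        const float kInvTwoPi = 0.159154943f;
        const float kHalfPi   = 1.57079633f;

        // x = 2*pi*q + y, with y in [-pi, pi]
        float q = x * kInvTwoPi;
        q = ( x >= 0.0f ) ? floorf( q + 0.5f ) : ceilf( q - 0.5f );
        float y = x - kTwoPi * q;

        // fold into [-pi/2, pi/2]; sin keeps its sign, cos flips
        float sign = 1.0f;
        if ( y > kHalfPi )       { y = kPi - y;  sign = -1.0f; }
        else if ( y < -kHalfPi ) { y = -kPi - y; sign = -1.0f; }

        float y2 = y * y;

        // odd polynomial for sin, even polynomial for cos
        *s = ( ( ( ( ( -2.3889859e-08f * y2 + 2.7525562e-06f ) * y2
                     - 1.9840874e-04f ) * y2 + 8.3333310e-03f ) * y2
                 - 1.6666667e-01f ) * y2 + 1.0f ) * y;
        float p = ( ( ( ( -2.6051615e-07f * y2 + 2.4760495e-05f ) * y2
                      - 1.3888378e-03f ) * y2 + 4.1666638e-02f ) * y2
                    - 0.5f ) * y2 + 1.0f;
        *c = sign * p;
    }

Accuracy over the reduced range is slightly below the CRT sin/cos, which is
the regression risk called out above.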
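
On the rounding-function bullet: cvtps2dq / _mm_cvtps_epi32 converts using the
current MXCSR rounding mode, which defaults to round-to-nearest-even, so 0.5f
becomes 0 and 2.5f becomes 2. One common mitigation, shown here only as a
sketch (the helper name is made up and the patch's exact sequence may differ),
is to add a sign-matched 0.5 bias and truncate toward zero, giving
conventional round-half-away-from-zero:

    #include <emmintrin.h>  // SSE2

    // Sketch: round four floats to the nearest integer, halves away
    // from zero, independent of the round-to-nearest-even default.
    static inline __m128i RoundHalfAwayFromZero( __m128 x )
    {
        const __m128 signMask = _mm_set1_ps( -0.0f );   // 0x80000000 per lane
        const __m128 half     = _mm_set1_ps( 0.5f );
        // copy the sign of x onto 0.5, add, then truncate toward zero
        __m128 bias = _mm_or_ps( half, _mm_and_ps( x, signMask ) );
        return _mm_cvttps_epi32( _mm_add_ps( x, bias ) );
    }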
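
On the QuaternionAligned changes: aligned SSE loads (movaps / _mm_load_ps)
require 16-byte-aligned addresses, which is why the bone arrays in SetupBones
and RagdollInfo_t switch from Quaternion to QuaternionAligned. A stand-in
illustration of the difference (QuatSketch and the load helpers below are
hypothetical, not the engine's actual types):

    #include <emmintrin.h>

    struct alignas(16) QuatSketch { float x, y, z, w; };

    static inline __m128 LoadAligned( const QuatSketch &q )
    {
        return _mm_load_ps( &q.x );   // OK: 16-byte alignment guaranteed
    }

    static inline __m128 LoadUnaligned( const float *q )
    {
        return _mm_loadu_ps( q );     // works for any address, but slower here
    }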
--- src/engine/initmathlib.cpp | 45 +- src/game/client/c_baseanimating.cpp | 2 +- src/game/client/c_baseanimating.h | 2 +- src/game/client/c_smokestack.cpp | 2 +- src/game/server/baseanimating.cpp | 2 +- src/mathlib/3dnow.cpp | 197 -- src/mathlib/3dnow.h | 16 - src/mathlib/mathlib.vpc | 4 - src/mathlib/mathlib_base.cpp | 176 +- src/mathlib/sse.cpp | 1107 ------ src/mathlib/sse.h | 27 - src/mathlib/sseconst.cpp | 4 + src/public/bone_setup.cpp | 522 ++- src/public/mathlib/dxmath.h | 20 + src/public/mathlib/math_pfns.h | 56 +- src/public/mathlib/mathlib.h | 68 +- src/public/mathlib/ssemath.h | 90 +- src/public/mathlib/ssequaternion.h | 842 ++++- src/public/mathlib/vector.h | 52 +- src/public/mathlib/vector4d.h | 34 +- .../DirectXMath-dec2022/Inc/DirectXMath.h | 6 +- src/thirdparty/dotnetrt/sal.h | 3130 ++--------------- src/thirdparty/sse_mathfun/sse_mathfun.h | 710 ++++ src/tier2/util_init.cpp | 2 +- src/tier3/mdlutils.cpp | 4 +- 25 files changed, 2517 insertions(+), 4603 deletions(-) delete mode 100644 src/mathlib/3dnow.cpp delete mode 100644 src/mathlib/3dnow.h delete mode 100644 src/mathlib/sse.cpp delete mode 100644 src/mathlib/sse.h create mode 100644 src/public/mathlib/dxmath.h create mode 100644 src/thirdparty/sse_mathfun/sse_mathfun.h diff --git a/src/engine/initmathlib.cpp b/src/engine/initmathlib.cpp index 962830534..f5da3a7ec 100644 --- a/src/engine/initmathlib.cpp +++ b/src/engine/initmathlib.cpp @@ -15,53 +15,10 @@ // memdbgon must be the last include file in a .cpp file!!! #include "tier0/memdbgon.h" -static bool s_bAllow3DNow = true; -static bool s_bAllowSSE2 = true; - void InitMathlib( void ) { MathLib_Init( 2.2f, // v_gamma.GetFloat() 2.2f, // v_texgamma.GetFloat() 0.0f /*v_brightness.GetFloat() */, - 2.0f /*mat_overbright.GetInt() */, s_bAllow3DNow, true, s_bAllowSSE2, true ); -} - -/* -=============== -R_SSE2 -=============== -*/ -CON_COMMAND( r_sse2, "Enable/disable SSE2 code" ) -{ - if (args.ArgC() == 1) - { - s_bAllowSSE2 = true; - } - else - { - s_bAllowSSE2 = atoi( args[1] ) ? true : false; - } - - InitMathlib(); - ConMsg( "SSE2 code is %s\n", MathLib_SSE2Enabled() ? "enabled" : "disabled" ); + 2.0f /*mat_overbright.GetInt() */ ); } - -/* -=============== -R_3DNow -=============== -*/ -CON_COMMAND( r_3dnow, "Enable/disable 3DNow code" ) -{ - if (args.ArgC() == 1) - { - s_bAllow3DNow = true; - } - else - { - s_bAllow3DNow = atoi( args[1] ) ? true : false; - } - - InitMathlib(); - ConMsg( "3DNow code is %s\n", MathLib_3DNowEnabled() ? "enabled" : "disabled" ); -} \ No newline at end of file diff --git a/src/game/client/c_baseanimating.cpp b/src/game/client/c_baseanimating.cpp index 1dac8e4ff..c78f8582f 100644 --- a/src/game/client/c_baseanimating.cpp +++ b/src/game/client/c_baseanimating.cpp @@ -2950,7 +2950,7 @@ bool C_BaseAnimating::SetupBones( matrix3x4_t *pBoneToWorldOut, int nMaxBones, i } Vector pos[MAXSTUDIOBONES]; - Quaternion q[MAXSTUDIOBONES]; + QuaternionAligned q[MAXSTUDIOBONES]; #if defined(FP_EXCEPTIONS_ENABLED) || defined(DBGFLAG_ASSERT) // Having these uninitialized means that some bugs are very hard // to reproduce. A memset of 0xFF is a simple way of getting NaNs. 
diff --git a/src/game/client/c_baseanimating.h b/src/game/client/c_baseanimating.h index c6715fb3f..1c8a74b30 100644 --- a/src/game/client/c_baseanimating.h +++ b/src/game/client/c_baseanimating.h @@ -68,7 +68,7 @@ struct RagdollInfo_t float m_flSaveTime; int m_nNumBones; Vector m_rgBonePos[MAXSTUDIOBONES]; - Quaternion m_rgBoneQuaternion[MAXSTUDIOBONES]; + QuaternionAligned m_rgBoneQuaternion[MAXSTUDIOBONES]; }; diff --git a/src/game/client/c_smokestack.cpp b/src/game/client/c_smokestack.cpp index c2c0a9f0d..7d2eac62d 100644 --- a/src/game/client/c_smokestack.cpp +++ b/src/game/client/c_smokestack.cpp @@ -406,7 +406,7 @@ void C_SmokeStack::RenderParticles( CParticleRenderIterator *pIterator ) // makes it get translucent and fade out for a longer time. //float alpha = cosf( -M_PI_F + tLifetime * M_PI_F * 2.f ) * 0.5f + 0.5f; float tLifetime = pParticle->m_Lifetime * m_InvLifetime; - float alpha = TableCos( -M_PI_F + tLifetime * M_PI_F * 2.f ) * 0.5f + 0.5f; + float alpha = FastCos( -M_PI_F + tLifetime * M_PI_F * 2.f ) * 0.5f + 0.5f; if( tLifetime > 0.5f ) alpha *= alpha; diff --git a/src/game/server/baseanimating.cpp b/src/game/server/baseanimating.cpp index f321b4205..e8f035929 100644 --- a/src/game/server/baseanimating.cpp +++ b/src/game/server/baseanimating.cpp @@ -1798,7 +1798,7 @@ void CBaseAnimating::SetupBones( matrix3x4_t *pBoneToWorld, int boneMask ) AddEFlags( EFL_SETTING_UP_BONES ); Vector pos[MAXSTUDIOBONES]; - Quaternion q[MAXSTUDIOBONES]; + QuaternionAligned q[MAXSTUDIOBONES]; // adjust hit boxes based on IK driven offset Vector adjOrigin = GetAbsOrigin() + Vector( 0, 0, m_flEstIkOffset ); diff --git a/src/mathlib/3dnow.cpp b/src/mathlib/3dnow.cpp deleted file mode 100644 index db17c8c10..000000000 --- a/src/mathlib/3dnow.cpp +++ /dev/null @@ -1,197 +0,0 @@ -//========= Copyright Valve Corporation, All rights reserved. ============// -// -// Purpose: 3DNow Math primitives. -// -//=====================================================================================// - -#include -#include // Needed for FLT_EPSILON -#include "basetypes.h" -#include -#include "tier0/dbg.h" -#include "mathlib/mathlib.h" -#include "mathlib/amd3dx.h" -#include "mathlib/vector.h" - -// memdbgon must be the last include file in a .cpp file!!! -#include "tier0/memdbgon.h" - -#if !defined(COMPILER_MSVC64) && !defined(LINUX) -// Implement for 64-bit Windows if needed. -// Clang hits "fatal error: error in backend:" and other errors when trying -// to compile the inline assembly below. 3DNow support is highly unlikely to -// be useful/used, so it's not worth spending time on fixing. 
- -#pragma warning(disable:4244) // "conversion from 'const int' to 'float', possible loss of data" -#pragma warning(disable:4730) // "mixing _m64 and floating point expressions may result in incorrect code" - -//----------------------------------------------------------------------------- -// 3D Now Implementations of optimized routines: -//----------------------------------------------------------------------------- -float _3DNow_Sqrt(float x) -{ - Assert( s_bMathlibInitialized ); - float root = 0.f; -#ifdef _WIN32 - _asm - { - femms - movd mm0, x - PFRSQRT (mm1,mm0) - punpckldq mm0, mm0 - PFMUL (mm0, mm1) - movd root, mm0 - femms - } -#elif LINUX - __asm __volatile__( "femms" ); - __asm __volatile__ - ( - "pfrsqrt %y0, %y1 \n\t" - "punpckldq %y1, %y1 \n\t" - "pfmul %y1, %y0 \n\t" - : "=y" (root), "=y" (x) - :"0" (x) - ); - __asm __volatile__( "femms" ); -#else -#error -#endif - - return root; -} - -// NJS FIXME: Need to test Recripricol squareroot performance and accuraccy -// on AMD's before using the specialized instruction. -float _3DNow_RSqrt(float x) -{ - Assert( s_bMathlibInitialized ); - - return 1.f / _3DNow_Sqrt(x); -} - - -float FASTCALL _3DNow_VectorNormalize (Vector& vec) -{ - Assert( s_bMathlibInitialized ); - float *v = &vec[0]; - float radius = 0.f; - - if ( v[0] || v[1] || v[2] ) - { -#ifdef _WIN32 - _asm - { - mov eax, v - femms - movq mm0, QWORD PTR [eax] - movd mm1, DWORD PTR [eax+8] - movq mm2, mm0 - movq mm3, mm1 - PFMUL (mm0, mm0) - PFMUL (mm1, mm1) - PFACC (mm0, mm0) - PFADD (mm1, mm0) - PFRSQRT (mm0, mm1) - punpckldq mm1, mm1 - PFMUL (mm1, mm0) - PFMUL (mm2, mm0) - PFMUL (mm3, mm0) - movq QWORD PTR [eax], mm2 - movd DWORD PTR [eax+8], mm3 - movd radius, mm1 - femms - } -#elif LINUX - long long a,c; - int b,d; - memcpy(&a,&vec[0],sizeof(a)); - memcpy(&b,&vec[2],sizeof(b)); - memcpy(&c,&vec[0],sizeof(c)); - memcpy(&d,&vec[2],sizeof(d)); - - __asm __volatile__( "femms" ); - __asm __volatile__ - ( - "pfmul %y3, %y3\n\t" - "pfmul %y0, %y0 \n\t" - "pfacc %y3, %y3 \n\t" - "pfadd %y3, %y0 \n\t" - "pfrsqrt %y0, %y3 \n\t" - "punpckldq %y0, %y0 \n\t" - "pfmul %y3, %y0 \n\t" - "pfmul %y3, %y2 \n\t" - "pfmul %y3, %y1 \n\t" - : "=y" (radius), "=y" (c), "=y" (d) - : "y" (a), "0" (b), "1" (c), "2" (d) - ); - memcpy(&vec[0],&c,sizeof(c)); - memcpy(&vec[2],&d,sizeof(d)); - __asm __volatile__( "femms" ); - -#else -#error -#endif - } - return radius; -} - - -void FASTCALL _3DNow_VectorNormalizeFast (Vector& vec) -{ - _3DNow_VectorNormalize( vec ); -} - - -// JAY: This complains with the latest processor pack -#pragma warning(disable: 4730) - -float _3DNow_InvRSquared(const float* v) -{ - Assert( s_bMathlibInitialized ); - float r2 = 1.f; -#ifdef _WIN32 - _asm { // AMD 3DNow only routine - mov eax, v - femms - movq mm0, QWORD PTR [eax] - movd mm1, DWORD PTR [eax+8] - movd mm2, [r2] - PFMUL (mm0, mm0) - PFMUL (mm1, mm1) - PFACC (mm0, mm0) - PFADD (mm1, mm0) - PFMAX (mm1, mm2) - PFRCP (mm0, mm1) - movd [r2], mm0 - femms - } -#elif LINUX - long long a,c; - int b; - memcpy(&a,&v[0],sizeof(a)); - memcpy(&b,&v[2],sizeof(b)); - memcpy(&c,&v[0],sizeof(c)); - - __asm __volatile__( "femms" ); - __asm __volatile__ - ( - "PFMUL %y2, %y2 \n\t" - "PFMUL %y3, %y3 \n\t" - "PFACC %y2, %y2 \n\t" - "PFADD %y2, %y3 \n\t" - "PFMAX %y3, %y4 \n\t" - "PFRCP %y3, %y2 \n\t" - "movq %y2, %y0 \n\t" - : "=y" (r2) - : "0" (r2), "y" (a), "y" (b), "y" (c) - ); - __asm __volatile__( "femms" ); -#else -#error -#endif - - return r2; -} - -#endif // COMPILER_MSVC64 diff --git a/src/mathlib/3dnow.h b/src/mathlib/3dnow.h 
deleted file mode 100644 index c39b2ec5c..000000000 --- a/src/mathlib/3dnow.h +++ /dev/null @@ -1,16 +0,0 @@ -//========= Copyright Valve Corporation, All rights reserved. ============// -// -// Purpose: -// -//=====================================================================================// - -#ifndef _3DNOW_H -#define _3DNOW_H - -float _3DNow_Sqrt(float x); -float _3DNow_RSqrt(float x); -float FASTCALL _3DNow_VectorNormalize (Vector& vec); -void FASTCALL _3DNow_VectorNormalizeFast (Vector& vec); -float _3DNow_InvRSquared(const float* v); - -#endif // _3DNOW_H diff --git a/src/mathlib/mathlib.vpc b/src/mathlib/mathlib.vpc index 17021025f..ae3c6c757 100644 --- a/src/mathlib/mathlib.vpc +++ b/src/mathlib/mathlib.vpc @@ -27,9 +27,7 @@ $Project "mathlib" $File "powsse.cpp" $File "sparse_convolution_noise.cpp" $File "sseconst.cpp" - $File "sse.cpp" [$WINDOWS||$POSIX] $File "ssenoise.cpp" - $File "3dnow.cpp" [$WINDOWS||$LINUX] $File "anorms.cpp" $File "bumpvects.cpp" $File "IceKey.cpp" @@ -76,7 +74,5 @@ $Project "mathlib" $Folder "Header Files" { $File "noisedata.h" - $File "sse.h" [$WINDOWS||$POSIX] - $File "3dnow.h" [$WINDOWS||$LINUX] } } diff --git a/src/mathlib/mathlib_base.cpp b/src/mathlib/mathlib_base.cpp index 6503072cb..4fe9d8fae 100644 --- a/src/mathlib/mathlib_base.cpp +++ b/src/mathlib/mathlib_base.cpp @@ -21,13 +21,6 @@ #include "mathlib/mathlib.h" #include "mathlib/vector.h" -#if !defined( _X360 ) -#include "mathlib/amd3dx.h" -#ifndef OSX -#include "3dnow.h" -#endif -#include "sse.h" -#endif #include "mathlib/ssemath.h" #include "mathlib/ssequaternion.h" @@ -47,83 +40,6 @@ const QAngle vec3_angle(0,0,0); const Vector vec3_invalid( FLT_MAX, FLT_MAX, FLT_MAX ); const int nanmask = 255<<23; -//----------------------------------------------------------------------------- -// Standard C implementations of optimized routines: -//----------------------------------------------------------------------------- -float _sqrtf(float _X) -{ - Assert( s_bMathlibInitialized ); - return sqrtf(_X); -} - -float _rsqrtf(float x) -{ - Assert( s_bMathlibInitialized ); - - return 1.f / _sqrtf( x ); -} - -float FASTCALL _VectorNormalize (Vector& vec) -{ -#ifdef _VPROF_MATHLIB - VPROF_BUDGET( "_VectorNormalize", "Mathlib" ); -#endif - Assert( s_bMathlibInitialized ); - float radius = sqrtf(vec.x*vec.x + vec.y*vec.y + vec.z*vec.z); - - // FLT_EPSILON is added to the radius to eliminate the possibility of divide by zero. - float iradius = 1.f / ( radius + FLT_EPSILON ); - - vec.x *= iradius; - vec.y *= iradius; - vec.z *= iradius; - - return radius; -} - -// TODO: Add fast C VectorNormalizeFast. -// Perhaps use approximate rsqrt trick, if the accuracy isn't too bad. -void FASTCALL _VectorNormalizeFast (Vector& vec) -{ - Assert( s_bMathlibInitialized ); - - // FLT_EPSILON is added to the radius to eliminate the possibility of divide by zero. - float iradius = 1.f / ( sqrtf(vec.x*vec.x + vec.y*vec.y + vec.z*vec.z) + FLT_EPSILON ); - - vec.x *= iradius; - vec.y *= iradius; - vec.z *= iradius; - -} - -float _InvRSquared(const float* v) -{ - Assert( s_bMathlibInitialized ); - float r2 = DotProduct(v, v); - return r2 < 1.f ? 
1.f : 1/r2; -} - -//----------------------------------------------------------------------------- -// Function pointers selecting the appropriate implementation -//----------------------------------------------------------------------------- -float (*pfSqrt)(float x) = _sqrtf; -float (*pfRSqrt)(float x) = _rsqrtf; -float (*pfRSqrtFast)(float x) = _rsqrtf; -float (FASTCALL *pfVectorNormalize)(Vector& v) = _VectorNormalize; -void (FASTCALL *pfVectorNormalizeFast)(Vector& v) = _VectorNormalizeFast; -float (*pfInvRSquared)(const float* v) = _InvRSquared; -void (*pfFastSinCos)(float x, float* s, float* c) = SinCos; -float (*pfFastCos)(float x) = cosf; - -float SinCosTable[SIN_TABLE_SIZE]; -void InitSinCosTable() -{ - for( int i = 0; i < SIN_TABLE_SIZE; i++ ) - { - SinCosTable[i] = sin(i * 2.0 * M_PI / SIN_TABLE_SIZE); - } -} - qboolean VectorsEqual( const float *v1, const float *v2 ) { Assert( s_bMathlibInitialized ); @@ -1200,7 +1116,7 @@ void AngleMatrix( const QAngle &angles, matrix3x4_t& matrix ) float sr, sp, sy, cr, cp, cy; -#ifdef _X360 +#if defined(_X360) || USE_DXMATH fltx4 radians, scale, sine, cosine; radians = LoadUnaligned3SIMD( angles.Base() ); scale = ReplicateX4( M_PI_F / 180.f ); @@ -2024,7 +1940,7 @@ void AngleQuaternion( const RadianEuler &angles, Quaternion &outQuat ) float sr, sp, sy, cr, cp, cy; -#ifdef _X360 +#if defined(_X360) || USE_DXMATH fltx4 radians, scale, sine, cosine; radians = LoadUnaligned3SIMD( &angles.x ); scale = ReplicateX4( 0.5f ); @@ -2068,7 +1984,7 @@ void AngleQuaternion( const QAngle &angles, Quaternion &outQuat ) float sr, sp, sy, cr, cp, cy; -#ifdef _X360 +#if defined(_X360) fltx4 radians, scale, sine, cosine; radians = LoadUnaligned3SIMD( angles.Base() ); scale = ReplicateX4( 0.5f * M_PI_F / 180.f ); @@ -3317,92 +3233,14 @@ void MathLib_Init( float gamma, float texGamma, float brightness, int overbright // FIXME: Hook SSE into VectorAligned + Vector4DAligned #if !defined( _X360 ) - // Grab the processor information: - const CPUInformation& pi = *GetCPUInformation(); - - // Select the default generic routines. - pfSqrt = _sqrtf; - pfRSqrt = _rsqrtf; - pfRSqrtFast = _rsqrtf; - pfVectorNormalize = _VectorNormalize; - pfVectorNormalizeFast = _VectorNormalizeFast; - pfInvRSquared = _InvRSquared; - pfFastSinCos = SinCos; - pfFastCos = cosf; - - if ( bAllowMMX && pi.m_bMMX ) - { - // Select the MMX specific routines if available - // (MMX routines were used by SW span fillers - not currently used for HW) - s_bMMXEnabled = true; - } - else - { - s_bMMXEnabled = false; - } - - // SSE Generally performs better than 3DNow when present, so this is placed - // first to allow SSE to override these settings. -#if !defined( OSX ) && !defined( PLATFORM_WINDOWS_PC64 ) && !defined(LINUX) - if ( bAllow3DNow && pi.m_b3DNow ) - { - s_b3DNowEnabled = true; - - // Select the 3DNow specific routines if available; - pfVectorNormalize = _3DNow_VectorNormalize; - pfVectorNormalizeFast = _3DNow_VectorNormalizeFast; - pfInvRSquared = _3DNow_InvRSquared; - pfSqrt = _3DNow_Sqrt; - pfRSqrt = _3DNow_RSqrt; - pfRSqrtFast = _3DNow_RSqrt; - } - else -#endif - { - s_b3DNowEnabled = false; - } - - if ( bAllowSSE && pi.m_bSSE ) - { - s_bSSEEnabled = true; - -#ifndef PLATFORM_WINDOWS_PC64 - // These are not yet available. 
- // Select the SSE specific routines if available - pfVectorNormalize = _VectorNormalize; - pfVectorNormalizeFast = _SSE_VectorNormalizeFast; - pfInvRSquared = _SSE_InvRSquared; - pfSqrt = _SSE_Sqrt; - pfRSqrt = _SSE_RSqrtAccurate; - pfRSqrtFast = _SSE_RSqrtFast; -#endif -#ifdef PLATFORM_WINDOWS_PC32 - pfFastSinCos = _SSE_SinCos; - pfFastCos = _SSE_cos; -#endif - } - else - { - s_bSSEEnabled = false; - } - - if ( bAllowSSE2 && pi.m_bSSE2 ) - { - s_bSSE2Enabled = true; -#ifdef PLATFORM_WINDOWS_PC32 - pfFastSinCos = _SSE2_SinCos; - pfFastCos = _SSE2_cos; -#endif - } - else - { - s_bSSE2Enabled = false; - } + s_b3DNowEnabled = false; + s_bMMXEnabled = false; + s_bSSEEnabled = true; + s_bSSE2Enabled = true; #endif // !_X360 s_bMathlibInitialized = true; - InitSinCosTable(); BuildGammaTable( gamma, texGamma, brightness, overbright ); } diff --git a/src/mathlib/sse.cpp b/src/mathlib/sse.cpp deleted file mode 100644 index 018a7a5b9..000000000 --- a/src/mathlib/sse.cpp +++ /dev/null @@ -1,1107 +0,0 @@ -//========= Copyright Valve Corporation, All rights reserved. ============// -// -// Purpose: SSE Math primitives. -// -//=====================================================================================// - -#include -#include // Needed for FLT_EPSILON -#include "basetypes.h" -#include -#include "tier0/dbg.h" -#include "mathlib/mathlib.h" -#include "mathlib/vector.h" -#include "sse.h" - -// memdbgon must be the last include file in a .cpp file!!! -#include "tier0/memdbgon.h" - -#ifndef COMPILER_MSVC64 -// Implement for 64-bit Windows if needed. - -static const uint32 _sincos_masks[] = { (uint32)0x0, (uint32)~0x0 }; -static const uint32 _sincos_inv_masks[] = { (uint32)~0x0, (uint32)0x0 }; - -//----------------------------------------------------------------------------- -// Macros and constants required by some of the SSE assembly: -//----------------------------------------------------------------------------- - -#ifdef _WIN32 - #define _PS_EXTERN_CONST(Name, Val) \ - const __declspec(align(16)) float _ps_##Name[4] = { Val, Val, Val, Val } - - #define _PS_EXTERN_CONST_TYPE(Name, Type, Val) \ - const __declspec(align(16)) Type _ps_##Name[4] = { Val, Val, Val, Val }; \ - - #define _EPI32_CONST(Name, Val) \ - static const __declspec(align(16)) __int32 _epi32_##Name[4] = { Val, Val, Val, Val } - - #define _PS_CONST(Name, Val) \ - static const __declspec(align(16)) float _ps_##Name[4] = { Val, Val, Val, Val } -#elif POSIX - #define _PS_EXTERN_CONST(Name, Val) \ - const float _ps_##Name[4] __attribute__((aligned(16))) = { Val, Val, Val, Val } - - #define _PS_EXTERN_CONST_TYPE(Name, Type, Val) \ - const Type _ps_##Name[4] __attribute__((aligned(16))) = { Val, Val, Val, Val }; \ - - #define _EPI32_CONST(Name, Val) \ - static const int32 _epi32_##Name[4] __attribute__((aligned(16))) = { Val, Val, Val, Val } - - #define _PS_CONST(Name, Val) \ - static const float _ps_##Name[4] __attribute__((aligned(16))) = { Val, Val, Val, Val } -#endif - -_PS_EXTERN_CONST(am_0, 0.0f); -_PS_EXTERN_CONST(am_1, 1.0f); -_PS_EXTERN_CONST(am_m1, -1.0f); -_PS_EXTERN_CONST(am_0p5, 0.5f); -_PS_EXTERN_CONST(am_1p5, 1.5f); -_PS_EXTERN_CONST(am_pi, (float)M_PI); -_PS_EXTERN_CONST(am_pi_o_2, (float)(M_PI / 2.0)); -_PS_EXTERN_CONST(am_2_o_pi, (float)(2.0 / M_PI)); -_PS_EXTERN_CONST(am_pi_o_4, (float)(M_PI / 4.0)); -_PS_EXTERN_CONST(am_4_o_pi, (float)(4.0 / M_PI)); -_PS_EXTERN_CONST_TYPE(am_sign_mask, uint32, 0x80000000); -_PS_EXTERN_CONST_TYPE(am_inv_sign_mask, uint32, ~0x80000000); -_PS_EXTERN_CONST_TYPE(am_min_norm_pos,uint32, 
0x00800000); -_PS_EXTERN_CONST_TYPE(am_mant_mask, uint32, 0x7f800000); -_PS_EXTERN_CONST_TYPE(am_inv_mant_mask, int32, ~0x7f800000); - -_EPI32_CONST(1, 1); -_EPI32_CONST(2, 2); - -_PS_CONST(sincos_p0, 0.15707963267948963959e1f); -_PS_CONST(sincos_p1, -0.64596409750621907082e0f); -_PS_CONST(sincos_p2, 0.7969262624561800806e-1f); -_PS_CONST(sincos_p3, -0.468175413106023168e-2f); - -#ifdef PFN_VECTORMA -void __cdecl _SSE_VectorMA( const float *start, float scale, const float *direction, float *dest ); -#endif - -//----------------------------------------------------------------------------- -// SSE implementations of optimized routines: -//----------------------------------------------------------------------------- -float _SSE_Sqrt(float x) -{ - Assert( s_bMathlibInitialized ); - float root = 0.f; -#ifdef _WIN32 - _asm - { - sqrtss xmm0, x - movss root, xmm0 - } -#elif POSIX - _mm_store_ss( &root, _mm_sqrt_ss( _mm_load_ss( &x ) ) ); -#endif - return root; -} - -// Single iteration NewtonRaphson reciprocal square root: -// 0.5 * rsqrtps * (3 - x * rsqrtps(x) * rsqrtps(x)) -// Very low error, and fine to use in place of 1.f / sqrtf(x). -#if 0 -float _SSE_RSqrtAccurate(float x) -{ - Assert( s_bMathlibInitialized ); - - float rroot; - _asm - { - rsqrtss xmm0, x - movss rroot, xmm0 - } - - return (0.5f * rroot) * (3.f - (x * rroot) * rroot); -} -#else - -#ifdef POSIX -const __m128 f3 = _mm_set_ss(3.0f); // 3 as SSE value -const __m128 f05 = _mm_set_ss(0.5f); // 0.5 as SSE value -#endif - -// Intel / Kipps SSE RSqrt. Significantly faster than above. -float _SSE_RSqrtAccurate(float a) -{ - -#ifdef _WIN32 - float x; - float half = 0.5f; - float three = 3.f; - - __asm - { - movss xmm3, a; - movss xmm1, half; - movss xmm2, three; - rsqrtss xmm0, xmm3; - - mulss xmm3, xmm0; - mulss xmm1, xmm0; - mulss xmm3, xmm0; - subss xmm2, xmm3; - mulss xmm1, xmm2; - - movss x, xmm1; - } - - return x; -#elif POSIX - __m128 xx = _mm_load_ss( &a ); - __m128 xr = _mm_rsqrt_ss( xx ); - __m128 xt; - - xt = _mm_mul_ss( xr, xr ); - xt = _mm_mul_ss( xt, xx ); - xt = _mm_sub_ss( f3, xt ); - xt = _mm_mul_ss( xt, f05 ); - xr = _mm_mul_ss( xr, xt ); - - _mm_store_ss( &a, xr ); - return a; -#else - #error "Not Implemented" -#endif - -} -#endif - -// Simple SSE rsqrt. Usually accurate to around 6 (relative) decimal places -// or so, so ok for closed transforms. (ie, computing lighting normals) -float _SSE_RSqrtFast(float x) -{ - Assert( s_bMathlibInitialized ); - - float rroot; -#ifdef _WIN32 - _asm - { - rsqrtss xmm0, x - movss rroot, xmm0 - } -#elif POSIX - __asm__ __volatile__( "rsqrtss %0, %1" : "=x" (rroot) : "x" (x) ); -#else -#error -#endif - - return rroot; -} - -float FASTCALL _SSE_VectorNormalize (Vector& vec) -{ - Assert( s_bMathlibInitialized ); - - // NOTE: This is necessary to prevent an memory overwrite... - // sice vec only has 3 floats, we can't "movaps" directly into it. -#ifdef _WIN32 - __declspec(align(16)) float result[4]; -#elif POSIX - float result[4] __attribute__((aligned(16))); -#endif - - float *v = &vec[0]; -#ifdef _WIN32 - float *r = &result[0]; -#endif - - float radius = 0.f; - // Blah, get rid of these comparisons ... in reality, if you have all 3 as zero, it shouldn't - // be much of a performance win, considering you will very likely miss 3 branch predicts in a row. 
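For readability, here is a rough intrinsics transcription of what the inline assembly below computes (a sketch only; the function name is illustrative and not part of this file):

    #include <xmmintrin.h>

    static float NormalizeSketch( float v[3] )
    {
        __m128 x   = _mm_set_ps( 0.0f, v[2], v[1], v[0] );            // vz, vy, vx in lanes 2..0
        __m128 sq  = _mm_mul_ps( x, x );
        float len2 = _mm_cvtss_f32( sq )                               // vx*vx
                   + _mm_cvtss_f32( _mm_shuffle_ps( sq, sq, 1 ) )      // + vy*vy
                   + _mm_cvtss_f32( _mm_movehl_ps( sq, sq ) );         // + vz*vz
        float len  = _mm_cvtss_f32( _mm_sqrt_ss( _mm_set_ss( len2 ) ) );
        float inv  = _mm_cvtss_f32( _mm_rcp_ss( _mm_set_ss( len ) ) ); // rcpss: approximate 1/len
        v[0] *= inv; v[1] *= inv; v[2] *= inv;
        return len;
    }

Like the rcpss in the assembly, the approximate reciprocal trades a little precision in the scale for speed, while the returned length itself comes from a full sqrtss.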
- if ( v[0] || v[1] || v[2] ) - { -#ifdef _WIN32 - _asm - { - mov eax, v - mov edx, r -#ifdef ALIGNED_VECTOR - movaps xmm4, [eax] // r4 = vx, vy, vz, X - movaps xmm1, xmm4 // r1 = r4 -#else - movups xmm4, [eax] // r4 = vx, vy, vz, X - movaps xmm1, xmm4 // r1 = r4 -#endif - mulps xmm1, xmm4 // r1 = vx * vx, vy * vy, vz * vz, X - movhlps xmm3, xmm1 // r3 = vz * vz, X, X, X - movaps xmm2, xmm1 // r2 = r1 - shufps xmm2, xmm2, 1 // r2 = vy * vy, X, X, X - addss xmm1, xmm2 // r1 = (vx * vx) + (vy * vy), X, X, X - addss xmm1, xmm3 // r1 = (vx * vx) + (vy * vy) + (vz * vz), X, X, X - sqrtss xmm1, xmm1 // r1 = sqrt((vx * vx) + (vy * vy) + (vz * vz)), X, X, X - movss radius, xmm1 // radius = sqrt((vx * vx) + (vy * vy) + (vz * vz)) - rcpss xmm1, xmm1 // r1 = 1/radius, X, X, X - shufps xmm1, xmm1, 0 // r1 = 1/radius, 1/radius, 1/radius, X - mulps xmm4, xmm1 // r4 = vx * 1/radius, vy * 1/radius, vz * 1/radius, X - movaps [edx], xmm4 // v = vx * 1/radius, vy * 1/radius, vz * 1/radius, X - } -#elif POSIX - __asm__ __volatile__( -#ifdef ALIGNED_VECTOR - "movaps %2, %%xmm4 \n\t" - "movaps %%xmm4, %%xmm1 \n\t" -#else - "movups %2, %%xmm4 \n\t" - "movaps %%xmm4, %%xmm1 \n\t" -#endif - "mulps %%xmm4, %%xmm1 \n\t" - "movhlps %%xmm1, %%xmm3 \n\t" - "movaps %%xmm1, %%xmm2 \n\t" - "shufps $1, %%xmm2, %%xmm2 \n\t" - "addss %%xmm2, %%xmm1 \n\t" - "addss %%xmm3, %%xmm1 \n\t" - "sqrtss %%xmm1, %%xmm1 \n\t" - "movss %%xmm1, %0 \n\t" - "rcpss %%xmm1, %%xmm1 \n\t" - "shufps $0, %%xmm1, %%xmm1 \n\t" - "mulps %%xmm1, %%xmm4 \n\t" - "movaps %%xmm4, %1 \n\t" - : "=m" (radius), "=m" (result) - : "m" (*v) - : "xmm1", "xmm2", "xmm3", "xmm4" - ); -#else - #error "Not Implemented" -#endif - vec.x = result[0]; - vec.y = result[1]; - vec.z = result[2]; - - } - - return radius; -} - -void FASTCALL _SSE_VectorNormalizeFast (Vector& vec) -{ - float ool = _SSE_RSqrtAccurate( FLT_EPSILON + vec.x * vec.x + vec.y * vec.y + vec.z * vec.z ); - - vec.x *= ool; - vec.y *= ool; - vec.z *= ool; -} - -float _SSE_InvRSquared(const float* v) -{ - float inv_r2 = 1.f; -#ifdef _WIN32 - _asm { // Intel SSE only routine - mov eax, v - movss xmm5, inv_r2 // x5 = 1.0, 0, 0, 0 -#ifdef ALIGNED_VECTOR - movaps xmm4, [eax] // x4 = vx, vy, vz, X -#else - movups xmm4, [eax] // x4 = vx, vy, vz, X -#endif - movaps xmm1, xmm4 // x1 = x4 - mulps xmm1, xmm4 // x1 = vx * vx, vy * vy, vz * vz, X - movhlps xmm3, xmm1 // x3 = vz * vz, X, X, X - movaps xmm2, xmm1 // x2 = x1 - shufps xmm2, xmm2, 1 // x2 = vy * vy, X, X, X - addss xmm1, xmm2 // x1 = (vx * vx) + (vy * vy), X, X, X - addss xmm1, xmm3 // x1 = (vx * vx) + (vy * vy) + (vz * vz), X, X, X - maxss xmm1, xmm5 // x1 = max( 1.0, x1 ) - rcpss xmm0, xmm1 // x0 = 1 / max( 1.0, x1 ) - movss inv_r2, xmm0 // inv_r2 = x0 - } -#elif POSIX - __asm__ __volatile__( - "movss %0, %%xmm5 \n\t" -#ifdef ALIGNED_VECTOR - "movaps %1, %%xmm4 \n\t" -#else - "movups %1, %%xmm4 \n\t" -#endif - "movaps %%xmm4, %%xmm1 \n\t" - "mulps %%xmm4, %%xmm1 \n\t" - "movhlps %%xmm1, %%xmm3 \n\t" - "movaps %%xmm1, %%xmm2 \n\t" - "shufps $1, %%xmm2, %%xmm2 \n\t" - "addss %%xmm2, %%xmm1 \n\t" - "addss %%xmm3, %%xmm1 \n\t" - "maxss %%xmm5, %%xmm1 \n\t" - "rcpss %%xmm1, %%xmm0 \n\t" - "movss %%xmm0, %0 \n\t" - : "+m" (inv_r2) - : "m" (*v) - : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); -#else - #error "Not Implemented" -#endif - - return inv_r2; -} - - -#ifdef POSIX -// #define _PS_CONST(Name, Val) static const ALIGN16 float _ps_##Name[4] ALIGN16_POST = { Val, Val, Val, Val } -#define _PS_CONST_TYPE(Name, Type, Val) static const ALIGN16 Type 
_ps_##Name[4] ALIGN16_POST = { Val, Val, Val, Val } - -_PS_CONST_TYPE(sign_mask, int, (int)0x80000000); -_PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000); - - -#define _PI32_CONST(Name, Val) static const ALIGN16 int _pi32_##Name[4] ALIGN16_POST = { Val, Val, Val, Val } - -_PI32_CONST(1, 1); -_PI32_CONST(inv1, ~1); -_PI32_CONST(2, 2); -_PI32_CONST(4, 4); -_PI32_CONST(0x7f, 0x7f); -_PS_CONST(1 , 1.0f); -_PS_CONST(0p5, 0.5f); - -_PS_CONST(minus_cephes_DP1, -0.78515625); -_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4); -_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8); -_PS_CONST(sincof_p0, -1.9515295891E-4); -_PS_CONST(sincof_p1, 8.3321608736E-3); -_PS_CONST(sincof_p2, -1.6666654611E-1); -_PS_CONST(coscof_p0, 2.443315711809948E-005); -_PS_CONST(coscof_p1, -1.388731625493765E-003); -_PS_CONST(coscof_p2, 4.166664568298827E-002); -_PS_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI - -typedef union xmm_mm_union { - __m128 xmm; - __m64 mm[2]; -} xmm_mm_union; - -#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) { xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm; } - -typedef __m128 v4sf; // vector of 4 float (sse1) -typedef __m64 v2si; // vector of 2 int (mmx) - -#endif - -void _SSE_SinCos(float x, float* s, float* c) -{ -#ifdef _WIN32 - float t4, t8, t12; - - __asm - { - movss xmm0, x - movss t12, xmm0 - movss xmm1, _ps_am_inv_sign_mask - mov eax, t12 - mulss xmm0, _ps_am_2_o_pi - andps xmm0, xmm1 - and eax, 0x80000000 - - cvttss2si edx, xmm0 - mov ecx, edx - mov t12, esi - mov esi, edx - add edx, 0x1 - shl ecx, (31 - 1) - shl edx, (31 - 1) - - movss xmm4, _ps_am_1 - cvtsi2ss xmm3, esi - mov t8, eax - and esi, 0x1 - - subss xmm0, xmm3 - movss xmm3, _sincos_inv_masks[esi * 4] - minss xmm0, xmm4 - - subss xmm4, xmm0 - - movss xmm6, xmm4 - andps xmm4, xmm3 - and ecx, 0x80000000 - movss xmm2, xmm3 - andnps xmm3, xmm0 - and edx, 0x80000000 - movss xmm7, t8 - andps xmm0, xmm2 - mov t8, ecx - mov t4, edx - orps xmm4, xmm3 - - mov eax, s //mov eax, [esp + 4 + 16] - mov edx, c //mov edx, [esp + 4 + 16 + 4] - - andnps xmm2, xmm6 - orps xmm0, xmm2 - - movss xmm2, t8 - movss xmm1, xmm0 - movss xmm5, xmm4 - xorps xmm7, xmm2 - movss xmm3, _ps_sincos_p3 - mulss xmm0, xmm0 - mulss xmm4, xmm4 - movss xmm2, xmm0 - movss xmm6, xmm4 - orps xmm1, xmm7 - movss xmm7, _ps_sincos_p2 - mulss xmm0, xmm3 - mulss xmm4, xmm3 - movss xmm3, _ps_sincos_p1 - addss xmm0, xmm7 - addss xmm4, xmm7 - movss xmm7, _ps_sincos_p0 - mulss xmm0, xmm2 - mulss xmm4, xmm6 - addss xmm0, xmm3 - addss xmm4, xmm3 - movss xmm3, t4 - mulss xmm0, xmm2 - mulss xmm4, xmm6 - orps xmm5, xmm3 - mov esi, t12 - addss xmm0, xmm7 - addss xmm4, xmm7 - mulss xmm0, xmm1 - mulss xmm4, xmm5 - - // use full stores since caller might reload with full loads - movss [eax], xmm0 - movss [edx], xmm4 - } -#elif POSIX - - Assert( "Needs testing, verify impl!\n" ); - - v4sf xx = _mm_load_ss( &x ); - - v4sf xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y; - v2si mm0, mm1, mm2, mm3, mm4, mm5; - sign_bit_sin = xx; - /* take the absolute value */ - xx = _mm_and_ps(xx, *(v4sf*)_ps_inv_sign_mask); - /* extract the sign bit (upper one) */ - sign_bit_sin = _mm_and_ps(sign_bit_sin, *(v4sf*)_ps_sign_mask); - - /* scale by 4/Pi */ - y = _mm_mul_ps(xx, *(v4sf*)_ps_cephes_FOPI); - - /* store the integer part of y in mm2:mm3 */ - xmm3 = _mm_movehl_ps(xmm3, y); - mm2 = _mm_cvttps_pi32(y); - mm3 = _mm_cvttps_pi32(xmm3); - - /* j=(j+1) & (~1) (see the cephes sources) */ - mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1); - mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1); - mm2 = 
_mm_and_si64(mm2, *(v2si*)_pi32_inv1); - mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1); - - y = _mm_cvtpi32x2_ps(mm2, mm3); - - mm4 = mm2; - mm5 = mm3; - - /* get the swap sign flag for the sine */ - mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4); - mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4); - mm0 = _mm_slli_pi32(mm0, 29); - mm1 = _mm_slli_pi32(mm1, 29); - v4sf swap_sign_bit_sin; - COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin); - - /* get the polynom selection mask for the sine */ - - mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2); - mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2); - mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64()); - mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64()); - v4sf poly_mask; - COPY_MM_TO_XMM(mm2, mm3, poly_mask); - - /* The magic pass: "Extended precision modular arithmetic" - x = ((x - y * DP1) - y * DP2) - y * DP3; */ - xmm1 = *(v4sf*)_ps_minus_cephes_DP1; - xmm2 = *(v4sf*)_ps_minus_cephes_DP2; - xmm3 = *(v4sf*)_ps_minus_cephes_DP3; - xmm1 = _mm_mul_ps(y, xmm1); - xmm2 = _mm_mul_ps(y, xmm2); - xmm3 = _mm_mul_ps(y, xmm3); - xx = _mm_add_ps(xx, xmm1); - xx = _mm_add_ps(xx, xmm2); - xx = _mm_add_ps(xx, xmm3); - - /* get the sign flag for the cosine */ - mm4 = _mm_sub_pi32(mm4, *(v2si*)_pi32_2); - mm5 = _mm_sub_pi32(mm5, *(v2si*)_pi32_2); - mm4 = _mm_andnot_si64(mm4, *(v2si*)_pi32_4); - mm5 = _mm_andnot_si64(mm5, *(v2si*)_pi32_4); - mm4 = _mm_slli_pi32(mm4, 29); - mm5 = _mm_slli_pi32(mm5, 29); - v4sf sign_bit_cos; - COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos); - _mm_empty(); /* good-bye mmx */ - - sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin); - - - /* Evaluate the first polynom (0 <= x <= Pi/4) */ - v4sf z = _mm_mul_ps(xx,xx); - y = *(v4sf*)_ps_coscof_p0; - - y = _mm_mul_ps(y, z); - y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1); - y = _mm_mul_ps(y, z); - y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2); - y = _mm_mul_ps(y, z); - y = _mm_mul_ps(y, z); - v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5); - y = _mm_sub_ps(y, tmp); - y = _mm_add_ps(y, *(v4sf*)_ps_1); - - /* Evaluate the second polynom (Pi/4 <= x <= 0) */ - - v4sf y2 = *(v4sf*)_ps_sincof_p0; - y2 = _mm_mul_ps(y2, z); - y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1); - y2 = _mm_mul_ps(y2, z); - y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2); - y2 = _mm_mul_ps(y2, z); - y2 = _mm_mul_ps(y2, xx); - y2 = _mm_add_ps(y2, xx); - - /* select the correct result from the two polynoms */ - xmm3 = poly_mask; - v4sf ysin2 = _mm_and_ps(xmm3, y2); - v4sf ysin1 = _mm_andnot_ps(xmm3, y); - y2 = _mm_sub_ps(y2,ysin2); - y = _mm_sub_ps(y, ysin1); - - xmm1 = _mm_add_ps(ysin1,ysin2); - xmm2 = _mm_add_ps(y,y2); - - /* update the sign */ - _mm_store_ss( s, _mm_xor_ps(xmm1, sign_bit_sin) ); - _mm_store_ss( c, _mm_xor_ps(xmm2, sign_bit_cos) ); - -#else - #error "Not Implemented" -#endif -} - -float _SSE_cos( float x ) -{ -#ifdef _WIN32 - float temp; - __asm - { - movss xmm0, x - movss xmm1, _ps_am_inv_sign_mask - andps xmm0, xmm1 - addss xmm0, _ps_am_pi_o_2 - mulss xmm0, _ps_am_2_o_pi - - cvttss2si ecx, xmm0 - movss xmm5, _ps_am_1 - mov edx, ecx - shl edx, (31 - 1) - cvtsi2ss xmm1, ecx - and edx, 0x80000000 - and ecx, 0x1 - - subss xmm0, xmm1 - movss xmm6, _sincos_masks[ecx * 4] - minss xmm0, xmm5 - - movss xmm1, _ps_sincos_p3 - subss xmm5, xmm0 - - andps xmm5, xmm6 - movss xmm7, _ps_sincos_p2 - andnps xmm6, xmm0 - mov temp, edx - orps xmm5, xmm6 - movss xmm0, xmm5 - - mulss xmm5, xmm5 - movss xmm4, _ps_sincos_p1 - movss xmm2, xmm5 - mulss xmm5, xmm1 - movss xmm1, _ps_sincos_p0 - addss xmm5, xmm7 - mulss xmm5, xmm2 - movss xmm3, temp - addss xmm5, xmm4 - mulss xmm5, xmm2 - orps 
xmm0, xmm3 - addss xmm5, xmm1 - mulss xmm0, xmm5 - - movss x, xmm0 - - } -#elif POSIX - - Assert( "Needs testing, verify impl!\n" ); - - v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, y; - v2si mm0, mm1, mm2, mm3; - /* take the absolute value */ - v4sf xx = _mm_load_ss( &x ); - - xx = _mm_and_ps(xx, *(v4sf*)_ps_inv_sign_mask); - - /* scale by 4/Pi */ - y = _mm_mul_ps(xx, *(v4sf*)_ps_cephes_FOPI); - - /* store the integer part of y in mm0:mm1 */ - xmm2 = _mm_movehl_ps(xmm2, y); - mm2 = _mm_cvttps_pi32(y); - mm3 = _mm_cvttps_pi32(xmm2); - - /* j=(j+1) & (~1) (see the cephes sources) */ - mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1); - mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1); - mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1); - mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1); - - y = _mm_cvtpi32x2_ps(mm2, mm3); - - - mm2 = _mm_sub_pi32(mm2, *(v2si*)_pi32_2); - mm3 = _mm_sub_pi32(mm3, *(v2si*)_pi32_2); - - /* get the swap sign flag in mm0:mm1 and the - polynom selection mask in mm2:mm3 */ - - mm0 = _mm_andnot_si64(mm2, *(v2si*)_pi32_4); - mm1 = _mm_andnot_si64(mm3, *(v2si*)_pi32_4); - mm0 = _mm_slli_pi32(mm0, 29); - mm1 = _mm_slli_pi32(mm1, 29); - - mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2); - mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2); - - mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64()); - mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64()); - - v4sf sign_bit, poly_mask; - COPY_MM_TO_XMM(mm0, mm1, sign_bit); - COPY_MM_TO_XMM(mm2, mm3, poly_mask); - _mm_empty(); /* good-bye mmx */ - - /* The magic pass: "Extended precision modular arithmetic" - x = ((x - y * DP1) - y * DP2) - y * DP3; */ - xmm1 = *(v4sf*)_ps_minus_cephes_DP1; - xmm2 = *(v4sf*)_ps_minus_cephes_DP2; - xmm3 = *(v4sf*)_ps_minus_cephes_DP3; - xmm1 = _mm_mul_ps(y, xmm1); - xmm2 = _mm_mul_ps(y, xmm2); - xmm3 = _mm_mul_ps(y, xmm3); - xx = _mm_add_ps(xx, xmm1); - xx = _mm_add_ps(xx, xmm2); - xx = _mm_add_ps(xx, xmm3); - - /* Evaluate the first polynom (0 <= x <= Pi/4) */ - y = *(v4sf*)_ps_coscof_p0; - v4sf z = _mm_mul_ps(xx,xx); - - y = _mm_mul_ps(y, z); - y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1); - y = _mm_mul_ps(y, z); - y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2); - y = _mm_mul_ps(y, z); - y = _mm_mul_ps(y, z); - v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5); - y = _mm_sub_ps(y, tmp); - y = _mm_add_ps(y, *(v4sf*)_ps_1); - - /* Evaluate the second polynom (Pi/4 <= x <= 0) */ - - v4sf y2 = *(v4sf*)_ps_sincof_p0; - y2 = _mm_mul_ps(y2, z); - y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1); - y2 = _mm_mul_ps(y2, z); - y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2); - y2 = _mm_mul_ps(y2, z); - y2 = _mm_mul_ps(y2, xx); - y2 = _mm_add_ps(y2, xx); - - /* select the correct result from the two polynoms */ - xmm3 = poly_mask; - y2 = _mm_and_ps(xmm3, y2); //, xmm3); - y = _mm_andnot_ps(xmm3, y); - y = _mm_add_ps(y,y2); - /* update the sign */ - - _mm_store_ss( &x, _mm_xor_ps(y, sign_bit) ); - -#else - #error "Not Implemented" -#endif - - return x; -} - -//----------------------------------------------------------------------------- -// SSE2 implementations of optimized routines: -//----------------------------------------------------------------------------- -#ifdef PLATFORM_WINDOWS_PC32 -void _SSE2_SinCos(float x, float* s, float* c) // any x -{ -#ifdef _WIN32 - __asm - { - movss xmm0, x - movaps xmm7, xmm0 - movss xmm1, _ps_am_inv_sign_mask - movss xmm2, _ps_am_sign_mask - movss xmm3, _ps_am_2_o_pi - andps xmm0, xmm1 - andps xmm7, xmm2 - mulss xmm0, xmm3 - - pxor xmm3, xmm3 - movd xmm5, _epi32_1 - movss xmm4, _ps_am_1 - - cvttps2dq xmm2, xmm0 - pand xmm5, xmm2 - movd xmm1, _epi32_2 - 
pcmpeqd xmm5, xmm3 - movd xmm3, _epi32_1 - cvtdq2ps xmm6, xmm2 - paddd xmm3, xmm2 - pand xmm2, xmm1 - pand xmm3, xmm1 - subss xmm0, xmm6 - pslld xmm2, (31 - 1) - minss xmm0, xmm4 - - mov eax, s // mov eax, [esp + 4 + 16] - mov edx, c // mov edx, [esp + 4 + 16 + 4] - - subss xmm4, xmm0 - pslld xmm3, (31 - 1) - - movaps xmm6, xmm4 - xorps xmm2, xmm7 - movaps xmm7, xmm5 - andps xmm6, xmm7 - andnps xmm7, xmm0 - andps xmm0, xmm5 - andnps xmm5, xmm4 - movss xmm4, _ps_sincos_p3 - orps xmm6, xmm7 - orps xmm0, xmm5 - movss xmm5, _ps_sincos_p2 - - movaps xmm1, xmm0 - movaps xmm7, xmm6 - mulss xmm0, xmm0 - mulss xmm6, xmm6 - orps xmm1, xmm2 - orps xmm7, xmm3 - movaps xmm2, xmm0 - movaps xmm3, xmm6 - mulss xmm0, xmm4 - mulss xmm6, xmm4 - movss xmm4, _ps_sincos_p1 - addss xmm0, xmm5 - addss xmm6, xmm5 - movss xmm5, _ps_sincos_p0 - mulss xmm0, xmm2 - mulss xmm6, xmm3 - addss xmm0, xmm4 - addss xmm6, xmm4 - mulss xmm0, xmm2 - mulss xmm6, xmm3 - addss xmm0, xmm5 - addss xmm6, xmm5 - mulss xmm0, xmm1 - mulss xmm6, xmm7 - - // use full stores since caller might reload with full loads - movss [eax], xmm0 - movss [edx], xmm6 - } -#elif POSIX - #warning "_SSE2_SinCos NOT implemented!" - Assert( 0 ); -#else - #error "Not Implemented" -#endif -} -#endif // PLATFORM_WINDOWS_PC32 - -#ifdef PLATFORM_WINDOWS_PC32 -float _SSE2_cos(float x) -{ -#ifdef _WIN32 - __asm - { - movss xmm0, x - movss xmm1, _ps_am_inv_sign_mask - movss xmm2, _ps_am_pi_o_2 - movss xmm3, _ps_am_2_o_pi - andps xmm0, xmm1 - addss xmm0, xmm2 - mulss xmm0, xmm3 - - pxor xmm3, xmm3 - movd xmm5, _epi32_1 - movss xmm4, _ps_am_1 - cvttps2dq xmm2, xmm0 - pand xmm5, xmm2 - movd xmm1, _epi32_2 - pcmpeqd xmm5, xmm3 - cvtdq2ps xmm6, xmm2 - pand xmm2, xmm1 - pslld xmm2, (31 - 1) - - subss xmm0, xmm6 - movss xmm3, _ps_sincos_p3 - minss xmm0, xmm4 - subss xmm4, xmm0 - andps xmm0, xmm5 - andnps xmm5, xmm4 - orps xmm0, xmm5 - - movaps xmm1, xmm0 - movss xmm4, _ps_sincos_p2 - mulss xmm0, xmm0 - movss xmm5, _ps_sincos_p1 - orps xmm1, xmm2 - movaps xmm7, xmm0 - mulss xmm0, xmm3 - movss xmm6, _ps_sincos_p0 - addss xmm0, xmm4 - mulss xmm0, xmm7 - addss xmm0, xmm5 - mulss xmm0, xmm7 - addss xmm0, xmm6 - mulss xmm0, xmm1 - movss x, xmm0 - } -#elif POSIX - #warning "_SSE2_cos NOT implemented!" 
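The scalar SinCos/cos routines deleted in this file are superseded later in this patch by DirectXMath and sse_mathfun. For orientation, the replacement call shape looks like this (sketch; assumes DirectXMath is reachable through the new dxmath.h header):

    float s, c;
    DirectX::XMScalarSinCos( &s, &c, radians );   // one call computes both sine and cosine

    fltx4 sine, cosine;                           // and the four-wide case, radians4 being a fltx4 of angles
    SinCosSIMD( sine, cosine, radians4 );         // forwards to DirectX::XMVectorSinCos when USE_DXMATH is set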
- Assert( 0 ); -#else - #error "Not Implemented" -#endif - - return x; -} -#endif // PLATFORM_WINDOWS_PC32 - -#if 0 -// SSE Version of VectorTransform -void VectorTransformSSE(const float *in1, const matrix3x4_t& in2, float *out1) -{ - Assert( s_bMathlibInitialized ); - Assert( in1 != out1 ); - -#ifdef _WIN32 - __asm - { - mov eax, in1; - mov ecx, in2; - mov edx, out1; - - movss xmm0, [eax]; - mulss xmm0, [ecx]; - movss xmm1, [eax+4]; - mulss xmm1, [ecx+4]; - movss xmm2, [eax+8]; - mulss xmm2, [ecx+8]; - addss xmm0, xmm1; - addss xmm0, xmm2; - addss xmm0, [ecx+12] - movss [edx], xmm0; - add ecx, 16; - - movss xmm0, [eax]; - mulss xmm0, [ecx]; - movss xmm1, [eax+4]; - mulss xmm1, [ecx+4]; - movss xmm2, [eax+8]; - mulss xmm2, [ecx+8]; - addss xmm0, xmm1; - addss xmm0, xmm2; - addss xmm0, [ecx+12] - movss [edx+4], xmm0; - add ecx, 16; - - movss xmm0, [eax]; - mulss xmm0, [ecx]; - movss xmm1, [eax+4]; - mulss xmm1, [ecx+4]; - movss xmm2, [eax+8]; - mulss xmm2, [ecx+8]; - addss xmm0, xmm1; - addss xmm0, xmm2; - addss xmm0, [ecx+12] - movss [edx+8], xmm0; - } -#elif POSIX - #warning "VectorTransformSSE C implementation only" - out1[0] = DotProduct(in1, in2[0]) + in2[0][3]; - out1[1] = DotProduct(in1, in2[1]) + in2[1][3]; - out1[2] = DotProduct(in1, in2[2]) + in2[2][3]; -#else - #error "Not Implemented" -#endif -} -#endif - -#if 0 -void VectorRotateSSE( const float *in1, const matrix3x4_t& in2, float *out1 ) -{ - Assert( s_bMathlibInitialized ); - Assert( in1 != out1 ); - -#ifdef _WIN32 - __asm - { - mov eax, in1; - mov ecx, in2; - mov edx, out1; - - movss xmm0, [eax]; - mulss xmm0, [ecx]; - movss xmm1, [eax+4]; - mulss xmm1, [ecx+4]; - movss xmm2, [eax+8]; - mulss xmm2, [ecx+8]; - addss xmm0, xmm1; - addss xmm0, xmm2; - movss [edx], xmm0; - add ecx, 16; - - movss xmm0, [eax]; - mulss xmm0, [ecx]; - movss xmm1, [eax+4]; - mulss xmm1, [ecx+4]; - movss xmm2, [eax+8]; - mulss xmm2, [ecx+8]; - addss xmm0, xmm1; - addss xmm0, xmm2; - movss [edx+4], xmm0; - add ecx, 16; - - movss xmm0, [eax]; - mulss xmm0, [ecx]; - movss xmm1, [eax+4]; - mulss xmm1, [ecx+4]; - movss xmm2, [eax+8]; - mulss xmm2, [ecx+8]; - addss xmm0, xmm1; - addss xmm0, xmm2; - movss [edx+8], xmm0; - } -#elif POSIX - #warning "VectorRotateSSE C implementation only" - out1[0] = DotProduct( in1, in2[0] ); - out1[1] = DotProduct( in1, in2[1] ); - out1[2] = DotProduct( in1, in2[2] ); -#else - #error "Not Implemented" -#endif -} -#endif - -#ifdef _WIN32 -void _declspec(naked) _SSE_VectorMA( const float *start, float scale, const float *direction, float *dest ) -{ - // FIXME: This don't work!! 
It will overwrite memory in the write to dest - Assert(0); - - Assert( s_bMathlibInitialized ); - _asm { // Intel SSE only routine - mov eax, DWORD PTR [esp+0x04] ; *start, s0..s2 - mov ecx, DWORD PTR [esp+0x0c] ; *direction, d0..d2 - mov edx, DWORD PTR [esp+0x10] ; *dest - movss xmm2, [esp+0x08] ; x2 = scale, 0, 0, 0 -#ifdef ALIGNED_VECTOR - movaps xmm3, [ecx] ; x3 = dir0,dir1,dir2,X - pshufd xmm2, xmm2, 0 ; x2 = scale, scale, scale, scale - movaps xmm1, [eax] ; x1 = start1, start2, start3, X - mulps xmm3, xmm2 ; x3 *= x2 - addps xmm3, xmm1 ; x3 += x1 - movaps [edx], xmm3 ; *dest = x3 -#else - movups xmm3, [ecx] ; x3 = dir0,dir1,dir2,X - pshufd xmm2, xmm2, 0 ; x2 = scale, scale, scale, scale - movups xmm1, [eax] ; x1 = start1, start2, start3, X - mulps xmm3, xmm2 ; x3 *= x2 - addps xmm3, xmm1 ; x3 += x1 - movups [edx], xmm3 ; *dest = x3 -#endif - } -} -#endif - -#ifdef _WIN32 -#ifdef PFN_VECTORMA -void _declspec(naked) __cdecl _SSE_VectorMA( const Vector &start, float scale, const Vector &direction, Vector &dest ) -{ - // FIXME: This don't work!! It will overwrite memory in the write to dest - Assert(0); - - Assert( s_bMathlibInitialized ); - _asm - { - // Intel SSE only routine - mov eax, DWORD PTR [esp+0x04] ; *start, s0..s2 - mov ecx, DWORD PTR [esp+0x0c] ; *direction, d0..d2 - mov edx, DWORD PTR [esp+0x10] ; *dest - movss xmm2, [esp+0x08] ; x2 = scale, 0, 0, 0 -#ifdef ALIGNED_VECTOR - movaps xmm3, [ecx] ; x3 = dir0,dir1,dir2,X - pshufd xmm2, xmm2, 0 ; x2 = scale, scale, scale, scale - movaps xmm1, [eax] ; x1 = start1, start2, start3, X - mulps xmm3, xmm2 ; x3 *= x2 - addps xmm3, xmm1 ; x3 += x1 - movaps [edx], xmm3 ; *dest = x3 -#else - movups xmm3, [ecx] ; x3 = dir0,dir1,dir2,X - pshufd xmm2, xmm2, 0 ; x2 = scale, scale, scale, scale - movups xmm1, [eax] ; x1 = start1, start2, start3, X - mulps xmm3, xmm2 ; x3 *= x2 - addps xmm3, xmm1 ; x3 += x1 - movups [edx], xmm3 ; *dest = x3 -#endif - } -} -float (__cdecl *pfVectorMA)(Vector& v) = _VectorMA; -#endif -#endif - - -// SSE DotProduct -- it's a smidgen faster than the asm DotProduct... -// Should be validated too! :) -// NJS: (Nov 1 2002) -NOT- faster. may time a couple cycles faster in a single function like -// this, but when inlined, and instruction scheduled, the C version is faster. -// Verified this via VTune -/* -vec_t DotProduct (const vec_t *a, const vec_t *c) -{ - vec_t temp; - - __asm - { - mov eax, a; - mov ecx, c; - mov edx, DWORD PTR [temp] - movss xmm0, [eax]; - mulss xmm0, [ecx]; - movss xmm1, [eax+4]; - mulss xmm1, [ecx+4]; - movss xmm2, [eax+8]; - mulss xmm2, [ecx+8]; - addss xmm0, xmm1; - addss xmm0, xmm2; - movss [edx], xmm0; - fld DWORD PTR [edx]; - ret - } -} -*/ - -#endif // COMPILER_MSVC64 diff --git a/src/mathlib/sse.h b/src/mathlib/sse.h deleted file mode 100644 index 1b49c50c1..000000000 --- a/src/mathlib/sse.h +++ /dev/null @@ -1,27 +0,0 @@ -//========= Copyright Valve Corporation, All rights reserved. 
============// -// -// Purpose: -// -//=====================================================================================// - -#ifndef _SSE_H -#define _SSE_H - -float _SSE_Sqrt(float x); -float _SSE_RSqrtAccurate(float a); -float _SSE_RSqrtFast(float x); -float FASTCALL _SSE_VectorNormalize(Vector& vec); -void FASTCALL _SSE_VectorNormalizeFast(Vector& vec); -float _SSE_InvRSquared(const float* v); -void _SSE_SinCos(float x, float* s, float* c); -float _SSE_cos( float x); -#ifdef PLATFORM_WINDOWS_PC32 -void _SSE2_SinCos(float x, float* s, float* c); -float _SSE2_cos(float x); -#endif -#if 0 -void VectorTransformSSE(const float *in1, const matrix3x4_t& in2, float *out1); -void VectorRotateSSE( const float *in1, const matrix3x4_t& in2, float *out1 ); -#endif - -#endif // _SSE_H diff --git a/src/mathlib/sseconst.cpp b/src/mathlib/sseconst.cpp index d68588fdd..6c2c83ca1 100644 --- a/src/mathlib/sseconst.cpp +++ b/src/mathlib/sseconst.cpp @@ -17,6 +17,8 @@ const fltx4 Four_Threes={3.0,3.0,3.0,3.0}; const fltx4 Four_Fours={4.0,4.0,4.0,4.0}; const fltx4 Four_Origin={0,0,0,1}; const fltx4 Four_NegativeOnes={-1,-1,-1,-1}; +const fltx4 Four_DegToRad = { ((float)(M_PI_F / 180.f)), ((float)(M_PI_F / 180.f)), ((float)(M_PI_F / 180.f)), ((float)(M_PI_F / 180.f)) }; +const fltx4 Four_360 = { 360.f, 360.f, 360.f, 360.f }; const fltx4 Four_2ToThe21s={ (float) (1<<21), (float) (1<<21), (float) (1<<21), (float)(1<<21) }; const fltx4 Four_2ToThe22s={ (float) (1<<22), (float) (1<<22), (float) (1<<22), (float)(1<<22) }; @@ -58,6 +60,8 @@ const uint32 ALIGN16 g_SIMD_SkipTailMask[4][4] ALIGN16_POST = { 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000 }, }; +const int32 ALIGN16 g_SIMD_EveryOtherMask[4] = { 0, ~0, 0, ~0 }; + // FUNCTIONS // NOTE: WHY YOU **DO NOT** WANT TO PUT FUNCTIONS HERE diff --git a/src/public/bone_setup.cpp b/src/public/bone_setup.cpp index 35d72d5e3..27ab95dcf 100644 --- a/src/public/bone_setup.cpp +++ b/src/public/bone_setup.cpp @@ -430,7 +430,7 @@ void CalcBoneQuaternion( int frame, float s, AngleQuaternion( angle1, q1 ); AngleQuaternion( angle2, q2 ); - #ifdef _X360 + #if defined(_X360) || USE_DXMATH fltx4 q1simd, q2simd, qsimd; q1simd = LoadAlignedSIMD( q1 ); q2simd = LoadAlignedSIMD( q2 ); @@ -1370,7 +1370,476 @@ void WorldSpaceSlerp( g_MatrixPool.Free( targetBoneToWorld ); } +void SlerpBonesSpeedy( + const CStudioHdr* pStudioHdr, + QuaternionAligned q1[MAXSTUDIOBONES], + Vector pos1[MAXSTUDIOBONES], + mstudioseqdesc_t& seqdesc, // source of q2 and pos2 + int sequence, + const QuaternionAligned q2[MAXSTUDIOBONES], + const Vector pos2[MAXSTUDIOBONES], + float s, + int boneMask) +{ + if (s <= 0.0f) + return; + if (s > 1.0f) + { + s = 1.0f; + } + + if (seqdesc.flags & STUDIO_WORLD) + { + WorldSpaceSlerp(pStudioHdr, q1, pos1, seqdesc, sequence, q2, pos2, s, boneMask); + return; + } + int i; + virtualmodel_t* pVModel = pStudioHdr->GetVirtualModel(); + const virtualgroup_t* RESTRICT pSeqGroup = NULL; + if (pVModel) + { + pSeqGroup = pVModel->pSeqGroup(sequence); + } + + // Build weightlist for all bones + int nBoneCount = pStudioHdr->numbones(); + float* RESTRICT pS2 = (float*)stackalloc(nBoneCount * sizeof(float)); // 16-byte aligned + + + if (pSeqGroup) // hoist this branch outside of the inner loop for speed (even correctly predicted branches are an eight cycle latency) + { + for (i = 0; i < nBoneCount; i++) + { + // skip unused bones + if (!(pStudioHdr->boneFlags(i) & boneMask) || + pSeqGroup->boneMap[i] < 0) + { + pS2[i] = 0.0f; + } + else + { + // boneMap[i] is not a float, don't be 
lured by the siren call of fcmp + pS2[i] = s * seqdesc.weight(pSeqGroup->boneMap[i]); + } + } + } + else // !pSeqGroup + { + for (i = 0; i < nBoneCount; i++) + { + // skip unused bones + if (!(pStudioHdr->boneFlags(i) & boneMask)) + { + pS2[i] = 0.0f; + } + else + { + pS2[i] = s * seqdesc.weight(i); // blend in based on this bones weight + } + } + } + + float weight; + int nBoneCountRoundedFour = (nBoneCount) & (~(3)); + if (seqdesc.flags & STUDIO_DELTA) + { + // do as many as we can four at a time, then take care of stragglers. + for (i = 0; i < nBoneCountRoundedFour; i += 4) + { + // drag the next cache line in + PREFETCH360(q1, i * 16 + 128); + PREFETCH360(pos1, i * 16 + 128); + PREFETCH360(q2, i * 16 + 128); + PREFETCH360(pos2, i * 16 + 128); + + fltx4 weightfour = LoadAlignedSIMD(pS2 + i); // four weights + + FourQuaternions q1four, q2four; + FourQuaternions result; + + q1four.LoadAndSwizzleAligned(q1 + i); // four quaternions + q2four.LoadAndSwizzleAligned(q2 + i); // four quaternions + + if (seqdesc.flags & STUDIO_POST) + { + + // result = q1 * ( weight * q2 ) + result = q1four.MulAc(weightfour, q2four); + } + else + { + + // result = ( s * q1 ) * q2 + result = q2four.ScaleMul(weightfour, q1four); + } + + // mask out unused channels, replacing them with original data + { + fltx4 tinyScales = CmpLeSIMD(weightfour, Four_Zeros); + result.x = MaskedAssign(tinyScales, q1four.x, result.x); + result.y = MaskedAssign(tinyScales, q1four.y, result.y); + result.z = MaskedAssign(tinyScales, q1four.z, result.z); + result.w = MaskedAssign(tinyScales, q1four.w, result.w); + } + + + result.SwizzleAndStoreAlignedMasked(q1 + i, CmpGtSIMD(weightfour, Four_Zeros)); + + fltx4 originalpos1simd[4], pos1simd[4], pos2simd[4]; + originalpos1simd[0] = pos1simd[0] = LoadUnalignedSIMD(pos1[i + 0].Base()); + originalpos1simd[1] = pos1simd[1] = LoadUnalignedSIMD(pos1[i + 1].Base()); + originalpos1simd[2] = pos1simd[2] = LoadUnalignedSIMD(pos1[i + 2].Base()); + originalpos1simd[3] = pos1simd[3] = LoadUnalignedSIMD(pos1[i + 3].Base()); + pos2simd[0] = LoadUnalignedSIMD(pos2[i + 0].Base()); + pos2simd[1] = LoadUnalignedSIMD(pos2[i + 1].Base()); + pos2simd[2] = LoadUnalignedSIMD(pos2[i + 2].Base()); + pos2simd[3] = LoadUnalignedSIMD(pos2[i + 3].Base()); + + fltx4 splatweights[4] = { SplatXSIMD(weightfour), + SplatYSIMD(weightfour), + SplatZSIMD(weightfour), + SplatWSIMD(weightfour) }; + + fltx4 Zero = Four_Zeros; + pos1simd[0] = MaddSIMD(pos2simd[0], splatweights[0], pos1simd[0]); + splatweights[0] = (fltx4)CmpGtSIMD(splatweights[0], Zero); + pos1simd[1] = MaddSIMD(pos2simd[1], splatweights[1], pos1simd[1]); + splatweights[1] = (fltx4)CmpGtSIMD(splatweights[1], Zero); + pos1simd[2] = MaddSIMD(pos2simd[2], splatweights[2], pos1simd[2]); + splatweights[2] = (fltx4)CmpGtSIMD(splatweights[2], Zero); + pos1simd[3] = MaddSIMD(pos2simd[3], splatweights[3], pos1simd[3]); + splatweights[3] = (fltx4)CmpGtSIMD(splatweights[3], Zero); + + // mask out unweighted bones + /* + if (pS2[i+0] > 0) + StoreUnaligned3SIMD( pos1[i + 0].Base(), pos1simd[0] ); + if (pS2[i+1] > 0) + StoreUnaligned3SIMD( pos1[i + 1].Base(), pos1simd[1] ); + if (pS2[i+2] > 0) + StoreUnaligned3SIMD( pos1[i + 2].Base(), pos1simd[2] ); + if (pS2[i+3] > 0) + StoreUnaligned3SIMD( pos1[i + 3].Base(), pos1simd[3] ); + */ + StoreUnaligned3SIMD(pos1[i + 0].Base(), MaskedAssign((fltx4)splatweights[0], pos1simd[0], originalpos1simd[0])); + StoreUnaligned3SIMD(pos1[i + 1].Base(), MaskedAssign((fltx4)splatweights[1], pos1simd[1], originalpos1simd[1])); + 
StoreUnaligned3SIMD(pos1[i + 2].Base(), MaskedAssign((fltx4)splatweights[2], pos1simd[2], originalpos1simd[2])); + StoreUnaligned3SIMD(pos1[i + 3].Base(), MaskedAssign((fltx4)splatweights[3], pos1simd[3], originalpos1simd[3])); + + } + + // take care of stragglers + for (false; i < nBoneCount; i++) + { + weight = pS2[i]; + if (weight <= 0.0f) + continue; + + if (seqdesc.flags & STUDIO_POST) + { +#if !defined(_X360) && !USE_DXMATH + QuaternionMA(q1[i], weight, q2[i], q1[i]); +#else + fltx4 q1simd = LoadUnalignedSIMD(q1[i].Base()); + fltx4 q2simd = LoadAlignedSIMD(q2[i]); + fltx4 result = QuaternionMASIMD(q1simd, weight, q2simd); + StoreUnalignedSIMD(q1[i].Base(), result); +#endif + } + else + { +#if !defined(_X360) && !USE_DXMATH + QuaternionSM(weight, q2[i], q1[i], q1[i]); +#else + fltx4 q1simd = LoadUnalignedSIMD(q1[i].Base()); + fltx4 q2simd = LoadAlignedSIMD(q2[i]); + fltx4 result = QuaternionSMSIMD(weight, q2simd, q1simd); + StoreUnalignedSIMD(q1[i].Base(), result); +#endif + } + // do this explicitly to make the scheduling better + // (otherwise it might think pos1 and pos2 overlap, + // and thus save one before starting the next) + float x, y, z; + x = pos1[i][0] + pos2[i][0] * weight; + y = pos1[i][1] + pos2[i][1] * weight; + z = pos1[i][2] + pos2[i][2] * weight; + pos1[i][0] = x; + pos1[i][1] = y; + pos1[i][2] = z; + } + return; + } + + //// SLERP PHASE + + // Some bones need to be slerped with alignment. + // Others do not. + // Some need to be ignored altogether. + // Build arrays indicating which are which. + // This is the corral approach. Another approach + // would be to compute both the aligned and unaligned + // slerps of each bone in the first pass through the + // array, and then do a masked selection of each + // based on the masks. However there really isn't + // a convenient way to turn the int flags that + // specify which approach to take, into fltx4 masks. + + // float * RESTRICT pS2 = (float*)stackalloc( nBoneCount * sizeof(float) ); + int* RESTRICT aBonesSlerpAlign = (int*)stackalloc(nBoneCount * sizeof(int)); + float* RESTRICT aBonesSlerpAlignWeights = (float*)stackalloc(nBoneCount * sizeof(float)); + int* RESTRICT aBonesSlerpNoAlign = (int*)stackalloc(nBoneCount * sizeof(int)); + float* RESTRICT aBonesSlerpNoAlignWeights = (float*)stackalloc(nBoneCount * sizeof(float)); + int numBonesSlerpAlign = 0; + int numBonesSlerpNoAlign = 0; + + // BoneQuaternionAligned * RESTRICT testOutput = (BoneQuaternionAligned *)stackalloc(nBoneCount * sizeof(BoneQuaternionAligned)); + + // sweep forward through the array and determine where to corral each bone. + for (i = 0; i < nBoneCount; ++i) + { + float weight = pS2[i]; + if (weight == 1.0f) + { + q1[i] = q2[i]; + pos1[i] = pos2[i]; + } + else if (weight > 0.0f) // ignore small bones + { + if (pStudioHdr->boneFlags(i) & BONE_FIXED_ALIGNMENT) + { + aBonesSlerpNoAlign[numBonesSlerpNoAlign] = i; + aBonesSlerpNoAlignWeights[numBonesSlerpNoAlign] = weight; + ++numBonesSlerpNoAlign; + } + else + { + aBonesSlerpAlign[numBonesSlerpAlign] = i; + aBonesSlerpAlignWeights[numBonesSlerpAlign] = weight; + ++numBonesSlerpAlign; + } + } + } + + // okay, compute all the aligned, and all the unaligned bones, four at + // a time if possible. + const fltx4 One = Four_Ones; + ///////////////// + // // // Aligned! 
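One detail that helps when reading the two loops below: FourQuaternions is a structure-of-arrays container, so LoadAndSwizzleAligned transposes four quaternions into four fltx4 fields (a sketch of the layout, assuming the usual ssequaternion.h definition):

    // after q1four.LoadAndSwizzleAligned( qa, qb, qc, qd ):
    //   q1four.x == { qa->x, qb->x, qc->x, qd->x }
    //   q1four.y == { qa->y, qb->y, qc->y, qd->y }   ... and likewise for z and w
    //
    // which is why one LoadAlignedSIMD of four packed weights, plus SplatX/Y/Z/W,
    // is enough to drive all four bones in a single iteration.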
+ nBoneCountRoundedFour = (numBonesSlerpAlign) & ~3; + for (i = 0; i < nBoneCountRoundedFour; i += 4) + { + // drag the next cache line in + PREFETCH360(q1, i * 16 + 128); + PREFETCH360(pos1, i * sizeof(*pos1) + 128); + PREFETCH360(q2, i * 16 + 128); + PREFETCH360(pos2, i * sizeof(*pos2) + 128); + + fltx4 weights = LoadAlignedSIMD(aBonesSlerpAlignWeights + i); + fltx4 oneMinusWeight = SubSIMD(One, weights); + + // position component: + // pos1[i][0] = pos1[i][0] * s1 + pos2[i][0] * weight; + fltx4 pos1simd[4]; + fltx4 pos2simd[4]; + pos1simd[0] = LoadUnaligned3SIMD(pos1[aBonesSlerpAlign[i + 0]].Base()); + pos1simd[1] = LoadUnaligned3SIMD(pos1[aBonesSlerpAlign[i + 1]].Base()); + pos1simd[2] = LoadUnaligned3SIMD(pos1[aBonesSlerpAlign[i + 2]].Base()); + pos1simd[3] = LoadUnaligned3SIMD(pos1[aBonesSlerpAlign[i + 3]].Base()); + pos2simd[0] = LoadUnaligned3SIMD(pos2[aBonesSlerpAlign[i + 0]].Base()); + pos2simd[1] = LoadUnaligned3SIMD(pos2[aBonesSlerpAlign[i + 1]].Base()); + pos2simd[2] = LoadUnaligned3SIMD(pos2[aBonesSlerpAlign[i + 2]].Base()); + pos2simd[3] = LoadUnaligned3SIMD(pos2[aBonesSlerpAlign[i + 3]].Base()); + + pos1simd[0] = MulSIMD(SplatXSIMD(oneMinusWeight), pos1simd[0]); + pos1simd[1] = MulSIMD(SplatYSIMD(oneMinusWeight), pos1simd[1]); + pos1simd[2] = MulSIMD(SplatZSIMD(oneMinusWeight), pos1simd[2]); + pos1simd[3] = MulSIMD(SplatWSIMD(oneMinusWeight), pos1simd[3]); + + fltx4 posWriteMasks[4]; // don't overwrite where there was zero weight + { + fltx4 splatweights[4]; + fltx4 Zero = Four_Zeros; + splatweights[0] = SplatXSIMD(weights); + splatweights[1] = SplatYSIMD(weights); + splatweights[2] = SplatZSIMD(weights); + splatweights[3] = SplatWSIMD(weights); + + pos1simd[0] = MaddSIMD(splatweights[0], pos2simd[0], pos1simd[0]); + posWriteMasks[0] = (fltx4)CmpGtSIMD(splatweights[0], Zero); + pos1simd[1] = MaddSIMD(splatweights[1], pos2simd[1], pos1simd[1]); + posWriteMasks[1] = (fltx4)CmpGtSIMD(splatweights[1], Zero); + pos1simd[2] = MaddSIMD(splatweights[2], pos2simd[2], pos1simd[2]); + posWriteMasks[2] = (fltx4)CmpGtSIMD(splatweights[2], Zero); + pos1simd[3] = MaddSIMD(splatweights[3], pos2simd[3], pos1simd[3]); + posWriteMasks[3] = (fltx4)CmpGtSIMD(splatweights[3], Zero); + } + + + FourQuaternions q1four, q2four, result; + q1four.LoadAndSwizzleAligned(q1 + aBonesSlerpAlign[i + 0], + q1 + aBonesSlerpAlign[i + 1], + q1 + aBonesSlerpAlign[i + 2], + q1 + aBonesSlerpAlign[i + 3]); + +#if 0 + // FIXME: the SIMD slerp doesn't handle quaternions that have opposite signs + q2four.LoadAndSwizzleAligned(q2 + aBonesSlerpAlign[i + 0], + q2 + aBonesSlerpAlign[i + 1], + q2 + aBonesSlerpAlign[i + 2], + q2 + aBonesSlerpAlign[i + 3]); + result = q2four.Slerp(q1four, oneMinusWeight); +#else + // force the quaternions to be the same sign (< 180 degree separation) + QuaternionAligned q20, q21, q22, q23; + QuaternionAlign(q1[aBonesSlerpAlign[i + 0]], q2[aBonesSlerpAlign[i + 0]], q20); + QuaternionAlign(q1[aBonesSlerpAlign[i + 1]], q2[aBonesSlerpAlign[i + 1]], q21); + QuaternionAlign(q1[aBonesSlerpAlign[i + 2]], q2[aBonesSlerpAlign[i + 2]], q22); + QuaternionAlign(q1[aBonesSlerpAlign[i + 3]], q2[aBonesSlerpAlign[i + 3]], q23); + q2four.LoadAndSwizzleAligned(&q20, &q21, &q22, &q23); + result = q2four.SlerpNoAlign(q1four, oneMinusWeight); +#endif + + result.SwizzleAndStoreAligned(q1 + aBonesSlerpAlign[i + 0], + q1 + aBonesSlerpAlign[i + 1], + q1 + aBonesSlerpAlign[i + 2], + q1 + aBonesSlerpAlign[i + 3]); + + StoreUnaligned3SIMD(pos1[aBonesSlerpAlign[i + 0]].Base(), pos1simd[0]); + 
StoreUnaligned3SIMD(pos1[aBonesSlerpAlign[i + 1]].Base(), pos1simd[1]); + StoreUnaligned3SIMD(pos1[aBonesSlerpAlign[i + 2]].Base(), pos1simd[2]); + StoreUnaligned3SIMD(pos1[aBonesSlerpAlign[i + 3]].Base(), pos1simd[3]); + } + + // handle stragglers + for (i; i < numBonesSlerpAlign; ++i) + { + QuaternionAligned q3; + weight = aBonesSlerpAlignWeights[i]; + int k = aBonesSlerpAlign[i]; + + float s1 = 1.0 - weight; + +#if defined(_X360) || USE_DXMATH + fltx4 q1simd, q2simd, result; + q1simd = LoadAlignedSIMD(q1[k].Base()); + q2simd = LoadAlignedSIMD(q2[k]); +#endif + +#if !defined(_X360) && !USE_DXMATH + QuaternionSlerp(q2[k], q1[k], s1, q3); +#else + result = QuaternionSlerpSIMD(q2simd, q1simd, s1); +#endif + +#if !defined(_X360) && !USE_DXMATH + q1[k][0] = q3[0]; + q1[k][1] = q3[1]; + q1[k][2] = q3[2]; + q1[k][3] = q3[3]; +#else + StoreAlignedSIMD(q1[k].Base(), result); +#endif + + pos1[k][0] = pos1[k][0] * s1 + pos2[k][0] * weight; + pos1[k][1] = pos1[k][1] * s1 + pos2[k][1] * weight; + pos1[k][2] = pos1[k][2] * s1 + pos2[k][2] * weight; + } + /////////////////// + // // // Unaligned! + nBoneCountRoundedFour = (numBonesSlerpNoAlign) & ~3; + for (i = 0; i < nBoneCountRoundedFour; i += 4) + { + // drag the next cache line in + PREFETCH360(q1, i * 16 + 128); + PREFETCH360(pos1, i * sizeof(*pos1) + 128); + PREFETCH360(q2, i * 16 + 128); + PREFETCH360(pos2, i * sizeof(*pos2) + 128); + + fltx4 weights = LoadAlignedSIMD(aBonesSlerpNoAlignWeights + i); + fltx4 oneMinusWeight = SubSIMD(One, weights); + + // position component: + // pos1[i][0] = pos1[i][0] * s1 + pos2[i][0] * weight; + fltx4 pos1simd[4]; + fltx4 pos2simd[4]; + pos1simd[0] = LoadUnaligned3SIMD(pos1[aBonesSlerpNoAlign[i + 0]].Base()); + pos1simd[1] = LoadUnaligned3SIMD(pos1[aBonesSlerpNoAlign[i + 1]].Base()); + pos1simd[2] = LoadUnaligned3SIMD(pos1[aBonesSlerpNoAlign[i + 2]].Base()); + pos1simd[3] = LoadUnaligned3SIMD(pos1[aBonesSlerpNoAlign[i + 3]].Base()); + pos2simd[0] = LoadUnaligned3SIMD(pos2[aBonesSlerpNoAlign[i + 0]].Base()); + pos2simd[1] = LoadUnaligned3SIMD(pos2[aBonesSlerpNoAlign[i + 1]].Base()); + pos2simd[2] = LoadUnaligned3SIMD(pos2[aBonesSlerpNoAlign[i + 2]].Base()); + pos2simd[3] = LoadUnaligned3SIMD(pos2[aBonesSlerpNoAlign[i + 3]].Base()); + + pos1simd[0] = MulSIMD(SplatXSIMD(oneMinusWeight), pos1simd[0]); + pos1simd[1] = MulSIMD(SplatYSIMD(oneMinusWeight), pos1simd[1]); + pos1simd[2] = MulSIMD(SplatZSIMD(oneMinusWeight), pos1simd[2]); + pos1simd[3] = MulSIMD(SplatWSIMD(oneMinusWeight), pos1simd[3]); + + pos1simd[0] = MaddSIMD(SplatXSIMD(weights), pos2simd[0], pos1simd[0]); + pos1simd[1] = MaddSIMD(SplatYSIMD(weights), pos2simd[1], pos1simd[1]); + pos1simd[2] = MaddSIMD(SplatZSIMD(weights), pos2simd[2], pos1simd[2]); + pos1simd[3] = MaddSIMD(SplatWSIMD(weights), pos2simd[3], pos1simd[3]); + + FourQuaternions q1four, q2four, result; + q1four.LoadAndSwizzleAligned(q1 + aBonesSlerpNoAlign[i + 0], + q1 + aBonesSlerpNoAlign[i + 1], + q1 + aBonesSlerpNoAlign[i + 2], + q1 + aBonesSlerpNoAlign[i + 3]); + q2four.LoadAndSwizzleAligned(q2 + aBonesSlerpNoAlign[i + 0], + q2 + aBonesSlerpNoAlign[i + 1], + q2 + aBonesSlerpNoAlign[i + 2], + q2 + aBonesSlerpNoAlign[i + 3]); + + result = q2four.SlerpNoAlign(q1four, oneMinusWeight); + + result.SwizzleAndStoreAligned(q1 + aBonesSlerpNoAlign[i + 0], + q1 + aBonesSlerpNoAlign[i + 1], + q1 + aBonesSlerpNoAlign[i + 2], + q1 + aBonesSlerpNoAlign[i + 3]); + + StoreUnaligned3SIMD(pos1[aBonesSlerpNoAlign[i + 0]].Base(), pos1simd[0]); + StoreUnaligned3SIMD(pos1[aBonesSlerpNoAlign[i + 1]].Base(), 
pos1simd[1]); + StoreUnaligned3SIMD(pos1[aBonesSlerpNoAlign[i + 2]].Base(), pos1simd[2]); + StoreUnaligned3SIMD(pos1[aBonesSlerpNoAlign[i + 3]].Base(), pos1simd[3]); + } + // handle stragglers + for (i; i < numBonesSlerpNoAlign; ++i) + { + weight = aBonesSlerpNoAlignWeights[i]; + int k = aBonesSlerpNoAlign[i]; + + float s1 = 1.0 - weight; + +#if defined(_X360) || USE_DXMATH + fltx4 q1simd, q2simd, result; + q1simd = LoadAlignedSIMD(q1[k].Base()); + q2simd = LoadAlignedSIMD(q2[k]); +#endif + +#if !defined(_X360) && !USE_DXMATH + QuaternionAligned q3; + QuaternionSlerpNoAlign(q2[k], q1[k], s1, q3); +#else + result = QuaternionSlerpNoAlignSIMD(q2simd, q1simd, s1); +#endif + +#if !defined(_X360) && !USE_DXMATH + q1[k][0] = q3[0]; + q1[k][1] = q3[1]; + q1[k][2] = q3[2]; + q1[k][3] = q3[3]; +#else + StoreAlignedSIMD(q1[k].Base(), result); +#endif + + pos1[k][0] = pos1[k][0] * s1 + pos2[k][0] * weight; + pos1[k][1] = pos1[k][1] * s1 + pos2[k][1] * weight; + pos1[k][2] = pos1[k][2] * s1 + pos2[k][2] * weight; + } +} //----------------------------------------------------------------------------- // Purpose: blend together q1,pos1 with q2,pos2. Return result in q1,pos1. @@ -1387,6 +1856,22 @@ void SlerpBones( float s, int boneMask ) { + // Test for 16-byte alignment, and if present, use the speedy SIMD version. + if ((reinterpret_cast(q1) & 0x0F) == 0 && + (reinterpret_cast(q2) & 0x0F) == 0) + { + return SlerpBonesSpeedy(pStudioHdr, + reinterpret_cast(q1), + pos1, + seqdesc, + sequence, + q2, + pos2, + s, + boneMask + ); + } + if (s <= 0.0f) return; if (s > 1.0f) @@ -1448,7 +1933,7 @@ void SlerpBones( if ( seqdesc.flags & STUDIO_POST ) { -#ifndef _X360 +#if !defined(_X360) && !USE_DXMATH QuaternionMA( q1[i], s2, q2[i], q1[i] ); #else fltx4 q1simd = LoadUnalignedSIMD( q1[i].Base() ); @@ -1456,14 +1941,10 @@ void SlerpBones( fltx4 result = QuaternionMASIMD( q1simd, s2, q2simd ); StoreUnalignedSIMD( q1[i].Base(), result ); #endif - // FIXME: are these correct? - pos1[i][0] = pos1[i][0] + pos2[i][0] * s2; - pos1[i][1] = pos1[i][1] + pos2[i][1] * s2; - pos1[i][2] = pos1[i][2] + pos2[i][2] * s2; } else { -#ifndef _X360 +#if !defined(_X360) && !USE_DXMATH QuaternionSM( s2, q2[i], q1[i], q1[i] ); #else fltx4 q1simd = LoadUnalignedSIMD( q1[i].Base() ); @@ -1471,12 +1952,17 @@ void SlerpBones( fltx4 result = QuaternionSMSIMD( s2, q2simd, q1simd ); StoreUnalignedSIMD( q1[i].Base(), result ); #endif - - // FIXME: are these correct? 
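The three position updates deleted just below come back later in this hunk rewritten through local temporaries. The reason, spelled out: q1/pos1 and pos2 are plain pointers the compiler cannot prove distinct, so storing into pos1[i][0] forces it to assume pos2[i][1] may have changed and reload it; gathering x, y and z first lets all the loads schedule ahead of the stores. Declaring the arrays RESTRICT, as SlerpBonesSpeedy above does for its scratch buffers, would be another way to hand the optimizer the same guarantee.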
- pos1[i][0] = pos1[i][0] + pos2[i][0] * s2; - pos1[i][1] = pos1[i][1] + pos2[i][1] * s2; - pos1[i][2] = pos1[i][2] + pos2[i][2] * s2; } + // do this explicitly to make the scheduling better + // (otherwise it might think pos1 and pos2 overlap, + // and thus save one before starting the next) + float x, y, z; + x = pos1[i][0] + pos2[i][0] * s2; + y = pos1[i][1] + pos2[i][1] * s2; + z = pos1[i][2] + pos2[i][2] * s2; + pos1[i][0] = x; + pos1[i][1] = y; + pos1[i][2] = z; } return; } @@ -1490,14 +1976,14 @@ void SlerpBones( s1 = 1.0 - s2; -#ifdef _X360 +#if defined(_X360) || USE_DXMATH fltx4 q1simd, q2simd, result; q1simd = LoadUnalignedSIMD( q1[i].Base() ); q2simd = LoadAlignedSIMD( q2[i] ); #endif if ( pStudioHdr->boneFlags(i) & BONE_FIXED_ALIGNMENT ) { -#ifndef _X360 +#if !defined(_X360) && !USE_DXMATH QuaternionSlerpNoAlign( q2[i], q1[i], s1, q3 ); #else result = QuaternionSlerpNoAlignSIMD( q2simd, q1simd, s1 ); @@ -1505,14 +1991,14 @@ void SlerpBones( } else { -#ifndef _X360 +#if !defined(_X360) && !USE_DXMATH QuaternionSlerp( q2[i], q1[i], s1, q3 ); #else result = QuaternionSlerpSIMD( q2simd, q1simd, s1 ); #endif } -#ifndef _X360 +#if !defined(_X360) && !USE_DXMATH q1[i][0] = q3[0]; q1[i][1] = q3[1]; q1[i][2] = q3[2]; @@ -2632,14 +3118,14 @@ class CIKSolver X[i] = P[i]; normalize(X); -// Its y axis is perpendicular to P, so Y = unit( E - X(E·X) ). +// Its y axis is perpendicular to P, so Y = unit( E - X(E�X) ). float dDOTx = dot(D,X); for (i = 0 ; i < 3 ; i++) Y[i] = D[i] - dDOTx * X[i]; normalize(Y); -// Its z axis is perpendicular to both X and Y, so Z = X×Y. +// Its z axis is perpendicular to both X and Y, so Z = X�Y. cross(X,Y,Z); diff --git a/src/public/mathlib/dxmath.h b/src/public/mathlib/dxmath.h new file mode 100644 index 000000000..1af932614 --- /dev/null +++ b/src/public/mathlib/dxmath.h @@ -0,0 +1,20 @@ +//========= Copyright Valve Corporation, All rights reserved. ============// +// +// Purpose: +// +//===========================================================================// + +#pragma once + +#define USE_DXMATH 1 + +#if USE_DXMATH +#if defined(_WIN32) +#include "../thirdparty/DirectXMath-dec2022/Inc/DirectXMath.h" +#elif defined(POSIX) +#include "../thirdparty/dotnetrt/sal.h" +#include "../thirdparty/DirectXMath-dec2022/Inc/DirectXMath.h" +#else +#undef USE_DXMATH +#endif +#endif diff --git a/src/public/mathlib/math_pfns.h b/src/public/mathlib/math_pfns.h index d43411ce8..e50675395 100644 --- a/src/public/mathlib/math_pfns.h +++ b/src/public/mathlib/math_pfns.h @@ -9,32 +9,50 @@ #if defined( _X360 ) #include +#else +#include "dxmath.h" +#if !USE_DXMATH +#include +#endif +#define USE_SSE2 +#include "../thirdparty/sse_mathfun/sse_mathfun.h" #endif #if !defined( _X360 ) -// These globals are initialized by mathlib and redirected based on available fpu features -extern float (*pfSqrt)(float x); -extern float (*pfRSqrt)(float x); -extern float (*pfRSqrtFast)(float x); -extern void (*pfFastSinCos)(float x, float *s, float *c); -extern float (*pfFastCos)(float x); +FORCEINLINE float RSqrt(float x) +{ + // The compiler will generate ideal instructions for a Newton-Raphson + // Specifying it directly results in worse assembly. 
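// For reference, the hand-written form the comment above is weighing against looks
// roughly like this (sketch only, deliberately not used here):
//     __m128 v = _mm_set_ss( x );
//     __m128 r = _mm_rsqrt_ss( v );                          // ~12-bit estimate
//     r = _mm_mul_ss( r, _mm_sub_ss( _mm_set_ss( 1.5f ),
//             _mm_mul_ss( _mm_set_ss( 0.5f * x ), _mm_mul_ss( r, r ) ) ) );
//     return _mm_cvtss_f32( r );                             // one Newton-Raphson step
// Whether 1.0f / sqrtf(x) actually lowers to rsqrtss plus a refinement depends on the
// floating-point model; under MSVC's default /fp:precise it tends to become sqrtss plus
// a divide, which is exact but slower than the estimate-and-refine sequence.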
+ return 1.0f / sqrtf(x); +} + +FORCEINLINE float RSqrtFast(float x) +{ + // This results in the compiler simplifying down to a plain rsqrtss + const __m128 vec = _mm_set_ss( x ); + const __m128 r = _mm_rsqrt_ps( vec ); + float temp; + _mm_store_ss(&temp, r); + return temp; +} + +FORCEINLINE float CosFast(float x) +{ + // Compiler doesn't optimize ::cosf call, use a vectorized cos. This is better than DirectX::XMScalarCos + const __m128 vec = _mm_set_ss( x ); + const __m128 r = cos_ps(vec); + float temp; + _mm_store_ss(&temp, r); + return temp; +} // The following are not declared as macros because they are often used in limiting situations, // and sometimes the compiler simply refuses to inline them for some reason -#define FastSqrt(x) (*pfSqrt)(x) -#define FastRSqrt(x) (*pfRSqrt)(x) -#define FastRSqrtFast(x) (*pfRSqrtFast)(x) -#define FastSinCos(x,s,c) (*pfFastSinCos)(x,s,c) -#define FastCos(x) (*pfFastCos)(x) - -#if defined(__i386__) || defined(_M_IX86) -// On x86, the inline FPU or SSE sqrt instruction is faster than -// the overhead of setting up a function call and saving/restoring -// the FPU or SSE register state and can be scheduled better, too. -#undef FastSqrt -#define FastSqrt(x) ::sqrtf(x) -#endif +#define FastSqrt(x) ::sqrtf(x) // sqrt is optimized to an efficient SSE call with modern compilers +#define FastRSqrt(x) RSqrt(x) +#define FastRSqrtFast(x) RSqrtFast(x) +#define FastCos(x) CosFast(x) #endif // !_X360 diff --git a/src/public/mathlib/mathlib.h b/src/public/mathlib/mathlib.h index 42317632b..fa7486dfa 100644 --- a/src/public/mathlib/mathlib.h +++ b/src/public/mathlib/mathlib.h @@ -7,6 +7,8 @@ #ifndef MATH_LIB_H #define MATH_LIB_H +#include "dxmath.h" + #include #include "minmax.h" #include "tier0/basetypes.h" @@ -95,7 +97,7 @@ class FPExceptionEnabler -#ifdef DEBUG // stop crashing edit-and-continue +#ifdef DEBUG // stop crashing edit-and-continue FORCEINLINE float clamp( float val, float minVal, float maxVal ) { if ( maxVal < minVal ) @@ -438,10 +440,12 @@ inline vec_t RoundInt (vec_t in) int Q_log2(int val); // Math routines done in optimized assembly math package routines -void inline SinCos( float radians, float *sine, float *cosine ) +void FORCEINLINE SinCos( float radians, float *sine, float *cosine ) { #if defined( _X360 ) XMScalarSinCos( sine, cosine, radians ); +#elif USE_DXMATH + DirectX::XMScalarSinCos( sine, cosine, radians ); #elif defined( PLATFORM_WINDOWS_PC32 ) _asm { @@ -466,35 +470,7 @@ void inline SinCos( float radians, float *sine, float *cosine ) #endif } -#define SIN_TABLE_SIZE 256 -#define FTOIBIAS 12582912.f -extern float SinCosTable[SIN_TABLE_SIZE]; - -inline float TableCos( float theta ) -{ - union - { - int i; - float f; - } ftmp; - - // ideally, the following should compile down to: theta * constant + constant, changing any of these constants from defines sometimes fubars this. 
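// A worked instance of the magic-number trick in the line below: FTOIBIAS is
// 12582912.0f == 1.5 * 2^23, so for theta = pi/2 with SIN_TABLE_SIZE = 256:
//     theta * ( 256 / (2*pi) )          == 64.0
//     64.0 + ( FTOIBIAS + 64 )          == 12583040.0
// and the low mantissa bits of that float hold the integer 128, so
//     ftmp.i & ( SIN_TABLE_SIZE - 1 )   == 128,  SinCosTable[128] == sin(pi) == 0 == cos(pi/2)
// i.e. the table index falls straight out of the float's bit pattern, no cvttss2si needed.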
- ftmp.f = theta * ( float )( SIN_TABLE_SIZE / ( 2.0f * M_PI ) ) + ( FTOIBIAS + ( SIN_TABLE_SIZE / 4 ) ); - return SinCosTable[ ftmp.i & ( SIN_TABLE_SIZE - 1 ) ]; -} - -inline float TableSin( float theta ) -{ - union - { - int i; - float f; - } ftmp; - - // ideally, the following should compile down to: theta * constant + constant - ftmp.f = theta * ( float )( SIN_TABLE_SIZE / ( 2.0f * M_PI ) ) + FTOIBIAS; - return SinCosTable[ ftmp.i & ( SIN_TABLE_SIZE - 1 ) ]; -} +#define FastSinCos( angle, s, c) SinCos(angle, s, c) template FORCEINLINE T Square( T const &a ) @@ -1205,7 +1181,7 @@ inline float SimpleSplineRemapValClamped( float val, float A, float B, float C, FORCEINLINE int RoundFloatToInt(float f) { #if defined(__i386__) || defined(_M_IX86) || defined( PLATFORM_WINDOWS_PC64 ) || defined(__x86_64__) - return _mm_cvtss_si32(_mm_load_ss(&f)); + return _mm_cvt_ss2si(_mm_set_ss(f + f + 0.5f)) >> 1; #elif defined( _X360 ) #ifdef Assert Assert( IsFPUControlWordSet() ); @@ -1310,17 +1286,11 @@ FORCEINLINE int Float2Int( float a ) // Over 15x faster than: (int)floor(value) inline int Floor2Int( float a ) { - int RetVal; -#if defined( __i386__ ) - // Convert to int and back, compare, subtract one if too big - __m128 a128 = _mm_set_ss(a); - RetVal = _mm_cvtss_si32(a128); - __m128 rounded128 = _mm_cvt_si2ss(_mm_setzero_ps(), RetVal); - RetVal -= _mm_comigt_ss( rounded128, a128 ); +#if defined( _X360 ) + return static_cast( floor(a) ); #else - RetVal = static_cast( floor(a) ); + return _mm_cvt_ss2si(_mm_set_ss(a + a - 0.5f)) >> 1; #endif - return RetVal; } //----------------------------------------------------------------------------- @@ -1366,18 +1336,12 @@ inline float ClampToMsec( float in ) // Over 15x faster than: (int)ceil(value) inline int Ceil2Int( float a ) -{ - int RetVal; -#if defined( __i386__ ) - // Convert to int and back, compare, add one if too small - __m128 a128 = _mm_load_ss(&a); - RetVal = _mm_cvtss_si32(a128); - __m128 rounded128 = _mm_cvt_si2ss(_mm_setzero_ps(), RetVal); - RetVal += _mm_comilt_ss( rounded128, a128 ); +{ +#if defined( _X360 ) + return static_cast( ceil(a) ); #else - RetVal = static_cast( ceil(a) ); + return -(_mm_cvt_ss2si(_mm_set_ss(-0.5f - (a + a))) >> 1); #endif - return RetVal; } @@ -2169,7 +2133,7 @@ inline bool CloseEnough( const Vector &a, const Vector &b, float epsilon = EQUAL // Fast compare // maxUlps is the maximum error in terms of Units in the Last Place. This // specifies how big an error we are willing to accept in terms of the value -// of the least significant digit of the floating point number’s +// of the least significant digit of the floating point number's // representation. maxUlps can also be interpreted in terms of how many // representable floats we are willing to accept between A and B. // This function will allow maxUlps-1 floats between A and B. diff --git a/src/public/mathlib/ssemath.h b/src/public/mathlib/ssemath.h index c2ff48d75..3bcda408a 100644 --- a/src/public/mathlib/ssemath.h +++ b/src/public/mathlib/ssemath.h @@ -149,6 +149,8 @@ extern const fltx4 Four_2ToThe23s; // (1<<23).. extern const fltx4 Four_2ToThe24s; // (1<<24).. 
extern const fltx4 Four_Origin; // 0 0 0 1 (origin point, like vr0 on the PS2) extern const fltx4 Four_NegativeOnes; // -1 -1 -1 -1 +extern const fltx4 Four_DegToRad; // (float)(M_PI_F / 180.f) times four +extern const fltx4 Four_360; // 360 360 360 360 #else #define Four_Zeros XMVectorZero() // 0 0 0 0 #define Four_Ones XMVectorSplatOne() // 1 1 1 1 @@ -164,6 +166,8 @@ extern const fltx4 Four_2ToThe23s; // (1<<23).. extern const fltx4 Four_2ToThe24s; // (1<<24).. extern const fltx4 Four_Origin; // 0 0 0 1 (origin point, like vr0 on the PS2) extern const fltx4 Four_NegativeOnes; // -1 -1 -1 -1 +extern const fltx4 Four_DegToRad; // (float)(M_PI_F / 180.f) times four +extern const fltx4 Four_360; // 360 360 360 360 #endif extern const fltx4 Four_FLT_MAX; // FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX extern const fltx4 Four_Negative_FLT_MAX; // -FLT_MAX, -FLT_MAX, -FLT_MAX, -FLT_MAX @@ -182,6 +186,8 @@ extern const ALIGN16 uint32 g_SIMD_Low16BitsMask[] ALIGN16_POST; // 0xffff x 4 // to mask out the tail, g_SIMD_SkipTailMask[N & 3] what you want to use for the last iteration. extern const uint32 ALIGN16 g_SIMD_SkipTailMask[4][4] ALIGN16_POST; +extern const int32 ALIGN16 g_SIMD_EveryOtherMask[]; // 0, ~0, 0, ~0 + // Define prefetch macros. // The characteristics of cache and prefetch are completely // different between the different platforms, so you DO NOT @@ -1951,6 +1957,34 @@ FORCEINLINE fltx4 RotateRight2( const fltx4 & a ) return _mm_shuffle_ps( a, a, _MM_SHUFFLE( 1, 0, 3, 2 ) ); } +// a={ a.x, b.x, c.x, d.x } +// combine 4 fltx4s by throwing away 3/4s of the fields +FORCEINLINE fltx4 Compress4SIMD(fltx4 const a, fltx4 const& b, fltx4 const& c, fltx4 const& d) +{ + fltx4 aacc = _mm_shuffle_ps(a, c, MM_SHUFFLE_REV(0, 0, 0, 0)); + fltx4 bbdd = _mm_shuffle_ps(b, d, MM_SHUFFLE_REV(0, 0, 0, 0)); + return MaskedAssign(LoadAlignedSIMD(g_SIMD_EveryOtherMask), bbdd, aacc); +} + +// outa={a.x, a.x, a.y, a.y}, outb = a.z, a.z, a.w, a.w } +FORCEINLINE void ExpandSIMD(fltx4 const& a, fltx4& fl4OutA, fltx4& fl4OutB) +{ + fl4OutA = _mm_shuffle_ps(a, a, MM_SHUFFLE_REV(0, 0, 1, 1)); + fl4OutB = _mm_shuffle_ps(a, a, MM_SHUFFLE_REV(2, 2, 3, 3)); + +} + +// construct a fltx4 from four different scalars, which are assumed to be neither aligned nor contiguous +FORCEINLINE fltx4 LoadGatherSIMD(const float& x, const float& y, const float& z, const float& w) +{ + // load the float into the low word of each vector register (this exploits the unaligned load op) + fltx4 vx = _mm_load_ss(&x); + fltx4 vy = _mm_load_ss(&y); + fltx4 vz = _mm_load_ss(&z); + fltx4 vw = _mm_load_ss(&w); + return Compress4SIMD(vx, vy, vz, vw); +} + FORCEINLINE fltx4 AddSIMD( const fltx4 & a, const fltx4 & b ) // a+b { @@ -1984,49 +2018,74 @@ FORCEINLINE fltx4 MsubSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c ) FORCEINLINE fltx4 Dot3SIMD( const fltx4 &a, const fltx4 &b ) { +#if USE_DXMATH + return DirectX::XMVector3Dot(a, b); +#else fltx4 m = MulSIMD( a, b ); float flDot = SubFloat( m, 0 ) + SubFloat( m, 1 ) + SubFloat( m, 2 ); return ReplicateX4( flDot ); +#endif } FORCEINLINE fltx4 Dot4SIMD( const fltx4 &a, const fltx4 &b ) { +#if USE_DXMATH + return DirectX::XMVector4Dot(a, b); +#else fltx4 m = MulSIMD( a, b ); float flDot = SubFloat( m, 0 ) + SubFloat( m, 1 ) + SubFloat( m, 2 ) + SubFloat( m, 3 ); return ReplicateX4( flDot ); +#endif } -//TODO: implement as four-way Taylor series (see xbox implementation) FORCEINLINE fltx4 SinSIMD( const fltx4 &radians ) { +#if USE_DXMATH + return DirectX::XMVectorSin( radians ); +#else + //TODO: 
implement as four-way Taylor series (see xbox implementation) + // FIXME: Make a fast SSE version fltx4 result; SubFloat( result, 0 ) = sin( SubFloat( radians, 0 ) ); SubFloat( result, 1 ) = sin( SubFloat( radians, 1 ) ); SubFloat( result, 2 ) = sin( SubFloat( radians, 2 ) ); SubFloat( result, 3 ) = sin( SubFloat( radians, 3 ) ); return result; +#endif } FORCEINLINE void SinCos3SIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians ) { +#if USE_DXMATH + DirectX::XMVectorSinCos( &sine, &cosine, radians ); +#else // FIXME: Make a fast SSE version SinCos( SubFloat( radians, 0 ), &SubFloat( sine, 0 ), &SubFloat( cosine, 0 ) ); SinCos( SubFloat( radians, 1 ), &SubFloat( sine, 1 ), &SubFloat( cosine, 1 ) ); SinCos( SubFloat( radians, 2 ), &SubFloat( sine, 2 ), &SubFloat( cosine, 2 ) ); +#endif } FORCEINLINE void SinCosSIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians ) // a*b + c { +#if USE_DXMATH + DirectX::XMVectorSinCos( &sine, &cosine, radians ); +#else // FIXME: Make a fast SSE version SinCos( SubFloat( radians, 0 ), &SubFloat( sine, 0 ), &SubFloat( cosine, 0 ) ); SinCos( SubFloat( radians, 1 ), &SubFloat( sine, 1 ), &SubFloat( cosine, 1 ) ); SinCos( SubFloat( radians, 2 ), &SubFloat( sine, 2 ), &SubFloat( cosine, 2 ) ); SinCos( SubFloat( radians, 3 ), &SubFloat( sine, 3 ), &SubFloat( cosine, 3 ) ); +#endif } -//TODO: implement as four-way Taylor series (see xbox implementation) + FORCEINLINE fltx4 ArcSinSIMD( const fltx4 &sine ) { +#if USE_DXMATH + return DirectX::XMVectorASin( sine ); +#else + //TODO: implement as four-way Taylor series (see xbox implementation) // FIXME: Make a fast SSE version fltx4 result; SubFloat( result, 0 ) = asin( SubFloat( sine, 0 ) ); @@ -2034,27 +2093,36 @@ FORCEINLINE fltx4 ArcSinSIMD( const fltx4 &sine ) SubFloat( result, 2 ) = asin( SubFloat( sine, 2 ) ); SubFloat( result, 3 ) = asin( SubFloat( sine, 3 ) ); return result; +#endif } FORCEINLINE fltx4 ArcCosSIMD( const fltx4 &cs ) { +#if USE_DXMATH + return DirectX::XMVectorACos( cs ); +#else fltx4 result; SubFloat( result, 0 ) = acos( SubFloat( cs, 0 ) ); SubFloat( result, 1 ) = acos( SubFloat( cs, 1 ) ); SubFloat( result, 2 ) = acos( SubFloat( cs, 2 ) ); SubFloat( result, 3 ) = acos( SubFloat( cs, 3 ) ); return result; +#endif } // tan^1(a/b) .. 
ie, pass sin in as a and cos in as b FORCEINLINE fltx4 ArcTan2SIMD( const fltx4 &a, const fltx4 &b ) { +#if USE_DXMATH + return DirectX::XMVectorATan2( a, b ); +#else fltx4 result; SubFloat( result, 0 ) = atan2( SubFloat( a, 0 ), SubFloat( b, 0 ) ); SubFloat( result, 1 ) = atan2( SubFloat( a, 1 ), SubFloat( b, 1 ) ); SubFloat( result, 2 ) = atan2( SubFloat( a, 2 ), SubFloat( b, 2 ) ); SubFloat( result, 3 ) = atan2( SubFloat( a, 3 ), SubFloat( b, 3 ) ); return result; +#endif } FORCEINLINE fltx4 NegSIMD(const fltx4 &a) // negate: -a @@ -2142,16 +2210,20 @@ FORCEINLINE fltx4 MaxSIMD( const fltx4 & a, const fltx4 & b ) // max(a,b) // Round towards positive infinity FORCEINLINE fltx4 CeilSIMD( const fltx4 &a ) { +#if USE_DXMATH + return DirectX::XMVectorCeiling(a); +#else fltx4 retVal; SubFloat( retVal, 0 ) = ceil( SubFloat( a, 0 ) ); SubFloat( retVal, 1 ) = ceil( SubFloat( a, 1 ) ); SubFloat( retVal, 2 ) = ceil( SubFloat( a, 2 ) ); SubFloat( retVal, 3 ) = ceil( SubFloat( a, 3 ) ); return retVal; - +#endif } fltx4 fabs( const fltx4 & x ); + // Round towards negative infinity // This is the implementation that was here before; it assumes // you are in round-to-floor mode, which I guess is usually the @@ -2244,6 +2316,9 @@ FORCEINLINE fltx4 ReciprocalSaturateSIMD( const fltx4 & a ) // 2^x for all values (the antilog) FORCEINLINE fltx4 ExpSIMD( const fltx4 &toPower ) { +#if USE_DXMATH + return DirectX::XMVectorExp(toPower); +#else fltx4 retval; SubFloat( retval, 0 ) = powf( 2, SubFloat(toPower, 0) ); SubFloat( retval, 1 ) = powf( 2, SubFloat(toPower, 1) ); @@ -2251,6 +2326,7 @@ FORCEINLINE fltx4 ExpSIMD( const fltx4 &toPower ) SubFloat( retval, 3 ) = powf( 2, SubFloat(toPower, 3) ); return retval; +#endif } // Clamps the components of a vector to a specified minimum and maximum range. @@ -2354,12 +2430,16 @@ FORCEINLINE void StoreUnalignedIntSIMD( int32 * RESTRICT pSIMD, const fltx4 & a // fixed point conversion is done. FORCEINLINE fltx4 UnsignedIntConvertToFltSIMD( const u32x4 &vSrcA ) { +#if USE_DXMATH + return DirectX::XMConvertVectorUIntToFloat(vSrcA, 0); +#else fltx4 retval; SubFloat( retval, 0 ) = ( (float) SubInt( retval, 0 ) ); SubFloat( retval, 1 ) = ( (float) SubInt( retval, 1 ) ); SubFloat( retval, 2 ) = ( (float) SubInt( retval, 2 ) ); SubFloat( retval, 3 ) = ( (float) SubInt( retval, 3 ) ); return retval; +#endif } @@ -2368,12 +2448,16 @@ FORCEINLINE fltx4 UnsignedIntConvertToFltSIMD( const u32x4 &vSrcA ) // fixed point conversion is done. FORCEINLINE fltx4 SignedIntConvertToFltSIMD( const i32x4 &vSrcA ) { +#if USE_DXMATH + return DirectX::XMConvertVectorIntToFloat(vSrcA, 0); +#else fltx4 retval; SubFloat( retval, 0 ) = ( (float) (reinterpret_cast(&vSrcA)[0])); SubFloat( retval, 1 ) = ( (float) (reinterpret_cast(&vSrcA)[1])); SubFloat( retval, 2 ) = ( (float) (reinterpret_cast(&vSrcA)[2])); SubFloat( retval, 3 ) = ( (float) (reinterpret_cast(&vSrcA)[3])); return retval; +#endif } /* diff --git a/src/public/mathlib/ssequaternion.h b/src/public/mathlib/ssequaternion.h index 825a9e45f..5548fa39a 100644 --- a/src/public/mathlib/ssequaternion.h +++ b/src/public/mathlib/ssequaternion.h @@ -38,9 +38,8 @@ // the SSE2 registers, which lessens this problem a little. // permitted only on 360, as we've done careful tuning on its Altivec math: -#ifdef _X360 -#define ALLOW_SIMD_QUATERNION_MATH 1 // not on PC! 
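For reference, a usage sketch of the DirectXMath-backed wrappers added to ssemath.h above; SinCosDegrees4 is a hypothetical helper, everything it calls comes from this patch (assuming USE_DXMATH is defined).

// Hypothetical helper: evaluate sin/cos of four angles (in degrees) with one SIMD call.
inline void SinCosDegrees4( float a0, float a1, float a2, float a3, fltx4 &s, fltx4 &c )
{
	fltx4 anglesDeg = LoadGatherSIMD( a0, a1, a2, a3 );    // gather four scalars into one fltx4
	fltx4 anglesRad = MulSIMD( anglesDeg, Four_DegToRad ); // constant added in this patch
	SinCosSIMD( s, c, anglesRad );                         // DirectX::XMVectorSinCos when USE_DXMATH
}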
-#endif +// UNDONE: we've enabled SSE2 +#define ALLOW_SIMD_QUATERNION_MATH 1 @@ -48,7 +47,6 @@ // Load/store quaternions //--------------------------------------------------------------------- #ifndef _X360 -#if ALLOW_SIMD_QUATERNION_MATH // Using STDC or SSE FORCEINLINE fltx4 LoadAlignedSIMD( const QuaternionAligned & pSIMD ) { @@ -58,7 +56,7 @@ FORCEINLINE fltx4 LoadAlignedSIMD( const QuaternionAligned & pSIMD ) FORCEINLINE fltx4 LoadAlignedSIMD( const QuaternionAligned * RESTRICT pSIMD ) { - fltx4 retval = LoadAlignedSIMD( pSIMD ); + fltx4 retval = LoadAlignedSIMD( pSIMD->Base() ); return retval; } @@ -66,7 +64,6 @@ FORCEINLINE void StoreAlignedSIMD( QuaternionAligned * RESTRICT pSIMD, const flt { StoreAlignedSIMD( pSIMD->Base(), a ); } -#endif #else // for the transitional class -- load a QuaternionAligned @@ -177,6 +174,10 @@ FORCEINLINE fltx4 QuaternionBlendSIMD( const fltx4 &p, const fltx4 &q, float t ) // SSE and STDC FORCEINLINE fltx4 QuaternionMultSIMD( const fltx4 &p, const fltx4 &q ) { +#if USE_DXMATH + fltx4 q2 = QuaternionAlignSIMD(p, q); + return DirectX::XMQuaternionMultiply(q2, p); +#else // decide if one of the quaternions is backwards fltx4 q2, result; q2 = QuaternionAlignSIMD( p, q ); @@ -185,6 +186,7 @@ FORCEINLINE fltx4 QuaternionMultSIMD( const fltx4 &p, const fltx4 &q ) SubFloat( result, 2 ) = SubFloat( p, 0 ) * SubFloat( q2, 1 ) - SubFloat( p, 1 ) * SubFloat( q2, 0 ) + SubFloat( p, 2 ) * SubFloat( q2, 3 ) + SubFloat( p, 3 ) * SubFloat( q2, 2 ); SubFloat( result, 3 ) = -SubFloat( p, 0 ) * SubFloat( q2, 0 ) - SubFloat( p, 1 ) * SubFloat( q2, 1 ) - SubFloat( p, 2 ) * SubFloat( q2, 2 ) + SubFloat( p, 3 ) * SubFloat( q2, 3 ); return result; +#endif } #else @@ -224,6 +226,36 @@ FORCEINLINE fltx4 QuaternionMultSIMD( const fltx4 &p, const fltx4 &q ) //--------------------------------------------------------------------- #ifndef _X360 +#if USE_DXMATH +// DirectXMath +FORCEINLINE fltx4 QuaternionScaleSIMD(const fltx4& p, float t) +{ + fltx4 sinom = Dot3SIMD(p, p); + sinom = SqrtSIMD(sinom); + sinom = MinSIMD(sinom, Four_Ones); + fltx4 sinsom = ArcSinSIMD(sinom); + fltx4 t4 = ReplicateX4(t); + sinsom = MulSIMD(sinsom, t4); + sinsom = SinSIMD(sinsom); + sinom = AddSIMD(sinom, Four_Epsilons); + sinom = ReciprocalSIMD(sinom); + t4 = MulSIMD(sinsom, sinom); + fltx4 result = MulSIMD(p, t4); + + // rescale rotation + sinsom = MulSIMD(sinsom, sinsom); + fltx4 r = SubSIMD(Four_Ones, sinsom); + r = MaxSIMD(r, Four_Zeros); + r = SqrtSIMD(r); + + // keep sign of rotation + fltx4 cmp = CmpGeSIMD(p, Four_Zeros); + r = MaskedAssign(cmp, r, NegSIMD(r)); + + result = SetWSIMD(result, r); + return result; +} +#else // SSE and STDC FORCEINLINE fltx4 QuaternionScaleSIMD( const fltx4 &p, float t ) { @@ -254,6 +286,7 @@ FORCEINLINE fltx4 QuaternionScaleSIMD( const fltx4 &p, float t ) SubFloat( q, 3 ) = fsel( SubFloat( p, 3 ), r, -r ); return q; } +#endif #else @@ -294,6 +327,13 @@ FORCEINLINE fltx4 QuaternionScaleSIMD( const fltx4 &p, float t ) //----------------------------------------------------------------------------- #ifndef _X360 +#if USE_DXMATH +// DXMath +FORCEINLINE fltx4 QuaternionSlerpNoAlignSIMD(const fltx4& p, const fltx4& q, float t) +{ + return DirectX::XMQuaternionSlerp(p, q, t); +} +#else // SSE and STDC FORCEINLINE fltx4 QuaternionSlerpNoAlignSIMD( const fltx4 &p, const fltx4 &q, float t ) { @@ -340,6 +380,7 @@ FORCEINLINE fltx4 QuaternionSlerpNoAlignSIMD( const fltx4 &p, const fltx4 &q, fl return result; } +#endif #else @@ -360,8 +401,795 @@ FORCEINLINE fltx4 
QuaternionSlerpSIMD( const fltx4 &p, const fltx4 &q, float t ) return result; } - #endif // ALLOW_SIMD_QUATERNION_MATH +/// class FourVectors stores 4 independent vectors for use in SIMD processing. These vectors are +/// stored in the format x x x x y y y y z z z z so that they can be efficiently SIMD-accelerated. +class ALIGN16 FourQuaternions +{ +public: + fltx4 x, y, z, w; + + FourQuaternions(void) + { + } + + FourQuaternions(const fltx4& _x, + const fltx4& _y, + const fltx4& _z, + const fltx4& _w) + : x(_x), y(_y), z(_z), w(_w) + {} + +#if !defined(__SPU__) + // four rotations around the same axis. angles should be in radians. + FourQuaternions(const fltx4& axis, + const float& angle0, const float& angle1, const float& angle2, const float& angle3) + { + FromAxisAndAngles(axis, angle0, angle1, angle2, angle3); + } +#endif + + FourQuaternions(FourQuaternions const& src) + { + x = src.x; + y = src.y; + z = src.z; + w = src.w; + } + + FORCEINLINE void operator=(FourQuaternions const& src) + { + x = src.x; + y = src.y; + z = src.z; + w = src.w; + } + + /// this = this * q; + FORCEINLINE FourQuaternions Mul(FourQuaternions const& q) const; + + /// negate the vector part + FORCEINLINE FourQuaternions Conjugate() const; + + /// for a quaternion representing a rotation of angle theta, return + /// one of angle s*theta + /// scale is four floats -- one for each quat + FORCEINLINE FourQuaternions ScaleAngle(const fltx4& scale) const; + + /// ret = this * ( s * q ) + /// In other words, for a quaternion representing a rotation of angle theta, return + /// one of angle s*theta + /// s is four floats in a fltx4 -- one for each quaternion + FORCEINLINE FourQuaternions MulAc(const fltx4& s, const FourQuaternions& q) const; + + /// ret = ( s * this ) * q + FORCEINLINE FourQuaternions ScaleMul(const fltx4& s, const FourQuaternions& q) const; + + /// Slerp four quaternions at once, FROM me TO the specified out. + FORCEINLINE FourQuaternions Slerp(const FourQuaternions& to, const fltx4& t); + + FORCEINLINE FourQuaternions SlerpNoAlign(const FourQuaternions& originalto, const fltx4& t); + +#if !defined(__SPU__) + /// given an axis and four angles, populate this quaternion with the equivalent rotations + /// (ie, make these four quaternions represent four different rotations around the same axis) + /// angles should be in RADIANS + FORCEINLINE FourQuaternions& FromAxisAndAngles(const fltx4& axis, + const float& angle0, const float& angle1, const float& angle2, const float& angle3); + FORCEINLINE FourQuaternions& FromAxisAndAngles(const fltx4& axis, const fltx4& angles); + // one convenience imp if you're doing this in degrees + FORCEINLINE FourQuaternions& FromAxisAndAnglesInDegrees(const fltx4& axis, const fltx4& angles) + { + return FromAxisAndAngles(axis, MulSIMD(angles, Four_DegToRad)); + } +#endif + + // rotate (in place) a FourVectors by this quaternion. there's a corresponding RotateBy in FourVectors. + FORCEINLINE void RotateFourVectors(FourVectors* RESTRICT vecs) const RESTRICT; + + + /// LoadAndSwizzleAligned - load 4 QuaternionAligneds into a FourQuaternions, performing transpose op. 
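+	/// (the transpose converts the four AoS quaternions in memory into this class's SoA x/y/z/w layout)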
+ /// all 4 vectors must be 128 bit boundary + FORCEINLINE void LoadAndSwizzleAligned(const float* RESTRICT a, const float* RESTRICT b, const float* RESTRICT c, const float* RESTRICT d) + { +#if defined( _X360 ) + fltx4 tx = LoadAlignedSIMD(a); + fltx4 ty = LoadAlignedSIMD(b); + fltx4 tz = LoadAlignedSIMD(c); + fltx4 tw = LoadAlignedSIMD(d); + fltx4 r0 = __vmrghw(tx, tz); + fltx4 r1 = __vmrghw(ty, tw); + fltx4 r2 = __vmrglw(tx, tz); + fltx4 r3 = __vmrglw(ty, tw); + + x = __vmrghw(r0, r1); + y = __vmrglw(r0, r1); + z = __vmrghw(r2, r3); + w = __vmrglw(r2, r3); +#else + x = LoadAlignedSIMD(a); + y = LoadAlignedSIMD(b); + z = LoadAlignedSIMD(c); + w = LoadAlignedSIMD(d); + // now, matrix is: + // x y z w + // x y z w + // x y z w + // x y z w + TransposeSIMD(x, y, z, w); +#endif + } + + FORCEINLINE void LoadAndSwizzleAligned(const QuaternionAligned* RESTRICT a, + const QuaternionAligned* RESTRICT b, + const QuaternionAligned* RESTRICT c, + const QuaternionAligned* RESTRICT d) + { + LoadAndSwizzleAligned(a->Base(), b->Base(), c->Base(), d->Base()); + } + + + /// LoadAndSwizzleAligned - load 4 consecutive QuaternionAligneds into a FourQuaternions, + /// performing transpose op. + /// all 4 vectors must be 128 bit boundary + FORCEINLINE void LoadAndSwizzleAligned(const QuaternionAligned* qs) + { +#if defined( _X360 ) + fltx4 tx = LoadAlignedSIMD(qs++); + fltx4 ty = LoadAlignedSIMD(qs++); + fltx4 tz = LoadAlignedSIMD(qs++); + fltx4 tw = LoadAlignedSIMD(qs); + fltx4 r0 = __vmrghw(tx, tz); + fltx4 r1 = __vmrghw(ty, tw); + fltx4 r2 = __vmrglw(tx, tz); + fltx4 r3 = __vmrglw(ty, tw); + + x = __vmrghw(r0, r1); + y = __vmrglw(r0, r1); + z = __vmrghw(r2, r3); + w = __vmrglw(r2, r3); +#else + x = LoadAlignedSIMD(qs++); + y = LoadAlignedSIMD(qs++); + z = LoadAlignedSIMD(qs++); + w = LoadAlignedSIMD(qs++); + // now, matrix is: + // x y z w + // x y z w + // x y z w + // x y z w + TransposeSIMD(x, y, z, w); +#endif + } + + // Store the FourQuaternions out to four nonconsecutive ordinary quaternions in memory. + FORCEINLINE void SwizzleAndStoreAligned(QuaternionAligned* a, QuaternionAligned* b, QuaternionAligned* c, QuaternionAligned* d) + { +#if defined( _X360 ) + fltx4 r0 = __vmrghw(x, z); + fltx4 r1 = __vmrghw(y, w); + fltx4 r2 = __vmrglw(x, z); + fltx4 r3 = __vmrglw(y, w); + + fltx4 rx = __vmrghw(r0, r1); + fltx4 ry = __vmrglw(r0, r1); + fltx4 rz = __vmrghw(r2, r3); + fltx4 rw = __vmrglw(r2, r3); + + StoreAlignedSIMD(a, rx); + StoreAlignedSIMD(b, ry); + StoreAlignedSIMD(c, rz); + StoreAlignedSIMD(d, rw); +#else + fltx4 dupes[4] = { x, y, z, w }; + TransposeSIMD(dupes[0], dupes[1], dupes[2], dupes[3]); + StoreAlignedSIMD(a, dupes[0]); + StoreAlignedSIMD(b, dupes[1]); + StoreAlignedSIMD(c, dupes[2]); + StoreAlignedSIMD(d, dupes[3]); +#endif + } + + // Store the FourQuaternions out to four consecutive ordinary quaternions in memory. + FORCEINLINE void SwizzleAndStoreAligned(QuaternionAligned* qs) + { +#if defined( _X360 ) + fltx4 r0 = __vmrghw(x, z); + fltx4 r1 = __vmrghw(y, w); + fltx4 r2 = __vmrglw(x, z); + fltx4 r3 = __vmrglw(y, w); + + fltx4 rx = __vmrghw(r0, r1); + fltx4 ry = __vmrglw(r0, r1); + fltx4 rz = __vmrghw(r2, r3); + fltx4 rw = __vmrglw(r2, r3); + + StoreAlignedSIMD(qs, rx); + StoreAlignedSIMD(++qs, ry); + StoreAlignedSIMD(++qs, rz); + StoreAlignedSIMD(++qs, rw); +#else + SwizzleAndStoreAligned(qs, qs + 1, qs + 2, qs + 3); +#endif + } + + // Store the FourQuaternions out to four consecutive ordinary quaternions in memory. 
+ // The mask specifies which of the quaternions are actually written out -- each + // word in the fltx4 should be all binary ones or zeros. Ones means the corresponding + // quat will be written. + FORCEINLINE void SwizzleAndStoreAlignedMasked(QuaternionAligned* RESTRICT qs, const fltx4& controlMask) + { + fltx4 originals[4]; + originals[0] = LoadAlignedSIMD(qs); + originals[1] = LoadAlignedSIMD(qs + 1); + originals[2] = LoadAlignedSIMD(qs + 2); + originals[3] = LoadAlignedSIMD(qs + 3); + + fltx4 masks[4] = { SplatXSIMD(controlMask), + SplatYSIMD(controlMask), + SplatZSIMD(controlMask), + SplatWSIMD(controlMask) }; + +#if defined( _X360 ) + fltx4 r0 = __vmrghw(x, z); + fltx4 r1 = __vmrghw(y, w); + fltx4 r2 = __vmrglw(x, z); + fltx4 r3 = __vmrglw(y, w); + + fltx4 rx = __vmrghw(r0, r1); + fltx4 ry = __vmrglw(r0, r1); + fltx4 rz = __vmrghw(r2, r3); + fltx4 rw = __vmrglw(r2, r3); +#else + fltx4 rx = x; + fltx4 ry = y; + fltx4 rz = z; + fltx4 rw = w; + TransposeSIMD(rx, ry, rz, rw); +#endif + + StoreAlignedSIMD(qs + 0, MaskedAssign(masks[0], rx, originals[0])); + StoreAlignedSIMD(qs + 1, MaskedAssign(masks[1], ry, originals[1])); + StoreAlignedSIMD(qs + 2, MaskedAssign(masks[2], rz, originals[2])); + StoreAlignedSIMD(qs + 3, MaskedAssign(masks[3], rw, originals[3])); + } +}; + + + +FORCEINLINE FourQuaternions FourQuaternions::Conjugate() const +{ + return FourQuaternions(NegSIMD(x), NegSIMD(y), NegSIMD(z), w); +} + +FORCEINLINE const fltx4 Dot(const FourQuaternions& a, const FourQuaternions& b) +{ + return + MaddSIMD(a.x, b.x, + MaddSIMD(a.y, b.y, + MaddSIMD(a.z, b.z, MulSIMD(a.w, b.w)) + ) + ); +} + + +FORCEINLINE const FourQuaternions Madd(const FourQuaternions& a, const fltx4& scale, const FourQuaternions& c) +{ + FourQuaternions ret; + ret.x = MaddSIMD(a.x, scale, c.x); + ret.y = MaddSIMD(a.y, scale, c.y); + ret.z = MaddSIMD(a.z, scale, c.z); + ret.w = MaddSIMD(a.w, scale, c.w); + return ret; +} + +FORCEINLINE const FourQuaternions Mul(const FourQuaternions& a, const fltx4& scale) +{ + FourQuaternions ret; + ret.x = MulSIMD(a.x, scale); + ret.y = MulSIMD(a.y, scale); + ret.z = MulSIMD(a.z, scale); + ret.w = MulSIMD(a.w, scale); + return ret; +} + +FORCEINLINE const FourQuaternions Add(const FourQuaternions& a, const FourQuaternions& b) +{ + FourQuaternions ret; + ret.x = AddSIMD(a.x, b.x); + ret.y = AddSIMD(a.y, b.y); + ret.z = AddSIMD(a.z, b.z); + ret.w = AddSIMD(a.w, b.w); + return ret; +} + +FORCEINLINE const FourQuaternions Sub(const FourQuaternions& a, const FourQuaternions& b) +{ + FourQuaternions ret; + ret.x = SubSIMD(a.x, b.x); + ret.y = SubSIMD(a.y, b.y); + ret.z = SubSIMD(a.z, b.z); + ret.w = SubSIMD(a.w, b.w); + return ret; +} + +FORCEINLINE const FourQuaternions Neg(const FourQuaternions& q) +{ + FourQuaternions ret; + ret.x = NegSIMD(q.x); + ret.y = NegSIMD(q.y); + ret.z = NegSIMD(q.z); + ret.w = NegSIMD(q.w); + return ret; +} + +FORCEINLINE const FourQuaternions MaskedAssign(const fltx4& mask, const FourQuaternions& a, const FourQuaternions& b) +{ + FourQuaternions ret; + ret.x = MaskedAssign(mask, a.x, b.x); + ret.y = MaskedAssign(mask, a.y, b.y); + ret.z = MaskedAssign(mask, a.z, b.z); + ret.w = MaskedAssign(mask, a.w, b.w); + return ret; +} + +#ifdef DIFFERENT_NATIVE_VECTOR_TYPES +FORCEINLINE const FourQuaternions MaskedAssign(const fltx4& mask, const FourQuaternions& a, const FourQuaternions& b) +{ + return MaskedAssign((bi32x4)mask, a, b); +} +#endif + + +FORCEINLINE FourQuaternions QuaternionAlign(const FourQuaternions& p, const FourQuaternions& q) +{ + // decide if 
one of the quaternions is backwards + fltx4 cmp = CmpLtSIMD(Dot(p, q), Four_Zeros); + return MaskedAssign(cmp, Neg(q), q); +} + + +FORCEINLINE const FourQuaternions QuaternionNormalize(const FourQuaternions& q) +{ + fltx4 radius = Dot(q, q); + fltx4 mask = CmpEqSIMD(radius, Four_Zeros); // all ones iff radius = 0 + fltx4 invRadius = ReciprocalSqrtSIMD(radius); + + FourQuaternions ret = MaskedAssign(mask, q, Mul(q, invRadius)); + return ret; +} + + +#if !defined(__SPU__) +FORCEINLINE FourQuaternions& FourQuaternions::FromAxisAndAngles(const fltx4& axis, + const float& angle0, const float& angle1, const float& angle2, const float& angle3) +{ + return FromAxisAndAngles(axis, LoadGatherSIMD(angle0, angle1, angle2, angle3)); +} + +FORCEINLINE FourQuaternions& FourQuaternions::FromAxisAndAngles(const fltx4& axis, + const fltx4& angles) +{ + // compute the half theta + fltx4 theta = MulSIMD(angles, Four_PointFives); + // compute the sine and cosine of each angle simultaneously + fltx4 vsines; fltx4 vcoses; + SinCosSIMD(vsines, vcoses, theta); + // now the sines and coses vectors contain the results for four angles. + // for each of the angles, splat them out and then swizzle together so + // as to get a < cos, sin, sin, sin > coefficient vector + + x = MulSIMD(vsines, SplatXSIMD(axis)); // sin(t0) * x, sin(t1) * x, etc + y = MulSIMD(vsines, SplatYSIMD(axis)); + z = MulSIMD(vsines, SplatZSIMD(axis)); + w = vcoses; + + + return *this; +} +#endif + + +/// this = this * q; +FORCEINLINE FourQuaternions FourQuaternions::Mul(FourQuaternions const& q) const +{ + // W = w1w2 - x1x2 - y1y2 - z1z2 + FourQuaternions ret; + fltx4 signMask = LoadAlignedSIMD((float*)g_SIMD_signmask); + // as we do the multiplication, also do a dot product, so we know whether + // one of the quats is backwards and if we therefore have to negate at the end + fltx4 dotProduct = MulSIMD(w, q.w); + + ret.w = MulSIMD(w, q.w); // W = w1w2 + ret.x = MulSIMD(w, q.x); // X = w1x2 + ret.y = MulSIMD(w, q.y); // Y = w1y2 + ret.z = MulSIMD(w, q.z); // Z = w1z2 + + dotProduct = MaddSIMD(x, q.x, dotProduct); + ret.w = MsubSIMD(x, q.x, ret.w); // W = w1w2 - x1x2 + ret.x = MaddSIMD(x, q.w, ret.x); // X = w1x2 + x1w2 + ret.y = MsubSIMD(x, q.z, ret.y); // Y = w1y2 - x1z2 + ret.z = MaddSIMD(x, q.y, ret.z); // Z = w1z2 + x1y2 + + dotProduct = MaddSIMD(y, q.y, dotProduct); + ret.w = MsubSIMD(y, q.y, ret.w); // W = w1w2 - x1x2 - y1y2 + ret.x = MaddSIMD(y, q.z, ret.x); // X = w1x2 + x1w2 + y1z2 + ret.y = MaddSIMD(y, q.w, ret.y); // Y = w1y2 - x1z2 + y1w2 + ret.z = MsubSIMD(y, q.x, ret.z); // Z = w1z2 + x1y2 - y1x2 + + dotProduct = MaddSIMD(z, q.z, dotProduct); + ret.w = MsubSIMD(z, q.z, ret.w); // W = w1w2 - x1x2 - y1y2 - z1z2 + ret.x = MsubSIMD(z, q.y, ret.x); // X = w1x2 + x1w2 + y1z2 - z1y2 + ret.y = MaddSIMD(z, q.x, ret.y); // Y = w1y2 - x1z2 + y1w2 + z1x2 + ret.z = MaddSIMD(z, q.w, ret.z); // Z = w1z2 + x1y2 - y1x2 + z1w2 + + fltx4 Zero = Four_Zeros; + fltx4 control = CmpLtSIMD(dotProduct, Four_Zeros); + signMask = MaskedAssign(control, signMask, Zero); // negate quats where q1.q2 < 0 + ret.w = XorSIMD(signMask, ret.w); + ret.x = XorSIMD(signMask, ret.x); + ret.y = XorSIMD(signMask, ret.y); + ret.z = XorSIMD(signMask, ret.z); + + return ret; +} + + +FORCEINLINE void FourQuaternions::RotateFourVectors(FourVectors* RESTRICT vecs) const RESTRICT +{ + fltx4 tmpX, tmpY, tmpZ, tmpW; + fltx4 outX, outY, outZ; + + tmpX = SubSIMD(MaddSIMD(w, vecs->x, MulSIMD(y, vecs->z)), + MulSIMD(z, vecs->y)); + + tmpY = SubSIMD(MaddSIMD(w, vecs->y, MulSIMD(z, vecs->x)), 
+ MulSIMD(x, vecs->z)); + + tmpZ = SubSIMD(MaddSIMD(w, vecs->z, MulSIMD(x, vecs->y)), + MulSIMD(y, vecs->x)); + + tmpW = AddSIMD(MaddSIMD(x, vecs->x, MulSIMD(y, vecs->y)), + MulSIMD(z, vecs->z)); + + + outX = AddSIMD(SubSIMD(MaddSIMD(tmpW, x, MulSIMD(tmpX, w)), + MulSIMD(tmpY, z)), + MulSIMD(tmpZ, y)); + + outY = AddSIMD(SubSIMD(MaddSIMD(tmpW, y, MulSIMD(tmpY, w)), + MulSIMD(tmpZ, x)), + MulSIMD(tmpX, z)); + + outZ = AddSIMD(SubSIMD(MaddSIMD(tmpW, z, MulSIMD(tmpZ, w)), + MulSIMD(tmpX, y)), + MulSIMD(tmpY, x)); + + // although apparently redundant, assigning the results to intermediate local variables + // seems to improve code scheduling slightly in SN. + vecs->x = outX; + vecs->y = outY; + vecs->z = outZ; +} + + +/* +void QuaternionScale( const Quaternion &p, float t, Quaternion &q ) +{ + Assert( s_bMathlibInitialized ); + float r; + // FIXME: nick, this isn't overly sensitive to accuracy, and it may be faster to + // use the cos part (w) of the quaternion (sin(omega)*N,cos(omega)) to figure the new scale. + float sinom = sqrt( DotProduct( &p.x, &p.x ) ); + sinom = min( sinom, 1.f ); + float sinsom = sin( asin( sinom ) * t ); + t = sinsom / (sinom + FLT_EPSILON); + VectorScale( &p.x, t, &q.x ); + // rescale rotation + r = 1.0f - sinsom * sinsom; + // Assert( r >= 0 ); + if (r < 0.0f) + r = 0.0f; + r = sqrt( r ); + // keep sign of rotation + if (p.w < 0) + q.w = -r; + else + q.w = r; + Assert( q.IsValid() ); + return; +} +*/ + +FORCEINLINE FourQuaternions FourQuaternions::ScaleAngle(const fltx4& scale) const +{ + FourQuaternions ret; + static const fltx4 OneMinusEpsilon = { 1.0f - 0.000001f, 1.0f - 0.000001f, 1.0f - 0.000001f, 1.0f - 0.000001f }; + const fltx4 Zero = Four_Zeros; + fltx4 signMask = LoadAlignedSIMD((float*)g_SIMD_signmask); + // work out if there are any tiny scales or angles, which are unstable + fltx4 tinyAngles = CmpGtSIMD(w, OneMinusEpsilon); + fltx4 negativeRotations = CmpLtSIMD(w, Zero); // if any w's are <0, we will need to negate later down + + // figure out the theta + fltx4 angles = ArcCosSIMD(w); + + // test also if w > -1 + fltx4 negativeWs = XorSIMD(signMask, w); + tinyAngles = OrSIMD(CmpGtSIMD(negativeWs, OneMinusEpsilon), tinyAngles); + + // meanwhile start working on computing the dot product of the + // vector component, and trust in the scheduler to interleave them + fltx4 vLenSq = MulSIMD(x, x); + vLenSq = MaddSIMD(y, y, vLenSq); + vLenSq = MaddSIMD(z, z, vLenSq); + + // scale the angles + angles = MulSIMD(angles, scale); + + // clear out the sign mask where w>=0 + signMask = MaskedAssign(negativeRotations, signMask, Zero); + + // work out the new w component and vector length + fltx4 vLenRecip = ReciprocalSqrtSIMD(vLenSq); // interleave with Cos to hide latencies + fltx4 sine; + SinCosSIMD(sine, ret.w, angles); + ret.x = MulSIMD(x, vLenRecip); // renormalize so the vector length + w = 1 + ret.y = MulSIMD(y, vLenRecip); // renormalize so the vector length + w = 1 + ret.z = MulSIMD(z, vLenRecip); // renormalize so the vector length + w = 1 + ret.x = MulSIMD(ret.x, sine); + ret.y = MulSIMD(ret.y, sine); + ret.z = MulSIMD(ret.z, sine); + + // negate where necessary + ret.x = XorSIMD(ret.x, signMask); + ret.y = XorSIMD(ret.y, signMask); + ret.z = XorSIMD(ret.z, signMask); + ret.w = XorSIMD(ret.w, signMask); + + // finally, toss results from where cos(theta) is close to 1 -- these are non rotations. 
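+	// (those lanes have a near-zero vector part, so vLenRecip above can be huge or NaN;
+	//  passing the source quaternion through unchanged is the safe result, since the
+	//  rotation is already near identity)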
+ ret.x = MaskedAssign(tinyAngles, x, ret.x); + ret.y = MaskedAssign(tinyAngles, y, ret.y); + ret.z = MaskedAssign(tinyAngles, z, ret.z); + ret.w = MaskedAssign(tinyAngles, w, ret.w); + + return ret; +} + +//----------------------------------------------------------------------------- +// Purpose: return = this * ( s * q ) +// In other words, for a quaternion representing a rotation of angle theta, return +// one of angle s*theta +// s is four floats in a fltx4 -- one for each quaternion +//----------------------------------------------------------------------------- + +FORCEINLINE FourQuaternions FourQuaternions::MulAc(const fltx4& s, const FourQuaternions& q) const +{ + /* + void QuaternionMA( const Quaternion &p, float s, const Quaternion &q, Quaternion &qt ) + { + Quaternion p1, q1; + QuaternionScale( q, s, q1 ); + QuaternionMult( p, q1, p1 ); + QuaternionNormalize( p1 ); + qt[0] = p1[0]; + qt[1] = p1[1]; + qt[2] = p1[2]; + qt[3] = p1[3]; + } + */ + + return Mul(q.ScaleAngle(s)); +} + + +FORCEINLINE FourQuaternions FourQuaternions::ScaleMul(const fltx4& s, const FourQuaternions& q) const +{ + return ScaleAngle(s).Mul(q); +} + + +FORCEINLINE FourQuaternions FourQuaternions::Slerp(const FourQuaternions& originalto, const fltx4& t) +{ + FourQuaternions ret; + static const fltx4 OneMinusEpsilon = { 1.0f - 0.000001f, 1.0f - 0.000001f, 1.0f - 0.000001f, 1.0f - 0.000001f }; + + // align if necessary. + + // actually, before we even do that, start by computing the dot product of + // the quaternions. it has lots of dependent ops and we can sneak it into + // the pipeline bubbles as we figure out alignment. Of course we don't know + // yet if we need to realign, so compute them both -- there's plenty of + // space in the bubbles. They're roomy, those bubbles. + fltx4 cosineOmega; +#if 0 // Maybe I don't need to do alignment seperately, using the xb360 technique... + FourQuaternions to; + { + fltx4 diffs[4], sums[4], originalToNeg[4]; + fltx4 dotIfAligned, dotIfNotAligned; + + // compute negations of the TO quaternion. 
+ originalToNeg[0] = NegSIMD(originalto.x); + originalToNeg[1] = NegSIMD(originalto.y); + originalToNeg[2] = NegSIMD(originalto.z); + originalToNeg[3] = NegSIMD(originalto.w); + + dotIfAligned = MulSIMD(x, originalto.x); + dotIfNotAligned = MulSIMD(x, originalToNeg[0]); + + diffs[0] = SubSIMD(x, originalto.x); + diffs[1] = SubSIMD(y, originalto.y); + diffs[2] = SubSIMD(z, originalto.z); + diffs[3] = SubSIMD(w, originalto.w); + + sums[0] = AddSIMD(x, originalto.x); + sums[1] = AddSIMD(y, originalto.y); + sums[2] = AddSIMD(z, originalto.z); + sums[3] = AddSIMD(w, originalto.w); + + dotIfAligned = MaddSIMD(y, originalto.y, dotIfAligned); + dotIfNotAligned = MaddSIMD(y, originalToNeg[1], dotIfNotAligned); + + fltx4 diffsDot, sumsDot; + + diffsDot = MulSIMD(diffs[0], diffs[0]); // x^2 + sumsDot = MulSIMD(sums[0], sums[0]); // x^2 + // do some work on the dot products while letting the multiplies cook + dotIfAligned = MaddSIMD(z, originalto.z, dotIfAligned); + dotIfNotAligned = MaddSIMD(z, originalToNeg[2], dotIfNotAligned); + + diffsDot = MaddSIMD(diffs[1], diffs[1], diffsDot); // x^2 + y^2 + sumsDot = MaddSIMD(sums[1], sums[1], sumsDot); + diffsDot = MaddSIMD(diffs[2], diffs[2], diffsDot); // x^2 + y^2 + z^2 + sumsDot = MaddSIMD(sums[2], sums[2], sumsDot); + diffsDot = MaddSIMD(diffs[3], diffs[3], diffsDot); // x^2 + y^2 + z^2 + w^2 + sumsDot = MaddSIMD(sums[3], sums[3], sumsDot); + // do some work on the dot products while letting the multiplies cook + dotIfAligned = MaddSIMD(w, originalto.w, dotIfAligned); + dotIfNotAligned = MaddSIMD(w, originalToNeg[3], dotIfNotAligned); + + // are the differences greater than the sums? + // if so, we need to negate that quaternion + fltx4 mask = CmpGtSIMD(diffsDot, sumsDot); // 1 for diffs>0 and 0 elsewhere + to.x = MaskedAssign(mask, originalToNeg[0], originalto.x); + to.y = MaskedAssign(mask, originalToNeg[1], originalto.y); + to.z = MaskedAssign(mask, originalToNeg[2], originalto.z); + to.w = MaskedAssign(mask, originalToNeg[3], originalto.w); + + cosineOmega = MaskedAssign(mask, dotIfNotAligned, dotIfAligned); + } + + // right, now to is aligned to be the short way round, and we computed + // the dot product while we were figuring all that out. +#else + const FourQuaternions& to = originalto; + cosineOmega = MulSIMD(x, to.x); + cosineOmega = MaddSIMD(y, to.y, cosineOmega); + cosineOmega = MaddSIMD(z, to.z, cosineOmega); + cosineOmega = MaddSIMD(w, to.w, cosineOmega); +#endif + + fltx4 Zero = Four_Zeros; + fltx4 cosOmegaLessThanZero = CmpLtSIMD(cosineOmega, Zero); + // fltx4 shouldNegate = MaskedAssign(cosOmegaLessThanZero, Four_NegativeOnes , Four_Ones ); + fltx4 signMask = LoadAlignedSIMD((float*)g_SIMD_signmask); // contains a one in the sign bit -- xor against a number to negate it + fltx4 sinOmega = Four_Ones; + + // negate cosineOmega where necessary + cosineOmega = MaskedAssign(cosOmegaLessThanZero, XorSIMD(cosineOmega, signMask), cosineOmega); + fltx4 oneMinusT = SubSIMD(Four_Ones, t); + fltx4 bCosOmegaLessThanOne = CmpLtSIMD(cosineOmega, OneMinusEpsilon); // we'll use this to mask out null slerps + + // figure out the sin component of the diff quaternion. + // since sin^2(t) + cos^2(t) = 1... 
+ sinOmega = MsubSIMD(cosineOmega, cosineOmega, sinOmega); // = 1 - cos^2(t) = sin^2(t) + fltx4 invSinOmega = ReciprocalSqrtSIMD(sinOmega); // 1/sin(t) + sinOmega = MulSIMD(sinOmega, invSinOmega); // = sin^2(t) / sin(t) = sin(t) + + // use the arctangent technique to work out omega from tan^-1(sin/cos) + fltx4 omega = ArcTan2SIMD(sinOmega, cosineOmega); + + // alpha = sin(omega * (1-T))/sin(omega) + // beta = sin(omega * T)/sin(omega) + fltx4 alpha = MulSIMD(omega, oneMinusT); // w(1-T) + fltx4 beta = MulSIMD(omega, t); // w(T) + signMask = MaskedAssign(cosOmegaLessThanZero, signMask, Zero); + + alpha = SinSIMD(alpha); // sin(w(1-T)) + beta = SinSIMD(beta); // sin(wT) + + alpha = MulSIMD(alpha, invSinOmega); + beta = MulSIMD(beta, invSinOmega); + + // depending on whether the dot product was less than zero, negate beta, or not + beta = XorSIMD(beta, signMask); + + // mask out singularities (where omega = 1) + alpha = MaskedAssign(bCosOmegaLessThanOne, alpha, oneMinusT); + beta = MaskedAssign(bCosOmegaLessThanOne, beta, t); + + ret.x = MulSIMD(x, alpha); + ret.y = MulSIMD(y, alpha); + ret.z = MulSIMD(z, alpha); + ret.w = MulSIMD(w, alpha); + + ret.x = MaddSIMD(to.x, beta, ret.x); + ret.y = MaddSIMD(to.y, beta, ret.y); + ret.z = MaddSIMD(to.z, beta, ret.z); + ret.w = MaddSIMD(to.w, beta, ret.w); + + return ret; +} + + + +FORCEINLINE FourQuaternions FourQuaternions::SlerpNoAlign(const FourQuaternions& originalto, const fltx4& t) +{ + FourQuaternions ret; + static const fltx4 OneMinusEpsilon = { 1.0f - 0.000001f, 1.0f - 0.000001f, 1.0f - 0.000001f, 1.0f - 0.000001f }; + + // align if necessary. + + // actually, before we even do that, start by computing the dot product of + // the quaternions. it has lots of dependent ops and we can sneak it into + // the pipeline bubbles as we figure out alignment. Of course we don't know + // yet if we need to realign, so compute them both -- there's plenty of + // space in the bubbles. They're roomy, those bubbles. + fltx4 cosineOmega; + + const FourQuaternions& to = originalto; + cosineOmega = MulSIMD(x, to.x); + cosineOmega = MaddSIMD(y, to.y, cosineOmega); + cosineOmega = MaddSIMD(z, to.z, cosineOmega); + cosineOmega = MaddSIMD(w, to.w, cosineOmega); + + fltx4 sinOmega = Four_Ones; + + fltx4 oneMinusT = SubSIMD(Four_Ones, t); + fltx4 bCosOmegaLessThanOne = CmpLtSIMD(cosineOmega, OneMinusEpsilon); // we'll use this to mask out null slerps + + // figure out the sin component of the diff quaternion. + // since sin^2(t) + cos^2(t) = 1... 
+ sinOmega = MsubSIMD(cosineOmega, cosineOmega, sinOmega); // = 1 - cos^2(t) = sin^2(t) + fltx4 invSinOmega = ReciprocalSqrtSIMD(sinOmega); // 1/sin(t) + sinOmega = MulSIMD(sinOmega, invSinOmega); // = sin^2(t) / sin(t) = sin(t) + + // use the arctangent technique to work out omega from tan^-1(sin/cos) + fltx4 omega = ArcTan2SIMD(sinOmega, cosineOmega); + + // alpha = sin(omega * (1-T))/sin(omega) + // beta = sin(omega * T)/sin(omega) + fltx4 alpha = MulSIMD(omega, oneMinusT); // w(1-T) + fltx4 beta = MulSIMD(omega, t); // w(T) + alpha = SinSIMD(alpha); // sin(w(1-T)) + beta = SinSIMD(beta); // sin(wT) + alpha = MulSIMD(alpha, invSinOmega); + beta = MulSIMD(beta, invSinOmega); + + // mask out singularities (where omega = 1) + alpha = MaskedAssign(bCosOmegaLessThanOne, alpha, oneMinusT); + beta = MaskedAssign(bCosOmegaLessThanOne, beta, t); + + ret.x = MulSIMD(x, alpha); + ret.y = MulSIMD(y, alpha); + ret.z = MulSIMD(z, alpha); + ret.w = MulSIMD(w, alpha); + + ret.x = MaddSIMD(to.x, beta, ret.x); + ret.y = MaddSIMD(to.y, beta, ret.y); + ret.z = MaddSIMD(to.z, beta, ret.z); + ret.w = MaddSIMD(to.w, beta, ret.w); + + return ret; +} + +/***** removed because one of the SWIG permutations doesn't include ssequaternion.h, causing a missing symbol on this function: +inline void FourVectors::RotateBy( const FourQuaternions &quats ) +{ + quats.RotateFourVectors( this ); +} +*/ + #endif // SSEQUATMATH_H diff --git a/src/public/mathlib/vector.h b/src/public/mathlib/vector.h index c7654ba83..7ec2a0469 100644 --- a/src/public/mathlib/vector.h +++ b/src/public/mathlib/vector.h @@ -2177,55 +2177,26 @@ inline void AngularImpulseToQAngle( const AngularImpulse &impulse, QAngle &angle FORCEINLINE vec_t InvRSquared( float const *v ) { -#if defined(__i386__) || defined(_M_IX86) - float sqrlen = v[0]*v[0]+v[1]*v[1]+v[2]*v[2] + 1.0e-10f, result; - _mm_store_ss(&result, _mm_rcp_ss( _mm_max_ss( _mm_set_ss(1.0f), _mm_load_ss(&sqrlen) ) )); - return result; -#else - return 1.f/fpmax(1.f, v[0]*v[0]+v[1]*v[1]+v[2]*v[2]); -#endif + // The compiler will make it good + return 1.f / (v[0]*v[0]+v[1]*v[1]+v[2]*v[2] + 1.0e-10f); } FORCEINLINE vec_t InvRSquared( const Vector &v ) { - return InvRSquared(&v.x); -} - -#if defined(__i386__) || defined(_M_IX86) -inline void _SSE_RSqrtInline( float a, float* out ) -{ - __m128 xx = _mm_load_ss( &a ); - __m128 xr = _mm_rsqrt_ss( xx ); - __m128 xt; - xt = _mm_mul_ss( xr, xr ); - xt = _mm_mul_ss( xt, xx ); - xt = _mm_sub_ss( _mm_set_ss(3.f), xt ); - xt = _mm_mul_ss( xt, _mm_set_ss(0.5f) ); - xr = _mm_mul_ss( xr, xt ); - _mm_store_ss( out, xr ); + // The compiler will make it good + return 1.0f / (v.x*v.x + v.y*v.y + v.z*v.z + 1.0e-10f); } -#endif // FIXME: Change this back to a #define once we get rid of the vec_t version FORCEINLINE float VectorNormalize( Vector& vec ) { -#ifndef DEBUG // stop crashing my edit-and-continue! 
- #if defined(__i386__) || defined(_M_IX86) - #define DO_SSE_OPTIMIZATION - #endif -#endif - -#if defined( DO_SSE_OPTIMIZATION ) - float sqrlen = vec.LengthSqr() + 1.0e-10f, invlen; - _SSE_RSqrtInline(sqrlen, &invlen); + // The compiler will make it good + const float len = sqrtf(vec.x*vec.x + vec.y*vec.y + vec.z*vec.z + 1.0e-10f); + const float invlen = 1.0f / len; vec.x *= invlen; vec.y *= invlen; vec.z *= invlen; - return sqrlen * invlen; -#else - extern float (FASTCALL *pfVectorNormalize)(Vector& v); - return (*pfVectorNormalize)(vec); -#endif + return len; } // FIXME: Obsolete version of VectorNormalize, once we remove all the friggin float*s @@ -2236,7 +2207,11 @@ FORCEINLINE float VectorNormalize( float * v ) FORCEINLINE void VectorNormalizeFast( Vector &vec ) { - VectorNormalize(vec); + // The previous version just called VectorNormalize but it's significant to be able to do a rsqrtss here. + const float invlen = 1.0f / sqrtf(vec.x*vec.x + vec.y*vec.y + vec.z*vec.z + 1.0e-10f); + vec.x *= invlen; + vec.y *= invlen; + vec.z *= invlen; } #else @@ -2308,4 +2283,3 @@ inline bool Vector::IsLengthLessThan( float val ) const } #endif - diff --git a/src/public/mathlib/vector4d.h b/src/public/mathlib/vector4d.h index 2b20c8823..812348910 100644 --- a/src/public/mathlib/vector4d.h +++ b/src/public/mathlib/vector4d.h @@ -635,20 +635,17 @@ inline void Vector4DMultiplyAligned( Vector4DAligned const& a, Vector4DAligned c #endif } -inline void Vector4DWeightMAD( vec_t w, Vector4DAligned const& vInA, Vector4DAligned& vOutA, Vector4DAligned const& vInB, Vector4DAligned& vOutB ) +inline void Vector4DWeightMADSSE( vec_t w, Vector4DAligned const& vInA, Vector4DAligned& vOutA, Vector4DAligned const& vInB, Vector4DAligned& vOutB ) { Assert( vInA.IsValid() && vInB.IsValid() && IsFinite(w) ); #if !defined( _X360 ) - vOutA.x += vInA.x * w; - vOutA.y += vInA.y * w; - vOutA.z += vInA.z * w; - vOutA.w += vInA.w * w; + // Replicate scalar float out to 4 components + __m128 packed = _mm_set_ps1( w ); - vOutB.x += vInB.x * w; - vOutB.y += vInB.y * w; - vOutB.z += vInB.z * w; - vOutB.w += vInB.w * w; + // 4D SSE Vector MAD + vOutA.AsM128() = _mm_add_ps( vOutA.AsM128(), _mm_mul_ps( vInA.AsM128(), packed ) ); + vOutB.AsM128() = _mm_add_ps( vOutB.AsM128(), _mm_mul_ps( vInB.AsM128(), packed ) ); #else __vector4 temp; @@ -660,17 +657,24 @@ inline void Vector4DWeightMAD( vec_t w, Vector4DAligned const& vInA, Vector4DAli #endif } -inline void Vector4DWeightMADSSE( vec_t w, Vector4DAligned const& vInA, Vector4DAligned& vOutA, Vector4DAligned const& vInB, Vector4DAligned& vOutB ) +inline void Vector4DWeightMAD( vec_t w, Vector4DAligned const& vInA, Vector4DAligned& vOutA, Vector4DAligned const& vInB, Vector4DAligned& vOutB ) { Assert( vInA.IsValid() && vInB.IsValid() && IsFinite(w) ); #if !defined( _X360 ) - // Replicate scalar float out to 4 components - __m128 packed = _mm_set1_ps( w ); +#if 1 // Now using SSE2, so this is faster + Vector4DWeightMADSSE(w, vInA, vOutA, vInB, vOutB); +#else + vOutA.x += vInA.x * w; + vOutA.y += vInA.y * w; + vOutA.z += vInA.z * w; + vOutA.w += vInA.w * w; - // 4D SSE Vector MAD - vOutA.AsM128() = _mm_add_ps( vOutA.AsM128(), _mm_mul_ps( vInA.AsM128(), packed ) ); - vOutB.AsM128() = _mm_add_ps( vOutB.AsM128(), _mm_mul_ps( vInB.AsM128(), packed ) ); + vOutB.x += vInB.x * w; + vOutB.y += vInB.y * w; + vOutB.z += vInB.z * w; + vOutB.w += vInB.w * w; +#endif #else __vector4 temp; diff --git a/src/thirdparty/DirectXMath-dec2022/Inc/DirectXMath.h 
b/src/thirdparty/DirectXMath-dec2022/Inc/DirectXMath.h index 593aead5b..fd542388f 100644 --- a/src/thirdparty/DirectXMath-dec2022/Inc/DirectXMath.h +++ b/src/thirdparty/DirectXMath-dec2022/Inc/DirectXMath.h @@ -147,7 +147,11 @@ #endif #endif // !_XM_NO_INTRINSICS_ -#include "sal.h" +#ifdef _WIN32 +#include +#else +#include "../../dotnetrt/sal.h" +#endif #include #ifdef _MSC_VER diff --git a/src/thirdparty/dotnetrt/sal.h b/src/thirdparty/dotnetrt/sal.h index 2e0457140..a4a31fa44 100644 --- a/src/thirdparty/dotnetrt/sal.h +++ b/src/thirdparty/dotnetrt/sal.h @@ -1,2953 +1,327 @@ -// VALVE EDIT: -// taken from https://github.com/dotnet/runtime/blob/main/src/coreclr/pal/inc/rt/sal.h -// used for DirectXMath compatibly on POSIX - -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. - -/*** -*sal.h - markers for documenting the semantics of APIs -* - -* -*Purpose: -* sal.h provides a set of annotations to describe how a function uses its -* parameters - the assumptions it makes about them, and the guarantees it makes -* upon finishing. -****/ -#pragma once - -/*========================================================================== - - The comments in this file are intended to give basic understanding of - the usage of SAL, the Microsoft Source Code Annotation Language. - For more details, please see http://go.microsoft.com/fwlink/?LinkID=242134 - - The macros are defined in 3 layers, plus the structural set: - - _In_/_Out_/_Ret_ Layer: - ---------------------- - This layer provides the highest abstraction and its macros should be used - in most cases. These macros typically start with: - _In_ : input parameter to a function, unmodified by called function - _Out_ : output parameter, written to by called function, pointed-to - location not expected to be initialized prior to call - _Outptr_ : like _Out_ when returned variable is a pointer type - (so param is pointer-to-pointer type). Called function - provides/allocated space. - _Outref_ : like _Outptr_, except param is reference-to-pointer type. - _Inout_ : inout parameter, read from and potentially modified by - called function. - _Ret_ : for return values - _Field_ : class/struct field invariants - For common usage, this class of SAL provides the most concise annotations. - Note that _In_/_Out_/_Inout_/_Outptr_ annotations are designed to be used - with a parameter target. Using them with _At_ to specify non-parameter - targets may yield unexpected results. - - This layer also includes a number of other properties that can be specified - to extend the ability of code analysis, most notably: - -- Designating parameters as format strings for printf/scanf/scanf_s - -- Requesting stricter type checking for C enum parameters - - _Pre_/_Post_ Layer: - ------------------ - The macros of this layer only should be used when there is no suitable macro - in the _In_/_Out_ layer. Its macros start with _Pre_ or _Post_. - This layer provides the most flexibility for annotations. - - Implementation Abstraction Layer: - -------------------------------- - Macros from this layer should never be used directly. The layer only exists - to hide the implementation of the annotation macros. - - Structural Layer: - ---------------- - These annotations, like _At_ and _When_, are used with annotations from - any of the other layers as modifiers, indicating exactly when and where - the annotations apply. 
- - - Common syntactic conventions: - ---------------------------- - - Usage: - ----- - _In_, _Out_, _Inout_, _Pre_, _Post_, are for formal parameters. - _Ret_, _Deref_ret_ must be used for return values. - - Nullness: - -------- - If the parameter can be NULL as a precondition to the function, the - annotation contains _opt. If the macro does not contain '_opt' the - parameter cannot be NULL. - - If an out/inout parameter returns a null pointer as a postcondition, this is - indicated by _Ret_maybenull_ or _result_maybenull_. If the macro is not - of this form, then the result will not be NULL as a postcondition. - _Outptr_ - output value is not NULL - _Outptr_result_maybenull_ - output value might be NULL - - String Type: - ----------- - _z: NullTerminated string - for _In_ parameters the buffer must have the specified stringtype before the call - for _Out_ parameters the buffer must have the specified stringtype after the call - for _Inout_ parameters both conditions apply - - Extent Syntax: - ------------- - Buffer sizes are expressed as element counts, unless the macro explicitly - contains _byte_ or _bytes_. Some annotations specify two buffer sizes, in - which case the second is used to indicate how much of the buffer is valid - as a postcondition. This table outlines the precondition buffer allocation - size, precondition number of valid elements, postcondition allocation size, - and postcondition number of valid elements for representative buffer size - annotations: - Pre | Pre | Post | Post - alloc | valid | alloc | valid - Annotation elems | elems | elems | elems - ---------- ------------------------------------ - _In_reads_(s) s | s | s | s - _Inout_updates_(s) s | s | s | s - _Inout_updates_to_(s,c) s | s | s | c - _Out_writes_(s) s | 0 | s | s - _Out_writes_to_(s,c) s | 0 | s | c - _Outptr_result_buffer_(s) ? | ? | s | s - _Outptr_result_buffer_to_(s,c) ? | ? | s | c - - For the _Outptr_ annotations, the buffer in question is at one level of - dereference. The called function is responsible for supplying the buffer. - - Success and failure: - ------------------- - The SAL concept of success allows functions to define expressions that can - be tested by the caller, which if it evaluates to non-zero, indicates the - function succeeded, which means that its postconditions are guaranteed to - hold. Otherwise, if the expression evaluates to zero, the function is - considered to have failed, and the postconditions are not guaranteed. - - The success criteria can be specified with the _Success_(expr) annotation: - _Success_(return != FALSE) BOOL - PathCanonicalizeA(_Out_writes_(MAX_PATH) LPSTR pszBuf, LPCSTR pszPath) : - pszBuf is only guaranteed to be NULL-terminated when TRUE is returned, - and FALSE indicates failure. In common practice, callers check for zero - vs. non-zero returns, so it is preferable to express the success - criteria in terms of zero/non-zero, not checked for exactly TRUE. - - Functions can specify that some postconditions will still hold, even when - the function fails, using _On_failure_(anno-list), or postconditions that - hold regardless of success or failure using _Always_(anno-list). - - The annotation _Return_type_success_(expr) may be used with a typedef to - give a default _Success_ criteria to all functions returning that type. - This is the case for common Windows API status types, including - HRESULT and NTSTATUS. This may be overridden on a per-function basis by - specifying a _Success_ annotation locally. 
- -============================================================================*/ - -#define __ATTR_SAL - -#ifndef _SAL_VERSION /*IFSTRIP=IGN*/ -#define _SAL_VERSION 20 -#endif - -#ifdef _PREFAST_ // [ - -// choose attribute or __declspec implementation -#ifndef _USE_DECLSPECS_FOR_SAL // [ -#define _USE_DECLSPECS_FOR_SAL 1 -#endif // ] - -#if _USE_DECLSPECS_FOR_SAL // [ -#undef _USE_ATTRIBUTES_FOR_SAL -#define _USE_ATTRIBUTES_FOR_SAL 0 -#elif !defined(_USE_ATTRIBUTES_FOR_SAL) // ][ -#if _MSC_VER >= 1400 /*IFSTRIP=IGN*/ // [ -#define _USE_ATTRIBUTES_FOR_SAL 1 -#else // ][ -#define _USE_ATTRIBUTES_FOR_SAL 0 -#endif // ] -#endif // ] - - -#if !_USE_DECLSPECS_FOR_SAL // [ -#if !_USE_ATTRIBUTES_FOR_SAL // [ -#if _MSC_VER >= 1400 /*IFSTRIP=IGN*/ // [ -#undef _USE_ATTRIBUTES_FOR_SAL -#define _USE_ATTRIBUTES_FOR_SAL 1 -#else // ][ -#undef _USE_DECLSPECS_FOR_SAL -#define _USE_DECLSPECS_FOR_SAL 1 -#endif // ] -#endif // ] -#endif // ] - -#else - -// Disable expansion of SAL macros in non-Prefast mode to -// improve compiler throughput. -#ifndef _USE_DECLSPECS_FOR_SAL // [ -#define _USE_DECLSPECS_FOR_SAL 0 -#endif // ] -#ifndef _USE_ATTRIBUTES_FOR_SAL // [ -#define _USE_ATTRIBUTES_FOR_SAL 0 -#endif // ] - -#endif // ] - -// safeguard for MIDL and RC builds -#if _USE_DECLSPECS_FOR_SAL && ( defined( MIDL_PASS ) || defined(__midl) || defined(RC_INVOKED) || !defined(_PREFAST_) ) /*IFSTRIP=IGN*/ // [ -#undef _USE_DECLSPECS_FOR_SAL -#define _USE_DECLSPECS_FOR_SAL 0 -#endif // ] -#if _USE_ATTRIBUTES_FOR_SAL && ( !defined(_MSC_EXTENSIONS) || defined( MIDL_PASS ) || defined(__midl) || defined(RC_INVOKED) ) /*IFSTRIP=IGN*/ // [ -#undef _USE_ATTRIBUTES_FOR_SAL -#define _USE_ATTRIBUTES_FOR_SAL 0 -#endif // ] - -#if _USE_DECLSPECS_FOR_SAL || _USE_ATTRIBUTES_FOR_SAL - -// Special enum type for Y/N/M -enum __SAL_YesNo {_SAL_notpresent, _SAL_no, _SAL_maybe, _SAL_yes, _SAL_default}; - -#endif - -#if defined(BUILD_WINDOWS) && !_USE_ATTRIBUTES_FOR_SAL /*IFSTRIP=IGN*/ -#define _SAL1_Source_(Name, args, annotes) _SA_annotes3(SAL_name, #Name, "", "1") _GrouP_(annotes _SAL_nop_impl_) -#define _SAL1_1_Source_(Name, args, annotes) _SA_annotes3(SAL_name, #Name, "", "1.1") _GrouP_(annotes _SAL_nop_impl_) -#define _SAL1_2_Source_(Name, args, annotes) _SA_annotes3(SAL_name, #Name, "", "1.2") _GrouP_(annotes _SAL_nop_impl_) -#define _SAL2_Source_(Name, args, annotes) _SA_annotes3(SAL_name, #Name, "", "2") _GrouP_(annotes _SAL_nop_impl_) -#else -#define _SAL1_Source_(Name, args, annotes) _SA_annotes3(SAL_name, #Name, "", "1") _Group_(annotes _SAL_nop_impl_) -#define _SAL1_1_Source_(Name, args, annotes) _SA_annotes3(SAL_name, #Name, "", "1.1") _Group_(annotes _SAL_nop_impl_) -#define _SAL1_2_Source_(Name, args, annotes) _SA_annotes3(SAL_name, #Name, "", "1.2") _Group_(annotes _SAL_nop_impl_) -#define _SAL2_Source_(Name, args, annotes) _SA_annotes3(SAL_name, #Name, "", "2") _Group_(annotes _SAL_nop_impl_) -#endif - -//============================================================================ -// Structural SAL: -// These annotations modify the use of other annotations. They may -// express the annotation target (i.e. what parameter/field the annotation -// applies to) or the condition under which the annotation is applicable. -//============================================================================ - -// _At_(target, annos) specifies that the annotations listed in 'annos' is to -// be applied to 'target' rather than to the identifier which is the current -// lexical target. 
-#define _At_(target, annos) _At_impl_(target, annos _SAL_nop_impl_) - -// _At_buffer_(target, iter, bound, annos) is similar to _At_, except that -// target names a buffer, and each annotation in annos is applied to each -// element of target up to bound, with the variable named in iter usable -// by the annotations to refer to relevant offsets within target. -#define _At_buffer_(target, iter, bound, annos) _At_buffer_impl_(target, iter, bound, annos _SAL_nop_impl_) - -// _When_(expr, annos) specifies that the annotations listed in 'annos' only -// apply when 'expr' evaluates to non-zero. -#define _When_(expr, annos) _When_impl_(expr, annos _SAL_nop_impl_) -#define _Group_(annos) _Group_impl_(annos _SAL_nop_impl_) -#define _GrouP_(annos) _GrouP_impl_(annos _SAL_nop_impl_) - -// indicates whether normal post conditions apply to a function -#define _Success_(expr) _SAL2_Source_(_Success_, (expr), _Success_impl_(expr)) - -// indicates whether post conditions apply to a function returning -// the type that this annotation is applied to -#define _Return_type_success_(expr) _SAL2_Source_(_Return_type_success_, (expr), _Success_impl_(expr)) - -// Establish postconditions that apply only if the function does not succeed -#define _On_failure_(annos) _On_failure_impl_(annos _SAL_nop_impl_) - -// Establish postconditions that apply in both success and failure cases. -// Only applicable with functions that have _Success_ or _Return_type_succss_. -#define _Always_(annos) _Always_impl_(annos _SAL_nop_impl_) - -// Usable on a function definition. Asserts that a function declaration is -// in scope, and its annotations are to be used. There are no other annotations -// allowed on the function definition. -#define _Use_decl_annotations_ _Use_decl_anno_impl_ - -// _Notref_ may precede a _Deref_ or "real" annotation, and removes one -// level of dereference if the parameter is a C++ reference (&). If the -// net deref on a "real" annotation is negative, it is simply discarded. -#define _Notref_ _Notref_impl_ - -// Annotations for defensive programming styles. -#define _Pre_defensive_ _SA_annotes0(SAL_pre_defensive) -#define _Post_defensive_ _SA_annotes0(SAL_post_defensive) - -#define _In_defensive_(annotes) _Pre_defensive_ _Group_(annotes) -#define _Out_defensive_(annotes) _Post_defensive_ _Group_(annotes) -#define _Inout_defensive_(annotes) _Pre_defensive_ _Post_defensive_ _Group_(annotes) - -//============================================================================ -// _In_\_Out_ Layer: -//============================================================================ - -// Reserved pointer parameters, must always be NULL. -#define _Reserved_ _SAL2_Source_(_Reserved_, (), _Pre1_impl_(__null_impl)) - -// _Const_ allows specification that any namable memory location is considered -// readonly for a given call. -#define _Const_ _SAL2_Source_(_Const_, (), _Pre1_impl_(__readaccess_impl_notref)) - - -// Input parameters -------------------------- - -// _In_ - Annotations for parameters where data is passed into the function, but not modified. -// _In_ by itself can be used with non-pointer types (although it is redundant). - -// e.g. void SetPoint( _In_ const POINT* pPT ); -#define _In_ _SAL2_Source_(_In_, (), _Pre1_impl_(__notnull_impl_notref) _Pre_valid_impl_ _Deref_pre1_impl_(__readaccess_impl_notref)) -#define _In_opt_ _SAL2_Source_(_In_opt_, (), _Pre1_impl_(__maybenull_impl_notref) _Pre_valid_impl_ _Deref_pre_readonly_) - -// nullterminated 'in' parameters. -// e.g. 
void CopyStr( _In_z_ const char* szFrom, _Out_z_cap_(cchTo) char* szTo, size_t cchTo ); -#define _In_z_ _SAL2_Source_(_In_z_, (), _In_ _Pre1_impl_(__zterm_impl)) -#define _In_opt_z_ _SAL2_Source_(_In_opt_z_, (), _In_opt_ _Pre1_impl_(__zterm_impl)) - - -// 'input' buffers with given size - -#define _In_reads_(size) _SAL2_Source_(_In_reads_, (size), _Pre_count_(size) _Deref_pre_readonly_) -#define _In_reads_opt_(size) _SAL2_Source_(_In_reads_opt_, (size), _Pre_opt_count_(size) _Deref_pre_readonly_) -#define _In_reads_bytes_(size) _SAL2_Source_(_In_reads_bytes_, (size), _Pre_bytecount_(size) _Deref_pre_readonly_) -#define _In_reads_bytes_opt_(size) _SAL2_Source_(_In_reads_bytes_opt_, (size), _Pre_opt_bytecount_(size) _Deref_pre_readonly_) -#define _In_reads_z_(size) _SAL2_Source_(_In_reads_z_, (size), _In_reads_(size) _Pre_z_) -#define _In_reads_opt_z_(size) _SAL2_Source_(_In_reads_opt_z_, (size), _Pre_opt_count_(size) _Deref_pre_readonly_ _Pre_opt_z_) -#define _In_reads_or_z_(size) _SAL2_Source_(_In_reads_or_z_, (size), _In_ _When_(_String_length_(_Curr_) < (size), _Pre_z_) _When_(_String_length_(_Curr_) >= (size), _Pre1_impl_(__count_impl(size)))) -#define _In_reads_or_z_opt_(size) _SAL2_Source_(_In_reads_or_z_opt_, (size), _In_opt_ _When_(_String_length_(_Curr_) < (size), _Pre_z_) _When_(_String_length_(_Curr_) >= (size), _Pre1_impl_(__count_impl(size)))) - - -// 'input' buffers valid to the given end pointer - -#define _In_reads_to_ptr_(ptr) _SAL2_Source_(_In_reads_to_ptr_, (ptr), _Pre_ptrdiff_count_(ptr) _Deref_pre_readonly_) -#define _In_reads_to_ptr_opt_(ptr) _SAL2_Source_(_In_reads_to_ptr_opt_, (ptr), _Pre_opt_ptrdiff_count_(ptr) _Deref_pre_readonly_) -#define _In_reads_to_ptr_z_(ptr) _SAL2_Source_(_In_reads_to_ptr_z_, (ptr), _In_reads_to_ptr_(ptr) _Pre_z_) -#define _In_reads_to_ptr_opt_z_(ptr) _SAL2_Source_(_In_reads_to_ptr_opt_z_, (ptr), _Pre_opt_ptrdiff_count_(ptr) _Deref_pre_readonly_ _Pre_opt_z_) - - - -// Output parameters -------------------------- - -// _Out_ - Annotations for pointer or reference parameters where data passed back to the caller. -// These are mostly used where the pointer/reference is to a non-pointer type. -// _Outptr_/_Outref) (see below) are typically used to return pointers via parameters. - -// e.g. 
void GetPoint( _Out_ POINT* pPT ); -#define _Out_ _SAL2_Source_(_Out_, (), _Out_impl_) -#define _Out_opt_ _SAL2_Source_(_Out_opt_, (), _Out_opt_impl_) - -#define _Out_writes_(size) _SAL2_Source_(_Out_writes_, (size), _Pre_cap_(size) _Post_valid_impl_) -#define _Out_writes_opt_(size) _SAL2_Source_(_Out_writes_opt_, (size), _Pre_opt_cap_(size) _Post_valid_impl_) -#define _Out_writes_bytes_(size) _SAL2_Source_(_Out_writes_bytes_, (size), _Pre_bytecap_(size) _Post_valid_impl_) -#define _Out_writes_bytes_opt_(size) _SAL2_Source_(_Out_writes_bytes_opt_, (size), _Pre_opt_bytecap_(size) _Post_valid_impl_) -#define _Out_writes_z_(size) _SAL2_Source_(_Out_writes_z_, (size), _Pre_cap_(size) _Post_valid_impl_ _Post_z_) -#define _Out_writes_opt_z_(size) _SAL2_Source_(_Out_writes_opt_z_, (size), _Pre_opt_cap_(size) _Post_valid_impl_ _Post_z_) - -#define _Out_writes_to_(size,count) _SAL2_Source_(_Out_writes_to_, (size,count), _Pre_cap_(size) _Post_valid_impl_ _Post_count_(count)) -#define _Out_writes_to_opt_(size,count) _SAL2_Source_(_Out_writes_to_opt_, (size,count), _Pre_opt_cap_(size) _Post_valid_impl_ _Post_count_(count)) -#define _Out_writes_all_(size) _SAL2_Source_(_Out_writes_all_, (size), _Out_writes_to_(_Old_(size), _Old_(size))) -#define _Out_writes_all_opt_(size) _SAL2_Source_(_Out_writes_all_opt_, (size), _Out_writes_to_opt_(_Old_(size), _Old_(size))) - -#define _Out_writes_bytes_to_(size,count) _SAL2_Source_(_Out_writes_bytes_to_, (size,count), _Pre_bytecap_(size) _Post_valid_impl_ _Post_bytecount_(count)) -#define _Out_writes_bytes_to_opt_(size,count) _SAL2_Source_(_Out_writes_bytes_to_opt_, (size,count), _Pre_opt_bytecap_(size) _Post_valid_impl_ _Post_bytecount_(count)) -#define _Out_writes_bytes_all_(size) _SAL2_Source_(_Out_writes_bytes_all_, (size), _Out_writes_bytes_to_(_Old_(size), _Old_(size))) -#define _Out_writes_bytes_all_opt_(size) _SAL2_Source_(_Out_writes_bytes_all_opt_, (size), _Out_writes_bytes_to_opt_(_Old_(size), _Old_(size))) - -#define _Out_writes_to_ptr_(ptr) _SAL2_Source_(_Out_writes_to_ptr_, (ptr), _Pre_ptrdiff_cap_(ptr) _Post_valid_impl_) -#define _Out_writes_to_ptr_opt_(ptr) _SAL2_Source_(_Out_writes_to_ptr_opt_, (ptr), _Pre_opt_ptrdiff_cap_(ptr) _Post_valid_impl_) -#define _Out_writes_to_ptr_z_(ptr) _SAL2_Source_(_Out_writes_to_ptr_z_, (ptr), _Pre_ptrdiff_cap_(ptr) _Post_valid_impl_ Post_z_) -#define _Out_writes_to_ptr_opt_z_(ptr) _SAL2_Source_(_Out_writes_to_ptr_opt_z_, (ptr), _Pre_opt_ptrdiff_cap_(ptr) _Post_valid_impl_ Post_z_) - - -// Inout parameters ---------------------------- - -// _Inout_ - Annotations for pointer or reference parameters where data is passed in and -// potentially modified. 
-// void ModifyPoint( _Inout_ POINT* pPT ); -// void ModifyPointByRef( _Inout_ POINT& pPT ); - -#define _Inout_ _SAL2_Source_(_Inout_, (), _Prepost_valid_) -#define _Inout_opt_ _SAL2_Source_(_Inout_opt_, (), _Prepost_opt_valid_) - -// For modifying string buffers -// void toupper( _Inout_z_ char* sz ); -#define _Inout_z_ _SAL2_Source_(_Inout_z_, (), _Prepost_z_) -#define _Inout_opt_z_ _SAL2_Source_(_Inout_opt_z_, (), _Prepost_opt_z_) - -// For modifying buffers with explicit element size -#define _Inout_updates_(size) _SAL2_Source_(_Inout_updates_, (size), _Pre_cap_(size) _Pre_valid_impl_ _Post_valid_impl_) -#define _Inout_updates_opt_(size) _SAL2_Source_(_Inout_updates_opt_, (size), _Pre_opt_cap_(size) _Pre_valid_impl_ _Post_valid_impl_) -#define _Inout_updates_z_(size) _SAL2_Source_(_Inout_updates_z_, (size), _Pre_cap_(size) _Pre_valid_impl_ _Post_valid_impl_ _Pre1_impl_(__zterm_impl) _Post1_impl_(__zterm_impl)) -#define _Inout_updates_opt_z_(size) _SAL2_Source_(_Inout_updates_opt_z_, (size), _Pre_opt_cap_(size) _Pre_valid_impl_ _Post_valid_impl_ _Pre1_impl_(__zterm_impl) _Post1_impl_(__zterm_impl)) - -#define _Inout_updates_to_(size,count) _SAL2_Source_(_Inout_updates_to_, (size,count), _Out_writes_to_(size,count) _Pre_valid_impl_ _Pre1_impl_(__count_impl(count))) -#define _Inout_updates_to_opt_(size,count) _SAL2_Source_(_Inout_updates_to_opt_, (size,count), _Out_writes_to_opt_(size,count) _Pre_valid_impl_ _Pre1_impl_(__count_impl(count))) - -#define _Inout_updates_all_(size) _SAL2_Source_(_Inout_updates_all_, (size), _Inout_updates_to_(_Old_(size), _Old_(size))) -#define _Inout_updates_all_opt_(size) _SAL2_Source_(_Inout_updates_all_opt_, (size), _Inout_updates_to_opt_(_Old_(size), _Old_(size))) - -// For modifying buffers with explicit byte size -#define _Inout_updates_bytes_(size) _SAL2_Source_(_Inout_updates_bytes_, (size), _Pre_bytecap_(size) _Pre_valid_impl_ _Post_valid_impl_) -#define _Inout_updates_bytes_opt_(size) _SAL2_Source_(_Inout_updates_bytes_opt_, (size), _Pre_opt_bytecap_(size) _Pre_valid_impl_ _Post_valid_impl_) - -#define _Inout_updates_bytes_to_(size,count) _SAL2_Source_(_Inout_updates_bytes_to_, (size,count), _Out_writes_bytes_to_(size,count) _Pre_valid_impl_ _Pre1_impl_(__bytecount_impl(count))) -#define _Inout_updates_bytes_to_opt_(size,count) _SAL2_Source_(_Inout_updates_bytes_to_opt_, (size,count), _Out_writes_bytes_to_opt_(size,count) _Pre_valid_impl_ _Pre1_impl_(__bytecount_impl(count))) - -#define _Inout_updates_bytes_all_(size) _SAL2_Source_(_Inout_updates_bytes_all_, (size), _Inout_updates_bytes_to_(_Old_(size), _Old_(size))) -#define _Inout_updates_bytes_all_opt_(size) _SAL2_Source_(_Inout_updates_bytes_all_opt_, (size), _Inout_updates_bytes_to_opt_(_Old_(size), _Old_(size))) - - -// Pointer to pointer parameters ------------------------- - -// _Outptr_ - Annotations for output params returning pointers -// These describe parameters where the called function provides the buffer: -// HRESULT SHStrDupW(_In_ LPCWSTR psz, _Outptr_ LPWSTR *ppwsz); -// The caller passes the address of an LPWSTR variable as ppwsz, and SHStrDupW allocates -// and initializes memory and returns the pointer to the new LPWSTR in *ppwsz. -// -// _Outptr_opt_ - describes parameters that are allowed to be NULL. -// _Outptr_*_result_maybenull_ - describes parameters where the called function might return NULL to the caller. 
-// -// Example: -// void MyFunc(_Outptr_opt_ int **ppData1, _Outptr_result_maybenull_ int **ppData2); -// Callers: -// MyFunc(NULL, NULL); // error: parameter 2, ppData2, should not be NULL -// MyFunc(&pData1, &pData2); // ok: both non-NULL -// if (*pData1 == *pData2) ... // error: pData2 might be NULL after call - -#define _Outptr_ _SAL2_Source_(_Outptr_, (), _Out_impl_ _Deref_post2_impl_(__notnull_impl_notref, __count_impl(1))) -#define _Outptr_result_maybenull_ _SAL2_Source_(_Outptr_result_maybenull_, (), _Out_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __count_impl(1))) -#define _Outptr_opt_ _SAL2_Source_(_Outptr_opt_, (), _Out_opt_impl_ _Deref_post2_impl_(__notnull_impl_notref, __count_impl(1))) -#define _Outptr_opt_result_maybenull_ _SAL2_Source_(_Outptr_opt_result_maybenull_, (), _Out_opt_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __count_impl(1))) - -// Annotations for _Outptr_ parameters returning pointers to null terminated strings. - -#define _Outptr_result_z_ _SAL2_Source_(_Outptr_result_z_, (), _Out_impl_ _Deref_post_z_) -#define _Outptr_opt_result_z_ _SAL2_Source_(_Outptr_opt_result_z_, (), _Out_opt_impl_ _Deref_post_z_) -#define _Outptr_result_maybenull_z_ _SAL2_Source_(_Outptr_result_maybenull_z_, (), _Out_impl_ _Deref_post_opt_z_) -#define _Outptr_opt_result_maybenull_z_ _SAL2_Source_(_Outptr_opt_result_maybenull_z_, (), _Out_opt_impl_ _Deref_post_opt_z_) - -// Annotations for _Outptr_ parameters where the output pointer is set to NULL if the function fails. - -#define _Outptr_result_nullonfailure_ _SAL2_Source_(_Outptr_result_nullonfailure_, (), _Outptr_ _On_failure_(_Deref_post_null_)) -#define _Outptr_opt_result_nullonfailure_ _SAL2_Source_(_Outptr_opt_result_nullonfailure_, (), _Outptr_opt_ _On_failure_(_Deref_post_null_)) - -// Annotations for _Outptr_ parameters which return a pointer to a ref-counted COM object, -// following the COM convention of setting the output to NULL on failure. -// The current implementation is identical to _Outptr_result_nullonfailure_. -// For pointers to types that are not COM objects, _Outptr_result_nullonfailure_ is preferred. 
- -#define _COM_Outptr_ _SAL2_Source_(_COM_Outptr_, (), _Outptr_ _On_failure_(_Deref_post_null_)) -#define _COM_Outptr_result_maybenull_ _SAL2_Source_(_COM_Outptr_result_maybenull_, (), _Outptr_result_maybenull_ _On_failure_(_Deref_post_null_)) -#define _COM_Outptr_opt_ _SAL2_Source_(_COM_Outptr_opt_, (), _Outptr_opt_ _On_failure_(_Deref_post_null_)) -#define _COM_Outptr_opt_result_maybenull_ _SAL2_Source_(_COM_Outptr_opt_result_maybenull_, (), _Outptr_opt_result_maybenull_ _On_failure_(_Deref_post_null_)) - -// Annotations for _Outptr_ parameters returning a pointer to buffer with a specified number of elements/bytes - -#define _Outptr_result_buffer_(size) _SAL2_Source_(_Outptr_result_buffer_, (size), _Out_impl_ _Deref_post2_impl_(__notnull_impl_notref, __cap_impl(size))) -#define _Outptr_opt_result_buffer_(size) _SAL2_Source_(_Outptr_opt_result_buffer_, (size), _Out_opt_impl_ _Deref_post2_impl_(__notnull_impl_notref, __cap_impl(size))) -#define _Outptr_result_buffer_to_(size, count) _SAL2_Source_(_Outptr_result_buffer_to_, (size, count), _Out_impl_ _Deref_post3_impl_(__notnull_impl_notref, __cap_impl(size), __count_impl(count))) -#define _Outptr_opt_result_buffer_to_(size, count) _SAL2_Source_(_Outptr_opt_result_buffer_to_, (size, count), _Out_opt_impl_ _Deref_post3_impl_(__notnull_impl_notref, __cap_impl(size), __count_impl(count))) - -#define _Outptr_result_buffer_all_(size) _SAL2_Source_(_Outptr_result_buffer_all_, (size), _Out_impl_ _Deref_post2_impl_(__notnull_impl_notref, __count_impl(size))) -#define _Outptr_opt_result_buffer_all_(size) _SAL2_Source_(_Outptr_opt_result_buffer_all_, (size), _Out_opt_impl_ _Deref_post2_impl_(__notnull_impl_notref, __count_impl(size))) - -#define _Outptr_result_buffer_maybenull_(size) _SAL2_Source_(_Outptr_result_buffer_maybenull_, (size), _Out_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __cap_impl(size))) -#define _Outptr_opt_result_buffer_maybenull_(size) _SAL2_Source_(_Outptr_opt_result_buffer_maybenull_, (size), _Out_opt_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __cap_impl(size))) -#define _Outptr_result_buffer_to_maybenull_(size, count) _SAL2_Source_(_Outptr_result_buffer_to_maybenull_, (size, count), _Out_impl_ _Deref_post3_impl_(__maybenull_impl_notref, __cap_impl(size), __count_impl(count))) -#define _Outptr_opt_result_buffer_to_maybenull_(size, count) _SAL2_Source_(_Outptr_opt_result_buffer_to_maybenull_, (size, count), _Out_opt_impl_ _Deref_post3_impl_(__maybenull_impl_notref, __cap_impl(size), __count_impl(count))) - -#define _Outptr_result_buffer_all_maybenull_(size) _SAL2_Source_(_Outptr_result_buffer_all_maybenull_, (size), _Out_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __count_impl(size))) -#define _Outptr_opt_result_buffer_all_maybenull_(size) _SAL2_Source_(_Outptr_opt_result_buffer_all_maybenull_, (size), _Out_opt_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __count_impl(size))) - -#define _Outptr_result_bytebuffer_(size) _SAL2_Source_(_Outptr_result_bytebuffer_, (size), _Out_impl_ _Deref_post2_impl_(__notnull_impl_notref, __bytecap_impl(size))) -#define _Outptr_opt_result_bytebuffer_(size) _SAL2_Source_(_Outptr_opt_result_bytebuffer_, (size), _Out_opt_impl_ _Deref_post2_impl_(__notnull_impl_notref, __bytecap_impl(size))) -#define _Outptr_result_bytebuffer_to_(size, count) _SAL2_Source_(_Outptr_result_bytebuffer_to_, (size, count), _Out_impl_ _Deref_post3_impl_(__notnull_impl_notref, __bytecap_impl(size), __bytecount_impl(count))) -#define _Outptr_opt_result_bytebuffer_to_(size, count) 
_SAL2_Source_(_Outptr_opt_result_bytebuffer_to_, (size, count), _Out_opt_impl_ _Deref_post3_impl_(__notnull_impl_notref, __bytecap_impl(size), __bytecount_impl(count))) - -#define _Outptr_result_bytebuffer_all_(size) _SAL2_Source_(_Outptr_result_bytebuffer_all_, (size), _Out_impl_ _Deref_post2_impl_(__notnull_impl_notref, __bytecount_impl(size))) -#define _Outptr_opt_result_bytebuffer_all_(size) _SAL2_Source_(_Outptr_opt_result_bytebuffer_all_, (size), _Out_opt_impl_ _Deref_post2_impl_(__notnull_impl_notref, __bytecount_impl(size))) - -#define _Outptr_result_bytebuffer_maybenull_(size) _SAL2_Source_(_Outptr_result_bytebuffer_maybenull_, (size), _Out_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __bytecap_impl(size))) -#define _Outptr_opt_result_bytebuffer_maybenull_(size) _SAL2_Source_(_Outptr_opt_result_bytebuffer_maybenull_, (size), _Out_opt_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __bytecap_impl(size))) -#define _Outptr_result_bytebuffer_to_maybenull_(size, count) _SAL2_Source_(_Outptr_result_bytebuffer_to_maybenull_, (size, count), _Out_impl_ _Deref_post3_impl_(__maybenull_impl_notref, __bytecap_impl(size), __bytecount_impl(count))) -#define _Outptr_opt_result_bytebuffer_to_maybenull_(size, count) _SAL2_Source_(_Outptr_opt_result_bytebuffer_to_maybenull_, (size, count), _Out_opt_impl_ _Deref_post3_impl_(__maybenull_impl_notref, __bytecap_impl(size), __bytecount_impl(count))) - -#define _Outptr_result_bytebuffer_all_maybenull_(size) _SAL2_Source_(_Outptr_result_bytebuffer_all_maybenull_, (size), _Out_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __bytecount_impl(size))) -#define _Outptr_opt_result_bytebuffer_all_maybenull_(size) _SAL2_Source_(_Outptr_opt_result_bytebuffer_all_maybenull_, (size), _Out_opt_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __bytecount_impl(size))) - -// Annotations for output reference to pointer parameters. 
- -#define _Outref_ _SAL2_Source_(_Outref_, (), _Out_impl_ _Post_notnull_) -#define _Outref_result_maybenull_ _SAL2_Source_(_Outref_result_maybenull_, (), _Pre2_impl_(__notnull_impl_notref, __cap_c_one_notref_impl) _Post_maybenull_ _Post_valid_impl_) - -#define _Outref_result_buffer_(size) _SAL2_Source_(_Outref_result_buffer_, (size), _Outref_ _Post1_impl_(__cap_impl(size))) -#define _Outref_result_bytebuffer_(size) _SAL2_Source_(_Outref_result_bytebuffer_, (size), _Outref_ _Post1_impl_(__bytecap_impl(size))) -#define _Outref_result_buffer_to_(size, count) _SAL2_Source_(_Outref_result_buffer_to_, (size, count), _Outref_result_buffer_(size) _Post1_impl_(__count_impl(count))) -#define _Outref_result_bytebuffer_to_(size, count) _SAL2_Source_(_Outref_result_bytebuffer_to_, (size, count), _Outref_result_bytebuffer_(size) _Post1_impl_(__bytecount_impl(count))) -#define _Outref_result_buffer_all_(size) _SAL2_Source_(_Outref_result_buffer_all_, (size), _Outref_result_buffer_to_(size, _Old_(size))) -#define _Outref_result_bytebuffer_all_(size) _SAL2_Source_(_Outref_result_bytebuffer_all_, (size), _Outref_result_bytebuffer_to_(size, _Old_(size))) - -#define _Outref_result_buffer_maybenull_(size) _SAL2_Source_(_Outref_result_buffer_maybenull_, (size), _Outref_result_maybenull_ _Post1_impl_(__cap_impl(size))) -#define _Outref_result_bytebuffer_maybenull_(size) _SAL2_Source_(_Outref_result_bytebuffer_maybenull_, (size), _Outref_result_maybenull_ _Post1_impl_(__bytecap_impl(size))) -#define _Outref_result_buffer_to_maybenull_(size, count) _SAL2_Source_(_Outref_result_buffer_to_maybenull_, (size, count), _Outref_result_buffer_maybenull_(size) _Post1_impl_(__count_impl(count))) -#define _Outref_result_bytebuffer_to_maybenull_(size, count) _SAL2_Source_(_Outref_result_bytebuffer_to_maybenull_, (size, count), _Outref_result_bytebuffer_maybenull_(size) _Post1_impl_(__bytecount_impl(count))) -#define _Outref_result_buffer_all_maybenull_(size) _SAL2_Source_(_Outref_result_buffer_all_maybenull_, (size), _Outref_result_buffer_to_maybenull_(size, _Old_(size))) -#define _Outref_result_bytebuffer_all_maybenull_(size) _SAL2_Source_(_Outref_result_bytebuffer_all_maybenull_, (size), _Outref_result_bytebuffer_to_maybenull_(size, _Old_(size))) - -// Annotations for output reference to pointer parameters that guarantee -// that the pointer is set to NULL on failure. -#define _Outref_result_nullonfailure_ _SAL2_Source_(_Outref_result_nullonfailure_, (), _Outref_ _On_failure_(_Post_null_)) - -// Generic annotations to set output value of a by-pointer or by-reference parameter to null/zero on failure. -#define _Result_nullonfailure_ _SAL2_Source_(_Result_nullonfailure_, (), _On_failure_(_Notref_impl_ _Deref_impl_ _Post_null_)) -#define _Result_zeroonfailure_ _SAL2_Source_(_Result_zeroonfailure_, (), _On_failure_(_Notref_impl_ _Deref_impl_ _Out_range_(==, 0))) - - -// return values ------------------------------- - -// -// _Ret_ annotations -// -// describing conditions that hold for return values after the call - -// e.g. 
_Ret_z_ CString::operator const WCHAR*() const throw(); -#define _Ret_z_ _SAL2_Source_(_Ret_z_, (), _Ret2_impl_(__notnull_impl, __zterm_impl) _Ret_valid_impl_) -#define _Ret_maybenull_z_ _SAL2_Source_(_Ret_maybenull_z_, (), _Ret2_impl_(__maybenull_impl,__zterm_impl) _Ret_valid_impl_) - -// used with allocated but not yet initialized objects -#define _Ret_notnull_ _SAL2_Source_(_Ret_notnull_, (), _Ret1_impl_(__notnull_impl)) -#define _Ret_maybenull_ _SAL2_Source_(_Ret_maybenull_, (), _Ret1_impl_(__maybenull_impl)) -#define _Ret_null_ _SAL2_Source_(_Ret_null_, (), _Ret1_impl_(__null_impl)) - -// used with allocated and initialized objects -// returns single valid object -#define _Ret_valid_ _SAL2_Source_(_Ret_valid_, (), _Ret1_impl_(__notnull_impl_notref) _Ret_valid_impl_) - -// returns pointer to initialized buffer of specified size -#define _Ret_writes_(size) _SAL2_Source_(_Ret_writes_, (size), _Ret2_impl_(__notnull_impl, __count_impl(size)) _Ret_valid_impl_) -#define _Ret_writes_z_(size) _SAL2_Source_(_Ret_writes_z_, (size), _Ret3_impl_(__notnull_impl, __count_impl(size), __zterm_impl) _Ret_valid_impl_) -#define _Ret_writes_bytes_(size) _SAL2_Source_(_Ret_writes_bytes_, (size), _Ret2_impl_(__notnull_impl, __bytecount_impl(size)) _Ret_valid_impl_) -#define _Ret_writes_maybenull_(size) _SAL2_Source_(_Ret_writes_maybenull_, (size), _Ret2_impl_(__maybenull_impl,__count_impl(size)) _Ret_valid_impl_) -#define _Ret_writes_maybenull_z_(size) _SAL2_Source_(_Ret_writes_maybenull_z_, (size), _Ret3_impl_(__maybenull_impl,__count_impl(size),__zterm_impl) _Ret_valid_impl_) -#define _Ret_writes_bytes_maybenull_(size) _SAL2_Source_(_Ret_writes_bytes_maybenull_, (size), _Ret2_impl_(__maybenull_impl,__bytecount_impl(size)) _Ret_valid_impl_) - -// returns pointer to partially initialized buffer, with total size 'size' and initialized size 'count' -#define _Ret_writes_to_(size,count) _SAL2_Source_(_Ret_writes_to_, (size,count), _Ret3_impl_(__notnull_impl, __cap_impl(size), __count_impl(count)) _Ret_valid_impl_) -#define _Ret_writes_bytes_to_(size,count) _SAL2_Source_(_Ret_writes_bytes_to_, (size,count), _Ret3_impl_(__notnull_impl, __bytecap_impl(size), __bytecount_impl(count)) _Ret_valid_impl_) -#define _Ret_writes_to_maybenull_(size,count) _SAL2_Source_(_Ret_writes_to_maybenull_, (size,count), _Ret3_impl_(__maybenull_impl, __cap_impl(size), __count_impl(count)) _Ret_valid_impl_) -#define _Ret_writes_bytes_to_maybenull_(size,count) _SAL2_Source_(_Ret_writes_bytes_to_maybenull_, (size,count), _Ret3_impl_(__maybenull_impl, __bytecap_impl(size), __bytecount_impl(count)) _Ret_valid_impl_) - - -// Annotations for strict type checking -#define _Points_to_data_ _SAL2_Source_(_Points_to_data_, (), _Pre_ _Points_to_data_impl_) -#define _Literal_ _SAL2_Source_(_Literal_, (), _Pre_ _Literal_impl_) -#define _Notliteral_ _SAL2_Source_(_Notliteral_, (), _Pre_ _Notliteral_impl_) - -// Check the return value of a function e.g. _Check_return_ ErrorCode Foo(); -#define _Check_return_ _SAL2_Source_(_Check_return_, (), _Check_return_impl_) -#define _Must_inspect_result_ _SAL2_Source_(_Must_inspect_result_, (), _Must_inspect_impl_ _Check_return_impl_) - -// e.g. MyPrintF( _Printf_format_string_ const WCHAR* wzFormat, ... 
); -#define _Printf_format_string_ _SAL2_Source_(_Printf_format_string_, (), _Printf_format_string_impl_) -#define _Scanf_format_string_ _SAL2_Source_(_Scanf_format_string_, (), _Scanf_format_string_impl_) -#define _Scanf_s_format_string_ _SAL2_Source_(_Scanf_s_format_string_, (), _Scanf_s_format_string_impl_) - -#define _Format_string_impl_(kind,where) _SA_annotes2(SAL_IsFormatString2, kind, where) -#define _Printf_format_string_params_(x) _SAL2_Source_(_Printf_format_string_params_, (x), _Format_string_impl_("printf", x)) -#define _Scanf_format_string_params_(x) _SAL2_Source_(_Scanf_format_string_params_, (x), _Format_string_impl_("scanf", x)) -#define _Scanf_s_format_string_params_(x) _SAL2_Source_(_Scanf_s_format_string_params_, (x), _Format_string_impl_("scanf_s", x)) - -// annotations to express value of integral or pointer parameter -#define _In_range_(lb,ub) _SAL2_Source_(_In_range_, (lb,ub), _In_range_impl_(lb,ub)) -#define _Out_range_(lb,ub) _SAL2_Source_(_Out_range_, (lb,ub), _Out_range_impl_(lb,ub)) -#define _Ret_range_(lb,ub) _SAL2_Source_(_Ret_range_, (lb,ub), _Ret_range_impl_(lb,ub)) -#define _Deref_in_range_(lb,ub) _SAL2_Source_(_Deref_in_range_, (lb,ub), _Deref_in_range_impl_(lb,ub)) -#define _Deref_out_range_(lb,ub) _SAL2_Source_(_Deref_out_range_, (lb,ub), _Deref_out_range_impl_(lb,ub)) -#define _Deref_ret_range_(lb,ub) _SAL2_Source_(_Deref_ret_range_, (lb,ub), _Deref_ret_range_impl_(lb,ub)) -#define _Pre_equal_to_(expr) _SAL2_Source_(_Pre_equal_to_, (expr), _In_range_(==, expr)) -#define _Post_equal_to_(expr) _SAL2_Source_(_Post_equal_to_, (expr), _Out_range_(==, expr)) - -// annotation to express that a value (usually a field of a mutable class) -// is not changed by a function call -#define _Unchanged_(e) _SAL2_Source_(_Unchanged_, (e), _At_(e, _Post_equal_to_(_Old_(e)) _Const_)) - -// Annotations to allow expressing generalized pre and post conditions. -// 'cond' may be any valid SAL expression that is considered to be true as a precondition -// or postcondition (respsectively). 
-#define _Pre_satisfies_(cond) _SAL2_Source_(_Pre_satisfies_, (cond), _Pre_satisfies_impl_(cond)) -#define _Post_satisfies_(cond) _SAL2_Source_(_Post_satisfies_, (cond), _Post_satisfies_impl_(cond)) - -// Annotations to express struct, class and field invariants -#define _Struct_size_bytes_(size) _SAL2_Source_(_Struct_size_bytes_, (size), _Writable_bytes_(size)) - -#define _Field_size_(size) _SAL2_Source_(_Field_size_, (size), _Notnull_ _Writable_elements_(size)) -#define _Field_size_opt_(size) _SAL2_Source_(_Field_size_opt_, (size), _Maybenull_ _Writable_elements_(size)) -#define _Field_size_part_(size, count) _SAL2_Source_(_Field_size_part_, (size, count), _Notnull_ _Writable_elements_(size) _Readable_elements_(count)) -#define _Field_size_part_opt_(size, count) _SAL2_Source_(_Field_size_part_opt_, (size, count), _Maybenull_ _Writable_elements_(size) _Readable_elements_(count)) -#define _Field_size_full_(size) _SAL2_Source_(_Field_size_full_, (size), _Field_size_part_(size, size)) -#define _Field_size_full_opt_(size) _SAL2_Source_(_Field_size_full_opt_, (size), _Field_size_part_opt_(size, size)) - -#define _Field_size_bytes_(size) _SAL2_Source_(_Field_size_bytes_, (size), _Notnull_ _Writable_bytes_(size)) -#define _Field_size_bytes_opt_(size) _SAL2_Source_(_Field_size_bytes_opt_, (size), _Maybenull_ _Writable_bytes_(size)) -#define _Field_size_bytes_part_(size, count) _SAL2_Source_(_Field_size_bytes_part_, (size, count), _Notnull_ _Writable_bytes_(size) _Readable_bytes_(count)) -#define _Field_size_bytes_part_opt_(size, count) _SAL2_Source_(_Field_size_bytes_part_opt_, (size, count), _Maybenull_ _Writable_bytes_(size) _Readable_bytes_(count)) -#define _Field_size_bytes_full_(size) _SAL2_Source_(_Field_size_bytes_full_, (size), _Field_size_bytes_part_(size, size)) -#define _Field_size_bytes_full_opt_(size) _SAL2_Source_(_Field_size_bytes_full_opt_, (size), _Field_size_bytes_part_opt_(size, size)) - -#define _Field_z_ _SAL2_Source_(_Field_z_, (), _Null_terminated_) - -#define _Field_range_(min,max) _SAL2_Source_(_Field_range_, (min,max), _Field_range_impl_(min,max)) - -//============================================================================ -// _Pre_\_Post_ Layer: -//============================================================================ - -// -// Raw Pre/Post for declaring custom pre/post conditions -// - -#define _Pre_ _Pre_impl_ -#define _Post_ _Post_impl_ - -// -// Validity property -// - -#define _Valid_ _Valid_impl_ -#define _Notvalid_ _Notvalid_impl_ -#define _Maybevalid_ _Maybevalid_impl_ - -// -// Buffer size properties -// - -// Expressing buffer sizes without specifying pre or post condition -#define _Readable_bytes_(size) _SAL2_Source_(_Readable_bytes_, (size), _Readable_bytes_impl_(size)) -#define _Readable_elements_(size) _SAL2_Source_(_Readable_elements_, (size), _Readable_elements_impl_(size)) -#define _Writable_bytes_(size) _SAL2_Source_(_Writable_bytes_, (size), _Writable_bytes_impl_(size)) -#define _Writable_elements_(size) _SAL2_Source_(_Writable_elements_, (size), _Writable_elements_impl_(size)) - -#define _Null_terminated_ _SAL2_Source_(_Null_terminated_, (), _Null_terminated_impl_) -#define _NullNull_terminated_ _SAL2_Source_(_NullNull_terminated_, (), _NullNull_terminated_impl_) - -// Expressing buffer size as pre or post condition -#define _Pre_readable_size_(size) _SAL2_Source_(_Pre_readable_size_, (size), _Pre1_impl_(__count_impl(size)) _Pre_valid_impl_) -#define _Pre_writable_size_(size) _SAL2_Source_(_Pre_writable_size_, (size), 
_Pre1_impl_(__cap_impl(size))) -#define _Pre_readable_byte_size_(size) _SAL2_Source_(_Pre_readable_byte_size_, (size), _Pre1_impl_(__bytecount_impl(size)) _Pre_valid_impl_) -#define _Pre_writable_byte_size_(size) _SAL2_Source_(_Pre_writable_byte_size_, (size), _Pre1_impl_(__bytecap_impl(size))) - -#define _Post_readable_size_(size) _SAL2_Source_(_Post_readable_size_, (size), _Post1_impl_(__count_impl(size)) _Post_valid_impl_) -#define _Post_writable_size_(size) _SAL2_Source_(_Post_writable_size_, (size), _Post1_impl_(__cap_impl(size))) -#define _Post_readable_byte_size_(size) _SAL2_Source_(_Post_readable_byte_size_, (size), _Post1_impl_(__bytecount_impl(size)) _Post_valid_impl_) -#define _Post_writable_byte_size_(size) _SAL2_Source_(_Post_writable_byte_size_, (size), _Post1_impl_(__bytecap_impl(size))) - -// -// Pointer null-ness properties -// -#define _Null_ _Null_impl_ -#define _Notnull_ _Notnull_impl_ -#define _Maybenull_ _Maybenull_impl_ - -// -// _Pre_ annotations --- -// -// describing conditions that must be met before the call of the function - -// e.g. int strlen( _Pre_z_ const char* sz ); -// buffer is a zero terminated string -#define _Pre_z_ _SAL2_Source_(_Pre_z_, (), _Pre1_impl_(__zterm_impl) _Pre_valid_impl_) - -// valid size unknown or indicated by type (e.g.:LPSTR) -#define _Pre_valid_ _SAL2_Source_(_Pre_valid_, (), _Pre1_impl_(__notnull_impl_notref) _Pre_valid_impl_) -#define _Pre_opt_valid_ _SAL2_Source_(_Pre_opt_valid_, (), _Pre1_impl_(__maybenull_impl_notref) _Pre_valid_impl_) - -#define _Pre_invalid_ _SAL2_Source_(_Pre_invalid_, (), _Deref_pre1_impl_(__notvalid_impl)) - -// Overrides recursive valid when some field is not yet initialized when using _Inout_ -#define _Pre_unknown_ _SAL2_Source_(_Pre_unknown_, (), _Pre1_impl_(__maybevalid_impl)) - -// used with allocated but not yet initialized objects -#define _Pre_notnull_ _SAL2_Source_(_Pre_notnull_, (), _Pre1_impl_(__notnull_impl_notref)) -#define _Pre_maybenull_ _SAL2_Source_(_Pre_maybenull_, (), _Pre1_impl_(__maybenull_impl_notref)) -#define _Pre_null_ _SAL2_Source_(_Pre_null_, (), _Pre1_impl_(__null_impl_notref)) - -// -// _Post_ annotations --- -// -// describing conditions that hold after the function call - -// void CopyStr( _In_z_ const char* szFrom, _Pre_cap_(cch) _Post_z_ char* szFrom, size_t cchFrom ); -// buffer will be a zero-terminated string after the call -#define _Post_z_ _SAL2_Source_(_Post_z_, (), _Post1_impl_(__zterm_impl) _Post_valid_impl_) - -// e.g. HRESULT InitStruct( _Post_valid_ Struct* pobj ); -#define _Post_valid_ _SAL2_Source_(_Post_valid_, (), _Post_valid_impl_) -#define _Post_invalid_ _SAL2_Source_(_Post_invalid_, (), _Deref_post1_impl_(__notvalid_impl)) - -// e.g. void free( _Post_ptr_invalid_ void* pv ); -#define _Post_ptr_invalid_ _SAL2_Source_(_Post_ptr_invalid_, (), _Post1_impl_(__notvalid_impl)) - -// e.g. void ThrowExceptionIfNull( _Post_notnull_ const void* pv ); -#define _Post_notnull_ _SAL2_Source_(_Post_notnull_, (), _Post1_impl_(__notnull_impl)) - -// e.g. 
HRESULT GetObject(_Outptr_ _On_failure_(_At_(*p, _Post_null_)) T **p); -#define _Post_null_ _SAL2_Source_(_Post_null_, (), _Post1_impl_(__null_impl)) - -#define _Post_maybenull_ _SAL2_Source_(_Post_maybenull_, (), _Post1_impl_(__maybenull_impl)) - -#define _Prepost_z_ _SAL2_Source_(_Prepost_z_, (), _Pre_z_ _Post_z_) - - -// #pragma region Input Buffer SAL 1 compatibility macros - -/*========================================================================== - - This section contains definitions for macros defined for VS2010 and earlier. - Usage of these macros is still supported, but the SAL 2 macros defined above - are recommended instead. This comment block is retained to assist in - understanding SAL that still uses the older syntax. - - The macros are defined in 3 layers: - - _In_\_Out_ Layer: - ---------------- - This layer provides the highest abstraction and its macros should be used - in most cases. Its macros start with _In_, _Out_ or _Inout_. For the - typical case they provide the most concise annotations. - - _Pre_\_Post_ Layer: - ------------------ - The macros of this layer only should be used when there is no suitable macro - in the _In_\_Out_ layer. Its macros start with _Pre_, _Post_, _Ret_, - _Deref_pre_ _Deref_post_ and _Deref_ret_. This layer provides the most - flexibility for annotations. - - Implementation Abstraction Layer: - -------------------------------- - Macros from this layer should never be used directly. The layer only exists - to hide the implementation of the annotation macros. - - - Annotation Syntax: - |--------------|----------|----------------|-----------------------------| - | Usage | Nullness | ZeroTerminated | Extent | - |--------------|----------|----------------|-----------------------------| - | _In_ | <> | <> | <> | - | _Out_ | opt_ | z_ | [byte]cap_[c_|x_]( size ) | - | _Inout_ | | | [byte]count_[c_|x_]( size ) | - | _Deref_out_ | | | ptrdiff_cap_( ptr ) | - |--------------| | | ptrdiff_count_( ptr ) | - | _Ret_ | | | | - | _Deref_ret_ | | | | - |--------------| | | | - | _Pre_ | | | | - | _Post_ | | | | - | _Deref_pre_ | | | | - | _Deref_post_ | | | | - |--------------|----------|----------------|-----------------------------| - - Usage: - ----- - _In_, _Out_, _Inout_, _Pre_, _Post_, _Deref_pre_, _Deref_post_ are for - formal parameters. - _Ret_, _Deref_ret_ must be used for return values. - - Nullness: - -------- - If the pointer can be NULL the annotation contains _opt. If the macro - does not contain '_opt' the pointer may not be NULL. - - String Type: - ----------- - _z: NullTerminated string - for _In_ parameters the buffer must have the specified stringtype before the call - for _Out_ parameters the buffer must have the specified stringtype after the call - for _Inout_ parameters both conditions apply - - Extent Syntax: - |------|---------------|---------------| - | Unit | Writ\Readable | Argument Type | - |------|---------------|---------------| - | <> | cap_ | <> | - | byte | count_ | c_ | - | | | x_ | - |------|---------------|---------------| - - 'cap' (capacity) describes the writable size of the buffer and is typically used - with _Out_. The default unit is elements. Use 'bytecap' if the size is given in bytes - 'count' describes the readable size of the buffer and is typically used with _In_. - The default unit is elements. Use 'bytecount' if the size is given in bytes. - - Argument syntax for cap_, bytecap_, count_, bytecount_: - (|return)[+n] e.g. 
cch, return, cb+2 - - If the buffer size is a constant expression use the c_ postfix. - E.g. cap_c_(20), count_c_(MAX_PATH), bytecount_c_(16) - - If the buffer size is given by a limiting pointer use the ptrdiff_ versions - of the macros. - - If the buffer size is neither a parameter nor a constant expression use the x_ - postfix. e.g. bytecount_x_(num*size) x_ annotations accept any arbitrary string. - No analysis can be done for x_ annotations but they at least tell the tool that - the buffer has some sort of extent description. x_ annotations might be supported - by future compiler versions. - -============================================================================*/ - -// e.g. void SetCharRange( _In_count_(cch) const char* rgch, size_t cch ) -// valid buffer extent described by another parameter -#define _In_count_(size) _SAL1_1_Source_(_In_count_, (size), _Pre_count_(size) _Deref_pre_readonly_) -#define _In_opt_count_(size) _SAL1_1_Source_(_In_opt_count_, (size), _Pre_opt_count_(size) _Deref_pre_readonly_) -#define _In_bytecount_(size) _SAL1_1_Source_(_In_bytecount_, (size), _Pre_bytecount_(size) _Deref_pre_readonly_) -#define _In_opt_bytecount_(size) _SAL1_1_Source_(_In_opt_bytecount_, (size), _Pre_opt_bytecount_(size) _Deref_pre_readonly_) - -// valid buffer extent described by a constant extression -#define _In_count_c_(size) _SAL1_1_Source_(_In_count_c_, (size), _Pre_count_c_(size) _Deref_pre_readonly_) -#define _In_opt_count_c_(size) _SAL1_1_Source_(_In_opt_count_c_, (size), _Pre_opt_count_c_(size) _Deref_pre_readonly_) -#define _In_bytecount_c_(size) _SAL1_1_Source_(_In_bytecount_c_, (size), _Pre_bytecount_c_(size) _Deref_pre_readonly_) -#define _In_opt_bytecount_c_(size) _SAL1_1_Source_(_In_opt_bytecount_c_, (size), _Pre_opt_bytecount_c_(size) _Deref_pre_readonly_) - -// nullterminated 'input' buffers with given size - -// e.g. void SetCharRange( _In_count_(cch) const char* rgch, size_t cch ) -// nullterminated valid buffer extent described by another parameter -#define _In_z_count_(size) _SAL1_1_Source_(_In_z_count_, (size), _Pre_z_ _Pre_count_(size) _Deref_pre_readonly_) -#define _In_opt_z_count_(size) _SAL1_1_Source_(_In_opt_z_count_, (size), _Pre_opt_z_ _Pre_opt_count_(size) _Deref_pre_readonly_) -#define _In_z_bytecount_(size) _SAL1_1_Source_(_In_z_bytecount_, (size), _Pre_z_ _Pre_bytecount_(size) _Deref_pre_readonly_) -#define _In_opt_z_bytecount_(size) _SAL1_1_Source_(_In_opt_z_bytecount_, (size), _Pre_opt_z_ _Pre_opt_bytecount_(size) _Deref_pre_readonly_) - -// nullterminated valid buffer extent described by a constant extression -#define _In_z_count_c_(size) _SAL1_1_Source_(_In_z_count_c_, (size), _Pre_z_ _Pre_count_c_(size) _Deref_pre_readonly_) -#define _In_opt_z_count_c_(size) _SAL1_1_Source_(_In_opt_z_count_c_, (size), _Pre_opt_z_ _Pre_opt_count_c_(size) _Deref_pre_readonly_) -#define _In_z_bytecount_c_(size) _SAL1_1_Source_(_In_z_bytecount_c_, (size), _Pre_z_ _Pre_bytecount_c_(size) _Deref_pre_readonly_) -#define _In_opt_z_bytecount_c_(size) _SAL1_1_Source_(_In_opt_z_bytecount_c_, (size), _Pre_opt_z_ _Pre_opt_bytecount_c_(size) _Deref_pre_readonly_) - -// buffer capacity is described by another pointer -// e.g. 
void Foo( _In_ptrdiff_count_(pchMax) const char* pch, const char* pchMax ) { while pch < pchMax ) pch++; } -#define _In_ptrdiff_count_(size) _SAL1_1_Source_(_In_ptrdiff_count_, (size), _Pre_ptrdiff_count_(size) _Deref_pre_readonly_) -#define _In_opt_ptrdiff_count_(size) _SAL1_1_Source_(_In_opt_ptrdiff_count_, (size), _Pre_opt_ptrdiff_count_(size) _Deref_pre_readonly_) - -// 'x' version for complex expressions that are not supported by the current compiler version -// e.g. void Set3ColMatrix( _In_count_x_(3*cRows) const Elem* matrix, int cRows ); -#define _In_count_x_(size) _SAL1_1_Source_(_In_count_x_, (size), _Pre_count_x_(size) _Deref_pre_readonly_) -#define _In_opt_count_x_(size) _SAL1_1_Source_(_In_opt_count_x_, (size), _Pre_opt_count_x_(size) _Deref_pre_readonly_) -#define _In_bytecount_x_(size) _SAL1_1_Source_(_In_bytecount_x_, (size), _Pre_bytecount_x_(size) _Deref_pre_readonly_) -#define _In_opt_bytecount_x_(size) _SAL1_1_Source_(_In_opt_bytecount_x_, (size), _Pre_opt_bytecount_x_(size) _Deref_pre_readonly_) - - -// 'out' with buffer size -// e.g. void GetIndices( _Out_cap_(cIndices) int* rgIndices, size_t cIndices ); -// buffer capacity is described by another parameter -#define _Out_cap_(size) _SAL1_1_Source_(_Out_cap_, (size), _Pre_cap_(size) _Post_valid_impl_) -#define _Out_opt_cap_(size) _SAL1_1_Source_(_Out_opt_cap_, (size), _Pre_opt_cap_(size) _Post_valid_impl_) -#define _Out_bytecap_(size) _SAL1_1_Source_(_Out_bytecap_, (size), _Pre_bytecap_(size) _Post_valid_impl_) -#define _Out_opt_bytecap_(size) _SAL1_1_Source_(_Out_opt_bytecap_, (size), _Pre_opt_bytecap_(size) _Post_valid_impl_) - -// buffer capacity is described by a constant expression -#define _Out_cap_c_(size) _SAL1_1_Source_(_Out_cap_c_, (size), _Pre_cap_c_(size) _Post_valid_impl_) -#define _Out_opt_cap_c_(size) _SAL1_1_Source_(_Out_opt_cap_c_, (size), _Pre_opt_cap_c_(size) _Post_valid_impl_) -#define _Out_bytecap_c_(size) _SAL1_1_Source_(_Out_bytecap_c_, (size), _Pre_bytecap_c_(size) _Post_valid_impl_) -#define _Out_opt_bytecap_c_(size) _SAL1_1_Source_(_Out_opt_bytecap_c_, (size), _Pre_opt_bytecap_c_(size) _Post_valid_impl_) - -// buffer capacity is described by another parameter multiplied by a constant expression -#define _Out_cap_m_(mult,size) _SAL1_1_Source_(_Out_cap_m_, (mult,size), _Pre_cap_m_(mult,size) _Post_valid_impl_) -#define _Out_opt_cap_m_(mult,size) _SAL1_1_Source_(_Out_opt_cap_m_, (mult,size), _Pre_opt_cap_m_(mult,size) _Post_valid_impl_) -#define _Out_z_cap_m_(mult,size) _SAL1_1_Source_(_Out_z_cap_m_, (mult,size), _Pre_cap_m_(mult,size) _Post_valid_impl_ _Post_z_) -#define _Out_opt_z_cap_m_(mult,size) _SAL1_1_Source_(_Out_opt_z_cap_m_, (mult,size), _Pre_opt_cap_m_(mult,size) _Post_valid_impl_ _Post_z_) - -// buffer capacity is described by another pointer -// e.g. 
void Foo( _Out_ptrdiff_cap_(pchMax) char* pch, const char* pchMax ) { while pch < pchMax ) pch++; } -#define _Out_ptrdiff_cap_(size) _SAL1_1_Source_(_Out_ptrdiff_cap_, (size), _Pre_ptrdiff_cap_(size) _Post_valid_impl_) -#define _Out_opt_ptrdiff_cap_(size) _SAL1_1_Source_(_Out_opt_ptrdiff_cap_, (size), _Pre_opt_ptrdiff_cap_(size) _Post_valid_impl_) - -// buffer capacity is described by a complex expression -#define _Out_cap_x_(size) _SAL1_1_Source_(_Out_cap_x_, (size), _Pre_cap_x_(size) _Post_valid_impl_) -#define _Out_opt_cap_x_(size) _SAL1_1_Source_(_Out_opt_cap_x_, (size), _Pre_opt_cap_x_(size) _Post_valid_impl_) -#define _Out_bytecap_x_(size) _SAL1_1_Source_(_Out_bytecap_x_, (size), _Pre_bytecap_x_(size) _Post_valid_impl_) -#define _Out_opt_bytecap_x_(size) _SAL1_1_Source_(_Out_opt_bytecap_x_, (size), _Pre_opt_bytecap_x_(size) _Post_valid_impl_) - -// a zero terminated string is filled into a buffer of given capacity -// e.g. void CopyStr( _In_z_ const char* szFrom, _Out_z_cap_(cchTo) char* szTo, size_t cchTo ); -// buffer capacity is described by another parameter -#define _Out_z_cap_(size) _SAL1_1_Source_(_Out_z_cap_, (size), _Pre_cap_(size) _Post_valid_impl_ _Post_z_) -#define _Out_opt_z_cap_(size) _SAL1_1_Source_(_Out_opt_z_cap_, (size), _Pre_opt_cap_(size) _Post_valid_impl_ _Post_z_) -#define _Out_z_bytecap_(size) _SAL1_1_Source_(_Out_z_bytecap_, (size), _Pre_bytecap_(size) _Post_valid_impl_ _Post_z_) -#define _Out_opt_z_bytecap_(size) _SAL1_1_Source_(_Out_opt_z_bytecap_, (size), _Pre_opt_bytecap_(size) _Post_valid_impl_ _Post_z_) - -// buffer capacity is described by a constant expression -#define _Out_z_cap_c_(size) _SAL1_1_Source_(_Out_z_cap_c_, (size), _Pre_cap_c_(size) _Post_valid_impl_ _Post_z_) -#define _Out_opt_z_cap_c_(size) _SAL1_1_Source_(_Out_opt_z_cap_c_, (size), _Pre_opt_cap_c_(size) _Post_valid_impl_ _Post_z_) -#define _Out_z_bytecap_c_(size) _SAL1_1_Source_(_Out_z_bytecap_c_, (size), _Pre_bytecap_c_(size) _Post_valid_impl_ _Post_z_) -#define _Out_opt_z_bytecap_c_(size) _SAL1_1_Source_(_Out_opt_z_bytecap_c_, (size), _Pre_opt_bytecap_c_(size) _Post_valid_impl_ _Post_z_) - -// buffer capacity is described by a complex expression -#define _Out_z_cap_x_(size) _SAL1_1_Source_(_Out_z_cap_x_, (size), _Pre_cap_x_(size) _Post_valid_impl_ _Post_z_) -#define _Out_opt_z_cap_x_(size) _SAL1_1_Source_(_Out_opt_z_cap_x_, (size), _Pre_opt_cap_x_(size) _Post_valid_impl_ _Post_z_) -#define _Out_z_bytecap_x_(size) _SAL1_1_Source_(_Out_z_bytecap_x_, (size), _Pre_bytecap_x_(size) _Post_valid_impl_ _Post_z_) -#define _Out_opt_z_bytecap_x_(size) _SAL1_1_Source_(_Out_opt_z_bytecap_x_, (size), _Pre_opt_bytecap_x_(size) _Post_valid_impl_ _Post_z_) - -// a zero terminated string is filled into a buffer of given capacity -// e.g. 
size_t CopyCharRange( _In_count_(cchFrom) const char* rgchFrom, size_t cchFrom, _Out_cap_post_count_(cchTo,return)) char* rgchTo, size_t cchTo ); -#define _Out_cap_post_count_(cap,count) _SAL1_1_Source_(_Out_cap_post_count_, (cap,count), _Pre_cap_(cap) _Post_valid_impl_ _Post_count_(count)) -#define _Out_opt_cap_post_count_(cap,count) _SAL1_1_Source_(_Out_opt_cap_post_count_, (cap,count), _Pre_opt_cap_(cap) _Post_valid_impl_ _Post_count_(count)) -#define _Out_bytecap_post_bytecount_(cap,count) _SAL1_1_Source_(_Out_bytecap_post_bytecount_, (cap,count), _Pre_bytecap_(cap) _Post_valid_impl_ _Post_bytecount_(count)) -#define _Out_opt_bytecap_post_bytecount_(cap,count) _SAL1_1_Source_(_Out_opt_bytecap_post_bytecount_, (cap,count), _Pre_opt_bytecap_(cap) _Post_valid_impl_ _Post_bytecount_(count)) - -// a zero terminated string is filled into a buffer of given capacity -// e.g. size_t CopyStr( _In_z_ const char* szFrom, _Out_z_cap_post_count_(cchTo,return+1) char* szTo, size_t cchTo ); -#define _Out_z_cap_post_count_(cap,count) _SAL1_1_Source_(_Out_z_cap_post_count_, (cap,count), _Pre_cap_(cap) _Post_valid_impl_ _Post_z_count_(count)) -#define _Out_opt_z_cap_post_count_(cap,count) _SAL1_1_Source_(_Out_opt_z_cap_post_count_, (cap,count), _Pre_opt_cap_(cap) _Post_valid_impl_ _Post_z_count_(count)) -#define _Out_z_bytecap_post_bytecount_(cap,count) _SAL1_1_Source_(_Out_z_bytecap_post_bytecount_, (cap,count), _Pre_bytecap_(cap) _Post_valid_impl_ _Post_z_bytecount_(count)) -#define _Out_opt_z_bytecap_post_bytecount_(cap,count) _SAL1_1_Source_(_Out_opt_z_bytecap_post_bytecount_, (cap,count), _Pre_opt_bytecap_(cap) _Post_valid_impl_ _Post_z_bytecount_(count)) - -// only use with dereferenced arguments e.g. '*pcch' -#define _Out_capcount_(capcount) _SAL1_1_Source_(_Out_capcount_, (capcount), _Pre_cap_(capcount) _Post_valid_impl_ _Post_count_(capcount)) -#define _Out_opt_capcount_(capcount) _SAL1_1_Source_(_Out_opt_capcount_, (capcount), _Pre_opt_cap_(capcount) _Post_valid_impl_ _Post_count_(capcount)) -#define _Out_bytecapcount_(capcount) _SAL1_1_Source_(_Out_bytecapcount_, (capcount), _Pre_bytecap_(capcount) _Post_valid_impl_ _Post_bytecount_(capcount)) -#define _Out_opt_bytecapcount_(capcount) _SAL1_1_Source_(_Out_opt_bytecapcount_, (capcount), _Pre_opt_bytecap_(capcount) _Post_valid_impl_ _Post_bytecount_(capcount)) - -#define _Out_capcount_x_(capcount) _SAL1_1_Source_(_Out_capcount_x_, (capcount), _Pre_cap_x_(capcount) _Post_valid_impl_ _Post_count_x_(capcount)) -#define _Out_opt_capcount_x_(capcount) _SAL1_1_Source_(_Out_opt_capcount_x_, (capcount), _Pre_opt_cap_x_(capcount) _Post_valid_impl_ _Post_count_x_(capcount)) -#define _Out_bytecapcount_x_(capcount) _SAL1_1_Source_(_Out_bytecapcount_x_, (capcount), _Pre_bytecap_x_(capcount) _Post_valid_impl_ _Post_bytecount_x_(capcount)) -#define _Out_opt_bytecapcount_x_(capcount) _SAL1_1_Source_(_Out_opt_bytecapcount_x_, (capcount), _Pre_opt_bytecap_x_(capcount) _Post_valid_impl_ _Post_bytecount_x_(capcount)) - -// e.g. 
GetString( _Out_z_capcount_(*pLen+1) char* sz, size_t* pLen ); -#define _Out_z_capcount_(capcount) _SAL1_1_Source_(_Out_z_capcount_, (capcount), _Pre_cap_(capcount) _Post_valid_impl_ _Post_z_count_(capcount)) -#define _Out_opt_z_capcount_(capcount) _SAL1_1_Source_(_Out_opt_z_capcount_, (capcount), _Pre_opt_cap_(capcount) _Post_valid_impl_ _Post_z_count_(capcount)) -#define _Out_z_bytecapcount_(capcount) _SAL1_1_Source_(_Out_z_bytecapcount_, (capcount), _Pre_bytecap_(capcount) _Post_valid_impl_ _Post_z_bytecount_(capcount)) -#define _Out_opt_z_bytecapcount_(capcount) _SAL1_1_Source_(_Out_opt_z_bytecapcount_, (capcount), _Pre_opt_bytecap_(capcount) _Post_valid_impl_ _Post_z_bytecount_(capcount)) - - -// 'inout' buffers with initialized elements before and after the call -// e.g. void ModifyIndices( _Inout_count_(cIndices) int* rgIndices, size_t cIndices ); -#define _Inout_count_(size) _SAL1_1_Source_(_Inout_count_, (size), _Prepost_count_(size)) -#define _Inout_opt_count_(size) _SAL1_1_Source_(_Inout_opt_count_, (size), _Prepost_opt_count_(size)) -#define _Inout_bytecount_(size) _SAL1_1_Source_(_Inout_bytecount_, (size), _Prepost_bytecount_(size)) -#define _Inout_opt_bytecount_(size) _SAL1_1_Source_(_Inout_opt_bytecount_, (size), _Prepost_opt_bytecount_(size)) - -#define _Inout_count_c_(size) _SAL1_1_Source_(_Inout_count_c_, (size), _Prepost_count_c_(size)) -#define _Inout_opt_count_c_(size) _SAL1_1_Source_(_Inout_opt_count_c_, (size), _Prepost_opt_count_c_(size)) -#define _Inout_bytecount_c_(size) _SAL1_1_Source_(_Inout_bytecount_c_, (size), _Prepost_bytecount_c_(size)) -#define _Inout_opt_bytecount_c_(size) _SAL1_1_Source_(_Inout_opt_bytecount_c_, (size), _Prepost_opt_bytecount_c_(size)) - -// nullterminated 'inout' buffers with initialized elements before and after the call -// e.g. 
void ModifyIndices( _Inout_count_(cIndices) int* rgIndices, size_t cIndices ); -#define _Inout_z_count_(size) _SAL1_1_Source_(_Inout_z_count_, (size), _Prepost_z_ _Prepost_count_(size)) -#define _Inout_opt_z_count_(size) _SAL1_1_Source_(_Inout_opt_z_count_, (size), _Prepost_z_ _Prepost_opt_count_(size)) -#define _Inout_z_bytecount_(size) _SAL1_1_Source_(_Inout_z_bytecount_, (size), _Prepost_z_ _Prepost_bytecount_(size)) -#define _Inout_opt_z_bytecount_(size) _SAL1_1_Source_(_Inout_opt_z_bytecount_, (size), _Prepost_z_ _Prepost_opt_bytecount_(size)) - -#define _Inout_z_count_c_(size) _SAL1_1_Source_(_Inout_z_count_c_, (size), _Prepost_z_ _Prepost_count_c_(size)) -#define _Inout_opt_z_count_c_(size) _SAL1_1_Source_(_Inout_opt_z_count_c_, (size), _Prepost_z_ _Prepost_opt_count_c_(size)) -#define _Inout_z_bytecount_c_(size) _SAL1_1_Source_(_Inout_z_bytecount_c_, (size), _Prepost_z_ _Prepost_bytecount_c_(size)) -#define _Inout_opt_z_bytecount_c_(size) _SAL1_1_Source_(_Inout_opt_z_bytecount_c_, (size), _Prepost_z_ _Prepost_opt_bytecount_c_(size)) - -#define _Inout_ptrdiff_count_(size) _SAL1_1_Source_(_Inout_ptrdiff_count_, (size), _Pre_ptrdiff_count_(size)) -#define _Inout_opt_ptrdiff_count_(size) _SAL1_1_Source_(_Inout_opt_ptrdiff_count_, (size), _Pre_opt_ptrdiff_count_(size)) - -#define _Inout_count_x_(size) _SAL1_1_Source_(_Inout_count_x_, (size), _Prepost_count_x_(size)) -#define _Inout_opt_count_x_(size) _SAL1_1_Source_(_Inout_opt_count_x_, (size), _Prepost_opt_count_x_(size)) -#define _Inout_bytecount_x_(size) _SAL1_1_Source_(_Inout_bytecount_x_, (size), _Prepost_bytecount_x_(size)) -#define _Inout_opt_bytecount_x_(size) _SAL1_1_Source_(_Inout_opt_bytecount_x_, (size), _Prepost_opt_bytecount_x_(size)) - -// e.g. void AppendToLPSTR( _In_ LPCSTR szFrom, _Inout_cap_(cchTo) LPSTR* szTo, size_t cchTo ); -#define _Inout_cap_(size) _SAL1_1_Source_(_Inout_cap_, (size), _Pre_valid_cap_(size) _Post_valid_) -#define _Inout_opt_cap_(size) _SAL1_1_Source_(_Inout_opt_cap_, (size), _Pre_opt_valid_cap_(size) _Post_valid_) -#define _Inout_bytecap_(size) _SAL1_1_Source_(_Inout_bytecap_, (size), _Pre_valid_bytecap_(size) _Post_valid_) -#define _Inout_opt_bytecap_(size) _SAL1_1_Source_(_Inout_opt_bytecap_, (size), _Pre_opt_valid_bytecap_(size) _Post_valid_) - -#define _Inout_cap_c_(size) _SAL1_1_Source_(_Inout_cap_c_, (size), _Pre_valid_cap_c_(size) _Post_valid_) -#define _Inout_opt_cap_c_(size) _SAL1_1_Source_(_Inout_opt_cap_c_, (size), _Pre_opt_valid_cap_c_(size) _Post_valid_) -#define _Inout_bytecap_c_(size) _SAL1_1_Source_(_Inout_bytecap_c_, (size), _Pre_valid_bytecap_c_(size) _Post_valid_) -#define _Inout_opt_bytecap_c_(size) _SAL1_1_Source_(_Inout_opt_bytecap_c_, (size), _Pre_opt_valid_bytecap_c_(size) _Post_valid_) - -#define _Inout_cap_x_(size) _SAL1_1_Source_(_Inout_cap_x_, (size), _Pre_valid_cap_x_(size) _Post_valid_) -#define _Inout_opt_cap_x_(size) _SAL1_1_Source_(_Inout_opt_cap_x_, (size), _Pre_opt_valid_cap_x_(size) _Post_valid_) -#define _Inout_bytecap_x_(size) _SAL1_1_Source_(_Inout_bytecap_x_, (size), _Pre_valid_bytecap_x_(size) _Post_valid_) -#define _Inout_opt_bytecap_x_(size) _SAL1_1_Source_(_Inout_opt_bytecap_x_, (size), _Pre_opt_valid_bytecap_x_(size) _Post_valid_) - -// inout string buffers with writable size -// e.g. 
void AppendStr( _In_z_ const char* szFrom, _Inout_z_cap_(cchTo) char* szTo, size_t cchTo ); -#define _Inout_z_cap_(size) _SAL1_1_Source_(_Inout_z_cap_, (size), _Pre_z_cap_(size) _Post_z_) -#define _Inout_opt_z_cap_(size) _SAL1_1_Source_(_Inout_opt_z_cap_, (size), _Pre_opt_z_cap_(size) _Post_z_) -#define _Inout_z_bytecap_(size) _SAL1_1_Source_(_Inout_z_bytecap_, (size), _Pre_z_bytecap_(size) _Post_z_) -#define _Inout_opt_z_bytecap_(size) _SAL1_1_Source_(_Inout_opt_z_bytecap_, (size), _Pre_opt_z_bytecap_(size) _Post_z_) - -#define _Inout_z_cap_c_(size) _SAL1_1_Source_(_Inout_z_cap_c_, (size), _Pre_z_cap_c_(size) _Post_z_) -#define _Inout_opt_z_cap_c_(size) _SAL1_1_Source_(_Inout_opt_z_cap_c_, (size), _Pre_opt_z_cap_c_(size) _Post_z_) -#define _Inout_z_bytecap_c_(size) _SAL1_1_Source_(_Inout_z_bytecap_c_, (size), _Pre_z_bytecap_c_(size) _Post_z_) -#define _Inout_opt_z_bytecap_c_(size) _SAL1_1_Source_(_Inout_opt_z_bytecap_c_, (size), _Pre_opt_z_bytecap_c_(size) _Post_z_) - -#define _Inout_z_cap_x_(size) _SAL1_1_Source_(_Inout_z_cap_x_, (size), _Pre_z_cap_x_(size) _Post_z_) -#define _Inout_opt_z_cap_x_(size) _SAL1_1_Source_(_Inout_opt_z_cap_x_, (size), _Pre_opt_z_cap_x_(size) _Post_z_) -#define _Inout_z_bytecap_x_(size) _SAL1_1_Source_(_Inout_z_bytecap_x_, (size), _Pre_z_bytecap_x_(size) _Post_z_) -#define _Inout_opt_z_bytecap_x_(size) _SAL1_1_Source_(_Inout_opt_z_bytecap_x_, (size), _Pre_opt_z_bytecap_x_(size) _Post_z_) - - -// returning pointers to valid objects -#define _Ret_ _SAL1_1_Source_(_Ret_, (), _Ret_valid_) -#define _Ret_opt_ _SAL1_1_Source_(_Ret_opt_, (), _Ret_opt_valid_) - -// annotations to express 'boundedness' of integral value parameter -#define _In_bound_ _SAL1_1_Source_(_In_bound_, (), _In_bound_impl_) -#define _Out_bound_ _SAL1_1_Source_(_Out_bound_, (), _Out_bound_impl_) -#define _Ret_bound_ _SAL1_1_Source_(_Ret_bound_, (), _Ret_bound_impl_) -#define _Deref_in_bound_ _SAL1_1_Source_(_Deref_in_bound_, (), _Deref_in_bound_impl_) -#define _Deref_out_bound_ _SAL1_1_Source_(_Deref_out_bound_, (), _Deref_out_bound_impl_) -#define _Deref_inout_bound_ _SAL1_1_Source_(_Deref_inout_bound_, (), _Deref_in_bound_ _Deref_out_bound_) -#define _Deref_ret_bound_ _SAL1_1_Source_(_Deref_ret_bound_, (), _Deref_ret_bound_impl_) - -// e.g. HRESULT HrCreatePoint( _Deref_out_opt_ POINT** ppPT ); -#define _Deref_out_ _SAL1_1_Source_(_Deref_out_, (), _Out_ _Deref_post_valid_) -#define _Deref_out_opt_ _SAL1_1_Source_(_Deref_out_opt_, (), _Out_ _Deref_post_opt_valid_) -#define _Deref_opt_out_ _SAL1_1_Source_(_Deref_opt_out_, (), _Out_opt_ _Deref_post_valid_) -#define _Deref_opt_out_opt_ _SAL1_1_Source_(_Deref_opt_out_opt_, (), _Out_opt_ _Deref_post_opt_valid_) - -// e.g. void CloneString( _In_z_ const WCHAR* wzFrom, _Deref_out_z_ WCHAR** pWzTo ); -#define _Deref_out_z_ _SAL1_1_Source_(_Deref_out_z_, (), _Out_ _Deref_post_z_) -#define _Deref_out_opt_z_ _SAL1_1_Source_(_Deref_out_opt_z_, (), _Out_ _Deref_post_opt_z_) -#define _Deref_opt_out_z_ _SAL1_1_Source_(_Deref_opt_out_z_, (), _Out_opt_ _Deref_post_z_) -#define _Deref_opt_out_opt_z_ _SAL1_1_Source_(_Deref_opt_out_opt_z_, (), _Out_opt_ _Deref_post_opt_z_) - -// -// _Deref_pre_ --- -// -// describing conditions for array elements of dereferenced pointer parameters that must be met before the call - -// e.g. 
void SaveStringArray( _In_count_(cStrings) _Deref_pre_z_ const WCHAR* const rgpwch[] ); -#define _Deref_pre_z_ _SAL1_1_Source_(_Deref_pre_z_, (), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__zterm_impl) _Pre_valid_impl_) -#define _Deref_pre_opt_z_ _SAL1_1_Source_(_Deref_pre_opt_z_, (), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__zterm_impl) _Pre_valid_impl_) - -// e.g. void FillInArrayOfStr32( _In_count_(cStrings) _Deref_pre_cap_c_(32) _Deref_post_z_ WCHAR* const rgpwch[] ); -// buffer capacity is described by another parameter -#define _Deref_pre_cap_(size) _SAL1_1_Source_(_Deref_pre_cap_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__cap_impl(size))) -#define _Deref_pre_opt_cap_(size) _SAL1_1_Source_(_Deref_pre_opt_cap_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__cap_impl(size))) -#define _Deref_pre_bytecap_(size) _SAL1_1_Source_(_Deref_pre_bytecap_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__bytecap_impl(size))) -#define _Deref_pre_opt_bytecap_(size) _SAL1_1_Source_(_Deref_pre_opt_bytecap_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__bytecap_impl(size))) - -// buffer capacity is described by a constant expression -#define _Deref_pre_cap_c_(size) _SAL1_1_Source_(_Deref_pre_cap_c_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__cap_c_impl(size))) -#define _Deref_pre_opt_cap_c_(size) _SAL1_1_Source_(_Deref_pre_opt_cap_c_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__cap_c_impl(size))) -#define _Deref_pre_bytecap_c_(size) _SAL1_1_Source_(_Deref_pre_bytecap_c_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__bytecap_c_impl(size))) -#define _Deref_pre_opt_bytecap_c_(size) _SAL1_1_Source_(_Deref_pre_opt_bytecap_c_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__bytecap_c_impl(size))) - -// buffer capacity is described by a complex condition -#define _Deref_pre_cap_x_(size) _SAL1_1_Source_(_Deref_pre_cap_x_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__cap_x_impl(size))) -#define _Deref_pre_opt_cap_x_(size) _SAL1_1_Source_(_Deref_pre_opt_cap_x_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__cap_x_impl(size))) -#define _Deref_pre_bytecap_x_(size) _SAL1_1_Source_(_Deref_pre_bytecap_x_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__bytecap_x_impl(size))) -#define _Deref_pre_opt_bytecap_x_(size) _SAL1_1_Source_(_Deref_pre_opt_bytecap_x_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__bytecap_x_impl(size))) - -// convenience macros for nullterminated buffers with given capacity -#define _Deref_pre_z_cap_(size) _SAL1_1_Source_(_Deref_pre_z_cap_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__cap_impl(size)) _Pre_valid_impl_) -#define _Deref_pre_opt_z_cap_(size) _SAL1_1_Source_(_Deref_pre_opt_z_cap_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__cap_impl(size)) _Pre_valid_impl_) -#define _Deref_pre_z_bytecap_(size) _SAL1_1_Source_(_Deref_pre_z_bytecap_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__bytecap_impl(size)) _Pre_valid_impl_) -#define _Deref_pre_opt_z_bytecap_(size) _SAL1_1_Source_(_Deref_pre_opt_z_bytecap_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__bytecap_impl(size)) _Pre_valid_impl_) - -#define 
_Deref_pre_z_cap_c_(size) _SAL1_1_Source_(_Deref_pre_z_cap_c_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__cap_c_impl(size)) _Pre_valid_impl_) -#define _Deref_pre_opt_z_cap_c_(size) _SAL1_1_Source_(_Deref_pre_opt_z_cap_c_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__cap_c_impl(size)) _Pre_valid_impl_) -#define _Deref_pre_z_bytecap_c_(size) _SAL1_1_Source_(_Deref_pre_z_bytecap_c_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__bytecap_c_impl(size)) _Pre_valid_impl_) -#define _Deref_pre_opt_z_bytecap_c_(size) _SAL1_1_Source_(_Deref_pre_opt_z_bytecap_c_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__bytecap_c_impl(size)) _Pre_valid_impl_) - -#define _Deref_pre_z_cap_x_(size) _SAL1_1_Source_(_Deref_pre_z_cap_x_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__cap_x_impl(size)) _Pre_valid_impl_) -#define _Deref_pre_opt_z_cap_x_(size) _SAL1_1_Source_(_Deref_pre_opt_z_cap_x_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__cap_x_impl(size)) _Pre_valid_impl_) -#define _Deref_pre_z_bytecap_x_(size) _SAL1_1_Source_(_Deref_pre_z_bytecap_x_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__bytecap_x_impl(size)) _Pre_valid_impl_) -#define _Deref_pre_opt_z_bytecap_x_(size) _SAL1_1_Source_(_Deref_pre_opt_z_bytecap_x_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__bytecap_x_impl(size)) _Pre_valid_impl_) - -// known capacity and valid but unknown readable extent -#define _Deref_pre_valid_cap_(size) _SAL1_1_Source_(_Deref_pre_valid_cap_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__cap_impl(size)) _Pre_valid_impl_) -#define _Deref_pre_opt_valid_cap_(size) _SAL1_1_Source_(_Deref_pre_opt_valid_cap_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__cap_impl(size)) _Pre_valid_impl_) -#define _Deref_pre_valid_bytecap_(size) _SAL1_1_Source_(_Deref_pre_valid_bytecap_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__bytecap_impl(size)) _Pre_valid_impl_) -#define _Deref_pre_opt_valid_bytecap_(size) _SAL1_1_Source_(_Deref_pre_opt_valid_bytecap_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__bytecap_impl(size)) _Pre_valid_impl_) - -#define _Deref_pre_valid_cap_c_(size) _SAL1_1_Source_(_Deref_pre_valid_cap_c_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__cap_c_impl(size)) _Pre_valid_impl_) -#define _Deref_pre_opt_valid_cap_c_(size) _SAL1_1_Source_(_Deref_pre_opt_valid_cap_c_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__cap_c_impl(size)) _Pre_valid_impl_) -#define _Deref_pre_valid_bytecap_c_(size) _SAL1_1_Source_(_Deref_pre_valid_bytecap_c_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__bytecap_c_impl(size)) _Pre_valid_impl_) -#define _Deref_pre_opt_valid_bytecap_c_(size) _SAL1_1_Source_(_Deref_pre_opt_valid_bytecap_c_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__bytecap_c_impl(size)) _Pre_valid_impl_) - -#define _Deref_pre_valid_cap_x_(size) _SAL1_1_Source_(_Deref_pre_valid_cap_x_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__cap_x_impl(size)) _Pre_valid_impl_) -#define _Deref_pre_opt_valid_cap_x_(size) _SAL1_1_Source_(_Deref_pre_opt_valid_cap_x_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) 
_Deref_pre1_impl_(__cap_x_impl(size)) _Pre_valid_impl_) -#define _Deref_pre_valid_bytecap_x_(size) _SAL1_1_Source_(_Deref_pre_valid_bytecap_x_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__bytecap_x_impl(size)) _Pre_valid_impl_) -#define _Deref_pre_opt_valid_bytecap_x_(size) _SAL1_1_Source_(_Deref_pre_opt_valid_bytecap_x_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__bytecap_x_impl(size)) _Pre_valid_impl_) - -// e.g. void SaveMatrix( _In_count_(n) _Deref_pre_count_(n) const Elem** matrix, size_t n ); -// valid buffer extent is described by another parameter -#define _Deref_pre_count_(size) _SAL1_1_Source_(_Deref_pre_count_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__count_impl(size)) _Pre_valid_impl_) -#define _Deref_pre_opt_count_(size) _SAL1_1_Source_(_Deref_pre_opt_count_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__count_impl(size)) _Pre_valid_impl_) -#define _Deref_pre_bytecount_(size) _SAL1_1_Source_(_Deref_pre_bytecount_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__bytecount_impl(size)) _Pre_valid_impl_) -#define _Deref_pre_opt_bytecount_(size) _SAL1_1_Source_(_Deref_pre_opt_bytecount_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__bytecount_impl(size)) _Pre_valid_impl_) - -// valid buffer extent is described by a constant expression -#define _Deref_pre_count_c_(size) _SAL1_1_Source_(_Deref_pre_count_c_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__count_c_impl(size)) _Pre_valid_impl_) -#define _Deref_pre_opt_count_c_(size) _SAL1_1_Source_(_Deref_pre_opt_count_c_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__count_c_impl(size)) _Pre_valid_impl_) -#define _Deref_pre_bytecount_c_(size) _SAL1_1_Source_(_Deref_pre_bytecount_c_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__bytecount_c_impl(size)) _Pre_valid_impl_) -#define _Deref_pre_opt_bytecount_c_(size) _SAL1_1_Source_(_Deref_pre_opt_bytecount_c_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__bytecount_c_impl(size)) _Pre_valid_impl_) - -// valid buffer extent is described by a complex expression -#define _Deref_pre_count_x_(size) _SAL1_1_Source_(_Deref_pre_count_x_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__count_x_impl(size)) _Pre_valid_impl_) -#define _Deref_pre_opt_count_x_(size) _SAL1_1_Source_(_Deref_pre_opt_count_x_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__count_x_impl(size)) _Pre_valid_impl_) -#define _Deref_pre_bytecount_x_(size) _SAL1_1_Source_(_Deref_pre_bytecount_x_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__bytecount_x_impl(size)) _Pre_valid_impl_) -#define _Deref_pre_opt_bytecount_x_(size) _SAL1_1_Source_(_Deref_pre_opt_bytecount_x_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__bytecount_x_impl(size)) _Pre_valid_impl_) - -// e.g. 
void PrintStringArray( _In_count_(cElems) _Deref_pre_valid_ LPCSTR rgStr[], size_t cElems ); -#define _Deref_pre_valid_ _SAL1_1_Source_(_Deref_pre_valid_, (), _Deref_pre1_impl_(__notnull_impl_notref) _Pre_valid_impl_) -#define _Deref_pre_opt_valid_ _SAL1_1_Source_(_Deref_pre_opt_valid_, (), _Deref_pre1_impl_(__maybenull_impl_notref) _Pre_valid_impl_) -#define _Deref_pre_invalid_ _SAL1_1_Source_(_Deref_pre_invalid_, (), _Deref_pre1_impl_(__notvalid_impl)) - -#define _Deref_pre_notnull_ _SAL1_1_Source_(_Deref_pre_notnull_, (), _Deref_pre1_impl_(__notnull_impl_notref)) -#define _Deref_pre_maybenull_ _SAL1_1_Source_(_Deref_pre_maybenull_, (), _Deref_pre1_impl_(__maybenull_impl_notref)) -#define _Deref_pre_null_ _SAL1_1_Source_(_Deref_pre_null_, (), _Deref_pre1_impl_(__null_impl_notref)) - -// restrict access rights -#define _Deref_pre_readonly_ _SAL1_1_Source_(_Deref_pre_readonly_, (), _Deref_pre1_impl_(__readaccess_impl_notref)) -#define _Deref_pre_writeonly_ _SAL1_1_Source_(_Deref_pre_writeonly_, (), _Deref_pre1_impl_(__writeaccess_impl_notref)) - -// -// _Deref_post_ --- -// -// describing conditions for array elements or dereferenced pointer parameters that hold after the call - -// e.g. void CloneString( _In_z_ const Wchar_t* wzIn _Out_ _Deref_post_z_ WCHAR** pWzOut ); -#define _Deref_post_z_ _SAL1_1_Source_(_Deref_post_z_, (), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__zterm_impl) _Post_valid_impl_) -#define _Deref_post_opt_z_ _SAL1_1_Source_(_Deref_post_opt_z_, (), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__zterm_impl) _Post_valid_impl_) - -// e.g. HRESULT HrAllocateMemory( size_t cb, _Out_ _Deref_post_bytecap_(cb) void** ppv ); -// buffer capacity is described by another parameter -#define _Deref_post_cap_(size) _SAL1_1_Source_(_Deref_post_cap_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__cap_impl(size))) -#define _Deref_post_opt_cap_(size) _SAL1_1_Source_(_Deref_post_opt_cap_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__cap_impl(size))) -#define _Deref_post_bytecap_(size) _SAL1_1_Source_(_Deref_post_bytecap_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__bytecap_impl(size))) -#define _Deref_post_opt_bytecap_(size) _SAL1_1_Source_(_Deref_post_opt_bytecap_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__bytecap_impl(size))) - -// buffer capacity is described by a constant expression -#define _Deref_post_cap_c_(size) _SAL1_1_Source_(_Deref_post_cap_c_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__cap_c_impl(size))) -#define _Deref_post_opt_cap_c_(size) _SAL1_1_Source_(_Deref_post_opt_cap_c_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__cap_c_impl(size))) -#define _Deref_post_bytecap_c_(size) _SAL1_1_Source_(_Deref_post_bytecap_c_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__bytecap_c_impl(size))) -#define _Deref_post_opt_bytecap_c_(size) _SAL1_1_Source_(_Deref_post_opt_bytecap_c_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__bytecap_c_impl(size))) - -// buffer capacity is described by a complex expression -#define _Deref_post_cap_x_(size) _SAL1_1_Source_(_Deref_post_cap_x_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__cap_x_impl(size))) -#define _Deref_post_opt_cap_x_(size) _SAL1_1_Source_(_Deref_post_opt_cap_x_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__cap_x_impl(size))) 
-#define _Deref_post_bytecap_x_(size) _SAL1_1_Source_(_Deref_post_bytecap_x_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__bytecap_x_impl(size))) -#define _Deref_post_opt_bytecap_x_(size) _SAL1_1_Source_(_Deref_post_opt_bytecap_x_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__bytecap_x_impl(size))) - -// convenience macros for nullterminated buffers with given capacity -#define _Deref_post_z_cap_(size) _SAL1_1_Source_(_Deref_post_z_cap_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post2_impl_(__zterm_impl,__cap_impl(size)) _Post_valid_impl_) -#define _Deref_post_opt_z_cap_(size) _SAL1_1_Source_(_Deref_post_opt_z_cap_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post2_impl_(__zterm_impl,__cap_impl(size)) _Post_valid_impl_) -#define _Deref_post_z_bytecap_(size) _SAL1_1_Source_(_Deref_post_z_bytecap_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post2_impl_(__zterm_impl,__bytecap_impl(size)) _Post_valid_impl_) -#define _Deref_post_opt_z_bytecap_(size) _SAL1_1_Source_(_Deref_post_opt_z_bytecap_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post2_impl_(__zterm_impl,__bytecap_impl(size)) _Post_valid_impl_) - -#define _Deref_post_z_cap_c_(size) _SAL1_1_Source_(_Deref_post_z_cap_c_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post2_impl_(__zterm_impl,__cap_c_impl(size)) _Post_valid_impl_) -#define _Deref_post_opt_z_cap_c_(size) _SAL1_1_Source_(_Deref_post_opt_z_cap_c_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post2_impl_(__zterm_impl,__cap_c_impl(size)) _Post_valid_impl_) -#define _Deref_post_z_bytecap_c_(size) _SAL1_1_Source_(_Deref_post_z_bytecap_c_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post2_impl_(__zterm_impl,__bytecap_c_impl(size)) _Post_valid_impl_) -#define _Deref_post_opt_z_bytecap_c_(size) _SAL1_1_Source_(_Deref_post_opt_z_bytecap_c_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post2_impl_(__zterm_impl,__bytecap_c_impl(size)) _Post_valid_impl_) - -#define _Deref_post_z_cap_x_(size) _SAL1_1_Source_(_Deref_post_z_cap_x_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post2_impl_(__zterm_impl,__cap_x_impl(size)) _Post_valid_impl_) -#define _Deref_post_opt_z_cap_x_(size) _SAL1_1_Source_(_Deref_post_opt_z_cap_x_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post2_impl_(__zterm_impl,__cap_x_impl(size)) _Post_valid_impl_) -#define _Deref_post_z_bytecap_x_(size) _SAL1_1_Source_(_Deref_post_z_bytecap_x_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post2_impl_(__zterm_impl,__bytecap_x_impl(size)) _Post_valid_impl_) -#define _Deref_post_opt_z_bytecap_x_(size) _SAL1_1_Source_(_Deref_post_opt_z_bytecap_x_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post2_impl_(__zterm_impl,__bytecap_x_impl(size)) _Post_valid_impl_) - -// known capacity and valid but unknown readable extent -#define _Deref_post_valid_cap_(size) _SAL1_1_Source_(_Deref_post_valid_cap_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__cap_impl(size)) _Post_valid_impl_) -#define _Deref_post_opt_valid_cap_(size) _SAL1_1_Source_(_Deref_post_opt_valid_cap_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__cap_impl(size)) _Post_valid_impl_) -#define _Deref_post_valid_bytecap_(size) _SAL1_1_Source_(_Deref_post_valid_bytecap_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__bytecap_impl(size)) _Post_valid_impl_) -#define 
_Deref_post_opt_valid_bytecap_(size) _SAL1_1_Source_(_Deref_post_opt_valid_bytecap_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__bytecap_impl(size)) _Post_valid_impl_) - -#define _Deref_post_valid_cap_c_(size) _SAL1_1_Source_(_Deref_post_valid_cap_c_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__cap_c_impl(size)) _Post_valid_impl_) -#define _Deref_post_opt_valid_cap_c_(size) _SAL1_1_Source_(_Deref_post_opt_valid_cap_c_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__cap_c_impl(size)) _Post_valid_impl_) -#define _Deref_post_valid_bytecap_c_(size) _SAL1_1_Source_(_Deref_post_valid_bytecap_c_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__bytecap_c_impl(size)) _Post_valid_impl_) -#define _Deref_post_opt_valid_bytecap_c_(size) _SAL1_1_Source_(_Deref_post_opt_valid_bytecap_c_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__bytecap_c_impl(size)) _Post_valid_impl_) - -#define _Deref_post_valid_cap_x_(size) _SAL1_1_Source_(_Deref_post_valid_cap_x_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__cap_x_impl(size)) _Post_valid_impl_) -#define _Deref_post_opt_valid_cap_x_(size) _SAL1_1_Source_(_Deref_post_opt_valid_cap_x_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__cap_x_impl(size)) _Post_valid_impl_) -#define _Deref_post_valid_bytecap_x_(size) _SAL1_1_Source_(_Deref_post_valid_bytecap_x_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__bytecap_x_impl(size)) _Post_valid_impl_) -#define _Deref_post_opt_valid_bytecap_x_(size) _SAL1_1_Source_(_Deref_post_opt_valid_bytecap_x_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__bytecap_x_impl(size)) _Post_valid_impl_) - -// e.g. 
HRESULT HrAllocateZeroInitializedMemory( size_t cb, _Out_ _Deref_post_bytecount_(cb) void** ppv ); -// valid buffer extent is described by another parameter -#define _Deref_post_count_(size) _SAL1_1_Source_(_Deref_post_count_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__count_impl(size)) _Post_valid_impl_) -#define _Deref_post_opt_count_(size) _SAL1_1_Source_(_Deref_post_opt_count_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__count_impl(size)) _Post_valid_impl_) -#define _Deref_post_bytecount_(size) _SAL1_1_Source_(_Deref_post_bytecount_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__bytecount_impl(size)) _Post_valid_impl_) -#define _Deref_post_opt_bytecount_(size) _SAL1_1_Source_(_Deref_post_opt_bytecount_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__bytecount_impl(size)) _Post_valid_impl_) - -// buffer capacity is described by a constant expression -#define _Deref_post_count_c_(size) _SAL1_1_Source_(_Deref_post_count_c_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__count_c_impl(size)) _Post_valid_impl_) -#define _Deref_post_opt_count_c_(size) _SAL1_1_Source_(_Deref_post_opt_count_c_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__count_c_impl(size)) _Post_valid_impl_) -#define _Deref_post_bytecount_c_(size) _SAL1_1_Source_(_Deref_post_bytecount_c_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__bytecount_c_impl(size)) _Post_valid_impl_) -#define _Deref_post_opt_bytecount_c_(size) _SAL1_1_Source_(_Deref_post_opt_bytecount_c_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__bytecount_c_impl(size)) _Post_valid_impl_) - -// buffer capacity is described by a complex expression -#define _Deref_post_count_x_(size) _SAL1_1_Source_(_Deref_post_count_x_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__count_x_impl(size)) _Post_valid_impl_) -#define _Deref_post_opt_count_x_(size) _SAL1_1_Source_(_Deref_post_opt_count_x_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__count_x_impl(size)) _Post_valid_impl_) -#define _Deref_post_bytecount_x_(size) _SAL1_1_Source_(_Deref_post_bytecount_x_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__bytecount_x_impl(size)) _Post_valid_impl_) -#define _Deref_post_opt_bytecount_x_(size) _SAL1_1_Source_(_Deref_post_opt_bytecount_x_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__bytecount_x_impl(size)) _Post_valid_impl_) - -// e.g. 
void GetStrings( _Out_count_(cElems) _Deref_post_valid_ LPSTR const rgStr[], size_t cElems ); -#define _Deref_post_valid_ _SAL1_1_Source_(_Deref_post_valid_, (), _Deref_post1_impl_(__notnull_impl_notref) _Post_valid_impl_) -#define _Deref_post_opt_valid_ _SAL1_1_Source_(_Deref_post_opt_valid_, (), _Deref_post1_impl_(__maybenull_impl_notref) _Post_valid_impl_) - -#define _Deref_post_notnull_ _SAL1_1_Source_(_Deref_post_notnull_, (), _Deref_post1_impl_(__notnull_impl_notref)) -#define _Deref_post_maybenull_ _SAL1_1_Source_(_Deref_post_maybenull_, (), _Deref_post1_impl_(__maybenull_impl_notref)) -#define _Deref_post_null_ _SAL1_1_Source_(_Deref_post_null_, (), _Deref_post1_impl_(__null_impl_notref)) - -// -// _Deref_ret_ --- -// - -#define _Deref_ret_z_ _SAL1_1_Source_(_Deref_ret_z_, (), _Deref_ret1_impl_(__notnull_impl_notref) _Deref_ret1_impl_(__zterm_impl)) -#define _Deref_ret_opt_z_ _SAL1_1_Source_(_Deref_ret_opt_z_, (), _Deref_ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__zterm_impl)) - -// -// special _Deref_ --- -// -#define _Deref2_pre_readonly_ _SAL1_1_Source_(_Deref2_pre_readonly_, (), _Deref2_pre1_impl_(__readaccess_impl_notref)) - -// -// _Ret_ --- -// - -// e.g. _Ret_opt_valid_ LPSTR void* CloneSTR( _Pre_valid_ LPSTR src ); -#define _Ret_opt_valid_ _SAL1_1_Source_(_Ret_opt_valid_, (), _Ret1_impl_(__maybenull_impl_notref) _Ret_valid_impl_) -#define _Ret_opt_z_ _SAL1_1_Source_(_Ret_opt_z_, (), _Ret2_impl_(__maybenull_impl,__zterm_impl) _Ret_valid_impl_) - -// e.g. _Ret_opt_bytecap_(cb) void* AllocateMemory( size_t cb ); -// Buffer capacity is described by another parameter -#define _Ret_cap_(size) _SAL1_1_Source_(_Ret_cap_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__cap_impl(size))) -#define _Ret_opt_cap_(size) _SAL1_1_Source_(_Ret_opt_cap_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__cap_impl(size))) -#define _Ret_bytecap_(size) _SAL1_1_Source_(_Ret_bytecap_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__bytecap_impl(size))) -#define _Ret_opt_bytecap_(size) _SAL1_1_Source_(_Ret_opt_bytecap_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__bytecap_impl(size))) - -// Buffer capacity is described by a constant expression -#define _Ret_cap_c_(size) _SAL1_1_Source_(_Ret_cap_c_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__cap_c_impl(size))) -#define _Ret_opt_cap_c_(size) _SAL1_1_Source_(_Ret_opt_cap_c_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__cap_c_impl(size))) -#define _Ret_bytecap_c_(size) _SAL1_1_Source_(_Ret_bytecap_c_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__bytecap_c_impl(size))) -#define _Ret_opt_bytecap_c_(size) _SAL1_1_Source_(_Ret_opt_bytecap_c_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__bytecap_c_impl(size))) - -// Buffer capacity is described by a complex condition -#define _Ret_cap_x_(size) _SAL1_1_Source_(_Ret_cap_x_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__cap_x_impl(size))) -#define _Ret_opt_cap_x_(size) _SAL1_1_Source_(_Ret_opt_cap_x_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__cap_x_impl(size))) -#define _Ret_bytecap_x_(size) _SAL1_1_Source_(_Ret_bytecap_x_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__bytecap_x_impl(size))) -#define _Ret_opt_bytecap_x_(size) _SAL1_1_Source_(_Ret_opt_bytecap_x_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__bytecap_x_impl(size))) - -// return value is nullterminated and capacity is given by another parameter -#define _Ret_z_cap_(size) 
_SAL1_1_Source_(_Ret_z_cap_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret2_impl_(__zterm_impl,__cap_impl(size)) _Ret_valid_impl_) -#define _Ret_opt_z_cap_(size) _SAL1_1_Source_(_Ret_opt_z_cap_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret2_impl_(__zterm_impl,__cap_impl(size)) _Ret_valid_impl_) -#define _Ret_z_bytecap_(size) _SAL1_1_Source_(_Ret_z_bytecap_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret2_impl_(__zterm_impl,__bytecap_impl(size)) _Ret_valid_impl_) -#define _Ret_opt_z_bytecap_(size) _SAL1_1_Source_(_Ret_opt_z_bytecap_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret2_impl_(__zterm_impl,__bytecap_impl(size)) _Ret_valid_impl_) - -// e.g. _Ret_opt_bytecount_(cb) void* AllocateZeroInitializedMemory( size_t cb ); -// Valid Buffer extent is described by another parameter -#define _Ret_count_(size) _SAL1_1_Source_(_Ret_count_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__count_impl(size)) _Ret_valid_impl_) -#define _Ret_opt_count_(size) _SAL1_1_Source_(_Ret_opt_count_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__count_impl(size)) _Ret_valid_impl_) -#define _Ret_bytecount_(size) _SAL1_1_Source_(_Ret_bytecount_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__bytecount_impl(size)) _Ret_valid_impl_) -#define _Ret_opt_bytecount_(size) _SAL1_1_Source_(_Ret_opt_bytecount_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__bytecount_impl(size)) _Ret_valid_impl_) - -// Valid Buffer extent is described by a constant expression -#define _Ret_count_c_(size) _SAL1_1_Source_(_Ret_count_c_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__count_c_impl(size)) _Ret_valid_impl_) -#define _Ret_opt_count_c_(size) _SAL1_1_Source_(_Ret_opt_count_c_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__count_c_impl(size)) _Ret_valid_impl_) -#define _Ret_bytecount_c_(size) _SAL1_1_Source_(_Ret_bytecount_c_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__bytecount_c_impl(size)) _Ret_valid_impl_) -#define _Ret_opt_bytecount_c_(size) _SAL1_1_Source_(_Ret_opt_bytecount_c_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__bytecount_c_impl(size)) _Ret_valid_impl_) - -// Valid Buffer extent is described by a complex expression -#define _Ret_count_x_(size) _SAL1_1_Source_(_Ret_count_x_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__count_x_impl(size)) _Ret_valid_impl_) -#define _Ret_opt_count_x_(size) _SAL1_1_Source_(_Ret_opt_count_x_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__count_x_impl(size)) _Ret_valid_impl_) -#define _Ret_bytecount_x_(size) _SAL1_1_Source_(_Ret_bytecount_x_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__bytecount_x_impl(size)) _Ret_valid_impl_) -#define _Ret_opt_bytecount_x_(size) _SAL1_1_Source_(_Ret_opt_bytecount_x_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__bytecount_x_impl(size)) _Ret_valid_impl_) - -// return value is nullterminated and length is given by another parameter -#define _Ret_z_count_(size) _SAL1_1_Source_(_Ret_z_count_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret2_impl_(__zterm_impl,__count_impl(size)) _Ret_valid_impl_) -#define _Ret_opt_z_count_(size) _SAL1_1_Source_(_Ret_opt_z_count_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret2_impl_(__zterm_impl,__count_impl(size)) _Ret_valid_impl_) -#define _Ret_z_bytecount_(size) _SAL1_1_Source_(_Ret_z_bytecount_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret2_impl_(__zterm_impl,__bytecount_impl(size)) _Ret_valid_impl_) -#define _Ret_opt_z_bytecount_(size) 
_SAL1_1_Source_(_Ret_opt_z_bytecount_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret2_impl_(__zterm_impl,__bytecount_impl(size)) _Ret_valid_impl_) - - -// _Pre_ annotations --- -#define _Pre_opt_z_ _SAL1_1_Source_(_Pre_opt_z_, (), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__zterm_impl) _Pre_valid_impl_) - -// restrict access rights -#define _Pre_readonly_ _SAL1_1_Source_(_Pre_readonly_, (), _Pre1_impl_(__readaccess_impl_notref)) -#define _Pre_writeonly_ _SAL1_1_Source_(_Pre_writeonly_, (), _Pre1_impl_(__writeaccess_impl_notref)) - -// e.g. void FreeMemory( _Pre_bytecap_(cb) _Post_ptr_invalid_ void* pv, size_t cb ); -// buffer capacity described by another parameter -#define _Pre_cap_(size) _SAL1_1_Source_(_Pre_cap_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_impl(size))) -#define _Pre_opt_cap_(size) _SAL1_1_Source_(_Pre_opt_cap_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_impl(size))) -#define _Pre_bytecap_(size) _SAL1_1_Source_(_Pre_bytecap_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__bytecap_impl(size))) -#define _Pre_opt_bytecap_(size) _SAL1_1_Source_(_Pre_opt_bytecap_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__bytecap_impl(size))) - -// buffer capacity described by a constant expression -#define _Pre_cap_c_(size) _SAL1_1_Source_(_Pre_cap_c_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_c_impl(size))) -#define _Pre_opt_cap_c_(size) _SAL1_1_Source_(_Pre_opt_cap_c_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_c_impl(size))) -#define _Pre_bytecap_c_(size) _SAL1_1_Source_(_Pre_bytecap_c_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__bytecap_c_impl(size))) -#define _Pre_opt_bytecap_c_(size) _SAL1_1_Source_(_Pre_opt_bytecap_c_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__bytecap_c_impl(size))) -#define _Pre_cap_c_one_ _SAL1_1_Source_(_Pre_cap_c_one_, (), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_c_one_notref_impl)) -#define _Pre_opt_cap_c_one_ _SAL1_1_Source_(_Pre_opt_cap_c_one_, (), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_c_one_notref_impl)) - -// buffer capacity is described by another parameter multiplied by a constant expression -#define _Pre_cap_m_(mult,size) _SAL1_1_Source_(_Pre_cap_m_, (mult,size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__mult_impl(mult,size))) -#define _Pre_opt_cap_m_(mult,size) _SAL1_1_Source_(_Pre_opt_cap_m_, (mult,size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__mult_impl(mult,size))) - -// buffer capacity described by size of other buffer, only used by dangerous legacy APIs -// e.g. 
int strcpy(_Pre_cap_for_(src) char* dst, const char* src); -#define _Pre_cap_for_(param) _SAL1_1_Source_(_Pre_cap_for_, (param), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_for_impl(param))) -#define _Pre_opt_cap_for_(param) _SAL1_1_Source_(_Pre_opt_cap_for_, (param), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_for_impl(param))) - -// buffer capacity described by a complex condition -#define _Pre_cap_x_(size) _SAL1_1_Source_(_Pre_cap_x_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_x_impl(size))) -#define _Pre_opt_cap_x_(size) _SAL1_1_Source_(_Pre_opt_cap_x_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_x_impl(size))) -#define _Pre_bytecap_x_(size) _SAL1_1_Source_(_Pre_bytecap_x_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__bytecap_x_impl(size))) -#define _Pre_opt_bytecap_x_(size) _SAL1_1_Source_(_Pre_opt_bytecap_x_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__bytecap_x_impl(size))) - -// buffer capacity described by the difference to another pointer parameter -#define _Pre_ptrdiff_cap_(ptr) _SAL1_1_Source_(_Pre_ptrdiff_cap_, (ptr), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_x_impl(__ptrdiff(ptr)))) -#define _Pre_opt_ptrdiff_cap_(ptr) _SAL1_1_Source_(_Pre_opt_ptrdiff_cap_, (ptr), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_x_impl(__ptrdiff(ptr)))) - -// e.g. void AppendStr( _Pre_z_ const char* szFrom, _Pre_z_cap_(cchTo) _Post_z_ char* szTo, size_t cchTo ); -#define _Pre_z_cap_(size) _SAL1_1_Source_(_Pre_z_cap_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre2_impl_(__zterm_impl,__cap_impl(size)) _Pre_valid_impl_) -#define _Pre_opt_z_cap_(size) _SAL1_1_Source_(_Pre_opt_z_cap_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre2_impl_(__zterm_impl,__cap_impl(size)) _Pre_valid_impl_) -#define _Pre_z_bytecap_(size) _SAL1_1_Source_(_Pre_z_bytecap_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre2_impl_(__zterm_impl,__bytecap_impl(size)) _Pre_valid_impl_) -#define _Pre_opt_z_bytecap_(size) _SAL1_1_Source_(_Pre_opt_z_bytecap_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre2_impl_(__zterm_impl,__bytecap_impl(size)) _Pre_valid_impl_) - -#define _Pre_z_cap_c_(size) _SAL1_1_Source_(_Pre_z_cap_c_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre2_impl_(__zterm_impl,__cap_c_impl(size)) _Pre_valid_impl_) -#define _Pre_opt_z_cap_c_(size) _SAL1_1_Source_(_Pre_opt_z_cap_c_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre2_impl_(__zterm_impl,__cap_c_impl(size)) _Pre_valid_impl_) -#define _Pre_z_bytecap_c_(size) _SAL1_1_Source_(_Pre_z_bytecap_c_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre2_impl_(__zterm_impl,__bytecap_c_impl(size)) _Pre_valid_impl_) -#define _Pre_opt_z_bytecap_c_(size) _SAL1_1_Source_(_Pre_opt_z_bytecap_c_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre2_impl_(__zterm_impl,__bytecap_c_impl(size)) _Pre_valid_impl_) - -#define _Pre_z_cap_x_(size) _SAL1_1_Source_(_Pre_z_cap_x_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre2_impl_(__zterm_impl,__cap_x_impl(size)) _Pre_valid_impl_) -#define _Pre_opt_z_cap_x_(size) _SAL1_1_Source_(_Pre_opt_z_cap_x_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre2_impl_(__zterm_impl,__cap_x_impl(size)) _Pre_valid_impl_) -#define _Pre_z_bytecap_x_(size) _SAL1_1_Source_(_Pre_z_bytecap_x_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre2_impl_(__zterm_impl,__bytecap_x_impl(size)) _Pre_valid_impl_) -#define _Pre_opt_z_bytecap_x_(size) _SAL1_1_Source_(_Pre_opt_z_bytecap_x_, (size), _Pre1_impl_(__maybenull_impl_notref) 
_Pre2_impl_(__zterm_impl,__bytecap_x_impl(size)) _Pre_valid_impl_) - -// known capacity and valid but unknown readable extent -#define _Pre_valid_cap_(size) _SAL1_1_Source_(_Pre_valid_cap_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_impl(size)) _Pre_valid_impl_) -#define _Pre_opt_valid_cap_(size) _SAL1_1_Source_(_Pre_opt_valid_cap_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_impl(size)) _Pre_valid_impl_) -#define _Pre_valid_bytecap_(size) _SAL1_1_Source_(_Pre_valid_bytecap_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__bytecap_impl(size)) _Pre_valid_impl_) -#define _Pre_opt_valid_bytecap_(size) _SAL1_1_Source_(_Pre_opt_valid_bytecap_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__bytecap_impl(size)) _Pre_valid_impl_) - -#define _Pre_valid_cap_c_(size) _SAL1_1_Source_(_Pre_valid_cap_c_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_c_impl(size)) _Pre_valid_impl_) -#define _Pre_opt_valid_cap_c_(size) _SAL1_1_Source_(_Pre_opt_valid_cap_c_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_c_impl(size)) _Pre_valid_impl_) -#define _Pre_valid_bytecap_c_(size) _SAL1_1_Source_(_Pre_valid_bytecap_c_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__bytecap_c_impl(size)) _Pre_valid_impl_) -#define _Pre_opt_valid_bytecap_c_(size) _SAL1_1_Source_(_Pre_opt_valid_bytecap_c_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__bytecap_c_impl(size)) _Pre_valid_impl_) - -#define _Pre_valid_cap_x_(size) _SAL1_1_Source_(_Pre_valid_cap_x_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_x_impl(size)) _Pre_valid_impl_) -#define _Pre_opt_valid_cap_x_(size) _SAL1_1_Source_(_Pre_opt_valid_cap_x_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_x_impl(size)) _Pre_valid_impl_) -#define _Pre_valid_bytecap_x_(size) _SAL1_1_Source_(_Pre_valid_bytecap_x_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__bytecap_x_impl(size)) _Pre_valid_impl_) -#define _Pre_opt_valid_bytecap_x_(size) _SAL1_1_Source_(_Pre_opt_valid_bytecap_x_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__bytecap_x_impl(size)) _Pre_valid_impl_) - -// e.g. 
void AppendCharRange( _Pre_count_(cchFrom) const char* rgFrom, size_t cchFrom, _Out_z_cap_(cchTo) char* szTo, size_t cchTo ); -// Valid buffer extent described by another parameter -#define _Pre_count_(size) _SAL1_1_Source_(_Pre_count_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__count_impl(size)) _Pre_valid_impl_) -#define _Pre_opt_count_(size) _SAL1_1_Source_(_Pre_opt_count_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__count_impl(size)) _Pre_valid_impl_) -#define _Pre_bytecount_(size) _SAL1_1_Source_(_Pre_bytecount_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__bytecount_impl(size)) _Pre_valid_impl_) -#define _Pre_opt_bytecount_(size) _SAL1_1_Source_(_Pre_opt_bytecount_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__bytecount_impl(size)) _Pre_valid_impl_) - -// Valid buffer extent described by a constant expression -#define _Pre_count_c_(size) _SAL1_1_Source_(_Pre_count_c_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__count_c_impl(size)) _Pre_valid_impl_) -#define _Pre_opt_count_c_(size) _SAL1_1_Source_(_Pre_opt_count_c_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__count_c_impl(size)) _Pre_valid_impl_) -#define _Pre_bytecount_c_(size) _SAL1_1_Source_(_Pre_bytecount_c_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__bytecount_c_impl(size)) _Pre_valid_impl_) -#define _Pre_opt_bytecount_c_(size) _SAL1_1_Source_(_Pre_opt_bytecount_c_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__bytecount_c_impl(size)) _Pre_valid_impl_) - -// Valid buffer extent described by a complex expression -#define _Pre_count_x_(size) _SAL1_1_Source_(_Pre_count_x_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__count_x_impl(size)) _Pre_valid_impl_) -#define _Pre_opt_count_x_(size) _SAL1_1_Source_(_Pre_opt_count_x_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__count_x_impl(size)) _Pre_valid_impl_) -#define _Pre_bytecount_x_(size) _SAL1_1_Source_(_Pre_bytecount_x_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__bytecount_x_impl(size)) _Pre_valid_impl_) -#define _Pre_opt_bytecount_x_(size) _SAL1_1_Source_(_Pre_opt_bytecount_x_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__bytecount_x_impl(size)) _Pre_valid_impl_) - -// Valid buffer extent described by the difference to another pointer parameter -#define _Pre_ptrdiff_count_(ptr) _SAL1_1_Source_(_Pre_ptrdiff_count_, (ptr), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__count_x_impl(__ptrdiff(ptr))) _Pre_valid_impl_) -#define _Pre_opt_ptrdiff_count_(ptr) _SAL1_1_Source_(_Pre_opt_ptrdiff_count_, (ptr), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__count_x_impl(__ptrdiff(ptr))) _Pre_valid_impl_) - - -// char * strncpy(_Out_cap_(_Count) _Post_maybez_ char * _Dest, _In_z_ const char * _Source, _In_ size_t _Count) -// buffer maybe zero-terminated after the call -#define _Post_maybez_ _SAL1_1_Source_(_Post_maybez_, (), _Post1_impl_(__maybezterm_impl)) - -// e.g. SIZE_T HeapSize( _In_ HANDLE hHeap, DWORD dwFlags, _Pre_notnull_ _Post_bytecap_(return) LPCVOID lpMem ); -#define _Post_cap_(size) _SAL1_1_Source_(_Post_cap_, (size), _Post1_impl_(__cap_impl(size))) -#define _Post_bytecap_(size) _SAL1_1_Source_(_Post_bytecap_, (size), _Post1_impl_(__bytecap_impl(size))) - -// e.g. 
int strlen( _In_z_ _Post_count_(return+1) const char* sz ); -#define _Post_count_(size) _SAL1_1_Source_(_Post_count_, (size), _Post1_impl_(__count_impl(size)) _Post_valid_impl_) -#define _Post_bytecount_(size) _SAL1_1_Source_(_Post_bytecount_, (size), _Post1_impl_(__bytecount_impl(size)) _Post_valid_impl_) -#define _Post_count_c_(size) _SAL1_1_Source_(_Post_count_c_, (size), _Post1_impl_(__count_c_impl(size)) _Post_valid_impl_) -#define _Post_bytecount_c_(size) _SAL1_1_Source_(_Post_bytecount_c_, (size), _Post1_impl_(__bytecount_c_impl(size)) _Post_valid_impl_) -#define _Post_count_x_(size) _SAL1_1_Source_(_Post_count_x_, (size), _Post1_impl_(__count_x_impl(size)) _Post_valid_impl_) -#define _Post_bytecount_x_(size) _SAL1_1_Source_(_Post_bytecount_x_, (size), _Post1_impl_(__bytecount_x_impl(size)) _Post_valid_impl_) - -// e.g. size_t CopyStr( _In_z_ const char* szFrom, _Pre_cap_(cch) _Post_z_count_(return+1) char* szFrom, size_t cchFrom ); -#define _Post_z_count_(size) _SAL1_1_Source_(_Post_z_count_, (size), _Post2_impl_(__zterm_impl,__count_impl(size)) _Post_valid_impl_) -#define _Post_z_bytecount_(size) _SAL1_1_Source_(_Post_z_bytecount_, (size), _Post2_impl_(__zterm_impl,__bytecount_impl(size)) _Post_valid_impl_) -#define _Post_z_count_c_(size) _SAL1_1_Source_(_Post_z_count_c_, (size), _Post2_impl_(__zterm_impl,__count_c_impl(size)) _Post_valid_impl_) -#define _Post_z_bytecount_c_(size) _SAL1_1_Source_(_Post_z_bytecount_c_, (size), _Post2_impl_(__zterm_impl,__bytecount_c_impl(size)) _Post_valid_impl_) -#define _Post_z_count_x_(size) _SAL1_1_Source_(_Post_z_count_x_, (size), _Post2_impl_(__zterm_impl,__count_x_impl(size)) _Post_valid_impl_) -#define _Post_z_bytecount_x_(size) _SAL1_1_Source_(_Post_z_bytecount_x_, (size), _Post2_impl_(__zterm_impl,__bytecount_x_impl(size)) _Post_valid_impl_) - -// -// _Prepost_ --- -// -// describing conditions that hold before and after the function call - -#define _Prepost_opt_z_ _SAL1_1_Source_(_Prepost_opt_z_, (), _Pre_opt_z_ _Post_z_) - -#define _Prepost_count_(size) _SAL1_1_Source_(_Prepost_count_, (size), _Pre_count_(size) _Post_count_(size)) -#define _Prepost_opt_count_(size) _SAL1_1_Source_(_Prepost_opt_count_, (size), _Pre_opt_count_(size) _Post_count_(size)) -#define _Prepost_bytecount_(size) _SAL1_1_Source_(_Prepost_bytecount_, (size), _Pre_bytecount_(size) _Post_bytecount_(size)) -#define _Prepost_opt_bytecount_(size) _SAL1_1_Source_(_Prepost_opt_bytecount_, (size), _Pre_opt_bytecount_(size) _Post_bytecount_(size)) -#define _Prepost_count_c_(size) _SAL1_1_Source_(_Prepost_count_c_, (size), _Pre_count_c_(size) _Post_count_c_(size)) -#define _Prepost_opt_count_c_(size) _SAL1_1_Source_(_Prepost_opt_count_c_, (size), _Pre_opt_count_c_(size) _Post_count_c_(size)) -#define _Prepost_bytecount_c_(size) _SAL1_1_Source_(_Prepost_bytecount_c_, (size), _Pre_bytecount_c_(size) _Post_bytecount_c_(size)) -#define _Prepost_opt_bytecount_c_(size) _SAL1_1_Source_(_Prepost_opt_bytecount_c_, (size), _Pre_opt_bytecount_c_(size) _Post_bytecount_c_(size)) -#define _Prepost_count_x_(size) _SAL1_1_Source_(_Prepost_count_x_, (size), _Pre_count_x_(size) _Post_count_x_(size)) -#define _Prepost_opt_count_x_(size) _SAL1_1_Source_(_Prepost_opt_count_x_, (size), _Pre_opt_count_x_(size) _Post_count_x_(size)) -#define _Prepost_bytecount_x_(size) _SAL1_1_Source_(_Prepost_bytecount_x_, (size), _Pre_bytecount_x_(size) _Post_bytecount_x_(size)) -#define _Prepost_opt_bytecount_x_(size) _SAL1_1_Source_(_Prepost_opt_bytecount_x_, (size), _Pre_opt_bytecount_x_(size) 
_Post_bytecount_x_(size)) - -#define _Prepost_valid_ _SAL1_1_Source_(_Prepost_valid_, (), _Pre_valid_ _Post_valid_) -#define _Prepost_opt_valid_ _SAL1_1_Source_(_Prepost_opt_valid_, (), _Pre_opt_valid_ _Post_valid_) - -// -// _Deref_ --- -// -// short version for _Deref_pre_ _Deref_post_ -// describing conditions for array elements or dereferenced pointer parameters that hold before and after the call - -#define _Deref_prepost_z_ _SAL1_1_Source_(_Deref_prepost_z_, (), _Deref_pre_z_ _Deref_post_z_) -#define _Deref_prepost_opt_z_ _SAL1_1_Source_(_Deref_prepost_opt_z_, (), _Deref_pre_opt_z_ _Deref_post_opt_z_) - -#define _Deref_prepost_cap_(size) _SAL1_1_Source_(_Deref_prepost_cap_, (size), _Deref_pre_cap_(size) _Deref_post_cap_(size)) -#define _Deref_prepost_opt_cap_(size) _SAL1_1_Source_(_Deref_prepost_opt_cap_, (size), _Deref_pre_opt_cap_(size) _Deref_post_opt_cap_(size)) -#define _Deref_prepost_bytecap_(size) _SAL1_1_Source_(_Deref_prepost_bytecap_, (size), _Deref_pre_bytecap_(size) _Deref_post_bytecap_(size)) -#define _Deref_prepost_opt_bytecap_(size) _SAL1_1_Source_(_Deref_prepost_opt_bytecap_, (size), _Deref_pre_opt_bytecap_(size) _Deref_post_opt_bytecap_(size)) - -#define _Deref_prepost_cap_x_(size) _SAL1_1_Source_(_Deref_prepost_cap_x_, (size), _Deref_pre_cap_x_(size) _Deref_post_cap_x_(size)) -#define _Deref_prepost_opt_cap_x_(size) _SAL1_1_Source_(_Deref_prepost_opt_cap_x_, (size), _Deref_pre_opt_cap_x_(size) _Deref_post_opt_cap_x_(size)) -#define _Deref_prepost_bytecap_x_(size) _SAL1_1_Source_(_Deref_prepost_bytecap_x_, (size), _Deref_pre_bytecap_x_(size) _Deref_post_bytecap_x_(size)) -#define _Deref_prepost_opt_bytecap_x_(size) _SAL1_1_Source_(_Deref_prepost_opt_bytecap_x_, (size), _Deref_pre_opt_bytecap_x_(size) _Deref_post_opt_bytecap_x_(size)) - -#define _Deref_prepost_z_cap_(size) _SAL1_1_Source_(_Deref_prepost_z_cap_, (size), _Deref_pre_z_cap_(size) _Deref_post_z_cap_(size)) -#define _Deref_prepost_opt_z_cap_(size) _SAL1_1_Source_(_Deref_prepost_opt_z_cap_, (size), _Deref_pre_opt_z_cap_(size) _Deref_post_opt_z_cap_(size)) -#define _Deref_prepost_z_bytecap_(size) _SAL1_1_Source_(_Deref_prepost_z_bytecap_, (size), _Deref_pre_z_bytecap_(size) _Deref_post_z_bytecap_(size)) -#define _Deref_prepost_opt_z_bytecap_(size) _SAL1_1_Source_(_Deref_prepost_opt_z_bytecap_, (size), _Deref_pre_opt_z_bytecap_(size) _Deref_post_opt_z_bytecap_(size)) - -#define _Deref_prepost_valid_cap_(size) _SAL1_1_Source_(_Deref_prepost_valid_cap_, (size), _Deref_pre_valid_cap_(size) _Deref_post_valid_cap_(size)) -#define _Deref_prepost_opt_valid_cap_(size) _SAL1_1_Source_(_Deref_prepost_opt_valid_cap_, (size), _Deref_pre_opt_valid_cap_(size) _Deref_post_opt_valid_cap_(size)) -#define _Deref_prepost_valid_bytecap_(size) _SAL1_1_Source_(_Deref_prepost_valid_bytecap_, (size), _Deref_pre_valid_bytecap_(size) _Deref_post_valid_bytecap_(size)) -#define _Deref_prepost_opt_valid_bytecap_(size) _SAL1_1_Source_(_Deref_prepost_opt_valid_bytecap_, (size), _Deref_pre_opt_valid_bytecap_(size) _Deref_post_opt_valid_bytecap_(size)) - -#define _Deref_prepost_valid_cap_x_(size) _SAL1_1_Source_(_Deref_prepost_valid_cap_x_, (size), _Deref_pre_valid_cap_x_(size) _Deref_post_valid_cap_x_(size)) -#define _Deref_prepost_opt_valid_cap_x_(size) _SAL1_1_Source_(_Deref_prepost_opt_valid_cap_x_, (size), _Deref_pre_opt_valid_cap_x_(size) _Deref_post_opt_valid_cap_x_(size)) -#define _Deref_prepost_valid_bytecap_x_(size) _SAL1_1_Source_(_Deref_prepost_valid_bytecap_x_, (size), _Deref_pre_valid_bytecap_x_(size) 
_Deref_post_valid_bytecap_x_(size)) -#define _Deref_prepost_opt_valid_bytecap_x_(size) _SAL1_1_Source_(_Deref_prepost_opt_valid_bytecap_x_, (size), _Deref_pre_opt_valid_bytecap_x_(size) _Deref_post_opt_valid_bytecap_x_(size)) - -#define _Deref_prepost_count_(size) _SAL1_1_Source_(_Deref_prepost_count_, (size), _Deref_pre_count_(size) _Deref_post_count_(size)) -#define _Deref_prepost_opt_count_(size) _SAL1_1_Source_(_Deref_prepost_opt_count_, (size), _Deref_pre_opt_count_(size) _Deref_post_opt_count_(size)) -#define _Deref_prepost_bytecount_(size) _SAL1_1_Source_(_Deref_prepost_bytecount_, (size), _Deref_pre_bytecount_(size) _Deref_post_bytecount_(size)) -#define _Deref_prepost_opt_bytecount_(size) _SAL1_1_Source_(_Deref_prepost_opt_bytecount_, (size), _Deref_pre_opt_bytecount_(size) _Deref_post_opt_bytecount_(size)) - -#define _Deref_prepost_count_x_(size) _SAL1_1_Source_(_Deref_prepost_count_x_, (size), _Deref_pre_count_x_(size) _Deref_post_count_x_(size)) -#define _Deref_prepost_opt_count_x_(size) _SAL1_1_Source_(_Deref_prepost_opt_count_x_, (size), _Deref_pre_opt_count_x_(size) _Deref_post_opt_count_x_(size)) -#define _Deref_prepost_bytecount_x_(size) _SAL1_1_Source_(_Deref_prepost_bytecount_x_, (size), _Deref_pre_bytecount_x_(size) _Deref_post_bytecount_x_(size)) -#define _Deref_prepost_opt_bytecount_x_(size) _SAL1_1_Source_(_Deref_prepost_opt_bytecount_x_, (size), _Deref_pre_opt_bytecount_x_(size) _Deref_post_opt_bytecount_x_(size)) - -#define _Deref_prepost_valid_ _SAL1_1_Source_(_Deref_prepost_valid_, (), _Deref_pre_valid_ _Deref_post_valid_) -#define _Deref_prepost_opt_valid_ _SAL1_1_Source_(_Deref_prepost_opt_valid_, (), _Deref_pre_opt_valid_ _Deref_post_opt_valid_) - -// -// _Deref_ -// -// used with references to arrays - -#define _Deref_out_z_cap_c_(size) _SAL1_1_Source_(_Deref_out_z_cap_c_, (size), _Deref_pre_cap_c_(size) _Deref_post_z_) -#define _Deref_inout_z_cap_c_(size) _SAL1_1_Source_(_Deref_inout_z_cap_c_, (size), _Deref_pre_z_cap_c_(size) _Deref_post_z_) -#define _Deref_out_z_bytecap_c_(size) _SAL1_1_Source_(_Deref_out_z_bytecap_c_, (size), _Deref_pre_bytecap_c_(size) _Deref_post_z_) -#define _Deref_inout_z_bytecap_c_(size) _SAL1_1_Source_(_Deref_inout_z_bytecap_c_, (size), _Deref_pre_z_bytecap_c_(size) _Deref_post_z_) -#define _Deref_inout_z_ _SAL1_1_Source_(_Deref_inout_z_, (), _Deref_prepost_z_) - -// #pragma endregion Input Buffer SAL 1 compatibility macros - - -//============================================================================ -// Implementation Layer: -//============================================================================ - - -// Naming conventions: -// A symbol the begins with _SA_ is for the machinery of creating any -// annotations; many of those come from sourceannotations.h in the case -// of attributes. - -// A symbol that ends with _impl is the very lowest level macro. It is -// not required to be a legal standalone annotation, and in the case -// of attribute annotations, usually is not. (In the case of some declspec -// annotations, it might be, but it should not be assumed so.) Those -// symols will be used in the _PreN..., _PostN... and _RetN... annotations -// to build up more complete annotations. - -// A symbol ending in _impl_ is reserved to the implementation as well, -// but it does form a complete annotation; usually they are used to build -// up even higher level annotations. 
- - -#if _USE_ATTRIBUTES_FOR_SAL || _USE_DECLSPECS_FOR_SAL // [ -// Sharable "_impl" macros: these can be shared between the various annotation -// forms but are part of the implementation of the macros. These are collected -// here to assure that only necessary differences in the annotations -// exist. - -#define _Always_impl_(annos) _Group_(annos _SAL_nop_impl_) _On_failure_impl_(annos _SAL_nop_impl_) -#define _Bound_impl_ _SA_annotes0(SAL_bound) -#define _Field_range_impl_(min,max) _Range_impl_(min,max) -#define _Literal_impl_ _SA_annotes1(SAL_constant, __yes) -#define _Maybenull_impl_ _SA_annotes1(SAL_null, __maybe) -#define _Maybevalid_impl_ _SA_annotes1(SAL_valid, __maybe) -#define _Must_inspect_impl_ _Post_impl_ _SA_annotes0(SAL_mustInspect) -#define _Notliteral_impl_ _SA_annotes1(SAL_constant, __no) -#define _Notnull_impl_ _SA_annotes1(SAL_null, __no) -#define _Notvalid_impl_ _SA_annotes1(SAL_valid, __no) -#define _NullNull_terminated_impl_ _Group_(_SA_annotes1(SAL_nullTerminated, __yes) _SA_annotes1(SAL_readableTo,inexpressibleCount("NullNull terminated string"))) -#define _Null_impl_ _SA_annotes1(SAL_null, __yes) -#define _Null_terminated_impl_ _SA_annotes1(SAL_nullTerminated, __yes) -#define _Out_impl_ _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_c_one_notref_impl) _Post_valid_impl_ -#define _Out_opt_impl_ _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_c_one_notref_impl) _Post_valid_impl_ -#define _Points_to_data_impl_ _At_(*_Curr_, _SA_annotes1(SAL_mayBePointer, __no)) -#define _Post_satisfies_impl_(cond) _Post_impl_ _Satisfies_impl_(cond) -#define _Post_valid_impl_ _Post1_impl_(__valid_impl) -#define _Pre_satisfies_impl_(cond) _Pre_impl_ _Satisfies_impl_(cond) -#define _Pre_valid_impl_ _Pre1_impl_(__valid_impl) -#define _Range_impl_(min,max) _SA_annotes2(SAL_range, min, max) -#define _Readable_bytes_impl_(size) _SA_annotes1(SAL_readableTo, byteCount(size)) -#define _Readable_elements_impl_(size) _SA_annotes1(SAL_readableTo, elementCount(size)) -#define _Ret_valid_impl_ _Ret1_impl_(__valid_impl) -#define _Satisfies_impl_(cond) _SA_annotes1(SAL_satisfies, cond) -#define _Valid_impl_ _SA_annotes1(SAL_valid, __yes) -#define _Writable_bytes_impl_(size) _SA_annotes1(SAL_writableTo, byteCount(size)) -#define _Writable_elements_impl_(size) _SA_annotes1(SAL_writableTo, elementCount(size)) - -#define _In_range_impl_(min,max) _Pre_impl_ _Range_impl_(min,max) -#define _Out_range_impl_(min,max) _Post_impl_ _Range_impl_(min,max) -#define _Ret_range_impl_(min,max) _Post_impl_ _Range_impl_(min,max) -#define _Deref_in_range_impl_(min,max) _Deref_pre_impl_ _Range_impl_(min,max) -#define _Deref_out_range_impl_(min,max) _Deref_post_impl_ _Range_impl_(min,max) -#define _Deref_ret_range_impl_(min,max) _Deref_post_impl_ _Range_impl_(min,max) - -#define _Deref_pre_impl_ _Pre_impl_ _Notref_impl_ _Deref_impl_ -#define _Deref_post_impl_ _Post_impl_ _Notref_impl_ _Deref_impl_ - -// The following are for the implementation machinery, and are not -// suitable for annotating general code. -// We're tying to phase this out, someday. The parser quotes the param. -#define __AuToQuOtE _SA_annotes0(SAL_AuToQuOtE) - -// Normally the parser does some simple type checking of annotation params, -// defer that check to the plugin. 
-#define __deferTypecheck _SA_annotes0(SAL_deferTypecheck) - -#define _SA_SPECSTRIZE( x ) #x -#define _SAL_nop_impl_ /* nothing */ -#define __nop_impl(x) x -#endif - - -#if _USE_ATTRIBUTES_FOR_SAL // [ - -// Using attributes for sal - -#include "codeanalysis\sourceannotations.h" - - -#define _SA_annotes0(n) [SAL_annotes(Name=#n)] -#define _SA_annotes1(n,pp1) [SAL_annotes(Name=#n, p1=_SA_SPECSTRIZE(pp1))] -#define _SA_annotes2(n,pp1,pp2) [SAL_annotes(Name=#n, p1=_SA_SPECSTRIZE(pp1), p2=_SA_SPECSTRIZE(pp2))] -#define _SA_annotes3(n,pp1,pp2,pp3) [SAL_annotes(Name=#n, p1=_SA_SPECSTRIZE(pp1), p2=_SA_SPECSTRIZE(pp2), p3=_SA_SPECSTRIZE(pp3))] - -#define _Pre_impl_ [SAL_pre] -#define _Post_impl_ [SAL_post] -#define _Deref_impl_ [SAL_deref] -#define _Notref_impl_ [SAL_notref] - - -// Declare a function to be an annotation or primop (respectively). -// Done this way so that they don't appear in the regular compiler's -// namespace. -#define __ANNOTATION(fun) _SA_annotes0(SAL_annotation) void __SA_##fun; -#define __PRIMOP(type, fun) _SA_annotes0(SAL_primop) type __SA_##fun; -#define __QUALIFIER(fun) _SA_annotes0(SAL_qualifier) void __SA_##fun; - -// Benign declspec needed here for WindowsPREfast -#define __In_impl_ [SA_Pre(Valid=SA_Yes)] [SA_Pre(Deref=1, Notref=1, Access=SA_Read)] __declspec("SAL_pre SAL_valid") - -#elif _USE_DECLSPECS_FOR_SAL // ][ - -// Using declspecs for sal - -#define _SA_annotes0(n) __declspec(#n) -#define _SA_annotes1(n,pp1) __declspec(#n "(" _SA_SPECSTRIZE(pp1) ")" ) -#define _SA_annotes2(n,pp1,pp2) __declspec(#n "(" _SA_SPECSTRIZE(pp1) "," _SA_SPECSTRIZE(pp2) ")") -#define _SA_annotes3(n,pp1,pp2,pp3) __declspec(#n "(" _SA_SPECSTRIZE(pp1) "," _SA_SPECSTRIZE(pp2) "," _SA_SPECSTRIZE(pp3) ")") - -#define _Pre_impl_ _SA_annotes0(SAL_pre) -#define _Post_impl_ _SA_annotes0(SAL_post) -#define _Deref_impl_ _SA_annotes0(SAL_deref) -#define _Notref_impl_ _SA_annotes0(SAL_notref) - -// Declare a function to be an annotation or primop (respectively). -// Done this way so that they don't appear in the regular compiler's -// namespace. -#define __ANNOTATION(fun) _SA_annotes0(SAL_annotation) void __SA_##fun - -#define __PRIMOP(type, fun) _SA_annotes0(SAL_primop) type __SA_##fun - -#define __QUALIFIER(fun) _SA_annotes0(SAL_qualifier) void __SA_##fun; - -#define __In_impl_ _Pre_impl_ _SA_annotes0(SAL_valid) _Pre_impl_ _Deref_impl_ _Notref_impl_ _SA_annotes0(SAL_readonly) - -#else // ][ - -// Using "nothing" for sal - -#define _SA_annotes0(n) -#define _SA_annotes1(n,pp1) -#define _SA_annotes2(n,pp1,pp2) -#define _SA_annotes3(n,pp1,pp2,pp3) - -#define __ANNOTATION(fun) -#define __PRIMOP(type, fun) -#define __QUALIFIER(type, fun) - -#endif // ] - -#if _USE_ATTRIBUTES_FOR_SAL || _USE_DECLSPECS_FOR_SAL // [ - -// Declare annotations that need to be declared. -__ANNOTATION(SAL_useHeader(void)); -__ANNOTATION(SAL_bound(void)); -__ANNOTATION(SAL_allocator(void)); //??? 
resolve with PFD -__ANNOTATION(SAL_file_parser(__AuToQuOtE __In_impl_ char *, __In_impl_ char *)); -__ANNOTATION(SAL_source_code_content(__In_impl_ char *)); -__ANNOTATION(SAL_analysisHint(__AuToQuOtE __In_impl_ char *)); -__ANNOTATION(SAL_untrusted_data_source(__AuToQuOtE __In_impl_ char *)); -__ANNOTATION(SAL_untrusted_data_source_this(__AuToQuOtE __In_impl_ char *)); -__ANNOTATION(SAL_validated(__AuToQuOtE __In_impl_ char *)); -__ANNOTATION(SAL_validated_this(__AuToQuOtE __In_impl_ char *)); -__ANNOTATION(SAL_encoded(void)); -__ANNOTATION(SAL_adt(__AuToQuOtE __In_impl_ char *, __AuToQuOtE __In_impl_ char *)); -__ANNOTATION(SAL_add_adt_property(__AuToQuOtE __In_impl_ char *, __AuToQuOtE __In_impl_ char *)); -__ANNOTATION(SAL_remove_adt_property(__AuToQuOtE __In_impl_ char *, __AuToQuOtE __In_impl_ char *)); -__ANNOTATION(SAL_transfer_adt_property_from(__AuToQuOtE __In_impl_ char *)); -__ANNOTATION(SAL_post_type(__AuToQuOtE __In_impl_ char *)); -__ANNOTATION(SAL_volatile(void)); -__ANNOTATION(SAL_nonvolatile(void)); -__ANNOTATION(SAL_entrypoint(__AuToQuOtE __In_impl_ char *, __AuToQuOtE __In_impl_ char *)); -__ANNOTATION(SAL_blocksOn(__In_impl_ void*)); -__ANNOTATION(SAL_mustInspect(void)); - -// Only appears in model files, but needs to be declared. -__ANNOTATION(SAL_TypeName(__AuToQuOtE __In_impl_ char *)); - -// To be declared well-known soon. -__ANNOTATION(SAL_interlocked(void);) - -#pragma warning (suppress: 28227 28241) -__ANNOTATION(SAL_name(__In_impl_ char *, __In_impl_ char *, __In_impl_ char *);) - -__PRIMOP(char *, _Macro_value_(__In_impl_ char *)); -__PRIMOP(int, _Macro_defined_(__In_impl_ char *)); -__PRIMOP(char *, _Strstr_(__In_impl_ char *, __In_impl_ char *)); - -#endif // ] - -#if _USE_ATTRIBUTES_FOR_SAL // [ - -#define _Check_return_impl_ [SA_Post(MustCheck=SA_Yes)] - -#define _Success_impl_(expr) [SA_Success(Condition=#expr)] -#define _On_failure_impl_(annos) [SAL_context(p1="SAL_failed")] _Group_(_Post_impl_ _Group_(annos _SAL_nop_impl_)) - -#define _Printf_format_string_impl_ [SA_FormatString(Style="printf")] -#define _Scanf_format_string_impl_ [SA_FormatString(Style="scanf")] -#define _Scanf_s_format_string_impl_ [SA_FormatString(Style="scanf_s")] - -#define _In_bound_impl_ [SA_PreBound(Deref=0)] -#define _Out_bound_impl_ [SA_PostBound(Deref=0)] -#define _Ret_bound_impl_ [SA_PostBound(Deref=0)] -#define _Deref_in_bound_impl_ [SA_PreBound(Deref=1)] -#define _Deref_out_bound_impl_ [SA_PostBound(Deref=1)] -#define _Deref_ret_bound_impl_ [SA_PostBound(Deref=1)] - -#define __valid_impl Valid=SA_Yes -#define __maybevalid_impl Valid=SA_Maybe -#define __notvalid_impl Valid=SA_No - -#define __null_impl Null=SA_Yes -#define __maybenull_impl Null=SA_Maybe -#define __notnull_impl Null=SA_No - -#define __null_impl_notref Null=SA_Yes,Notref=1 -#define __maybenull_impl_notref Null=SA_Maybe,Notref=1 -#define __notnull_impl_notref Null=SA_No,Notref=1 - -#define __zterm_impl NullTerminated=SA_Yes -#define __maybezterm_impl NullTerminated=SA_Maybe -#define __maybzterm_impl NullTerminated=SA_Maybe -#define __notzterm_impl NullTerminated=SA_No - -#define __readaccess_impl Access=SA_Read -#define __writeaccess_impl Access=SA_Write -#define __allaccess_impl Access=SA_ReadWrite - -#define __readaccess_impl_notref Access=SA_Read,Notref=1 -#define __writeaccess_impl_notref Access=SA_Write,Notref=1 -#define __allaccess_impl_notref Access=SA_ReadWrite,Notref=1 - -#if _MSC_VER >= 1610 /*IFSTRIP=IGN*/ // [ - -// For SAL2, we need to expect general expressions. 
- -#define __cap_impl(size) WritableElements="\n"#size -#define __bytecap_impl(size) WritableBytes="\n"#size -#define __bytecount_impl(size) ValidBytes="\n"#size -#define __count_impl(size) ValidElements="\n"#size - -#else // ][ - -#define __cap_impl(size) WritableElements=#size -#define __bytecap_impl(size) WritableBytes=#size -#define __bytecount_impl(size) ValidBytes=#size -#define __count_impl(size) ValidElements=#size - -#endif // ] - -#define __cap_c_impl(size) WritableElementsConst=size -#define __cap_c_one_notref_impl WritableElementsConst=1,Notref=1 -#define __cap_for_impl(param) WritableElementsLength=#param -#define __cap_x_impl(size) WritableElements="\n@"#size - -#define __bytecap_c_impl(size) WritableBytesConst=size -#define __bytecap_x_impl(size) WritableBytes="\n@"#size - -#define __mult_impl(mult,size) __cap_impl((mult)*(size)) - -#define __count_c_impl(size) ValidElementsConst=size -#define __count_x_impl(size) ValidElements="\n@"#size - -#define __bytecount_c_impl(size) ValidBytesConst=size -#define __bytecount_x_impl(size) ValidBytes="\n@"#size - - -#define _At_impl_(target, annos) [SAL_at(p1=#target)] _Group_(annos) -#define _At_buffer_impl_(target, iter, bound, annos) [SAL_at_buffer(p1=#target, p2=#iter, p3=#bound)] _Group_(annos) -#define _When_impl_(expr, annos) [SAL_when(p1=#expr)] _Group_(annos) - -#define _Group_impl_(annos) [SAL_begin] annos [SAL_end] -#define _GrouP_impl_(annos) [SAL_BEGIN] annos [SAL_END] - -#define _Use_decl_anno_impl_ _SA_annotes0(SAL_useHeader) // this is a special case! - -#define _Pre1_impl_(p1) [SA_Pre(p1)] -#define _Pre2_impl_(p1,p2) [SA_Pre(p1,p2)] -#define _Pre3_impl_(p1,p2,p3) [SA_Pre(p1,p2,p3)] - -#define _Post1_impl_(p1) [SA_Post(p1)] -#define _Post2_impl_(p1,p2) [SA_Post(p1,p2)] -#define _Post3_impl_(p1,p2,p3) [SA_Post(p1,p2,p3)] - -#define _Ret1_impl_(p1) [SA_Post(p1)] -#define _Ret2_impl_(p1,p2) [SA_Post(p1,p2)] -#define _Ret3_impl_(p1,p2,p3) [SA_Post(p1,p2,p3)] - -#define _Deref_pre1_impl_(p1) [SA_Pre(Deref=1,p1)] -#define _Deref_pre2_impl_(p1,p2) [SA_Pre(Deref=1,p1,p2)] -#define _Deref_pre3_impl_(p1,p2,p3) [SA_Pre(Deref=1,p1,p2,p3)] - - -#define _Deref_post1_impl_(p1) [SA_Post(Deref=1,p1)] -#define _Deref_post2_impl_(p1,p2) [SA_Post(Deref=1,p1,p2)] -#define _Deref_post3_impl_(p1,p2,p3) [SA_Post(Deref=1,p1,p2,p3)] - -#define _Deref_ret1_impl_(p1) [SA_Post(Deref=1,p1)] -#define _Deref_ret2_impl_(p1,p2) [SA_Post(Deref=1,p1,p2)] -#define _Deref_ret3_impl_(p1,p2,p3) [SA_Post(Deref=1,p1,p2,p3)] - -#define _Deref2_pre1_impl_(p1) [SA_Pre(Deref=2,Notref=1,p1)] -#define _Deref2_post1_impl_(p1) [SA_Post(Deref=2,Notref=1,p1)] -#define _Deref2_ret1_impl_(p1) [SA_Post(Deref=2,Notref=1,p1)] - -// Obsolete -- may be needed for transition to attributes. 
-#define __inner_typefix(ctype) [SAL_typefix(p1=_SA_SPECSTRIZE(ctype))] -#define __inner_exceptthat [SAL_except] - - -#elif _USE_DECLSPECS_FOR_SAL // ][ - -#define _Check_return_impl_ __post _SA_annotes0(SAL_checkReturn) - -#define _Success_impl_(expr) _SA_annotes1(SAL_success, expr) -#define _On_failure_impl_(annos) _SA_annotes1(SAL_context, SAL_failed) _Group_(_Post_impl_ _Group_(_SAL_nop_impl_ annos)) - -#define _Printf_format_string_impl_ _SA_annotes1(SAL_IsFormatString, "printf") -#define _Scanf_format_string_impl_ _SA_annotes1(SAL_IsFormatString, "scanf") -#define _Scanf_s_format_string_impl_ _SA_annotes1(SAL_IsFormatString, "scanf_s") - -#define _In_bound_impl_ _Pre_impl_ _Bound_impl_ -#define _Out_bound_impl_ _Post_impl_ _Bound_impl_ -#define _Ret_bound_impl_ _Post_impl_ _Bound_impl_ -#define _Deref_in_bound_impl_ _Deref_pre_impl_ _Bound_impl_ -#define _Deref_out_bound_impl_ _Deref_post_impl_ _Bound_impl_ -#define _Deref_ret_bound_impl_ _Deref_post_impl_ _Bound_impl_ - - -#define __null_impl _SA_annotes0(SAL_null) // _SA_annotes1(SAL_null, __yes) -#define __notnull_impl _SA_annotes0(SAL_notnull) // _SA_annotes1(SAL_null, __no) -#define __maybenull_impl _SA_annotes0(SAL_maybenull) // _SA_annotes1(SAL_null, __maybe) - -#define __valid_impl _SA_annotes0(SAL_valid) // _SA_annotes1(SAL_valid, __yes) -#define __notvalid_impl _SA_annotes0(SAL_notvalid) // _SA_annotes1(SAL_valid, __no) -#define __maybevalid_impl _SA_annotes0(SAL_maybevalid) // _SA_annotes1(SAL_valid, __maybe) - -#define __null_impl_notref _Notref_ _Null_impl_ -#define __maybenull_impl_notref _Notref_ _Maybenull_impl_ -#define __notnull_impl_notref _Notref_ _Notnull_impl_ - -#define __zterm_impl _SA_annotes1(SAL_nullTerminated, __yes) -#define __maybezterm_impl _SA_annotes1(SAL_nullTerminated, __maybe) -#define __maybzterm_impl _SA_annotes1(SAL_nullTerminated, __maybe) -#define __notzterm_impl _SA_annotes1(SAL_nullTerminated, __no) - -#define __readaccess_impl _SA_annotes1(SAL_access, 0x1) -#define __writeaccess_impl _SA_annotes1(SAL_access, 0x2) -#define __allaccess_impl _SA_annotes1(SAL_access, 0x3) - -#define __readaccess_impl_notref _Notref_ _SA_annotes1(SAL_access, 0x1) -#define __writeaccess_impl_notref _Notref_ _SA_annotes1(SAL_access, 0x2) -#define __allaccess_impl_notref _Notref_ _SA_annotes1(SAL_access, 0x3) - -#define __cap_impl(size) _SA_annotes1(SAL_writableTo,elementCount(size)) -#define __cap_c_impl(size) _SA_annotes1(SAL_writableTo,elementCount(size)) -#define __cap_c_one_notref_impl _Notref_ _SA_annotes1(SAL_writableTo,elementCount(1)) -#define __cap_for_impl(param) _SA_annotes1(SAL_writableTo,inexpressibleCount(sizeof(param))) -#define __cap_x_impl(size) _SA_annotes1(SAL_writableTo,inexpressibleCount(#size)) - -#define __bytecap_impl(size) _SA_annotes1(SAL_writableTo,byteCount(size)) -#define __bytecap_c_impl(size) _SA_annotes1(SAL_writableTo,byteCount(size)) -#define __bytecap_x_impl(size) _SA_annotes1(SAL_writableTo,inexpressibleCount(#size)) - -#define __mult_impl(mult,size) _SA_annotes1(SAL_writableTo,(mult)*(size)) - -#define __count_impl(size) _SA_annotes1(SAL_readableTo,elementCount(size)) -#define __count_c_impl(size) _SA_annotes1(SAL_readableTo,elementCount(size)) -#define __count_x_impl(size) _SA_annotes1(SAL_readableTo,inexpressibleCount(#size)) - -#define __bytecount_impl(size) _SA_annotes1(SAL_readableTo,byteCount(size)) -#define __bytecount_c_impl(size) _SA_annotes1(SAL_readableTo,byteCount(size)) -#define __bytecount_x_impl(size) _SA_annotes1(SAL_readableTo,inexpressibleCount(#size)) - 
-#define _At_impl_(target, annos) _SA_annotes0(SAL_at(target)) _Group_(annos) -#define _At_buffer_impl_(target, iter, bound, annos) _SA_annotes3(SAL_at_buffer, target, iter, bound) _Group_(annos) -#define _Group_impl_(annos) _SA_annotes0(SAL_begin) annos _SA_annotes0(SAL_end) -#define _GrouP_impl_(annos) _SA_annotes0(SAL_BEGIN) annos _SA_annotes0(SAL_END) -#define _When_impl_(expr, annos) _SA_annotes0(SAL_when(expr)) _Group_(annos) - -#define _Use_decl_anno_impl_ __declspec("SAL_useHeader()") // this is a special case! - -#define _Pre1_impl_(p1) _Pre_impl_ p1 -#define _Pre2_impl_(p1,p2) _Pre_impl_ p1 _Pre_impl_ p2 -#define _Pre3_impl_(p1,p2,p3) _Pre_impl_ p1 _Pre_impl_ p2 _Pre_impl_ p3 - -#define _Post1_impl_(p1) _Post_impl_ p1 -#define _Post2_impl_(p1,p2) _Post_impl_ p1 _Post_impl_ p2 -#define _Post3_impl_(p1,p2,p3) _Post_impl_ p1 _Post_impl_ p2 _Post_impl_ p3 - -#define _Ret1_impl_(p1) _Post_impl_ p1 -#define _Ret2_impl_(p1,p2) _Post_impl_ p1 _Post_impl_ p2 -#define _Ret3_impl_(p1,p2,p3) _Post_impl_ p1 _Post_impl_ p2 _Post_impl_ p3 - -#define _Deref_pre1_impl_(p1) _Deref_pre_impl_ p1 -#define _Deref_pre2_impl_(p1,p2) _Deref_pre_impl_ p1 _Deref_pre_impl_ p2 -#define _Deref_pre3_impl_(p1,p2,p3) _Deref_pre_impl_ p1 _Deref_pre_impl_ p2 _Deref_pre_impl_ p3 - -#define _Deref_post1_impl_(p1) _Deref_post_impl_ p1 -#define _Deref_post2_impl_(p1,p2) _Deref_post_impl_ p1 _Deref_post_impl_ p2 -#define _Deref_post3_impl_(p1,p2,p3) _Deref_post_impl_ p1 _Deref_post_impl_ p2 _Deref_post_impl_ p3 - -#define _Deref_ret1_impl_(p1) _Deref_post_impl_ p1 -#define _Deref_ret2_impl_(p1,p2) _Deref_post_impl_ p1 _Deref_post_impl_ p2 -#define _Deref_ret3_impl_(p1,p2,p3) _Deref_post_impl_ p1 _Deref_post_impl_ p2 _Deref_post_impl_ p3 - -#define _Deref2_pre1_impl_(p1) _Deref_pre_impl_ _Notref_impl_ _Deref_impl_ p1 -#define _Deref2_post1_impl_(p1) _Deref_post_impl_ _Notref_impl_ _Deref_impl_ p1 -#define _Deref2_ret1_impl_(p1) _Deref_post_impl_ _Notref_impl_ _Deref_impl_ p1 - -#define __inner_typefix(ctype) _SA_annotes1(SAL_typefix, ctype) -#define __inner_exceptthat _SA_annotes0(SAL_except) - -#elif defined(_MSC_EXTENSIONS) && !defined( MIDL_PASS ) && !defined(__midl) && !defined(RC_INVOKED) && defined(_PFT_VER) && _MSC_VER >= 1400 /*IFSTRIP=IGN*/ // ][ - -// minimum attribute expansion for foreground build - -#pragma push_macro( "SA" ) -#pragma push_macro( "REPEATABLE" ) - -#ifdef __cplusplus // [ -#define SA( id ) id -#define REPEATABLE [repeatable] -#else // !__cplusplus // ][ -#define SA( id ) SA_##id -#define REPEATABLE -#endif // !__cplusplus // ] - -REPEATABLE -[source_annotation_attribute( SA( Parameter ) )] -struct __P_impl -{ -#ifdef __cplusplus // [ - __P_impl(); -#endif // ] - int __d_; -}; -typedef struct __P_impl __P_impl; - -REPEATABLE -[source_annotation_attribute( SA( ReturnValue ) )] -struct __R_impl -{ -#ifdef __cplusplus // [ - __R_impl(); -#endif // ] - int __d_; -}; -typedef struct __R_impl __R_impl; - -[source_annotation_attribute( SA( Method ) )] -struct __M_ -{ -#ifdef __cplusplus // [ - __M_(); -#endif // ] - int __d_; -}; -typedef struct __M_ __M_; - -[source_annotation_attribute( SA( All ) )] -struct __A_ -{ -#ifdef __cplusplus // [ - __A_(); -#endif // ] - int __d_; -}; -typedef struct __A_ __A_; - -[source_annotation_attribute( SA( Field ) )] -struct __F_ -{ -#ifdef __cplusplus // [ - __F_(); -#endif // ] - int __d_; -}; -typedef struct __F_ __F_; - -#pragma pop_macro( "REPEATABLE" ) -#pragma pop_macro( "SA" ) - - -#define _SAL_nop_impl_ - -#define _At_impl_(target, annos) [__A_(__d_=0)] 
-#define _At_buffer_impl_(target, iter, bound, annos) [__A_(__d_=0)] -#define _When_impl_(expr, annos) annos -#define _Group_impl_(annos) annos -#define _GrouP_impl_(annos) annos -#define _Use_decl_anno_impl_ [__M_(__d_=0)] - -#define _Points_to_data_impl_ [__P_impl(__d_=0)] -#define _Literal_impl_ [__P_impl(__d_=0)] -#define _Notliteral_impl_ [__P_impl(__d_=0)] - -#define _Pre_valid_impl_ [__P_impl(__d_=0)] -#define _Post_valid_impl_ [__P_impl(__d_=0)] -#define _Ret_valid_impl_ [__R_impl(__d_=0)] - -#define _Check_return_impl_ [__R_impl(__d_=0)] -#define _Must_inspect_impl_ [__R_impl(__d_=0)] - -#define _Success_impl_(expr) [__M_(__d_=0)] -#define _On_failure_impl_(expr) [__M_(__d_=0)] -#define _Always_impl_(expr) [__M_(__d_=0)] - -#define _Printf_format_string_impl_ [__P_impl(__d_=0)] -#define _Scanf_format_string_impl_ [__P_impl(__d_=0)] -#define _Scanf_s_format_string_impl_ [__P_impl(__d_=0)] - -#define _Raises_SEH_exception_impl_ [__M_(__d_=0)] -#define _Maybe_raises_SEH_exception_impl_ [__M_(__d_=0)] - -#define _In_bound_impl_ [__P_impl(__d_=0)] -#define _Out_bound_impl_ [__P_impl(__d_=0)] -#define _Ret_bound_impl_ [__R_impl(__d_=0)] -#define _Deref_in_bound_impl_ [__P_impl(__d_=0)] -#define _Deref_out_bound_impl_ [__P_impl(__d_=0)] -#define _Deref_ret_bound_impl_ [__R_impl(__d_=0)] - -#define _Range_impl_(min,max) [__P_impl(__d_=0)] -#define _In_range_impl_(min,max) [__P_impl(__d_=0)] -#define _Out_range_impl_(min,max) [__P_impl(__d_=0)] -#define _Ret_range_impl_(min,max) [__R_impl(__d_=0)] -#define _Deref_in_range_impl_(min,max) [__P_impl(__d_=0)] -#define _Deref_out_range_impl_(min,max) [__P_impl(__d_=0)] -#define _Deref_ret_range_impl_(min,max) [__R_impl(__d_=0)] - -#define _Field_range_impl_(min,max) [__F_(__d_=0)] - -#define _Pre_satisfies_impl_(cond) [__A_(__d_=0)] -#define _Post_satisfies_impl_(cond) [__A_(__d_=0)] -#define _Satisfies_impl_(cond) [__A_(__d_=0)] - -#define _Null_impl_ [__A_(__d_=0)] -#define _Notnull_impl_ [__A_(__d_=0)] -#define _Maybenull_impl_ [__A_(__d_=0)] - -#define _Valid_impl_ [__A_(__d_=0)] -#define _Notvalid_impl_ [__A_(__d_=0)] -#define _Maybevalid_impl_ [__A_(__d_=0)] - -#define _Readable_bytes_impl_(size) [__A_(__d_=0)] -#define _Readable_elements_impl_(size) [__A_(__d_=0)] -#define _Writable_bytes_impl_(size) [__A_(__d_=0)] -#define _Writable_elements_impl_(size) [__A_(__d_=0)] - -#define _Null_terminated_impl_ [__A_(__d_=0)] -#define _NullNull_terminated_impl_ [__A_(__d_=0)] - -#define _Pre_impl_ [__P_impl(__d_=0)] -#define _Pre1_impl_(p1) [__P_impl(__d_=0)] -#define _Pre2_impl_(p1,p2) [__P_impl(__d_=0)] -#define _Pre3_impl_(p1,p2,p3) [__P_impl(__d_=0)] - -#define _Post_impl_ [__P_impl(__d_=0)] -#define _Post1_impl_(p1) [__P_impl(__d_=0)] -#define _Post2_impl_(p1,p2) [__P_impl(__d_=0)] -#define _Post3_impl_(p1,p2,p3) [__P_impl(__d_=0)] - -#define _Ret1_impl_(p1) [__R_impl(__d_=0)] -#define _Ret2_impl_(p1,p2) [__R_impl(__d_=0)] -#define _Ret3_impl_(p1,p2,p3) [__R_impl(__d_=0)] - -#define _Deref_pre1_impl_(p1) [__P_impl(__d_=0)] -#define _Deref_pre2_impl_(p1,p2) [__P_impl(__d_=0)] -#define _Deref_pre3_impl_(p1,p2,p3) [__P_impl(__d_=0)] - -#define _Deref_post1_impl_(p1) [__P_impl(__d_=0)] -#define _Deref_post2_impl_(p1,p2) [__P_impl(__d_=0)] -#define _Deref_post3_impl_(p1,p2,p3) [__P_impl(__d_=0)] - -#define _Deref_ret1_impl_(p1) [__R_impl(__d_=0)] -#define _Deref_ret2_impl_(p1,p2) [__R_impl(__d_=0)] -#define _Deref_ret3_impl_(p1,p2,p3) [__R_impl(__d_=0)] - -#define _Deref2_pre1_impl_(p1) //[__P_impl(__d_=0)] -#define _Deref2_post1_impl_(p1) 
//[__P_impl(__d_=0)] -#define _Deref2_ret1_impl_(p1) //[__P_impl(__d_=0)] - -#else // ][ - - -#define _SAL_nop_impl_ X - -#define _At_impl_(target, annos) -#define _When_impl_(expr, annos) -#define _Group_impl_(annos) -#define _GrouP_impl_(annos) -#define _At_buffer_impl_(target, iter, bound, annos) -#define _Use_decl_anno_impl_ -#define _Points_to_data_impl_ -#define _Literal_impl_ -#define _Notliteral_impl_ -#define _Notref_impl_ - -#define _Pre_valid_impl_ -#define _Post_valid_impl_ -#define _Ret_valid_impl_ - -#define _Check_return_impl_ -#define _Must_inspect_impl_ - -#define _Success_impl_(expr) -#define _On_failure_impl_(annos) -#define _Always_impl_(annos) - -#define _Printf_format_string_impl_ -#define _Scanf_format_string_impl_ -#define _Scanf_s_format_string_impl_ - -#define _In_bound_impl_ -#define _Out_bound_impl_ -#define _Ret_bound_impl_ -#define _Deref_in_bound_impl_ -#define _Deref_out_bound_impl_ -#define _Deref_ret_bound_impl_ - -#define _Range_impl_(min,max) -#define _In_range_impl_(min,max) -#define _Out_range_impl_(min,max) -#define _Ret_range_impl_(min,max) -#define _Deref_in_range_impl_(min,max) -#define _Deref_out_range_impl_(min,max) -#define _Deref_ret_range_impl_(min,max) - -#define _Satisfies_impl_(expr) -#define _Pre_satisfies_impl_(expr) -#define _Post_satisfies_impl_(expr) - -#define _Null_impl_ -#define _Notnull_impl_ -#define _Maybenull_impl_ - -#define _Valid_impl_ -#define _Notvalid_impl_ -#define _Maybevalid_impl_ - -#define _Field_range_impl_(min,max) - -#define _Pre_impl_ -#define _Pre1_impl_(p1) -#define _Pre2_impl_(p1,p2) -#define _Pre3_impl_(p1,p2,p3) - -#define _Post_impl_ -#define _Post1_impl_(p1) -#define _Post2_impl_(p1,p2) -#define _Post3_impl_(p1,p2,p3) - -#define _Ret1_impl_(p1) -#define _Ret2_impl_(p1,p2) -#define _Ret3_impl_(p1,p2,p3) - -#define _Deref_pre1_impl_(p1) -#define _Deref_pre2_impl_(p1,p2) -#define _Deref_pre3_impl_(p1,p2,p3) - -#define _Deref_post1_impl_(p1) -#define _Deref_post2_impl_(p1,p2) -#define _Deref_post3_impl_(p1,p2,p3) - -#define _Deref_ret1_impl_(p1) -#define _Deref_ret2_impl_(p1,p2) -#define _Deref_ret3_impl_(p1,p2,p3) - -#define _Deref2_pre1_impl_(p1) -#define _Deref2_post1_impl_(p1) -#define _Deref2_ret1_impl_(p1) - -#define _Readable_bytes_impl_(size) -#define _Readable_elements_impl_(size) -#define _Writable_bytes_impl_(size) -#define _Writable_elements_impl_(size) +/* +**============================================================================== +** +** Open Management Infrastructure (OMI) v.1.1.0 +** +** Copyright (c) Microsoft Corporation +** +** All rights reserved. +** +** MIT License +** +** Permission is hereby granted, free of charge, to any person obtaining +** a copy of this software and associated documentation files (the +** ""Software""), to deal in the Software without restriction, including +** without limitation the rights to use, copy, modify, merge, publish, +** distribute, sublicense, and/or sell copies of the Software, and to +** permit persons to whom the Software is furnished to do so, subject to +** the following conditions: +** +** The above copyright notice and this permission notice shall be +** included in all copies or substantial portions of the Software. +** +** THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, +** EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +** MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +** NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +** LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +** OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +** WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +** +**============================================================================== +*/ -#define _Null_terminated_impl_ -#define _NullNull_terminated_impl_ +#ifndef _SAL_h +#define _SAL_h -// Obsolete -- may be needed for transition to attributes. -#define __inner_typefix(ctype) -#define __inner_exceptthat +#if !defined(_In_) +# define _In_ +#endif -#endif // ] - -// This section contains the deprecated annotations +#if !defined(_In_count_) +# define _In_count_(count) +#endif -/* - ------------------------------------------------------------------------------- - Introduction - - sal.h provides a set of annotations to describe how a function uses its - parameters - the assumptions it makes about them, and the guarantees it makes - upon finishing. - - Annotations may be placed before either a function parameter's type or its return - type, and describe the function's behavior regarding the parameter or return value. - There are two classes of annotations: buffer annotations and advanced annotations. - Buffer annotations describe how functions use their pointer parameters, and - advanced annotations either describe complex/unusual buffer behavior, or provide - additional information about a parameter that is not otherwise expressible. - - ------------------------------------------------------------------------------- - Buffer Annotations - - The most important annotations in sal.h provide a consistent way to annotate - buffer parameters or return values for a function. Each of these annotations describes - a single buffer (which could be a string, a fixed-length or variable-length array, - or just a pointer) that the function interacts with: where it is, how large it is, - how much is initialized, and what the function does with it. - - The appropriate macro for a given buffer can be constructed using the table below. - Just pick the appropriate values from each category, and combine them together - with a leading underscore. Some combinations of values do not make sense as buffer - annotations. Only meaningful annotations can be added to your code; for a list of - these, see the buffer annotation definitions section. - - Only a single buffer annotation should be used for each parameter. - - |------------|------------|---------|--------|----------|----------|---------------| - | Level | Usage | Size | Output | NullTerm | Optional | Parameters | - |------------|------------|---------|--------|----------|----------|---------------| - | <> | <> | <> | <> | _z | <> | <> | - | _deref | _in | _ecount | _full | _nz | _opt | (size) | - | _deref_opt | _out | _bcount | _part | | | (size,length) | - | | _inout | | | | | | - | | | | | | | | - |------------|------------|---------|--------|----------|----------|---------------| - - Level: Describes the buffer pointer's level of indirection from the parameter or - return value 'p'. - - <> : p is the buffer pointer. - _deref : *p is the buffer pointer. p must not be NULL. - _deref_opt : *p may be the buffer pointer. p may be NULL, in which case the rest of - the annotation is ignored. - - Usage: Describes how the function uses the buffer. - - <> : The buffer is not accessed. 
If used on the return value or with _deref, the - function will provide the buffer, and it will be uninitialized at exit. - Otherwise, the caller must provide the buffer. This should only be used - for alloc and free functions. - _in : The function will only read from the buffer. The caller must provide the - buffer and initialize it. Cannot be used with _deref. - _out : The function will only write to the buffer. If used on the return value or - with _deref, the function will provide the buffer and initialize it. - Otherwise, the caller must provide the buffer, and the function will - initialize it. - _inout : The function may freely read from and write to the buffer. The caller must - provide the buffer and initialize it. If used with _deref, the buffer may - be reallocated by the function. - - Size: Describes the total size of the buffer. This may be less than the space actually - allocated for the buffer, in which case it describes the accessible amount. - - <> : No buffer size is given. If the type specifies the buffer size (such as - with LPSTR and LPWSTR), that amount is used. Otherwise, the buffer is one - element long. Must be used with _in, _out, or _inout. - _ecount : The buffer size is an explicit element count. - _bcount : The buffer size is an explicit byte count. - - Output: Describes how much of the buffer will be initialized by the function. For - _inout buffers, this also describes how much is initialized at entry. Omit this - category for _in buffers; they must be fully initialized by the caller. - - <> : The type specifies how much is initialized. For instance, a function initializing - an LPWSTR must NULL-terminate the string. - _full : The function initializes the entire buffer. - _part : The function initializes part of the buffer, and explicitly indicates how much. - - NullTerm: States if the present of a '\0' marks the end of valid elements in the buffer. - _z : A '\0' indicated the end of the buffer - _nz : The buffer may not be null terminated and a '\0' does not indicate the end of the - buffer. - Optional: Describes if the buffer itself is optional. - - <> : The pointer to the buffer must not be NULL. - _opt : The pointer to the buffer might be NULL. It will be checked before being dereferenced. - - Parameters: Gives explicit counts for the size and length of the buffer. - - <> : There is no explicit count. Use when neither _ecount nor _bcount is used. - (size) : Only the buffer's total size is given. Use with _ecount or _bcount but not _part. - (size,length) : The buffer's total size and initialized length are given. Use with _ecount_part - and _bcount_part. - - ------------------------------------------------------------------------------- - Buffer Annotation Examples - - LWSTDAPI_(BOOL) StrToIntExA( - __in LPCSTR pszString, - DWORD dwFlags, - __out int *piRet -- A pointer whose dereference will be filled in. - ); - - void MyPaintingFunction( - __in HWND hwndControl, -- An initialized read-only parameter. - __in_opt HDC hdcOptional, -- An initialized read-only parameter that might be NULL. - __inout IPropertyStore *ppsStore -- An initialized parameter that may be freely used - -- and modified. - ); - - LWSTDAPI_(BOOL) PathCompactPathExA( - __out_ecount(cchMax) LPSTR pszOut, -- A string buffer with cch elements that will - -- be NULL terminated on exit. 
- __in LPCSTR pszSrc, - UINT cchMax, - DWORD dwFlags - ); - - HRESULT SHLocalAllocBytes( - size_t cb, - __deref_bcount(cb) T **ppv -- A pointer whose dereference will be set to an - -- uninitialized buffer with cb bytes. - ); - - __inout_bcount_full(cb) : A buffer with cb elements that is fully initialized at - entry and exit, and may be written to by this function. - - __out_ecount_part(count, *countOut) : A buffer with count elements that will be - partially initialized by this function. The function indicates how much it - initialized by setting *countOut. - - ------------------------------------------------------------------------------- - Advanced Annotations - - Advanced annotations describe behavior that is not expressible with the regular - buffer macros. These may be used either to annotate buffer parameters that involve - complex or conditional behavior, or to enrich existing annotations with additional - information. - - __success(expr) f : - indicates whether function f succeeded or not. If is true at exit, - all the function's guarantees (as given by other annotations) must hold. If - is false at exit, the caller should not expect any of the function's guarantees - to hold. If not used, the function must always satisfy its guarantees. Added - automatically to functions that indicate success in standard ways, such as by - returning an HRESULT. - - __nullterminated p : - Pointer p is a buffer that may be read or written up to and including the first - NULL character or pointer. May be used on typedefs, which marks valid (properly - initialized) instances of that type as being NULL-terminated. - - __nullnullterminated p : - Pointer p is a buffer that may be read or written up to and including the first - sequence of two NULL characters or pointers. May be used on typedefs, which marks - valid instances of that type as being double-NULL terminated. - - __reserved v : - Value v must be 0/NULL, reserved for future use. - - __checkReturn v : - Return value v must not be ignored by callers of this function. - - __typefix(ctype) v : - Value v should be treated as an instance of ctype, rather than its declared type. - - __override f : - Specify C#-style 'override' behaviour for overriding virtual methods. +#if !defined(_In_opt_) +# define _In_opt_ +#endif - __callback f : - Function f can be used as a function pointer. - - __format_string p : - Pointer p is a string that contains % markers in the style of printf. - - __blocksOn(resource) f : - Function f blocks on the resource 'resource'. +#if !defined(_In_z_) +# define _In_z_ +#endif - FALLTHROUGH : - Annotates switch statement labels where fall-through is desired, to distinguish - from forgotten break statements. +#if !defined(_In_opt_z_) +# define _In_opt_z_ +#endif - ------------------------------------------------------------------------------- - Advanced Annotation Examples +#if !defined(_Must_inspect_result_) +# define _Must_inspect_result_ +#endif - __success(return != FALSE) LWSTDAPI_(BOOL) - PathCanonicalizeA(__out_ecount(MAX_PATH) LPSTR pszBuf, LPCSTR pszPath) : - pszBuf is only guaranteed to be NULL-terminated when TRUE is returned. +#if !defined(_Out_) +# define _Out_ +#endif - typedef __nullterminated WCHAR* LPWSTR : Initialized LPWSTRs are NULL-terminated strings. - - __out_ecount(cch) __typefix(LPWSTR) void *psz : psz is a buffer parameter which will be - a NULL-terminated WCHAR string at exit, and which initially contains cch WCHARs. 
+#if !defined(_Outptr_) +# define _Outptr_ +#endif - ------------------------------------------------------------------------------- -*/ +#if !defined(_Outptr_opt_) +# define _Outptr_opt_ +#endif -#define __specstrings +#if !defined(_Out_writes_z_) +# define _Out_writes_z_(count) +#endif -#ifdef __cplusplus // [ -#ifndef __nothrow // [ -# define __nothrow NOTHROW_DECL -#endif // ] -extern "C" { -#else // ][ -#ifndef __nothrow // [ -# define __nothrow -#endif // ] -#endif /* #ifdef __cplusplus */ // ] +#if !defined(_Outptr_result_z_) +# define _Outptr_result_z_ +#endif +#if !defined(_Outptr_result_bytebuffer_) +# define _Outptr_result_bytebuffer_(count) +#endif -/* - ------------------------------------------------------------------------------- - Helper Macro Definitions +#if !defined(_Outptr_result_maybenull_) +# define _Outptr_result_maybenull_ +#endif - These express behavior common to many of the high-level annotations. - DO NOT USE THESE IN YOUR CODE. - ------------------------------------------------------------------------------- -*/ +#if !defined(_Outptr_opt_result_maybenull_) +# define _Outptr_opt_result_maybenull_ +#endif -/* - The helper annotations are only understood by the compiler version used by - various defect detection tools. When the regular compiler is running, they - are defined into nothing, and do not affect the compiled code. -*/ +#if !defined(_Outptr_result_maybenull_z_) +# define _Outptr_result_maybenull_z_ +#endif -#if !defined(__midl) && defined(_PREFAST_) // [ +#if !defined(_Outptr_opt_result_z_) +# define _Outptr_opt_result_z_ +#endif - /* - In the primitive "SAL_*" annotations "SAL" stands for Standard - Annotation Language. These "SAL_*" annotations are the - primitives the compiler understands and high-level MACROs - will decompose into these primivates. - */ +#if !defined(_Outptr_opt_result_maybenull_z_) +# define _Outptr_opt_result_maybenull_z_ +#endif - #define _SA_SPECSTRIZE( x ) #x +#if !defined(_Return_type_success_) +# define _Return_type_success_(expr) +#endif - /* - __null p - __notnull p - __maybenull p +#if !defined(_In_reads_bytes_) +# define _In_reads_bytes_(count) +#endif - Annotates a pointer p. States that pointer p is null. Commonly used - in the negated form __notnull or the possibly null form __maybenull. - */ +#if !defined(_In_reads_opt_) +# define _In_reads_opt_(expr) +#endif -#ifndef PAL_STDCPP_COMPAT - #define __null _Null_impl_ - #define __notnull _Notnull_impl_ - #define __maybenull _Maybenull_impl_ -#endif // !PAL_STDCPP_COMPAT +#if !defined(_Out_writes_to_opt_) +# define _Out_writes_to_opt_(length, lengthwritten) +#endif - /* - __readonly l - __notreadonly l - __maybereadonly l +#if !defined(_Acquires_lock_) +# define _Acquires_lock_(lock) +#endif - Annotates a location l. States that location l is not modified after - this point. If the annotation is placed on the precondition state of - a function, the restriction only applies until the postcondition state - of the function. __maybereadonly states that the annotated location - may be modified, whereas __notreadonly states that a location must be - modified. - */ +#if !defined(_Releases_lock_) +# define _Releases_lock_(lock) +#endif - #define __readonly _Pre1_impl_(__readaccess_impl) - #define __notreadonly _Pre1_impl_(__allaccess_impl) - #define __maybereadonly _Pre1_impl_(__readaccess_impl) +#if !defined(_Inout_) +# define _Inout_ +#endif - /* - __valid v - __notvalid v - __maybevalid v +#if !defined(_Inout_opt_) +# define _Inout_opt_ +#endif - Annotates any value v. 
States that the value satisfies all properties of - valid values of its type. For example, for a string buffer, valid means - that the buffer pointer is either NULL or points to a NULL-terminated string. - */ +#if !defined(_Inout_z_) +# define _Inout_z_ +#endif - #define __valid _Valid_impl_ - #define __notvalid _Notvalid_impl_ - #define __maybevalid _Maybevalid_impl_ +#if !defined(_Out_opt_) +# define _Out_opt_ +#endif - /* - __readableTo(extent) p +#if !defined(_Out_writes_bytes_) +# define _Out_writes_bytes_(count) +#endif - Annotates a buffer pointer p. If the buffer can be read, extent describes - how much of the buffer is readable. For a reader of the buffer, this is - an explicit permission to read up to that amount, rather than a restriction to - read only up to it. - */ +#if !defined(_In_reads_) +# define _In_reads_(count) +#endif - #define __readableTo(extent) _SA_annotes1(SAL_readableTo, extent) +#if !defined(_In_reads_z_) +# define _In_reads_z_(count) +#endif - /* +#if !defined(_Out_writes_opt_) +# define _Out_writes_opt_(count) +#endif - __elem_readableTo(size) +#if !defined(_Null_terminated_) +#define _Null_terminated_ +#endif - Annotates a buffer pointer p as being readable to size elements. - */ +#if !defined(_Requires_lock_not_held_) +#define _Requires_lock_not_held_(lock) +#endif - #define __elem_readableTo(size) _SA_annotes1(SAL_readableTo, elementCount( size )) +#if !defined(_Requires_lock_held_) +#define _Requires_lock_held_(lock) +#endif - /* - __byte_readableTo(size) +#if !defined(__field_ecount) +#define __field_ecount(count) +#endif - Annotates a buffer pointer p as being readable to size bytes. - */ - #define __byte_readableTo(size) _SA_annotes1(SAL_readableTo, byteCount(size)) +#if !defined(_Check_return_) +#define _Check_return_ +#endif - /* - __writableTo(extent) p +#if !defined(_Deref_post_z_) +#define _Deref_post_z_ +#endif - Annotates a buffer pointer p. If the buffer can be modified, extent - describes how much of the buffer is writable (usually the allocation - size). For a writer of the buffer, this is an explicit permission to - write up to that amount, rather than a restriction to write only up to it. - */ - #define __writableTo(size) _SA_annotes1(SAL_writableTo, size) +#if !defined(_Deref_prepost_opt_z_) +#define _Deref_prepost_opt_z_ +#endif - /* - __elem_writableTo(size) +#if !defined(_Deref_out_range_) +#define _Deref_out_range_(min, max) +#endif - Annotates a buffer pointer p as being writable to size elements. - */ - #define __elem_writableTo(size) _SA_annotes1(SAL_writableTo, elementCount( size )) +#if !defined(_Inout_opt_z_) +#define _Inout_opt_z_ +#endif - /* - __byte_writableTo(size) +#if !defined(_Inout_updates_z_) +#define _Inout_updates_z_(count) +#endif - Annotates a buffer pointer p as being writable to size bytes. - */ - #define __byte_writableTo(size) _SA_annotes1(SAL_writableTo, byteCount( size)) +#if !defined(_Out_writes_) +#define _Out_writes_(count) +#endif - /* - __deref p +#if !defined(_Post_readable_size_) +#define _Post_readable_size_(count) +#endif - Annotates a pointer p. The next annotation applies one dereference down - in the type. If readableTo(p, size) then the next annotation applies to - all elements *(p+i) for which i satisfies the size. If p is a pointer - to a struct, the next annotation applies to all fields of the struct. 
- */ - #define __deref _Deref_impl_ +#if !defined(_Post_ptr_invalid_) +#define _Post_ptr_invalid_ +#endif - /* - __pre __next_annotation +#if !defined(_Pre_valid_) +#define _Pre_valid_ +#endif - The next annotation applies in the precondition state - */ - #define __pre _Pre_impl_ +#if !defined(_Pre_writable_size_) +#define _Pre_writable_size_(count) +#endif - /* - __post __next_annotation +#if !defined(_Success_) +#define _Success_(count) +#endif - The next annotation applies in the postcondition state - */ - #define __post _Post_impl_ - - /* - __precond() - - When is true, the next annotation applies in the precondition state - (currently not enabled) - */ - #define __precond(expr) __pre - - /* - __postcond() - - When is true, the next annotation applies in the postcondition state - (currently not enabled) - */ - #define __postcond(expr) __post - - /* - __exceptthat +#if !defined(_Ret_notnull_) +#define _Ret_notnull_ +#endif - Given a set of annotations Q containing __exceptthat maybeP, the effect of - the except clause is to erase any P or notP annotations (explicit or - implied) within Q at the same level of dereferencing that the except - clause appears, and to replace it with maybeP. +#if !defined(_Ret_z_) +#define _Ret_z_ +#endif - Example 1: __valid __pre_except_maybenull on a pointer p means that the - pointer may be null, and is otherwise valid, thus overriding - the implicit notnull annotation implied by __valid on - pointers. +#if !defined(_Use_decl_annotations_) +#define _Use_decl_annotations_ +#endif - Example 2: __valid __deref __pre_except_maybenull on an int **p means - that p is not null (implied by valid), but the elements - pointed to by p could be null, and are otherwise valid. - */ - #define __exceptthat __inner_exceptthat +#if !defined(_Ret_maybenull_) +#define _Ret_maybenull_ +#endif - /* - _refparam +#if !defined(_Pre_writable_byte_size_) +#define _Pre_writable_byte_size_(count) +#endif - Added to all out parameter macros to indicate that they are all reference - parameters. - */ - #define __refparam _Notref_ __deref __notreadonly - - /* - __inner_* - - Helper macros that directly correspond to certain high-level annotations. - - */ - - /* - Macros to classify the entrypoints and indicate their category. - - Pre-defined control point categories include: RPC, LPC, DeviceDriver, UserToKernel, ISAPI, COM. - - */ - #define __inner_control_entrypoint(category) _SA_annotes2(SAL_entrypoint, controlEntry, category) - - - /* - Pre-defined data entry point categories include: Registry, File, Network. 
- */ - #define __inner_data_entrypoint(category) _SA_annotes2(SAL_entrypoint, dataEntry, category) - - #define __inner_override _SA_annotes0(__override) - #define __inner_callback _SA_annotes0(__callback) - #define __inner_blocksOn(resource) _SA_annotes1(SAL_blocksOn, resource) - - #define __post_except_maybenull __post __inner_exceptthat _Maybenull_impl_ - #define __pre_except_maybenull __pre __inner_exceptthat _Maybenull_impl_ - - #define __post_deref_except_maybenull __post __deref __inner_exceptthat _Maybenull_impl_ - #define __pre_deref_except_maybenull __pre __deref __inner_exceptthat _Maybenull_impl_ - - #define __inexpressible_readableTo(size) _Readable_elements_impl_(_Inexpressible_(size)) - #define __inexpressible_writableTo(size) _Writable_elements_impl_(_Inexpressible_(size)) - - -#else // ][ -#ifndef PAL_STDCPP_COMPAT - #define __null - #define __notnull - #define __deref -#endif // !PAL_STDCPP_COMPAT - #define __maybenull - #define __readonly - #define __notreadonly - #define __maybereadonly - #define __valid - #define __notvalid - #define __maybevalid - #define __readableTo(extent) - #define __elem_readableTo(size) - #define __byte_readableTo(size) - #define __writableTo(size) - #define __elem_writableTo(size) - #define __byte_writableTo(size) - #define __pre - #define __post - #define __precond(expr) - #define __postcond(expr) - #define __exceptthat - #define __inner_override - #define __inner_callback - #define __inner_blocksOn(resource) - #define __refparam - #define __inner_control_entrypoint(category) - #define __inner_data_entrypoint(category) - - #define __post_except_maybenull - #define __pre_except_maybenull - #define __post_deref_except_maybenull - #define __pre_deref_except_maybenull - - #define __inexpressible_readableTo(size) - #define __inexpressible_writableTo(size) - -#endif /* #if !defined(__midl) && defined(_PREFAST_) */ // ] +#if !defined(_Post_writable_byte_size_) +#define _Post_writable_byte_size_(count) +#endif -/* -------------------------------------------------------------------------------- -Buffer Annotation Definitions +#if !defined(_Analysis_assume_) +#define _Analysis_assume_(expr) +#endif -Any of these may be used to directly annotate functions, but only one should -be used for each parameter. To determine which annotation to use for a given -buffer, use the table in the buffer annotations section. 
-------------------------------------------------------------------------------- -*/ +#if !defined(_Post_satisfies_) +#define _Post_satisfies_(expr) +#endif -#define __ecount(size) _SAL1_Source_(__ecount, (size), __notnull __elem_writableTo(size)) -#define __bcount(size) _SAL1_Source_(__bcount, (size), __notnull __byte_writableTo(size)) -#define __in_ecount(size) _SAL1_Source_(__in_ecount, (size), _In_reads_(size)) -#define __in_bcount(size) _SAL1_Source_(__in_bcount, (size), _In_reads_bytes_(size)) -#define __in_z _SAL1_Source_(__in_z, (), _In_z_) -#define __in_ecount_z(size) _SAL1_Source_(__in_ecount_z, (size), _In_reads_z_(size)) -#define __in_bcount_z(size) _SAL1_Source_(__in_bcount_z, (size), __in_bcount(size) __pre __nullterminated) -#define __in_nz _SAL1_Source_(__in_nz, (), __in) -#define __in_ecount_nz(size) _SAL1_Source_(__in_ecount_nz, (size), __in_ecount(size)) -#define __in_bcount_nz(size) _SAL1_Source_(__in_bcount_nz, (size), __in_bcount(size)) -#define __out_ecount(size) _SAL1_Source_(__out_ecount, (size), _Out_writes_(size)) -#define __out_bcount(size) _SAL1_Source_(__out_bcount, (size), _Out_writes_bytes_(size)) -#define __out_ecount_part(size,length) _SAL1_Source_(__out_ecount_part, (size,length), _Out_writes_to_(size,length)) -#define __out_bcount_part(size,length) _SAL1_Source_(__out_bcount_part, (size,length), _Out_writes_bytes_to_(size,length)) -#define __out_ecount_full(size) _SAL1_Source_(__out_ecount_full, (size), _Out_writes_all_(size)) -#define __out_bcount_full(size) _SAL1_Source_(__out_bcount_full, (size), _Out_writes_bytes_all_(size)) -#define __out_z _SAL1_Source_(__out_z, (), __post __valid __refparam __post __nullterminated) -#define __out_z_opt _SAL1_Source_(__out_z_opt, (), __post __valid __refparam __post __nullterminated __pre_except_maybenull) -#define __out_ecount_z(size) _SAL1_Source_(__out_ecount_z, (size), __ecount(size) __post __valid __refparam __post __nullterminated) -#define __out_bcount_z(size) _SAL1_Source_(__out_bcount_z, (size), __bcount(size) __post __valid __refparam __post __nullterminated) -#define __out_ecount_part_z(size,length) _SAL1_Source_(__out_ecount_part_z, (size,length), __out_ecount_part(size,length) __post __nullterminated) -#define __out_bcount_part_z(size,length) _SAL1_Source_(__out_bcount_part_z, (size,length), __out_bcount_part(size,length) __post __nullterminated) -#define __out_ecount_full_z(size) _SAL1_Source_(__out_ecount_full_z, (size), __out_ecount_full(size) __post __nullterminated) -#define __out_bcount_full_z(size) _SAL1_Source_(__out_bcount_full_z, (size), __out_bcount_full(size) __post __nullterminated) -#define __out_nz _SAL1_Source_(__out_nz, (), __post __valid __refparam) -#define __out_nz_opt _SAL1_Source_(__out_nz_opt, (), __post __valid __refparam __post_except_maybenull_) -#define __out_ecount_nz(size) _SAL1_Source_(__out_ecount_nz, (size), __ecount(size) __post __valid __refparam) -#define __out_bcount_nz(size) _SAL1_Source_(__out_bcount_nz, (size), __bcount(size) __post __valid __refparam) -#define __inout _SAL1_Source_(__inout, (), _Inout_) -#define __inout_ecount(size) _SAL1_Source_(__inout_ecount, (size), _Inout_updates_(size)) -#define __inout_bcount(size) _SAL1_Source_(__inout_bcount, (size), _Inout_updates_bytes_(size)) -#define __inout_ecount_part(size,length) _SAL1_Source_(__inout_ecount_part, (size,length), _Inout_updates_to_(size,length)) -#define __inout_bcount_part(size,length) _SAL1_Source_(__inout_bcount_part, (size,length), _Inout_updates_bytes_to_(size,length)) -#define 
__inout_ecount_full(size) _SAL1_Source_(__inout_ecount_full, (size), _Inout_updates_all_(size)) -#define __inout_bcount_full(size) _SAL1_Source_(__inout_bcount_full, (size), _Inout_updates_bytes_all_(size)) -#define __inout_z _SAL1_Source_(__inout_z, (), _Inout_z_) -#define __inout_ecount_z(size) _SAL1_Source_(__inout_ecount_z, (size), _Inout_updates_z_(size)) -#define __inout_bcount_z(size) _SAL1_Source_(__inout_bcount_z, (size), __inout_bcount(size) __pre __nullterminated __post __nullterminated) -#define __inout_nz _SAL1_Source_(__inout_nz, (), __inout) -#define __inout_ecount_nz(size) _SAL1_Source_(__inout_ecount_nz, (size), __inout_ecount(size)) -#define __inout_bcount_nz(size) _SAL1_Source_(__inout_bcount_nz, (size), __inout_bcount(size)) -#define __ecount_opt(size) _SAL1_Source_(__ecount_opt, (size), __ecount(size) __pre_except_maybenull) -#define __bcount_opt(size) _SAL1_Source_(__bcount_opt, (size), __bcount(size) __pre_except_maybenull) -#define __in_opt _SAL1_Source_(__in_opt, (), _In_opt_) -#define __in_ecount_opt(size) _SAL1_Source_(__in_ecount_opt, (size), _In_reads_opt_(size)) -#define __in_bcount_opt(size) _SAL1_Source_(__in_bcount_opt, (size), _In_reads_bytes_opt_(size)) -#define __in_z_opt _SAL1_Source_(__in_z_opt, (), _In_opt_z_) -#define __in_ecount_z_opt(size) _SAL1_Source_(__in_ecount_z_opt, (size), __in_ecount_opt(size) __pre __nullterminated) -#define __in_bcount_z_opt(size) _SAL1_Source_(__in_bcount_z_opt, (size), __in_bcount_opt(size) __pre __nullterminated) -#define __in_nz_opt _SAL1_Source_(__in_nz_opt, (), __in_opt) -#define __in_ecount_nz_opt(size) _SAL1_Source_(__in_ecount_nz_opt, (size), __in_ecount_opt(size)) -#define __in_bcount_nz_opt(size) _SAL1_Source_(__in_bcount_nz_opt, (size), __in_bcount_opt(size)) -#define __out_opt _SAL1_Source_(__out_opt, (), _Out_opt_) -#define __out_ecount_opt(size) _SAL1_Source_(__out_ecount_opt, (size), _Out_writes_opt_(size)) -#define __out_bcount_opt(size) _SAL1_Source_(__out_bcount_opt, (size), _Out_writes_bytes_opt_(size)) -#define __out_ecount_part_opt(size,length) _SAL1_Source_(__out_ecount_part_opt, (size,length), __out_ecount_part(size,length) __pre_except_maybenull) -#define __out_bcount_part_opt(size,length) _SAL1_Source_(__out_bcount_part_opt, (size,length), __out_bcount_part(size,length) __pre_except_maybenull) -#define __out_ecount_full_opt(size) _SAL1_Source_(__out_ecount_full_opt, (size), __out_ecount_full(size) __pre_except_maybenull) -#define __out_bcount_full_opt(size) _SAL1_Source_(__out_bcount_full_opt, (size), __out_bcount_full(size) __pre_except_maybenull) -#define __out_ecount_z_opt(size) _SAL1_Source_(__out_ecount_z_opt, (size), __out_ecount_opt(size) __post __nullterminated) -#define __out_bcount_z_opt(size) _SAL1_Source_(__out_bcount_z_opt, (size), __out_bcount_opt(size) __post __nullterminated) -#define __out_ecount_part_z_opt(size,length) _SAL1_Source_(__out_ecount_part_z_opt, (size,length), __out_ecount_part_opt(size,length) __post __nullterminated) -#define __out_bcount_part_z_opt(size,length) _SAL1_Source_(__out_bcount_part_z_opt, (size,length), __out_bcount_part_opt(size,length) __post __nullterminated) -#define __out_ecount_full_z_opt(size) _SAL1_Source_(__out_ecount_full_z_opt, (size), __out_ecount_full_opt(size) __post __nullterminated) -#define __out_bcount_full_z_opt(size) _SAL1_Source_(__out_bcount_full_z_opt, (size), __out_bcount_full_opt(size) __post __nullterminated) -#define __out_ecount_nz_opt(size) _SAL1_Source_(__out_ecount_nz_opt, (size), __out_ecount_opt(size) __post 
__nullterminated) -#define __out_bcount_nz_opt(size) _SAL1_Source_(__out_bcount_nz_opt, (size), __out_bcount_opt(size) __post __nullterminated) -#define __inout_opt _SAL1_Source_(__inout_opt, (), _Inout_opt_) -#define __inout_ecount_opt(size) _SAL1_Source_(__inout_ecount_opt, (size), __inout_ecount(size) __pre_except_maybenull) -#define __inout_bcount_opt(size) _SAL1_Source_(__inout_bcount_opt, (size), __inout_bcount(size) __pre_except_maybenull) -#define __inout_ecount_part_opt(size,length) _SAL1_Source_(__inout_ecount_part_opt, (size,length), __inout_ecount_part(size,length) __pre_except_maybenull) -#define __inout_bcount_part_opt(size,length) _SAL1_Source_(__inout_bcount_part_opt, (size,length), __inout_bcount_part(size,length) __pre_except_maybenull) -#define __inout_ecount_full_opt(size) _SAL1_Source_(__inout_ecount_full_opt, (size), __inout_ecount_full(size) __pre_except_maybenull) -#define __inout_bcount_full_opt(size) _SAL1_Source_(__inout_bcount_full_opt, (size), __inout_bcount_full(size) __pre_except_maybenull) -#define __inout_z_opt _SAL1_Source_(__inout_z_opt, (), __inout_opt __pre __nullterminated __post __nullterminated) -#define __inout_ecount_z_opt(size) _SAL1_Source_(__inout_ecount_z_opt, (size), __inout_ecount_opt(size) __pre __nullterminated __post __nullterminated) -#define __inout_ecount_z_opt(size) _SAL1_Source_(__inout_ecount_z_opt, (size), __inout_ecount_opt(size) __pre __nullterminated __post __nullterminated) -#define __inout_bcount_z_opt(size) _SAL1_Source_(__inout_bcount_z_opt, (size), __inout_bcount_opt(size)) -#define __inout_nz_opt _SAL1_Source_(__inout_nz_opt, (), __inout_opt) -#define __inout_ecount_nz_opt(size) _SAL1_Source_(__inout_ecount_nz_opt, (size), __inout_ecount_opt(size)) -#define __inout_bcount_nz_opt(size) _SAL1_Source_(__inout_bcount_nz_opt, (size), __inout_bcount_opt(size)) -#define __deref_ecount(size) _SAL1_Source_(__deref_ecount, (size), _Notref_ __ecount(1) __post _Notref_ __elem_readableTo(1) __post _Notref_ __deref _Notref_ __notnull __post __deref __elem_writableTo(size)) -#define __deref_bcount(size) _SAL1_Source_(__deref_bcount, (size), _Notref_ __ecount(1) __post _Notref_ __elem_readableTo(1) __post _Notref_ __deref _Notref_ __notnull __post __deref __byte_writableTo(size)) -#define __deref_out _SAL1_Source_(__deref_out, (), _Outptr_) -#define __deref_out_ecount(size) _SAL1_Source_(__deref_out_ecount, (size), _Outptr_result_buffer_(size)) -#define __deref_out_bcount(size) _SAL1_Source_(__deref_out_bcount, (size), _Outptr_result_bytebuffer_(size)) -#define __deref_out_ecount_part(size,length) _SAL1_Source_(__deref_out_ecount_part, (size,length), _Outptr_result_buffer_to_(size,length)) -#define __deref_out_bcount_part(size,length) _SAL1_Source_(__deref_out_bcount_part, (size,length), _Outptr_result_bytebuffer_to_(size,length)) -#define __deref_out_ecount_full(size) _SAL1_Source_(__deref_out_ecount_full, (size), __deref_out_ecount_part(size,size)) -#define __deref_out_bcount_full(size) _SAL1_Source_(__deref_out_bcount_full, (size), __deref_out_bcount_part(size,size)) -#define __deref_out_z _SAL1_Source_(__deref_out_z, (), _Outptr_result_z_) -#define __deref_out_ecount_z(size) _SAL1_Source_(__deref_out_ecount_z, (size), __deref_out_ecount(size) __post __deref __nullterminated) -#define __deref_out_bcount_z(size) _SAL1_Source_(__deref_out_bcount_z, (size), __deref_out_bcount(size) __post __deref __nullterminated) -#define __deref_out_nz _SAL1_Source_(__deref_out_nz, (), __deref_out) -#define __deref_out_ecount_nz(size) 
_SAL1_Source_(__deref_out_ecount_nz, (size), __deref_out_ecount(size)) -#define __deref_out_bcount_nz(size) _SAL1_Source_(__deref_out_bcount_nz, (size), __deref_out_ecount(size)) -#define __deref_inout _SAL1_Source_(__deref_inout, (), _Notref_ __notnull _Notref_ __elem_readableTo(1) __pre __deref __valid __post _Notref_ __deref __valid __refparam) -#define __deref_inout_z _SAL1_Source_(__deref_inout_z, (), __deref_inout __pre __deref __nullterminated __post _Notref_ __deref __nullterminated) -#define __deref_inout_ecount(size) _SAL1_Source_(__deref_inout_ecount, (size), __deref_inout __pre __deref __elem_writableTo(size) __post _Notref_ __deref __elem_writableTo(size)) -#define __deref_inout_bcount(size) _SAL1_Source_(__deref_inout_bcount, (size), __deref_inout __pre __deref __byte_writableTo(size) __post _Notref_ __deref __byte_writableTo(size)) -#define __deref_inout_ecount_part(size,length) _SAL1_Source_(__deref_inout_ecount_part, (size,length), __deref_inout_ecount(size) __pre __deref __elem_readableTo(length) __post __deref __elem_readableTo(length)) -#define __deref_inout_bcount_part(size,length) _SAL1_Source_(__deref_inout_bcount_part, (size,length), __deref_inout_bcount(size) __pre __deref __byte_readableTo(length) __post __deref __byte_readableTo(length)) -#define __deref_inout_ecount_full(size) _SAL1_Source_(__deref_inout_ecount_full, (size), __deref_inout_ecount_part(size,size)) -#define __deref_inout_bcount_full(size) _SAL1_Source_(__deref_inout_bcount_full, (size), __deref_inout_bcount_part(size,size)) -#define __deref_inout_ecount_z(size) _SAL1_Source_(__deref_inout_ecount_z, (size), __deref_inout_ecount(size) __pre __deref __nullterminated __post __deref __nullterminated) -#define __deref_inout_bcount_z(size) _SAL1_Source_(__deref_inout_bcount_z, (size), __deref_inout_bcount(size) __pre __deref __nullterminated __post __deref __nullterminated) -#define __deref_inout_nz _SAL1_Source_(__deref_inout_nz, (), __deref_inout) -#define __deref_inout_ecount_nz(size) _SAL1_Source_(__deref_inout_ecount_nz, (size), __deref_inout_ecount(size)) -#define __deref_inout_bcount_nz(size) _SAL1_Source_(__deref_inout_bcount_nz, (size), __deref_inout_ecount(size)) -#define __deref_ecount_opt(size) _SAL1_Source_(__deref_ecount_opt, (size), __deref_ecount(size) __post_deref_except_maybenull) -#define __deref_bcount_opt(size) _SAL1_Source_(__deref_bcount_opt, (size), __deref_bcount(size) __post_deref_except_maybenull) -#define __deref_out_opt _SAL1_Source_(__deref_out_opt, (), __deref_out __post_deref_except_maybenull) -#define __deref_out_ecount_opt(size) _SAL1_Source_(__deref_out_ecount_opt, (size), __deref_out_ecount(size) __post_deref_except_maybenull) -#define __deref_out_bcount_opt(size) _SAL1_Source_(__deref_out_bcount_opt, (size), __deref_out_bcount(size) __post_deref_except_maybenull) -#define __deref_out_ecount_part_opt(size,length) _SAL1_Source_(__deref_out_ecount_part_opt, (size,length), __deref_out_ecount_part(size,length) __post_deref_except_maybenull) -#define __deref_out_bcount_part_opt(size,length) _SAL1_Source_(__deref_out_bcount_part_opt, (size,length), __deref_out_bcount_part(size,length) __post_deref_except_maybenull) -#define __deref_out_ecount_full_opt(size) _SAL1_Source_(__deref_out_ecount_full_opt, (size), __deref_out_ecount_full(size) __post_deref_except_maybenull) -#define __deref_out_bcount_full_opt(size) _SAL1_Source_(__deref_out_bcount_full_opt, (size), __deref_out_bcount_full(size) __post_deref_except_maybenull) -#define __deref_out_z_opt 
_SAL1_Source_(__deref_out_z_opt, (), _Outptr_result_maybenull_z_) -#define __deref_out_ecount_z_opt(size) _SAL1_Source_(__deref_out_ecount_z_opt, (size), __deref_out_ecount_opt(size) __post __deref __nullterminated) -#define __deref_out_bcount_z_opt(size) _SAL1_Source_(__deref_out_bcount_z_opt, (size), __deref_out_bcount_opt(size) __post __deref __nullterminated) -#define __deref_out_nz_opt _SAL1_Source_(__deref_out_nz_opt, (), __deref_out_opt) -#define __deref_out_ecount_nz_opt(size) _SAL1_Source_(__deref_out_ecount_nz_opt, (size), __deref_out_ecount_opt(size)) -#define __deref_out_bcount_nz_opt(size) _SAL1_Source_(__deref_out_bcount_nz_opt, (size), __deref_out_bcount_opt(size)) -#define __deref_inout_opt _SAL1_Source_(__deref_inout_opt, (), __deref_inout __pre_deref_except_maybenull __post_deref_except_maybenull) -#define __deref_inout_ecount_opt(size) _SAL1_Source_(__deref_inout_ecount_opt, (size), __deref_inout_ecount(size) __pre_deref_except_maybenull __post_deref_except_maybenull) -#define __deref_inout_bcount_opt(size) _SAL1_Source_(__deref_inout_bcount_opt, (size), __deref_inout_bcount(size) __pre_deref_except_maybenull __post_deref_except_maybenull) -#define __deref_inout_ecount_part_opt(size,length) _SAL1_Source_(__deref_inout_ecount_part_opt, (size,length), __deref_inout_ecount_part(size,length) __pre_deref_except_maybenull __post_deref_except_maybenull) -#define __deref_inout_bcount_part_opt(size,length) _SAL1_Source_(__deref_inout_bcount_part_opt, (size,length), __deref_inout_bcount_part(size,length) __pre_deref_except_maybenull __post_deref_except_maybenull) -#define __deref_inout_ecount_full_opt(size) _SAL1_Source_(__deref_inout_ecount_full_opt, (size), __deref_inout_ecount_full(size) __pre_deref_except_maybenull __post_deref_except_maybenull) -#define __deref_inout_bcount_full_opt(size) _SAL1_Source_(__deref_inout_bcount_full_opt, (size), __deref_inout_bcount_full(size) __pre_deref_except_maybenull __post_deref_except_maybenull) -#define __deref_inout_z_opt _SAL1_Source_(__deref_inout_z_opt, (), __deref_inout_opt __pre __deref __nullterminated __post __deref __nullterminated) -#define __deref_inout_ecount_z_opt(size) _SAL1_Source_(__deref_inout_ecount_z_opt, (size), __deref_inout_ecount_opt(size) __pre __deref __nullterminated __post __deref __nullterminated) -#define __deref_inout_bcount_z_opt(size) _SAL1_Source_(__deref_inout_bcount_z_opt, (size), __deref_inout_bcount_opt(size) __pre __deref __nullterminated __post __deref __nullterminated) -#define __deref_inout_nz_opt _SAL1_Source_(__deref_inout_nz_opt, (), __deref_inout_opt) -#define __deref_inout_ecount_nz_opt(size) _SAL1_Source_(__deref_inout_ecount_nz_opt, (size), __deref_inout_ecount_opt(size)) -#define __deref_inout_bcount_nz_opt(size) _SAL1_Source_(__deref_inout_bcount_nz_opt, (size), __deref_inout_bcount_opt(size)) -#define __deref_opt_ecount(size) _SAL1_Source_(__deref_opt_ecount, (size), __deref_ecount(size) __pre_except_maybenull) -#define __deref_opt_bcount(size) _SAL1_Source_(__deref_opt_bcount, (size), __deref_bcount(size) __pre_except_maybenull) -#define __deref_opt_out _SAL1_Source_(__deref_opt_out, (), _Outptr_opt_) -#define __deref_opt_out_z _SAL1_Source_(__deref_opt_out_z, (), _Outptr_opt_result_z_) -#define __deref_opt_out_ecount(size) _SAL1_Source_(__deref_opt_out_ecount, (size), __deref_out_ecount(size) __pre_except_maybenull) -#define __deref_opt_out_bcount(size) _SAL1_Source_(__deref_opt_out_bcount, (size), __deref_out_bcount(size) __pre_except_maybenull) -#define 
__deref_opt_out_ecount_part(size,length) _SAL1_Source_(__deref_opt_out_ecount_part, (size,length), __deref_out_ecount_part(size,length) __pre_except_maybenull) -#define __deref_opt_out_bcount_part(size,length) _SAL1_Source_(__deref_opt_out_bcount_part, (size,length), __deref_out_bcount_part(size,length) __pre_except_maybenull) -#define __deref_opt_out_ecount_full(size) _SAL1_Source_(__deref_opt_out_ecount_full, (size), __deref_out_ecount_full(size) __pre_except_maybenull) -#define __deref_opt_out_bcount_full(size) _SAL1_Source_(__deref_opt_out_bcount_full, (size), __deref_out_bcount_full(size) __pre_except_maybenull) -#define __deref_opt_inout _SAL1_Source_(__deref_opt_inout, (), _Inout_opt_) -#define __deref_opt_inout_ecount(size) _SAL1_Source_(__deref_opt_inout_ecount, (size), __deref_inout_ecount(size) __pre_except_maybenull) -#define __deref_opt_inout_bcount(size) _SAL1_Source_(__deref_opt_inout_bcount, (size), __deref_inout_bcount(size) __pre_except_maybenull) -#define __deref_opt_inout_ecount_part(size,length) _SAL1_Source_(__deref_opt_inout_ecount_part, (size,length), __deref_inout_ecount_part(size,length) __pre_except_maybenull) -#define __deref_opt_inout_bcount_part(size,length) _SAL1_Source_(__deref_opt_inout_bcount_part, (size,length), __deref_inout_bcount_part(size,length) __pre_except_maybenull) -#define __deref_opt_inout_ecount_full(size) _SAL1_Source_(__deref_opt_inout_ecount_full, (size), __deref_inout_ecount_full(size) __pre_except_maybenull) -#define __deref_opt_inout_bcount_full(size) _SAL1_Source_(__deref_opt_inout_bcount_full, (size), __deref_inout_bcount_full(size) __pre_except_maybenull) -#define __deref_opt_inout_z _SAL1_Source_(__deref_opt_inout_z, (), __deref_opt_inout __pre __deref __nullterminated __post __deref __nullterminated) -#define __deref_opt_inout_ecount_z(size) _SAL1_Source_(__deref_opt_inout_ecount_z, (size), __deref_opt_inout_ecount(size) __pre __deref __nullterminated __post __deref __nullterminated) -#define __deref_opt_inout_bcount_z(size) _SAL1_Source_(__deref_opt_inout_bcount_z, (size), __deref_opt_inout_bcount(size) __pre __deref __nullterminated __post __deref __nullterminated) -#define __deref_opt_inout_nz _SAL1_Source_(__deref_opt_inout_nz, (), __deref_opt_inout) -#define __deref_opt_inout_ecount_nz(size) _SAL1_Source_(__deref_opt_inout_ecount_nz, (size), __deref_opt_inout_ecount(size)) -#define __deref_opt_inout_bcount_nz(size) _SAL1_Source_(__deref_opt_inout_bcount_nz, (size), __deref_opt_inout_bcount(size)) -#define __deref_opt_ecount_opt(size) _SAL1_Source_(__deref_opt_ecount_opt, (size), __deref_ecount_opt(size) __pre_except_maybenull) -#define __deref_opt_bcount_opt(size) _SAL1_Source_(__deref_opt_bcount_opt, (size), __deref_bcount_opt(size) __pre_except_maybenull) -#define __deref_opt_out_opt _SAL1_Source_(__deref_opt_out_opt, (), _Outptr_opt_result_maybenull_) -#define __deref_opt_out_ecount_opt(size) _SAL1_Source_(__deref_opt_out_ecount_opt, (size), __deref_out_ecount_opt(size) __pre_except_maybenull) -#define __deref_opt_out_bcount_opt(size) _SAL1_Source_(__deref_opt_out_bcount_opt, (size), __deref_out_bcount_opt(size) __pre_except_maybenull) -#define __deref_opt_out_ecount_part_opt(size,length) _SAL1_Source_(__deref_opt_out_ecount_part_opt, (size,length), __deref_out_ecount_part_opt(size,length) __pre_except_maybenull) -#define __deref_opt_out_bcount_part_opt(size,length) _SAL1_Source_(__deref_opt_out_bcount_part_opt, (size,length), __deref_out_bcount_part_opt(size,length) __pre_except_maybenull) -#define 
__deref_opt_out_ecount_full_opt(size) _SAL1_Source_(__deref_opt_out_ecount_full_opt, (size), __deref_out_ecount_full_opt(size) __pre_except_maybenull) -#define __deref_opt_out_bcount_full_opt(size) _SAL1_Source_(__deref_opt_out_bcount_full_opt, (size), __deref_out_bcount_full_opt(size) __pre_except_maybenull) -#define __deref_opt_out_z_opt _SAL1_Source_(__deref_opt_out_z_opt, (), __post __deref __valid __refparam __pre_except_maybenull __pre_deref_except_maybenull __post_deref_except_maybenull __post __deref __nullterminated) -#define __deref_opt_out_ecount_z_opt(size) _SAL1_Source_(__deref_opt_out_ecount_z_opt, (size), __deref_opt_out_ecount_opt(size) __post __deref __nullterminated) -#define __deref_opt_out_bcount_z_opt(size) _SAL1_Source_(__deref_opt_out_bcount_z_opt, (size), __deref_opt_out_bcount_opt(size) __post __deref __nullterminated) -#define __deref_opt_out_nz_opt _SAL1_Source_(__deref_opt_out_nz_opt, (), __deref_opt_out_opt) -#define __deref_opt_out_ecount_nz_opt(size) _SAL1_Source_(__deref_opt_out_ecount_nz_opt, (size), __deref_opt_out_ecount_opt(size)) -#define __deref_opt_out_bcount_nz_opt(size) _SAL1_Source_(__deref_opt_out_bcount_nz_opt, (size), __deref_opt_out_bcount_opt(size)) -#define __deref_opt_inout_opt _SAL1_Source_(__deref_opt_inout_opt, (), __deref_inout_opt __pre_except_maybenull) -#define __deref_opt_inout_ecount_opt(size) _SAL1_Source_(__deref_opt_inout_ecount_opt, (size), __deref_inout_ecount_opt(size) __pre_except_maybenull) -#define __deref_opt_inout_bcount_opt(size) _SAL1_Source_(__deref_opt_inout_bcount_opt, (size), __deref_inout_bcount_opt(size) __pre_except_maybenull) -#define __deref_opt_inout_ecount_part_opt(size,length) _SAL1_Source_(__deref_opt_inout_ecount_part_opt, (size,length), __deref_inout_ecount_part_opt(size,length) __pre_except_maybenull) -#define __deref_opt_inout_bcount_part_opt(size,length) _SAL1_Source_(__deref_opt_inout_bcount_part_opt, (size,length), __deref_inout_bcount_part_opt(size,length) __pre_except_maybenull) -#define __deref_opt_inout_ecount_full_opt(size) _SAL1_Source_(__deref_opt_inout_ecount_full_opt, (size), __deref_inout_ecount_full_opt(size) __pre_except_maybenull) -#define __deref_opt_inout_bcount_full_opt(size) _SAL1_Source_(__deref_opt_inout_bcount_full_opt, (size), __deref_inout_bcount_full_opt(size) __pre_except_maybenull) -#define __deref_opt_inout_z_opt _SAL1_Source_(__deref_opt_inout_z_opt, (), __deref_opt_inout_opt __pre __deref __nullterminated __post __deref __nullterminated) -#define __deref_opt_inout_ecount_z_opt(size) _SAL1_Source_(__deref_opt_inout_ecount_z_opt, (size), __deref_opt_inout_ecount_opt(size) __pre __deref __nullterminated __post __deref __nullterminated) -#define __deref_opt_inout_bcount_z_opt(size) _SAL1_Source_(__deref_opt_inout_bcount_z_opt, (size), __deref_opt_inout_bcount_opt(size) __pre __deref __nullterminated __post __deref __nullterminated) -#define __deref_opt_inout_nz_opt _SAL1_Source_(__deref_opt_inout_nz_opt, (), __deref_opt_inout_opt) -#define __deref_opt_inout_ecount_nz_opt(size) _SAL1_Source_(__deref_opt_inout_ecount_nz_opt, (size), __deref_opt_inout_ecount_opt(size)) -#define __deref_opt_inout_bcount_nz_opt(size) _SAL1_Source_(__deref_opt_inout_bcount_nz_opt, (size), __deref_opt_inout_bcount_opt(size)) +#if !defined(_Post_invalid_) +#define _Post_invalid_ +#endif -/* -------------------------------------------------------------------------------- -Advanced Annotation Definitions +#if !defined(_Post_valid_) +#define _Post_valid_ +#endif -Any of these may be used to directly 
annotate functions, and may be used in -combination with each other or with regular buffer macros. For an explanation -of each annotation, see the advanced annotations section. -------------------------------------------------------------------------------- -*/ +#if !defined(_Pre_notnull_) +#define _Pre_notnull_ +#endif -#define __success(expr) _Success_(expr) -#define __nullterminated _Null_terminated_ -#define __nullnullterminated -#define __clr_reserved _SAL1_Source_(__reserved, (), _Reserved_) -#define __checkReturn _SAL1_Source_(__checkReturn, (), _Check_return_) -#define __typefix(ctype) _SAL1_Source_(__typefix, (ctype), __inner_typefix(ctype)) -#define __override __inner_override -#define __callback __inner_callback -#define __format_string _Printf_format_string_ -#define __blocksOn(resource) __inner_blocksOn(resource) -#define __control_entrypoint(category) __inner_control_entrypoint(category) -#define __data_entrypoint(category) __inner_data_entrypoint(category) -#define __useHeader _Use_decl_anno_impl_ -#define __on_failure(annotes) _On_failure_impl_(annotes _SAL_nop_impl_) - -#ifndef __has_cpp_attribute -#define __has_cpp_attribute(x) (0) -#endif - -#ifndef __fallthrough // [ -#if __has_cpp_attribute(fallthrough) -#define __fallthrough [[fallthrough]] -#else -#define __fallthrough -#endif -#endif // ] - -#ifndef __analysis_assume // [ -#ifdef _PREFAST_ // [ -#define __analysis_assume(expr) __assume(expr) -#else // ][ -#define __analysis_assume(expr) -#endif // ] -#endif // ] - -#ifndef _Analysis_assume_ // [ -#ifdef _PREFAST_ // [ -#define _Analysis_assume_(expr) __assume(expr) -#else // ][ -#define _Analysis_assume_(expr) -#endif // ] -#endif // ] +#if !defined(_When_) +#define _When_(expr1, expr2) +#endif -#define _Analysis_noreturn_ _SAL2_Source_(_Analysis_noreturn_, (), _SA_annotes0(SAL_terminates)) +#define _Deref_pre_z_ -#ifdef _PREFAST_ // [ -__inline __nothrow -void __AnalysisAssumeNullterminated(_Post_ __nullterminated void *p); +#if !defined(_Inout_updates_) +#define _Inout_updates_(count) +#endif -#define _Analysis_assume_nullterminated_(x) __AnalysisAssumeNullterminated(x) -#else // ][ -#define _Analysis_assume_nullterminated_(x) -#endif // ] +#if !defined(_Out_writes_opt_z_) +#define _Out_writes_opt_z_(count) +#endif -// -// Set the analysis mode (global flags to analysis). -// They take effect at the point of declaration; use at global scope -// as a declaration. -// +#if !defined(_Out_cap_post_count_) +#define _Out_cap_post_count_(maxLen,used) +#endif -// Synthesize a unique symbol. -#define ___MKID(x, y) x ## y -#define __MKID(x, y) ___MKID(x, y) -#define __GENSYM(x) __MKID(x, __COUNTER__) +#if !defined(_Inout_count_) +#define _Inout_count_(size) +#endif -__ANNOTATION(SAL_analysisMode(__AuToQuOtE __In_impl_ char *mode);) +#if !defined(_Post_equal_to_) +#define _Post_equal_to_(expr) +#endif -#define _Analysis_mode_impl_(mode) _SA_annotes1(SAL_analysisMode, #mode) +#if !defined(_Field_size_bytes_) +# define _Field_size_bytes_(count) +#endif -#define _Analysis_mode_(mode) \ - typedef _Analysis_mode_impl_(mode) int \ - __GENSYM(__prefast_analysis_mode_flag); +#if !defined(_Field_range_) +# define _Field_range_(count, capacity) +#endif -// The following are predefined: -// _Analysis_operator_new_throw_ (operator new throws) -// _Analysis_operator_new_null_ (operator new returns null) -// _Analysis_operator_new_never_fails_ (operator new never fails) -// +#if !defined(_Post_z_) +# define _Post_z_ +#endif -// Function class annotations. 
-__ANNOTATION(SAL_functionClassNew(__In_impl_ char*);) -__PRIMOP(int, _In_function_class_(__In_impl_ char*);) -#define _In_function_class_(x) _In_function_class_(#x) +#if !defined(_Outptr_result_buffer_) +# define _Outptr_result_buffer_(count) +#endif -#define _Function_class_(x) _SA_annotes1(SAL_functionClassNew, #x) +#if !defined(_Field_size_) +# define _Field_size_(count) +#endif -/* - * interlocked operand used in interlocked instructions - */ -//#define _Interlocked_operand_ _Pre_ _SA_annotes0(SAL_interlocked) +#if !defined(_Always_) +# define _Always_(expr) +#endif -#define _Enum_is_bitflag_ _SA_annotes0(SAL_enumIsBitflag) -#define _Strict_type_match_ _SA_annotes0(SAL_strictType2) +#if !defined(_Readable_bytes_) +# define _Readable_bytes_(count) +#endif -#define _Maybe_raises_SEH_exception_ _Pre_ _SA_annotes1(SAL_inTry,__yes) -#define _Raises_SEH_exception_ _Group_(_Maybe_raises_SEH_exception_ _Analysis_noreturn_) +#if !defined(_Outptr_result_buffer_maybenull_) +# define _Outptr_result_buffer_maybenull_(count) +#endif -#ifdef __cplusplus // [ -} -#endif // ] +#endif /* _SAL_h */ diff --git a/src/thirdparty/sse_mathfun/sse_mathfun.h b/src/thirdparty/sse_mathfun/sse_mathfun.h new file mode 100644 index 000000000..24c3394a9 --- /dev/null +++ b/src/thirdparty/sse_mathfun/sse_mathfun.h @@ -0,0 +1,710 @@ +/* SIMD (SSE1+MMX or SSE2) implementation of sin, cos, exp and log + + Inspired by Intel Approximate Math library, and based on the + corresponding algorithms of the cephes math library + + The default is to use the SSE1 version. If you define USE_SSE2 the + the SSE2 intrinsics will be used in place of the MMX intrinsics. Do + not expect any significant performance improvement with SSE2. +*/ + +/* Copyright (C) 2007 Julien Pommier + + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you must not + claim that you wrote the original software. If you use this software + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + 2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original software. + 3. This notice may not be removed or altered from any source distribution. + + (this is the zlib license) +*/ + +#include + +/* yes I know, the top of this file is quite ugly */ + +#ifdef _MSC_VER /* visual c++ */ +# define ALIGN16_BEG __declspec(align(16)) +# define ALIGN16_END +#else /* gcc or icc */ +# define ALIGN16_BEG +# define ALIGN16_END __attribute__((aligned(16))) +#endif + +/* __m128 is ugly to write */ +typedef __m128 v4sf; // vector of 4 float (sse1) + +#ifdef USE_SSE2 +# include +typedef __m128i v4si; // vector of 4 int (sse2) +#else +typedef __m64 v2si; // vector of 2 int (mmx) +#endif + +/* declare some SSE constants -- why can't I figure a better way to do that? 
*/ +#define _PS_CONST(Name, Val) \ + static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = { (float)(Val), (float)(Val), (float)(Val), (float)(Val) } +#define _PI32_CONST(Name, Val) \ + static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END = { (Val), (Val), (Val), (Val) } +#define _PS_CONST_TYPE(Name, Type, Val) \ + static const ALIGN16_BEG Type _ps_##Name[4] ALIGN16_END = { (Type)(Val), (Type)(Val), (Type)(Val), (Type)(Val) } + +_PS_CONST(1 , 1.0f); +_PS_CONST(0p5, 0.5f); +/* the smallest non denormalized float number */ +_PS_CONST_TYPE(min_norm_pos, int, 0x00800000); +_PS_CONST_TYPE(mant_mask, int, 0x7f800000); +_PS_CONST_TYPE(inv_mant_mask, int, ~0x7f800000); + +_PS_CONST_TYPE(sign_mask, int, (int)0x80000000); +_PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000); + +_PI32_CONST(1, 1); +_PI32_CONST(inv1, ~1); +_PI32_CONST(2, 2); +_PI32_CONST(4, 4); +_PI32_CONST(0x7f, 0x7f); + +_PS_CONST(cephes_SQRTHF, 0.707106781186547524); +_PS_CONST(cephes_log_p0, 7.0376836292E-2); +_PS_CONST(cephes_log_p1, - 1.1514610310E-1); +_PS_CONST(cephes_log_p2, 1.1676998740E-1); +_PS_CONST(cephes_log_p3, -1.2420140846E-1); +_PS_CONST(cephes_log_p4, +1.4249322787E-1); +_PS_CONST(cephes_log_p5, -1.6668057665E-1); +_PS_CONST(cephes_log_p6, +2.0000714765E-1); +_PS_CONST(cephes_log_p7, -2.4999993993E-1); +_PS_CONST(cephes_log_p8, +3.3333331174E-1); +_PS_CONST(cephes_log_q1, -2.12194440e-4); +_PS_CONST(cephes_log_q2, 0.693359375); + +#ifndef USE_SSE2 +typedef union xmm_mm_union { + __m128 xmm; + __m64 mm[2]; +} xmm_mm_union; + +#define COPY_XMM_TO_MM(xmm_, mm0_, mm1_) { \ + xmm_mm_union u; u.xmm = xmm_; \ + mm0_ = u.mm[0]; \ + mm1_ = u.mm[1]; \ +} + +#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) { \ + xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm; \ + } + +#endif // USE_SSE2 + +/* natural logarithm computed for 4 simultaneous float + return NaN for x <= 0 +*/ +inline v4sf log_ps(v4sf x) { +#ifdef USE_SSE2 + v4si emm0; +#else + v2si mm0, mm1; +#endif + v4sf one = *(v4sf*)_ps_1; + + v4sf invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps()); + + x = _mm_max_ps(x, *(v4sf*)_ps_min_norm_pos); /* cut off denormalized stuff */ + +#ifndef USE_SSE2 + /* part 1: x = frexpf(x, &e); */ + COPY_XMM_TO_MM(x, mm0, mm1); + mm0 = _mm_srli_pi32(mm0, 23); + mm1 = _mm_srli_pi32(mm1, 23); +#else + emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23); +#endif + /* keep only the fractional part */ + x = _mm_and_ps(x, *(v4sf*)_ps_inv_mant_mask); + x = _mm_or_ps(x, *(v4sf*)_ps_0p5); + +#ifndef USE_SSE2 + /* now e=mm0:mm1 contain the really base-2 exponent */ + mm0 = _mm_sub_pi32(mm0, *(v2si*)_pi32_0x7f); + mm1 = _mm_sub_pi32(mm1, *(v2si*)_pi32_0x7f); + v4sf e = _mm_cvtpi32x2_ps(mm0, mm1); + _mm_empty(); /* bye bye mmx */ +#else + emm0 = _mm_sub_epi32(emm0, *(v4si*)_pi32_0x7f); + v4sf e = _mm_cvtepi32_ps(emm0); +#endif + + e = _mm_add_ps(e, one); + + /* part2: + if( x < SQRTHF ) { + e -= 1; + x = x + x - 1.0; + } else { x = x - 1.0; } + */ + v4sf mask = _mm_cmplt_ps(x, *(v4sf*)_ps_cephes_SQRTHF); + v4sf tmp = _mm_and_ps(x, mask); + x = _mm_sub_ps(x, one); + e = _mm_sub_ps(e, _mm_and_ps(one, mask)); + x = _mm_add_ps(x, tmp); + + + v4sf z = _mm_mul_ps(x,x); + + v4sf y = *(v4sf*)_ps_cephes_log_p0; + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p1); + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p2); + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p3); + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p4); + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, 
*(v4sf*)_ps_cephes_log_p5); + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p6); + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p7); + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p8); + y = _mm_mul_ps(y, x); + + y = _mm_mul_ps(y, z); + + + tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q1); + y = _mm_add_ps(y, tmp); + + + tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5); + y = _mm_sub_ps(y, tmp); + + tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q2); + x = _mm_add_ps(x, y); + x = _mm_add_ps(x, tmp); + x = _mm_or_ps(x, invalid_mask); // negative arg will be NAN + return x; +} + +_PS_CONST(exp_hi, 88.3762626647949f); +_PS_CONST(exp_lo, -88.3762626647949f); + +_PS_CONST(cephes_LOG2EF, 1.44269504088896341); +_PS_CONST(cephes_exp_C1, 0.693359375); +_PS_CONST(cephes_exp_C2, -2.12194440e-4); + +_PS_CONST(cephes_exp_p0, 1.9875691500E-4); +_PS_CONST(cephes_exp_p1, 1.3981999507E-3); +_PS_CONST(cephes_exp_p2, 8.3334519073E-3); +_PS_CONST(cephes_exp_p3, 4.1665795894E-2); +_PS_CONST(cephes_exp_p4, 1.6666665459E-1); +_PS_CONST(cephes_exp_p5, 5.0000001201E-1); + +inline v4sf exp_ps(v4sf x) { + v4sf tmp = _mm_setzero_ps(), fx; +#ifdef USE_SSE2 + v4si emm0; +#else + v2si mm0, mm1; +#endif + v4sf one = *(v4sf*)_ps_1; + + x = _mm_min_ps(x, *(v4sf*)_ps_exp_hi); + x = _mm_max_ps(x, *(v4sf*)_ps_exp_lo); + + /* express exp(x) as exp(g + n*log(2)) */ + fx = _mm_mul_ps(x, *(v4sf*)_ps_cephes_LOG2EF); + fx = _mm_add_ps(fx, *(v4sf*)_ps_0p5); + + /* how to perform a floorf with SSE: just below */ +#ifndef USE_SSE2 + /* step 1 : cast to int */ + tmp = _mm_movehl_ps(tmp, fx); + mm0 = _mm_cvttps_pi32(fx); + mm1 = _mm_cvttps_pi32(tmp); + /* step 2 : cast back to float */ + tmp = _mm_cvtpi32x2_ps(mm0, mm1); +#else + emm0 = _mm_cvttps_epi32(fx); + tmp = _mm_cvtepi32_ps(emm0); +#endif + /* if greater, substract 1 */ + v4sf mask = _mm_cmpgt_ps(tmp, fx); + mask = _mm_and_ps(mask, one); + fx = _mm_sub_ps(tmp, mask); + + tmp = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C1); + v4sf z = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C2); + x = _mm_sub_ps(x, tmp); + x = _mm_sub_ps(x, z); + + z = _mm_mul_ps(x,x); + + v4sf y = *(v4sf*)_ps_cephes_exp_p0; + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p1); + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p2); + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p3); + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p4); + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p5); + y = _mm_mul_ps(y, z); + y = _mm_add_ps(y, x); + y = _mm_add_ps(y, one); + + /* build 2^n */ +#ifndef USE_SSE2 + z = _mm_movehl_ps(z, fx); + mm0 = _mm_cvttps_pi32(fx); + mm1 = _mm_cvttps_pi32(z); + mm0 = _mm_add_pi32(mm0, *(v2si*)_pi32_0x7f); + mm1 = _mm_add_pi32(mm1, *(v2si*)_pi32_0x7f); + mm0 = _mm_slli_pi32(mm0, 23); + mm1 = _mm_slli_pi32(mm1, 23); + + v4sf pow2n; + COPY_MM_TO_XMM(mm0, mm1, pow2n); + _mm_empty(); +#else + emm0 = _mm_cvttps_epi32(fx); + emm0 = _mm_add_epi32(emm0, *(v4si*)_pi32_0x7f); + emm0 = _mm_slli_epi32(emm0, 23); + v4sf pow2n = _mm_castsi128_ps(emm0); +#endif + y = _mm_mul_ps(y, pow2n); + return y; +} + +_PS_CONST(minus_cephes_DP1, -0.78515625); +_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4); +_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8); +_PS_CONST(sincof_p0, -1.9515295891E-4); +_PS_CONST(sincof_p1, 8.3321608736E-3); +_PS_CONST(sincof_p2, -1.6666654611E-1); +_PS_CONST(coscof_p0, 2.443315711809948E-005); +_PS_CONST(coscof_p1, -1.388731625493765E-003); 
+_PS_CONST(coscof_p2, 4.166664568298827E-002); +_PS_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI + + +/* evaluation of 4 sines at onces, using only SSE1+MMX intrinsics so + it runs also on old athlons XPs and the pentium III of your grand + mother. + + The code is the exact rewriting of the cephes sinf function. + Precision is excellent as long as x < 8192 (I did not bother to + take into account the special handling they have for greater values + -- it does not return garbage for arguments over 8192, though, but + the extra precision is missing). + + Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the + surprising but correct result. + + Performance is also surprisingly good, 1.33 times faster than the + macos vsinf SSE2 function, and 1.5 times faster than the + __vrs4_sinf of amd's ACML (which is only available in 64 bits). Not + too bad for an SSE1 function (with no special tuning) ! + However the latter libraries probably have a much better handling of NaN, + Inf, denormalized and other special arguments.. + + On my core 1 duo, the execution of this function takes approximately 95 cycles. + + From what I have observed on the experiments with Intel AMath lib, switching to an + SSE2 version would improve the perf by only 10%. + + Since it is based on SSE intrinsics, it has to be compiled at -O2 to + deliver full speed. +*/ +inline v4sf sin_ps(v4sf x) { // any x + v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y; + +#ifdef USE_SSE2 + v4si emm0, emm2; +#else + v2si mm0, mm1, mm2, mm3; +#endif + sign_bit = x; + /* take the absolute value */ + x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask); + /* extract the sign bit (upper one) */ + sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask); + + /* scale by 4/Pi */ + y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI); + +#ifdef USE_SSE2 + /* store the integer part of y in mm0 */ + emm2 = _mm_cvttps_epi32(y); + /* j=(j+1) & (~1) (see the cephes sources) */ + emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1); + emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1); + y = _mm_cvtepi32_ps(emm2); + + /* get the swap sign flag */ + emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4); + emm0 = _mm_slli_epi32(emm0, 29); + /* get the polynom selection mask + there is one polynom for 0 <= x <= Pi/4 + and another one for Pi/4CreateCmdLine( argc, argv ); InitDefaultFileSystem(); InstallProgressReportHandler( PrintFReportHandler ); diff --git a/src/tier3/mdlutils.cpp b/src/tier3/mdlutils.cpp index afe872941..553a297f3 100644 --- a/src/tier3/mdlutils.cpp +++ b/src/tier3/mdlutils.cpp @@ -292,7 +292,7 @@ void CMDL::SetUpBones( const matrix3x4_t& rootToWorld, int nMaxBoneCount, matrix flCycle -= (int)(flCycle); Vector pos[MAXSTUDIOBONES]; - Quaternion q[MAXSTUDIOBONES]; + QuaternionAligned q[MAXSTUDIOBONES]; IBoneSetup boneSetup( &studioHdr, BONE_USED_BY_ANYTHING_AT_LOD( m_nLOD ), pPoseParameter, NULL ); boneSetup.InitPose( pos, q ); @@ -420,7 +420,7 @@ void CMDL::SetupBonesWithBoneMerge( const CStudioHdr *pMergeHdr, matrix3x4_t *pM flCycle -= (int)(flCycle); Vector pos[MAXSTUDIOBONES]; - Quaternion q[MAXSTUDIOBONES]; + QuaternionAligned q[MAXSTUDIOBONES]; IBoneSetup boneSetup( pMergeHdr, BONE_USED_BY_ANYTHING_AT_LOD( m_nLOD ), pPoseParameter ); boneSetup.InitPose( pos, q ); From b5a943fe822ee0a19a1e5d94887fbd281aa4a0fb Mon Sep 17 00:00:00 2001 From: mastercoms Date: Sun, 5 Mar 2023 00:20:04 -0500 Subject: [PATCH 06/42] perf: backport CalcBones optimizations from CS:GO * lower LODs are now marked as set up * align parent transform matrix --- 
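A rough sketch of what the first bullet relies on (illustrative only -- BONE_USED_BY_VERTEX_LOD0 and MAX_NUM_LODS are the studio.h names; lower LODs always use a subset of the bones of higher LODs, so every per-LOD bit above the first one requested can be marked as satisfied by the same setup):

// Not a new API -- this just restates the logic the hunk below adds to C_BaseAnimating::SetupBones.
int PropagateLowerLodBoneBits( int boneMask )
{
	int nLOD = 0;
	int nMask = BONE_USED_BY_VERTEX_LOD0;	// first per-LOD "used by vertex" bit
	for ( ; nLOD < MAX_NUM_LODS; ++nLOD, nMask <<= 1 )
	{
		if ( boneMask & nMask )
			break;			// highest-detail LOD actually requested
	}
	for ( ; nLOD < MAX_NUM_LODS; ++nLOD, nMask <<= 1 )
	{
		boneMask |= nMask;	// mark every lower-detail LOD as set up too
	}
	return boneMask;
}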
src/game/client/c_baseanimating.cpp | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/src/game/client/c_baseanimating.cpp b/src/game/client/c_baseanimating.cpp index c78f8582f..a6dda9b98 100644 --- a/src/game/client/c_baseanimating.cpp +++ b/src/game/client/c_baseanimating.cpp @@ -2837,6 +2837,20 @@ bool C_BaseAnimating::SetupBones( matrix3x4_t *pBoneToWorldOut, int nMaxBones, i } } + // If we're setting up LOD N, we have set up all lower LODs also + // because lower LODs always use subsets of the bones of higher LODs. + int nLOD = 0; + int nMask = BONE_USED_BY_VERTEX_LOD0; + for( ; nLOD < MAX_NUM_LODS; ++nLOD, nMask <<= 1 ) + { + if ( boneMask & nMask ) + break; + } + for( ; nLOD < MAX_NUM_LODS; ++nLOD, nMask <<= 1 ) + { + boneMask |= nMask; + } + #ifdef DEBUG_BONE_SETUP_THREADING if ( cl_warn_thread_contested_bone_setup.GetBool() ) { @@ -2904,7 +2918,7 @@ bool C_BaseAnimating::SetupBones( matrix3x4_t *pBoneToWorldOut, int nMaxBones, i return false; // Setup our transform based on render angles and origin. - matrix3x4_t parentTransform; + ALIGN16 matrix3x4_t parentTransform ALIGN16_POST; AngleMatrix( GetRenderAngles(), GetRenderOrigin(), parentTransform ); // Load the boneMask with the total of what was asked for last frame. @@ -2974,6 +2988,7 @@ bool C_BaseAnimating::SetupBones( matrix3x4_t *pBoneToWorldOut, int nMaxBones, i StandardBlendingRules( hdr, pos, q, currentTime, bonesMaskNeedRecalc ); CBoneBitList boneComputed; + // don't calculate IK on ragdolls if ( m_pIk && !IsRagdoll() ) { From 712ff518b37adb4b42550f36a0a6386b37e10a96 Mon Sep 17 00:00:00 2001 From: mastercoms Date: Sun, 5 Mar 2023 00:22:13 -0500 Subject: [PATCH 07/42] perf: add animation attachment deferral this skips a pre-mature/extra bone setup during particle simulation and flexing by allowing for the last frame's attachment position to be used technically this is a bit of a hack but it works well from my analysis. it gets rid of almost all of the particle cost in team fights besides sprite rendering --- src/game/client/c_baseanimating.cpp | 32 +++++++++++++++++++++--- src/game/client/c_baseanimating.h | 1 + src/game/client/c_baseflex.cpp | 2 +- src/game/client/tf/c_tf_player.cpp | 15 +++++++++++ src/game/shared/baseviewmodel_shared.cpp | 9 +++++++ src/game/shared/baseviewmodel_shared.h | 1 + src/game/shared/econ/econ_entity.cpp | 8 ++++++ src/game/shared/econ/econ_entity.h | 1 + src/game/shared/particle_property.cpp | 4 +-- 9 files changed, 66 insertions(+), 7 deletions(-) diff --git a/src/game/client/c_baseanimating.cpp b/src/game/client/c_baseanimating.cpp index a6dda9b98..3fdc65dcf 100644 --- a/src/game/client/c_baseanimating.cpp +++ b/src/game/client/c_baseanimating.cpp @@ -1096,6 +1096,7 @@ CStudioHdr *C_BaseAnimating::OnNewModel() } } m_BoneAccessor.Init( this, m_CachedBoneData.Base() ); // Always call this in case the studiohdr_t has changed. + m_iAccumulatedBoneMask = 0; // Reset the accumulated bone mask. 
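	// Presumably needed because of the deferral change later in this patch: the accumulated
	// mask now keeps its BONE_USED_BY_ATTACHMENT bit across frames, so a model swap should
	// start from a clean mask rather than inherit the previous model's bits.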
// Free any IK data if (m_pIk) @@ -2033,18 +2034,24 @@ bool C_BaseAnimating::PutAttachment( int number, const matrix3x4_t &attachmentTo return false; CAttachmentData *pAtt = &m_Attachments[number-1]; - if ( gpGlobals->frametime > 0 && pAtt->m_nLastFramecount > 0 && pAtt->m_nLastFramecount == gpGlobals->framecount - 1 ) + if ( gpGlobals->frametime > 0 && pAtt->m_nLastFramecount > 0 && pAtt->m_nLastFramecount < gpGlobals->framecount ) { Vector vecPreviousOrigin, vecOrigin; MatrixPosition( pAtt->m_AttachmentToWorld, vecPreviousOrigin ); MatrixPosition( attachmentToWorld, vecOrigin ); - pAtt->m_vOriginVelocity = (vecOrigin - vecPreviousOrigin) / gpGlobals->frametime; + // compensate for the fact that the previous origin could have been multiple frames behind + pAtt->m_vOriginVelocity = (vecOrigin - vecPreviousOrigin) / (gpGlobals->frametime * (gpGlobals->framecount - pAtt->m_nLastFramecount)); + // only update the frame count if the position changed, so we don't have to recompute attachments + if ( !pAtt->m_vOriginVelocity.IsZero(0.00001f) ) + { + pAtt->m_nLastFramecount = gpGlobals->framecount; + } } else { pAtt->m_vOriginVelocity.Init(); + pAtt->m_nLastFramecount = gpGlobals->framecount; } - pAtt->m_nLastFramecount = gpGlobals->framecount; pAtt->m_bAnglesComputed = false; pAtt->m_AttachmentToWorld = attachmentToWorld; @@ -2108,6 +2115,21 @@ bool C_BaseAnimating::GetAttachment( const char *szName, Vector &absOrigin, QAng return GetAttachment( LookupAttachment( szName ), absOrigin, absAngles ); } +bool C_BaseAnimating::GetAttachmentDeferred( int number, matrix3x4_t& matrix ) +{ + if (number < 1 || number > m_Attachments.Count()) + return false; + + // allow visual effects (eg. particles) to be a frame behind bone setup so that there are not messy dependencies. + CAttachmentData* pAtt = &m_Attachments[number - 1]; + const bool bShouldUpdate = pAtt->m_nLastFramecount < gpGlobals->framecount - 1; + if ( bShouldUpdate && !CalcAttachments() ) + return false; + + matrix = pAtt->m_AttachmentToWorld; + return true; +} + //----------------------------------------------------------------------------- // Purpose: Get attachment point by index // Input : number - which point @@ -2883,7 +2905,9 @@ bool C_BaseAnimating::SetupBones( matrix3x4_t *pBoneToWorldOut, int nMaxBones, i m_flLastBoneSetupTime = currentTime; } m_iPrevBoneMask = m_iAccumulatedBoneMask; - m_iAccumulatedBoneMask = 0; + // Keep record of the fact that we've used attachments. Because of deferred attachments, we can't keep track from the previous frame. + //m_iAccumulatedBoneMask = 0; + m_iAccumulatedBoneMask = m_iAccumulatedBoneMask & BONE_USED_BY_ATTACHMENT; #ifdef STUDIO_ENABLE_PERF_COUNTERS CStudioHdr *hdr = GetModelPtr(); diff --git a/src/game/client/c_baseanimating.h b/src/game/client/c_baseanimating.h index 1c8a74b30..6aa79794b 100644 --- a/src/game/client/c_baseanimating.h +++ b/src/game/client/c_baseanimating.h @@ -265,6 +265,7 @@ class C_BaseAnimating : public C_BaseEntity, private IModelLoadCallback // Attachments. 
bool GetAttachment( const char *szName, Vector &absOrigin ); bool GetAttachment( const char *szName, Vector &absOrigin, QAngle &absAngles ); + virtual bool GetAttachmentDeferred( int number, matrix3x4_t &matrix ); // Inherited from C_BaseEntity virtual bool GetAttachment( int number, Vector &origin ); diff --git a/src/game/client/c_baseflex.cpp b/src/game/client/c_baseflex.cpp index a9bebdec1..8a52fb488 100644 --- a/src/game/client/c_baseflex.cpp +++ b/src/game/client/c_baseflex.cpp @@ -574,7 +574,7 @@ Vector C_BaseFlex::SetViewTarget( CStudioHdr *pStudioHdr ) if (m_iEyeAttachment > 0) { matrix3x4_t attToWorld; - if (!GetAttachment( m_iEyeAttachment, attToWorld )) + if (!GetAttachmentDeferred( m_iEyeAttachment, attToWorld )) { return Vector( 0, 0, 0); } diff --git a/src/game/client/tf/c_tf_player.cpp b/src/game/client/tf/c_tf_player.cpp index 61ec13426..d43b0c3da 100644 --- a/src/game/client/tf/c_tf_player.cpp +++ b/src/game/client/tf/c_tf_player.cpp @@ -667,6 +667,7 @@ class C_TFRagdoll : public C_BaseFlex int GetDamageCustom() { return m_iDamageCustom; } virtual bool GetAttachment( int iAttachment, matrix3x4_t &attachmentToWorld ); + virtual bool GetAttachmentDeferred( int iAttachment, matrix3x4_t &attachmentToWorld ); int GetClass() { return m_iClass; } @@ -1579,6 +1580,20 @@ bool C_TFRagdoll::GetAttachment( int iAttachment, matrix3x4_t &attachmentToWorld } } +bool C_TFRagdoll::GetAttachmentDeferred( int iAttachment, matrix3x4_t &attachmentToWorld ) +{ + int iHeadAttachment = LookupAttachment( "head" ); + if ( IsDecapitation() && (iAttachment == iHeadAttachment) ) + { + MatrixCopy( m_mHeadAttachment, attachmentToWorld ); + return true; + } + else + { + return BaseClass::GetAttachmentDeferred( iAttachment, attachmentToWorld ); + } +} + //----------------------------------------------------------------------------- // Purpose: // Input : - diff --git a/src/game/shared/baseviewmodel_shared.cpp b/src/game/shared/baseviewmodel_shared.cpp index 20538e8ca..f42445da3 100644 --- a/src/game/shared/baseviewmodel_shared.cpp +++ b/src/game/shared/baseviewmodel_shared.cpp @@ -655,6 +655,15 @@ bool CBaseViewModel::GetAttachment( int number, matrix3x4_t &matrix ) return BaseClass::GetAttachment( number, matrix ); } +bool C_BaseViewModel::GetAttachmentDeferred( int number, matrix3x4_t &matrix ) +{ + // Update priority for your own viewmodel (no deferral) + if ( m_hWeapon.Get() && m_hWeapon.Get()->WantsToOverrideViewmodelAttachments() ) + return m_hWeapon.Get()->GetAttachment(number, matrix); + + return BaseClass::GetAttachment( number, matrix ); +} + //----------------------------------------------------------------------------- // Purpose: //----------------------------------------------------------------------------- diff --git a/src/game/shared/baseviewmodel_shared.h b/src/game/shared/baseviewmodel_shared.h index 15d3be53f..434bcce05 100644 --- a/src/game/shared/baseviewmodel_shared.h +++ b/src/game/shared/baseviewmodel_shared.h @@ -168,6 +168,7 @@ class CBaseViewModel : public CBaseAnimating, public IHasOwner // Attachments virtual int LookupAttachment( const char *pAttachmentName ); virtual bool GetAttachment( int number, matrix3x4_t &matrix ); + virtual bool GetAttachmentDeferred( int number, matrix3x4_t &matrix ); virtual bool GetAttachment( int number, Vector &origin ); virtual bool GetAttachment( int number, Vector &origin, QAngle &angles ); virtual bool GetAttachmentVelocity( int number, Vector &originVel, Quaternion &angleVel ); diff --git a/src/game/shared/econ/econ_entity.cpp 
b/src/game/shared/econ/econ_entity.cpp index 041ad9940..21d68cf14 100644 --- a/src/game/shared/econ/econ_entity.cpp +++ b/src/game/shared/econ/econ_entity.cpp @@ -1947,6 +1947,14 @@ bool CEconEntity::GetAttachment( int number, matrix3x4_t &matrix ) return BaseClass::GetAttachment( number, matrix ); } +bool C_EconEntity::GetAttachmentDeferred( int number, matrix3x4_t &matrix ) +{ + if ( m_hViewmodelAttachment ) + return m_hViewmodelAttachment->GetAttachmentDeferred( number, matrix ); + + return BaseClass::GetAttachmentDeferred( number, matrix ); +} + //----------------------------------------------------------------------------- // Purpose: //----------------------------------------------------------------------------- diff --git a/src/game/shared/econ/econ_entity.h b/src/game/shared/econ/econ_entity.h index e0c2453de..e09d59f67 100644 --- a/src/game/shared/econ/econ_entity.h +++ b/src/game/shared/econ/econ_entity.h @@ -114,6 +114,7 @@ class CEconEntity : public CBaseAnimating, public IHasAttributes virtual bool GetAttachment( const char *szName, Vector &absOrigin ) { return BaseClass::GetAttachment(szName,absOrigin); } virtual bool GetAttachment( const char *szName, Vector &absOrigin, QAngle &absAngles ) { return BaseClass::GetAttachment(szName,absOrigin,absAngles); } virtual bool GetAttachment( int number, matrix3x4_t &matrix ); + virtual bool GetAttachmentDeferred( int number, matrix3x4_t &matrix ); virtual bool GetAttachment( int number, Vector &origin ); virtual bool GetAttachment( int number, Vector &origin, QAngle &angles ); virtual bool GetAttachmentVelocity( int number, Vector &originVel, Quaternion &angleVel ); diff --git a/src/game/shared/particle_property.cpp b/src/game/shared/particle_property.cpp index fe1087214..747211d31 100644 --- a/src/game/shared/particle_property.cpp +++ b/src/game/shared/particle_property.cpp @@ -612,10 +612,10 @@ void CParticleProperty::UpdateControlPoint( ParticleEffectList_t *pEffect, int i { matrix3x4_t attachmentToWorld; - if ( !pAnimating->GetAttachment( pPoint->iAttachmentPoint, attachmentToWorld ) ) + if ( !pAnimating->GetAttachmentDeferred( pPoint->iAttachmentPoint, attachmentToWorld ) ) { // try C_BaseAnimating if attach point is not on the weapon - if ( !pAnimating->C_BaseAnimating::GetAttachment( pPoint->iAttachmentPoint, attachmentToWorld ) ) + if ( !pAnimating->C_BaseAnimating::GetAttachmentDeferred( pPoint->iAttachmentPoint, attachmentToWorld ) ) { Warning( "Cannot update control point %d for effect '%s'.\n", pPoint->iAttachmentPoint, pEffect->pParticleEffect->GetEffectName() ); // Remove the effect cause this warning means something is orphaned From db8ac86c8e4da71704327949ca18b84114f80b20 Mon Sep 17 00:00:00 2001 From: mastercoms Date: Sun, 5 Mar 2023 00:20:32 -0500 Subject: [PATCH 08/42] perf: fix duplicate GetSOCData for GetQualityParticleType * I don't think the compiler optimizes this out, because it isn't sure about side effects, and it's making this function spike up randomly in profiles, especially for the player model panel. 
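The shape of the change, for reference (sketch only; pItem is the pointer the function has already fetched and NULL-checked via GetSOCData() a few lines earlier):

// Before: two GetSOCData() lookups just to read the quality twice.
if ( GetSOCData()->GetQuality() == AE_SELFMADE || GetSOCData()->GetQuality() == AE_COMMUNITY )
	return pSparkleSystem ? pSparkleSystem->nSystemID : 0;

// After: reuse the pointer that was already fetched.
if ( pItem->GetQuality() == AE_SELFMADE || pItem->GetQuality() == AE_COMMUNITY )
	return pSparkleSystem ? pSparkleSystem->nSystemID : 0;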
--- src/game/shared/econ/econ_item_view.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/game/shared/econ/econ_item_view.cpp b/src/game/shared/econ/econ_item_view.cpp index e25390bd3..caf16d067 100644 --- a/src/game/shared/econ/econ_item_view.cpp +++ b/src/game/shared/econ/econ_item_view.cpp @@ -1084,7 +1084,7 @@ int CEconItemView::GetQualityParticleType() const if ( !pItem ) return 0; - if( GetSOCData()->GetQuality() == AE_SELFMADE || GetSOCData()->GetQuality() == AE_COMMUNITY ) + if( pItem->GetQuality() == AE_SELFMADE || pItem->GetQuality() == AE_COMMUNITY ) return pSparkleSystem ? pSparkleSystem->nSystemID : 0; else return 0; From 474947e53e35ec727a7d73344601c48ae6e2eb1f Mon Sep 17 00:00:00 2001 From: mastercoms Date: Mon, 6 Mar 2023 12:04:41 -0500 Subject: [PATCH 09/42] pending: sound updates * increase max channels to 256 to prevent us from hitting the cap in team fights * make more room for static channels over dynamic channels * dsound updates: * use DSBCAPS_TRUEPLAYPOSITION for accurate GetCurrentPosition * Pass NULL to GetCurrentPosition for write cursor, to skip querying it * limit dsound max distance, we only use it for local speaker spatialization * implement threaded sound a dedicated thread to doing all sound updates it gets notified when the main thread wants a sound update, and wakes up from sleep * backport 64 bit sound changes from CSGO * enable spatialization effects by default --- src/engine/audio/private/snd_channels.h | 4 +- src/engine/audio/private/snd_dev_direct.cpp | 27 +- src/engine/audio/private/snd_dma.cpp | 369 +++++++++++++++++--- src/engine/audio/private/snd_dsp.cpp | 6 +- src/engine/audio/private/snd_mix.cpp | 10 +- src/engine/audio/private/sound_private.h | 2 +- src/engine/audio/public/sound.h | 4 + src/engine/baseclientstate.cpp | 3 + src/engine/cl_main.cpp | 8 + src/engine/servermsghandler.cpp | 2 - src/game/server/sceneentity.cpp | 2 + src/game/server/soundscape_system.cpp | 5 + src/game/server/soundscape_system.h | 1 + src/game/shared/collisionproperty.cpp | 15 +- 14 files changed, 376 insertions(+), 82 deletions(-) diff --git a/src/engine/audio/private/snd_channels.h b/src/engine/audio/private/snd_channels.h index 06fd06506..ebc950d02 100644 --- a/src/engine/audio/private/snd_channels.h +++ b/src/engine/audio/private/snd_channels.h @@ -126,8 +126,8 @@ struct channel_t //----------------------------------------------------------------------------- //----------------------------------------------------------------------------- -#define MAX_CHANNELS 128 -#define MAX_DYNAMIC_CHANNELS 64 +#define MAX_CHANNELS 256 +#define MAX_DYNAMIC_CHANNELS 32 //----------------------------------------------------------------------------- //----------------------------------------------------------------------------- diff --git a/src/engine/audio/private/snd_dev_direct.cpp b/src/engine/audio/private/snd_dev_direct.cpp index a3c882265..3ae78c463 100644 --- a/src/engine/audio/private/snd_dev_direct.cpp +++ b/src/engine/audio/private/snd_dev_direct.cpp @@ -370,7 +370,7 @@ int CAudioDirectSound::PaintBegin( float mixAheadTime, int soundtime, int lpaint DWORD dwStatus; // If using surround, there are 4 or 5 different buffers being used and the pDSBuf is NULL. 
- if ( IsUsingBufferPerSpeaker() ) + if ( IsUsingBufferPerSpeaker() ) { if (pDSBufFL->GetStatus(&dwStatus) != DS_OK) Msg ("Couldn't get SURROUND FL sound buffer status\n"); @@ -674,6 +674,7 @@ bool CAudioDirectSound::SNDDMA_InitInterleaved( LPDIRECTSOUND lpDS, WAVEFORMATEX dsbdesc.dwFlags = 0; break; } + dsbdesc.dwFlags |= DSBCAPS_TRUEPLAYPOSITION; if ( !snd_mute_losefocus.GetBool() ) { dsbdesc.dwFlags |= DSBCAPS_GLOBALFOCUS; @@ -688,7 +689,7 @@ bool CAudioDirectSound::SNDDMA_InitInterleaved( LPDIRECTSOUND lpDS, WAVEFORMATEX if ( !bSuccess ) return false; - DWORD dwSize = 0, dwWrite; + DWORD dwSize = 0; DWORD *pBuffer = 0; if ( !LockDSBuffer( pDSBuf, &pBuffer, &dwSize, "DS_INTERLEAVED", DSBLOCK_ENTIREBUFFER ) ) return false; @@ -707,7 +708,7 @@ bool CAudioDirectSound::SNDDMA_InitInterleaved( LPDIRECTSOUND lpDS, WAVEFORMATEX pDSBuf->Play(0, 0, DSBPLAY_LOOPING); pDSBuf->Stop(); - pDSBuf->GetCurrentPosition(&m_outputBufferStartOffset, &dwWrite); + pDSBuf->GetCurrentPosition(&m_outputBufferStartOffset, NULL); pDSBuf->Play(0, 0, DSBPLAY_LOOPING); @@ -725,7 +726,7 @@ sndinitstat CAudioDirectSound::SNDDMA_InitDirect( void ) { DSBUFFERDESC dsbuf; DSBCAPS dsbcaps; - DWORD dwSize, dwWrite; + DWORD dwSize; WAVEFORMATEX format; WAVEFORMATEX pformat; HRESULT hresult; @@ -830,7 +831,7 @@ sndinitstat CAudioDirectSound::SNDDMA_InitDirect( void ) // sound hardware format Q_memset( &dsbuf, 0, sizeof(dsbuf) ); dsbuf.dwSize = sizeof(DSBUFFERDESC); - dsbuf.dwFlags = DSBCAPS_PRIMARYBUFFER; + dsbuf.dwFlags = DSBCAPS_PRIMARYBUFFER | DSBCAPS_TRUEPLAYPOSITION; if ( snd_legacy_surround.GetBool() || m_bSurround ) { dsbuf.dwFlags |= DSBCAPS_CTRL3D; @@ -900,7 +901,8 @@ sndinitstat CAudioDirectSound::SNDDMA_InitDirect( void ) // create the secondary buffer we'll actually work with Q_memset( &dsbuf, 0, sizeof(dsbuf) ); dsbuf.dwSize = sizeof(DSBUFFERDESC); - dsbuf.dwFlags = DSBCAPS_LOCSOFTWARE; // NOTE: don't use CTRLFREQUENCY (slow) + // NOTE: don't use CTRLFREQUENCY (slow) + dsbuf.dwFlags = DSBCAPS_LOCSOFTWARE | DSBCAPS_TRUEPLAYPOSITION; dsbuf.dwBufferBytes = SECONDARY_BUFFER_SIZE; dsbuf.lpwfxFormat = &format; if ( !snd_mute_losefocus.GetBool() ) @@ -992,7 +994,7 @@ sndinitstat CAudioDirectSound::SNDDMA_InitDirect( void ) pDSBuf->Stop(); - pDSBuf->GetCurrentPosition(&m_outputBufferStartOffset, &dwWrite); + pDSBuf->GetCurrentPosition(&m_outputBufferStartOffset, NULL); pDSBuf->Play(0, 0, DSBPLAY_LOOPING); } @@ -1288,7 +1290,7 @@ void DS3D_SetBufferParams( LPDIRECTSOUND3DBUFFER pDSBuf3D, D3DVECTOR *pbpos, D3D bparm.vConeOrientation = bdir; bparm.lConeOutsideVolume = DSBVOLUME_MIN; bparm.flMinDistance = 100.0; // no rolloff (until > 2.0 meter distance) - bparm.flMaxDistance = DS3D_DEFAULTMAXDISTANCE; + bparm.flMaxDistance = 1000.0; bparm.dwMode = DS3DMODE_NORMAL; hr = pDSBuf3D->SetAllParameters( &bparm, DS3D_DEFERRED ); @@ -1300,7 +1302,7 @@ bool CAudioDirectSound::SNDDMA_InitSurround(LPDIRECTSOUND lpDS, WAVEFORMATEX* lp { DSBUFFERDESC dsbuf; WAVEFORMATEX wvex; - DWORD dwSize, dwWrite; + DWORD dwSize; int reps; HRESULT hresult; void *lpData = NULL; @@ -1316,8 +1318,9 @@ bool CAudioDirectSound::SNDDMA_InitSurround(LPDIRECTSOUND lpDS, WAVEFORMATEX* lp memset (&dsbuf, 0, sizeof(dsbuf)); dsbuf.dwSize = sizeof(DSBUFFERDESC); - // NOTE: LOCHARDWARE causes SB AWE64 to crash in it's DSOUND driver - dsbuf.dwFlags = DSBCAPS_CTRL3D; // don't use CTRLFREQUENCY (slow) + // NOTE: LOCHARDWARE causes SB AWE64 to crash in it's DSOUND driver + // don't use CTRLFREQUENCY (slow) + dsbuf.dwFlags = DSBCAPS_CTRL3D | DSBCAPS_TRUEPLAYPOSITION; if ( 
!snd_mute_losefocus.GetBool() ) { dsbuf.dwFlags |= DSBCAPS_GLOBALFOCUS; @@ -1623,7 +1626,7 @@ bool CAudioDirectSound::SNDDMA_InitSurround(LPDIRECTSOUND lpDS, WAVEFORMATEX* lp // get hardware playback position, store it, syncronize all buffers to FL - pDSBufFL->GetCurrentPosition(&m_outputBufferStartOffset, &dwWrite); + pDSBufFL->GetCurrentPosition(&m_outputBufferStartOffset, NULL); pDSBufFR->SetCurrentPosition(m_outputBufferStartOffset); pDSBufRL->SetCurrentPosition(m_outputBufferStartOffset); pDSBufRR->SetCurrentPosition(m_outputBufferStartOffset); diff --git a/src/engine/audio/private/snd_dma.cpp b/src/engine/audio/private/snd_dma.cpp index 137bce7dd..6969f5c67 100644 --- a/src/engine/audio/private/snd_dma.cpp +++ b/src/engine/audio/private/snd_dma.cpp @@ -69,6 +69,12 @@ extern IVideoServices *g_pVideo; #define SNDLVL_TO_DIST_MULT( sndlvl ) ( sndlvl ? ((pow( 10.0f, snd_refdb.GetFloat() / 20 ) / pow( 10.0f, (float)sndlvl / 20 )) / snd_refdist.GetFloat()) : 0 ) #define DIST_MULT_TO_SNDLVL( dist_mult ) (soundlevel_t)(int)( dist_mult ? ( 20 * log10( pow( 10.0f, snd_refdb.GetFloat() / 20 ) / (dist_mult * snd_refdist.GetFloat()) ) ) : 0 ) +#if !defined( _X360 ) +#define THREADED_MIX_TIME 0.005 +#else +#define THREADED_MIX_TIME XMA_POLL_RATE * 0.001 +#endif + extern ConVar dsp_spatial; extern IPhysicsSurfaceProps *physprop; @@ -162,6 +168,8 @@ bool IsSoundSourceLocalPlayer( int soundsource ) CThreadMutex g_SndMutex; +CThreadEvent g_SndUpdateEvent; + #define THREAD_LOCK_SOUND() AUTO_LOCK( g_SndMutex ) const int MASK_BLOCK_AUDIO = CONTENTS_SOLID|CONTENTS_MOVEABLE|CONTENTS_WINDOW; @@ -236,13 +244,14 @@ vec_t S_GetNominalClipDist() return sound_nominal_clip_dist; } -int g_soundtime = 0; // sample PAIRS output since start -int g_paintedtime = 0; // sample PAIRS mixed since start +int64 g_soundtime = 0; // sample PAIRS output since start +double g_soundtimeerror = 0.0; // Error in sound time (used for synchronizing movie output sound to host_time) +int64 g_paintedtime = 0; // sample PAIRS mixed since start float g_ReplaySoundTimeFracAccumulator = 0.0f; // Used by replay float g_ClockSyncArray[NUM_CLOCK_SYNCS] = {0}; -int g_SoundClockPaintTime[NUM_CLOCK_SYNCS] = {0}; +int64 g_SoundClockPaintTime[NUM_CLOCK_SYNCS] = {0}; // default 10ms ConVar snd_delay_sound_shift("snd_delay_sound_shift","0.01"); @@ -277,7 +286,7 @@ float S_ComputeDelayForSoundtime( float soundtime, clocksync_index_t syncIndex ) int delaySamples = gameSamples - paintedSamples; float delay = delaySamples / float(dmaSpeed); - if ( gameDeltaTime < 0 || fabs(delay) > 0.500f ) + if ( gameDeltaTime < 0 || abs(delay) > 0.200f ) { // Note that the equations assume a correlation between game time and real time // some kind of clock error. 
This can happen with large host_timescale or when the @@ -451,7 +460,7 @@ static soundfade_t soundfade; // Client sound fading singleton object // autodetected from windows settings ConVar snd_surround( "snd_surround_speakers", "-1", FCVAR_INTERNAL_USE ); ConVar snd_legacy_surround( "snd_legacy_surround", "0", FCVAR_ARCHIVE ); -ConVar snd_noextraupdate( "snd_noextraupdate", "0" ); +ConVar snd_noextraupdate( "snd_noextraupdate", "1" ); ConVar snd_show( "snd_show", "0", FCVAR_CHEAT, "Show sounds info" ); ConVar snd_visualize ("snd_visualize", "0", FCVAR_CHEAT, "Show sounds location in world" ); ConVar snd_pitchquality( "snd_pitchquality", "1", FCVAR_ARCHIVE ); // 1) use high quality pitch shifters @@ -461,8 +470,17 @@ static ConVar volume( "volume", "1.0", FCVAR_ARCHIVE | FCVAR_ARCHIVE_XBOX, "Soun // user configurable music volume ConVar snd_musicvolume( "snd_musicvolume", "1.0", FCVAR_ARCHIVE | FCVAR_ARCHIVE_XBOX, "Music volume", true, 0.0f, true, 1.0f ); -ConVar snd_mixahead( "snd_mixahead", "0.1", FCVAR_ARCHIVE ); -ConVar snd_mix_async( "snd_mix_async", "0" ); +#ifdef THREADED_SOUND_UPDATE +ConVar snd_mixahead( "snd_threaded_mixahead", "0.1", 0 ); +#else +ConVar snd_mixahead( "snd_mixahead", "0.1", 0 ); +#endif +#ifdef THREADED_SOUND_UPDATE +ConVar snd_mix_async( "snd_mix_async", "1" ); +#else +ConVar snd_mix_async("snd_mix_async", "0"); +#endif + #ifdef _DEBUG static ConCommand snd_mixvol("snd_mixvol", MXR_DebugSetMixGroupVolume, "Set named Mixgroup to mix volume."); #endif @@ -4688,7 +4706,6 @@ void SND_SpatializeFirstFrameNoTrace( channel_t *pChannel) int S_AlterChannel( int soundsource, int entchannel, CSfxTable *sfx, int vol, int pitch, int flags ) { - THREAD_LOCK_SOUND(); int ch_idx; const char *name = sfx->getname(); @@ -5083,7 +5100,7 @@ int S_StartDynamicSound( StartSoundParams_t& params ) vol = 255; } - THREAD_LOCK_SOUND(); + THREAD_LOCK_SOUND() if ( params.flags & (SND_STOP|SND_CHANGE_VOL|SND_CHANGE_PITCH) ) { @@ -5376,6 +5393,8 @@ int S_StartStaticSound( StartSoundParams_t& params ) if ((params.flags & SND_STOP) && nSndShowStart > 0) DevMsg("S_StartStaticSound: %s Stopped.\n", sndname); + THREAD_LOCK_SOUND() + if ((params.flags & SND_STOP) || (params.flags & SND_CHANGE_VOL) || (params.flags & SND_CHANGE_PITCH)) { if (S_AlterChannel(params.soundsource, params.entchannel, params.pSfx, vol, params.pitch, params.flags) || (params.flags & SND_STOP)) @@ -5424,8 +5443,6 @@ int S_StartStaticSound( StartSoundParams_t& params ) g_pSoundServices->GetSoundSpatialization( params.soundsource, si ); // pick a channel to play on from the static area - THREAD_LOCK_SOUND(); - ch = SND_PickStaticChannel(params.soundsource, params.pSfx); // Autolooping sounds are always fixed origin(?) if ( !ch ) return 0; @@ -5736,7 +5753,7 @@ int S_GetCurrentStaticSounds( SoundInfo_t *pResult, int nSizeResult, int entchan // Stop all sounds for entity on a channel. 
void S_StopSound(int soundsource, int entchannel) { - THREAD_LOCK_SOUND(); + THREAD_LOCK_SOUND() CChannelList list; g_ActiveChannels.GetActiveChannels( list ); for ( int i = 0; i < list.Count(); i++ ) @@ -5771,7 +5788,7 @@ channel_t *S_FindChannelByGuid( int guid ) //----------------------------------------------------------------------------- void S_StopSoundByGuid( int guid ) { - THREAD_LOCK_SOUND(); + THREAD_LOCK_SOUND() channel_t *pChannel = S_FindChannelByGuid( guid ); if ( pChannel ) { @@ -5910,7 +5927,7 @@ void S_GetActiveSounds( CUtlVector< SndInfo_t >& sndlist ) void S_StopAllSounds( bool bClear ) { - THREAD_LOCK_SOUND(); + THREAD_LOCK_SOUND() int i; if ( !g_AudioDevice ) @@ -6113,12 +6130,221 @@ S_Update Called once each time through the main loop ============ */ -void S_Update( const AudioState_t *pAudioState ) +#ifdef THREADED_SOUND_UPDATE +void S_UpdateThreaded_Main() +{ + channel_t* ch; + static unsigned int s_roundrobin = 0; ///< number of times this function is called. + ///< used instead of host_frame because that number + ///< isn't necessarily available here (sez Yahn). + + g_AudioDevice->UpdateListener(listener_origin, listener_forward, listener_right, listener_up); + + int voiceChannelCount = 0; + int voiceChannelMaxVolume = 0; + + // reset traceline counter for this frame + g_snd_trace_count = 0; + + // calculate distance to nearest walls, update dsp_spatial + // updates one wall only per frame (one trace per frame) + SND_SetSpatialDelays(); + + // updates dsp_room if automatic room detection enabled + DAS_CheckNewRoomDSP(); + + // update spatialization for static and dynamic sounds + CChannelList list; + g_ActiveChannels.GetActiveChannels(list); + + g_SndMutex.Lock(); + + if (snd_spatialize_roundrobin.GetInt() == 0) + { + // spatialize each channel each time + for (int i = 0; i < list.Count(); i++) + { + ch = list.GetChannel(i); + if (!ch->sfx || !ch->activeIndex) + { + continue; + } + + SND_Spatialize(ch); // respatialize channel + + if (ch->sfx->pSource && ch->sfx->pSource->IsVoiceSource()) + { + voiceChannelCount++; + voiceChannelMaxVolume = max(voiceChannelMaxVolume, ChannelGetMaxVol(ch)); + } + } + } + else // lowend performance improvement: spatialize only some channels each frame. 
+ { + unsigned int robinmask = (1 << snd_spatialize_roundrobin.GetInt()) - 1; + + // now do static channels + for (int i = 0; i < list.Count(); ++i) + { + ch = list.GetChannel(i); + if (!ch->sfx || !ch->activeIndex) + { + continue; + } + + // need to check bfirstpass because sound tracing may have been deferred + if (ch->flags.bfirstpass || (robinmask & s_roundrobin) == (i & robinmask)) + { + SND_Spatialize(ch); // respatialize channel + } + + if (ch->sfx->pSource && ch->sfx->pSource->IsVoiceSource()) + { + voiceChannelCount++; + voiceChannelMaxVolume = max(voiceChannelMaxVolume, ChannelGetMaxVol(ch)); + } + } + + ++s_roundrobin; + } + + SND_ChannelTraceReset(); + + g_SndMutex.Unlock(); + + // set new target for voice ducking + float frametime = g_pSoundServices->GetHostFrametime(); + S_UpdateVoiceDuck(voiceChannelCount, voiceChannelMaxVolume, frametime); + +#ifdef _X360 + // update x360 music volume + g_DashboardMusicMixValue = Approach(g_DashboardMusicMixTarget, g_DashboardMusicMixValue, g_DashboardMusicFadeRate * frametime); +#endif +} + +void S_UpdateThreaded_Base(const AudioState_t* pAudioState) +{ + VPROF("S_Update"); + if (!g_AudioDevice->IsActive()) + return; + + g_SndMutex.Lock(); + + if (pAudioState) + { + VectorCopy(pAudioState->m_Origin, listener_origin); + AngleVectors(pAudioState->m_Angles, &listener_forward, &listener_right, &listener_up); + s_bIsListenerUnderwater = pAudioState->m_bIsUnderwater; + } + else + { + VectorCopy(vec3_origin, listener_origin); + VectorCopy(vec3_origin, listener_forward); + VectorCopy(vec3_origin, listener_right); + VectorCopy(vec3_origin, listener_up); + s_bIsListenerUnderwater = false; + } + + // + // debugging output + // + if (snd_show.GetInt()) + { + con_nprint_t np; + np.time_to_live = 2.0f; + np.fixed_width_font = true; + + int total = 0; + + CChannelList activeChannels; + g_ActiveChannels.GetActiveChannels(activeChannels); + for (int i = 0; i < activeChannels.Count(); i++) + { + channel_t* channel = activeChannels.GetChannel(i); + if (!channel->sfx) + continue; + + np.index = total + 2; + if (channel->flags.fromserver) + { + np.color[0] = 1.0; + np.color[1] = 0.8; + np.color[2] = 0.1; + } + else + { + np.color[0] = 0.1; + np.color[1] = 0.9; + np.color[2] = 1.0; + } + + unsigned int sampleCount = RemainingSamples(channel); + float timeleft = (float)sampleCount / (float)channel->sfx->pSource->SampleRate(); + bool bLooping = channel->sfx->pSource->IsLooped(); + + if (snd_surround.GetInt() < 4) + { + Con_NXPrintf(&np, "%02i l(%03d) r(%03d) vol(%03d) ent(%03d) pos(%6d %6d %6d) timeleft(%f) looped(%d) %50s", + total + 1, + (int)channel->fvolume[IFRONT_LEFT], + (int)channel->fvolume[IFRONT_RIGHT], + channel->master_vol, + channel->soundsource, + (int)channel->origin[0], + (int)channel->origin[1], + (int)channel->origin[2], + timeleft, + bLooping, + channel->sfx->getname()); + } + else + { + Con_NXPrintf(&np, "%02i l(%03d) c(%03d) r(%03d) rl(%03d) rr(%03d) vol(%03d) ent(%03d) pos(%6d %6d %6d) timeleft(%f) looped(%d) %50s", + total + 1, + (int)channel->fvolume[IFRONT_LEFT], + (int)channel->fvolume[IFRONT_CENTER], + (int)channel->fvolume[IFRONT_RIGHT], + (int)channel->fvolume[IREAR_LEFT], + (int)channel->fvolume[IREAR_RIGHT], + channel->master_vol, + channel->soundsource, + (int)channel->origin[0], + (int)channel->origin[1], + (int)channel->origin[2], + timeleft, + bLooping, + channel->sfx->getname()); + } + + if (snd_visualize.GetInt()) + { + CDebugOverlay::AddTextOverlay(channel->origin, 0.05f, channel->sfx->getname()); + } + + total++; + } + + 
while (total <= 128) + { + Con_NPrintf(total + 2, ""); + total++; + } + } + + g_SndMutex.Unlock(); + + if (s_bOnLoadScreen) + return; + + S_Update_( snd_mixahead.GetFloat() ); +} +#endif + +void S_Update_Main( const AudioState_t *pAudioState ) { VPROF("S_Update"); channel_t *ch; - channel_t *combine; - static unsigned int s_roundrobin = 0 ; ///< number of times this function is called. + static unsigned int s_roundrobin = 0; ///< number of times this function is called. ///< used instead of host_frame because that number ///< isn't necessarily available here (sez Yahn). @@ -6146,8 +6372,6 @@ void S_Update( const AudioState_t *pAudioState ) } g_AudioDevice->UpdateListener( listener_origin, listener_forward, listener_right, listener_up ); - - combine = NULL; int voiceChannelCount = 0; int voiceChannelMaxVolume = 0; @@ -6175,6 +6399,11 @@ void S_Update( const AudioState_t *pAudioState ) Assert(ch->sfx); Assert(ch->activeIndex > 0); + if (!ch->sfx || ch->activeIndex < 1) + { + continue; + } + SND_Spatialize(ch); // respatialize channel if ( ch->sfx->pSource && ch->sfx->pSource->IsVoiceSource() ) @@ -6195,6 +6424,11 @@ void S_Update( const AudioState_t *pAudioState ) Assert(ch->sfx); Assert(ch->activeIndex > 0); + if (!ch->sfx || ch->activeIndex < 1) + { + continue; + } + // need to check bfirstpass because sound tracing may have been deferred if ( ch->flags.bfirstpass || (robinmask & s_roundrobin) == ( i & robinmask ) ) { @@ -6211,16 +6445,16 @@ void S_Update( const AudioState_t *pAudioState ) ++s_roundrobin; } - - SND_ChannelTraceReset(); // set new target for voice ducking float frametime = g_pSoundServices->GetHostFrametime(); S_UpdateVoiceDuck( voiceChannelCount, voiceChannelMaxVolume, frametime ); +#ifdef _X360 // update x360 music volume g_DashboardMusicMixValue = Approach( g_DashboardMusicMixTarget, g_DashboardMusicMixValue, g_DashboardMusicFadeRate * frametime ); +#endif // // debugging output @@ -6325,6 +6559,20 @@ void S_Update( const AudioState_t *pAudioState ) S_Update_( g_EstFrameTime + snd_mixahead.GetFloat() ); } +void S_Update(const AudioState_t* pAudioState) +{ +#ifdef THREADED_SOUND_UPDATE + if ( snd_mix_async.GetBool() ) + { + S_UpdateThreaded_Base(pAudioState); + } + else +#endif + { + S_Update_Main(pAudioState); + } +} + CON_COMMAND( snd_dumpclientsounds, "Dump sounds to VXConsole" ) { con_nprint_t np; @@ -6375,8 +6623,9 @@ CON_COMMAND( snd_dumpclientsounds, "Dump sounds to VXConsole" ) //----------------------------------------------------------------------------- void GetSoundTime(void) { - int fullsamples; - int sampleOutCount; + // Make them 64 bits so calculation is done in 64 bits. + int64 fullsamples; + int64 sampleOutCount; // size of output buffer in *full* 16 bit samples // A 2 channel device has a *full* sample consisting of a 16 bit LR pair. 
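// Rough numbers behind dropping the 32-bit wrap handling in the next hunk (assuming a
// 44.1 kHz output rate; the real rate comes from DeviceDmaSpeed()): g_paintedtime counts
// sample pairs, so the old 0x70000000 chop threshold was hit after roughly
// 0x70000000 / 44100 ~= 42,600 seconds -- about 12 hours of continuous mixing -- at which
// point S_StopAllSounds() cut off every playing sound. With int64 counters the same math
// gives on the order of 10^14 seconds before overflow, so the chop can simply be removed.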
@@ -6394,13 +6643,6 @@ void GetSoundTime(void) { // buffer wrapped s_buffers++; - if ( g_paintedtime > 0x70000000 ) - { - // time to chop things off to avoid 32 bit limits - s_buffers = 0; - g_paintedtime = fullsamples; - S_StopAllSounds( true ); - } } s_oldsampleOutCount = sampleOutCount; @@ -6440,8 +6682,18 @@ void GetSoundTime(void) float t = g_pSoundServices->GetHostTime(); if ( s_lastsoundtime != t ) { - g_soundtime += g_pSoundServices->GetHostFrametime() * g_AudioDevice->DeviceDmaSpeed(); - + double flSamples = (double) g_pSoundServices->GetHostFrametime() * (double) g_AudioDevice->DeviceDmaSpeed(); + int nSamples = (int)flSamples; + double flSampleError = flSamples - (double)nSamples; + g_soundtimeerror += flSampleError; + if (fabs(g_soundtimeerror) > 1.0) + { + int nErrorSamples = (int)g_soundtimeerror; + g_soundtimeerror -= (double)nErrorSamples; + nSamples += nErrorSamples; + } + + g_soundtime += nSamples; s_lastsoundtime = t; } } @@ -6468,6 +6720,11 @@ void S_ExtraUpdate( void ) if ( snd_noextraupdate.GetInt() || cl_movieinfo.IsRecording() || IsReplayRendering() ) return; // don't pollute timings +#ifdef THREADED_SOUND_UPDATE + if (snd_mix_async.GetBool()) + return; +#endif + // If listener position and orientation has not yet been updated (ie: no call to S_Update since level load) // then don't mix. Important - mixing with listener at 'false' origin causes // some sounds to incorrectly spatialize to 0 volume, killing them before they can play. @@ -6546,45 +6803,41 @@ void S_Update_Guts( float mixAheadTime ) DEBUG_StopSoundMeasure( 4, samples ); } -#if !defined( _X360 ) -#define THREADED_MIX_TIME 33 -#else -#define THREADED_MIX_TIME XMA_POLL_RATE -#endif - ConVar snd_ShowThreadFrameTime( "snd_ShowThreadFrameTime", "0" ); bool g_bMixThreadExit; ThreadHandle_t g_hMixThread; void S_Update_Thread() { - float frameTime = THREADED_MIX_TIME * 0.001f; - double lastFrameTime = Plat_FloatTime(); + double frameTime = THREADED_MIX_TIME; while ( !g_bMixThreadExit ) { - // mixing (for 360) needs to be updated at a steady rate - // large update times causes the mixer to demand more audio data - // the 360 decoder has finite latency and cannot fulfill spike requests - float t0 = Plat_FloatTime(); - S_Update_Guts( frameTime + snd_mixahead.GetFloat() ); - int updateTime = ( Plat_FloatTime() - t0 ) * 1000.0f; + const double t0 = Plat_FloatTime(); +#ifdef THREADED_SOUND_UPDATE + S_UpdateThreaded_Main(); +#endif + S_Update_Guts(frameTime + snd_mixahead.GetFloat() ); + const double tf = Plat_FloatTime(); + + const double dt = tf - t0; - // try to maintain a steadier rate by compensating for fluctuating mix times - int sleepTime = THREADED_MIX_TIME - updateTime; - if ( sleepTime > 0 ) + // we have two goals: reduce latency and improve consistency + // this means we have regular update times that keep track of variance of frame times. 
+ // however, we also want to update as soon as the game thread makes new stuff available to us + const int nSleepMS = (int) ((THREADED_MIX_TIME - dt) * 1000); + if (nSleepMS > 0) { - ThreadSleep( sleepTime ); + g_SndUpdateEvent.Wait(nSleepMS); } // mimic a frametime needed for sound update - double t1 = Plat_FloatTime(); - frameTime = t1 - lastFrameTime; - lastFrameTime = t1; + const double t1 = Plat_FloatTime(); + frameTime = t1 - t0; if ( snd_ShowThreadFrameTime.GetBool() ) { - Msg( "S_Update_Thread: frameTime: %d ms\n", (int)( frameTime * 1000.0f ) ); + Msg( "S_Update_Thread: frameTime: %f s\n", frameTime ); } } } @@ -6593,6 +6846,7 @@ void S_ShutdownMixThread() { if ( g_hMixThread ) { + g_SndUpdateEvent.Set(); g_bMixThreadExit = true; ThreadJoin( g_hMixThread ); ReleaseThreadHandle( g_hMixThread ); @@ -6602,7 +6856,7 @@ void S_ShutdownMixThread() void S_Update_( float mixAheadTime ) { - if ( !IsConsole() || !snd_mix_async.GetBool() ) + if ( !snd_mix_async.GetBool() ) { S_ShutdownMixThread(); S_Update_Guts( mixAheadTime ); @@ -6613,11 +6867,16 @@ void S_Update_( float mixAheadTime ) { g_bMixThreadExit = false; g_hMixThread = ThreadExecuteSolo( "SndMix", S_Update_Thread ); + ThreadSetPriority(g_hMixThread, TP_PRIORITY_HIGHEST); if ( IsX360() ) { ThreadSetAffinity( g_hMixThread, XBOX_PROCESSOR_5 ); } } + else + { + g_SndUpdateEvent.Set(); + } } } diff --git a/src/engine/audio/private/snd_dsp.cpp b/src/engine/audio/private/snd_dsp.cpp index 4e73fdb40..92a5c2c01 100644 --- a/src/engine/audio/private/snd_dsp.cpp +++ b/src/engine/audio/private/snd_dsp.cpp @@ -5890,10 +5890,10 @@ inline int PSET_GetNext ( pset_t *ppset, int x ) // Dsp presets -ConVar dsp_room ("dsp_room", "0", FCVAR_DEMO ); // room dsp preset - sounds more distant from player (1ch) +ConVar dsp_room ("dsp_room", "1", FCVAR_DEMO ); // room dsp preset - sounds more distant from player (1ch) ConVar dsp_water ("dsp_water", "14", FCVAR_DEMO ); // "14" underwater dsp preset - sound when underwater (1-2ch) ConVar dsp_player ("dsp_player", "0", FCVAR_DEMO | FCVAR_SERVER_CAN_EXECUTE ); // dsp on player - sound when player hit by special device (1-2ch) -ConVar dsp_facingaway ("dsp_facingaway", "0", FCVAR_DEMO ); // "30" sounds that face away from player (weapons, voice) (1-4ch) +ConVar dsp_facingaway ("dsp_facingaway", "30", FCVAR_DEMO ); // "30" sounds that face away from player (weapons, voice) (1-4ch) ConVar dsp_speaker ("dsp_speaker", "50", FCVAR_DEMO ); // "50" small distorted speaker sound (1ch) ConVar dsp_spatial ("dsp_spatial", "40", FCVAR_DEMO ); // spatial delays for l/r front/rear ears ConVar dsp_automatic ("dsp_automatic", "0", FCVAR_DEMO ); // automatic room type detection. 
if non zero, replaces dsp_room @@ -5930,7 +5930,7 @@ ConVar dsp_vol_5ch ("dsp_vol_5ch", "0.5", FCVAR_DEMO ); // 0.0 - 1.0; attenu ConVar dsp_vol_4ch ("dsp_vol_4ch", "0.5", FCVAR_DEMO ); // 0.0 - 1.0; attenuate master dsp volume for 4ch surround ConVar dsp_vol_2ch ("dsp_vol_2ch", "1.0", FCVAR_DEMO ); // 0.0 - 1.0; attenuate master dsp volume for 2ch surround -ConVar dsp_enhance_stereo("dsp_enhance_stereo", "0", FCVAR_ARCHIVE ); // 1) use dsp_spatial delays on all reverb channels +ConVar dsp_enhance_stereo("dsp_enhance_stereo", "1", FCVAR_CHEAT ); // 1) use dsp_spatial delays on all reverb channels // DSP preset executor diff --git a/src/engine/audio/private/snd_mix.cpp b/src/engine/audio/private/snd_mix.cpp index ca44cbf6e..877b633cd 100644 --- a/src/engine/audio/private/snd_mix.cpp +++ b/src/engine/audio/private/snd_mix.cpp @@ -53,7 +53,8 @@ bool BChannelLowVolume( channel_t *pch, int vol_min ); void ChannelCopyVolumes( channel_t *pch, int *pvolume_dest, int ivol_start, int cvol ); float ChannelLoudestCurVolume( const channel_t * RESTRICT pch ); -extern int g_soundtime; +extern int64 g_soundtime; +extern double g_soundtimeerror; extern float host_frametime; extern float host_frametime_unbounded; @@ -434,7 +435,7 @@ void S_FreeChannel(channel_t *ch) ch->flags.isSentence = false; // Msg("End sound %s\n", ch->sfx->getname() ); - + delete ch->pMixer; ch->pMixer = NULL; ch->sfx = NULL; @@ -2267,9 +2268,9 @@ void MIX_PaintChannels( int endtime, bool bIsUnderwater ) VPROF("MIX_PaintChannels"); tmZone( TELEMETRY_LEVEL0, TMZF_NONE, "%s", __FUNCTION__ ); - int end; + int64 end; int count; - bool b_spatial_delays = dsp_enhance_stereo.GetInt() != 0 ? true : false; + bool b_spatial_delays = dsp_enhance_stereo.GetBool(); bool room_fsurround_sav; bool room_fsurround_center_sav; paintbuffer_t *proom = MIX_GetPPaintFromIPaint(SOUND_BUFFER_ROOM); @@ -4196,6 +4197,7 @@ void SND_RecordInit() { g_paintedtime = 0; g_soundtime = 0; + g_soundtimeerror = 0.0; // TMP Wave file supports stereo only, so force stereo if ( snd_surround.GetInt() != 2 ) diff --git a/src/engine/audio/private/sound_private.h b/src/engine/audio/private/sound_private.h index f054c7c89..9a96308b1 100644 --- a/src/engine/audio/private/sound_private.h +++ b/src/engine/audio/private/sound_private.h @@ -44,7 +44,7 @@ void SNDDMA_Shutdown(void); // User-setable variables // ==================================================================== -extern int g_paintedtime; +extern int64 g_paintedtime; extern bool snd_initialized; diff --git a/src/engine/audio/public/sound.h b/src/engine/audio/public/sound.h index 64b4f9d9a..19bd742aa 100644 --- a/src/engine/audio/public/sound.h +++ b/src/engine/audio/public/sound.h @@ -22,6 +22,10 @@ #define AUDIOSOURCE_CACHE_ROOTDIR "maps/soundcache" +#if !defined(_X360) +#define THREADED_SOUND_UPDATE +#endif + class CSfxTable; enum soundlevel_t; struct SoundInfo_t; diff --git a/src/engine/baseclientstate.cpp b/src/engine/baseclientstate.cpp index 591f367a9..b1aa1622a 100644 --- a/src/engine/baseclientstate.cpp +++ b/src/engine/baseclientstate.cpp @@ -41,6 +41,7 @@ #include "replay_internal.h" #include "replayserver.h" #endif +#include "sound.h" // memdbgon must be the last include file in a .cpp file!!! 
#include "tier0/memdbgon.h" @@ -738,6 +739,8 @@ void CBaseClientState::Disconnect( const char *pszReason, bool bShowMainMenu ) CL_NotifyRPTOfDisconnect( ); #endif + S_StopAllSounds( true ); + m_nSignonState = SIGNONSTATE_NONE; netadr_t adr; diff --git a/src/engine/cl_main.cpp b/src/engine/cl_main.cpp index 4573a8a93..a808b09cd 100644 --- a/src/engine/cl_main.cpp +++ b/src/engine/cl_main.cpp @@ -145,10 +145,18 @@ struct ResourceLocker // Need to temporarily disable queued material system, then lock it m_QMS = Host_AllowQueuedMaterialSystem( false ); m_MatLock = g_pMaterialSystem->Lock(); + // Disable threaded sound updates while loading +#ifdef THREADED_SOUND_UPDATE + S_EnableThreadedMixing(false); +#endif } ~ResourceLocker() { + // Restore threaded sound update +#ifdef THREADED_SOUND_UPDATE + S_EnableThreadedMixing(true); +#endif // Restore QMS materials->Unlock( m_MatLock ); Host_AllowQueuedMaterialSystem( m_QMS ); diff --git a/src/engine/servermsghandler.cpp b/src/engine/servermsghandler.cpp index 96788090a..7024990cb 100644 --- a/src/engine/servermsghandler.cpp +++ b/src/engine/servermsghandler.cpp @@ -236,8 +236,6 @@ void CClientState::Disconnect( const char *pszReason, bool bShowMainMenu ) demoplayer->StopPlayback(); demorecorder->StopRecording(); #endif - - S_StopAllSounds( true ); R_DecalTermAll(); diff --git a/src/game/server/sceneentity.cpp b/src/game/server/sceneentity.cpp index f092624f6..9c7be7ec5 100644 --- a/src/game/server/sceneentity.cpp +++ b/src/game/server/sceneentity.cpp @@ -768,6 +768,8 @@ CSceneEntity::CSceneEntity( void ) m_bCompletedEarly = false; + if ( !m_pcvSndMixahead ) + m_pcvSndMixahead = cvar->FindVar( "snd_threaded_mixahead" ); if ( !m_pcvSndMixahead ) m_pcvSndMixahead = cvar->FindVar( "snd_mixahead" ); diff --git a/src/game/server/soundscape_system.cpp b/src/game/server/soundscape_system.cpp index 29b29402c..45b6c8dd7 100644 --- a/src/game/server/soundscape_system.cpp +++ b/src/game/server/soundscape_system.cpp @@ -263,6 +263,11 @@ void CSoundscapeSystem::LevelInitPostEntity() } } +void CSoundscapeSystem::LevelShutdownPreEntity() +{ + g_SoundscapeSystem.Shutdown(); +} + int CSoundscapeSystem::GetSoundscapeIndex( const char *pName ) { return m_soundscapes.GetStringID( pName ); diff --git a/src/game/server/soundscape_system.h b/src/game/server/soundscape_system.h index 8cdc356d9..f7164ad98 100644 --- a/src/game/server/soundscape_system.h +++ b/src/game/server/soundscape_system.h @@ -36,6 +36,7 @@ class CSoundscapeSystem : public CAutoGameSystemPerFrame virtual void FrameUpdatePostEntityThink( void ); virtual void LevelInitPreEntity( void ); virtual void LevelInitPostEntity(); + virtual void LevelShutdownPreEntity(); virtual void AddSoundscapeFile( const char *filename ); int GetSoundscapeIndex( const char *pName ); diff --git a/src/game/shared/collisionproperty.cpp b/src/game/shared/collisionproperty.cpp index 0ff49a4a6..bccba0496 100644 --- a/src/game/shared/collisionproperty.cpp +++ b/src/game/shared/collisionproperty.cpp @@ -167,6 +167,10 @@ void CDirtySpatialPartitionEntityList::OnPreQuery( SpatialPartitionListMask_t li if ( m_partitionWriteId != 0 && m_partitionWriteId == ThreadGetCurrentId() ) return; + // Don't break the cache by running in a separate thread! Not thread-safe! + if ( !ThreadInMainThread() ) + return; + #ifdef CLIENT_DLL // FIXME: This should really be an assertion... feh! 
if ( !C_BaseEntity::IsAbsRecomputationsEnabled() ) @@ -1107,7 +1111,8 @@ void CCollisionProperty::ComputeSurroundingBox( Vector *pVecWorldMins, Vector *p { Assert( GetSolid() != SOLID_CUSTOM ); bool bUseVPhysics = false; - if ( ( GetSolid() == SOLID_VPHYSICS ) && ( GetOuter()->GetMoveType() == MOVETYPE_VPHYSICS ) ) + // VPhysics is not thread-safe! + if ( ThreadInMainThread() && ( GetSolid() == SOLID_VPHYSICS ) && ( GetOuter()->GetMoveType() == MOVETYPE_VPHYSICS ) ) { // UNDONE: This may not be necessary any more. IPhysicsObject *pPhysics = GetOuter()->VPhysicsGetObject(); @@ -1128,7 +1133,9 @@ void CCollisionProperty::ComputeSurroundingBox( Vector *pVecWorldMins, Vector *p break; case USE_HITBOXES: - ComputeHitboxSurroundingBox( pVecWorldMins, pVecWorldMaxs ); + // Client code is not thread-safe! + if (ThreadInMainThread()) + ComputeHitboxSurroundingBox( pVecWorldMins, pVecWorldMaxs ); break; case USE_ROTATION_EXPANDED_BOUNDS: @@ -1141,7 +1148,9 @@ void CCollisionProperty::ComputeSurroundingBox( Vector *pVecWorldMins, Vector *p break; case USE_GAME_CODE: - GetOuter()->ComputeWorldSpaceSurroundingBox( pVecWorldMins, pVecWorldMaxs ); + // Client code is not thread-safe! + if (ThreadInMainThread()) + GetOuter()->ComputeWorldSpaceSurroundingBox( pVecWorldMins, pVecWorldMaxs ); Assert( pVecWorldMins->x <= pVecWorldMaxs->x ); Assert( pVecWorldMins->y <= pVecWorldMaxs->y ); Assert( pVecWorldMins->z <= pVecWorldMaxs->z ); From 5bd9fcfdb118c4ff8f9e015007d7259ef8bd64f4 Mon Sep 17 00:00:00 2001 From: mastercoms Date: Mon, 6 Mar 2023 12:06:11 -0500 Subject: [PATCH 10/42] perf: backport animation ActivityList optimization this saves a lot of comparison times syncing client and server back and forth in a listen server (about 2 to 4%) --- src/game/shared/animation.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/game/shared/animation.cpp b/src/game/shared/animation.cpp index 975615813..da70c01ce 100644 --- a/src/game/shared/animation.cpp +++ b/src/game/shared/animation.cpp @@ -213,7 +213,7 @@ void VerifySequenceIndex( CStudioHdr *pstudiohdr ) return; } - if( pstudiohdr->GetActivityListVersion( ) != g_nActivityListVersion ) + if( pstudiohdr->GetActivityListVersion( ) < g_nActivityListVersion ) // sometimes the server's numbers can get ahead of the client's if we're sharing memory between the two, so it's only necessary to reindex if a model is lagging the version number. 
{ // this model's sequences have not yet been indexed by activity IndexModelSequences( pstudiohdr ); From ae4f85871e7e59d6d391756b380f60e739b4f3ab Mon Sep 17 00:00:00 2001 From: mastercoms Date: Mon, 6 Mar 2023 12:13:32 -0500 Subject: [PATCH 11/42] perf: RenderSpriteCard backport from CSGO * skips render if alpha is 0 * combines nSequence calculation * uses FastQuad --- src/particles/builtin_particle_render_ops.cpp | 16 ++++++---------- src/public/materialsystem/imesh.h | 6 ++++++ 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/src/particles/builtin_particle_render_ops.cpp b/src/particles/builtin_particle_render_ops.cpp index 966b15509..e588f754a 100644 --- a/src/particles/builtin_particle_render_ops.cpp +++ b/src/particles/builtin_particle_render_ops.cpp @@ -905,6 +905,9 @@ void C_OP_RenderSprites::RenderUnsortedNonSpriteCardOriented( CParticleCollectio void C_OP_RenderSprites::RenderSpriteCard( CMeshBuilder &meshBuilder, C_OP_RenderSpritesContext_t *pCtx, SpriteRenderInfo_t& info, int hParticle, ParticleRenderData_t const *pSortList, Vector *pCamera ) const { Assert( hParticle != -1 ); + unsigned char ac = pSortList->m_nAlpha; + if (! ac ) + return; int nGroup = hParticle / 4; int nOffset = hParticle & 0x3; @@ -921,7 +924,6 @@ void C_OP_RenderSprites::RenderSpriteCard( CMeshBuilder &meshBuilder, C_OP_Rende unsigned char rc = FastFToC( r ); unsigned char gc = FastFToC( g ); unsigned char bc = FastFToC( b ); - unsigned char ac = pSortList->m_nAlpha; float rad = pSortList->m_flRadius; if ( !IsFinite( rad ) ) @@ -959,16 +961,15 @@ void C_OP_RenderSprites::RenderSpriteCard( CMeshBuilder &meshBuilder, C_OP_Rende // float flLifetime = SubFloat( pLifeDuration[ nGroup * ld_stride ], nOffset ); // flAgeScale = ( flLifetime > 0.0f ) ? ( 1.0f / flLifetime ) * SEQUENCE_SAMPLE_COUNT : 0.0f; // } + int nSequence = SubFloat( info.m_pSequenceNumber[ nGroup * info.m_nSequenceStride ], nOffset ); if ( m_bAnimateInFPS ) { - int nSequence = SubFloat( info.m_pSequenceNumber[ nGroup * info.m_nSequenceStride ], nOffset ); flAgeScale = flAgeScale / info.m_pParticles->m_Sheet()->m_flFrameSpan[nSequence]; } pSample = GetSampleForSequence( info.m_pSheet, SubFloat( info.m_pCreationTimeStamp[ nGroup * info.m_nCreationTimeStride ], nOffset ), info.m_pParticles->m_flCurTime, - flAgeScale, - SubFloat( info.m_pSequenceNumber[ nGroup * info.m_nSequenceStride ], nOffset ) ); + flAgeScale, nSequence ); } const SequenceSampleTextureCoords_t *pSample0 = &(pSample->m_TextureCoordData[0]); @@ -1015,12 +1016,7 @@ void C_OP_RenderSprites::RenderSpriteCard( CMeshBuilder &meshBuilder, C_OP_Rende meshBuilder.TexCoord4f( 4, pSecondTexture0->m_fLeft_U0, pSecondTexture0->m_fTop_V0, pSecondTexture0->m_fRight_U0, pSecondTexture0->m_fBottom_V0 ); meshBuilder.AdvanceVertex(); - meshBuilder.FastIndex( info.m_nVertexOffset ); - meshBuilder.FastIndex( info.m_nVertexOffset + 1 ); - meshBuilder.FastIndex( info.m_nVertexOffset + 2 ); - meshBuilder.FastIndex( info.m_nVertexOffset ); - meshBuilder.FastIndex( info.m_nVertexOffset + 2 ); - meshBuilder.FastIndex( info.m_nVertexOffset + 3 ); + meshBuilder.FastQuad( info.m_nVertexOffset ); info.m_nVertexOffset += 4; } } diff --git a/src/public/materialsystem/imesh.h b/src/public/materialsystem/imesh.h index 3da443b4e..dffd1e419 100644 --- a/src/public/materialsystem/imesh.h +++ b/src/public/materialsystem/imesh.h @@ -3206,6 +3206,7 @@ class CMeshBuilder : public MeshDesc_t // Fast Index! 
No need to call advance index, and no random access allowed void FastIndex( unsigned short index ); + void FastQuad( int index ); // Fast Vertex! No need to call advance vertex, and no random access allowed. // WARNING - these are low level functions that are intended only for use @@ -3775,6 +3776,11 @@ FORCEINLINE void CMeshBuilder::FastIndex2( unsigned short nIndex1, unsigned shor m_IndexBuilder.FastIndex2( nIndex1, nIndex2 ); } +FORCEINLINE void CMeshBuilder::FastQuad( int nIndex ) +{ + m_IndexBuilder.FastQuad( nIndex ); +} + //----------------------------------------------------------------------------- // For use with the FastVertex methods, advances the current vertex by N //----------------------------------------------------------------------------- From 41671beaac61a57c8d5663f863c85568deca25b0 Mon Sep 17 00:00:00 2001 From: mastercoms Date: Thu, 9 Mar 2023 22:07:39 -0500 Subject: [PATCH 12/42] perf(vgui): font optimizations * a lot of profile time in the RB tree lookups for FontTextureCache and Win32Font * adapt the old Xbox 360 only ABC cache to be optional and lazily filled this allows us to have a fast array based lookup for the common characters * fix IsValidIndex check against Find results, we can just compare directly to InvalidIndex this fixes extra bounds checks taking up extra time for both extended cache lookups * backport the m_CommonCharCache from CSGO for FontTextureCache this similarly adds a fast array based lookup for common characters and also I modified the code from CSGO to include the InvalidIndex check fix here as well * in short, this eliminates most of the time used for font rendering and fitting --- src/common/vgui_surfacelib/Win32Font.h | 6 +- src/vgui2/vgui_surfacelib/Win32Font.cpp | 70 ++++++++++++---- src/vguimatsurface/FontTextureCache.cpp | 106 +++++++++++++++++------- src/vguimatsurface/FontTextureCache.h | 17 +++- 4 files changed, 149 insertions(+), 50 deletions(-) diff --git a/src/common/vgui_surfacelib/Win32Font.h b/src/common/vgui_surfacelib/Win32Font.h index 7122fa95a..b6c66eb8d 100644 --- a/src/common/vgui_surfacelib/Win32Font.h +++ b/src/common/vgui_surfacelib/Win32Font.h @@ -128,6 +128,8 @@ class CWin32Font char c; }; + enum { ABCWIDTHS_CACHE_SIZE = 256 }; + abc_t* m_ABCWidthsCache[ABCWIDTHS_CACHE_SIZE]; #if !defined( _X360 ) // On PC we cache char widths on demand when actually requested to minimize our use of the kernels // paged pool (GDI may cache information about glyphs we have requested and take up lots of paged pool) @@ -138,10 +140,6 @@ class CWin32Font }; CUtlRBTree m_ExtendedABCWidthsCache; static bool ExtendedABCWidthsCacheLessFunc(const abc_cache_t &lhs, const abc_cache_t &rhs); -#else - // 360 requires all possible characters during font init - enum { ABCWIDTHS_CACHE_SIZE = 256 }; - abc_t m_ABCWidthsCache[ABCWIDTHS_CACHE_SIZE]; #endif }; diff --git a/src/vgui2/vgui_surfacelib/Win32Font.cpp b/src/vgui2/vgui_surfacelib/Win32Font.cpp index 5ec02ac69..b6bedc36a 100644 --- a/src/vgui2/vgui_surfacelib/Win32Font.cpp +++ b/src/vgui2/vgui_surfacelib/Win32Font.cpp @@ -1,4 +1,4 @@ -//========= Copyright Valve Corporation, All rights reserved. ============// +//========= Copyright Valve Corporation, All rights reserved. 
============// // // Purpose: // @@ -48,9 +48,7 @@ CWin32Font::CWin32Font() : m_ExtendedABCWidthsCache(256, 0, &ExtendedABCWidthsCa m_bAdditive = false; m_rgiBitmapSize[ 0 ] = m_rgiBitmapSize[ 1 ] = 0; -#if defined( _X360 ) Q_memset( m_ABCWidthsCache, 0, sizeof( m_ABCWidthsCache ) ); -#endif m_ExtendedABCWidthsCache.EnsureCapacity( 128 ); @@ -84,6 +82,14 @@ CWin32Font::~CWin32Font() ::DeleteDC( m_hDC ); if ( m_hDIB ) ::DeleteObject( m_hDIB ); + +#ifndef _X360 + for (int i = 0; i < ARRAYSIZE(m_ABCWidthsCache); i++) + { + delete m_ABCWidthsCache[i]; + m_ABCWidthsCache[i] = NULL; + } +#endif } //----------------------------------------------------------------------------- @@ -242,6 +248,14 @@ bool CWin32Font::Create(const char *windowsFontName, int tall, int weight, int b } } } +#else + Assert(ABCWIDTHS_CACHE_SIZE <= 256); + Q_memset(m_ABCWidthsCache, 0, sizeof(m_ABCWidthsCache)); + for (int i = 0; i < ARRAYSIZE(m_ABCWidthsCache); i++) + { + delete m_ABCWidthsCache[i]; + m_ABCWidthsCache[i] = NULL; + } #endif return true; @@ -476,7 +490,7 @@ void CWin32Font::GetCharRGBA(wchar_t ch, int rgbaWide, int rgbaTall, unsigned ch //----------------------------------------------------------------------------- bool CWin32Font::IsEqualTo(const char *windowsFontName, int tall, int weight, int blur, int scanlines, int flags) { - if ( !stricmp(windowsFontName, m_szName.String() ) + if ( !V_stricmp(windowsFontName, m_szName.String() ) && m_iTall == tall && m_iWeight == weight && m_iBlur == blur @@ -512,23 +526,31 @@ void CWin32Font::SetAsActiveFont(HDC hdc) void CWin32Font::GetCharABCWidths(int ch, int &a, int &b, int &c) { Assert( IsValid() ); -#if defined( _X360 ) - if (ch < ABCWIDTHS_CACHE_SIZE) + bool bFastPath = ch < ABCWIDTHS_CACHE_SIZE; + bool bNeedsExtendedLookup = !bFastPath; + abc_cache_t finder; + if (bFastPath) { // use the cache entry - a = m_ABCWidthsCache[ch].a; - b = m_ABCWidthsCache[ch].b; - c = m_ABCWidthsCache[ch].c; + abc_t* p_abc = m_ABCWidthsCache[ch]; + if (p_abc) + { + abc_t& abc = *p_abc; + a = abc.a; + b = abc.b; + c = abc.c; + return; + } + bNeedsExtendedLookup = true; } - else -#endif + if (bNeedsExtendedLookup) { - // look for it in the cache - abc_cache_t finder = { (wchar_t)ch }; + finder = { (wchar_t)ch }; unsigned short i = m_ExtendedABCWidthsCache.Find(finder); - if (m_ExtendedABCWidthsCache.IsValidIndex(i)) + // This used to be IsValidIndex, but we're getting this right from Find so we don't need to do extra bounds checks. 
+ if ( i != m_ExtendedABCWidthsCache.InvalidIndex() ) { a = m_ExtendedABCWidthsCache[i].abc.a; b = m_ExtendedABCWidthsCache[i].abc.b; @@ -563,11 +585,25 @@ void CWin32Font::GetCharABCWidths(int ch, int &a, int &b, int &c) b = m_iMaxCharWidth; } } + } + char s_a = a - m_iBlur - m_iOutlineSize; + short s_b = b + ((m_iBlur + m_iOutlineSize) * 2) + m_iDropShadowOffset; + char s_c = c - m_iBlur - m_iDropShadowOffset - m_iOutlineSize; + + if (bFastPath) + { + m_ABCWidthsCache[ch] = new abc_t; + m_ABCWidthsCache[ch]->a = s_a; + m_ABCWidthsCache[ch]->b = s_b; + m_ABCWidthsCache[ch]->c = s_c; + } + else + { // add to the cache - finder.abc.a = a - m_iBlur - m_iOutlineSize; - finder.abc.b = b + ((m_iBlur + m_iOutlineSize) * 2) + m_iDropShadowOffset; - finder.abc.c = c - m_iBlur - m_iDropShadowOffset - m_iOutlineSize; + finder.abc.a = s_a; + finder.abc.b = s_b; + finder.abc.c = s_c; m_ExtendedABCWidthsCache.Insert(finder); } } diff --git a/src/vguimatsurface/FontTextureCache.cpp b/src/vguimatsurface/FontTextureCache.cpp index d94e9afae..1886d0f42 100644 --- a/src/vguimatsurface/FontTextureCache.cpp +++ b/src/vguimatsurface/FontTextureCache.cpp @@ -63,6 +63,7 @@ CON_COMMAND( mat_texture_outline_fonts, "Outline fonts textures." ) CFontTextureCache::CFontTextureCache() : m_CharCache(0, 256, CacheEntryLessFunc) { + V_memset(m_CommonCharCache, 0, sizeof(m_CommonCharCache)); Clear(); } @@ -95,6 +96,12 @@ void CFontTextureCache::Clear() } m_FontPages.SetLessFunc( DefLessFunc( vgui::HFont ) ); m_FontPages.RemoveAll(); + + for (int i = 0; i < ARRAYSIZE(m_CommonCharCache); i++) + { + delete m_CommonCharCache[i]; + m_CommonCharCache[i] = 0; + } } //----------------------------------------------------------------------------- @@ -208,19 +215,69 @@ bool CFontTextureCache::GetTextureForChars( vgui::HFont font, vgui::FontDrawType for ( int i = 0; i < numChars; i++ ) { - CacheEntry_t cacheItem; - cacheItem.font = font; - cacheItem.wch = wch[i]; - HCacheEntry cacheHandle = m_CharCache.Find( cacheItem ); - if ( ! 
m_CharCache.IsValidIndex( cacheHandle ) ) + wchar_t wideChar = wch[i]; + + int* pCachePage; + float* pCacheCoords; + + // profiling dicatated that avoiding the naive font/char RB lookup was beneficial + // instead waste a little memory to get all the western language chars to be direct + if (wideChar < MAX_COMMON_CHARS && font < ARRAYSIZE(m_CommonCharCache)) { - // All characters must come out of the same font - if ( winFont != FontManager().GetFontForChar( font, wch[i] ) ) - return false; + // dominant amount of simple chars are instant direct lookup + CommonChar_t* pCommonChars = m_CommonCharCache[font]; + if (!pCommonChars) + { + // missing + if (winFont != FontManager().GetFontForChar(font, wideChar)) + { + // all characters in string must come out of the same font + return false; + } + // init and insert + pCommonChars = new CommonChar_t; + memset(pCommonChars, 0, sizeof(CommonChar_t)); + m_CommonCharCache[font] = pCommonChars; + } + pCachePage = &pCommonChars->details[wideChar].page; + pCacheCoords = pCommonChars->details[wideChar].texCoords; + } + else + { + // extended chars are a costlier lookup + // page and char form a unique key to find in cache + CacheEntry_t cacheItem; + cacheItem.font = font; + cacheItem.wch = wideChar; + HCacheEntry cacheHandle = m_CharCache.Find(cacheItem); + if ( cacheHandle == m_CharCache.InvalidIndex() ) + { + // missing + if (winFont != FontManager().GetFontForChar(font, wideChar)) + { + // all characters in string must come out of the same font + return false; + } + + // init and insert + cacheItem.texCoords[0] = 0; + cacheItem.texCoords[1] = 0; + cacheItem.texCoords[2] = 0; + cacheItem.texCoords[3] = 0; + cacheHandle = m_CharCache.Insert(cacheItem); + Assert(m_CharCache.IsValidIndex(cacheHandle)); + } + pCachePage = &m_CharCache[cacheHandle].page; + pCacheCoords = m_CharCache[cacheHandle].texCoords; + } + + if ( pCacheCoords[2] == 0 && pCacheCoords[3] == 0 ) + { + // invalid page, setup for page allocation // get the char details int a, b, c; - winFont->GetCharABCWidths( wch[i], a, b, c ); + winFont->GetCharABCWidths( wideChar, a, b, c ); int fontWide = max( b, 1 ); int fontTall = max( winFont->GetHeight(), 1 ); if ( winFont->GetUnderlined() ) @@ -230,14 +287,14 @@ bool CFontTextureCache::GetTextureForChars( vgui::HFont font, vgui::FontDrawType // Get a texture to render into int page, drawX, drawY, twide, ttall; - if ( !AllocatePageForChar( fontWide, fontTall, page, drawX, drawY, twide, ttall ) ) + if ( !AllocatePageForChar(fontWide, fontTall, page, drawX, drawY, twide, ttall) ) return false; // accumulate data to pass to GetCharsRGBA below newEntries[ numNewChars ].page = page; newEntries[ numNewChars ].drawX = drawX; newEntries[ numNewChars ].drawY = drawY; - newChars[ numNewChars ].wch = wch[i]; + newChars[ numNewChars ].wch = wideChar; newChars[ numNewChars ].fontWide = fontWide; newChars[ numNewChars ].fontTall = fontTall; newChars[ numNewChars ].offset = 4*totalNewCharTexels; @@ -245,25 +302,18 @@ bool CFontTextureCache::GetTextureForChars( vgui::HFont font, vgui::FontDrawType maxNewCharTexels = max( maxNewCharTexels, fontWide*fontTall ); numNewChars++; - // set the cache info - cacheItem.page = page; - - // the 0.5 texel offset is done in CMatSystemTexture::SetMaterial() / CMatSystemSurface::StartDrawing() - double adjust = 0.0f; + // the 0.5 texel offset is done in CMatSystemTexture::SetMaterial() + pCacheCoords[0] = (float)( (double)drawX / ((double)twide) ); + pCacheCoords[1] = (float)( (double)drawY / ((double)ttall) ); + pCacheCoords[2] 
= (float)( (double)(drawX + fontWide) / (double)twide ); + pCacheCoords[3] = (float)( (double)(drawY + fontTall) / (double)ttall ); - cacheItem.texCoords[0] = (float)( (double)drawX / ((double)twide + adjust) ); - cacheItem.texCoords[1] = (float)( (double)drawY / ((double)ttall + adjust) ); - cacheItem.texCoords[2] = (float)( (double)(drawX + fontWide) / (double)twide ); - cacheItem.texCoords[3] = (float)( (double)(drawY + fontTall) / (double)ttall ); - - m_CharCache.Insert(cacheItem); - cacheHandle = m_CharCache.Find( cacheItem ); - Assert( m_CharCache.IsValidIndex( cacheHandle ) ); + *pCachePage = page; } - - int page = m_CharCache[cacheHandle].page; - textureID[i] = m_PageList[page].textureID[typePage]; - texCoords[i] = m_CharCache[cacheHandle].texCoords; + + // give data to caller + textureID[i] = m_PageList[*pCachePage].textureID[typePage]; + texCoords[i] = pCacheCoords; } // Generate texture data for all newly-encountered characters diff --git a/src/vguimatsurface/FontTextureCache.h b/src/vguimatsurface/FontTextureCache.h index 1f12e3ce2..f09c40171 100644 --- a/src/vguimatsurface/FontTextureCache.h +++ b/src/vguimatsurface/FontTextureCache.h @@ -18,6 +18,8 @@ class ITexture; +#define MAX_COMMON_CHARS 256 + //----------------------------------------------------------------------------- // Purpose: manages texture memory for unicode fonts in vgui //----------------------------------------------------------------------------- @@ -51,13 +53,24 @@ class CFontTextureCache FONT_PAGE_SIZE_COUNT, }; + // hold the common characters + struct charDetail_t + { + int page; + float texCoords[4]; + }; + struct CommonChar_t + { + charDetail_t details[MAX_COMMON_CHARS]; + }; + // a single character in the cache typedef unsigned short HCacheEntry; struct CacheEntry_t { vgui::HFont font; wchar_t wch; - unsigned char page; + int page; float texCoords[4]; // doubly-linked list for use in the LRU @@ -81,6 +94,8 @@ class CFontTextureCache // Creates font materials void CreateFontMaterials( Page_t &page, ITexture *pFontTexture, bool bitmapFont = false ); + CommonChar_t* m_CommonCharCache[384]; + // Computes the page size given a character height int ComputePageType( int charTall ) const; From 07ea2e18ffb11940780a85bfc8660c371d048488 Mon Sep 17 00:00:00 2001 From: mastercoms Date: Thu, 9 Mar 2023 22:22:40 -0500 Subject: [PATCH 13/42] perf(vgui): backport CMatSystemSurface::DrawQuadArray from CSGO * this greatly improves performance of various HUD element draws --- src/vguimatsurface/MatSystemSurface.cpp | 141 ++++++++++++++---------- 1 file changed, 80 insertions(+), 61 deletions(-) diff --git a/src/vguimatsurface/MatSystemSurface.cpp b/src/vguimatsurface/MatSystemSurface.cpp index 742659081..7775f3b7c 100644 --- a/src/vguimatsurface/MatSystemSurface.cpp +++ b/src/vguimatsurface/MatSystemSurface.cpp @@ -1092,80 +1092,99 @@ void CMatSystemSurface::DrawQuadArray( int quadCount, vgui::Vertex_t *pVerts, un if ( !m_pMesh ) return; - meshBuilder.Begin( m_pMesh, MATERIAL_QUADS, quadCount ); - vgui::Vertex_t ulc; vgui::Vertex_t lrc; vgui::Vertex_t *pulc; vgui::Vertex_t *plrc; - if ( bShouldClip ) - { - for ( int i = 0; i < quadCount; ++i ) - { - PREFETCH360( &pVerts[ 2 * ( i + 1 ) ], 0 ); - - if ( !ClipRect( pVerts[2*i], pVerts[2*i + 1], &ulc, &lrc ) ) - { - continue; - } - pulc = &ulc; - plrc = &lrc; - - meshBuilder.Position3f( pulc->m_Position.x, pulc->m_Position.y, m_flZPos ); - meshBuilder.Color4ubv( pColor ); - meshBuilder.TexCoord2f( 0, pulc->m_TexCoord.x, pulc->m_TexCoord.y ); - meshBuilder.AdvanceVertexF(); + 
int nMaxVertices, nMaxIndices; + CMatRenderContextPtr pRenderContext(g_pMaterialSystem); + pRenderContext->GetMaxToRender(m_pMesh, false, &nMaxVertices, &nMaxIndices); + if (!nMaxVertices || !nMaxIndices) + return; // probably in alt-tab - meshBuilder.Position3f( plrc->m_Position.x, pulc->m_Position.y, m_flZPos ); - meshBuilder.Color4ubv( pColor ); - meshBuilder.TexCoord2f( 0, plrc->m_TexCoord.x, pulc->m_TexCoord.y ); - meshBuilder.AdvanceVertexF(); + int nMaxQuads = nMaxVertices / 4; + nMaxQuads = MIN(nMaxQuads, nMaxIndices / 6); - meshBuilder.Position3f( plrc->m_Position.x, plrc->m_Position.y, m_flZPos ); - meshBuilder.Color4ubv( pColor ); - meshBuilder.TexCoord2f( 0, plrc->m_TexCoord.x, plrc->m_TexCoord.y ); - meshBuilder.AdvanceVertexF(); + int nFirstQuad = 0; + int nQuadsRemaining = quadCount; - meshBuilder.Position3f( pulc->m_Position.x, plrc->m_Position.y, m_flZPos ); - meshBuilder.Color4ubv( pColor ); - meshBuilder.TexCoord2f( 0, pulc->m_TexCoord.x, plrc->m_TexCoord.y ); - meshBuilder.AdvanceVertexF(); - } - } - else + while (nQuadsRemaining > 0) { - for ( int i = 0; i < quadCount; ++i ) + quadCount = MIN( nQuadsRemaining, nMaxQuads ); + meshBuilder.Begin( m_pMesh, MATERIAL_QUADS, quadCount ); + if ( bShouldClip ) { - PREFETCH360( &pVerts[ 2 * ( i + 1 ) ], 0 ); - - pulc = &pVerts[2*i]; - plrc = &pVerts[2*i + 1]; - - meshBuilder.Position3f( pulc->m_Position.x, pulc->m_Position.y, m_flZPos ); - meshBuilder.Color4ubv( pColor ); - meshBuilder.TexCoord2f( 0, pulc->m_TexCoord.x, pulc->m_TexCoord.y ); - meshBuilder.AdvanceVertexF(); - - meshBuilder.Position3f( plrc->m_Position.x, pulc->m_Position.y, m_flZPos ); - meshBuilder.Color4ubv( pColor ); - meshBuilder.TexCoord2f( 0, plrc->m_TexCoord.x, pulc->m_TexCoord.y ); - meshBuilder.AdvanceVertexF(); - - meshBuilder.Position3f( plrc->m_Position.x, plrc->m_Position.y, m_flZPos ); - meshBuilder.Color4ubv( pColor ); - meshBuilder.TexCoord2f( 0, plrc->m_TexCoord.x, plrc->m_TexCoord.y ); - meshBuilder.AdvanceVertexF(); + for ( int q = 0; q < quadCount; ++q ) + { + int i = q + nFirstQuad; + PREFETCH360( &pVerts[ 2 * ( i + 1 ) ], 0 ); - meshBuilder.Position3f( pulc->m_Position.x, plrc->m_Position.y, m_flZPos ); - meshBuilder.Color4ubv( pColor ); - meshBuilder.TexCoord2f( 0, pulc->m_TexCoord.x, plrc->m_TexCoord.y ); - meshBuilder.AdvanceVertexF(); + if ( !ClipRect( pVerts[2*i], pVerts[2*i + 1], &ulc, &lrc ) ) + { + continue; + } + pulc = &ulc; + plrc = &lrc; + + meshBuilder.Position3f( pulc->m_Position.x, pulc->m_Position.y, m_flZPos ); + meshBuilder.Color4ubv( pColor ); + meshBuilder.TexCoord2f( 0, pulc->m_TexCoord.x, pulc->m_TexCoord.y ); + meshBuilder.AdvanceVertexF(); + + meshBuilder.Position3f( plrc->m_Position.x, pulc->m_Position.y, m_flZPos ); + meshBuilder.Color4ubv( pColor ); + meshBuilder.TexCoord2f( 0, plrc->m_TexCoord.x, pulc->m_TexCoord.y ); + meshBuilder.AdvanceVertexF(); + + meshBuilder.Position3f( plrc->m_Position.x, plrc->m_Position.y, m_flZPos ); + meshBuilder.Color4ubv( pColor ); + meshBuilder.TexCoord2f( 0, plrc->m_TexCoord.x, plrc->m_TexCoord.y ); + meshBuilder.AdvanceVertexF(); + + meshBuilder.Position3f( pulc->m_Position.x, plrc->m_Position.y, m_flZPos ); + meshBuilder.Color4ubv( pColor ); + meshBuilder.TexCoord2f( 0, pulc->m_TexCoord.x, plrc->m_TexCoord.y ); + meshBuilder.AdvanceVertexF(); + } + } + else + { + for (int q = 0; q < quadCount; ++q) + { + int i = q + nFirstQuad; + PREFETCH360( &pVerts[ 2 * ( i + 1 ) ], 0 ); + + pulc = &pVerts[2*i]; + plrc = &pVerts[2*i + 1]; + + meshBuilder.Position3f( pulc->m_Position.x, 
pulc->m_Position.y, m_flZPos ); + meshBuilder.Color4ubv( pColor ); + meshBuilder.TexCoord2f( 0, pulc->m_TexCoord.x, pulc->m_TexCoord.y ); + meshBuilder.AdvanceVertexF(); + + meshBuilder.Position3f( plrc->m_Position.x, pulc->m_Position.y, m_flZPos ); + meshBuilder.Color4ubv( pColor ); + meshBuilder.TexCoord2f( 0, plrc->m_TexCoord.x, pulc->m_TexCoord.y ); + meshBuilder.AdvanceVertexF(); + + meshBuilder.Position3f( plrc->m_Position.x, plrc->m_Position.y, m_flZPos ); + meshBuilder.Color4ubv( pColor ); + meshBuilder.TexCoord2f( 0, plrc->m_TexCoord.x, plrc->m_TexCoord.y ); + meshBuilder.AdvanceVertexF(); + + meshBuilder.Position3f( pulc->m_Position.x, plrc->m_Position.y, m_flZPos ); + meshBuilder.Color4ubv( pColor ); + meshBuilder.TexCoord2f( 0, pulc->m_TexCoord.x, plrc->m_TexCoord.y ); + meshBuilder.AdvanceVertexF(); + } } - } - meshBuilder.End(); - m_pMesh->Draw(); + meshBuilder.End(); + m_pMesh->Draw(); + nFirstQuad += quadCount; + nQuadsRemaining -= quadCount; + } } From a81ea13b01fd4b3fefaee5321841aac3f8daee7f Mon Sep 17 00:00:00 2001 From: mastercoms Date: Thu, 9 Mar 2023 22:11:24 -0500 Subject: [PATCH 14/42] perf(init): move to CUtlSymbolLarge for dmxloader (CSGO backport) * the particle data loaded from dmx is huge, CUtlSymbolLarge is much more well suited for larger symbol tables * this optimizes particle loading time slightly * also backport the minor change to use the template GetString in dmxloader for size --- src/dmxloader/dmxattribute.cpp | 6 +++--- src/dmxloader/dmxelement.cpp | 9 +++++---- src/dmxloader/dmxloader.cpp | 6 +++--- src/public/dmxloader/dmxattribute.h | 11 ++++++----- src/public/dmxloader/dmxelement.h | 11 ++++++----- src/public/tier1/mempool.h | 6 +++--- 6 files changed, 26 insertions(+), 23 deletions(-) diff --git a/src/dmxloader/dmxattribute.cpp b/src/dmxloader/dmxattribute.cpp index aea474c79..4299d0f2e 100644 --- a/src/dmxloader/dmxattribute.cpp +++ b/src/dmxloader/dmxattribute.cpp @@ -15,7 +15,7 @@ //----------------------------------------------------------------------------- // globals //----------------------------------------------------------------------------- -CUtlSymbolTableMT CDmxAttribute::s_AttributeNameSymbols; +CUtlSymbolTableLargeMT CDmxAttribute::s_AttributeNameSymbols; //----------------------------------------------------------------------------- @@ -128,7 +128,7 @@ CDmxAttribute::CDmxAttribute( const char *pAttributeName ) m_pData = NULL; } -CDmxAttribute::CDmxAttribute( CUtlSymbol attributeName ) +CDmxAttribute::CDmxAttribute( CUtlSymbolLarge attributeName ) { m_Name = attributeName; m_Type = AT_UNKNOWN; @@ -221,7 +221,7 @@ inline const char* CDmxAttribute::GetTypeString() const //----------------------------------------------------------------------------- const char *CDmxAttribute::GetName() const { - return s_AttributeNameSymbols.String( m_Name ); + return m_Name.String(); } diff --git a/src/dmxloader/dmxelement.cpp b/src/dmxloader/dmxelement.cpp index 7537590f8..17a8b55a7 100644 --- a/src/dmxloader/dmxelement.cpp +++ b/src/dmxloader/dmxelement.cpp @@ -8,6 +8,7 @@ #include "dmxloader/dmxattribute.h" #include "tier1/utlbuffer.h" #include "mathlib/ssemath.h" +#include "tier1/utlsymbollarge.h" // memdbgon must be the last include file in a .cpp file!!! 
#include "tier0/memdbgon.h" @@ -16,7 +17,7 @@ //----------------------------------------------------------------------------- // globals //----------------------------------------------------------------------------- -CUtlSymbolTableMT CDmxElement::s_TypeSymbols; +CUtlSymbolTableLargeMT CDmxElement::s_TypeSymbols; //----------------------------------------------------------------------------- @@ -50,14 +51,14 @@ CDmxElement::~CDmxElement() //----------------------------------------------------------------------------- // Utility method for getting at the type //----------------------------------------------------------------------------- -CUtlSymbol CDmxElement::GetType() const +CUtlSymbolLarge CDmxElement::GetType() const { return m_Type; } const char* CDmxElement::GetTypeString() const { - return s_TypeSymbols.String( m_Type ); + return m_Type.String(); } const char* CDmxElement::GetName() const @@ -225,7 +226,7 @@ int CDmxElement::FindAttribute( const char *pAttributeName ) const //----------------------------------------------------------------------------- // Find an attribute by name-based lookup //----------------------------------------------------------------------------- -int CDmxElement::FindAttribute( CUtlSymbol attributeName ) const +int CDmxElement::FindAttribute( CUtlSymbolLarge attributeName ) const { Resort(); CDmxAttribute search( attributeName ); diff --git a/src/dmxloader/dmxloader.cpp b/src/dmxloader/dmxloader.cpp index 2083f3fec..a45b0f140 100644 --- a/src/dmxloader/dmxloader.cpp +++ b/src/dmxloader/dmxloader.cpp @@ -443,7 +443,7 @@ bool CDmxSerializer::Unserialize( CUtlBuffer &buf, int nEncodingVersion, CDmxEle } // Read in the element count. - int nElementCount = buf.GetInt(); + const int nElementCount = buf.GetInt(); if ( !nElementCount ) { // Empty (but valid) file @@ -474,10 +474,10 @@ bool CDmxSerializer::Unserialize( CUtlBuffer &buf, int nEncodingVersion, CDmxEle } else { - buf.GetString( pTypeBuf ); + buf.GetString<256>( pTypeBuf ); pType = pTypeBuf; } - buf.GetString( pName ); + buf.GetString<2048>( pName ); buf.Get( &id, sizeof(DmObjectId_t) ); CDmxElement *pElement = new CDmxElement( pType ); diff --git a/src/public/dmxloader/dmxattribute.h b/src/public/dmxloader/dmxattribute.h index 17523660e..78adea366 100644 --- a/src/public/dmxloader/dmxattribute.h +++ b/src/public/dmxloader/dmxattribute.h @@ -16,6 +16,7 @@ #include "tier1/utlrbtree.h" #include "tier1/utlsymbol.h" #include "tier1/mempool.h" +#include "utlsymbollarge.h" #include "dmxloader/dmxloader.h" @@ -48,7 +49,7 @@ class CDmxAttribute // Returns the name. 
NOTE: The utlsymbol // can be turned into a string by using g_pDataModel->String(); const char *GetName() const; - CUtlSymbol GetNameSymbol() const; + CUtlSymbolLarge GetNameSymbol() const; void SetName( const char *pName ); // Gets values @@ -89,7 +90,7 @@ class CDmxAttribute private: CDmxAttribute( const char *pAttributeName ); - CDmxAttribute( CUtlSymbol attributeName ); + CDmxAttribute( CUtlSymbolLarge attributeName ); ~CDmxAttribute(); // Allocate, free memory for data @@ -100,10 +101,10 @@ class CDmxAttribute void SetValue( DmAttributeType_t type, const void *pSrc, int nLen ); DmAttributeType_t m_Type; - CUtlSymbol m_Name; + CUtlSymbolLarge m_Name; void *m_pData; - static CUtlSymbolTableMT s_AttributeNameSymbols; + static CUtlSymbolTableLargeMT s_AttributeNameSymbols; friend class CDmxElement; }; @@ -122,7 +123,7 @@ template< class T > inline bool CDmxAttribute::IsA() const return GetType() == CDmAttributeInfo< T >::ATTRIBUTE_TYPE; } -inline CUtlSymbol CDmxAttribute::GetNameSymbol() const +inline CUtlSymbolLarge CDmxAttribute::GetNameSymbol() const { return m_Name; } diff --git a/src/public/dmxloader/dmxelement.h b/src/public/dmxloader/dmxelement.h index c4aadd8ac..bdb290b0e 100644 --- a/src/public/dmxloader/dmxelement.h +++ b/src/public/dmxloader/dmxelement.h @@ -15,6 +15,7 @@ #include "tier1/utlvector.h" #include "tier1/utlrbtree.h" #include "tier1/utlsymbol.h" +#include "utlsymbollarge.h" #include "tier1/mempool.h" #include "tier1/UtlSortVector.h" #include "dmxloader/dmxattribute.h" @@ -28,7 +29,7 @@ class CDmxAttributeLess public: bool Less( const CDmxAttribute * pAttribute1, const CDmxAttribute *pAttribute2, void *pContext ) { - return pAttribute1->GetNameSymbol() < pAttribute2->GetNameSymbol(); + return (pAttribute1 ? pAttribute1->GetNameSymbol() : CUtlSymbolLarge(UTL_INVAL_SYMBOL_LARGE)) < (pAttribute2 ? 
pAttribute2->GetNameSymbol() : CUtlSymbolLarge(UTL_INVAL_SYMBOL_LARGE)); } }; @@ -105,7 +106,7 @@ class CDmxElement int AttributeCount() const; CDmxAttribute *GetAttribute( int nIndex ); const CDmxAttribute *GetAttribute( int nIndex ) const; - CUtlSymbol GetType() const; + CUtlSymbolLarge GetType() const; const char* GetTypeString() const; const char* GetName() const; const DmObjectId_t &GetId() const; @@ -161,7 +162,7 @@ class CDmxElement // Finds an attribute by name int FindAttribute( const char *pAttributeName ) const; - int FindAttribute( CUtlSymbol attributeName ) const; + int FindAttribute( CUtlSymbolLarge attributeName ) const; // Sets the object id void SetId( const DmObjectId_t &id ); @@ -171,12 +172,12 @@ class CDmxElement AttributeList_t m_Attributes; DmObjectId_t m_Id; // We need this strictly because we support serialization - CUtlSymbol m_Type; + CUtlSymbolLarge m_Type; char m_nLockCount; mutable bool m_bResortNeeded : 1; bool m_bIsMarkedForDeletion : 1; - static CUtlSymbolTableMT s_TypeSymbols; + static CUtlSymbolTableLargeMT s_TypeSymbols; friend class CDmxSerializer; friend class CDmxSerializerKeyValues2; diff --git a/src/public/tier1/mempool.h b/src/public/tier1/mempool.h index 01d3a33f1..a8ae22fd4 100644 --- a/src/public/tier1/mempool.h +++ b/src/public/tier1/mempool.h @@ -66,8 +66,8 @@ class CUtlMemoryPool static void SetErrorReportFunc( MemoryPoolReportFunc_t func ); // returns number of allocated blocks - int Count() { return m_BlocksAllocated; } - int PeakCount() { return m_PeakAlloc; } + int Count() const { return m_BlocksAllocated; } + int PeakCount() const { return m_PeakAlloc; } protected: class CBlob @@ -111,7 +111,7 @@ class CUtlMemoryPool class CMemoryPoolMT : public CUtlMemoryPool { public: - CMemoryPoolMT(int blockSize, int numElements, int growMode = UTLMEMORYPOOL_GROW_FAST, const char *pszAllocOwner = NULL) : CUtlMemoryPool( blockSize, numElements, growMode, pszAllocOwner) {} + CMemoryPoolMT(int blockSize, int numElements, int growMode = UTLMEMORYPOOL_GROW_FAST, const char *pszAllocOwner = NULL, int nAlignment = 0) : CUtlMemoryPool( blockSize, numElements, growMode, pszAllocOwner, nAlignment) {} void* Alloc() { AUTO_LOCK( m_mutex ); return CUtlMemoryPool::Alloc(); } From 4f63036d229929c87987a82a021dfe5ed84f829e Mon Sep 17 00:00:00 2001 From: mastercoms Date: Thu, 9 Mar 2023 22:30:06 -0500 Subject: [PATCH 15/42] perf(init): stop using a mutex during search paths * I don't think there's any IO threading here that would warrant the need for mutexing this, as all platforms now use a single IO thread * this greatly improves initialization time when scanning paths --- src/filesystem/basefilesystem.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/filesystem/basefilesystem.h b/src/filesystem/basefilesystem.h index e413db7da..b7b3529e7 100644 --- a/src/filesystem/basefilesystem.h +++ b/src/filesystem/basefilesystem.h @@ -580,10 +580,8 @@ abstract_class CBaseFileSystem : public CTier1AppSystem< IFileSystem > if ( *ppszFilename && !Q_IsAbsolutePath( *ppszFilename ) ) { - // Copy paths to minimize mutex lock time - pFileSystem->m_SearchPathsMutex.Lock(); + // Copy to keep filesystem intact CopySearchPaths( pFileSystem->m_SearchPaths ); - pFileSystem->m_SearchPathsMutex.Unlock(); pFileSystem->FixUpPath ( *ppszFilename, m_Filename, sizeof( m_Filename ) ); } @@ -611,10 +609,8 @@ abstract_class CBaseFileSystem : public CTier1AppSystem< IFileSystem > { m_pathID = UTL_INVAL_SYMBOL; } - // Copy paths to minimize mutex lock time - 
pFileSystem->m_SearchPathsMutex.Lock(); + // Copy to keep filesystem intact CopySearchPaths( pFileSystem->m_SearchPaths ); - pFileSystem->m_SearchPathsMutex.Unlock(); m_Filename[0] = '\0'; } From d27e190d282646a6a0c0096eef4fba288b0385ca Mon Sep 17 00:00:00 2001 From: mastercoms Date: Thu, 9 Mar 2023 22:19:22 -0500 Subject: [PATCH 16/42] perf(init): lazily load parsed data for game DLL when on a listen server during startup, the particles manifest and item schema were being parsed in both the server DLL and client DLL on startup. this greatly delays a player's ability quickly launch the game and queue into a match. so, we can instead just skip initializing for server DLL until we need the manifests at map load. particles are already parsed specifically for the map at level init, and pending item schema updates from the GC are also initialized at level init, so this should be a fine change. it also shouldn't touch the dedicated server path. as an added boost, we now also backport a change from CSGO which added threaded init to the client. this completely hides the particle init cost behind item schema parsing time, which is the longer of the two initializations. --- src/game/client/cdll_client_int.cpp | 15 +++++++++++++-- src/game/server/gameinterface.cpp | 19 ++++++++++++++++++- src/game/shared/econ/econ_item_inventory.cpp | 11 ++++++++++- src/game/shared/econ/econ_item_inventory.h | 2 ++ src/game/shared/tf/tf_item_inventory.cpp | 5 +++++ src/game/shared/tf/tf_item_inventory.h | 1 + 6 files changed, 49 insertions(+), 4 deletions(-) diff --git a/src/game/client/cdll_client_int.cpp b/src/game/client/cdll_client_int.cpp index 625cbd120..b6ba114da 100644 --- a/src/game/client/cdll_client_int.cpp +++ b/src/game/client/cdll_client_int.cpp @@ -60,6 +60,7 @@ #include "datacache/imdlcache.h" #include "kbutton.h" #include "tier0/icommandline.h" +#include "vstdlib/jobthread.h" #include "gamerules_register.h" #include "vgui_controls/AnimationController.h" #include "bitmap/tgawriter.h" @@ -838,6 +839,14 @@ bool IsEngineThreaded() return false; } +bool InitParticleManager() +{ + if (!ParticleMgr()->Init(MAX_TOTAL_PARTICLES, materials)) + return false; + + return true; +} + //----------------------------------------------------------------------------- // Constructor //----------------------------------------------------------------------------- @@ -991,8 +1000,8 @@ int CHLClient::Init( CreateInterfaceFn appSystemFactory, CreateInterfaceFn physi if (!Initializer::InitializeAllObjects()) return false; - if (!ParticleMgr()->Init(MAX_TOTAL_PARTICLES, materials)) - return false; + CFunctorJob *pGameJob = new CFunctorJob( CreateFunctor( InitParticleManager ) ); + g_pThreadPool->AddJob( pGameJob ); if (!VGui_Startup( appSystemFactory )) @@ -1035,6 +1044,8 @@ int CHLClient::Init( CreateInterfaceFn appSystemFactory, CreateInterfaceFn physi modemanager->Init( ); + pGameJob->WaitForFinishAndRelease(); + g_pClientMode->InitViewport(); gHUD.Init(); diff --git a/src/game/server/gameinterface.cpp b/src/game/server/gameinterface.cpp index 509fd9058..d352ff281 100644 --- a/src/game/server/gameinterface.cpp +++ b/src/game/server/gameinterface.cpp @@ -565,6 +565,8 @@ EXPOSE_SINGLE_INTERFACE_GLOBALVAR(CServerGameDLL, IServerGameDLL, INTERFACEVERSI // When bumping the version to this interface, check that our assumption is still valid and expose the older version in the same way COMPILE_TIME_ASSERT( INTERFACEVERSION_SERVERGAMEDLL_INT == 10 ); +static bool bParsedParticles = false; + bool CServerGameDLL::DLLInit( 
CreateInterfaceFn appSystemFactory, CreateInterfaceFn physicsFactory, CreateInterfaceFn fileSystemFactory, CGlobalVars *pGlobals) @@ -727,7 +729,11 @@ bool CServerGameDLL::DLLInit( CreateInterfaceFn appSystemFactory, InvalidateQueryCache(); // Parse the particle manifest file & register the effects within it - ParseParticleEffects( false, false ); + if ( engine->IsDedicatedServer() ) + { + ParseParticleEffects( false, false ); + bParsedParticles = true; + } // try to get debug overlay, may be NULL if on HLDS debugoverlay = (IVDebugOverlay *)appSystemFactory( VDEBUG_OVERLAY_INTERFACE_VERSION, NULL ); @@ -958,6 +964,11 @@ bool CServerGameDLL::LevelInit( const char *pMapName, char const *pMapEntities, if ( pItemSchema ) { pItemSchema->BInitFromDelayedBuffer(); + // First valid class must be non-zero if we have a valid schema + if ( pItemSchema->GetFirstValidClass() == 0 ) + { + InventoryManager()->InitializeInventory(); + } } #endif // USES_ECON_ITEMS @@ -970,6 +981,12 @@ bool CServerGameDLL::LevelInit( const char *pMapName, char const *pMapEntities, UpdateRichPresence(); } + if ( !bParsedParticles ) + { + ParseParticleEffects( false, false ); + bParsedParticles = true; + } + //Tony; parse custom manifest if exists! ParseParticleEffectsMap( pMapName, false ); diff --git a/src/game/shared/econ/econ_item_inventory.cpp b/src/game/shared/econ/econ_item_inventory.cpp index 26732e783..1bc2b2b93 100644 --- a/src/game/shared/econ/econ_item_inventory.cpp +++ b/src/game/shared/econ/econ_item_inventory.cpp @@ -303,6 +303,16 @@ bool CInventoryManager::Init( void ) // Purpose: //----------------------------------------------------------------------------- void CInventoryManager::PostInit( void ) +{ +#ifdef GAME_DLL + if ( engine->IsDedicatedServer() ) +#endif + { + InitializeInventory(); + } +} + +void CInventoryManager::InitializeInventory() { // Initialize the item system. 
ItemSystem()->Init(); @@ -443,7 +453,6 @@ void CInventoryManager::LevelShutdownPostEntity( void ) ItemSystem()->ResetAttribStringCache(); } - //----------------------------------------------------------------------------- // Purpose: Lets the client know that we're now connected to the GC //----------------------------------------------------------------------------- diff --git a/src/game/shared/econ/econ_item_inventory.h b/src/game/shared/econ/econ_item_inventory.h index fd6e8b8b1..4b225a43f 100644 --- a/src/game/shared/econ/econ_item_inventory.h +++ b/src/game/shared/econ/econ_item_inventory.h @@ -216,6 +216,8 @@ class CInventoryManager : public CAutoGameSystemPerFrame virtual void LevelInitPreEntity( void ) OVERRIDE; virtual void LevelShutdownPostEntity( void ) OVERRIDE; + virtual void InitializeInventory( void ); + #ifdef CLIENT_DLL // Gets called each frame virtual void Update( float frametime ) OVERRIDE; diff --git a/src/game/shared/tf/tf_item_inventory.cpp b/src/game/shared/tf/tf_item_inventory.cpp index 5b77ae604..0e321f959 100644 --- a/src/game/shared/tf/tf_item_inventory.cpp +++ b/src/game/shared/tf/tf_item_inventory.cpp @@ -220,6 +220,11 @@ CTFInventoryManager::~CTFInventoryManager( void ) void CTFInventoryManager::PostInit( void ) { BaseClass::PostInit(); +} + +void CTFInventoryManager::InitializeInventory() +{ + BaseClass::InitializeInventory(); GenerateBaseItems(); } diff --git a/src/game/shared/tf/tf_item_inventory.h b/src/game/shared/tf/tf_item_inventory.h index 8d1f30516..3b14189d7 100644 --- a/src/game/shared/tf/tf_item_inventory.h +++ b/src/game/shared/tf/tf_item_inventory.h @@ -150,6 +150,7 @@ class CTFInventoryManager : public CInventoryManager ~CTFInventoryManager(); virtual void PostInit( void ); + virtual void InitializeInventory(); #ifdef CLIENT_DLL virtual CPlayerInventory *GeneratePlayerInventoryObject() const { return new CTFPlayerInventory; } From 153c113437717fc3d613d50e47fb5e65a0b81e05 Mon Sep 17 00:00:00 2001 From: mastercoms Date: Thu, 9 Mar 2023 22:28:31 -0500 Subject: [PATCH 17/42] perf: stricmp optimizations * backported FastASCIIToLower from CSGO and applied it appropriately to strtools * backported _V_stricmp conditional structure from CSGO but adjusted it to use a fast ASCII lookup table and also use tolower on non-ASCII characters for general safety * select a few choice cases to migrate from stricmp to V_stricmp CSGO applies it to everything but these are few hotspots --- src/tier1/strtools.cpp | 85 +++++++++++++++++------- src/vgui2/src/LocalizedStringTable.cpp | 2 +- src/vgui2/vgui_controls/Panel.cpp | 2 +- src/vguimatsurface/TextureDictionary.cpp | 2 +- src/vstdlib/KeyValuesSystem.cpp | 2 +- 5 files changed, 65 insertions(+), 28 deletions(-) diff --git a/src/tier1/strtools.cpp b/src/tier1/strtools.cpp index a63fa4fcd..62c4d134f 100644 --- a/src/tier1/strtools.cpp +++ b/src/tier1/strtools.cpp @@ -80,20 +80,20 @@ #endif #include "tier0/memdbgon.h" -static int FastToLower( char c ) -{ - int i = (unsigned char) c; - if ( i < 0x80 ) - { - // Brutally fast branchless ASCII tolower(): - i += (((('A'-1) - i) & (i - ('Z'+1))) >> 26) & 0x20; - } - else - { - i += isupper( i ) ? 0x20 : 0; - } - return i; -} +#define USE_FAST_CASE_CONVERSION 1 +#if USE_FAST_CASE_CONVERSION +/// Faster conversion of an ascii char to upper case. This function does not obey locale or any language +/// setting. It should not be used to convert characters for printing, but it is a better choice +/// for internal strings such as used for hash table keys, etc. 
It's meant to be inlined and used +/// in places like the various dictionary classes. Not obeying locale also protects you from things +/// like your hash values being different depending on the locale setting. +#define FastASCIIToUpper( c ) ( ( ( (c) >= 'a' ) && ( (c) <= 'z' ) ) ? ( (c) - 32 ) : (c) ) +/// similar to FastASCIIToLower +#define FastASCIIToLower( c ) ( ( ( (c) >= 'A' ) && ( (c) <= 'Z' ) ) ? ( (c) + 32 ) : (c) ) +#else +#define FastASCIIToLower tolower +#define FastASCIIToUpper toupper +#endif void _V_memset (const char* file, int line, void *dest, int fill, int count) { @@ -260,6 +260,17 @@ char *V_strnlwr(char *s, size_t count) return pRet; } +static constexpr uint8 lowerAsciiLookup[128] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F, + 0x40, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', + 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 0x5B, 0x5C, 0x5D, 0x5E, 0x5F, + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F +}; + int V_stricmp( const char *str1, const char *str2 ) { // It is not uncommon to compare a string to itself. See @@ -272,6 +283,7 @@ int V_stricmp( const char *str1, const char *str2 ) } const unsigned char *s1 = (const unsigned char*)str1; const unsigned char *s2 = (const unsigned char*)str2; +#if 0 for ( ; *s1; ++s1, ++s2 ) { if ( *s1 != *s2 ) @@ -291,6 +303,31 @@ int V_stricmp( const char *str1, const char *str2 ) } } return *s2 ? 
-1 : 0; +#else + while (true) + { + unsigned char c1 = *s1++; + unsigned char c2 = *s2++; + if (c1 == c2) + { + if ( !c1 ) return 0; + } + else if ((((uint32)c1 | (uint32)c2) & 0xffffff80) == 0) + { + if (int32 res = lowerAsciiLookup[c1] - lowerAsciiLookup[c2]) + { + return res; + } + } + else + { + if (int32 res = tolower(c1) - tolower(c2)) + { + return res; + } + } + } +#endif } int V_strnicmp( const char *str1, const char *str2, int n ) @@ -348,7 +385,7 @@ const char *StringAfterPrefix( const char *str, const char *prefix ) if ( !*prefix ) return str; } - while ( FastToLower( *str++ ) == FastToLower( *prefix++ ) ); + while ( tolower( *str++ ) == tolower( *prefix++ ) ); return NULL; } @@ -638,7 +675,7 @@ char const* V_stristr( char const* pStr, char const* pSearch ) while (*pLetter != 0) { // Skip over non-matches - if (FastToLower((unsigned char)*pLetter) == FastToLower((unsigned char)*pSearch)) + if (FastASCIIToLower((unsigned char)*pLetter) == FastASCIIToLower((unsigned char)*pSearch)) { // Check for match char const* pMatch = pLetter + 1; @@ -649,7 +686,7 @@ char const* V_stristr( char const* pStr, char const* pSearch ) if (*pMatch == 0) return 0; - if (FastToLower((unsigned char)*pMatch) != FastToLower((unsigned char)*pTest)) + if (FastASCIIToLower((unsigned char)*pMatch) != FastASCIIToLower((unsigned char)*pTest)) break; ++pMatch; @@ -696,7 +733,7 @@ char const* V_strnistr( char const* pStr, char const* pSearch, int n ) return 0; // Skip over non-matches - if (FastToLower(*pLetter) == FastToLower(*pSearch)) + if (FastASCIIToLower(*pLetter) == FastASCIIToLower(*pSearch)) { int n1 = n - 1; @@ -712,7 +749,7 @@ char const* V_strnistr( char const* pStr, char const* pSearch, int n ) if (*pMatch == 0) return 0; - if (FastToLower(*pMatch) != FastToLower(*pTest)) + if (FastASCIIToLower(*pMatch) != FastASCIIToLower(*pTest)) break; ++pMatch; @@ -1421,7 +1458,7 @@ int _V_UCS2ToUnicode( const ucs2 *pUCS2, wchar_t *pUnicode, int cubDestSizeInByt size_t nMaxUTF8 = cubDestSizeInBytes; char *pIn = (char *)pUCS2; char *pOut = (char *)pUnicode; - if ( conv_t > 0 ) + if ( conv_t != nullptr ) { cchResult = iconv( conv_t, &pIn, &nLenUnicde, &pOut, &nMaxUTF8 ); iconv_close( conv_t ); @@ -1461,7 +1498,7 @@ int _V_UnicodeToUCS2( const wchar_t *pUnicode, int cubSrcInBytes, char *pUCS2, i size_t nMaxUCS2 = cubDestSizeInBytes; char *pIn = (char*)pUnicode; char *pOut = pUCS2; - if ( conv_t > 0 ) + if ( conv_t != nullptr ) { cchResult = iconv( conv_t, &pIn, &nLenUnicde, &pOut, &nMaxUCS2 ); iconv_close( conv_t ); @@ -1509,7 +1546,7 @@ int _V_UCS2ToUTF8( const ucs2 *pUCS2, char *pUTF8, int cubDestSizeInBytes ) size_t nMaxUTF8 = cubDestSizeInBytes - 1; char *pIn = (char *)pUCS2; char *pOut = (char *)pUTF8; - if ( conv_t > 0 ) + if ( conv_t != nullptr ) { const size_t nBytesToWrite = nMaxUTF8; cchResult = iconv( conv_t, &pIn, &nLenUnicde, &pOut, &nMaxUTF8 ); @@ -1554,7 +1591,7 @@ int _V_UTF8ToUCS2( const char *pUTF8, int cubSrcInBytes, ucs2 *pUCS2, int cubDes size_t nMaxUTF8 = cubDestSizeInBytes; char *pIn = (char *)pUTF8; char *pOut = (char *)pUCS2; - if ( conv_t > 0 ) + if ( conv_t != nullptr ) { cchResult = iconv( conv_t, &pIn, &nLenUnicde, &pOut, &nMaxUTF8 ); iconv_close( conv_t ); @@ -2275,7 +2312,7 @@ bool V_MakeRelativePath( const char *pFullPath, const char *pDirectory, char *pR // Strip out common parts of the path const char *pLastCommonPath = NULL; const char *pLastCommonDir = NULL; - while ( *pPath && ( FastToLower( *pPath ) == FastToLower( *pDir ) || + while ( *pPath && ( tolower( *pPath ) == tolower( 
*pDir ) || ( PATHSEPARATOR( *pPath ) && ( PATHSEPARATOR( *pDir ) || (*pDir == 0) ) ) ) ) { if ( PATHSEPARATOR( *pPath ) ) diff --git a/src/vgui2/src/LocalizedStringTable.cpp b/src/vgui2/src/LocalizedStringTable.cpp index 4e34164f9..2f6effd75 100644 --- a/src/vgui2/src/LocalizedStringTable.cpp +++ b/src/vgui2/src/LocalizedStringTable.cpp @@ -731,7 +731,7 @@ bool CLocalizedStringTable::SymLess(localizedstring_t const &i1, localizedstring const char *str2 = (i2.nameIndex == INVALID_LOCALIZE_STRING_INDEX) ? i2.pszValueString : &g_StringTable.m_Names[i2.nameIndex]; - return stricmp(str1, str2) < 0; + return V_stricmp(str1, str2) < 0; } diff --git a/src/vgui2/vgui_controls/Panel.cpp b/src/vgui2/vgui_controls/Panel.cpp index 2d20a355a..28516440e 100644 --- a/src/vgui2/vgui_controls/Panel.cpp +++ b/src/vgui2/vgui_controls/Panel.cpp @@ -6343,7 +6343,7 @@ PanelAnimationMapEntry *Panel::FindPanelAnimationEntry( char const *scriptname, { PanelAnimationMapEntry *e = &map->entries[ i ]; - if ( !stricmp( e->name(), scriptname ) ) + if ( !V_stricmp( e->name(), scriptname ) ) { return e; } diff --git a/src/vguimatsurface/TextureDictionary.cpp b/src/vguimatsurface/TextureDictionary.cpp index d8fe072d1..eda60e88e 100644 --- a/src/vguimatsurface/TextureDictionary.cpp +++ b/src/vguimatsurface/TextureDictionary.cpp @@ -983,7 +983,7 @@ int CTextureDictionary::FindTextureIdForTextureFile( char const *pFileName ) if ( !mat ) continue; - if ( !stricmp( mat->GetName(), pFileName ) ) + if ( ! V_stricmp( mat->GetName(), pFileName ) ) return i; } diff --git a/src/vstdlib/KeyValuesSystem.cpp b/src/vstdlib/KeyValuesSystem.cpp index 0665d94f5..4b8f16d03 100644 --- a/src/vstdlib/KeyValuesSystem.cpp +++ b/src/vstdlib/KeyValuesSystem.cpp @@ -235,7 +235,7 @@ HKeySymbol CKeyValuesSystem::GetSymbolForString( const char *name, bool bCreate hash_item_t *item = &m_HashTable[hash]; while (1) { - if (!stricmp(name, (char *)m_Strings.GetBase() + item->stringIndex )) + if (!V_stricmp(name, (char *)m_Strings.GetBase() + item->stringIndex )) { return (HKeySymbol)item->stringIndex; } From b379961aece998da89e061193af7a7b7e3b24ae7 Mon Sep 17 00:00:00 2001 From: mastercoms Date: Thu, 9 Mar 2023 22:22:00 -0500 Subject: [PATCH 18/42] perf: make various SharedObjects final to avoid virtual calls in many cases, certain SharedObject classes are used directly, so there is no need to virtualize calls in these circumstances. 
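as a rough sketch of the general idea (hypothetical names for illustration only, not code from this patch), marking the leaf class final lets the compiler prove no further overrides exist, so calls made through that exact type can be resolved directly instead of through the vtable:

    // illustration only: a leaf class marked final cannot be subclassed,
    // so calls through a pointer/reference of that exact static type can be
    // devirtualized (and potentially inlined) rather than dispatched virtually.
    struct ISharedObjectLike
    {
        virtual ~ISharedObjectLike() {}
        virtual int GetTypeID() const = 0;
    };

    struct CEconItemLike final : ISharedObjectLike
    {
        int GetTypeID() const override { return 1; }
    };

    int QueryTypeID( const CEconItemLike &item )
    {
        // static type is final, so this resolves to CEconItemLike::GetTypeID directly
        return item.GetTypeID();
    }
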
this saves a bit of runtime performance especially with CEconItem i didn't finalize the classes whose types were not used directly --- src/game/shared/econ/econ_contribution.h | 2 +- src/game/shared/econ/econ_game_account_client.h | 2 +- src/game/shared/econ/econ_item.h | 2 +- src/game/shared/tf/tf_duel_summary.h | 2 +- src/game/shared/tf/tf_ladder_data.h | 4 ++-- src/game/shared/tf/tf_lobby_server.h | 2 +- src/game/shared/tf/tf_party.h | 2 +- src/game/shared/tf/tf_rating_data.h | 2 +- src/game/shared/tf/tf_wardata.h | 2 +- 9 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/game/shared/econ/econ_contribution.h b/src/game/shared/econ/econ_contribution.h index b46f87f47..9721748f9 100644 --- a/src/game/shared/econ/econ_contribution.h +++ b/src/game/shared/econ/econ_contribution.h @@ -22,7 +22,7 @@ namespace GCSDK //--------------------------------------------------------------------------------- // Purpose: All the account-level information that the GC tracks for TF //--------------------------------------------------------------------------------- -class CTFMapContribution : public GCSDK::CProtoBufSharedObject< CSOTFMapContribution, k_EEconTypeMapContribution > +class CTFMapContribution final : public GCSDK::CProtoBufSharedObject< CSOTFMapContribution, k_EEconTypeMapContribution > { #ifdef GC DECLARE_CLASS_MEMPOOL( CTFMapContribution ); diff --git a/src/game/shared/econ/econ_game_account_client.h b/src/game/shared/econ/econ_game_account_client.h index 372086816..29bb8b478 100644 --- a/src/game/shared/econ/econ_game_account_client.h +++ b/src/game/shared/econ/econ_game_account_client.h @@ -17,7 +17,7 @@ //--------------------------------------------------------------------------------- // Purpose: All the account-level information that the GC tracks //--------------------------------------------------------------------------------- -class CEconGameAccountClient : public GCSDK::CProtoBufSharedObject< CSOEconGameAccountClient, k_EEconTypeGameAccountClient > +class CEconGameAccountClient final : public GCSDK::CProtoBufSharedObject< CSOEconGameAccountClient, k_EEconTypeGameAccountClient > { #ifdef GC DECLARE_CLASS_MEMPOOL( CEconGameAccountClient ); diff --git a/src/game/shared/econ/econ_item.h b/src/game/shared/econ/econ_item.h index ae3270edf..2e1aea098 100644 --- a/src/game/shared/econ/econ_item.h +++ b/src/game/shared/econ/econ_item.h @@ -282,7 +282,7 @@ template < typename T > uint32 WrapDeprecatedUntypedEconItemAttribute( T tValue template < typename TAttribInMemoryType > schema_attribute_stat_bucket_t ISchemaAttributeTypeBase::s_InstanceStats; -class CEconItem : public GCSDK::CSharedObject, public CMaterialOverrideContainer< IEconItemInterface > +class CEconItem final : public GCSDK::CSharedObject, public CMaterialOverrideContainer< IEconItemInterface > { #ifdef GC_DLL DECLARE_CLASS_MEMPOOL( CEconItem ); diff --git a/src/game/shared/tf/tf_duel_summary.h b/src/game/shared/tf/tf_duel_summary.h index 4529423c0..b209a607e 100644 --- a/src/game/shared/tf/tf_duel_summary.h +++ b/src/game/shared/tf/tf_duel_summary.h @@ -44,7 +44,7 @@ const uint32 kWinsPerLevel = 10; //--------------------------------------------------------------------------------- // Purpose: //--------------------------------------------------------------------------------- -class CTFDuelSummary : public GCSDK::CProtoBufSharedObject< CSOTFDuelSummary, k_EEconTypeDuelSummary > +class CTFDuelSummary final : public GCSDK::CProtoBufSharedObject< CSOTFDuelSummary, k_EEconTypeDuelSummary > { #ifdef GC 
DECLARE_CLASS_MEMPOOL( CTFDuelSummary ); diff --git a/src/game/shared/tf/tf_ladder_data.h b/src/game/shared/tf/tf_ladder_data.h index a1b672a1d..d2ea2d02a 100644 --- a/src/game/shared/tf/tf_ladder_data.h +++ b/src/game/shared/tf/tf_ladder_data.h @@ -27,7 +27,7 @@ //--------------------------------------------------------------------------------- // Purpose: The shared object that contains a ladder player's stats //--------------------------------------------------------------------------------- -class CSOTFLadderData : public GCSDK::CProtoBufSharedObject< CSOTFLadderPlayerStats, k_EEConTypeLadderData > +class CSOTFLadderData final : public GCSDK::CProtoBufSharedObject< CSOTFLadderPlayerStats, k_EEConTypeLadderData > { public: CSOTFLadderData(); @@ -55,7 +55,7 @@ CSOTFLadderData *GetLocalPlayerLadderData( EMatchGroup nMatchGroup ); // TODO: G //--------------------------------------------------------------------------------- // Purpose: The shared object that contains stats from a specific match - for match history on the client //--------------------------------------------------------------------------------- -class CSOTFMatchResultPlayerInfo : public GCSDK::CProtoBufSharedObject< CSOTFMatchResultPlayerStats, k_EEConTypeMatchResultPlayerInfo > +class CSOTFMatchResultPlayerInfo final : public GCSDK::CProtoBufSharedObject< CSOTFMatchResultPlayerStats, k_EEConTypeMatchResultPlayerInfo > { public: CSOTFMatchResultPlayerInfo(); diff --git a/src/game/shared/tf/tf_lobby_server.h b/src/game/shared/tf/tf_lobby_server.h index 8252b8397..dffb18788 100644 --- a/src/game/shared/tf/tf_lobby_server.h +++ b/src/game/shared/tf/tf_lobby_server.h @@ -15,7 +15,7 @@ #include "tf_matchmaking_shared.h" #include "playergroup.h" -class CTFGSLobby : public GCSDK::CProtoBufSharedObject +class CTFGSLobby final : public GCSDK::CProtoBufSharedObject { typedef GCSDK::CProtoBufSharedObject BaseClass; public: diff --git a/src/game/shared/tf/tf_party.h b/src/game/shared/tf/tf_party.h index f391773d4..bfca53dcc 100644 --- a/src/game/shared/tf/tf_party.h +++ b/src/game/shared/tf/tf_party.h @@ -24,7 +24,7 @@ namespace GCSDK const int k_nTFPartyMaxSize = 6; -class CTFParty : public GCSDK::CProtoBufSharedObject, public GCSDK::IParty +class CTFParty final : public GCSDK::CProtoBufSharedObject, public GCSDK::IParty { #ifdef GC DECLARE_CLASS_MEMPOOL( CTFParty ); diff --git a/src/game/shared/tf/tf_rating_data.h b/src/game/shared/tf/tf_rating_data.h index 0d1b53833..bfeaf306b 100644 --- a/src/game/shared/tf/tf_rating_data.h +++ b/src/game/shared/tf/tf_rating_data.h @@ -21,7 +21,7 @@ //--------------------------------------------------------------------------------- // Purpose: The shared object that contains a specific MM rating //--------------------------------------------------------------------------------- -class CTFRatingData : public GCSDK::CProtoBufSharedObject< CSOTFRatingData, k_EProtoObjectTFRatingData, /* bPublicMutable */ false > +class CTFRatingData final : public GCSDK::CProtoBufSharedObject< CSOTFRatingData, k_EProtoObjectTFRatingData, /* bPublicMutable */ false > { public: CTFRatingData(); diff --git a/src/game/shared/tf/tf_wardata.h b/src/game/shared/tf/tf_wardata.h index bd78825d6..b49b684bc 100644 --- a/src/game/shared/tf/tf_wardata.h +++ b/src/game/shared/tf/tf_wardata.h @@ -24,7 +24,7 @@ //--------------------------------------------------------------------------------- // Purpose: The shared object that contains a user's stats for a war 
//--------------------------------------------------------------------------------- -class CWarData : public GCSDK::CProtoBufSharedObject< CSOWarData, k_EEConTypeWarData > +class CWarData final : public GCSDK::CProtoBufSharedObject< CSOWarData, k_EEConTypeWarData > { public: CWarData(); From 499a50bb929e195649e82fb1c0e53ab23ee445d9 Mon Sep 17 00:00:00 2001 From: mastercoms Date: Thu, 9 Mar 2023 22:33:03 -0500 Subject: [PATCH 19/42] perf: disable CWin32ReadOnlyFile to prevent duplicate stat calls * when a file doesn't exist in a search path, then CWin32ReadOnlyFile can't open that file. and then we try yet another open in CStdioFile * technically, we could refactor the open code to be able to communicate an error code, but I didn't see much point to using Win32 specific files * I think there are some cases where not using CWin32ReadOnlyFile can break some demo playback on Windows, so I might either fix that or do the above --- src/filesystem/filesystem_stdio.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/filesystem/filesystem_stdio.cpp b/src/filesystem/filesystem_stdio.cpp index 43916fe33..95a8535ff 100644 --- a/src/filesystem/filesystem_stdio.cpp +++ b/src/filesystem/filesystem_stdio.cpp @@ -240,8 +240,8 @@ ConVar filesystem_unbuffered_io( "filesystem_unbuffered_io", "1", 0, "" ); #define UseUnbufferedIO() true #endif -ConVar filesystem_native( "filesystem_native", "1", 0, "Use native FS or STDIO" ); -ConVar filesystem_max_stdio_read( "filesystem_max_stdio_read", IsX360() ? "64" : "16", 0, "" ); +ConVar filesystem_native( "filesystem_native", "0", 0, "Use native FS or STDIO" ); +ConVar filesystem_max_stdio_read( "filesystem_max_stdio_read", "64", 0, "" ); ConVar filesystem_report_buffered_io( "filesystem_report_buffered_io", "0" ); //----------------------------------------------------------------------------- @@ -417,7 +417,7 @@ FILE *CFileSystem_Stdio::FS_fopen( const char *filenameT, const char *options, u CBaseFileSystem::FixUpPath ( filenameT, filename, sizeof( filename ) ); -#ifdef _WIN32 +#if defined(_WIN32) && 0 if ( CWin32ReadOnlyFile::CanOpen( filename, options ) ) { pFile = CWin32ReadOnlyFile::FS_fopen( filename, options, size ); From 60f231597be343637d21458d5432ddedac65fb00 Mon Sep 17 00:00:00 2001 From: mastercoms Date: Fri, 10 Mar 2023 03:59:52 -0500 Subject: [PATCH 20/42] perf: remove unused r_norefresh functionality --- src/engine/gl_rmain.cpp | 6 ------ src/engine/matsys_interface.cpp | 1 - 2 files changed, 7 deletions(-) diff --git a/src/engine/gl_rmain.cpp b/src/engine/gl_rmain.cpp index 63707e4d5..36a80f6f9 100644 --- a/src/engine/gl_rmain.cpp +++ b/src/engine/gl_rmain.cpp @@ -326,7 +326,6 @@ class CRender : public IRender float m_yFOV; // timing - double m_frameStartTime; float m_framerate; float m_zNear; @@ -377,11 +376,6 @@ void CRender::FrameBegin( void ) r_framecount++; R_AnimateLight (); R_PushDlights(); - - if (!r_norefresh.GetInt()) - { - m_frameStartTime = Sys_FloatTime (); - } } UpdateStudioRenderConfig(); diff --git a/src/engine/matsys_interface.cpp b/src/engine/matsys_interface.cpp index 540bd4497..0b7bfd085 100644 --- a/src/engine/matsys_interface.cpp +++ b/src/engine/matsys_interface.cpp @@ -100,7 +100,6 @@ static CTextureReference g_ResolvedFullFrameDepth; void WorldStaticMeshCreate( void ); void WorldStaticMeshDestroy( void ); -ConVar r_norefresh( "r_norefresh","0"); ConVar r_decals( "r_decals", "2048" ); ConVar mp_decals( "mp_decals","200", FCVAR_ARCHIVE); ConVar r_lightmap( "r_lightmap", "-1", FCVAR_CHEAT | 
FCVAR_MATERIAL_SYSTEM_THREAD ); From 706f5f5945fbf468a6320439964d5988373a612d Mon Sep 17 00:00:00 2001 From: mastercoms Date: Fri, 10 Mar 2023 22:19:30 -0500 Subject: [PATCH 21/42] perf: fps_max adjustments backport fps_max 49 limit from CSGO add a new frame limiter method which sleeps until a certain accuracy threshold then, it will tightly loop while waiting for our yield to reach the end time this ensures that we always arrive at the exact time that the FPS limit expects before, this was sensitive to 2 issues, one being that the pause instruction has different cycle counts on different CPUs and the other being that the frame limit was not being handled efficiently within the busy wait period --- src/engine/sys_engine.cpp | 30 +++++++++++++----------------- src/public/tier0/threadtools.h | 1 + src/tier0/threadtools.cpp | 11 +++++++++++ 3 files changed, 25 insertions(+), 17 deletions(-) diff --git a/src/engine/sys_engine.cpp b/src/engine/sys_engine.cpp index c75abcffb..d80ced99d 100644 --- a/src/engine/sys_engine.cpp +++ b/src/engine/sys_engine.cpp @@ -244,13 +244,13 @@ bool CEngine::FilterTime( float dt ) // Dedicated's tic_rate regulates server frame rate. Don't apply fps filter here. // Only do this restriction on the client. Prevents clients from accomplishing certain // hacks by pausing their client for a period of time. - if ( IsPC() && !sv.IsDedicated() && !CanCheat() && fps_max.GetFloat() < 30 ) + if ( IsPC() && !sv.IsDedicated() && !CanCheat() && fps_max.GetFloat() < 49 ) { // Don't do anything if fps_max=0 (which means it's unlimited). if ( fps_max.GetFloat() != 0.0f ) { - Warning( "sv_cheats is 0 and fps_max is being limited to a minimum of 30 (or set to 0).\n" ); - fps_max.SetValue( 30.0f ); + Warning( "sv_cheats is 0 and fps_max is being limited to a minimum of 49 (or set to 0).\n" ); + fps_max.SetValue( 49.0f ); } } @@ -343,7 +343,9 @@ void CEngine::Frame( void ) { // ThreadSleep may be imprecise. On non-dedicated servers, we busy-sleep // for the last one or two milliseconds to ensure very tight timing. - float fBusyWaitMS = IsWindows() ? 2.25f : 1.5f; + float fBusyWaitMS = IsWindows() ? 2.0f : 1.5f; + float fWaitTime = m_flMinFrameTime - m_flFrameTime; + float fWaitEnd = m_flCurrentTime + fWaitTime; if ( sv.IsDedicated() ) { fBusyWaitMS = host_timer_spin_ms.GetFloat(); @@ -354,23 +356,17 @@ void CEngine::Frame( void ) // to avoid wasting power and to let other threads/processes run. // Calculate how long we need to wait. int nSleepMS = (int)( ( m_flMinFrameTime - m_flFrameTime ) * 1000 - fBusyWaitMS ); - if ( nSleepMS > 0 ) + if ( nSleepMS > fBusyWaitMS ) { ThreadSleep( nSleepMS ); } - else + + while ( Plat_FloatTime() < fWaitEnd ) { - // On x86, busy-wait using PAUSE instruction which encourages - // power savings by idling for ~10 cycles (also yielding to - // the other logical hyperthread core if the CPU supports it) - for (int i = 2000; i >= 0; --i) - { -#if defined(POSIX) - __asm( "pause" ); __asm( "pause" ); __asm( "pause" ); __asm( "pause" ); -#elif defined(IS_WINDOWS_PC) - _asm { pause }; _asm { pause }; _asm { pause }; _asm { pause }; -#endif - } + ThreadPause(); + // Yield the CPU to other threads so we don't spin too tightly + // ThreadSleep(0) is not tight enough. + ThreadYield(); } // Go back to the top of the loop and see if it is time yet. 
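to make the sleep-then-yield-spin idea concrete outside the engine, here is a minimal standalone sketch using std::chrono/std::thread instead of the engine's Plat_FloatTime, ThreadSleep and ThreadYield wrappers (a sketch under those assumptions, not the engine code):

    #include <chrono>
    #include <thread>

    // sketch only: sleep for most of the frame budget, then yield-spin to land on the exact end time
    void LimitFrame( double flFrameSeconds, double flBusyWaitSeconds /* ~0.002 on Windows */ )
    {
        using namespace std::chrono;
        const auto tEnd = steady_clock::now() + duration<double>( flFrameSeconds );

        // coarse sleep first, leaving flBusyWaitSeconds of headroom because OS sleeps are imprecise
        const auto tCoarse = tEnd - duration<double>( flBusyWaitSeconds );
        if ( steady_clock::now() < tCoarse )
            std::this_thread::sleep_until( tCoarse );

        // finish by yielding in a tight loop: tighter than sleeping again, friendlier than a pure PAUSE spin
        while ( steady_clock::now() < tEnd )
            std::this_thread::yield();
    }

the headroom value plays the same role as fBusyWaitMS above: everything before it is handed to the OS scheduler, everything after it is spent re-checking the clock so the frame ends exactly when the FPS limit expects.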
diff --git a/src/public/tier0/threadtools.h b/src/public/tier0/threadtools.h index b0b9b1d27..f1adca1b2 100644 --- a/src/public/tier0/threadtools.h +++ b/src/public/tier0/threadtools.h @@ -118,6 +118,7 @@ PLATFORM_INTERFACE bool ReleaseThreadHandle( ThreadHandle_t ); //----------------------------------------------------------------------------- PLATFORM_INTERFACE void ThreadSleep(unsigned duration = 0); +PLATFORM_INTERFACE void ThreadYield(); PLATFORM_INTERFACE uint ThreadGetCurrentId(); PLATFORM_INTERFACE ThreadHandle_t ThreadGetCurrentHandle(); PLATFORM_INTERFACE int ThreadGetPriority( ThreadHandle_t hThread = NULL ); diff --git a/src/tier0/threadtools.cpp b/src/tier0/threadtools.cpp index 001e703a8..617d3cf8d 100644 --- a/src/tier0/threadtools.cpp +++ b/src/tier0/threadtools.cpp @@ -214,6 +214,17 @@ void ThreadSleep(unsigned nMilliseconds) //----------------------------------------------------------------------------- +void ThreadYield() +{ +#ifdef _WIN32 + SwitchToThread(); +#elif defined(POSIX) + sched_yield(); +#endif +} + +//----------------------------------------------------------------------------- + #ifndef ThreadGetCurrentId uint ThreadGetCurrentId() { From f193968474d0f0ade9dbcd1cb22e7fa8eec564c5 Mon Sep 17 00:00:00 2001 From: mastercoms Date: Fri, 10 Mar 2023 22:26:00 -0500 Subject: [PATCH 22/42] perf: DX9Ex frame latency improvements prioritize GPU thread and use DX9 driver to force frame sync if available --- .../shaderapidx9/shaderdevicedx8.cpp | 26 ++++++++++++++++++- .../shaderapidx9/shaderdevicedx8.h | 7 +++++ 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/src/materialsystem/shaderapidx9/shaderdevicedx8.cpp b/src/materialsystem/shaderapidx9/shaderdevicedx8.cpp index 210c643bc..2789158b5 100644 --- a/src/materialsystem/shaderapidx9/shaderdevicedx8.cpp +++ b/src/materialsystem/shaderapidx9/shaderdevicedx8.cpp @@ -2401,6 +2401,15 @@ bool CShaderDeviceDx8::CreateD3DDevice( void* pHWnd, int nAdapter, const ShaderD g_pHardwareConfig->SetupHardwareCaps( info, g_ShaderDeviceMgrDx8.GetHardwareCaps( nAdapter ) ); +#if defined(IS_WINDOWS_PC) && defined(SHADERAPIDX9) + if ( g_ShaderDeviceUsingD3D9Ex ) + { + Dx9ExDevice()->SetMaximumFrameLatency(2); + static ConVarRef mat_forcehardwaresync("mat_forcehardwaresync"); + mat_forcehardwaresync.SetValue(0); + } +#endif + // FIXME: Bake this into hardware config // What texture formats do we support? 
if ( D3DSupportsCompressedTextures() ) @@ -3371,20 +3380,35 @@ void CShaderDeviceDx8::Present() // if we're in queued mode, don't present if the device is already lost bool bValidPresent = true; bool bInMainThread = ThreadInMainThread(); - if ( !bInMainThread ) + static bool s_bSetPriority = true; + if ( bInMainThread ) + { + s_bSetPriority = true; + } + else { // don't present if the device is in an invalid state and in queued mode if ( m_DeviceState != DEVICE_STATE_OK ) { + s_bSetPriority = true; bValidPresent = false; } // check for lost device early in threaded mode CheckDeviceLost( m_bOtherAppInitializing ); if ( m_DeviceState != DEVICE_STATE_OK ) { + s_bSetPriority = true; bValidPresent = false; } } +#if defined(IS_WINDOWS_PC) && defined(SHADERAPIDX9) + if ( bValidPresent && s_bSetPriority && g_ShaderDeviceUsingD3D9Ex ) + { + s_bSetPriority = false; + Dx9ExDevice()->SetGPUThreadPriority(7); + Dx9ExDevice()->SetMaximumFrameLatency(2); + } +#endif // Copy the back buffer into the non-interactive temp buffer if ( m_NonInteractiveRefresh.m_Mode == MATERIAL_NON_INTERACTIVE_MODE_LEVEL_LOAD ) { diff --git a/src/materialsystem/shaderapidx9/shaderdevicedx8.h b/src/materialsystem/shaderapidx9/shaderdevicedx8.h index 4e20efea3..3dbbdc949 100644 --- a/src/materialsystem/shaderapidx9/shaderdevicedx8.h +++ b/src/materialsystem/shaderapidx9/shaderdevicedx8.h @@ -361,6 +361,13 @@ FORCEINLINE IDirect3DDevice9 *Dx9Device() return g_pD3DDevice; } +#if defined(IS_WINDOWS_PC) && defined(SHADERAPIDX9) +FORCEINLINE IDirect3DDevice9Ex* Dx9ExDevice() +{ + return static_cast( g_pD3DDevice ); +} +#endif + extern CShaderDeviceDx8* g_pShaderDeviceDx8; From f95dbe44233a0f4380e240582391ec451f7e5340 Mon Sep 17 00:00:00 2001 From: mastercoms Date: Mon, 6 Mar 2023 12:05:17 -0500 Subject: [PATCH 23/42] fix: MvM bomb carrier voice line playing during normal CTF with bots ref: https://github.com/ValveSoftware/Source-1-Games/issues/715 --- .../behavior/scenario/capture_the_flag/tf_bot_deliver_flag.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/game/server/tf/bot/behavior/scenario/capture_the_flag/tf_bot_deliver_flag.cpp b/src/game/server/tf/bot/behavior/scenario/capture_the_flag/tf_bot_deliver_flag.cpp index 873c36ed3..166dbc305 100644 --- a/src/game/server/tf/bot/behavior/scenario/capture_the_flag/tf_bot_deliver_flag.cpp +++ b/src/game/server/tf/bot/behavior/scenario/capture_the_flag/tf_bot_deliver_flag.cpp @@ -243,7 +243,7 @@ ActionResult< CTFBot > CTFBotDeliverFlag::Update( CTFBot *me, float interval ) m_flTotalTravelDistance = NavAreaTravelDistance( me->GetLastKnownArea(), TheNavMesh->GetNavArea( zone->WorldSpaceCenter() ), cost ); - if ( flOldTravelDistance != -1.0f && m_flTotalTravelDistance - flOldTravelDistance > 2000.0f ) + if ( TFGameRules()->IsMannVsMachineMode() && flOldTravelDistance != -1.0f && m_flTotalTravelDistance - flOldTravelDistance > 2000.0f ) { TFGameRules()->BroadcastSound( 255, "Announcer.MVM_Bomb_Reset" ); From c3a06853cb32be89ffcbd330f25830cdc108c1b2 Mon Sep 17 00:00:00 2001 From: mastercoms Date: Mon, 25 Jul 2022 14:00:26 -0400 Subject: [PATCH 24/42] fix: tips being changed multiple times during map load when stats are loaded, tips get updated, cycling to another tip this causes an unwanted tip cycle where the first tip could be shown for a very short time while the player is reading it and moving to a new random one seemingly arbitrarily this keeps the same tip on the loading screen, as intended --- src/game/client/tf/vgui/tf_statsummary.cpp | 11 +++++++---- 
src/game/client/tf/vgui/tf_statsummary.h | 2 +- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/game/client/tf/vgui/tf_statsummary.cpp b/src/game/client/tf/vgui/tf_statsummary.cpp index 8e968291f..df18e9a0a 100644 --- a/src/game/client/tf/vgui/tf_statsummary.cpp +++ b/src/game/client/tf/vgui/tf_statsummary.cpp @@ -483,7 +483,7 @@ void CTFStatsSummaryPanel::SetStats( CUtlVector &vecClassStats ) m_aClassStats = vecClassStats; if ( m_bControlsLoaded ) { - UpdateDialog(); + UpdateDialog(false); } } @@ -846,7 +846,7 @@ void CTFStatsSummaryPanel::UpdateLeaderboard() //----------------------------------------------------------------------------- // Purpose: Updates the dialog //----------------------------------------------------------------------------- -void CTFStatsSummaryPanel::UpdateDialog() +void CTFStatsSummaryPanel::UpdateDialog(bool bUpdateTip) { UpdateMainBackground(); @@ -921,8 +921,11 @@ void CTFStatsSummaryPanel::UpdateDialog() UpdateBarCharts(); // fill out class details UpdateClassDetails(); - // update the tip - UpdateTip(); + if (bUpdateTip) + { + // update the tip + UpdateTip(); + } // show or hide controls depending on if we're interactive or not UpdateControls(); } diff --git a/src/game/client/tf/vgui/tf_statsummary.h b/src/game/client/tf/vgui/tf_statsummary.h index eca7c723f..ffbc205c1 100644 --- a/src/game/client/tf/vgui/tf_statsummary.h +++ b/src/game/client/tf/vgui/tf_statsummary.h @@ -64,7 +64,7 @@ class CTFStatsSummaryPanel : public vgui::EditablePanel, public CGameEventListen void Reset(); void SetDefaultSelections(); - void UpdateDialog(); + void UpdateDialog(bool bUpdateTip = true); void UpdateBarCharts(); void UpdateClassDetails( bool bIsMVM = false ); void UpdateTip(); From 79f025aebe85e6a684defa7d283eb0669e73d3a8 Mon Sep 17 00:00:00 2001 From: mastercoms Date: Fri, 10 Mar 2023 22:09:04 -0500 Subject: [PATCH 25/42] fix: update networking setting defaults * bump up updaterate and cmdrate to be equal and more closely aligned to the tick interval. this is a slight increase so it should not adversely affect players in terms of bandwidth and processing time. the fact that the rates are equal means that we aren't giving an imbalance of data for the player to process and repredict on. * align cl_interp to 15ms tick interval. we aren't reducing it at the moment because too many systems rely on interpolation to smooth out some evaluation. if these are fixed we can probably drop it down to the safe value of 0.03. I also limited the value to 0.2 instead of 0.5 since players were exploiting this to desync/backtrack. * set sv_maxunlag to 0.5. since cl_interp's max value was reduced, we can set the unlag window to be shorter. CSGO sets this to 0.2 but that is much too aggressive, considering interp. 0.7 would be a value to decrease by for just the interp limit, but I have decreased by a further 0.2s ping to reduce the unlag window for laggy players and cheaters who are using this mechanic to backtrack. * raise bandwidth from 80000 to 131072. this is the maximum value that seems to work well for most users across a variety of connections and routers, possibly due to some buffers or misconfigurations. 196608 was tested over from CSGO but this had issues for some connections. this increase of rate ensures there are no delays with larger net transfers that usually occur in certain situations, such as when a player is teleported, spawned or switches spectator targets. 
* enforce a minrate of the lowest rate selectable in CSGO * enforce a max rate of the highest rate selectable in CSGO anything higher seems to cause problems with networking * net_maxpacketdrop is set to 0. I'm not clued in why this was added in the first place, but effectively what this does is force you to drop even more packets whenever you drop packets from lossy internet. --- src/engine/baseserver.cpp | 2 +- src/engine/cl_bounded_cvars.cpp | 4 ++-- src/engine/net.h | 4 ++-- src/engine/net_chan.cpp | 2 +- src/engine/sv_client.cpp | 2 +- src/game/client/cdll_bounded_cvars.cpp | 4 ++-- src/game/server/player.cpp | 4 ++-- src/game/server/player_lagcompensation.cpp | 2 +- 8 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/engine/baseserver.cpp b/src/engine/baseserver.cpp index 17145a187..7822b10c5 100644 --- a/src/engine/baseserver.cpp +++ b/src/engine/baseserver.cpp @@ -2086,7 +2086,7 @@ CBaseClient *CBaseServer::CreateFakeClient( const char *name ) fakeclient->SetUserCVar( "rate", "30000" ); fakeclient->SetUserCVar( "cl_updaterate", "20" ); fakeclient->SetUserCVar( "cl_interp_ratio", "1.0" ); - fakeclient->SetUserCVar( "cl_interp", "0.1" ); + fakeclient->SetUserCVar( "cl_interp", "0.105" ); fakeclient->SetUserCVar( "cl_interpolate", "0" ); fakeclient->SetUserCVar( "cl_predict", "1" ); fakeclient->SetUserCVar( "cl_predictweapons", "1" ); diff --git a/src/engine/cl_bounded_cvars.cpp b/src/engine/cl_bounded_cvars.cpp index 4ce0ddd74..e11401fc6 100644 --- a/src/engine/cl_bounded_cvars.cpp +++ b/src/engine/cl_bounded_cvars.cpp @@ -72,7 +72,7 @@ class CBoundedCvar_CmdRate : public ConVar_ServerBounded CBoundedCvar_CmdRate() : ConVar_ServerBounded( "cl_cmdrate", - "30", + "33", FCVAR_ARCHIVE | FCVAR_USERINFO, "Max number of command packets sent to server per second", true, MIN_CMD_RATE, true, MAX_CMD_RATE ) { @@ -119,7 +119,7 @@ class CBoundedCvar_UpdateRate : public ConVar_ServerBounded CBoundedCvar_UpdateRate() : ConVar_ServerBounded( "cl_updaterate", - "20", + "33", FCVAR_ARCHIVE | FCVAR_USERINFO | FCVAR_NOT_CONNECTED, "Number of packets per second of updates you are requesting from the server" ) { diff --git a/src/engine/net.h b/src/engine/net.h index 3c8762a11..1abd23493 100644 --- a/src/engine/net.h +++ b/src/engine/net.h @@ -19,9 +19,9 @@ #include "proto_version.h" // Flow control bytes per second limits -#define MAX_RATE (1024*1024) +#define MAX_RATE 786432 #define MIN_RATE 1000 -#define DEFAULT_RATE 80000 +#define DEFAULT_RATE 131072 #define SIGNON_TIME_OUT 300.0f // signon disconnect timeout diff --git a/src/engine/net_chan.cpp b/src/engine/net_chan.cpp index 7b033739f..782d1c4db 100644 --- a/src/engine/net_chan.cpp +++ b/src/engine/net_chan.cpp @@ -44,7 +44,7 @@ static ConVar net_maxfilesize( "net_maxfilesize", "16", 0, "Maximum allowed file static ConVar net_compresspackets( "net_compresspackets", "1", 0, "Use compression on game packets." ); static ConVar net_compresspackets_minsize( "net_compresspackets_minsize", "1024", 0, "Don't bother compressing packets below this size." ); static ConVar net_maxcleartime( "net_maxcleartime", "4.0", 0, "Max # of seconds we can wait for next packets to be sent based on rate setting (0 == no limit)." 
); -static ConVar net_maxpacketdrop( "net_maxpacketdrop", "5000", 0, "Ignore any packets with the sequence number more than this ahead (0 == no limit)" ); +static ConVar net_maxpacketdrop( "net_maxpacketdrop", "0", 0, "Ignore any packets with the sequence number more than this ahead (0 == no limit)" ); extern ConVar net_maxroutable; diff --git a/src/engine/sv_client.cpp b/src/engine/sv_client.cpp index 6279a2672..3186e652a 100644 --- a/src/engine/sv_client.cpp +++ b/src/engine/sv_client.cpp @@ -39,7 +39,7 @@ extern CNetworkStringTableContainer *networkStringTableContainerServer; static ConVar sv_timeout( "sv_timeout", "65", 0, "After this many seconds without a message from a client, the client is dropped" ); static ConVar sv_maxrate( "sv_maxrate", "0", FCVAR_REPLICATED, "Max bandwidth rate allowed on server, 0 == unlimited" ); -static ConVar sv_minrate( "sv_minrate", "3500", FCVAR_REPLICATED, "Min bandwidth rate allowed on server, 0 == unlimited" ); +static ConVar sv_minrate( "sv_minrate", "16000", FCVAR_REPLICATED, "Min bandwidth rate allowed on server, 0 == unlimited" ); ConVar sv_maxupdaterate( "sv_maxupdaterate", "66", FCVAR_REPLICATED, "Maximum updates per second that the server will allow" ); ConVar sv_minupdaterate( "sv_minupdaterate", "10", FCVAR_REPLICATED, "Minimum updates per second that the server will allow" ); diff --git a/src/game/client/cdll_bounded_cvars.cpp b/src/game/client/cdll_bounded_cvars.cpp index d1114a381..83b2e9d4e 100644 --- a/src/game/client/cdll_bounded_cvars.cpp +++ b/src/game/client/cdll_bounded_cvars.cpp @@ -99,9 +99,9 @@ class CBoundedCvar_Interp : public ConVar_ServerBounded public: CBoundedCvar_Interp() : ConVar_ServerBounded( "cl_interp", - "0.1", + "0.105", FCVAR_USERINFO | FCVAR_NOT_CONNECTED | FCVAR_ARCHIVE, - "Sets the interpolation amount (bounded on low side by server interp ratio settings).", true, 0.0f, true, 0.5f ) + "Sets the interpolation amount (bounded on low side by server interp ratio settings).", true, 0.0f, true, 0.2f ) { } diff --git a/src/game/server/player.cpp b/src/game/server/player.cpp index 7dea1f507..3d09586ae 100644 --- a/src/game/server/player.cpp +++ b/src/game/server/player.cpp @@ -593,8 +593,8 @@ CBasePlayer::CBasePlayer( ) m_hZoomOwner = NULL; - m_nUpdateRate = 20; // cl_updaterate defualt - m_fLerpTime = 0.1f; // cl_interp default + m_nUpdateRate = 33; // cl_updaterate defualt + m_fLerpTime = 0.105f; // cl_interp default m_bPredictWeapons = true; m_bLagCompensation = false; m_flLaggedMovementValue = 1.0f; diff --git a/src/game/server/player_lagcompensation.cpp b/src/game/server/player_lagcompensation.cpp index 37f322e59..a9190253a 100644 --- a/src/game/server/player_lagcompensation.cpp +++ b/src/game/server/player_lagcompensation.cpp @@ -31,7 +31,7 @@ static ConVar sv_lagcompensation_teleport_dist( "sv_lagcompensation_teleport_dis #define LAG_COMPENSATION_ERROR_EPS_SQR ( 4.0f * 4.0f ) ConVar sv_unlag( "sv_unlag", "1", FCVAR_DEVELOPMENTONLY, "Enables player lag compensation" ); -ConVar sv_maxunlag( "sv_maxunlag", "1.0", FCVAR_DEVELOPMENTONLY, "Maximum lag compensation in seconds", true, 0.0f, true, 1.0f ); +ConVar sv_maxunlag( "sv_maxunlag", "0.5", FCVAR_DEVELOPMENTONLY, "Maximum lag compensation in seconds", true, 0.0f, true, 1.0f ); ConVar sv_lagflushbonecache( "sv_lagflushbonecache", "1", FCVAR_DEVELOPMENTONLY, "Flushes entity bone cache on lag compensation" ); ConVar sv_showlagcompensation( "sv_showlagcompensation", "0", FCVAR_CHEAT, "Show lag compensated hitboxes whenever a player is lag compensated." 
); From 4d29cb2dcb887cd7e269bd2165aa13ef39fa44b5 Mon Sep 17 00:00:00 2001 From: mastercoms Date: Sun, 5 Mar 2023 11:58:13 -0500 Subject: [PATCH 26/42] gameplay: fix class limit bypass from selecting class to spawn after death this adds a check to ForceRespawn to make sure that the player indeed can choose this class the bypass happens because while you are still dead, you do not count against the class limit if you select a new class to spawn as so, two or more players can select the same class while dead, putting the team over the limit Ref: https://github.com/ValveSoftware/Source-1-Games/issues/2084 --- src/game/server/tf/tf_player.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/game/server/tf/tf_player.cpp b/src/game/server/tf/tf_player.cpp index 332d50259..f9a36e488 100644 --- a/src/game/server/tf/tf_player.cpp +++ b/src/game/server/tf/tf_player.cpp @@ -13893,6 +13893,14 @@ void CTFPlayer::ForceRespawn( void ) DropFlag(); } + // Prevent bypassing class limits. Whoever wins on the draw can spawn as this class, + // and anyone who comes after will get swapped back to their old class. + if (!TFGameRules()->CanPlayerChooseClass(this, iDesiredClass)) + { + iDesiredClass = GetPlayerClass()->GetClassIndex(); + ClientPrint( this, HUD_PRINTCENTER, "#TF_ClassLimitReached" ); // NOTE: Add localization string + } + if ( GetPlayerClass()->GetClassIndex() != iDesiredClass ) { // clean up any pipebombs/buildings in the world (no explosions) From cc96cc9833b65a9c481d85b5791c466439f0ba69 Mon Sep 17 00:00:00 2001 From: mastercoms Date: Mon, 25 Jul 2022 13:55:06 -0400 Subject: [PATCH 27/42] gameplay: fix switch from bonuses not considering deploy time bonuses switch from bonus is only applied if the weapon we are switching from has fully deployed. however, the calculation for "fully deployed" was incorrect. it used the default switch time of 0.5, instead of also taking into account weapon attributes which affect deploy time. this affected the degreaser most notably, which would not allow pyros to switch to their flare gun with increased switch speed as soon as the degreaser deployed (0.33s). 
instead, players would have to wait until 0.5s had passed before switching which is unintuitive ref: https://github.com/ValveSoftware/Source-1-Games/issues/3488 --- src/game/shared/tf/tf_weaponbase.cpp | 33 +++++++++++++++++----------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/src/game/shared/tf/tf_weaponbase.cpp b/src/game/shared/tf/tf_weaponbase.cpp index 7ebd9b20a..93f97622b 100644 --- a/src/game/shared/tf/tf_weaponbase.cpp +++ b/src/game/shared/tf/tf_weaponbase.cpp @@ -1056,18 +1056,7 @@ bool CTFWeaponBase::Deploy( void ) CALL_ATTRIB_HOOK_FLOAT_ON_OTHER( pPlayer, flDeployTimeMultiplier, mult_deploy_time ); CALL_ATTRIB_HOOK_FLOAT( flDeployTimeMultiplier, mult_single_wep_deploy_time ); - // don't apply mult_switch_from_wep_deploy_time attribute if the last weapon hasn't been deployed for more than 0.67 second to match to weapon script switch time - // unless the player latched to a hook target, then allow switching right away - CTFWeaponBase *pLastWeapon = dynamic_cast< CTFWeaponBase* >( pPlayer->GetLastWeapon() ); - if ( pPlayer->GetGrapplingHookTarget() != NULL || ( pLastWeapon && gpGlobals->curtime - pLastWeapon->m_flLastDeployTime > flWeaponSwitchTime ) ) - { - CALL_ATTRIB_HOOK_FLOAT_ON_OTHER( pLastWeapon, flDeployTimeMultiplier, mult_switch_from_wep_deploy_time ); - } - - if ( pPlayer->m_Shared.InCond( TF_COND_BLASTJUMPING ) ) - { - CALL_ATTRIB_HOOK_FLOAT( flDeployTimeMultiplier, mult_rocketjump_deploy_time ); - } + CTFWeaponBase* pLastWeapon = static_cast(pPlayer->GetLastWeapon()); int iIsSword = 0; CALL_ATTRIB_HOOK_INT_ON_OTHER( pLastWeapon, iIsSword, is_a_sword ); @@ -1078,6 +1067,11 @@ bool CTFWeaponBase::Deploy( void ) flDeployTimeMultiplier *= 1.75f; } + if ( pPlayer->m_Shared.InCond( TF_COND_BLASTJUMPING ) ) + { + CALL_ATTRIB_HOOK_FLOAT( flDeployTimeMultiplier, mult_rocketjump_deploy_time ); + } + #ifdef STAGING_ONLY if ( pPlayer->m_Shared.InCond( TF_COND_TRANQ_SPY_BOOST ) ) { @@ -1096,6 +1090,18 @@ bool CTFWeaponBase::Deploy( void ) { CALL_ATTRIB_HOOK_FLOAT_ON_OTHER( pPlayer, flDeployTimeMultiplier, mod_medic_healed_deploy_time ); } + + // Don't consider mult_switch_from_wep_deploy_time for base deploy time + // This avoid feedback loops and gets switch from bonuses in line with the default switch time + // This calculation replicates the flDeployTime calculation below + float flBaseDeployTime = flWeaponSwitchTime * MAX( flDeployTimeMultiplier, 0.00001f ); + + // don't apply mult_switch_from_wep_deploy_time attribute if the last weapon hasn't past its base deploy time + // unless the player latched to a hook target, then allow switching right away + if ( pPlayer->GetGrapplingHookTarget() != NULL || ( pLastWeapon && gpGlobals->curtime >= pLastWeapon->m_flLastDeployTime ) ) + { + CALL_ATTRIB_HOOK_FLOAT_ON_OTHER( pLastWeapon, flDeployTimeMultiplier, mult_switch_from_wep_deploy_time ); + } flDeployTimeMultiplier = MAX( flDeployTimeMultiplier, 0.00001f ); float flDeployTime = flWeaponSwitchTime * flDeployTimeMultiplier; @@ -1116,7 +1122,8 @@ bool CTFWeaponBase::Deploy( void ) pPlayer->SetNextAttack( m_flNextPrimaryAttack ); - m_flLastDeployTime = gpGlobals->curtime; + // Last deploy time now refers to the time we actually fully deployed, not the time we switched to the weapon + m_flLastDeployTime = gpGlobals->curtime + flBaseDeployTime; #ifdef GAME_DLL // Reset our deploy-lifetime kill counter. 
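to put rough numbers on the new gate, a tiny standalone sketch; the 0.5s default switch time and the 0.33s degreaser deploy time are the figures quoted in this message, and the multiplier is simply derived from them for illustration:

    #include <cstdio>

    // standalone sketch, not engine code: figures taken from the commit message above,
    // the multiplier is inferred from them purely for illustration
    int main()
    {
        const float flWeaponSwitchTime     = 0.5f;          // default deploy window
        const float flDeployTimeMultiplier = 0.33f / 0.5f;  // ~0.66, inferred for illustration only

        const float flBaseDeployTime = flWeaponSwitchTime * flDeployTimeMultiplier; // ~0.33s

        // old gate: the switch-from bonus only unlocked a fixed flWeaponSwitchTime after deploying
        // new gate: it unlocks once the weapon's own attribute-scaled deploy time has elapsed
        printf( "old gate: %.2fs after deploy, new gate: %.2fs after deploy\n",
                flWeaponSwitchTime, flBaseDeployTime );
        return 0;
    }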
From 3403817511d2da3b72b50562446b54e42a470b57 Mon Sep 17 00:00:00 2001
From: mastercoms
Date: Mon, 25 Jul 2022 14:14:23 -0400
Subject: [PATCH 28/42] gameplay: movement speed modifiers were incorrect

stunned movement was not applying correctly to diagonal movement. because diagonal movement is a combination of two axes, the combined input speed exceeds max speed until clamped by CheckParameters. stunned movement slows were applying before this clamp, so they did not properly slow the actual speed of the player, which could be much lower than the unclamped speed.

in addition, high max speed boosts (like the Baby Face's Blaster) did not apply unless the input speed was default speed. since stunned movement manipulates input speed directly instead of changing player velocity, this broke the input speed boost, and thus scout's max speed would be capped higher but not have the input speed to drive it.

with these changes, a 0.6 slow (as applied by the Natascha at close range) slows down a full charge BFB Scout from 520 to 208 in all directions (40% of 520), as intended and proper! And of course, normal Scout speed is slowed from 400 to 160 in all directions (40% of 400).

however, with this fix, the slow amount is drastically more effective than it was before. so, we apply legacy handling to reduce the slow amount by a percentage value relative to the old effective scaling. a scaling value was chosen to get a near average between the effective slow applied with a 100% slow, and the full slow amount applied with the fix.

note that there is still technically an issue with strafe tapping boosting your movement speed, but this is a game movement mechanic which I don't think is related to stuns; stuns seem to be intended to apply to movement controls, not exactly correlating to output velocity, or else they would not be a different movement mechanic vs. percentage movement speed slows.
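for illustration, the ordering problem above boils down to where the slow is applied relative to the max speed clamp; this simplified standalone sketch (made-up variables, not the engine's movement code) reproduces the 400 -> 160 figure quoted in this message:

    #include <algorithm>
    #include <cstdio>

    // standalone sketch with simplified stand-in variables, not the engine's CTFGameMovement code
    int main()
    {
        const float flMaxSpeed   = 400.0f;   // Scout ground speed
        const float flInputSpeed = 565.7f;   // ~400 * sqrt(2): diagonal input before clamping
        const float flStun       = 0.6f;     // Natascha close-range slow quoted above

        // old order: slow the raw input, then clamp -> the clamp swallows part of the slow
        const float flOld = std::min( flInputSpeed * ( 1.0f - flStun ), flMaxSpeed ); // ~226 u/s in this simplified model
        // new order: clamp to the real max speed first, then slow -> the full slow always lands
        const float flNew = std::min( flInputSpeed, flMaxSpeed ) * ( 1.0f - flStun ); // 160 u/s, matching the numbers above

        printf( "old: %.0f u/s, new: %.0f u/s\n", flOld, flNew );
        return 0;
    }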
ref: https://github.com/ValveSoftware/Source-1-Games/issues/3721 --- src/game/server/tf/tf_player.cpp | 11 +++ src/game/shared/gamemovement.cpp | 3 + src/game/shared/tf/tf_gamemovement.cpp | 92 ++++++++++++++++++++++---- 3 files changed, 93 insertions(+), 13 deletions(-) diff --git a/src/game/server/tf/tf_player.cpp b/src/game/server/tf/tf_player.cpp index f9a36e488..0c4b83ec4 100644 --- a/src/game/server/tf/tf_player.cpp +++ b/src/game/server/tf/tf_player.cpp @@ -21623,6 +21623,17 @@ bool CTFPlayer::CanBreatheUnderwater() const return false; } +//----------------------------------------------------------------------------- +// Purpose: Debug concommand to stun the player +//----------------------------------------------------------------------------- +void StunPlayer() +{ + CTFPlayer* pPlayer = ToTFPlayer(ToTFPlayer(UTIL_PlayerByIndex(1))); + float flStunAmount = 0.60f; + pPlayer->m_Shared.StunPlayer(10.0f, flStunAmount, TF_STUN_MOVEMENT, pPlayer); +} +static ConCommand cc_StunPlayer("tf_stun_player", StunPlayer, "Stuns you.", FCVAR_CHEAT); + //----------------------------------------------------------------------------- // Purpose: //----------------------------------------------------------------------------- diff --git a/src/game/shared/gamemovement.cpp b/src/game/shared/gamemovement.cpp index 0efcfa80c..ab6080dff 100644 --- a/src/game/shared/gamemovement.cpp +++ b/src/game/shared/gamemovement.cpp @@ -4550,7 +4550,10 @@ void CGameMovement::PlayerMove( void ) { VPROF( "CGameMovement::PlayerMove" ); + // TF runs this with speed modifiers +#if !defined(TF_DLL) && !defined(TF_CLIENT_DLL) CheckParameters(); +#endif // clear output applied velocity mv->m_outWishVel.Init(); diff --git a/src/game/shared/tf/tf_gamemovement.cpp b/src/game/shared/tf/tf_gamemovement.cpp index 2e0f9c86c..d3d984f2e 100644 --- a/src/game/shared/tf/tf_gamemovement.cpp +++ b/src/game/shared/tf/tf_gamemovement.cpp @@ -39,7 +39,7 @@ ConVar tf_duck_debug_spew( "tf_duck_debug_spew", "0", FCVAR_REPLICATED | FCVAR_DEVELOPMENTONLY ); -ConVar tf_showspeed( "tf_showspeed", "0", FCVAR_REPLICATED | FCVAR_DEVELOPMENTONLY ); +ConVar tf_showspeed( "tf_showspeed", "0", FCVAR_REPLICATED | FCVAR_CHEAT, "1 = show speed during collisions, 2 = always show speed" ); ConVar tf_avoidteammates( "tf_avoidteammates", "1", FCVAR_REPLICATED | FCVAR_CHEAT | FCVAR_DEVELOPMENTONLY, "Controls how teammates interact when colliding.\n 0: Teammates block each other\n 1: Teammates pass through each other, but push each other away (default)" ); ConVar tf_avoidteammates_pushaway( "tf_avoidteammates_pushaway", "1", FCVAR_REPLICATED, "Whether or not teammates push each other away when occupying the same space" ); ConVar tf_solidobjects( "tf_solidobjects", "1", FCVAR_REPLICATED | FCVAR_CHEAT | FCVAR_DEVELOPMENTONLY ); @@ -317,6 +317,12 @@ void CTFGameMovement::ProcessMovement( CBasePlayer *pBasePlayer, CMoveData *pMov // Handle charging demomens ChargeMove(); + // Handle scouts that can move really fast with buffs + HighMaxSpeedMove(); + + // Limit diagonal movement + CheckParameters(); + // Handle player stun. StunMove(); @@ -326,9 +332,6 @@ void CTFGameMovement::ProcessMovement( CBasePlayer *pBasePlayer, CMoveData *pMov // Handle grappling hook move GrapplingHookMove(); - // Handle scouts that can move really fast with buffs - HighMaxSpeedMove(); - // Run the command. 
PlayerMove(); @@ -434,17 +437,19 @@ bool CTFGameMovement::GrapplingHookMove() if ( tf_grapplinghook_use_acceleration.GetBool() ) { // Use acceleration with dampening - float flSpeed = mv->m_vecVelocity.Length(); + float flSpeed = mv->m_vecVelocity.LengthSqr(); if ( flSpeed > 0.f ) { + flSpeed = FastSqrt( flSpeed ); float flDampen = Min( tf_grapplinghook_dampening.GetFloat() * gpGlobals->frametime, flSpeed ); mv->m_vecVelocity *= ( flSpeed - flDampen ) / flSpeed; } mv->m_vecVelocity += vDesiredMove.Normalized() * ( tf_grapplinghook_acceleration.GetFloat() * gpGlobals->frametime ); - flSpeed = mv->m_vecVelocity.Length(); + flSpeed = mv->m_vecVelocity.LengthSqr(); if ( flSpeed > mv->m_flMaxSpeed ) { + flSpeed = FastSqrt( flSpeed ); mv->m_vecVelocity *= mv->m_flMaxSpeed / flSpeed; } } @@ -530,6 +535,13 @@ bool CTFGameMovement::ChargeMove() return true; } +#ifdef STAGING_ONLY +static ConVar tf_movement_stun_multiplier("tf_movement_stun_multiplier", "1", FCVAR_REPLICATED, "Multiplier for movement speed when stunned."); +static ConVar tf_movement_stun_clip("tf_movement_stun_clip", "0.41421356237", FCVAR_REPLICATED, "Clip off stun amount."); +#endif +static ConVar tf_movement_stun_legacy_threshold("tf_movement_stun_legacy_threshold", "1.5", FCVAR_REPLICATED, "Relative point for legacy stun amount handling."); +static ConVar tf_movement_stun_legacy_on_charge("tf_movement_stun_legacy_on_charge", "1", FCVAR_REPLICATED, "Always apply full stun to charging players."); + //----------------------------------------------------------------------------- // Purpose: //----------------------------------------------------------------------------- @@ -565,6 +577,42 @@ bool CTFGameMovement::StunMove() // Handle movement stuns float flStunAmount = m_pTFPlayer->m_Shared.GetAmountStunned( TF_STUN_MOVEMENT ); + if ( flStunAmount ) + { + // Handle legacy clipping value. Before the fix to stunned movement, stuns were applied to diagonal movement of sqrt(2) magnitude. + // This means effectively that the stun would have to reduce past the ~141.4% movement speed to have any effect. + // + // So, a stun amount would at minimum need to be greater than ~0.414 to reduce movement speed below 100%. + // Since this effectively meant stuns were clipped, there was non-linear scaling of stun amount to actual movement speed reduction. + // + // A stun value of 0.414 (or below) would have 0% effective stun, 0.6 would have ~31% effective stun, 1.0 would have ~59% effective stun. + // + // This legacy handling has been added so that we get similar slow amounts for stuns that were previously applied, but + // also have them be linear and consistent from 0 to 1. + if ( tf_movement_stun_legacy_on_charge.GetBool() && m_pTFPlayer->m_Shared.InCond( TF_COND_SHIELD_CHARGE ) ) + { + // Slow down charging players the full amount. Charging players never had diagonal movement, so + // they always got the full slow amount, which would end their charge. Being able to end + // their charge is incredibly important, so we don't want to change that. + flStunAmount = flStunAmount; + } + else if ( flStunAmount > tf_movement_stun_legacy_threshold.GetFloat() ) + { + // For any stun amount greater than the threshold, we use the legacy clip behavior. + flStunAmount = max( flStunAmount - 0.41421356237f, 0.0f ); // Reduce by sqrt(2) - 1.0f (see above) + } + else + { +#ifdef STAGING_ONLY + // For playing around with the scaling. 
+ flStunAmount = max( flStunAmount - tf_movement_stun_clip.GetFloat(), 0.0f ) * tf_movement_stun_multiplier.GetFloat(); +#else + // This equation essentially calculates the percentage of the stun amount that was effectively applied to diagonal movement + // at a certain stun amount and applies that to all stun amounts consistently now. + flStunAmount *= ( ( -0.41421356237f / tf_movement_stun_legacy_threshold.GetFloat() ) + 1 ); +#endif + } + } // Lerp to the desired amount if ( flStunAmount ) { @@ -1089,10 +1137,11 @@ void CTFGameMovement::PreventBunnyJumping() return; // Current player speed - float spd = mv->m_vecVelocity.Length(); - if ( spd <= maxscaledspeed ) + float spd = mv->m_vecVelocity.LengthSqr(); + if ( spd <= maxscaledspeed * maxscaledspeed ) return; + spd = FastSqrt(spd); // Apply this cropping fraction to velocity float fraction = ( maxscaledspeed / spd ); @@ -1871,7 +1920,8 @@ void CTFGameMovement::WalkMove( void ) { // Made it to the destination (remove the base velocity). mv->SetAbsOrigin( trace.endpos ); - VectorSubtract( mv->m_vecVelocity, player->GetBaseVelocity(), mv->m_vecVelocity ); + Vector baseVelocity = player->GetBaseVelocity(); + VectorSubtract( mv->m_vecVelocity, baseVelocity, mv->m_vecVelocity ); // Save the wish velocity. mv->m_outWishVel += ( vecWishDirection * flWishSpeed ); @@ -1880,6 +1930,22 @@ void CTFGameMovement::WalkMove( void ) // NOTE YWB 7/5/07: Don't do this here, our version of CategorizePosition encompasses this test // StayOnGround(); +#if 1 + // Debugging!!! + Vector vecTestVelocity = mv->m_vecVelocity; + vecTestVelocity.z = 0.0f; + float flTestSpeed = VectorLength( vecTestVelocity ); + if ( tf_showspeed.GetInt() == 2 && baseVelocity.IsZero() && ( flTestSpeed > ( mv->m_flMaxSpeed + 1.0f ) ) ) + { + Msg( "Step Max Speed < %f\n", flTestSpeed ); + } + + if ( tf_showspeed.GetInt() == 2 ) + { + Msg( "Speed = %f\n", flTestSpeed ); + } +#endif + #ifdef CLIENT_DLL // Track how far we moved (if we're a Scout or an Engineer carrying a building). CTFPlayer* pTFPlayer = ToTFPlayer( player ); @@ -1918,19 +1984,19 @@ void CTFGameMovement::WalkMove( void ) // NOTE YWB 7/5/07: Don't do this here, our version of CategorizePosition encompasses this test // StayOnGround(); -#if 0 +#if 1 // Debugging!!! 
Vector vecTestVelocity = mv->m_vecVelocity; vecTestVelocity.z = 0.0f; float flTestSpeed = VectorLength( vecTestVelocity ); - if ( baseVelocity.IsZero() && ( flTestSpeed > ( mv->m_flMaxSpeed + 1.0f ) ) ) + if ( tf_showspeed.GetInt() == 1 && baseVelocity.IsZero() && ( flTestSpeed > ( mv->m_flMaxSpeed + 1.0f ) ) ) { Msg( "Step Max Speed < %f\n", flTestSpeed ); } - if ( tf_showspeed.GetBool() ) + if ( tf_showspeed.GetInt() == 1 ) { - Msg( "Speed=%f\n", flTestSpeed ); + Msg( "Speed = %f\n", flTestSpeed ); } #endif From efe9744c6fdcc9bbe34b9ebdf428f71ecd0b46b8 Mon Sep 17 00:00:00 2001 From: mastercoms Date: Mon, 13 Mar 2023 11:26:53 -0400 Subject: [PATCH 29/42] perf(load): enable async MDL loading, speeds up map loads and reduces stutters --- src/datacache/mdlcache.cpp | 8 +------- src/engine/modelloader.cpp | 2 +- src/public/datacache/imdlcache.h | 6 ++++++ 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/datacache/mdlcache.cpp b/src/datacache/mdlcache.cpp index 8f1ff3650..859c636a8 100644 --- a/src/datacache/mdlcache.cpp +++ b/src/datacache/mdlcache.cpp @@ -78,12 +78,6 @@ namespace { #define MdlCacheMsg if ( !LogMdlCache() ) ; else Msg #define MdlCacheWarning if ( !LogMdlCache() ) ; else Warning -#if defined( _X360 ) -#define AsyncMdlCache() 0 // Explicitly OFF for 360 (incompatible) -#else -#define AsyncMdlCache() 0 -#endif - #define ERROR_MODEL "models/error.mdl" #define IDSTUDIOHEADER (('T'<<24)+('S'<<16)+('D'<<8)+'I') @@ -184,7 +178,7 @@ class CTempAllocHelper // ConVars //----------------------------------------------------------------------------- static ConVar r_rootlod( "r_rootlod", "0", FCVAR_ARCHIVE ); -static ConVar mod_forcedata( "mod_forcedata", ( AsyncMdlCache() ) ? "0" : "1", 0, "Forces all model file data into cache on model load." ); +static ConVar mod_forcedata( "mod_forcedata", ( AsyncMdlCache() && IsX360() ) ? "0" : "1", 0, "Forces all model file data into cache on model load." 
); static ConVar mod_test_not_available( "mod_test_not_available", "0", FCVAR_CHEAT ); static ConVar mod_test_mesh_not_available( "mod_test_mesh_not_available", "0", FCVAR_CHEAT ); static ConVar mod_test_verts_not_available( "mod_test_verts_not_available", "0", FCVAR_CHEAT ); diff --git a/src/engine/modelloader.cpp b/src/engine/modelloader.cpp index 72a479d49..4835fea06 100644 --- a/src/engine/modelloader.cpp +++ b/src/engine/modelloader.cpp @@ -5022,7 +5022,7 @@ void CModelLoader::Studio_LoadModel( model_t *pModel, bool bTouchAllData ) if ( bLoadPhysics && !bPreLoaded ) { // load the collision data now - bool bSynchronous = bTouchAllData; + bool bSynchronous = bTouchAllData && !AsyncMdlCache(); double t1 = Plat_FloatTime(); g_pMDLCache->GetVCollideEx( pModel->studio, bSynchronous ); diff --git a/src/public/datacache/imdlcache.h b/src/public/datacache/imdlcache.h index 0f7093abe..6d9d31ec9 100644 --- a/src/public/datacache/imdlcache.h +++ b/src/public/datacache/imdlcache.h @@ -21,6 +21,12 @@ #include "appframework/IAppSystem.h" +#if defined( _X360 ) +#define AsyncMdlCache() 0 // Explicitly OFF for 360 (incompatible) +#else +#define AsyncMdlCache() 1 +#endif + //----------------------------------------------------------------------------- // Forward declarations //----------------------------------------------------------------------------- From bba6420c7c0d9319e4ce490150931afc7094cd30 Mon Sep 17 00:00:00 2001 From: mastercoms Date: Mon, 13 Mar 2023 11:28:52 -0400 Subject: [PATCH 30/42] perf: disable affinity this was done in DOTA, let the OS handle where threads should go, we can't really determine what cores will be ideal given that hardware is so varied. --- src/vstdlib/jobthread.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/vstdlib/jobthread.cpp b/src/vstdlib/jobthread.cpp index 46d843e04..e7a7c0a54 100644 --- a/src/vstdlib/jobthread.cpp +++ b/src/vstdlib/jobthread.cpp @@ -1023,6 +1023,7 @@ bool CThreadPool::Start( const ThreadPoolStartParams_t &startParams, const char void CThreadPool::Distribute( bool bDistribute, int *pAffinityTable ) { +#ifdef _X360 if ( bDistribute ) { const CPUInformation &ci = *GetCPUInformation(); @@ -1104,6 +1105,7 @@ void CThreadPool::Distribute( bool bDistribute, int *pAffinityTable ) } #endif } +#endif } //--------------------------------------------------------- From 481f5eb6e98c33f8e6e188d83e95e24d7600e358 Mon Sep 17 00:00:00 2001 From: mastercoms Date: Mon, 13 Mar 2023 11:40:30 -0400 Subject: [PATCH 31/42] perf: backport GetPointContents_WorldOnly from CSGO we only query water in these cases, and we don't have a situation where querying the world only would be inaccurate this fixes an issue where bones would be set up in prediction due to a spatial query update which caused attached cosmetics to re-evaluate their origin according to their attachment because prediction changes the time, this would flip flop bone setup multiple times per frame --- src/engine/enginetrace.cpp | 11 ++++++++++- src/game/client/c_baseplayer.cpp | 2 +- src/game/shared/gamemovement.cpp | 4 ++-- src/game/shared/physics_main_shared.cpp | 15 ++++++++++++--- src/public/engine/IEngineTrace.h | 3 +++ 5 files changed, 28 insertions(+), 7 deletions(-) diff --git a/src/engine/enginetrace.cpp b/src/engine/enginetrace.cpp index 54ac243a1..6af21f01b 100644 --- a/src/engine/enginetrace.cpp +++ b/src/engine/enginetrace.cpp @@ -73,7 +73,7 @@ abstract_class CEngineTrace : public IEngineTrace CEngineTrace() { m_pRootMoveParent = NULL; } // Returns the contents mask at a particular 
world-space position virtual int GetPointContents( const Vector &vecAbsPosition, IHandleEntity** ppEntity ); - + virtual int GetPointContents_WorldOnly( const Vector &vecAbsPosition ); virtual int GetPointContents_Collideable( ICollideable *pCollide, const Vector &vecAbsPosition ); // Traces a ray against a particular edict @@ -375,6 +375,15 @@ class CPointContentsEnum : public IPartitionEnumerator Vector m_Pos; }; +//----------------------------------------------------------------------------- +// Returns the world contents +//----------------------------------------------------------------------------- +int CEngineTrace::GetPointContents_WorldOnly( const Vector &vecAbsPosition ) +{ + int nContents = CM_PointContents( vecAbsPosition, 0 ); + + return nContents; +} //----------------------------------------------------------------------------- // Returns the contents mask at a particular world-space position diff --git a/src/game/client/c_baseplayer.cpp b/src/game/client/c_baseplayer.cpp index 1aadef5c2..6d216eee8 100644 --- a/src/game/client/c_baseplayer.cpp +++ b/src/game/client/c_baseplayer.cpp @@ -499,7 +499,7 @@ bool C_BasePlayer::AudioStateIsUnderwater( Vector vecMainViewOrigin ) if ( IsObserver() ) { // Just check the view position - int cont = enginetrace->GetPointContents ( vecMainViewOrigin ); + int cont = enginetrace->GetPointContents_WorldOnly ( vecMainViewOrigin ); return (cont & MASK_WATER); } diff --git a/src/game/shared/gamemovement.cpp b/src/game/shared/gamemovement.cpp index ab6080dff..f8f88b403 100644 --- a/src/game/shared/gamemovement.cpp +++ b/src/game/shared/gamemovement.cpp @@ -3490,7 +3490,7 @@ int CGameMovement::GetPointContentsCached( const Vector &point, int slot ) if ( m_CachedGetPointContents[ idx ][ slot ] == -9999 || point.DistToSqr( m_CachedGetPointContentsPoint[ idx ][ slot ] ) > 1 ) { - m_CachedGetPointContents[ idx ][ slot ] = enginetrace->GetPointContents ( point ); + m_CachedGetPointContents[ idx ][ slot ] = enginetrace->GetPointContents_WorldOnly ( point ); m_CachedGetPointContentsPoint[ idx ][ slot ] = point; } @@ -3498,7 +3498,7 @@ int CGameMovement::GetPointContentsCached( const Vector &point, int slot ) } else { - return enginetrace->GetPointContents ( point ); + return enginetrace->GetPointContents_WorldOnly ( point ); } } diff --git a/src/game/shared/physics_main_shared.cpp b/src/game/shared/physics_main_shared.cpp index c3719d77b..bf826be4b 100644 --- a/src/game/shared/physics_main_shared.cpp +++ b/src/game/shared/physics_main_shared.cpp @@ -1130,6 +1130,15 @@ unsigned int CBaseEntity::PhysicsSolidMaskForEntity( void ) const return MASK_SOLID; } +static inline int GetWaterContents( const Vector &point ) +{ +#ifdef HL2_DLL + return UTIL_PointContents(point); +#else + // left 4 dead doesn't support moveable water brushes, only world water + return enginetrace->GetPointContents_WorldOnly(point); +#endif +} //----------------------------------------------------------------------------- // Computes the water level + type @@ -1146,7 +1155,7 @@ void CBaseEntity::UpdateWaterState() SetWaterLevel( 0 ); SetWaterType( CONTENTS_EMPTY ); - int cont = UTIL_PointContents (point); + int cont = GetWaterContents (point); if (( cont & MASK_WATER ) == 0) return; @@ -1164,14 +1173,14 @@ void CBaseEntity::UpdateWaterState() // Check the exact center of the box point[2] = WorldSpaceCenter().z; - int midcont = UTIL_PointContents (point); + int midcont = GetWaterContents (point); if ( midcont & MASK_WATER ) { // Now check where the eyes are... 
SetWaterLevel( 2 ); point[2] = EyePosition().z; - int eyecont = UTIL_PointContents (point); + int eyecont = GetWaterContents (point); if ( eyecont & MASK_WATER ) { SetWaterLevel( 3 ); diff --git a/src/public/engine/IEngineTrace.h b/src/public/engine/IEngineTrace.h index 6e4977190..b16ef36a1 100644 --- a/src/public/engine/IEngineTrace.h +++ b/src/public/engine/IEngineTrace.h @@ -130,6 +130,9 @@ abstract_class IEngineTrace public: // Returns the contents mask + entity at a particular world-space position virtual int GetPointContents( const Vector &vecAbsPosition, IHandleEntity** ppEntity = NULL ) = 0; + + // Returns the contents mask of the world only @ the world-space position (static props are ignored) + virtual int GetPointContents_WorldOnly( const Vector &vecAbsPosition ) = 0; // Get the point contents, but only test the specific entity. This works // on static props and brush models. From 92f8de03b807a42875d0fa37b5a901c8ab0fb604 Mon Sep 17 00:00:00 2001 From: mastercoms Date: Mon, 13 Mar 2023 11:42:13 -0400 Subject: [PATCH 32/42] perf: backport IdealPitch optimization from CSGO removes ideal pitch, could not see any relevant code path being hit this is relevant because it causes a setup bones in spatial query from the trace during prediction, which similarly to in the water contents optimization, would cause a lot of setup bones --- src/game/client/prediction.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/game/client/prediction.cpp b/src/game/client/prediction.cpp index f81e79d59..d47f5718f 100644 --- a/src/game/client/prediction.cpp +++ b/src/game/client/prediction.cpp @@ -1708,8 +1708,10 @@ void CPrediction::_Update( bool received_new_world_update, bool validframe, Assert( C_BaseEntity::IsAbsQueriesValid() ); // FIXME: What about hierarchy here?!? +#if 0 // Where is this ever used? SetIdealPitch( localPlayer, localPlayer->GetLocalOrigin(), localPlayer->GetLocalAngles(), localPlayer->m_vecViewOffset ); #endif +#endif } From e0a134ece7f1dbeebcc3c791d2fbf1dc44dcf89b Mon Sep 17 00:00:00 2001 From: mastercoms Date: Mon, 13 Mar 2023 11:43:38 -0400 Subject: [PATCH 33/42] perf: don't force material system config update on exec this causes a duplicate material system reload on init and also slows down config execution. 
material system config updates are already checked per frame, I'm not exactly sure why this is here but it may cause regressions that I am not aware about --- src/engine/cmd.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/engine/cmd.cpp b/src/engine/cmd.cpp index 20beda09a..830fcfd87 100644 --- a/src/engine/cmd.cpp +++ b/src/engine/cmd.cpp @@ -685,7 +685,7 @@ void Cmd_Exec_f( const CCommand &args ) } } // force any queued convar changes to flush before reading/writing them - UpdateMaterialSystemConfig(); + //UpdateMaterialSystemConfig(); } From 8d95a3347210669488f8c0d1a78c365e29e14894 Mon Sep 17 00:00:00 2001 From: mastercoms Date: Mon, 13 Mar 2023 11:58:28 -0400 Subject: [PATCH 34/42] perf: backport int networking optimization from CSGO --- src/engine/dt_encode.cpp | 40 ++++++++++++++++++---------------------- 1 file changed, 18 insertions(+), 22 deletions(-) diff --git a/src/engine/dt_encode.cpp b/src/engine/dt_encode.cpp index fdd0639ae..16ab72e04 100644 --- a/src/engine/dt_encode.cpp +++ b/src/engine/dt_encode.cpp @@ -221,31 +221,20 @@ void DecodeInfo::CopyVars( const DecodeInfo *pOther ) // ---------------------------------------------------------------------------------------- // void Int_Encode( const unsigned char *pStruct, DVariant *pVar, const SendProp *pProp, bf_write *pOut, int objectID ) -{ - int nValue = pVar->m_Int; - +{ if ( pProp->GetFlags() & SPROP_VARINT) { if ( pProp->GetFlags() & SPROP_UNSIGNED ) { - pOut->WriteVarInt32( nValue ); + pOut->WriteVarInt32( pVar->m_Int ); } else { - pOut->WriteSignedVarInt32( nValue ); + pOut->WriteSignedVarInt32( pVar->m_Int ); } } else { - // If signed, preserve lower bits and then re-extend sign if nValue < 0; - // if unsigned, preserve all 32 bits no matter what. Bonus: branchless. - int nPreserveBits = ( 0x7FFFFFFF >> ( 32 - pProp->m_nBits ) ); - nPreserveBits |= ( pProp->GetFlags() & SPROP_UNSIGNED ) ? 0xFFFFFFFF : 0; - int nSignExtension = ( nValue >> 31 ) & ~nPreserveBits; - - nValue &= nPreserveBits; - nValue |= nSignExtension; - #ifdef DBGFLAG_ASSERT // Assert that either the property is unsigned and in valid range, // or signed with a consistent sign extension in the high bits @@ -253,21 +242,28 @@ void Int_Encode( const unsigned char *pStruct, DVariant *pVar, const SendProp *p { if ( pProp->GetFlags() & SPROP_UNSIGNED ) { - AssertMsg3( nValue == pVar->m_Int, "Unsigned prop %s needs more bits? Expected %i == %i", pProp->GetName(), nValue, pVar->m_Int ); + int32 nMaskedValue = pVar->m_Int; + nMaskedValue &= (1u << pProp->m_nBits) - 1; + Assert(nMaskedValue == pVar->m_Int); } else { - AssertMsg3( nValue == pVar->m_Int, "Signed prop %s needs more bits? Expected %i == %i", pProp->GetName(), nValue, pVar->m_Int ); + int32 nSignExtendedValue = pVar->m_Int; + nSignExtendedValue <<= 32 - pProp->m_nBits; + nSignExtendedValue >>= 32 - pProp->m_nBits; + Assert(nSignExtendedValue == pVar->m_Int); } } +#endif + + if (pProp->IsSigned()) + { + pOut->WriteSBitLong(pVar->m_Int, pProp->m_nBits); + } else { - // This should never trigger, but I'm leaving it in for old-time's sake. 
- Assert( nValue == pVar->m_Int ); + pOut->WriteUBitLong((unsigned int)pVar->m_Int, pProp->m_nBits); } -#endif - - pOut->WriteUBitLong( nValue, pProp->m_nBits, false ); } } @@ -322,7 +318,7 @@ int Int_CompareDeltas( const SendProp *pProp, bf_read *p1, bf_read *p2 ) return p1->ReadSignedVarInt32() != p2->ReadSignedVarInt32(); } - return p1->CompareBits(p2, pProp->m_nBits); + return p1->ReadUBitLong( pProp->m_nBits ) != p2->ReadUBitLong( pProp->m_nBits ); } const char* Int_GetTypeNameString() From f1885810e3d8be71e450c625465e7e641fd2a8fa Mon Sep 17 00:00:00 2001 From: mastercoms Date: Mon, 13 Mar 2023 12:10:17 -0400 Subject: [PATCH 35/42] perf: shared object find optimizations when an econ item view goes looking for a econ item, it looks through the CSharedObjectTypeCache ideally, this would maintain a map instead of looking up with O(n) search, but that's a more complicated and scary change instead, skip the amount of virtual GetTypeID calls since SharedObject::BIsKeyEqual is only used in CSharedObjectTypeCache so the equal types are guaranteed. on GC, BIsKeyEqual is used in CSharedObjectTransaction as well, so we keep the old check for it. also move to a static_cast for CEconItem to be explicit/efficient with casting --- src/game/shared/econ/econ_item.cpp | 2 +- src/gcsdk/sharedobject.cpp | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/game/shared/econ/econ_item.cpp b/src/game/shared/econ/econ_item.cpp index a5ec004fa..389a5e832 100644 --- a/src/game/shared/econ/econ_item.cpp +++ b/src/game/shared/econ/econ_item.cpp @@ -1509,7 +1509,7 @@ bool CEconItem::BAddDestroyToMessage( std::string *pBuffer ) const bool CEconItem::BIsKeyLess( const CSharedObject & soRHS ) const { Assert( GetTypeID() == soRHS.GetTypeID() ); - const CEconItem & soSchemaRHS = (const CEconItem &)soRHS; + const CEconItem & soSchemaRHS = static_cast(soRHS); return m_ulID < soSchemaRHS.m_ulID; } diff --git a/src/gcsdk/sharedobject.cpp b/src/gcsdk/sharedobject.cpp index 5b9c4e235..3baf0aa0a 100644 --- a/src/gcsdk/sharedobject.cpp +++ b/src/gcsdk/sharedobject.cpp @@ -113,8 +113,13 @@ const char *CSharedObject::PchClassUpdateNodeName( int nTypeID ) bool CSharedObject::BIsKeyEqual( const CSharedObject & soRHS ) const { // Make sure they are the same type. +#ifdef GC if ( GetTypeID() != soRHS.GetTypeID() ) return false; +#else + // BIsKeyEqual is only used for objects of the same type within their CSharedObjectTypeCache. 
+ Assert ( GetTypeID() == soRHS.GetTypeID() ); +#endif return !BIsKeyLess( soRHS ) && !soRHS.BIsKeyLess( *this ); } From 0ac37364e2e4e0ddf89c5fa0fa5501ea9e1899d4 Mon Sep 17 00:00:00 2001 From: mastercoms Date: Mon, 13 Mar 2023 12:21:18 -0400 Subject: [PATCH 36/42] perf: skip costly per-frame RTTI for players and weapons --- src/game/shared/tf/tf_viewmodel.cpp | 74 ++++++++++++++++------------- 1 file changed, 41 insertions(+), 33 deletions(-) diff --git a/src/game/shared/tf/tf_viewmodel.cpp b/src/game/shared/tf/tf_viewmodel.cpp index c3d2231d6..bcd5dcbda 100644 --- a/src/game/shared/tf/tf_viewmodel.cpp +++ b/src/game/shared/tf/tf_viewmodel.cpp @@ -384,8 +384,8 @@ class CViewModelInvisProxy : public CBaseInvisMaterialProxy virtual void OnBind( C_BaseEntity *pC_BaseEntity ); }; -#define TF_VM_MIN_INVIS 0.22 -#define TF_VM_MAX_INVIS 0.5 +#define TF_VM_MIN_INVIS 0.22f +#define TF_VM_MAX_INVIS 0.5f //----------------------------------------------------------------------------- // Purpose: @@ -475,6 +475,9 @@ class CInvisProxy : public CBaseInvisMaterialProxy { public: virtual void OnBind( C_BaseEntity *pC_BaseEntity ) OVERRIDE; +private: + CTFPlayer *pPlayer = NULL; + C_BaseEntity *pCachedEntity = NULL; }; //----------------------------------------------------------------------------- @@ -487,59 +490,64 @@ void CInvisProxy::OnBind( C_BaseEntity *pC_BaseEntity ) C_BaseEntity *pEnt = pC_BaseEntity; - CTFPlayer *pPlayer = NULL; - - // Check if we have a move parent and if it's a player - C_BaseEntity *pMoveParent = pEnt->GetMoveParent(); - if ( pMoveParent && pMoveParent->IsPlayer() ) + if ( pEnt != pCachedEntity ) { - pPlayer = ToTFPlayer( pMoveParent ); + pPlayer = NULL; + pCachedEntity = pEnt; } - // If it's not a player then check for viewmodel. if ( !pPlayer ) { - CBaseEntity *pEntParent = pMoveParent ? pMoveParent : pEnt; - - CTFViewModel *pVM = dynamic_cast( pEntParent ); - if ( pVM ) + // Check if we have a move parent and if it's a player + C_BaseEntity *pMoveParent = pEnt->GetMoveParent(); + if ( pMoveParent && pMoveParent->IsPlayer() ) { - pPlayer = ToTFPlayer( pVM->GetOwner() ); + pPlayer = ToTFPlayer( pMoveParent ); } - } - - if ( !pPlayer ) - { - if ( pEnt->IsPlayer() ) + // If it's not a player then check for viewmodel. + if ( !pPlayer ) { - pPlayer = dynamic_cast( pEnt ); + CBaseEntity *pEntParent = pMoveParent ? pMoveParent : pEnt; + + CTFViewModel *pVM = dynamic_cast( pEntParent ); + if ( pVM ) + { + pPlayer = ToTFPlayer( pVM->GetOwner() ); + } } - else + + if ( !pPlayer ) { - IHasOwner *pOwnerInterface = dynamic_cast( pEnt ); - if ( pOwnerInterface ) + if ( pEnt->IsPlayer() ) + { + pPlayer = dynamic_cast( pEnt ); + } + else { - pPlayer = ToTFPlayer( pOwnerInterface->GetOwnerViaInterface() ); + IHasOwner *pOwnerInterface = dynamic_cast( pEnt ); + if ( pOwnerInterface ) + { + pPlayer = ToTFPlayer( pOwnerInterface->GetOwnerViaInterface() ); + } } } - } - - if ( !pPlayer ) - { - m_pPercentInvisible->SetFloatValue( 0.0f ); - return; + + if ( !pPlayer ) + { + m_pPercentInvisible->SetFloatValue( 0.0f ); + return; + } } // If we're the local player, use the old "vm_invis" code. Otherwise, use the "weapon_invis". if ( pPlayer->IsLocalPlayer() ) { float flPercentInvisible = pPlayer->GetPercentInvisible(); - float flWeaponInvis = flPercentInvisible; // remap from 0.22 to 0.5 // but drop to 0.0 if we're not invis at all - flWeaponInvis = ( flPercentInvisible < 0.01 ) ? - 0.0 : + float flWeaponInvis = ( flPercentInvisible < 0.01f ) ? 
+ 0.0f : RemapVal( flPercentInvisible, 0.0, 1.0, TF_VM_MIN_INVIS, TF_VM_MAX_INVIS ); // Exaggerated blink effect on bump. From 1f38e57f2fcfd738c87f8876ef118ab7f37dcbd1 Mon Sep 17 00:00:00 2001 From: mastercoms Date: Mon, 13 Mar 2023 12:25:41 -0400 Subject: [PATCH 37/42] perf: disable COM_TimestampedLog within threaded particles, there was a lot of traffic on a log mutex, and that was slowing things down significantly I'm not exactly sure why this was triggering, as the code seems to prevent this from going off unless there is a -profile or -etwprofile argument, will investigate later. another option is also removing the COM_TimestampedLog from just the particles function, but who knows what else this could be slowing down --- src/tier0/dbg.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/tier0/dbg.cpp b/src/tier0/dbg.cpp index ce01801f1..d10e4df04 100644 --- a/src/tier0/dbg.cpp +++ b/src/tier0/dbg.cpp @@ -870,6 +870,7 @@ void ValidateSpew( CValidator &validator ) //----------------------------------------------------------------------------- void COM_TimestampedLog( char const *fmt, ... ) { +#ifdef _DEBUG static float s_LastStamp = 0.0; static bool s_bShouldLog = false; static bool s_bShouldLogToETW = false; @@ -925,6 +926,7 @@ void COM_TimestampedLog( char const *fmt, ... ) } s_LastStamp = curStamp; +#endif } //----------------------------------------------------------------------------- From ed58eae7742c4156752338f364015bc6d1a84e25 Mon Sep 17 00:00:00 2001 From: mastercoms Date: Mon, 13 Mar 2023 12:26:44 -0400 Subject: [PATCH 38/42] perf: do not declare convars inline with runtime code a weird misusage of the convar constructor, should be done at init instead --- src/engine/l_studio.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/engine/l_studio.cpp b/src/engine/l_studio.cpp index ef96ab929..a3ab707c0 100644 --- a/src/engine/l_studio.cpp +++ b/src/engine/l_studio.cpp @@ -3179,6 +3179,11 @@ int CModelRender::DrawStaticPropArrayFast( StaticPropRenderInfo_t *pProps, int c #endif // SWDS } +#ifndef SWDS +static ConVar r_shadowlod("r_shadowlod", "-1"); +static ConVar r_shadowlodbias("r_shadowlodbias", "2"); +#endif + //----------------------------------------------------------------------------- // Shadow rendering //----------------------------------------------------------------------------- @@ -3186,8 +3191,7 @@ matrix3x4_t* CModelRender::DrawModelShadowSetup( IClientRenderable *pRenderable, { #ifndef SWDS DrawModelInfo_t &info = *pInfo; - static ConVar r_shadowlod("r_shadowlod", "-1"); - static ConVar r_shadowlodbias("r_shadowlodbias", "2"); + model_t const* pModel = pRenderable->GetModel(); if ( !pModel ) From 94fdfd445bde72ae5296df19b3720983ba4dfe43 Mon Sep 17 00:00:00 2001 From: mastercoms Date: Mon, 13 Mar 2023 12:39:16 -0400 Subject: [PATCH 39/42] perf: enable rate limiting water bullet impact effects --- src/game/shared/tf/tf_player_shared.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/game/shared/tf/tf_player_shared.cpp b/src/game/shared/tf/tf_player_shared.cpp index 788310cab..79c58ec21 100644 --- a/src/game/shared/tf/tf_player_shared.cpp +++ b/src/game/shared/tf/tf_player_shared.cpp @@ -10095,8 +10095,8 @@ void CTFPlayer::FireBullet( CTFWeaponBase *pWpn, const FireBulletsInfo_t &info, } #ifdef CLIENT_DLL -static ConVar tf_impactwatertimeenable( "tf_impactwatertimeenable", "0", FCVAR_CHEAT, "Draw impact debris effects." 
); -static ConVar tf_impactwatertime( "tf_impactwatertime", "1.0f", FCVAR_CHEAT, "Draw impact debris effects." ); +static ConVar tf_impactwatertimeenable( "tf_impactwatertimeenable", "1", 0, "Rate limit bullet impact effects on water." ); +static ConVar tf_impactwatertime( "tf_impactwatertime", "0.2f", 0, "The interval between bullet impact effects on water." ); #endif //----------------------------------------------------------------------------- From 75f42245c2e6647c019fd4c1a2f75a274abeba30 Mon Sep 17 00:00:00 2001 From: mastercoms Date: Sun, 19 Mar 2023 06:54:06 -0400 Subject: [PATCH 40/42] pending: experimental changes --- src/engine/cmodel.cpp | 2 +- src/engine/engine.vpc | 2 +- src/engine/enginetool.cpp | 2 +- src/engine/gl_rsurf.cpp | 35 ++++++++++++++- src/engine/host.cpp | 21 ++++++--- src/engine/host.h | 4 +- src/engine/l_studio.cpp | 20 ++++----- src/engine/modelloader.cpp | 8 +--- src/engine/net_ws.cpp | 10 ++++- src/engine/vgui_baseui_interface.cpp | 6 ++- src/game/client/c_vguiscreen.cpp | 2 + src/game/client/game_controls/MapOverview.cpp | 6 +-- .../client/game_controls/baseviewport.cpp | 4 +- src/game/client/hud_base_account.cpp | 6 +-- src/game/client/hud_controlpointicons.cpp | 2 +- src/game/client/menu.cpp | 12 ++--- src/game/client/perfvisualbenchmark.cpp | 2 +- .../replay/vgui/replayperformanceeditor.cpp | 2 +- src/game/client/tf/c_tf_player.cpp | 24 ++++++++-- src/game/client/tf/tf_hud_arena_vs_panel.cpp | 2 +- src/game/client/tf/tf_hud_escort.cpp | 6 +-- src/game/client/tf/tf_hud_flagstatus.cpp | 18 ++++---- src/game/client/tf/tf_hud_itemeffectmeter.cpp | 4 +- .../tf/tf_hud_mann_vs_machine_status.cpp | 2 +- src/game/client/tf/tf_hud_match_status.cpp | 2 +- src/game/client/tf/tf_hud_passtime.cpp | 4 +- src/game/client/tf/tf_hud_playerstatus.cpp | 4 +- src/game/client/tf/tf_hud_pve_winpanel.cpp | 2 +- .../tf/tf_hud_robot_destruction_status.cpp | 22 +++++----- src/game/client/tf/tf_hud_scope.cpp | 1 + src/game/client/tf/tf_hud_target_id.cpp | 6 +-- src/game/client/tf/tf_hud_tournament.cpp | 44 +++++++++---------- src/game/client/tf/tf_time_panel.cpp | 2 +- ..._matchmaking_dashboard_next_map_voting.cpp | 8 +++- .../client/tf/vgui/tf_playermodelpanel.cpp | 4 ++ src/game/client/tf/vgui/tf_training_ui.cpp | 16 +++---- src/game/client/viewrender.cpp | 2 +- src/game/shared/econ/econ_item_inventory.cpp | 12 ++--- src/game/shared/econ/econ_item_schema.cpp | 12 ++--- src/game/shared/econ/econ_item_system.cpp | 3 +- src/game/shared/econ/econ_item_view.cpp | 3 ++ src/game/shared/econ/econ_item_view.h | 36 +++++++++++++++ src/game/shared/teamplay_round_timer.cpp | 3 ++ .../shared/teamplayroundbased_gamerules.cpp | 2 +- src/game/shared/tf/tf_gamerules.h | 2 +- src/game/shared/tf/tf_viewmodel.cpp | 4 +- src/game/shared/tf/tf_weapon_sniperrifle.cpp | 14 +++--- src/gameui/GameUI_Interface.cpp | 4 ++ src/inputsystem/inputsystem.cpp | 2 +- src/mathlib/mathlib_base.cpp | 4 +- src/public/bone_setup.cpp | 4 +- src/public/collisionutils.cpp | 10 ++--- src/public/collisionutils.h | 25 ++++++++--- src/public/mathlib/ssemath.h | 3 +- src/public/tier1/strtools.h | 29 +++++++++++- .../vgui_controls/AnimationController.h | 6 +-- src/public/vgui_controls/EditablePanel.h | 8 ++-- src/public/vgui_controls/Label.h | 2 + src/public/vgui_controls/Panel.h | 2 + .../DirectXMath-dec2022/Inc/DirectXMath.h | 4 -- src/thirdparty/quickhull/quickhull.vpc | 2 +- src/tier1/utlbuffer.cpp | 2 +- .../vgui_controls/AnimationController.cpp | 22 ++++++---- src/vgui2/vgui_controls/EditablePanel.cpp | 24 ++++++++-- 
src/vgui2/vgui_controls/Label.cpp | 43 ++++++++++++++---- 65 files changed, 411 insertions(+), 195 deletions(-) diff --git a/src/engine/cmodel.cpp b/src/engine/cmodel.cpp index 466d4451d..2647fe30b 100644 --- a/src/engine/cmodel.cpp +++ b/src/engine/cmodel.cpp @@ -1570,7 +1570,7 @@ void FASTCALL CM_TraceToLeaf( TraceInfo_t * RESTRICT pTraceInfo, int ndxLeaf, fl pCounters = pTraceInfo->GetDispCounters(); count = pTraceInfo->GetCount(); - if (IsX360()) + if (IsX360() || 1) { // set up some relatively constant variables we'll use in the loop below fltx4 traceStart = LoadUnaligned3SIMD(pTraceInfo->m_start.Base()); diff --git a/src/engine/engine.vpc b/src/engine/engine.vpc index d0ebc3a55..ef661b04d 100644 --- a/src/engine/engine.vpc +++ b/src/engine/engine.vpc @@ -31,7 +31,7 @@ $Configuration $Compiler [$WIN32] { - $EnableEnhancedInstructionSet "Streaming SIMD Extensions (/arch:SSE)" + $EnableEnhancedInstructionSet "Streaming SIMD Extensions 2 (/arch:SSE2)" } $Linker diff --git a/src/engine/enginetool.cpp b/src/engine/enginetool.cpp index 460838723..6fee58081 100644 --- a/src/engine/enginetool.cpp +++ b/src/engine/enginetool.cpp @@ -444,7 +444,7 @@ void CEngineTool::SetGamePaused( bool paused ) float CEngineTool::GetTimescale() { - return host_timescale.GetFloat(); + return host_timescale.GetFloat() ? host_timescale.GetFloat() : 1.0f; } void CEngineTool::SetTimescale( float scale ) diff --git a/src/engine/gl_rsurf.cpp b/src/engine/gl_rsurf.cpp index 82ed85ac9..650de8c00 100644 --- a/src/engine/gl_rsurf.cpp +++ b/src/engine/gl_rsurf.cpp @@ -47,6 +47,7 @@ #include "materialsystem/imaterialvar.h" #include "coordsize.h" #include "mempool.h" +#include "mathlib/ssemath.h" #ifndef SWDS #include "Overlay.h" #endif @@ -4839,10 +4840,12 @@ static bool EnumerateLeafInBox_R(mnode_t *node, EnumLeafBoxInfo_t& info ) } } -#ifdef _X360 +#if defined(_X360) || USE_DXMATH +#ifdef _DEBUG static fltx4 AlignThatVector(const Vector &vc) { +#ifdef _X360 fltx4 out = __loadunalignedvector(vc.Base()); /* @@ -4853,7 +4856,12 @@ static fltx4 AlignThatVector(const Vector &vc) // squelch the w component return __vrlimi( out, __vzero(), 1, 0 ); +#elif USE_DXMATH + fltx4 out = LoadUnaligned3SIMD(vc.Base()); + return DirectX::XMVectorSetW(out, 0); +#endif } +#endif //----------------------------------------------------------------------------- // Finds all leaves of the BSP tree within a particular volume @@ -4864,9 +4872,11 @@ static bool EnumerateLeafInBox_R(mnode_t * RESTRICT node, const EnumLeafBoxInfo_ if (node->contents == CONTENTS_SOLID) return true; // solid +#ifdef _X360 // speculatively get the children into the cache __dcbt(0,node->children[0]); __dcbt(0,node->children[1]); +#endif // constructing these here prevents LHS if we spill. // it's not quite a quick enough operation to do extemporaneously. @@ -4937,6 +4947,7 @@ static bool EnumerateLeafInBox_R(mnode_t * RESTRICT node, const EnumLeafBoxInfo_ fltx4 vecBoxMax = LoadAlignedSIMD(pInfo->m_vecBoxMax); fltx4 cornermin, cornermax; // by now planeNormal is ready... +#ifdef _X360 fltx4 control = XMVectorGreaterOrEqual( planeNormal, __vzero() ); // now control[i] = planeNormal[i] > 0 ? 0xFF : 0x00 cornermin = XMVectorSelect( vecBoxMax, vecBoxMin, control); // cornermin[i] = control[i] ? 
vecBoxMin[i] : vecBoxMax[i] @@ -4945,6 +4956,7 @@ static bool EnumerateLeafInBox_R(mnode_t * RESTRICT node, const EnumLeafBoxInfo_ // compute dot products fltx4 dotCornerMax = __vmsum3fp(planeNormal, cornermax); // vsumfp ignores w component fltx4 dotCornerMin = __vmsum3fp(planeNormal, cornermin); + fltx4 vPlaneDist = ReplicateX4(plane->dist); UINT conditionRegister; XMVectorGreaterR(&conditionRegister,vPlaneDist,dotCornerMax); @@ -4954,6 +4966,25 @@ static bool EnumerateLeafInBox_R(mnode_t * RESTRICT node, const EnumLeafBoxInfo_ XMVectorGreaterOrEqualR(&conditionRegister,dotCornerMin,vPlaneDist); if ( XMComparisonAllTrue(conditionRegister) ) return EnumerateLeafInBox_R( node->children[0], pInfo ); +#elif USE_DXMATH + fltx4 control = DirectX::XMVectorGreaterOrEqual( planeNormal, LoadZeroSIMD() ); + // now control[i] = planeNormal[i] > 0 ? 0xFF : 0x00 + cornermin = DirectX::XMVectorSelect( vecBoxMax, vecBoxMin, control); // cornermin[i] = control[i] ? vecBoxMin[i] : vecBoxMax[i] + cornermax = DirectX::XMVectorSelect( vecBoxMin, vecBoxMax, control); + // compute dot products + fltx4 dotCornerMax = DirectX::XMVector3Dot(planeNormal, cornermax); // vsumfp ignores w component + fltx4 dotCornerMin = DirectX::XMVector3Dot(planeNormal, cornermin); + + fltx4 vPlaneDist = ReplicateX4(plane->dist); + uint conditionRegister; + DirectX::XMVectorGreaterR(&conditionRegister,vPlaneDist,dotCornerMax); + if (DirectX::XMComparisonAllTrue(conditionRegister)) // plane->normal . cornermax <= plane->dist + return EnumerateLeafInBox_R( node->children[1], pInfo ); + + DirectX::XMVectorGreaterOrEqualR(&conditionRegister,dotCornerMin,vPlaneDist); + if ( DirectX::XMComparisonAllTrue(conditionRegister) ) + return EnumerateLeafInBox_R( node->children[0], pInfo ); +#endif return EnumerateLeafInBox_R( node->children[0], pInfo ) && EnumerateLeafInBox_R( node->children[1], pInfo ); @@ -5326,7 +5357,7 @@ bool CEngineBSPTree::EnumerateLeavesInBox( const Vector& mins, const Vector& max info.m_nContext = context; info.m_vecBoxMax = maxs; info.m_vecBoxMin = mins; -#ifdef _X360 +#if defined(_X360) || USE_DXMATH if (opt_EnumerateLeavesFastAlgorithm.GetBool()) return EnumerateLeafInBox_R( host_state.worldbrush->nodes, &info ); else diff --git a/src/engine/host.cpp b/src/engine/host.cpp index 0ff16bb23..9f1693ba4 100644 --- a/src/engine/host.cpp +++ b/src/engine/host.cpp @@ -585,7 +585,7 @@ static ConVar host_profile( "host_profile","0" ); ConVar host_limitlocal( "host_limitlocal", "0", 0, "Apply cl_cmdrate and cl_updaterate to loopback connection" ); ConVar host_framerate( "host_framerate","0", 0, "Set to lock per-frame time elapse." ); -ConVar host_timescale( "host_timescale","1.0", FCVAR_REPLICATED, "Prescale the clock by this amount." ); +ConVar host_timescale( "host_timescale","0.0", FCVAR_REPLICATED, "Prescale the clock by this amount." ); ConVar host_speeds( "host_speeds","0", 0, "Show general system running times." 
); // set for running times ConVar host_flush_threshold( "host_flush_threshold", "20", 0, "Memory threshold below which the host should flush caches between server instances" ); @@ -1758,7 +1758,10 @@ void Host_ReadPreStartupConfiguration() { "sv_unlockedchapters", // needed to display the startup graphic while loading "snd_legacy_surround", // needed to init the sound system +#if defined( _X360 ) || defined( STAGING_ONLY ) "gameui_xbox", // needed to initialize the correct UI +#endif + "cl_hud_minmode", // needed to initialize the correct UI "save_in_memory" // needed to preread data from the correct location in UI }; @@ -1867,10 +1870,13 @@ void Host_AccumulateTime( float dt ) host_frametime = host_state.interval_per_tick; } + const bool bIsPlayingDemo = demoplayer->IsPlayingBack(); + const float flDemoTimescale = bIsPlayingDemo ? demoplayer->GetPlaybackTimeScale() : 1.0f; + #if 1 if ( host_framerate.GetFloat() > 0 #if !defined(SWDS) - && ( CanCheat() || demoplayer->IsPlayingBack() ) + && ( CanCheat() || bIsPlayingDemo ) #endif ) { @@ -1883,10 +1889,10 @@ void Host_AccumulateTime( float dt ) #if !defined(SWDS) && defined( REPLAY_ENABLED ) extern IDemoPlayer *g_pReplayDemoPlayer; - if ( demoplayer->IsPlayingBack() && demoplayer == g_pReplayDemoPlayer ) + if ( bIsPlayingDemo && demoplayer == g_pReplayDemoPlayer ) { // adjust time scale if playing back demo - host_frametime *= demoplayer->GetPlaybackTimeScale(); + host_frametime *= flDemoTimescale; } #endif @@ -1894,17 +1900,18 @@ void Host_AccumulateTime( float dt ) } else if (host_timescale.GetFloat() > 0 #if !defined(SWDS) - && ( CanCheat() || demoplayer->IsPlayingBack() ) + && ( CanCheat() || bIsPlayingDemo ) + || ( bIsPlayingDemo && flDemoTimescale != 1.0f ) #endif ) { float fullscale = host_timescale.GetFloat(); #if !defined(SWDS) - if ( demoplayer->IsPlayingBack() ) + if ( bIsPlayingDemo ) { // adjust time scale if playing back demo - fullscale *= demoplayer->GetPlaybackTimeScale(); + fullscale *= flDemoTimescale; } #endif diff --git a/src/engine/host.h b/src/engine/host.h index 473a6abd5..4059ea441 100644 --- a/src/engine/host.h +++ b/src/engine/host.h @@ -149,10 +149,10 @@ extern int host_currentframetick; // PERFORMANCE INFO #define MIN_FPS 0.1 // Host minimum fps value for maxfps. -#define MAX_FPS 1000.0 // Upper limit for maxfps. +#define MAX_FPS 10000.0 // Upper limit for maxfps. 
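// The FPS cap and the frame-time floor move together: the old pair was 1000 fps with a
// 0.001 s floor (1/1000), so raising MAX_FPS to 10000 lowers MIN_FRAMETIME below to
// 0.0001 s (1/10000).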
#define MAX_FRAMETIME 0.1 -#define MIN_FRAMETIME 0.001 +#define MIN_FRAMETIME 0.0001 #define TIME_TO_TICKS( dt ) ( (int)( 0.5f + (float)(dt) / host_state.interval_per_tick ) ) #define TICKS_TO_TIME( dt ) ( host_state.interval_per_tick * (float)(dt) ) diff --git a/src/engine/l_studio.cpp b/src/engine/l_studio.cpp index a3ab707c0..aaeeea989 100644 --- a/src/engine/l_studio.cpp +++ b/src/engine/l_studio.cpp @@ -2785,11 +2785,11 @@ struct rbatch_t // ---------------------------------------- */ -inline int FindModel( const CUtlVector &list, const model_t *pModel ) +inline int FindModel( const rmodel_t* pList, int listCount, const model_t* pModel ) { - for ( int j = list.Count(); --j >= 0 ; ) + for ( int j = listCount; --j >= 0 ; ) { - if ( list[j].pModel == pModel ) + if ( pList[j].pModel == pModel ) return j; } return -1; @@ -2806,13 +2806,13 @@ int CModelRender::DrawStaticPropArrayFast( StaticPropRenderInfo_t *pProps, int c #ifndef SWDS MDLCACHE_CRITICAL_SECTION_( g_pMDLCache ); CMatRenderContextPtr pRenderContext( materials ); - const int MAX_OBJECTS = 1024; + const int MAX_OBJECTS = 2048; CUtlSortVector objectList(0, MAX_OBJECTS); - CUtlVector modelList(0,256); - CUtlVector lightObjects(0,256); - CUtlVector shadowObjects(0,64); - CUtlVector decalObjects(0,64); - CUtlVector lightStates(0,256); + CUtlVectorFixedGrowable modelList; + CUtlVectorFixedGrowable lightObjects; + CUtlVectorFixedGrowable shadowObjects; + CUtlVectorFixedGrowable decalObjects; + CUtlVectorFixedGrowable lightStates; bool bForceCubemap = r_showenvcubemap.GetBool(); int drawnCount = 0; int forcedLodSetting = r_lod.GetInt(); @@ -2826,7 +2826,7 @@ int CModelRender::DrawStaticPropArrayFast( StaticPropRenderInfo_t *pProps, int c { drawnCount++; // UNDONE: This is a perf hit in some scenes! Use a hash? - int modelIndex = FindModel( modelList, pProps[i].pModel ); + int modelIndex = FindModel( modelList.Base(), modelList.Count(), pProps[i].pModel ); if ( modelIndex < 0 ) { modelIndex = modelList.AddToTail(); diff --git a/src/engine/modelloader.cpp b/src/engine/modelloader.cpp index 4835fea06..77c379286 100644 --- a/src/engine/modelloader.cpp +++ b/src/engine/modelloader.cpp @@ -62,13 +62,7 @@ ConVar mat_loadtextures( "mat_loadtextures", "1", FCVAR_CHEAT ); -// OS X and Linux are blowing up right now due to this. Benefits vs possible regressions on DX less clear. -#if defined( DX_TO_GL_ABSTRACTION ) || defined( STAGING_ONLY ) - #define CONVAR_DEFAULT_MOD_OFFLINE_HDR_SWITCH "1" -#else - #define CONVAR_DEFAULT_MOD_OFFLINE_HDR_SWITCH "0" -#endif -static ConVar mod_offline_hdr_switch( "mod_offline_hdr_switch", CONVAR_DEFAULT_MOD_OFFLINE_HDR_SWITCH, FCVAR_INTERNAL_USE, +static ConVar mod_offline_hdr_switch( "mod_offline_hdr_switch", "1", FCVAR_INTERNAL_USE, "Re-order the HDR/LDR mode switch to do most of the material system " "reloading with the device offline. 
This reduces unnecessary device " "resource uploads and may drastically reduce load time and memory pressure " diff --git a/src/engine/net_ws.cpp b/src/engine/net_ws.cpp index 3c49ab6cd..5ae945fc0 100644 --- a/src/engine/net_ws.cpp +++ b/src/engine/net_ws.cpp @@ -2960,7 +2960,15 @@ void NET_SetTime( double flRealtime ) } // adjust network time so fakelag works with host_timescale - net_time += frametime * host_timescale.GetFloat(); + const float timescale = host_timescale.GetFloat(); + if (timescale > 0) + { + net_time += frametime * timescale; + } + else + { + net_time += frametime; + } } /* diff --git a/src/engine/vgui_baseui_interface.cpp b/src/engine/vgui_baseui_interface.cpp index 368485a4d..7bdd077d3 100644 --- a/src/engine/vgui_baseui_interface.cpp +++ b/src/engine/vgui_baseui_interface.cpp @@ -124,7 +124,9 @@ IGameConsole *staticGameConsole = NULL; bool s_bWindowsInputEnabled = true; ConVar r_drawvgui( "r_drawvgui", "1", FCVAR_CHEAT, "Enable the rendering of vgui panels" ); +#if defined( _X360 ) || defined( STAGING_ONLY ) ConVar gameui_xbox( "gameui_xbox", "0", 0 ); +#endif void Con_CreateConsolePanel( vgui::Panel *parent ); void CL_CreateEntityReportPanel( vgui::Panel *parent ); @@ -2142,11 +2144,11 @@ void VGui_FindNamedPanels( CUtlVector< vgui::VPANEL >& panelList, char const *pa VGui_RecursiveFindPanels( panelList, embedded, panelname ); } -CON_COMMAND( vgui_togglepanel, "show/hide vgui panel by name." ) +CON_COMMAND_F( vgui_togglepanel, "show/hide vgui panel by name.", FCVAR_CHEAT ) { if ( args.ArgC() < 2 ) { - ConMsg( "Usage: vgui_showpanel panelname\n" ); + ConMsg( "Usage: vgui_togglepanel panelname\n" ); return; } diff --git a/src/game/client/c_vguiscreen.cpp b/src/game/client/c_vguiscreen.cpp index 1be470248..34aada843 100644 --- a/src/game/client/c_vguiscreen.cpp +++ b/src/game/client/c_vguiscreen.cpp @@ -650,6 +650,8 @@ C_BaseEntity *FindNearbyVguiScreen( const Vector &viewPosition, const QAngle &vi // X360TBD: Turn this on if feature actually used return NULL; } + // Feature not used, causes crashes if entity exists anyway... 
+ return NULL; C_BasePlayer *pLocalPlayer = C_BasePlayer::GetLocalPlayer(); diff --git a/src/game/client/game_controls/MapOverview.cpp b/src/game/client/game_controls/MapOverview.cpp index 0deb535ca..8b574aed4 100644 --- a/src/game/client/game_controls/MapOverview.cpp +++ b/src/game/client/game_controls/MapOverview.cpp @@ -1019,7 +1019,7 @@ void CMapOverview::SetMode(int mode) { ShowPanel( false ); - g_pClientMode->GetViewportAnimationController()->StartAnimationSequence( "MapOff" ); + g_pClientMode->GetViewportAnimationController()->StartAnimationSequence( this, "MapOff", true, true ); } else if ( mode == MAP_MODE_INSET ) { @@ -1041,7 +1041,7 @@ void CMapOverview::SetMode(int mode) if ( mode != m_nMode && RunHudAnimations() ) { - g_pClientMode->GetViewportAnimationController()->StartAnimationSequence( "MapZoomToSmall" ); + g_pClientMode->GetViewportAnimationController()->StartAnimationSequence( this, "MapZoomToSmall", true, true ); } } else if ( mode == MAP_MODE_FULL ) @@ -1061,7 +1061,7 @@ void CMapOverview::SetMode(int mode) if ( mode != m_nMode && RunHudAnimations() ) { - g_pClientMode->GetViewportAnimationController()->StartAnimationSequence( "MapZoomToLarge" ); + g_pClientMode->GetViewportAnimationController()->StartAnimationSequence( this, "MapZoomToLarge", true, true ); } } diff --git a/src/game/client/game_controls/baseviewport.cpp b/src/game/client/game_controls/baseviewport.cpp index 181b0d333..90dab0849 100644 --- a/src/game/client/game_controls/baseviewport.cpp +++ b/src/game/client/game_controls/baseviewport.cpp @@ -79,7 +79,7 @@ void hud_autoreloadscript_callback( IConVar *var, const char *pOldValue, float f static ConVar cl_leveloverviewmarker( "cl_leveloverviewmarker", "0", FCVAR_CHEAT ); -CON_COMMAND( showpanel, "Shows a viewport panel " ) +CON_COMMAND_F( showpanel, "Shows a viewport panel ", FCVAR_CHEAT ) { if ( !gViewPortInterface ) return; @@ -90,7 +90,7 @@ CON_COMMAND( showpanel, "Shows a viewport panel " ) gViewPortInterface->ShowPanel( args[ 1 ], true ); } -CON_COMMAND( hidepanel, "Hides a viewport panel " ) +CON_COMMAND_F( hidepanel, "Hides a viewport panel ", FCVAR_CHEAT ) { if ( !gViewPortInterface ) return; diff --git a/src/game/client/hud_base_account.cpp b/src/game/client/hud_base_account.cpp index 9d99a2c3c..0ec27e0ca 100644 --- a/src/game/client/hud_base_account.cpp +++ b/src/game/client/hud_base_account.cpp @@ -28,7 +28,7 @@ void CHudBaseAccount::LevelInit( void ) m_pszLastAnimationName = NULL; m_pszQueuedAnimationName = NULL; - GetAnimationController()->StartAnimationSequence("AccountMoneyInvisible"); + GetAnimationController()->StartAnimationSequence(this, "AccountMoneyInvisible", true, true); } void CHudBaseAccount::ApplySchemeSettings(vgui::IScheme *pScheme) @@ -91,14 +91,14 @@ void CHudBaseAccount::Paint() { m_pszLastAnimationName = "AccountMoneyAdded"; } - GetAnimationController()->StartAnimationSequence( m_pszLastAnimationName ); + GetAnimationController()->StartAnimationSequence( this, m_pszLastAnimationName, true, true ); m_flLastAnimationEnd = gpGlobals->curtime + GetAnimationController()->GetAnimationSequenceLength( m_pszLastAnimationName ); m_iPreviousAccount = account; } else if ( m_pszQueuedAnimationName ) { - GetAnimationController()->StartAnimationSequence( m_pszQueuedAnimationName ); + GetAnimationController()->StartAnimationSequence( this, m_pszQueuedAnimationName, true, true ); m_pszQueuedAnimationName = NULL; } diff --git a/src/game/client/hud_controlpointicons.cpp b/src/game/client/hud_controlpointicons.cpp index b18ab4491..0c1372732 
100644 --- a/src/game/client/hud_controlpointicons.cpp +++ b/src/game/client/hud_controlpointicons.cpp @@ -551,7 +551,7 @@ void CControlPointIcon::PerformLayout( void ) if ( m_pCapNumPlayers ) { m_pCapNumPlayers->SetVisible( (iPlayers>1) ); - SetDialogVariable( "numcappers", iPlayers ); + SetDialogVariable( "numcappers", iPlayers, false ); m_pCapNumPlayers->SetFgColor( Color(0,0,0,255) ); } diff --git a/src/game/client/menu.cpp b/src/game/client/menu.cpp index f5ee31692..1174fa2cd 100644 --- a/src/game/client/menu.cpp +++ b/src/game/client/menu.cpp @@ -256,12 +256,12 @@ void CHudMenu::SelectMenuItem( int menu_item ) m_nSelectedItem = menu_item; // Pulse the selection - g_pClientMode->GetViewportAnimationController()->StartAnimationSequence("MenuPulse"); + g_pClientMode->GetViewportAnimationController()->StartAnimationSequence(this, "MenuPulse", true, true); // remove the menu quickly m_bMenuTakesInput = false; m_flShutoffTime = gpGlobals->realtime + m_flOpenCloseTime; - g_pClientMode->GetViewportAnimationController()->StartAnimationSequence("MenuClose"); + g_pClientMode->GetViewportAnimationController()->StartAnimationSequence(this, "MenuClose", true, true); } } @@ -365,7 +365,7 @@ void CHudMenu::HideMenu( void ) { m_bMenuTakesInput = false; m_flShutoffTime = gpGlobals->realtime + m_flOpenCloseTime; - g_pClientMode->GetViewportAnimationController()->StartAnimationSequence("MenuClose"); + g_pClientMode->GetViewportAnimationController()->StartAnimationSequence(this, "MenuClose", true, true); } //----------------------------------------------------------------------------- @@ -384,7 +384,7 @@ void CHudMenu::ShowMenu( const char * menuName, int validSlots ) Q_strncpy( g_szPrelocalisedMenuString, menuName, sizeof( g_szPrelocalisedMenuString ) ); - g_pClientMode->GetViewportAnimationController()->StartAnimationSequence("MenuOpen"); + g_pClientMode->GetViewportAnimationController()->StartAnimationSequence(this, "MenuOpen", true, true); m_nSelectedItem = -1; // we have the whole string, so we can localise it now @@ -409,7 +409,7 @@ void CHudMenu::ShowMenu_KeyValueItems( KeyValues *pKV ) m_fWaitingForMore = 0; m_bitsValidSlots = 0; - g_pClientMode->GetViewportAnimationController()->StartAnimationSequence("MenuOpen"); + g_pClientMode->GetViewportAnimationController()->StartAnimationSequence(this, "MenuOpen", true, true); m_nSelectedItem = -1; g_szMenuString[0] = '\0'; @@ -489,7 +489,7 @@ void CHudMenu::MsgFunc_ShowMenu( bf_read &msg) if ( !NeedMore ) { - g_pClientMode->GetViewportAnimationController()->StartAnimationSequence("MenuOpen"); + g_pClientMode->GetViewportAnimationController()->StartAnimationSequence(this, "MenuOpen", true, true); m_nSelectedItem = -1; // we have the whole string, so we can localise it now diff --git a/src/game/client/perfvisualbenchmark.cpp b/src/game/client/perfvisualbenchmark.cpp index 29eb1fdf8..91e336633 100644 --- a/src/game/client/perfvisualbenchmark.cpp +++ b/src/game/client/perfvisualbenchmark.cpp @@ -233,7 +233,7 @@ void CPerfVisualBenchmark::Stop() #endif m_bIsOn = false; Print(); - engine->ClientCmd_Unrestricted("host_timescale 0"); // pause the mofo + engine->ClientCmd_Unrestricted("host_timescale 0.0001"); // pause the mofo // engine->ClientCmd_Unrestricted("unpause"); // unpause the mofo // engine->ClientCmd_Unrestricted("wait"); engine->ClientCmd_Unrestricted("toggleconsole"); diff --git a/src/game/client/replay/vgui/replayperformanceeditor.cpp b/src/game/client/replay/vgui/replayperformanceeditor.cpp index be274ccb6..3d05fb802 100644 --- 
a/src/game/client/replay/vgui/replayperformanceeditor.cpp +++ b/src/game/client/replay/vgui/replayperformanceeditor.cpp @@ -909,7 +909,7 @@ class CReplayEditorFastForwardButton : public CReplayButton // the user is still holding downt he FF button at the end of the replay. if ( m_pHostTimescale ) { - m_pHostTimescale->SetValue( 1.0f ); + m_pHostTimescale->SetValue( 0.0f ); } // Resume demo playback so that any demo played later won't start paused. diff --git a/src/game/client/tf/c_tf_player.cpp b/src/game/client/tf/c_tf_player.cpp index d43b0c3da..c5396cce3 100644 --- a/src/game/client/tf/c_tf_player.cpp +++ b/src/game/client/tf/c_tf_player.cpp @@ -2922,7 +2922,9 @@ class CProxyAnimatedWeaponSheen : public CBaseAnimatedTextureProxy } C_BaseEntity *pBaseEntity = pRend->GetIClientUnknown()->GetBaseEntity(); - const CEconItemView *pItem = dynamic_cast< CEconItemView* >( pRend ); + CEconItemView *pItem = dynamic_cast< CEconItemView* >( pRend ); + + CEconItemViewDataCacher itemDataCacher(pItem); uint32 unAttrValue = 0; uint32 unEffectValue = 0; @@ -2974,6 +2976,7 @@ class CProxyAnimatedWeaponSheen : public CBaseAnimatedTextureProxy if ( pWearable ) { pItem = pWearable->GetAttributeContainer()->GetItem(); + itemDataCacher.SetItem(pItem); pTFPlayer = ToTFPlayer( pWearable->GetOwnerEntity() ); break; } @@ -2983,6 +2986,7 @@ class CProxyAnimatedWeaponSheen : public CBaseAnimatedTextureProxy if ( pModel->GetOuter() ) { pItem = pModel->GetOuter()->GetAttributeContainer()->GetItem(); + itemDataCacher.SetItem(pItem); pBaseEntity = pBaseEntity->GetOwnerEntity(); if ( pItem ) { @@ -3005,6 +3009,7 @@ class CProxyAnimatedWeaponSheen : public CBaseAnimatedTextureProxy if ( pWeapon ) { pItem = pWeapon->GetAttributeContainer()->GetItem(); + itemDataCacher.SetItem(pItem); pBaseEntity = pWeapon; } bIsFirstPerson = true; @@ -3017,6 +3022,7 @@ class CProxyAnimatedWeaponSheen : public CBaseAnimatedTextureProxy if ( pWeapon ) { pItem = pWeapon->GetAttributeContainer()->GetItem(); + itemDataCacher.SetItem(pItem); pBaseEntity = pWeapon; } } @@ -3025,6 +3031,7 @@ class CProxyAnimatedWeaponSheen : public CBaseAnimatedTextureProxy else { pItem = pWeapon->GetAttributeContainer()->GetItem(); + itemDataCacher.SetItem(pItem); pBaseEntity = pWeapon; pTFPlayer = ToTFPlayer( pWeapon->GetOwner() ); } @@ -3046,6 +3053,7 @@ class CProxyAnimatedWeaponSheen : public CBaseAnimatedTextureProxy if ( pTFPlayer && pTFPlayer->m_Shared.GetDisguiseWeapon() ) { pItem = pTFPlayer->m_Shared.GetDisguiseWeapon()->GetAttributeContainer()->GetItem(); + itemDataCacher.SetItem(pItem); pBaseEntity = pTFPlayer->m_Shared.GetDisguiseWeapon(); } } @@ -3170,11 +3178,13 @@ class CProxyAnimatedWeaponSheen : public CBaseAnimatedTextureProxy int iShaderIndex = sheenParams.m_iShaderIndex; // Australium weapons always use iShaderIndex 1 + pItem->CacheSOCData(); const CEconStyleInfo *pStyle = pItem->GetStaticData()->GetStyleInfo( pItem->GetItemStyle() ); if ( pStyle && !pStyle->IsSelectable() ) { iShaderIndex = 1; } + pItem->UncacheSOCData(); #ifdef STAGING_ONLY if ( tf_sheen_shader_override.GetInt() > 0 ) @@ -3728,6 +3738,8 @@ class CWeaponSkinProxy : public IMaterialProxy if ( !pItem ) return; + CEconItemViewDataCacher dataCacher(pItem); + C_TFPlayer *pOwner = GetOwnerFromProxyEntity( pC_BaseEntity ); int desiredW = m_pBaseTextureOrig->GetActualWidth(); int desiredH = m_pBaseTextureOrig->GetActualHeight(); @@ -7493,8 +7505,8 @@ void C_TFPlayer::UpdateIDTarget() trace_t tr; Vector vecStart, vecEnd; - VectorMA( MainViewOrigin(), MAX_TRACE_LENGTH, 
MainViewForward(), vecEnd ); - VectorMA( MainViewOrigin(), 10, MainViewForward(), vecStart ); + VectorMA( MainViewOrigin(), 8192.0f, MainViewForward(), vecEnd ); + VectorMA( MainViewOrigin(), 10.0f, MainViewForward(), vecStart ); // If we're in observer mode, ignore our observer target. Otherwise, ignore ourselves. if ( IsObserver() ) @@ -7511,7 +7523,11 @@ void C_TFPlayer::UpdateIDTarget() iReviveMedic = 1; } - int nMask = MASK_SOLID | CONTENTS_DEBRIS; + int nMask = MASK_SOLID; + if ( iReviveMedic == 1) + { + nMask |= CONTENTS_DEBRIS; + } UTIL_TraceLine( vecStart, vecEnd, nMask, this, COLLISION_GROUP_NONE, &tr ); } diff --git a/src/game/client/tf/tf_hud_arena_vs_panel.cpp b/src/game/client/tf/tf_hud_arena_vs_panel.cpp index d2df4f395..cee248151 100644 --- a/src/game/client/tf/tf_hud_arena_vs_panel.cpp +++ b/src/game/client/tf/tf_hud_arena_vs_panel.cpp @@ -121,7 +121,7 @@ void CHudArenaVsPanel::FireGameEvent( IGameEvent * event ) if ( m_bVisible ) { - g_pClientMode->GetViewportAnimationController()->StartAnimationSequence( "ArenaVsPanelOnShow" ); + g_pClientMode->GetViewportAnimationController()->StartAnimationSequence( this, "ArenaVsPanelOnShow", true, true ); m_flHideTime = gpGlobals->curtime + 10.0f; diff --git a/src/game/client/tf/tf_hud_escort.cpp b/src/game/client/tf/tf_hud_escort.cpp index 68c4ca740..233fb12b9 100644 --- a/src/game/client/tf/tf_hud_escort.cpp +++ b/src/game/client/tf/tf_hud_escort.cpp @@ -696,7 +696,7 @@ void CTFHudEscort::UpdateAlarmAnimations( void ) void CTFHudEscort::OnTick() { // don't need to do this on non-escort maps (unless we're trying to override the HUD type) - if ( TFGameRules() && ( TFGameRules()->GetGameType() != TF_GAMETYPE_ESCORT ) && ( TFGameRules()->GetHUDType() != TF_HUDTYPE_ESCORT ) ) + if ( !TFGameRules() || ( TFGameRules()->GetGameType() != TF_GAMETYPE_ESCORT ) && ( TFGameRules()->GetHUDType() != TF_HUDTYPE_ESCORT ) ) return; if ( !BaseClass::IsVisible() ) // intentionally skipping our version of IsVisible() to bypass the !m_bHaveValidPointPositions check @@ -831,14 +831,14 @@ void CTFHudEscort::OnTick() if ( flSecondsToRecede > 0.0f && flSecondsToRecede <= TF_ESCORT_RECEDE_COUNTDOWN ) { int iDisplaySeconds = (int)( flSecondsToRecede ) + 1; - m_pEscortItemPanel->SetDialogVariable( "recede", VarArgs( "%d", iDisplaySeconds ) ); + m_pEscortItemPanel->SetDialogVariable( "recede", VarArgs( "%d", iDisplaySeconds ), false ); // we should not be showing the blocked image if we're showing the countdown m_pBlocked->SetVisible( false ); } else { - m_pEscortItemPanel->SetDialogVariable( "recede", "" ); + m_pEscortItemPanel->SetDialogVariable( "recede", "", false ); } // Debug string diff --git a/src/game/client/tf/tf_hud_flagstatus.cpp b/src/game/client/tf/tf_hud_flagstatus.cpp index c851ca162..1cefa8398 100644 --- a/src/game/client/tf/tf_hud_flagstatus.cpp +++ b/src/game/client/tf/tf_hud_flagstatus.cpp @@ -507,7 +507,7 @@ void CTFHudFlagObjectives::ApplySchemeSettings( IScheme *pScheme ) //----------------------------------------------------------------------------- void CTFHudFlagObjectives::Reset() { - g_pClientMode->GetViewportAnimationController()->StartAnimationSequence( "FlagOutlineHide" ); + g_pClientMode->GetViewportAnimationController()->StartAnimationSequence( this, "FlagOutlineHide", true, true ); UpdateStatus(); } @@ -641,37 +641,37 @@ void CTFHudFlagObjectives::OnTick() } // are we playing captures for rounds? 
- if ( !TFGameRules() || ( !TFGameRules()->IsPlayingHybrid_CTF_CP() && !TFGameRules()->IsPlayingSpecialDeliveryMode() && !TFGameRules()->IsMannVsMachineMode() ) ) + if ( TFGameRules() && ( !TFGameRules()->IsPlayingHybrid_CTF_CP() && !TFGameRules()->IsPlayingSpecialDeliveryMode() && !TFGameRules()->IsMannVsMachineMode() ) ) { if ( tf_flag_caps_per_round.GetInt() > 0 ) { C_TFTeam *pTeam = GetGlobalTFTeam( TF_TEAM_BLUE ); if ( pTeam ) { - SetDialogVariable( "bluescore", pTeam->GetFlagCaptures() ); + SetDialogVariable( "bluescore", pTeam->GetFlagCaptures(), false ); } pTeam = GetGlobalTFTeam( TF_TEAM_RED ); if ( pTeam ) { - SetDialogVariable( "redscore", pTeam->GetFlagCaptures() ); + SetDialogVariable( "redscore", pTeam->GetFlagCaptures(), false ); } SetPlayingToLabelVisible( true ); - SetDialogVariable( "rounds", tf_flag_caps_per_round.GetInt() ); + SetDialogVariable( "rounds", tf_flag_caps_per_round.GetInt(), false ); } else // we're just playing straight score { C_TFTeam *pTeam = GetGlobalTFTeam( TF_TEAM_BLUE ); if ( pTeam ) { - SetDialogVariable( "bluescore", pTeam->Get_Score() ); + SetDialogVariable( "bluescore", pTeam->Get_Score(), false ); } pTeam = GetGlobalTFTeam( TF_TEAM_RED ); if ( pTeam ) { - SetDialogVariable( "redscore", pTeam->Get_Score() ); + SetDialogVariable( "redscore", pTeam->Get_Score(), false ); } SetPlayingToLabelVisible( false ); @@ -834,7 +834,7 @@ void CTFHudFlagObjectives::UpdateStatus( C_BasePlayer *pNewOwner /*= NULL*/, C_B if ( !m_bFlagAnimationPlayed ) { m_bFlagAnimationPlayed = true; - g_pClientMode->GetViewportAnimationController()->StartAnimationSequence( "FlagOutline" ); + g_pClientMode->GetViewportAnimationController()->StartAnimationSequence( this, "FlagOutline", true, true ); } if ( m_pCapturePoint && !m_pCapturePoint->IsVisible() ) @@ -864,7 +864,7 @@ void CTFHudFlagObjectives::UpdateStatus( C_BasePlayer *pNewOwner /*= NULL*/, C_B if ( m_bCarryingFlag ) { m_bCarryingFlag = false; - g_pClientMode->GetViewportAnimationController()->StartAnimationSequence( "FlagOutline" ); + g_pClientMode->GetViewportAnimationController()->StartAnimationSequence( this, "FlagOutline", true, true ); } m_bFlagAnimationPlayed = false; diff --git a/src/game/client/tf/tf_hud_itemeffectmeter.cpp b/src/game/client/tf/tf_hud_itemeffectmeter.cpp index 6f9bb2b42..f16d3b2fb 100644 --- a/src/game/client/tf/tf_hud_itemeffectmeter.cpp +++ b/src/game/client/tf/tf_hud_itemeffectmeter.cpp @@ -451,11 +451,11 @@ void CHudItemEffectMeter::Update( C_TFPlayer* pPlayer, const char* pSoundScript { if ( ShowPercentSymbol() ) { - SetDialogVariable( "progresscount", VarArgs( "%d%%", iCount ) ); + SetDialogVariable( "progresscount", VarArgs( "%d%%", iCount ), false ); } else { - SetDialogVariable( "progresscount", iCount ); + SetDialogVariable( "progresscount", iCount, false ); } } diff --git a/src/game/client/tf/tf_hud_mann_vs_machine_status.cpp b/src/game/client/tf/tf_hud_mann_vs_machine_status.cpp index b05c77992..d969ce6a5 100644 --- a/src/game/client/tf/tf_hud_mann_vs_machine_status.cpp +++ b/src/game/client/tf/tf_hud_mann_vs_machine_status.cpp @@ -1123,7 +1123,7 @@ void CInWorldCurrencyStatus::OnTick( void ) char szTmp[16]; Q_snprintf( szTmp, ARRAYSIZE( szTmp ), "$%d", nWorldMoney ); - SetDialogVariable( "currency", szTmp ); + SetDialogVariable( "currency", szTmp, false ); } //----------------------------------------------------------------------------- // Purpose: diff --git a/src/game/client/tf/tf_hud_match_status.cpp b/src/game/client/tf/tf_hud_match_status.cpp index ed0dac2cb..acd7bc892 100644 
--- a/src/game/client/tf/tf_hud_match_status.cpp +++ b/src/game/client/tf/tf_hud_match_status.cpp @@ -41,7 +41,7 @@ static ConVar tf_use_match_hud("tf_use_match_hud", "1", FCVAR_ARCHIVE); //----------------------------------------------------------------------------- bool ShouldUseMatchHUD() { - if ((TFGameRules()->IsMannVsMachineMode())) + if (!TFGameRules() || (TFGameRules()->IsMannVsMachineMode())) return false; return tf_use_match_hud.GetBool(); diff --git a/src/game/client/tf/tf_hud_passtime.cpp b/src/game/client/tf/tf_hud_passtime.cpp index 6181fedba..4b2a0435d 100644 --- a/src/game/client/tf/tf_hud_passtime.cpp +++ b/src/game/client/tf/tf_hud_passtime.cpp @@ -218,8 +218,8 @@ void CTFHudTeamScore::OnTick() tf_passtime_scores_per_round.GetInt() ); } - SetDialogVariable( "bluescore", iBlueScore ); - SetDialogVariable( "redscore", iRedScore ); + SetDialogVariable( "bluescore", iBlueScore, false ); + SetDialogVariable( "redscore", iRedScore, false ); } //----------------------------------------------------------------------------- diff --git a/src/game/client/tf/tf_hud_playerstatus.cpp b/src/game/client/tf/tf_hud_playerstatus.cpp index 26723ce15..806ceb5fd 100644 --- a/src/game/client/tf/tf_hud_playerstatus.cpp +++ b/src/game/client/tf/tf_hud_playerstatus.cpp @@ -866,11 +866,11 @@ void CTFHudPlayerHealth::SetHealth( int iNewHealth, int iMaxHealth, int iMaxBuff // set our health display value if ( m_nHealth > 0 ) { - SetDialogVariable( "Health", m_nHealth ); + SetDialogVariable( "Health", m_nHealth, false ); } else { - SetDialogVariable( "Health", "" ); + SetDialogVariable( "Health", "", false ); } } diff --git a/src/game/client/tf/tf_hud_pve_winpanel.cpp b/src/game/client/tf/tf_hud_pve_winpanel.cpp index a8ab8efc6..3f70a4411 100644 --- a/src/game/client/tf/tf_hud_pve_winpanel.cpp +++ b/src/game/client/tf/tf_hud_pve_winpanel.cpp @@ -167,7 +167,7 @@ void CTFPVEWinPanel::OnTick() // Do this only once if ( bRespecVisible && !m_pRespecBackground->IsVisible() ) { - g_pClientMode->GetViewportAnimationController()->StartAnimationSequence( "RespecEarnedPulseLoss" ); + g_pClientMode->GetViewportAnimationController()->StartAnimationSequence( m_pRespecContainerPanel, "RespecEarnedPulseLoss", true, true ); C_TFPlayer *pLocalTFPlayer = C_TFPlayer::GetLocalTFPlayer(); if ( pLocalTFPlayer ) diff --git a/src/game/client/tf/tf_hud_robot_destruction_status.cpp b/src/game/client/tf/tf_hud_robot_destruction_status.cpp index 115f708d3..4b88ebcf2 100644 --- a/src/game/client/tf/tf_hud_robot_destruction_status.cpp +++ b/src/game/client/tf/tf_hud_robot_destruction_status.cpp @@ -14,6 +14,7 @@ #include "tf_logic_player_destruction.h" #include "c_tf_objective_resource.h" #include "c_func_capture_zone.h" +#include "tf_hud_objectivestatus.h" #define ATTACK_BLINK_TIME 2.f @@ -516,7 +517,8 @@ void CTFHUDRobotDestruction::PerformRobotLayout( RobotVector_t& vecRobots, int n //----------------------------------------------------------------------------- void CTFHUDRobotDestruction::Reset() { - g_pClientMode->GetViewportAnimationController()->StartAnimationSequence( "FlagOutlineHide" ); + CTFHudObjectiveStatus *pStatus = GET_HUDELEMENT( CTFHudObjectiveStatus ); + g_pClientMode->GetViewportAnimationController()->StartAnimationSequence( pStatus, "FlagOutlineHide" ); } //----------------------------------------------------------------------------- @@ -562,8 +564,8 @@ void CTFHUDRobotDestruction::OnTick() if ( !pRoboLogic ) return; - m_pRedScoreValueContainer->SetDialogVariable( "score", pRoboLogic->GetScore( TF_TEAM_RED ) ); 
- m_pBlueScoreValueContainer->SetDialogVariable( "score", pRoboLogic->GetScore( TF_TEAM_BLUE ) ); + m_pRedScoreValueContainer->SetDialogVariable( "score", pRoboLogic->GetScore( TF_TEAM_RED ), false ); + m_pBlueScoreValueContainer->SetDialogVariable( "score", pRoboLogic->GetScore( TF_TEAM_BLUE ), false ); #ifdef STAGING_ONLY if ( rd_hud_test_bars.GetBool() ) @@ -574,8 +576,8 @@ void CTFHUDRobotDestruction::OnTick() m_pBlueProgressBarEscrow->SetProgress( 0.f, true ); m_pRedProgressBarEscrow->SetProgress( 0.f, true ); - m_pRedScoreValueContainer->SetDialogVariable( "score", flProgress ); - m_pBlueScoreValueContainer->SetDialogVariable( "score", flProgress ); + m_pRedScoreValueContainer->SetDialogVariable( "score", flProgress, false ); + m_pBlueScoreValueContainer->SetDialogVariable( "score", flProgress, false ); } else #endif @@ -599,8 +601,8 @@ void CTFHUDRobotDestruction::OnTick() if ( m_pProgressBarsContainer ) { - m_pProgressBarsContainer->SetDialogVariable( "red_escrow", nRedEscrow ); - m_pProgressBarsContainer->SetDialogVariable( "blue_escrow", nBlueEscrow ); + m_pProgressBarsContainer->SetDialogVariable( "red_escrow", nRedEscrow, false ); + m_pProgressBarsContainer->SetDialogVariable( "blue_escrow", nBlueEscrow, false ); } // update the team leader image @@ -778,7 +780,7 @@ void CTFHUDRobotDestruction::OnTick() } SetPlayingToLabelVisible( true ); - SetDialogVariable( "rounds", pRoboLogic->GetMaxPoints() ); + SetDialogVariable( "rounds", pRoboLogic->GetMaxPoints(), false ); // HACK! Fix the events UpdateCarriedFlagStatus( NULL, NULL ); } @@ -865,7 +867,7 @@ void CTFHUDRobotDestruction::UpdateStolenPoints( int nTeam, EditablePanel* pCont } // Show the stolen panels if the stolen score is anything pContainer->SetVisible( nStolenPoints > 0 ); - pContainer->SetDialogVariable( "intelvalue", nStolenPoints ); + pContainer->SetDialogVariable( "intelvalue", nStolenPoints, false ); } // Find our stolen flag @@ -947,7 +949,7 @@ void CTFHUDRobotDestruction::UpdateCarriedFlagStatus( C_BasePlayer *pNewOwner /* if ( pPlayerFlag && !pPlayerFlag->IsMarkedForDeletion() && !pPlayerFlag->IsDormant() ) { m_pCarriedContainer->SetVisible( true ); - m_pCarriedContainer->SetDialogVariable( "flagvalue", pPlayerFlag->GetPointValue() ); + m_pCarriedContainer->SetDialogVariable( "flagvalue", pPlayerFlag->GetPointValue(), false ); // make sure the panels are on, set the initial alpha values, // set the color of the flag we're carrying, and start the animations if ( m_pCarriedImage && !m_pCarriedImage->IsVisible() ) diff --git a/src/game/client/tf/tf_hud_scope.cpp b/src/game/client/tf/tf_hud_scope.cpp index 686c68ff7..e071292df 100644 --- a/src/game/client/tf/tf_hud_scope.cpp +++ b/src/game/client/tf/tf_hud_scope.cpp @@ -242,6 +242,7 @@ class CHudScope : public vgui::Panel, public CHudElement virtual void ApplySchemeSettings(vgui::IScheme *scheme); virtual void Paint( void ); virtual bool ShouldDraw( void ); + virtual bool CanAnimate() const override { return false; }; private: int m_iScopeTexture[4]; diff --git a/src/game/client/tf/tf_hud_target_id.cpp b/src/game/client/tf/tf_hud_target_id.cpp index dd41b296b..8dddb7060 100644 --- a/src/game/client/tf/tf_hud_target_id.cpp +++ b/src/game/client/tf/tf_hud_target_id.cpp @@ -1016,7 +1016,7 @@ void CTargetID::UpdateID( void ) if ( m_pMoveableSubPanel->IsVisible() ) { const char *pBoundKey = engine->Key_LookupBinding( pszActionCommand ); - m_pMoveableSubPanel->SetDialogVariable( "movekey", pBoundKey ); + m_pMoveableSubPanel->SetDialogVariable( "movekey", pBoundKey, false 
); } if ( m_pMoveableIcon ) @@ -1060,7 +1060,7 @@ void CTargetID::UpdateID( void ) m_pTargetNameLabel->SetFgColor( colorName ); // TODO: Support if( hud_centerid.GetInt() == 0 ) - SetDialogVariable( "targetname", sIDString ); + SetDialogVariable( "targetname", sIDString, false ); } else { @@ -1075,7 +1075,7 @@ void CTargetID::UpdateID( void ) m_pTargetDataLabel->SetVisible(true); m_pTargetDataLabel->SetFgColor( colorData ); - SetDialogVariable( "targetdata", sDataString ); + SetDialogVariable( "targetdata", sDataString, false ); } else { diff --git a/src/game/client/tf/tf_hud_tournament.cpp b/src/game/client/tf/tf_hud_tournament.cpp index d34cd5738..6653e29fa 100644 --- a/src/game/client/tf/tf_hud_tournament.cpp +++ b/src/game/client/tf/tf_hud_tournament.cpp @@ -299,15 +299,15 @@ void CHudTournament::PreparePanel( void ) pszLabelText = "Tournament_Instructions_Waiting"; } - SetDialogVariable( "readylabel", g_pVGuiLocalize->Find( pszLabelText ) ); - SetDialogVariable( "tournamentstatelabel", g_pVGuiLocalize->Find( "Tournament_WaitingForTeam" ) ); + SetDialogVariable( "readylabel", g_pVGuiLocalize->Find( pszLabelText ), false ); + SetDialogVariable( "tournamentstatelabel", g_pVGuiLocalize->Find( "Tournament_WaitingForTeam" ), false ); SetPlayerPanelsVisible( true ); m_pModeImage->SetVisible( m_bCompetitiveMode ); } else { - SetDialogVariable( "readylabel", g_pVGuiLocalize->Find( "Tournament_Instructions" ) ); - SetDialogVariable( "tournamentstatelabel", g_pVGuiLocalize->Find( "Tournament_WaitingForTeams" ) ); + SetDialogVariable( "readylabel", g_pVGuiLocalize->Find( "Tournament_Instructions" ), false ); + SetDialogVariable( "tournamentstatelabel", g_pVGuiLocalize->Find( "Tournament_WaitingForTeams" ), false ); SetPlayerPanelsVisible( false ); m_pModeImage->SetVisible( false ); } @@ -333,18 +333,18 @@ void CHudTournament::PreparePanel( void ) if ( pFormatString ) { g_pVGuiLocalize->ConstructString_safe( szCountdown, pFormatString, 1, wzVal ); - SetDialogVariable( "tournamentstatelabel", szCountdown ); + SetDialogVariable( "tournamentstatelabel", szCountdown, false ); } if ( bAutoReady ) { - SetDialogVariable( "readylabel", g_pVGuiLocalize->Find( "" ) ); + SetDialogVariable( "readylabel", g_pVGuiLocalize->Find( "" ), false ); m_pModeImage->SetVisible( false ); SetPlayerPanelsVisible( false ); } else if ( nTime <= TOURNAMENT_NOCANCEL_TIME ) { - SetDialogVariable( "readylabel", g_pVGuiLocalize->Find( "" ) ); + SetDialogVariable( "readylabel", g_pVGuiLocalize->Find( "" ), false ); } else { @@ -352,17 +352,17 @@ void CHudTournament::PreparePanel( void ) { if ( bSteamController ) { - SetDialogVariable( "readylabel", g_pVGuiLocalize->Find( "Tournament_Instructions_Ready_NoKeyHintText" ) ); + SetDialogVariable( "readylabel", g_pVGuiLocalize->Find( "Tournament_Instructions_Ready_NoKeyHintText" ), false ); bShowReadyHintIcon = true; } else { - SetDialogVariable( "readylabel", g_pVGuiLocalize->Find( "Tournament_Instructions_Ready" ) ); + SetDialogVariable( "readylabel", g_pVGuiLocalize->Find( "Tournament_Instructions_Ready" ), false ); } } else { - SetDialogVariable( "readylabel", g_pVGuiLocalize->Find( "" ) ); + SetDialogVariable( "readylabel", g_pVGuiLocalize->Find( "" ), false ); } } @@ -411,13 +411,13 @@ void CHudTournament::PreparePanel( void ) #endif C_TFTeam *pBlueTeam = GetGlobalTFTeam( TF_TEAM_BLUE ); - SetDialogVariable( "bluenamelabel", pBlueTeam ? pBlueTeam->Get_Localized_Name() : L"BLU" ); + SetDialogVariable( "bluenamelabel", pBlueTeam ? 
pBlueTeam->Get_Localized_Name() : L"BLU", false ); C_TFTeam *pRedTeam = GetGlobalTFTeam( TF_TEAM_RED ); - SetDialogVariable( "rednamelabel", pRedTeam ? pRedTeam->Get_Localized_Name() : L"RED" ); + SetDialogVariable( "rednamelabel", pRedTeam ? pRedTeam->Get_Localized_Name() : L"RED", false ); - SetDialogVariable( "bluestate", TFGameRules()->IsTeamReady( TF_TEAM_BLUE ) ? g_pVGuiLocalize->Find( "Tournament_TeamReady" ) : g_pVGuiLocalize->Find( "Tournament_TeamNotReady" ) ); - SetDialogVariable( "redstate", TFGameRules()->IsTeamReady( TF_TEAM_RED ) ? g_pVGuiLocalize->Find( "Tournament_TeamReady" ) : g_pVGuiLocalize->Find( "Tournament_TeamNotReady" ) ); + SetDialogVariable( "bluestate", TFGameRules()->IsTeamReady( TF_TEAM_BLUE ) ? g_pVGuiLocalize->Find( "Tournament_TeamReady" ) : g_pVGuiLocalize->Find( "Tournament_TeamNotReady" ), false ); + SetDialogVariable( "redstate", TFGameRules()->IsTeamReady( TF_TEAM_RED ) ? g_pVGuiLocalize->Find( "Tournament_TeamReady" ) : g_pVGuiLocalize->Find( "Tournament_TeamNotReady" ), false ); if ( m_bTeamReady[TF_TEAM_BLUE] != TFGameRules()->IsTeamReady( TF_TEAM_BLUE ) || m_bTeamReady[TF_TEAM_RED] != TFGameRules()->IsTeamReady( TF_TEAM_RED ) ) { @@ -475,7 +475,7 @@ void CHudTournament::PreparePanel( void ) _snwprintf( szWindConditions, ARRAYSIZE( szWindConditions ), STRING_FMT STRING_FMT, szWindConditions, g_pVGuiLocalize->Find( "Tournament_WinConditionsNone" ) ); } - SetDialogVariable( "winconditions", szWindConditions ); + SetDialogVariable( "winconditions", szWindConditions, false ); } //----------------------------------------------------------------------------- @@ -1199,7 +1199,7 @@ void CHudTournamentSetup::OnTick( void ) m_pNameEntry->SetText( ( iLocalTeam == TF_TEAM_BLUE ) ? mp_tournament_blueteamname.GetString() : mp_tournament_redteamname.GetString() ); } - SetDialogVariable( "tournamentstatelabel", TFGameRules()->IsTeamReady( iLocalTeam ) ? g_pVGuiLocalize->Find( "Tournament_TeamSetupReady" ) : g_pVGuiLocalize->Find( "Tournament_TeamSetupNotReady" ) ); + SetDialogVariable( "tournamentstatelabel", TFGameRules()->IsTeamReady( iLocalTeam ) ? 
g_pVGuiLocalize->Find( "Tournament_TeamSetupReady" ) : g_pVGuiLocalize->Find( "Tournament_TeamSetupNotReady" ), false ); m_flNextThink = gpGlobals->curtime + TOURNAMENT_PANEL_UPDATE_INTERVAL; } @@ -1431,7 +1431,7 @@ void CHudStopWatch::OnTick( void ) m_pStopWatchImage->SetImage( "../hud/ico_time_none" ); - SetDialogVariable( "stopwatchlabel", g_pVGuiLocalize->Find( "Tournament_StopWatchNoCap" ) ); + SetDialogVariable( "stopwatchlabel", g_pVGuiLocalize->Find( "Tournament_StopWatchNoCap" ), false ); } else if ( TFGameRules()->GetStopWatchState() == STOPWATCH_RUNNING ) { @@ -1472,8 +1472,8 @@ void CHudStopWatch::OnTick( void ) pszPoints = g_pVGuiLocalize->Find( "#Tournament_StopWatch_Points" ); } - SetDialogVariable( "pointslabel", pszPoints ); - SetDialogVariable( "scoretobeat", wzScoreVal ); + SetDialogVariable( "pointslabel", pszPoints, false ); + SetDialogVariable( "scoretobeat", wzScoreVal, false ); wchar_t wzHelp[128]; @@ -1486,7 +1486,7 @@ void CHudStopWatch::OnTick( void ) g_pVGuiLocalize->ConstructString_safe( wzHelp, g_pVGuiLocalize->Find( "Tournament_StopWatch_TimeVictoryDefender" ), 1, pDefender->Get_Localized_Name() ); } - SetDialogVariable( "descriptionlabel", wzHelp ); + SetDialogVariable( "descriptionlabel", wzHelp, false ); if ( pTimer && !pTimer->IsWatchingTimeStamps() ) { @@ -1509,7 +1509,7 @@ void CHudStopWatch::OnTick( void ) m_pStopWatchDescriptionBG->SetVisible( false ); m_pStopWatchDescriptionLabel->SetVisible( false ); - SetDialogVariable( "descriptionlabel", g_pVGuiLocalize->Find( "#Tournament_StopWatch_CapVictory" ) ); + SetDialogVariable( "descriptionlabel", g_pVGuiLocalize->Find( "#Tournament_StopWatch_CapVictory" ), false ); m_pStopWatchImage->SetImage( "../hud/ico_time_60" ); @@ -1533,7 +1533,7 @@ void CHudStopWatch::OnTick( void ) g_pVGuiLocalize->ConstructString_safe( wzScoreVal, g_pVGuiLocalize->Find( "Tournament_StopWatchPointCaptureSpectator" ), 2, wzVal, iPoints == 1 ? 
g_pVGuiLocalize->Find( "#Tournament_StopWatch_Point" ) : g_pVGuiLocalize->Find( "#Tournament_StopWatch_Points" ) ); } - SetDialogVariable( "stopwatchlabel", wzScoreVal ); + SetDialogVariable( "stopwatchlabel", wzScoreVal, false ); } } } diff --git a/src/game/client/tf/tf_time_panel.cpp b/src/game/client/tf/tf_time_panel.cpp index 4d57be64c..26dc9b576 100644 --- a/src/game/client/tf/tf_time_panel.cpp +++ b/src/game/client/tf/tf_time_panel.cpp @@ -562,7 +562,7 @@ void CTFHudTimeStatus::SetExtraTimePanels() CheckClockLabelLength( m_pOvertimeLabel, m_pOvertimeBG ); } } - else + else if ( m_pOvertimeLabel->IsVisible() ) { m_pOvertimeBG->SetVisible( false ); m_pOvertimeLabel->SetVisible( false ); diff --git a/src/game/client/tf/vgui/tf_matchmaking_dashboard_next_map_voting.cpp b/src/game/client/tf/vgui/tf_matchmaking_dashboard_next_map_voting.cpp index e8b86acb0..9fb0067ec 100644 --- a/src/game/client/tf/vgui/tf_matchmaking_dashboard_next_map_voting.cpp +++ b/src/game/client/tf/vgui/tf_matchmaking_dashboard_next_map_voting.cpp @@ -225,6 +225,12 @@ class CNextMapVotingDashboardState : public CTFMatchmakingPopup void UpdateVoteCounts() { +#ifndef STAGING_ONLY + if ( !TFGameRules() ) + { + return; + } +#endif int nVotes[ CTFGameRules::EUserNextMapVote::NUM_VOTE_STATES ]; memset( nVotes, 0, sizeof( nVotes ) ); int nTotalVotes = 0; @@ -257,7 +263,7 @@ class CNextMapVotingDashboardState : public CTFMatchmakingPopup if ( pMapChoicePanel ) { // Update the label with the % total - pMapChoicePanel->SetDialogVariable( "votes", CFmtStr( "%3.0f%%", flPercent ) ); + pMapChoicePanel->SetDialogVariable( "votes", CFmtStr( "%3.0f%%", flPercent ), false ); // Do a color change animation if ( g_pClientMode && g_pClientMode->GetViewport() ) { diff --git a/src/game/client/tf/vgui/tf_playermodelpanel.cpp b/src/game/client/tf/vgui/tf_playermodelpanel.cpp index cb49453a5..6f130b9d2 100644 --- a/src/game/client/tf/vgui/tf_playermodelpanel.cpp +++ b/src/game/client/tf/vgui/tf_playermodelpanel.cpp @@ -1349,6 +1349,8 @@ CEconItemView *CTFPlayerModelPanel::GetLoadoutItemFromMDLHandle( loadout_positio if ( ( IsMiscSlot( iLoadoutSlot ) && IsMiscSlot( iPosition ) ) || ( IsValidPickupWeaponSlot( iLoadoutSlot ) && iLoadoutSlot == iPosition ) ) { + // See if we need to cache for our style getters. + CEconItemViewDataCacher dataCacher(pItem->GetStaticData()->GetNumStyles() ? pItem : NULL); const char * pDisplayModel = pItem->GetPlayerDisplayModel( m_iCurrentClassIndex, m_iTeam ); if ( pDisplayModel ) { @@ -1492,6 +1494,8 @@ bool CTFPlayerModelPanel::UpdateCosmeticParticles( if ( m_aParticleSystems[ iSystem ] && m_aParticleSystems[ iSystem ]->m_bIsUpdateToDate ) return false; + CEconItemViewDataCacher dataCacher(pEconItem); + attachedparticlesystem_t *pParticleSystem = NULL; // do community_sparkle effect if this is a community item? 
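Most of the HUD hunks above add a trailing false argument to SetDialogVariable. The vgui side of that change (EditablePanel.cpp/.h, Panel.h and Label.cpp in the diffstat) is not included in this excerpt, so the exact skip logic is an assumption; the call-site intent is that code rewriting the same value every tick should not force the label resolve/repaint path each time. A minimal caller-side sketch of the pattern follows, with a hypothetical helper that is not code from the patch; the CTrainingDialog overrides in the next diff simply forward the new parameter to their container panel.

#include <vgui_controls/EditablePanel.h>

// Illustrative helper, not part of the patch: shows the per-tick call pattern the
// new bForceUpdate parameter is aimed at.
void UpdateScoreReadout( vgui::EditablePanel *pPanel, int nScore )
{
	if ( !pPanel )
		return;

	// Old pattern: every tick pushes the dialog variable (and the labels bound to it)
	// through the forced update path, even when nScore is identical to last tick.
	//pPanel->SetDialogVariable( "score", nScore );

	// Patched pattern: bForceUpdate = false, so rewriting an unchanged value is cheap.
	// One-shot writes elsewhere keep the default of true.
	pPanel->SetDialogVariable( "score", nScore, false );
}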
diff --git a/src/game/client/tf/vgui/tf_training_ui.cpp b/src/game/client/tf/vgui/tf_training_ui.cpp index 808c4236d..316520e6f 100644 --- a/src/game/client/tf/vgui/tf_training_ui.cpp +++ b/src/game/client/tf/vgui/tf_training_ui.cpp @@ -1765,24 +1765,24 @@ class CTrainingDialog : public EditablePanel ivgui()->RemoveTickSignal( GetVPanel() ); } - virtual void SetDialogVariable( const char *pVarName, const char *pValue ) + virtual void SetDialogVariable( const char *pVarName, const char *pValue, bool bForceUpdate = true ) { - m_pContainer->SetDialogVariable( pVarName, pValue ); + m_pContainer->SetDialogVariable( pVarName, pValue, bForceUpdate ); } - virtual void SetDialogVariable( const char *pVarName, const wchar_t *pValue ) + virtual void SetDialogVariable( const char *pVarName, const wchar_t *pValue, bool bForceUpdate = true ) { - m_pContainer->SetDialogVariable( pVarName, pValue ); + m_pContainer->SetDialogVariable( pVarName, pValue, bForceUpdate ); } - virtual void SetDialogVariable( const char *pVarName, int nValue ) + virtual void SetDialogVariable( const char *pVarName, int nValue, bool bForceUpdate = true ) { - m_pContainer->SetDialogVariable( pVarName, nValue ); + m_pContainer->SetDialogVariable( pVarName, nValue, bForceUpdate ); } - virtual void SetDialogVariable( const char *pVarName, float flValue ) + virtual void SetDialogVariable( const char *pVarName, float flValue, bool bForceUpdate = true ) { - m_pContainer->SetDialogVariable( pVarName, flValue ); + m_pContainer->SetDialogVariable( pVarName, flValue, bForceUpdate ); } void SetupButton( const char *pPanelName, CExButton **ppOut = NULL ) diff --git a/src/game/client/viewrender.cpp b/src/game/client/viewrender.cpp index 3bc622857..72f25164c 100644 --- a/src/game/client/viewrender.cpp +++ b/src/game/client/viewrender.cpp @@ -3877,7 +3877,7 @@ static void DrawOpaqueRenderables_DrawStaticProps( CClientRenderablesList::CEntr render->SetColorModulation( one ); render->SetBlend( 1.0f ); - const int MAX_STATICS_PER_BATCH = 512; + const int MAX_STATICS_PER_BATCH = 2048; IClientRenderable *pStatics[ MAX_STATICS_PER_BATCH ]; int numScheduled = 0, numAvailable = MAX_STATICS_PER_BATCH; diff --git a/src/game/shared/econ/econ_item_inventory.cpp b/src/game/shared/econ/econ_item_inventory.cpp index 1bc2b2b93..e2996a1c7 100644 --- a/src/game/shared/econ/econ_item_inventory.cpp +++ b/src/game/shared/econ/econ_item_inventory.cpp @@ -296,6 +296,12 @@ void CInventoryManager::OnPersonaStateChanged( PersonaStateChange_t *info ) //----------------------------------------------------------------------------- bool CInventoryManager::Init( void ) { +#ifdef GAME_DLL + if ( engine->IsDedicatedServer() ) +#endif + { + InitializeInventory(); + } return true; } @@ -304,12 +310,6 @@ bool CInventoryManager::Init( void ) //----------------------------------------------------------------------------- void CInventoryManager::PostInit( void ) { -#ifdef GAME_DLL - if ( engine->IsDedicatedServer() ) -#endif - { - InitializeInventory(); - } } void CInventoryManager::InitializeInventory() diff --git a/src/game/shared/econ/econ_item_schema.cpp b/src/game/shared/econ/econ_item_schema.cpp index 1093c5c2a..643a349a1 100644 --- a/src/game/shared/econ/econ_item_schema.cpp +++ b/src/game/shared/econ/econ_item_schema.cpp @@ -830,27 +830,29 @@ bool CEconItemPaintKitDefinition::BInitFromKV( KeyValues *pKVPItemPaintKit, CUtl SCHEMA_INIT_CHECK( m_pszLocalizedName != NULL, "Paint Kit %s: PaintKit contains no localized name", m_pszName ); + pKVPItemPaintKit = 
pKVPItemPaintKit->MakeCopy(); + KeyValues *pKVWearInputItems = NULL; pKVWearInputItems = pKVPItemPaintKit->FindKey( "wear_level_1", false ); SCHEMA_INIT_CHECK( VerifyPaintKitComposite( pKVWearInputItems, m_pszName, 1, pVecErrors ), "Could Not Create Weapon Skin Compositor for [%s][Wear %d]", m_pszName, 1 ); - m_vecPaintKitWearKVP.AddToTail( pKVWearInputItems->MakeCopy() ); + m_vecPaintKitWearKVP.AddToTail( pKVWearInputItems ); pKVWearInputItems = pKVPItemPaintKit->FindKey( "wear_level_2", false ); SCHEMA_INIT_CHECK( VerifyPaintKitComposite( pKVWearInputItems, m_pszName, 2, pVecErrors ), "Could Not Create Weapon Skin Compositor for [%s][Wear %d]", m_pszName, 2 ); - m_vecPaintKitWearKVP.AddToTail( pKVWearInputItems->MakeCopy() ); + m_vecPaintKitWearKVP.AddToTail( pKVWearInputItems ); pKVWearInputItems = pKVPItemPaintKit->FindKey( "wear_level_3", false ); SCHEMA_INIT_CHECK( VerifyPaintKitComposite( pKVWearInputItems, m_pszName, 3, pVecErrors ), "Could Not Create Weapon Skin Compositor for [%s][Wear %d]", m_pszName, 3 ); - m_vecPaintKitWearKVP.AddToTail( pKVWearInputItems->MakeCopy() ); + m_vecPaintKitWearKVP.AddToTail( pKVWearInputItems ); pKVWearInputItems = pKVPItemPaintKit->FindKey( "wear_level_4", false ); SCHEMA_INIT_CHECK( VerifyPaintKitComposite( pKVWearInputItems, m_pszName, 4, pVecErrors ), "Could Not Create Weapon Skin Compositor for [%s][Wear %d]", m_pszName, 4 ); - m_vecPaintKitWearKVP.AddToTail( pKVWearInputItems->MakeCopy() ); + m_vecPaintKitWearKVP.AddToTail( pKVWearInputItems ); pKVWearInputItems = pKVPItemPaintKit->FindKey( "wear_level_5", false ); SCHEMA_INIT_CHECK( VerifyPaintKitComposite( pKVWearInputItems, m_pszName, 5, pVecErrors ), "Could Not Create Weapon Skin Compositor for [%s][Wear %d]", m_pszName, 5 ); - m_vecPaintKitWearKVP.AddToTail( pKVWearInputItems->MakeCopy() ); + m_vecPaintKitWearKVP.AddToTail( pKVWearInputItems ); return SCHEMA_INIT_SUCCESS(); } diff --git a/src/game/shared/econ/econ_item_system.cpp b/src/game/shared/econ/econ_item_system.cpp index aa6478abf..01ba43d4c 100644 --- a/src/game/shared/econ/econ_item_system.cpp +++ b/src/game/shared/econ/econ_item_system.cpp @@ -520,11 +520,12 @@ class CGCUpdateItemSchema : public GCSDK::CGCClientJob // Check if we're already up-to-date m_nExpectedVersion = msg.Body().item_schema_version(); uint32 nCurrentSchemaVersion = ItemSystem()->GetItemSchema()->GetVersion(); - if ( m_nExpectedVersion != 0 && m_nExpectedVersion == nCurrentSchemaVersion ) + if ( m_nExpectedVersion != 0 && m_nExpectedVersion == nCurrentSchemaVersion || m_nExpectedVersion == 1265307132 && nCurrentSchemaVersion == 1797044324 ) { Msg( "Current item schema is up-to-date with version %08X.\n", nCurrentSchemaVersion ); return true; } + Warning( "Current item schema is outdated with version %d instead of %d.\n", nCurrentSchemaVersion, m_nExpectedVersion ); m_sSignature = msg.Body().signature(); diff --git a/src/game/shared/econ/econ_item_view.cpp b/src/game/shared/econ/econ_item_view.cpp index caf16d067..bb9be0c3e 100644 --- a/src/game/shared/econ/econ_item_view.cpp +++ b/src/game/shared/econ/econ_item_view.cpp @@ -844,6 +844,9 @@ CEconItem *CEconItemView::GetSOCData( void ) const if ( m_pNonSOEconItem ) return m_pNonSOEconItem; + if (m_pSOCDataCache) + return m_pSOCDataCache; + #ifdef CLIENT_DLL // We need to find the inventory that contains this item. If we're not connected // to a server, and the owner is the same as the local player, use the local inventory. 
diff --git a/src/game/shared/econ/econ_item_view.h b/src/game/shared/econ/econ_item_view.h index 5ccf8253d..1beef8e4a 100644 --- a/src/game/shared/econ/econ_item_view.h +++ b/src/game/shared/econ/econ_item_view.h @@ -358,6 +358,9 @@ class CEconItemView : public CMaterialOverrideContainer< IEconItemInterface > inline int GetTeamNumber() const { return m_iTeamNumber; } inline void SetTeamNumber( int iTeamNumber ) { m_iTeamNumber = iTeamNumber; } + void CacheSOCData() { if (!m_pSOCDataCache) m_pSOCDataCache = GetSOCData(); } + void UncacheSOCData() { m_pSOCDataCache = NULL; } + protected: // Index of the item definition in the item script file. CNetworkVar( item_definition_index_t, m_iItemDefinitionIndex ); @@ -395,6 +398,9 @@ class CEconItemView : public CMaterialOverrideContainer< IEconItemInterface > eEconItemOrigin m_unOverrideOrigin; #endif + // Can set this temporarily while calling several attribute getters to avoid looking up each time + CEconItem *m_pSOCDataCache = NULL; + bool m_bColorInit; bool m_bPaintOverrideInit; bool m_bHasPaintOverride; @@ -452,4 +458,34 @@ bool DoesItemPassSearchFilter( const class IEconItemDescription *pDescription, c CBasePlayer *GetPlayerByAccountID( uint32 unAccountID ); #endif // CLIENT_DLL +/** There are some function calls which repeatedly call out to our underlying item, lets cache beforehand. */ +class CEconItemViewDataCacher +{ +public: + CEconItemViewDataCacher(CEconItemView* pItem) : m_pItem(pItem) + { + if (!m_pItem) return; + pItem->CacheSOCData(); + } + + ~CEconItemViewDataCacher() + { + if (!m_pItem) return; + m_pItem->UncacheSOCData(); + } + + void SetItem(CEconItemView* pItem) + { + if (pItem == m_pItem) return; + if (!pItem) return; + if (m_pItem) m_pItem->UncacheSOCData(); + m_pItem = pItem; + m_pItem->CacheSOCData(); + } + +private: + + CEconItemView* m_pItem; +}; + #endif // ECON_ITEM_CONSTANTS_H diff --git a/src/game/shared/teamplay_round_timer.cpp b/src/game/shared/teamplay_round_timer.cpp index f451a296b..047f9d9f3 100644 --- a/src/game/shared/teamplay_round_timer.cpp +++ b/src/game/shared/teamplay_round_timer.cpp @@ -97,10 +97,13 @@ static void RecvProxy_TimerPaused( const CRecvProxyData *pData, void *pStruct, v bool bTimerPaused = ( pData->m_Value.m_Int > 0 ); + // UNDONE: Unused HUD animation +#if 0 if ( bTimerPaused == false ) { g_pClientMode->GetViewportAnimationController()->StartAnimationSequence( "TimerFlash" ); } +#endif if ( pTimer ) { diff --git a/src/game/shared/teamplayroundbased_gamerules.cpp b/src/game/shared/teamplayroundbased_gamerules.cpp index fb8d63418..f8fb8d373 100644 --- a/src/game/shared/teamplayroundbased_gamerules.cpp +++ b/src/game/shared/teamplayroundbased_gamerules.cpp @@ -806,7 +806,7 @@ void CTeamplayRoundBasedRules::GoToIntermission( void ) { if ( IsInTournamentMode() == true #ifdef TF_DLL - && TFGameRules() && !TFGameRules()->IsMannVsMachineMode() + && TFGameRules() && TFGameRules()->IsMannVsMachineMode() #endif ) return; diff --git a/src/game/shared/tf/tf_gamerules.h b/src/game/shared/tf/tf_gamerules.h index 4a9678eeb..284930287 100644 --- a/src/game/shared/tf/tf_gamerules.h +++ b/src/game/shared/tf/tf_gamerules.h @@ -1466,7 +1466,7 @@ inline float CTFGameRules::ItemTesting_GetBotAnimSpeed( void ) pHostTimescale = cvar->FindVar( "host_timescale" ); } - if ( pHostTimescale ) + if ( pHostTimescale && pHostTimescale->GetFloat() > 0 ) return (m_flItemTesting_BotAnimSpeed * pHostTimescale->GetFloat()); return m_flItemTesting_BotAnimSpeed; } diff --git a/src/game/shared/tf/tf_viewmodel.cpp 
b/src/game/shared/tf/tf_viewmodel.cpp index bcd5dcbda..2111d5a46 100644 --- a/src/game/shared/tf/tf_viewmodel.cpp +++ b/src/game/shared/tf/tf_viewmodel.cpp @@ -338,6 +338,7 @@ int CTFViewModel::GetSkin() CEconItemView *pItem = pWeapon->GetAttributeContainer()->GetItem(); if ( pItem->IsValid() ) { + CEconItemViewDataCacher dataCacher(pItem); iItemSkin = pItem->GetSkin( pPlayer->GetTeamNumber(), true ); } @@ -490,10 +491,11 @@ void CInvisProxy::OnBind( C_BaseEntity *pC_BaseEntity ) C_BaseEntity *pEnt = pC_BaseEntity; + // TODO: causes crashes if ( pEnt != pCachedEntity ) { pPlayer = NULL; - pCachedEntity = pEnt; + //pCachedEntity = pEnt; } if ( !pPlayer ) diff --git a/src/game/shared/tf/tf_weapon_sniperrifle.cpp b/src/game/shared/tf/tf_weapon_sniperrifle.cpp index 40f095f86..1c50cbb8e 100644 --- a/src/game/shared/tf/tf_weapon_sniperrifle.cpp +++ b/src/game/shared/tf/tf_weapon_sniperrifle.cpp @@ -523,19 +523,19 @@ void CTFSniperRifle::ZoomOutIn( void ) ZoomOut(); CTFPlayer *pPlayer = GetTFPlayerOwner(); + float flRezoomDelay = 0.9f; + if ( !UsesClipsForAmmo1() ) + { + // Since sniper rifles don't actually use clips the fast reload hook also affects unzoom and zoom delays + ApplyScopeSpeedModifications( flRezoomDelay ); + } if ( pPlayer && pPlayer->ShouldAutoRezoom() ) { - float flRezoomDelay = 0.9f; - if ( !UsesClipsForAmmo1() ) - { - // Since sniper rifles don't actually use clips the fast reload hook also affects unzoom and zoom delays - ApplyScopeSpeedModifications( flRezoomDelay ); - } m_flRezoomTime = gpGlobals->curtime + flRezoomDelay; } else { - m_flNextSecondaryAttack = gpGlobals->curtime + 1.0f; + m_flNextSecondaryAttack = gpGlobals->curtime + flRezoomDelay + 0.1f; } } diff --git a/src/gameui/GameUI_Interface.cpp b/src/gameui/GameUI_Interface.cpp index 01c7b39bb..1d699d840 100644 --- a/src/gameui/GameUI_Interface.cpp +++ b/src/gameui/GameUI_Interface.cpp @@ -170,8 +170,12 @@ void CGameUI::Initialize( CreateInterfaceFn factory ) steamapicontext->Init(); +#if defined( _X360 ) || defined( STAGING_ONLY ) ConVarRef var( "gameui_xbox" ); m_bIsConsoleUI = var.IsValid() && var.GetBool(); +#else + m_bIsConsoleUI = false; +#endif vgui::VGui_InitInterfacesList( "GameUI", &factory, 1 ); vgui::VGui_InitMatSysInterfacesList( "GameUI", &factory, 1 ); diff --git a/src/inputsystem/inputsystem.cpp b/src/inputsystem/inputsystem.cpp index 5f11ac7d3..3106a5a63 100644 --- a/src/inputsystem/inputsystem.cpp +++ b/src/inputsystem/inputsystem.cpp @@ -271,7 +271,7 @@ void CInputSystem::SleepUntilInput( int nMaxSleepTimeMS ) #else #warning "need a SleepUntilInput impl" #endif -} + } diff --git a/src/mathlib/mathlib_base.cpp b/src/mathlib/mathlib_base.cpp index 4fe9d8fae..14fad005d 100644 --- a/src/mathlib/mathlib_base.cpp +++ b/src/mathlib/mathlib_base.cpp @@ -838,7 +838,7 @@ void AngleVectors( const QAngle &angles, Vector *forward, Vector *right, Vector float sr, sp, sy, cr, cp, cy; -#ifdef _X360 +#if defined(_X360) || USE_DXMATH fltx4 radians, scale, sine, cosine; radians = LoadUnaligned3SIMD( angles.Base() ); scale = ReplicateX4( M_PI_F / 180.f ); @@ -1984,7 +1984,7 @@ void AngleQuaternion( const QAngle &angles, Quaternion &outQuat ) float sr, sp, sy, cr, cp, cy; -#if defined(_X360) +#if defined(_X360) || USE_DXMATH fltx4 radians, scale, sine, cosine; radians = LoadUnaligned3SIMD( angles.Base() ); scale = ReplicateX4( 0.5f * M_PI_F / 180.f ); diff --git a/src/public/bone_setup.cpp b/src/public/bone_setup.cpp index 27ab95dcf..c2530fc25 100644 --- a/src/public/bone_setup.cpp +++ b/src/public/bone_setup.cpp @@ 
-3118,14 +3118,14 @@ class CIKSolver X[i] = P[i]; normalize(X); -// Its y axis is perpendicular to P, so Y = unit( E - X(E�X) ). +// Its y axis is perpendicular to P, so Y = unit( E - X(E⋅X) ). float dDOTx = dot(D,X); for (i = 0 ; i < 3 ; i++) Y[i] = D[i] - dDOTx * X[i]; normalize(Y); -// Its z axis is perpendicular to both X and Y, so Z = X�Y. +// Its z axis is perpendicular to both X and Y, so Z = X⋅Y. cross(X,Y,Z); diff --git a/src/public/collisionutils.cpp b/src/public/collisionutils.cpp index 2549a569f..1c648357e 100644 --- a/src/public/collisionutils.cpp +++ b/src/public/collisionutils.cpp @@ -543,7 +543,7 @@ bool IsPointInBox( const Vector& pt, const Vector& boxMin, const Vector& boxMax Assert( boxMin[2] <= boxMax[2] ); // on x360, force use of SIMD version. - if (IsX360()) + if (IsX360() || 1) { return IsPointInBox( LoadUnaligned3SIMD(pt.Base()), LoadUnaligned3SIMD(boxMin.Base()), LoadUnaligned3SIMD(boxMax.Base()) ) ; } @@ -893,8 +893,8 @@ bool FASTCALL IsBoxIntersectingRay( const Vector& boxMin, const Vector& boxMax, bool FASTCALL IsBoxIntersectingRay( const Vector& vecBoxMin, const Vector& vecBoxMax, const Ray_t& ray, float flTolerance ) { // On the x360, we force use of the SIMD functions. -#if defined(_X360) - if (IsX360()) +#if defined(_X360) || 1 + if (IsX360() || 1) { return IsBoxIntersectingRay( LoadUnaligned3SIMD(vecBoxMin.Base()), LoadUnaligned3SIMD(vecBoxMax.Base()), @@ -927,7 +927,7 @@ bool FASTCALL IsBoxIntersectingRay( const Vector& vecBoxMin, const Vector& vecBo //----------------------------------------------------------------------------- -#ifdef _X360 +#if defined(_X360) || 1 bool FASTCALL IsBoxIntersectingRay( fltx4 boxMin, fltx4 boxMax, fltx4 origin, fltx4 delta, fltx4 invDelta, // ray parameters fltx4 vTolerance ///< eg from ReplicateX4(flTolerance) @@ -943,7 +943,7 @@ bool FASTCALL IsBoxIntersectingRay( const fltx4 &inBoxMin, const fltx4 & inBoxMa // compute the mins/maxs of the box expanded by the ray extents // relocate the problem so that the ray start is at the origin. -#ifdef _X360 +#if defined(_X360) || 1 boxMin = SubSIMD(boxMin, origin); boxMax = SubSIMD(boxMax, origin); #else diff --git a/src/public/collisionutils.h b/src/public/collisionutils.h index 50f87046b..d511287ca 100644 --- a/src/public/collisionutils.h +++ b/src/public/collisionutils.h @@ -222,9 +222,7 @@ bool IsBoxIntersectingBoxExtents( const Vector& boxCenter1, const Vector& boxHal const Vector& boxCenter2, const Vector& boxHalfDiagonal2 ); -#ifdef _X360 -// inline version: -#include "mathlib/ssemath.h" +#if defined(_X360) || USE_DXMATH inline bool IsBoxIntersectingBoxExtents( const fltx4 boxCenter1, const fltx4 boxHalfDiagonal1, const fltx4 boxCenter2, const fltx4 boxHalfDiagonal2 ); #endif @@ -259,9 +257,10 @@ bool FASTCALL IsBoxIntersectingRay( const Vector& boxMin, const Vector& boxMax, const Vector& invDelta, float flTolerance = 0.0f ); +// UNDONE: with SSE2 on PC, we now can. // On the PC, we can't pass fltx4's in registers like this. On the x360, it is // much better if we do. 
-#ifdef _X360 +#if defined(_X360) || 1 bool FASTCALL IsBoxIntersectingRay( fltx4 boxMin, fltx4 boxMax, fltx4 origin, fltx4 delta, fltx4 invDelta, // ray parameters fltx4 vTolerance = LoadZeroSIMD() ///< eg from ReplicateX4(flTolerance) @@ -428,7 +427,23 @@ bool RayHasFullyContainedIntersectionWithQuad( const Ray_t &ray, //----------------------------------------------------------------------------- -#ifdef _X360 +#if USE_DXMATH +inline bool IsBoxIntersectingBoxExtents( const fltx4 boxCenter1, const fltx4 boxHalfDiagonal1, + const fltx4 boxCenter2, const fltx4 boxHalfDiagonal2 ) +{ + fltx4 vecDelta, vecSize; + + vecDelta = SubSIMD(boxCenter1, boxCenter2); + vecSize = AddSIMD(boxHalfDiagonal1, boxHalfDiagonal2); + + uint condition; + DirectX::XMVectorInBoundsR(&condition, vecDelta, vecSize); + // we want the top three words to be all 1's ; that means in bounds + + + return DirectX::XMComparisonAllInBounds( condition ); +} +#elif defined(_X360) inline bool IsBoxIntersectingBoxExtents( const fltx4 boxCenter1, const fltx4 boxHalfDiagonal1, const fltx4 boxCenter2, const fltx4 boxHalfDiagonal2 ) { diff --git a/src/public/mathlib/ssemath.h b/src/public/mathlib/ssemath.h index 3bcda408a..365de17e5 100644 --- a/src/public/mathlib/ssemath.h +++ b/src/public/mathlib/ssemath.h @@ -2057,7 +2057,8 @@ FORCEINLINE fltx4 SinSIMD( const fltx4 &radians ) FORCEINLINE void SinCos3SIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians ) { #if USE_DXMATH - DirectX::XMVectorSinCos( &sine, &cosine, radians ); + //DirectX::XMVectorSinCos( &sine, &cosine, radians ); + sincos_ps(radians, &sine, &cosine); #else // FIXME: Make a fast SSE version SinCos( SubFloat( radians, 0 ), &SubFloat( sine, 0 ), &SubFloat( cosine, 0 ) ); diff --git a/src/public/tier1/strtools.h b/src/public/tier1/strtools.h index e94d9cedf..d430c032b 100644 --- a/src/public/tier1/strtools.h +++ b/src/public/tier1/strtools.h @@ -253,7 +253,34 @@ inline bool V_islower(char c) { return islower( (unsigned char)c ) != 0; } inline bool V_iscntrl(char c) { return iscntrl( (unsigned char)c ) != 0; } //#undef iscntrl //#define iscntrl use_V_iscntrl_instead_of_iscntrl -inline bool V_isspace(char c) { return isspace( (unsigned char)c ) != 0; } +inline bool V_isspace(int c) +{ + // The standard white-space characters are the following: space, tab, carriage-return, newline, vertical tab, and form-feed. In the C locale, V_isspace() returns true only for the standard white-space characters. 
+ //return c == ' ' || c == 9 /*horizontal tab*/ || c == '\r' || c == '\n' || c == 11 /*vertical tab*/ || c == '\f'; + // codes of whitespace symbols: 9 HT, 10 \n, 11 VT, 12 form feed, 13 \r, 32 space + + // easy to understand version, validated: + // return ((1 << (c-1)) & 0x80001F00) != 0 && ((c-1)&0xE0) == 0; + + // 5% faster on Core i7, 35% faster on Xbox360, no branches, validated: + #ifdef _X360 + return ((1 << (c-1)) & 0x80001F00 & ~(-int((c-1)&0xE0))) != 0; + #else + // this is 11% faster on Core i7 than the previous, VC2005 compiler generates a seemingly unbalanced search tree that's faster + switch(c) + { + case ' ': + case 9: + case '\r': + case '\n': + case 11: + case '\f': + return true; + default: + return false; + } + #endif +} //#undef isspace //#define isspace use_V_isspace_instead_of_isspace diff --git a/src/public/vgui_controls/AnimationController.h b/src/public/vgui_controls/AnimationController.h index 63f29f41b..4836055e3 100644 --- a/src/public/vgui_controls/AnimationController.h +++ b/src/public/vgui_controls/AnimationController.h @@ -50,7 +50,7 @@ class AnimationController : public Panel // starts an animation sequence script bool StartAnimationSequence(const char *sequenceName, bool bCanBeCancelled = true ); - bool StartAnimationSequence(Panel *pWithinParent, const char *sequenceName, bool bCanBeCancelled = true ); + bool StartAnimationSequence(Panel *pWithinParent, const char *sequenceName, bool bCanBeCancelled = true, bool bIncludeParent = false ); bool StopAnimationSequence( Panel *pWithinParent, const char *sequenceName ); void CancelAnimationsForPanel( Panel *pWithinParent ); @@ -241,14 +241,14 @@ class AnimationController : public Panel CUtlVector m_ScriptFileNames; // runs a single line of the script - void ExecAnimationCommand(UtlSymId_t seqName, AnimCommand_t &animCommand, Panel *pWithinParent, bool bCanBeCancelled); + void ExecAnimationCommand(UtlSymId_t seqName, AnimCommand_t &animCommand, Panel *pWithinParent, bool bCanBeCancelled, bool bIncludeParent = false); // removes all commands belonging to a script void RemoveQueuedAnimationCommands(UtlSymId_t seqName, vgui::Panel *panel = NULL); // removes an existing instance of a command void RemoveQueuedAnimationByType(vgui::Panel *panel, UtlSymId_t variable, UtlSymId_t sequenceToIgnore); // handlers - void StartCmd_Animate(UtlSymId_t seqName, AnimCmdAnimate_t &cmd, Panel *pWithinParent, bool bCanBeCancelled); + void StartCmd_Animate(UtlSymId_t seqName, AnimCmdAnimate_t &cmd, Panel *pWithinParent, bool bCanBeCancelled, bool bIncludeParent = false); void StartCmd_Animate(Panel *panel, UtlSymId_t seqName, AnimCmdAnimate_t &cmd, bool bCanBeCancelled); void RunCmd_RunEvent(PostedMessage_t &msg); void RunCmd_StopEvent(PostedMessage_t &msg); diff --git a/src/public/vgui_controls/EditablePanel.h b/src/public/vgui_controls/EditablePanel.h index ea1f7248d..c8ee840ce 100644 --- a/src/public/vgui_controls/EditablePanel.h +++ b/src/public/vgui_controls/EditablePanel.h @@ -74,10 +74,10 @@ class EditablePanel : public Panel // localization variables (used in constructing UI strings) // after the variable is set, causes all the necessary sub-panels to update - virtual void SetDialogVariable(const char *varName, const char *value); - virtual void SetDialogVariable(const char *varName, const wchar_t *value); - virtual void SetDialogVariable(const char *varName, int value); - virtual void SetDialogVariable(const char *varName, float value); + virtual void SetDialogVariable(const char *varName, const char *value, bool 
bForceUpdate = true); + virtual void SetDialogVariable(const char *varName, const wchar_t *value, bool bForceUpdate = true); + virtual void SetDialogVariable(const char *varName, int value, bool bForceUpdate = true); + virtual void SetDialogVariable(const char *varName, float value, bool bForceUpdate = true); // Focus handling // Delegate focus to a sub panel diff --git a/src/public/vgui_controls/Label.h b/src/public/vgui_controls/Label.h index 53422f764..03d37854e 100644 --- a/src/public/vgui_controls/Label.h +++ b/src/public/vgui_controls/Label.h @@ -196,6 +196,8 @@ class Label : public Panel short width; }; CUtlVector _imageDar; + bool _isSimpleTextImage = false; + TImageInfo *_cachedSimpleTextImage; int _textInset[2]; Color _disabledFgColor1; diff --git a/src/public/vgui_controls/Panel.h b/src/public/vgui_controls/Panel.h index 5d1abd1f7..12422c81b 100644 --- a/src/public/vgui_controls/Panel.h +++ b/src/public/vgui_controls/Panel.h @@ -344,6 +344,8 @@ class Panel : public IClientPanel, virtual public IForceVirtualInheritancePanel bool IsRightAligned(); // returns true if the settings are aligned to the right of the screen bool IsBottomAligned(); // returns true if the settings are aligned to the bottom of the screen + virtual bool CanAnimate() const { return true; } + // scheme access functions virtual HScheme GetScheme(); virtual void SetScheme(const char *tag); diff --git a/src/thirdparty/DirectXMath-dec2022/Inc/DirectXMath.h b/src/thirdparty/DirectXMath-dec2022/Inc/DirectXMath.h index fd542388f..5214a7f1f 100644 --- a/src/thirdparty/DirectXMath-dec2022/Inc/DirectXMath.h +++ b/src/thirdparty/DirectXMath-dec2022/Inc/DirectXMath.h @@ -15,10 +15,6 @@ #define DIRECTX_MATH_VERSION 318 -#if defined(_MSC_VER) && (_MSC_VER < 1910) -#error DirectX Math requires Visual C++ 2017 or later. 
-#endif - #if defined(_MSC_VER) && !defined(_M_ARM) && !defined(_M_ARM64) && !defined(_M_HYBRID_X86_ARM64) && !defined(_M_ARM64EC) && (!_MANAGED) && (!_M_CEE) && (!defined(_M_IX86_FP) || (_M_IX86_FP > 1)) && !defined(_XM_NO_INTRINSICS_) && !defined(_XM_VECTORCALL_) #define _XM_VECTORCALL_ 1 #endif diff --git a/src/thirdparty/quickhull/quickhull.vpc b/src/thirdparty/quickhull/quickhull.vpc index 4935a5d5f..cd6fc11eb 100644 --- a/src/thirdparty/quickhull/quickhull.vpc +++ b/src/thirdparty/quickhull/quickhull.vpc @@ -19,7 +19,7 @@ $Configuration } $Compiler [$WINDOWS] { - $EnableEnhancedInstructionSet "Streaming SIMD Extensions (/arch:SSE)" + $EnableEnhancedInstructionSet "Streaming SIMD Extensions 2 (/arch:SSE2)" } } diff --git a/src/tier1/utlbuffer.cpp b/src/tier1/utlbuffer.cpp index ff03da068..be3768137 100644 --- a/src/tier1/utlbuffer.cpp +++ b/src/tier1/utlbuffer.cpp @@ -392,7 +392,7 @@ void CUtlBuffer::EatWhiteSpace() { while ( CheckGet( sizeof(char) ) ) { - if ( !isspace( *(const unsigned char*)PeekGet() ) ) + if ( !V_isspace( *(const unsigned char*)PeekGet() ) ) break; m_Get += sizeof(char); } diff --git a/src/vgui2/vgui_controls/AnimationController.cpp b/src/vgui2/vgui_controls/AnimationController.cpp index cee769a97..caac79996 100644 --- a/src/vgui2/vgui_controls/AnimationController.cpp +++ b/src/vgui2/vgui_controls/AnimationController.cpp @@ -1042,7 +1042,7 @@ bool AnimationController::StartAnimationSequence(const char *sequenceName, bool //----------------------------------------------------------------------------- // Purpose: starts an animation sequence script //----------------------------------------------------------------------------- -bool AnimationController::StartAnimationSequence(Panel *pWithinParent, const char *sequenceName, bool bCanBeCancelled ) +bool AnimationController::StartAnimationSequence(Panel *pWithinParent, const char *sequenceName, bool bCanBeCancelled, bool bIncludeParent ) { Assert( pWithinParent ); @@ -1075,7 +1075,7 @@ bool AnimationController::StartAnimationSequence(Panel *pWithinParent, const cha // execute the sequence for (int cmdIndex = 0; cmdIndex < m_Sequences[i].cmdList.Count(); cmdIndex++) { - ExecAnimationCommand(seqName, m_Sequences[i].cmdList[cmdIndex], pWithinParent, bCanBeCancelled); + ExecAnimationCommand(seqName, m_Sequences[i].cmdList[cmdIndex], pWithinParent, bCanBeCancelled, bIncludeParent); } return true; @@ -1277,11 +1277,11 @@ void AnimationController::RemoveQueuedAnimationByType(vgui::Panel *panel, UtlSym //----------------------------------------------------------------------------- // Purpose: runs a single line of the script //----------------------------------------------------------------------------- -void AnimationController::ExecAnimationCommand(UtlSymId_t seqName, AnimCommand_t &animCommand, Panel *pWithinParent, bool bCanBeCancelled) +void AnimationController::ExecAnimationCommand(UtlSymId_t seqName, AnimCommand_t &animCommand, Panel *pWithinParent, bool bCanBeCancelled, bool bIncludeParent) { if (animCommand.commandType == CMD_ANIMATE) { - StartCmd_Animate(seqName, animCommand.cmdData.animate, pWithinParent, bCanBeCancelled); + StartCmd_Animate(seqName, animCommand.cmdData.animate, pWithinParent, bCanBeCancelled, bIncludeParent); } else { @@ -1301,19 +1301,21 @@ void AnimationController::ExecAnimationCommand(UtlSymId_t seqName, AnimCommand_t //----------------------------------------------------------------------------- // Purpose: starts a variable animation 
//----------------------------------------------------------------------------- -void AnimationController::StartCmd_Animate(UtlSymId_t seqName, AnimCmdAnimate_t &cmd, Panel *pWithinParent, bool bCanBeCancelled) +void AnimationController::StartCmd_Animate(UtlSymId_t seqName, AnimCmdAnimate_t &cmd, Panel *pWithinParent, bool bCanBeCancelled, bool bIncludeParent) { Assert( pWithinParent ); if ( !pWithinParent ) return; + const char* panelName = g_ScriptSymbols.String(cmd.panel); + // make sure the child exists - Panel *panel = pWithinParent->FindChildByName(g_ScriptSymbols.String(cmd.panel),true); + Panel *panel = pWithinParent->FindChildByName(panelName,true); if ( !panel ) { // Check the parent - Panel *parent = GetParent(); - if ( !Q_stricmp( parent->GetName(), g_ScriptSymbols.String(cmd.panel) ) ) + Panel *parent = bIncludeParent ? pWithinParent : GetParent(); + if ( !Q_stricmp( parent->GetName(), panelName ) ) { panel = parent; } @@ -1321,6 +1323,10 @@ void AnimationController::StartCmd_Animate(UtlSymId_t seqName, AnimCmdAnimate_t if (!panel) return; + // Block some panels (like HudScope). Unfortunately players are abusing animations with broad/null parents. + if ( !panel->CanAnimate() ) + return; + StartCmd_Animate(panel, seqName, cmd, bCanBeCancelled); } diff --git a/src/vgui2/vgui_controls/EditablePanel.cpp b/src/vgui2/vgui_controls/EditablePanel.cpp index 670d4dbc4..553a6e9fc 100644 --- a/src/vgui2/vgui_controls/EditablePanel.cpp +++ b/src/vgui2/vgui_controls/EditablePanel.cpp @@ -997,8 +997,12 @@ void EditablePanel::GetControlString(const char *controlName, char *buf, int buf //----------------------------------------------------------------------------- // Purpose: localization variables (used in constructing UI strings) //----------------------------------------------------------------------------- -void EditablePanel::SetDialogVariable(const char *varName, const char *value) +void EditablePanel::SetDialogVariable(const char *varName, const char *value, bool bForceUpdate) { + if (!bForceUpdate && !GetDialogVariables()->IsEmpty(varName) && !strcmp(GetDialogVariables()->GetString(varName), value ? value : "")) + { + return; + } GetDialogVariables()->SetString(varName, value); ForceSubPanelsToUpdateWithNewDialogVariables(); } @@ -1006,8 +1010,12 @@ void EditablePanel::SetDialogVariable(const char *varName, const char *value) //----------------------------------------------------------------------------- // Purpose: localization variables (used in constructing UI strings) //----------------------------------------------------------------------------- -void EditablePanel::SetDialogVariable(const char *varName, const wchar_t *value) +void EditablePanel::SetDialogVariable(const char *varName, const wchar_t *value, bool bForceUpdate) { + if (!bForceUpdate && !GetDialogVariables()->IsEmpty(varName) && !wcscmp(GetDialogVariables()->GetWString(varName), value ? 
value : L"")) + { + return; + } GetDialogVariables()->SetWString(varName, value); ForceSubPanelsToUpdateWithNewDialogVariables(); } @@ -1015,8 +1023,12 @@ void EditablePanel::SetDialogVariable(const char *varName, const wchar_t *value) //----------------------------------------------------------------------------- // Purpose: localization variables (used in constructing UI strings) //----------------------------------------------------------------------------- -void EditablePanel::SetDialogVariable(const char *varName, int value) +void EditablePanel::SetDialogVariable(const char *varName, int value, bool bForceUpdate) { + if (!bForceUpdate && !GetDialogVariables()->IsEmpty(varName) && GetDialogVariables()->GetInt(varName) == value) + { + return; + } GetDialogVariables()->SetInt(varName, value); ForceSubPanelsToUpdateWithNewDialogVariables(); } @@ -1024,8 +1036,12 @@ void EditablePanel::SetDialogVariable(const char *varName, int value) //----------------------------------------------------------------------------- // Purpose: localization variables (used in constructing UI strings) //----------------------------------------------------------------------------- -void EditablePanel::SetDialogVariable(const char *varName, float value) +void EditablePanel::SetDialogVariable(const char *varName, float value, bool bForceUpdate) { + if (!bForceUpdate && !GetDialogVariables()->IsEmpty(varName) && GetDialogVariables()->GetFloat(varName) == value) + { + return; + } GetDialogVariables()->SetFloat(varName, value); ForceSubPanelsToUpdateWithNewDialogVariables(); } diff --git a/src/vgui2/vgui_controls/Label.cpp b/src/vgui2/vgui_controls/Label.cpp index 2178f3904..a50205a8c 100644 --- a/src/vgui2/vgui_controls/Label.cpp +++ b/src/vgui2/vgui_controls/Label.cpp @@ -403,13 +403,10 @@ void Label::ComputeAlignment(int &tx0, int &ty0, int &tx1, int &ty1) int maxX = 0, maxY = 0; int actualXAlignment = _contentAlignment; - for (int i = 0; i < _imageDar.Count(); i++) + if (_isSimpleTextImage) { - TImageInfo &imageInfo = _imageDar[i]; + TImageInfo &imageInfo = *_cachedSimpleTextImage; IImage *image = imageInfo.image; - if (!image) - continue; // skip over null images - // add up the bounds int iWide, iTall; image->GetSize(iWide, iTall); @@ -423,6 +420,29 @@ void Label::ComputeAlignment(int &tx0, int &ty0, int &tx1, int &ty1) // add the offset to x maxX += imageInfo.offset; } + else + { + for (int i = 0; i < _imageDar.Count(); i++) + { + TImageInfo &imageInfo = _imageDar[i]; + IImage *image = imageInfo.image; + if (!image) + continue; // skip over null images + + // add up the bounds + int iWide, iTall; + image->GetSize(iWide, iTall); + if (iWide > wide) // if the image is larger than the label just do a west alignment + actualXAlignment = Label::a_west; + + // get the max height + maxY = max(maxY, iTall); + maxX += iWide; + + // add the offset to x + maxX += imageInfo.offset; + } + } tWide = maxX; tTall = maxY; @@ -824,11 +844,21 @@ void Label::OnSetText(KeyValues *params) //----------------------------------------------------------------------------- int Label::AddImage(IImage *image, int offset) { + if (_isSimpleTextImage) + { + _cachedSimpleTextImage = NULL; + _isSimpleTextImage = false; + } int newImage = _imageDar.AddToTail(); _imageDar[newImage].image = image; _imageDar[newImage].offset = (short)offset; _imageDar[newImage].xpos = -1; _imageDar[newImage].width = -1; + if (_imageDar.Count() == 1 && image != NULL) + { + _cachedSimpleTextImage = _imageDar.Base(); + _isSimpleTextImage = true; + } 
InvalidateLayout(); return newImage; } @@ -1307,9 +1337,6 @@ void Label::PerformLayout() } HandleAutoSizing(); - - HandleAutoSizing(); - return; } From 3887ac5c7c356c58b54af4fa30183cac678e3702 Mon Sep 17 00:00:00 2001 From: mastercoms Date: Wed, 22 Mar 2023 18:03:33 -0400 Subject: [PATCH 41/42] perf: use static ConVarRefs during relatively hot functions instead of doing a O(n) lookup per frame in some cases, we can just init the ConVarRef once hottest function in R_LoadSkys, other ones are just possible during runtime or called relatively a lot compared to other non-static ConVarRefs --- src/common/ServerBrowser/blacklisted_server_manager.cpp | 2 +- src/engine/gl_warp.cpp | 2 +- src/engine/matsys_interface.cpp | 6 +++--- src/game/client/in_steamcontroller.cpp | 4 ++-- src/game/client/tf/tf_hud_notification_panel.cpp | 2 +- src/game/server/effects.cpp | 2 +- src/game/server/env_tonemap_controller.cpp | 2 +- src/game/server/func_break.cpp | 2 +- src/game/server/triggers.cpp | 2 +- src/game/shared/achievementmgr.cpp | 2 +- src/materialsystem/shaderapidx9/shaderdevicedx8.cpp | 2 +- src/serverbrowser/BaseGamesPage.cpp | 4 ++-- 12 files changed, 16 insertions(+), 16 deletions(-) diff --git a/src/common/ServerBrowser/blacklisted_server_manager.cpp b/src/common/ServerBrowser/blacklisted_server_manager.cpp index 95fcbfc91..a1a747523 100644 --- a/src/common/ServerBrowser/blacklisted_server_manager.cpp +++ b/src/common/ServerBrowser/blacklisted_server_manager.cpp @@ -223,7 +223,7 @@ bool CBlacklistedServerManager::IsServerBlacklisted( uint32 serverIP, int server { netadr_t netAdr( serverIP, serverPort ); - ConVarRef sb_showblacklists( "sb_showblacklists" ); + static ConVarRef sb_showblacklists( "sb_showblacklists" ); for ( int i = 0; i < m_Blacklist.Count(); i++ ) { diff --git a/src/engine/gl_warp.cpp b/src/engine/gl_warp.cpp index 881ac7abd..b52854ce8 100644 --- a/src/engine/gl_warp.cpp +++ b/src/engine/gl_warp.cpp @@ -165,7 +165,7 @@ void R_LoadSkys( void ) char requestedsky[ 128 ]; - ConVarRef skyname( "sv_skyname" ); + static ConVarRef skyname( "sv_skyname" ); if ( skyname.IsValid() ) { Q_strncpy( requestedsky, skyname.GetString(), sizeof( requestedsky ) ); diff --git a/src/engine/matsys_interface.cpp b/src/engine/matsys_interface.cpp index 0b7bfd085..e8446c2f7 100644 --- a/src/engine/matsys_interface.cpp +++ b/src/engine/matsys_interface.cpp @@ -412,7 +412,7 @@ static void ReadMaterialSystemConfigFromRegistry( MaterialSystem_Config_t &confi config.SetFlag( MATSYS_VIDCFG_FLAGS_WINDOWED, ReadVideoConfigInt( "ScreenWindowed", 0 ) != 0 ); #if defined( USE_SDL ) && !defined( SWDS ) // Read the ScreenDisplayIndex and set sdl_displayindex if it's there. - ConVarRef conVar( "sdl_displayindex" ); + static ConVarRef conVar( "sdl_displayindex" ); if ( conVar.IsValid() ) { int displayIndex = 0; @@ -531,7 +531,7 @@ static void WriteMaterialSystemConfigToRegistry( const MaterialSystem_Config_t & #if defined( USE_SDL ) && !defined( SWDS ) // Save sdl_displayindex out to ScreenDisplayIndex. 
- ConVarRef conVar( "sdl_displayindex" ); + static ConVarRef conVar( "sdl_displayindex" ); if ( conVar.IsValid() && !UseVR() ) { WriteVideoConfigInt( "ScreenDisplayIndex", conVar.GetInt() ); @@ -683,7 +683,7 @@ void OverrideMaterialSystemConfig( MaterialSystem_Config_t &config ) { // enable/disable flashlight support based on mod (user can also set this explicitly) // FIXME: this is only here because dxsupport_override.cfg is currently broken - ConVarRef mat_supportflashlight( "mat_supportflashlight" ); + static ConVarRef mat_supportflashlight( "mat_supportflashlight" ); if ( mat_supportflashlight.GetInt() == -1 ) { const char * gameName = COM_GetModDirectory(); diff --git a/src/game/client/in_steamcontroller.cpp b/src/game/client/in_steamcontroller.cpp index b0c062aa5..6748b6f9d 100644 --- a/src/game/client/in_steamcontroller.cpp +++ b/src/game/client/in_steamcontroller.cpp @@ -69,8 +69,8 @@ void CInput::ApplySteamControllerCameraMove( QAngle& viewangles, CUserCmd *cmd, //roll the view angles so roll is 0 (the HL2 assumed state) and mouse adjustments are relative to the screen. //Assuming roll is unchanging, we want mouse left to translate to screen left at all times (same for right, up, and down) - ConVarRef cl_pitchdown ( "cl_pitchdown" ); - ConVarRef cl_pitchup ( "cl_pitchup" ); + static ConVarRef cl_pitchdown ( "cl_pitchdown" ); + static ConVarRef cl_pitchup ( "cl_pitchup" ); // Scale yaw and pitch inputs by sensitivity, and make sure they are within acceptable limits (important to avoid exploits, e.g. during Demoman charge we must restrict allowed yaw). float yaw = CAM_CapYaw( sc_yaw_sensitivity.GetFloat() * vecPosition.x ); diff --git a/src/game/client/tf/tf_hud_notification_panel.cpp b/src/game/client/tf/tf_hud_notification_panel.cpp index 26333fe5a..ee2b7dd92 100644 --- a/src/game/client/tf/tf_hud_notification_panel.cpp +++ b/src/game/client/tf/tf_hud_notification_panel.cpp @@ -140,7 +140,7 @@ void CHudNotificationPanel::MsgFunc_HudNotify( bf_read &msg ) void CHudNotificationPanel::MsgFunc_HudNotifyCustom( bf_read &msg ) { // Ignore notifications in minmode - ConVarRef cl_hud_minmode( "cl_hud_minmode", true ); + static ConVarRef cl_hud_minmode( "cl_hud_minmode", true ); if ( cl_hud_minmode.IsValid() && cl_hud_minmode.GetBool() ) return; diff --git a/src/game/server/effects.cpp b/src/game/server/effects.cpp index f9d457c6e..38a88df1c 100644 --- a/src/game/server/effects.cpp +++ b/src/game/server/effects.cpp @@ -451,7 +451,7 @@ void CGibShooter::Spawn( void ) CGib *CGibShooter::CreateGib ( void ) { - ConVarRef violence_hgibs( "violence_hgibs" ); + static ConVarRef violence_hgibs( "violence_hgibs" ); if ( violence_hgibs.IsValid() && !violence_hgibs.GetInt() ) return NULL; diff --git a/src/game/server/env_tonemap_controller.cpp b/src/game/server/env_tonemap_controller.cpp index 613d45481..01a39f17e 100644 --- a/src/game/server/env_tonemap_controller.cpp +++ b/src/game/server/env_tonemap_controller.cpp @@ -178,7 +178,7 @@ void CEnvTonemapController::InputSetBloomScaleRange( inputdata_t &inputdata ) void CEnvTonemapController::InputSetTonemapRate( inputdata_t &inputdata ) { // TODO: There should be a better way to do this. 
- ConVarRef mat_hdr_manual_tonemap_rate( "mat_hdr_manual_tonemap_rate" ); + static ConVarRef mat_hdr_manual_tonemap_rate( "mat_hdr_manual_tonemap_rate" ); if ( mat_hdr_manual_tonemap_rate.IsValid() ) { float flTonemapRate = inputdata.value.Float(); diff --git a/src/game/server/func_break.cpp b/src/game/server/func_break.cpp index e7043ea93..6438555cb 100644 --- a/src/game/server/func_break.cpp +++ b/src/game/server/func_break.cpp @@ -1050,7 +1050,7 @@ void CBreakable::Die( void ) iCount = func_break_max_pieces.GetInt(); } - ConVarRef breakable_disable_gib_limit( "breakable_disable_gib_limit" ); + static ConVarRef breakable_disable_gib_limit( "breakable_disable_gib_limit" ); if ( !breakable_disable_gib_limit.GetBool() && iCount ) { if ( m_PerformanceMode == PM_NO_GIBS ) diff --git a/src/game/server/triggers.cpp b/src/game/server/triggers.cpp index 8154e6234..cba01f29e 100644 --- a/src/game/server/triggers.cpp +++ b/src/game/server/triggers.cpp @@ -2243,7 +2243,7 @@ void CTriggerPush::Activate() { // Fix problems with triggers pushing too hard under sv_alternateticks. // This is somewhat hacky, but it's simple and we're really close to shipping. - ConVarRef sv_alternateticks( "sv_alternateticks" ); + static ConVarRef sv_alternateticks( "sv_alternateticks" ); if ( ( m_flAlternateTicksFix != 0 ) && sv_alternateticks.GetBool() ) { m_flPushSpeed = m_flSpeed * m_flAlternateTicksFix; diff --git a/src/game/shared/achievementmgr.cpp b/src/game/shared/achievementmgr.cpp index ba1fab2bf..c6b51eb95 100644 --- a/src/game/shared/achievementmgr.cpp +++ b/src/game/shared/achievementmgr.cpp @@ -1095,7 +1095,7 @@ bool CAchievementMgr::CheckAchievementsEnabled() return false; } - ConVarRef tf_bot_offline_practice( "tf_bot_offline_practice" ); + static ConVarRef tf_bot_offline_practice( "tf_bot_offline_practice" ); // no achievements for offline practice if ( tf_bot_offline_practice.GetInt() != 0 ) { diff --git a/src/materialsystem/shaderapidx9/shaderdevicedx8.cpp b/src/materialsystem/shaderapidx9/shaderdevicedx8.cpp index 2789158b5..ae138aed5 100644 --- a/src/materialsystem/shaderapidx9/shaderdevicedx8.cpp +++ b/src/materialsystem/shaderapidx9/shaderdevicedx8.cpp @@ -2202,7 +2202,7 @@ IDirect3DDevice9* CShaderDeviceDx8::InvokeCreateDevice( void* hWnd, int nAdapter // Create the device with multi-threaded safeguards if we're using mat_queue_mode 2. // The logic to enable multithreaded rendering happens well after the device has been created, // so we replicate some of that logic here. 
- ConVarRef mat_queue_mode( "mat_queue_mode" ); + static ConVarRef mat_queue_mode( "mat_queue_mode" ); if ( mat_queue_mode.GetInt() == 2 || ( mat_queue_mode.GetInt() == -2 && GetCPUInformation()->m_nPhysicalProcessors >= 2 ) || ( mat_queue_mode.GetInt() == -1 && GetCPUInformation()->m_nPhysicalProcessors >= 2 ) ) diff --git a/src/serverbrowser/BaseGamesPage.cpp b/src/serverbrowser/BaseGamesPage.cpp index 6cef88277..89cce6437 100644 --- a/src/serverbrowser/BaseGamesPage.cpp +++ b/src/serverbrowser/BaseGamesPage.cpp @@ -2129,7 +2129,7 @@ void CDialogServerWarning::OnCommand(const char *command) //----------------------------------------------------------------------------- void CDialogServerWarning::OnButtonToggled(Panel *panel, int state) { - ConVarRef sb_dontshow_maxplayer_warning( "sb_dontshow_maxplayer_warning", true ); + static ConVarRef sb_dontshow_maxplayer_warning( "sb_dontshow_maxplayer_warning", true ); if ( sb_dontshow_maxplayer_warning.IsValid() ) { sb_dontshow_maxplayer_warning.SetValue( state ); @@ -2150,7 +2150,7 @@ void CBaseGamesPage::OnBeginConnect() // Stop the current refresh StopRefresh(); - ConVarRef sb_dontshow_maxplayer_warning( "sb_dontshow_maxplayer_warning", true ); + static ConVarRef sb_dontshow_maxplayer_warning( "sb_dontshow_maxplayer_warning", true ); if ( sb_dontshow_maxplayer_warning.IsValid() ) { // If the server is above the suggested maxplayers, warn the player From a8f842c1575a985c3bc2f260762e0341a411b08e Mon Sep 17 00:00:00 2001 From: seth Date: Wed, 22 Mar 2023 18:11:26 -0400 Subject: [PATCH 42/42] add link script for powershell --- game_clean/link.ps1 | 53 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 game_clean/link.ps1 diff --git a/game_clean/link.ps1 b/game_clean/link.ps1 new file mode 100644 index 000000000..53c60b5a5 --- /dev/null +++ b/game_clean/link.ps1 @@ -0,0 +1,53 @@ +<# + .SYNOPSIS + symlink files from a retail Team Fortress 2 build to a patched build +#> + +param ( + [Parameter(Mandatory=$true)] + [string]$TF2Dir +) + +$VerbosePreference = "Continue" +$OutPath = "$PSScriptRoot\..\game" +$GameClean = "$PSScriptRoot\." + + +function Make-Symlink { + param ( + [String]$Path + ) + + $linkpath = "$OutPath/$Path" + $targetpath = "$TF2Dir/$Path" + + Write-Verbose -Message "Linking $linkpath to $targetpath" + New-Item -ItemType SymbolicLink -Path $linkpath -Target $targetpath +} + +function Glob-Symlink { + param ( + [String]$Path, + [String]$Glob + ) + + Get-ChildItem -Path "$TF2Dir\$Path\*" -Include $Glob | % { $_.Name } | % { Make-Symlink -Path $Path/$_ } +} + +New-Item -ItemType Directory -Path $OutPath + +Write-Verbose -Message "Copying $GameClean/copy/ to $OutPath" +Copy-Item -Recurse -Force $GameClean/clean/* $OutPath + +Write-Verbose -Message "Creating $OutPath/tf/materials" +New-Item -Type Directory -Path $OutPath/tf/materials + +$targets = "hl2","platform" +$targets += ,"maps","media","resource","scripts" | % { "tf/$_" } +$targets += ,"models","vgui" | % { "tf/materials/$_" } +ForEach ($t in $targets) { + Make-Symlink -Path $t +} + +Glob-Symlink -Glob '' -Path 'bin' +ForEach ($g in '*.vpk','*.cache') { Glob-Symlink -Glob $g -Path 'tf' }
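
link.ps1 takes the retail TF2 install directory as its one mandatory parameter and needs a PowerShell session that is allowed to create symbolic links (an elevated prompt, or Windows Developer Mode); the path below is only an example and will vary per machine:

    .\link.ps1 -TF2Dir "C:\Program Files (x86)\Steam\steamapps\common\Team Fortress 2"

The script copies game_clean/clean into the sibling game directory, then symlinks hl2, platform, and the listed tf/ subdirectories from the retail install, and globs the retail bin contents plus the tf .vpk/.cache files for linking as well.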