From 9312111295ef41c6739c201acd33b03b59e3b8e7 Mon Sep 17 00:00:00 2001
From: mastercoms
Date: Fri, 3 Mar 2023 06:39:38 -0500
Subject: [PATCH 01/42] perf: fix duplicate trace logic for COLLISION_MODE_PER_FRAME_PLANESET collision mode

COLLISION_MODE_PER_FRAME_PLANESET was running traces for the next mode due to a
missing break, which decreased performance. These traces are expensive, as seen
in profiling.
---
 src/particles/builtin_constraints.cpp | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/particles/builtin_constraints.cpp b/src/particles/builtin_constraints.cpp
index 5658972e1..d84afa3ba 100644
--- a/src/particles/builtin_constraints.cpp
+++ b/src/particles/builtin_constraints.cpp
@@ -503,17 +503,16 @@ void CWorldCollideContextData::CalculatePlanes( CParticleCollection *pParticles,
 			}
 			m_nNumFixedPlanes = nIndexOut;
 			m_nActivePlanes = nIndexOut;
+			// UNDONE: We're now introducing this fix.
+			// Long missing break. Added to Source2 in change 700053.
+			// It's a bug, but changing it now could cause regressions, so
+			// leaving it for now until someone decides it's worth fixing.
+			// This break is necessary when exceptions are enabled because otherwise
+			// m_bPlaneActive[21] is set even though that plane is filled with
+			// NaNs. We should perhaps put this break in, but we need to do
+			// careful particle testing.
+			break;
 		}
-		// Long missing break. Added to Source2 in change 700053.
-		// It's a bug, but changing it now could cause regressions, so
-		// leaving it for now until someone decides it's worth fixing.
-#ifdef FP_EXCEPTIONS_ENABLED
-		// This break is necessary when exceptions are enabled because otherwise
-		// m_bPlaneActive[21] is set even though that plane is filled with
-		// NaNs. We should perhaps put this break in, but we need to do
-		// careful particle testing.
-		break;
-#endif
 
 		case COLLISION_MODE_USE_NEAREST_TRACE:
 		{
@@ -529,6 +528,7 @@ void CWorldCollideContextData::CalculatePlanes( CParticleCollection *pParticles,
 			}
 			m_nNumFixedPlanes = nIndexOut;
 			m_nActivePlanes = nIndexOut;
+			break;
 		}
 	}
 }

From ad892e06185a0084c63d283e93a9ba709b9d6540 Mon Sep 17 00:00:00 2001
From: mastercoms
Date: Sat, 4 Mar 2023 17:16:07 -0500
Subject: [PATCH 02/42] perf: enable SSE2 in build

Allows for much more efficient math optimizations.
---
 src/vpc_scripts/source_dll_win32_base.vpc     | 2 +-
 src/vpc_scripts/source_exe_win_win32_base.vpc | 2 +-
 src/vpc_scripts/source_lib_win32_base.vpc     | 2 +-
 src/vphysics/vphysics.vpc                     | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/vpc_scripts/source_dll_win32_base.vpc b/src/vpc_scripts/source_dll_win32_base.vpc
index a61409584..40729b6c7 100644
--- a/src/vpc_scripts/source_dll_win32_base.vpc
+++ b/src/vpc_scripts/source_dll_win32_base.vpc
@@ -41,7 +41,7 @@ $Configuration
 
 	$Compiler [$WIN32]
 	{
-		$EnableEnhancedInstructionSet	"Streaming SIMD Extensions (/arch:SSE)"
+		$EnableEnhancedInstructionSet	"Streaming SIMD Extensions 2 (/arch:SSE2)"
 	}
 
 	$Linker
diff --git a/src/vpc_scripts/source_exe_win_win32_base.vpc b/src/vpc_scripts/source_exe_win_win32_base.vpc
index 16350b03b..f71cc479b 100644
--- a/src/vpc_scripts/source_exe_win_win32_base.vpc
+++ b/src/vpc_scripts/source_exe_win_win32_base.vpc
@@ -41,7 +41,7 @@ $Configuration
 
 	$Compiler [$WIN32]
 	{
-		$EnableEnhancedInstructionSet	"Streaming SIMD Extensions (/arch:SSE)"
+		$EnableEnhancedInstructionSet	"Streaming SIMD Extensions 2 (/arch:SSE2)"
 	}
 
 	$Linker
diff --git a/src/vpc_scripts/source_lib_win32_base.vpc b/src/vpc_scripts/source_lib_win32_base.vpc
index 1bc4f7fe1..94b9dd7d6 100644
--- a/src/vpc_scripts/source_lib_win32_base.vpc
+++ b/src/vpc_scripts/source_lib_win32_base.vpc
@@ -40,7 +40,7 @@ $Configuration
 
 	$Compiler [$WIN32]
 	{
-		$EnableEnhancedInstructionSet	"Streaming SIMD Extensions (/arch:SSE)"
+		$EnableEnhancedInstructionSet	"Streaming SIMD Extensions 2 (/arch:SSE2)"
 	}
 
 	$PreBuildEvent
diff --git a/src/vphysics/vphysics.vpc b/src/vphysics/vphysics.vpc
index 9b4ee3f1d..5d8f43213 100644
--- a/src/vphysics/vphysics.vpc
+++ b/src/vphysics/vphysics.vpc
@@ -23,7 +23,7 @@ $Configuration
 	}
 	$Compiler [$WIN32]
 	{
-		$EnableEnhancedInstructionSet	"Streaming SIMD Extensions (/arch:SSE)"
+		$EnableEnhancedInstructionSet	"Streaming SIMD Extensions 2 (/arch:SSE2)"
 	}
 
 	$Linker

From f8dc5b84898b664162c25004690a1c1e2bba872d Mon Sep 17 00:00:00 2001
From: mastercoms
Date: Sat, 4 Mar 2023 17:17:02 -0500
Subject: [PATCH 03/42] perf: add DirectXMath third party dep

Will be used for optimized SSE routines. I chose this library because it has a
lineage from what Source uses for Xbox 360 math (xboxmath -> xnamath ->
DirectXMath), so porting SIMD operations to PC is simplified and there are
similar guarantees.

It can be retrieved from https://github.com/microsoft/DirectXMath

A blank sal.h also needs to be retrieved from
https://github.com/microsoft/omi/blob/master/Unix/common/linux/sal.h
for POSIX (https://github.com/microsoft/DirectXMath/issues/89#issuecomment-530519242),
and Inc/DirectXMath.h needs to be edited to include the system sal.h on Windows
and the above blank sal.h on POSIX.
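For illustration, one possible shape of that edit (a sketch only, not the exact
change in this tree; the guard macro and the include path for the blank sal.h
are assumptions and depend on how the stub, added here as
src/thirdparty/dotnetrt/sal.h, is wired onto the include path):

    // Hypothetical edit near the top of Inc/DirectXMath.h (sketch only).
    #ifdef _WIN32
    #include <sal.h>    // SAL annotations ship with the Windows SDK / MSVC
    #else
    #include "sal.h"    // blank sal.h stub from microsoft/omi, placed on the include path
    #endif

With the stub in place, the SAL annotations used throughout the DirectXMath
headers expand to nothing on POSIX, so the headers compile unchanged.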
--- .../DirectXMath-dec2022/.gitattributes | 8 + src/thirdparty/DirectXMath-dec2022/.gitignore | 24 + .../.nuget/directxmath.nuspec | 33 + .../.nuget/directxmath.targets | 11 + .../DirectXMath-dec2022/.nuget/icon.jpg | Bin 0 -> 3479 bytes .../DirectXMath-dec2022/.nuget/signconfig.xml | 6 + .../DirectXMath-dec2022/CMakeLists.txt | 91 + .../DirectXMath-dec2022/CMakePresets.json | 175 + .../Extensions/DirectXMathAVX.h | 275 + .../Extensions/DirectXMathAVX2.h | 1037 ++ .../Extensions/DirectXMathBE.h | 95 + .../Extensions/DirectXMathF16C.h | 471 + .../Extensions/DirectXMathFMA3.h | 391 + .../Extensions/DirectXMathFMA4.h | 415 + .../Extensions/DirectXMathSSE3.h | 111 + .../Extensions/DirectXMathSSE4.h | 417 + src/thirdparty/DirectXMath-dec2022/HISTORY.md | 198 + .../Inc/DirectXCollision.h | 359 + .../Inc/DirectXCollision.inl | 4816 +++++ .../DirectXMath-dec2022/Inc/DirectXColors.h | 312 + .../DirectXMath-dec2022/Inc/DirectXMath.h | 2280 +++ .../Inc/DirectXMathConvert.inl | 2191 +++ .../Inc/DirectXMathMatrix.inl | 3550 ++++ .../Inc/DirectXMathMisc.inl | 2493 +++ .../Inc/DirectXMathVector.inl | 14819 ++++++++++++++++ .../Inc/DirectXPackedVector.h | 1224 ++ .../Inc/DirectXPackedVector.inl | 4459 +++++ src/thirdparty/DirectXMath-dec2022/LICENSE | 21 + .../MatrixStack/DirectXMatrixStack.h | 241 + src/thirdparty/DirectXMath-dec2022/README.md | 115 + .../DirectXMath-dec2022/SECURITY.md | 41 + .../DirectXMath-dec2022/SHMath/DirectXSH.cpp | 4908 +++++ .../DirectXMath-dec2022/SHMath/DirectXSH.h | 72 + .../SHMath/DirectXSHD3D11.cpp | 383 + .../SHMath/DirectXSHD3D12.cpp | 339 + .../Stereo3D/Stereo3DMatrixHelper.cpp | 257 + .../Stereo3D/Stereo3DMatrixHelper.h | 64 + .../DirectXMath-dec2022/XDSP/XDSP.h | 871 + .../build/DirectXMath-GitHub-CMake-Dev17.yml | 119 + .../build/DirectXMath-GitHub-CMake.yml | 103 + .../build/DirectXMath-GitHub-Dev17.yml | 296 + .../build/DirectXMath-GitHub-MinGW.yml | 170 + .../build/DirectXMath-GitHub-WSL-11.yml | 64 + .../build/DirectXMath-GitHub-WSL.yml | 64 + .../build/DirectXMath-GitHub.yml | 543 + .../build/DirectXMath-config.cmake.in | 5 + src/thirdparty/dotnetrt/sal.h | 2953 +++ 47 files changed, 51890 insertions(+) create mode 100644 src/thirdparty/DirectXMath-dec2022/.gitattributes create mode 100644 src/thirdparty/DirectXMath-dec2022/.gitignore create mode 100644 src/thirdparty/DirectXMath-dec2022/.nuget/directxmath.nuspec create mode 100644 src/thirdparty/DirectXMath-dec2022/.nuget/directxmath.targets create mode 100644 src/thirdparty/DirectXMath-dec2022/.nuget/icon.jpg create mode 100644 src/thirdparty/DirectXMath-dec2022/.nuget/signconfig.xml create mode 100644 src/thirdparty/DirectXMath-dec2022/CMakeLists.txt create mode 100644 src/thirdparty/DirectXMath-dec2022/CMakePresets.json create mode 100644 src/thirdparty/DirectXMath-dec2022/Extensions/DirectXMathAVX.h create mode 100644 src/thirdparty/DirectXMath-dec2022/Extensions/DirectXMathAVX2.h create mode 100644 src/thirdparty/DirectXMath-dec2022/Extensions/DirectXMathBE.h create mode 100644 src/thirdparty/DirectXMath-dec2022/Extensions/DirectXMathF16C.h create mode 100644 src/thirdparty/DirectXMath-dec2022/Extensions/DirectXMathFMA3.h create mode 100644 src/thirdparty/DirectXMath-dec2022/Extensions/DirectXMathFMA4.h create mode 100644 src/thirdparty/DirectXMath-dec2022/Extensions/DirectXMathSSE3.h create mode 100644 src/thirdparty/DirectXMath-dec2022/Extensions/DirectXMathSSE4.h create mode 100644 src/thirdparty/DirectXMath-dec2022/HISTORY.md create mode 100644 src/thirdparty/DirectXMath-dec2022/Inc/DirectXCollision.h 
create mode 100644 src/thirdparty/DirectXMath-dec2022/Inc/DirectXCollision.inl create mode 100644 src/thirdparty/DirectXMath-dec2022/Inc/DirectXColors.h create mode 100644 src/thirdparty/DirectXMath-dec2022/Inc/DirectXMath.h create mode 100644 src/thirdparty/DirectXMath-dec2022/Inc/DirectXMathConvert.inl create mode 100644 src/thirdparty/DirectXMath-dec2022/Inc/DirectXMathMatrix.inl create mode 100644 src/thirdparty/DirectXMath-dec2022/Inc/DirectXMathMisc.inl create mode 100644 src/thirdparty/DirectXMath-dec2022/Inc/DirectXMathVector.inl create mode 100644 src/thirdparty/DirectXMath-dec2022/Inc/DirectXPackedVector.h create mode 100644 src/thirdparty/DirectXMath-dec2022/Inc/DirectXPackedVector.inl create mode 100644 src/thirdparty/DirectXMath-dec2022/LICENSE create mode 100644 src/thirdparty/DirectXMath-dec2022/MatrixStack/DirectXMatrixStack.h create mode 100644 src/thirdparty/DirectXMath-dec2022/README.md create mode 100644 src/thirdparty/DirectXMath-dec2022/SECURITY.md create mode 100644 src/thirdparty/DirectXMath-dec2022/SHMath/DirectXSH.cpp create mode 100644 src/thirdparty/DirectXMath-dec2022/SHMath/DirectXSH.h create mode 100644 src/thirdparty/DirectXMath-dec2022/SHMath/DirectXSHD3D11.cpp create mode 100644 src/thirdparty/DirectXMath-dec2022/SHMath/DirectXSHD3D12.cpp create mode 100644 src/thirdparty/DirectXMath-dec2022/Stereo3D/Stereo3DMatrixHelper.cpp create mode 100644 src/thirdparty/DirectXMath-dec2022/Stereo3D/Stereo3DMatrixHelper.h create mode 100644 src/thirdparty/DirectXMath-dec2022/XDSP/XDSP.h create mode 100644 src/thirdparty/DirectXMath-dec2022/build/DirectXMath-GitHub-CMake-Dev17.yml create mode 100644 src/thirdparty/DirectXMath-dec2022/build/DirectXMath-GitHub-CMake.yml create mode 100644 src/thirdparty/DirectXMath-dec2022/build/DirectXMath-GitHub-Dev17.yml create mode 100644 src/thirdparty/DirectXMath-dec2022/build/DirectXMath-GitHub-MinGW.yml create mode 100644 src/thirdparty/DirectXMath-dec2022/build/DirectXMath-GitHub-WSL-11.yml create mode 100644 src/thirdparty/DirectXMath-dec2022/build/DirectXMath-GitHub-WSL.yml create mode 100644 src/thirdparty/DirectXMath-dec2022/build/DirectXMath-GitHub.yml create mode 100644 src/thirdparty/DirectXMath-dec2022/build/DirectXMath-config.cmake.in create mode 100644 src/thirdparty/dotnetrt/sal.h diff --git a/src/thirdparty/DirectXMath-dec2022/.gitattributes b/src/thirdparty/DirectXMath-dec2022/.gitattributes new file mode 100644 index 000000000..f416ccf92 --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/.gitattributes @@ -0,0 +1,8 @@ +# Auto detect text files and perform LF normalization +* text=auto + +# Explicitly declare code/VS files as CRLF +*.cpp eol=crlf +*.cmd eol=crlf +*.h eol=crlf +*.inl eol=crlf diff --git a/src/thirdparty/DirectXMath-dec2022/.gitignore b/src/thirdparty/DirectXMath-dec2022/.gitignore new file mode 100644 index 000000000..33834c644 --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/.gitignore @@ -0,0 +1,24 @@ +*.psess +*.vsp +*.log +*.err +*.wrn +*.suo +*.sdf +*.user +*.i +*.vspscc +*.opensdf +*.opendb +*.ipch +*.cache +*.tlog +*.lastbuildstate +*.ilk +*.VC.db +*.nupkg +.vs +/Tests +/wiki +/out +/CMakeUserPresets.json diff --git a/src/thirdparty/DirectXMath-dec2022/.nuget/directxmath.nuspec b/src/thirdparty/DirectXMath-dec2022/.nuget/directxmath.nuspec new file mode 100644 index 000000000..218c3bfe2 --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/.nuget/directxmath.nuspec @@ -0,0 +1,33 @@ + + + + directxmath + 0.0.0-SpecifyVersionOnCommandline + DirectXMath + Microsoft + microsoft,directxtk 
+ DirectXMath is an all inline SIMD C++ linear algebra library for use in games and graphics apps. + The DirectXMath API provides SIMD-friendly C++ types and functions for common linear algebra and graphics math operations common to DirectX applications. The library provides optimized versions for Windows 32-bit (x86), Windows 64-bit (x64), and Windows on ARM through SSE2 and ARM-NEON intrinsics support in the Visual Studio compiler. + Matches the December 2022 release. + http://go.microsoft.com/fwlink/?LinkID=615560 + + images\icon.jpg + docs\README.md + MIT + false + © Microsoft Corporation. All rights reserved. + C++ native DirectX math nativepackage + + + + + + + + + + + + + + \ No newline at end of file diff --git a/src/thirdparty/DirectXMath-dec2022/.nuget/directxmath.targets b/src/thirdparty/DirectXMath-dec2022/.nuget/directxmath.targets new file mode 100644 index 000000000..0a31f579b --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/.nuget/directxmath.targets @@ -0,0 +1,11 @@ + + + + + + HAS_DIRECTXMATH;%(PreprocessorDefinitions) + $(MSBuildThisFileDirectory)..\..\include;%(AdditionalIncludeDirectories) + + + + diff --git a/src/thirdparty/DirectXMath-dec2022/.nuget/icon.jpg b/src/thirdparty/DirectXMath-dec2022/.nuget/icon.jpg new file mode 100644 index 0000000000000000000000000000000000000000..08fe1faeb7f6e45d796cf1e67bf1cbb1347c514a GIT binary patch literal 3479 zcmbW$XHe5?mjLj82q7Rr2u1}(Y6PT8@1RJ((ximmf=CfjB1P&|i4>8JSGXbwVrWvN zBb`vCw;<9%dPy)y2mxMvcjxZR&VJb4^UV45oO$N_<~-+Q;$#kBGtkl30YD%S;BN{b_aR={-OXhp_WW z-DEgt>I~(7gphuk_?}Tfv#jO(ogpGp#>Mv;)0qnwIXEv}6%-P_CL${*ub`-;d`nAP zM^{hZz|7pj5@ls=gLb{^cF*0z6Z6>5KOpc)(DMlFi^!{3zLA% z9pZTx-=Q-Xkh1evNxy0Tmi_Nw&;DPse_;RRngn2A(CP5Na6kjtrwV8Pc}BQcs3RN^ zqB7U+Ui?6PBs5OkNy(>k_)is2(G@0jHLtCOS0Y}8aNO!gr(rF$N^Zmt{cf&BrNx?( z9Mf?vlnyKwWC`Vsw*mYRsHZdoN?XxtnDTKSk(O9tSs;+c364gpHQ^ATofK3Y<&VYf zDPvP_;AF|$jI{K^lUTY(K zn7ol&+vJ?@x!p3k5Qg%{L_nLq_y)VA%^lJCoSmpImpHmZ3b6aSpC&W#*ReWo%~tZ9 zYy>Um(b`C+!7j-vP3_udwd#}+zpdFfuGw~R-gRZvcQgh$?x=1|-w{OGYN4?N`84mG zKLJ{`=M;-@$$r}@+T}~T=+5o$y9eQOa=rF>b=A|#uf^ZoWue)HNAh4FHl3C8Q0Jyu zT?!pZDXVw(;zM^3T3($1;OckHl@4^((ho!BM8T+ZVi<0iTbcxOxIoN3{v--G#~z>$ z`-52f8_~oEZ|nLtW<0}XJY_g@eWhn^-{-)RC3!8NrDpa`o!()FUieD^&x>3+EZOb2 zh#cgBQcl6bz_1qY&8^}bY=pEyKDUk(8=%v4M?q;HWABISiK{FBpC* z@~Fi%UCbw5^|Y~>#cGG3m=#sN#n9k+^{P6fQe@p=?)^=u3P>YN^45C8QM??%&UL!y zh}LoJ$55F4o$^wvrs_*H$iMf_B7@)+EB3JGB)Junj!IFrc;z^$eCy5NPuQd4d}qjB z?3!B9Dl*1g35IL1D#-|Aq%^LyH6k;csr8vP={kqkg!q-}QlW>%O5Bv=z0mUxbpd1C zp997qA+Hu^Ig86l7L{zI%G}si6^!%+?a45bY4^(8fc8*~9WJJki-PVD?kn)l z@_gSnp2D_4RzjrLmC9i{@7L>lV4Qy1@jI0Y?oP)hoB+*DF*3)*&Ib36+lkS9Z#cT> zrR6Omeb818{1ncGcv$~rB(}uX1gpf%xx0-BLM!?-|6H@P$c&Mw_|h%DC+e{&ky<9& z!rhih^;JHfFY}f@mG~?Oc-gj$}>%ZBKOsxb_9h5?*OCWMaL?K1B0w6=h>g$sw?hFN*f~$kIm~x z{Db};GeV8{;yuo&-}O=T!;8N?)*z7cD(&xx{((`$4_Y_Z*lF|tn+#8UTI-ijfCZN2 zkI#*UWSeGt+V0Od9Q4byVmXdjd;$aJd|s*=O3zSYmexuYga*`Imx2W!HbwIo^mCl0 z3JA{yuf~bC2OVYXM0v|y;mmj-UX`h4=C)#%>ZH6Bjt#}m9~o9=o?V+vZY4I1)ftfL zQibt!I;`*eFf#~>?YuD~ebQHB+n4Ppz+YdA^aVq8@6v$c{su;EO%dUp2y>V0XECDf zA=adM?~>f%5SfpL0U^a`Nq#N{E;fX(p{Z`!^o*^QZDsp3{-I{h)g`Aa6gI?!k~nAR zHch5fMD2Z;#I9H0x8a51=xJ(c$|0)yu}Y731@B({_V zIS*1Qa*aQrGY9AzQwBAyqS)Qsm-L$972((IoF0(-l~HiWRMJ)zHYDngP&_7$bw8u7 z>CfPwPcy~44!b56RiBCKqW0rd z^lB|CwnlfuK{TyeikDK^XuT3xl&gM5FK+I1n|yerss8&VSu{x!PaC(mw!MalR;`*h zP6atox*856#LrOt$B)HJMYJztmfqBV4GX>a3OCJI%wtJ3Ekmo-m$2n!NqM zI48cN_?-y5e~{D&zSTx;piHEohpWt05jAUO4hc`0^ZuV5XU>p#c8 zh-3}%lUoYA`g+G^{L|xLbGN$s3|IFr2T4v^kw>GVWU;Z;8L82nsyIC)J|Tk;06iSc zzl=I&_<>rhD?g^A_&c%%_V2f&HX6jDh4BwZy5EIEx_|W$j2L1J2dKwQb5|5aEbIF* 
zrbEjVGm@0p$CR(Gt2zC5o0EjN-iCi}J^^Os*2H;ZOpI<^<&uswf1aRQ5}qfQPdaSP z{LT%Jx2=v>u(7>MRJvZxa=^v&(JPW@S>YECYjyi->=A`@9(4*SG9I}Sl;E+L-yov< z-fGgvAaT{lu6calw{^XMb%(BG8b}_Qd>2(Nq1yZzO|PKCy+ZTcTJEea{T}It{R}M0 zW8zH{yQqXT%1HZ5cKO6ym4@CXWN(8xw>->Gu=ZX=%6g-H+E+0#9BO)^*YIk{BNG4j zTf^_;RLm4!55Syn+`u3s!Gm03+?Br|mGzdd%WQ8v<-^juKhMtN0PmK_P-KYn^;dG# zaryX*){UmM#U1xYqp#6wx)RK4+CP3FRqt)9P^eYdFc&o2S^F)1a%5&s>K`MbhNdwzh(4!tY~Jy zq~gW;q;UhH>x%K5rx0VGjlohsJiQ{3go_TboLscKc|Q(alg7bb;~mo^7RVjjAGoep zQqag1Xcef@ObvU}1sQTwUZwU-RsU!`LRiu}f@uO?3tr-$Jt}{Pm){mV6iRCpbnZI* zCIE#NE0!4G7==Z7BM|2kS*D8=#xXW_yh%)p?n}3ZQesaRzN=@KCQaJ@^kD`XY+5JN zIqTSPd2v;D*=vJ3U++jPY&(`CeDbq5Ew-3{+2cOv^vvX}&SESnc?4k~V$kawlT@xA zSW9})w9rk^BJM?eOZ{BPC@up-BToRvPBSW@j3z;mf?_|sv?kCBKRJaAX{oz!#g`l! z?+p6ymFN!E%VbmBMmFo@|AxH1xAVm7r^aFL6CeC|CHz@jYpl8d8R6`_tn3(J;D1Q) H$+v$2rQD|6 literal 0 HcmV?d00001 diff --git a/src/thirdparty/DirectXMath-dec2022/.nuget/signconfig.xml b/src/thirdparty/DirectXMath-dec2022/.nuget/signconfig.xml new file mode 100644 index 000000000..f32a6a464 --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/.nuget/signconfig.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/src/thirdparty/DirectXMath-dec2022/CMakeLists.txt b/src/thirdparty/DirectXMath-dec2022/CMakeLists.txt new file mode 100644 index 000000000..90ca3e2d3 --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/CMakeLists.txt @@ -0,0 +1,91 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +cmake_minimum_required (VERSION 3.20) + +set(DIRECTXMATH_VERSION 3.1.8) + +project(DirectXMath + VERSION ${DIRECTXMATH_VERSION} + DESCRIPTION "DirectXMath SIMD C++ math library" + HOMEPAGE_URL "https://go.microsoft.com/fwlink/?LinkID=615560" + LANGUAGES CXX) + +include(GNUInstallDirs) + +#--- Library +set(LIBRARY_HEADERS + Inc/DirectXCollision.h + Inc/DirectXCollision.inl + Inc/DirectXColors.h + Inc/DirectXMath.h + Inc/DirectXMathConvert.inl + Inc/DirectXMathMatrix.inl + Inc/DirectXMathMisc.inl + Inc/DirectXMathVector.inl + Inc/DirectXPackedVector.h + Inc/DirectXPackedVector.inl) + +add_library(${PROJECT_NAME} INTERFACE) + +target_include_directories(${PROJECT_NAME} INTERFACE + $ + $) + +target_compile_features(${PROJECT_NAME} INTERFACE cxx_std_11) + +#--- Package +include(CMakePackageConfigHelpers) + +string(TOLOWER ${PROJECT_NAME} PACKAGE_NAME) + +write_basic_package_version_file( + ${PACKAGE_NAME}-config-version.cmake + VERSION ${DIRECTXMATH_VERSION} + COMPATIBILITY AnyNewerVersion + ARCH_INDEPENDENT) + +install(TARGETS ${PROJECT_NAME} + EXPORT ${PROJECT_NAME}-targets + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) + +configure_package_config_file(${CMAKE_CURRENT_SOURCE_DIR}/build/${PROJECT_NAME}-config.cmake.in + ${CMAKE_CURRENT_BINARY_DIR}/${PACKAGE_NAME}-config.cmake + INSTALL_DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/${PACKAGE_NAME}) + +install(EXPORT ${PROJECT_NAME}-targets + FILE ${PROJECT_NAME}-targets.cmake + NAMESPACE Microsoft:: + DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/${PACKAGE_NAME}) + +install(FILES ${LIBRARY_HEADERS} + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/directxmath) + +install(FILES + ${CMAKE_CURRENT_BINARY_DIR}/${PACKAGE_NAME}-config.cmake + ${CMAKE_CURRENT_BINARY_DIR}/${PACKAGE_NAME}-config-version.cmake + DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/${PACKAGE_NAME}) + +#--- Test suite +if (DEFINED VCPKG_TARGET_ARCHITECTURE) + set(DXMATH_ARCHITECTURE ${VCPKG_TARGET_ARCHITECTURE}) +elseif(CMAKE_GENERATOR_PLATFORM MATCHES 
"^[Ww][Ii][Nn]32$") + set(DXMATH_ARCHITECTURE x86) +elseif(CMAKE_GENERATOR_PLATFORM MATCHES "^[Xx]64$") + set(DXMATH_ARCHITECTURE x64) +elseif(CMAKE_GENERATOR_PLATFORM MATCHES "^[Aa][Rr][Mm]$") + set(DXMATH_ARCHITECTURE arm) +elseif(CMAKE_GENERATOR_PLATFORM MATCHES "^[Aa][Rr][Mm]64$") + set(DXMATH_ARCHITECTURE arm64) +elseif(NOT DXMATH_ARCHITECTURE) + set(DXMATH_ARCHITECTURE "x64") +endif() + +#--- Test suite +include(CTest) +if(BUILD_TESTING AND WIN32 AND (NOT WINDOWS_STORE) AND (EXISTS "${CMAKE_CURRENT_LIST_DIR}/Tests/CMakeLists.txt")) + enable_testing() + add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/Tests) +endif() diff --git a/src/thirdparty/DirectXMath-dec2022/CMakePresets.json b/src/thirdparty/DirectXMath-dec2022/CMakePresets.json new file mode 100644 index 000000000..90b680e7d --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/CMakePresets.json @@ -0,0 +1,175 @@ +{ + "version": 2, + "configurePresets": [ + { + "name": "base", + "displayName": "Basic Config", + "description": "Basic build using Ninja generator", + "generator": "Ninja", + "hidden": true, + "binaryDir": "${sourceDir}/out/build/${presetName}", + "cacheVariables": { "CMAKE_INSTALL_PREFIX": "${sourceDir}/out/install/${presetName}" } + }, + + { + "name": "x64", + "architecture": { + "value": "x64", + "strategy": "external" + }, + "cacheVariables": { "DXMATH_ARCHITECTURE": "x64" }, + "hidden": true + }, + { + "name": "x86", + "architecture": { + "value": "x86", + "strategy": "external" + }, + "cacheVariables": { "DXMATH_ARCHITECTURE": "x86" }, + "hidden": true + }, + { + "name": "ARM", + "architecture": { + "value": "arm", + "strategy": "external" + }, + "cacheVariables": { "DXMATH_ARCHITECTURE": "arm" }, + "hidden": true + }, + { + "name": "ARM64", + "architecture": { + "value": "arm64", + "strategy": "external" + }, + "cacheVariables": { "DXMATH_ARCHITECTURE": "arm64" }, + "hidden": true + }, + + { + "name": "Debug", + "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" }, + "hidden": true + }, + { + "name": "Release", + "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" }, + "hidden": true + }, + + { + "name": "OneCore", + "cacheVariables": { "BUILD_FOR_ONECORE": true }, + "hidden": true + }, + { + "name": "AVX", + "cacheVariables": { "BUILD_AVX_TEST": true }, + "hidden": true + }, + { + "name": "AVX2", + "cacheVariables": { "BUILD_AVX2_TEST": true }, + "hidden": true + }, + { + "name": "F16C", + "cacheVariables": { "BUILD_F16C_TEST": true }, + "hidden": true + }, + { + "name": "NI", + "cacheVariables": { "BUILD_NO_INTRINSICS": true }, + "hidden": true + }, + + { + "name": "MSVC", + "hidden": true, + "cacheVariables": { + "CMAKE_CXX_COMPILER": "cl.exe" + }, + "toolset": { + "value": "host=x64", + "strategy": "external" + } + }, + { + "name": "Clang", + "hidden": true, + "cacheVariables": { + "CMAKE_CXX_COMPILER": "clang-cl.exe" + }, + "toolset": { + "value": "host=x64", + "strategy": "external" + } + }, + { + "name": "GNUC", + "hidden": true, + "cacheVariables": { + "CMAKE_CXX_COMPILER": "g++.exe" + }, + "toolset": { + "value": "host=x64", + "strategy": "external" + } + }, + { + "name": "Intel", + "hidden": true, + "cacheVariables": { + "CMAKE_CXX_COMPILER": "icl.exe" + }, + "toolset": { + "value": "host=x64", + "strategy": "external" + } + }, + { + "name": "IntelLLVM", + "hidden": true, + "cacheVariables": { + "CMAKE_CXX_COMPILER": "icx.exe" + }, + "toolset": { + "value": "host=x64", + "strategy": "external" + } + }, + + { "name": "x64-Debug" , "description": "MSVC for x64 (Debug) - SSE/SSE2", "inherits": [ "base", 
"x64", "Debug", "MSVC" ] }, + { "name": "x64-Release" , "description": "MSVC for x64 (Release) - SSE/SSE2", "inherits": [ "base", "x64", "Release", "MSVC" ] }, + { "name": "x86-Debug" , "description": "MSVC for x86 (Debug) - SSE/SSE2", "inherits": [ "base", "x86", "Debug", "MSVC" ] }, + { "name": "x86-Release" , "description": "MSVC for x86 (Release) - SSE/SSE2", "inherits": [ "base", "x86", "Release", "MSVC" ] }, + { "name": "arm-Debug" , "description": "MSVC for ARM (Debug) - ARM-NEON", "inherits": [ "base", "ARM", "Debug", "MSVC" ] }, + { "name": "arm-Release" , "description": "MSVC for ARM (Release) - ARM-NEON", "inherits": [ "base", "ARM", "Release", "MSVC" ] }, + { "name": "arm64-Debug" , "description": "MSVC for ARM64 (Debug) - ARM-NEON", "inherits": [ "base", "ARM64", "Debug", "MSVC" ] }, + { "name": "arm64-Release", "description": "MSVC for ARM64 (Release) - ARM-NEON", "inherits": [ "base", "ARM64", "Release", "MSVC" ] }, + + { "name": "x64-Debug-Clang" , "description": "Clang/LLVM for x64 (Debug) - SSE/SSE2", "inherits": [ "base", "x64", "Debug", "Clang" ] }, + { "name": "x64-Release-Clang" , "description": "Clang/LLVM for x64 (Release) - SSE/SSE2", "inherits": [ "base", "x64", "Release", "Clang" ] }, + { "name": "x86-Debug-Clang" , "description": "Clang/LLVM for x86 (Debug) - SSE/SSE2", "inherits": [ "base", "x86", "Debug", "Clang" ], "environment": { "CXXFLAGS": "-m32" } }, + { "name": "x86-Release-Clang" , "description": "Clang/LLVM for x86 (Release) - SSE/SSE2", "inherits": [ "base", "x86", "Release", "Clang" ], "environment": { "CXXFLAGS": "-m32" } }, + { "name": "arm64-Debug-Clang" , "description": "Clang/LLVM for AArch64 (Debug) - ARM-NEON", "inherits": [ "base", "ARM64", "Debug", "Clang" ], "environment": { "CXXFLAGS": "--target=arm64-pc-windows-msvc" } }, + { "name": "arm64-Release-Clang", "description": "Clang/LLVM for AArch64 (Release) - ARM-NEON", "inherits": [ "base", "ARM64", "Release", "Clang" ], "environment": { "CXXFLAGS": "--target=arm64-pc-windows-msvc" } } + ], + "testPresets": [ + { "name": "x64-Debug" , "configurePreset": "x64-Debug" }, + { "name": "x64-Release" , "configurePreset": "x64-Release" }, + { "name": "x86-Debug" , "configurePreset": "x86-Debug" }, + { "name": "x86-Release" , "configurePreset": "x86-Release" }, + { "name": "arm64-Debug" , "configurePreset": "arm64-Debug" }, + { "name": "arm64-Release", "configurePreset": "arm64-Release" }, + + { "name": "x64-Debug-Clang" , "configurePreset": "x64-Debug-Clang" }, + { "name": "x64-Release-Clang" , "configurePreset": "x64-Release-Clang" }, + { "name": "x86-Debug-Clang" , "configurePreset": "x86-Debug-Clang" }, + { "name": "x86-Release-Clang" , "configurePreset": "x86-Release-Clang" }, + { "name": "arm64-Debug-Clang" , "configurePreset": "arm64-Debug-Clang" }, + { "name": "arm64-Release-Clang", "configurePreset": "arm64-Release-Clang" } + ] +} \ No newline at end of file diff --git a/src/thirdparty/DirectXMath-dec2022/Extensions/DirectXMathAVX.h b/src/thirdparty/DirectXMath-dec2022/Extensions/DirectXMathAVX.h new file mode 100644 index 000000000..901a1c9b3 --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/Extensions/DirectXMathAVX.h @@ -0,0 +1,275 @@ +//------------------------------------------------------------------------------------- +// DirectXMathAVX.h -- AVX (version 1) extensions for SIMD C++ Math library +// +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
+// +// http://go.microsoft.com/fwlink/?LinkID=615560 +//------------------------------------------------------------------------------------- + +#pragma once + +#if defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __arm__ || __aarch64__ +#error AVX not supported on ARM platform +#endif + +#include + +namespace DirectX +{ + +namespace AVX +{ + +inline bool XMVerifyAVXSupport() +{ + // Should return true for AMD Bulldozer, Intel "Sandy Bridge", and Intel "Ivy Bridge" or later processors + // with OS support for AVX (Windows 7 Service Pack 1, Windows Server 2008 R2 Service Pack 1, Windows 8, Windows Server 2012) + + // See http://msdn.microsoft.com/en-us/library/hskdteyh.aspx + int CPUInfo[4] = {-1}; +#if defined(__clang__) || defined(__GNUC__) + __cpuid(0, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]); +#else + __cpuid( CPUInfo, 0 ); +#endif + + if ( CPUInfo[0] < 1 ) + return false; + +#if defined(__clang__) || defined(__GNUC__) + __cpuid(1, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]); +#else + __cpuid(CPUInfo, 1 ); +#endif + + // We check for AVX, OSXSAVE, SSSE4.1, and SSE3 + return ( (CPUInfo[2] & 0x18080001) == 0x18080001 ); +} + + +//------------------------------------------------------------------------------------- +// Vector +//------------------------------------------------------------------------------------- + +inline XMVECTOR XM_CALLCONV XMVectorReplicatePtr( _In_ const float *pValue ) +{ + return _mm_broadcast_ss( pValue ); +} + +inline XMVECTOR XM_CALLCONV XMVectorSplatX( FXMVECTOR V ) +{ + return _mm_permute_ps( V, _MM_SHUFFLE(0, 0, 0, 0) ); +} + +inline XMVECTOR XM_CALLCONV XMVectorSplatY( FXMVECTOR V ) +{ + return _mm_permute_ps( V, _MM_SHUFFLE(1, 1, 1, 1) ); +} + +inline XMVECTOR XM_CALLCONV XMVectorSplatZ( FXMVECTOR V ) +{ + return _mm_permute_ps( V, _MM_SHUFFLE(2, 2, 2, 2) ); +} + +inline XMVECTOR XM_CALLCONV XMVectorSplatW( FXMVECTOR V ) +{ + return _mm_permute_ps( V, _MM_SHUFFLE(3, 3, 3, 3) ); +} + +inline XMVECTOR XM_CALLCONV XMVectorSwizzle( FXMVECTOR V, uint32_t E0, uint32_t E1, uint32_t E2, uint32_t E3 ) +{ + assert( (E0 < 4) && (E1 < 4) && (E2 < 4) && (E3 < 4) ); + _Analysis_assume_( (E0 < 4) && (E1 < 4) && (E2 < 4) && (E3 < 4) ); + + unsigned int elem[4] = { E0, E1, E2, E3 }; + __m128i vControl = _mm_loadu_si128( reinterpret_cast(&elem[0]) ); + return _mm_permutevar_ps( V, vControl ); +} + +inline XMVECTOR XM_CALLCONV XMVectorPermute( FXMVECTOR V1, FXMVECTOR V2, uint32_t PermuteX, uint32_t PermuteY, uint32_t PermuteZ, uint32_t PermuteW ) +{ + assert( PermuteX <= 7 && PermuteY <= 7 && PermuteZ <= 7 && PermuteW <= 7 ); + _Analysis_assume_( PermuteX <= 7 && PermuteY <= 7 && PermuteZ <= 7 && PermuteW <= 7 ); + + static const XMVECTORU32 three = { { { 3, 3, 3, 3 } } }; + + XM_ALIGNED_DATA(16) unsigned int elem[4] = { PermuteX, PermuteY, PermuteZ, PermuteW }; + __m128i vControl = _mm_load_si128( reinterpret_cast(&elem[0]) ); + + __m128i vSelect = _mm_cmpgt_epi32( vControl, three ); + vControl = _mm_castps_si128( _mm_and_ps( _mm_castsi128_ps( vControl ), three ) ); + + __m128 shuffled1 = _mm_permutevar_ps( V1, vControl ); + __m128 shuffled2 = _mm_permutevar_ps( V2, vControl ); + + __m128 masked1 = _mm_andnot_ps( _mm_castsi128_ps( vSelect ), shuffled1 ); + __m128 masked2 = _mm_and_ps( _mm_castsi128_ps( vSelect ), shuffled2 ); + + return _mm_or_ps( masked1, masked2 ); +} + +inline XMVECTOR XM_CALLCONV XMVectorShiftLeft(FXMVECTOR V1, FXMVECTOR V2, uint32_t Elements) +{ + assert( Elements < 4 ); + _Analysis_assume_( Elements < 4 ); + return 
AVX::XMVectorPermute(V1, V2, Elements, ((Elements) + 1), ((Elements) + 2), ((Elements) + 3)); +} + +inline XMVECTOR XM_CALLCONV XMVectorRotateLeft(FXMVECTOR V, uint32_t Elements) +{ + assert( Elements < 4 ); + _Analysis_assume_( Elements < 4 ); + return AVX::XMVectorSwizzle( V, Elements & 3, (Elements + 1) & 3, (Elements + 2) & 3, (Elements + 3) & 3 ); +} + +inline XMVECTOR XM_CALLCONV XMVectorRotateRight(FXMVECTOR V, uint32_t Elements) +{ + assert( Elements < 4 ); + _Analysis_assume_( Elements < 4 ); + return AVX::XMVectorSwizzle( V, (4 - (Elements)) & 3, (5 - (Elements)) & 3, (6 - (Elements)) & 3, (7 - (Elements)) & 3 ); +} + + +//------------------------------------------------------------------------------------- +// Permute Templates +//------------------------------------------------------------------------------------- + +namespace Internal +{ + // Slow path fallback for permutes that do not map to a single SSE opcode. + template struct PermuteHelper + { + static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) + { + static const XMVECTORU32 selectMask = + { + WhichX ? 0xFFFFFFFF : 0, + WhichY ? 0xFFFFFFFF : 0, + WhichZ ? 0xFFFFFFFF : 0, + WhichW ? 0xFFFFFFFF : 0, + }; + + XMVECTOR shuffled1 = _mm_permute_ps(v1, Shuffle); + XMVECTOR shuffled2 = _mm_permute_ps(v2, Shuffle); + + XMVECTOR masked1 = _mm_andnot_ps(selectMask, shuffled1); + XMVECTOR masked2 = _mm_and_ps(selectMask, shuffled2); + + return _mm_or_ps(masked1, masked2); + } + }; + + // Fast path for permutes that only read from the first vector. + template struct PermuteHelper + { + static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) { (v2); return _mm_permute_ps(v1, Shuffle); } + }; + + // Fast path for permutes that only read from the second vector. + template struct PermuteHelper + { + static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2){ (v1); return _mm_permute_ps(v2, Shuffle); } + }; + + // Fast path for permutes that read XY from the first vector, ZW from the second. + template struct PermuteHelper + { + static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) { return _mm_shuffle_ps(v1, v2, Shuffle); } + }; + + // Fast path for permutes that read XY from the second vector, ZW from the first. 
+ template struct PermuteHelper + { + static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) { return _mm_shuffle_ps(v2, v1, Shuffle); } + }; +}; + +// General permute template +template + inline XMVECTOR XM_CALLCONV XMVectorPermute(FXMVECTOR V1, FXMVECTOR V2) +{ + static_assert(PermuteX <= 7, "PermuteX template parameter out of range"); + static_assert(PermuteY <= 7, "PermuteY template parameter out of range"); + static_assert(PermuteZ <= 7, "PermuteZ template parameter out of range"); + static_assert(PermuteW <= 7, "PermuteW template parameter out of range"); + + const uint32_t Shuffle = _MM_SHUFFLE(PermuteW & 3, PermuteZ & 3, PermuteY & 3, PermuteX & 3); + + const bool WhichX = PermuteX > 3; + const bool WhichY = PermuteY > 3; + const bool WhichZ = PermuteZ > 3; + const bool WhichW = PermuteW > 3; + + return AVX::Internal::PermuteHelper::Permute(V1, V2); +} + +// Special-case permute templates +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,2,3>(FXMVECTOR V1, FXMVECTOR) { return V1; } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,6,7>(FXMVECTOR, FXMVECTOR V2) { return V2; } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x1); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x2); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x3); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x4); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x5); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x6); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x7); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x8); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x9); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xA); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xB); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xC); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xD); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xE); } + + +//------------------------------------------------------------------------------------- +// Swizzle Templates +//------------------------------------------------------------------------------------- + +// General swizzle template +template + inline XMVECTOR XM_CALLCONV XMVectorSwizzle(FXMVECTOR V) +{ + static_assert(SwizzleX <= 3, "SwizzleX template parameter out of range"); + static_assert(SwizzleY <= 3, "SwizzleY template parameter out of range"); + static_assert(SwizzleZ <= 3, "SwizzleZ template parameter out of range"); + static_assert(SwizzleW <= 3, "SwizzleW template parameter out of range"); + + 
return _mm_permute_ps( V, _MM_SHUFFLE( SwizzleW, SwizzleZ, SwizzleY, SwizzleX ) ); +} + +// Specialized swizzles +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,1,2,3>(FXMVECTOR V) { return V; } +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,0,2,2>(FXMVECTOR V) { return _mm_moveldup_ps(V); } +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1,1,3,3>(FXMVECTOR V) { return _mm_movehdup_ps(V); } + + +//------------------------------------------------------------------------------------- +// Other Templates +//------------------------------------------------------------------------------------- + +template + inline XMVECTOR XM_CALLCONV XMVectorShiftLeft(FXMVECTOR V1, FXMVECTOR V2) +{ + static_assert( Elements < 4, "Elements template parameter out of range" ); + return AVX::XMVectorPermute(V1, V2); +} + +template + inline XMVECTOR XM_CALLCONV XMVectorRotateLeft(FXMVECTOR V) +{ + static_assert( Elements < 4, "Elements template parameter out of range" ); + return AVX::XMVectorSwizzle(V); +} + +template + inline XMVECTOR XM_CALLCONV XMVectorRotateRight(FXMVECTOR V) +{ + static_assert( Elements < 4, "Elements template parameter out of range" ); + return AVX::XMVectorSwizzle<(4 - Elements) & 3, (5 - Elements) & 3, (6 - Elements) & 3, (7 - Elements) & 3>(V); +} + +} // namespace AVX + +} // namespace DirectX; diff --git a/src/thirdparty/DirectXMath-dec2022/Extensions/DirectXMathAVX2.h b/src/thirdparty/DirectXMath-dec2022/Extensions/DirectXMathAVX2.h new file mode 100644 index 000000000..9624dc954 --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/Extensions/DirectXMathAVX2.h @@ -0,0 +1,1037 @@ +//------------------------------------------------------------------------------------- +// DirectXMathAVX2.h -- AVX2 extensions for SIMD C++ Math library +// +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
+// +// http://go.microsoft.com/fwlink/?LinkID=615560 +//------------------------------------------------------------------------------------- + +#pragma once + +#if defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __arm__ || __aarch64__ +#error AVX2 not supported on ARM platform +#endif + +#include +#include + +namespace DirectX +{ + +namespace AVX2 +{ + +inline bool XMVerifyAVX2Support() +{ + // Should return true for AMD "Excavator", Intel "Haswell" or later processors + // with OS support for AVX (Windows 7 Service Pack 1, Windows Server 2008 R2 Service Pack 1, Windows 8, Windows Server 2012) + + // See http://msdn.microsoft.com/en-us/library/hskdteyh.aspx + int CPUInfo[4] = {-1}; +#if defined(__clang__) || defined(__GNUC__) + __cpuid(0, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]); +#else + __cpuid(CPUInfo, 0); +#endif + + if ( CPUInfo[0] < 7 ) + return false; + +#if defined(__clang__) || defined(__GNUC__) + __cpuid(1, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]); +#else + __cpuid(CPUInfo, 1); +#endif + + // We check for F16C, FMA3, AVX, OSXSAVE, SSSE4.1, and SSE3 + if ( (CPUInfo[2] & 0x38081001) != 0x38081001 ) + return false; + +#if defined(__clang__) || defined(__GNUC__) + __cpuid_count(7, 0, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]); +#else + __cpuidex(CPUInfo, 7, 0); +#endif + + return ( (CPUInfo[1] & 0x20 ) == 0x20 ); +} + + +//------------------------------------------------------------------------------------- +// Vector +//------------------------------------------------------------------------------------- + +inline XMVECTOR XM_CALLCONV XMVectorReplicatePtr( _In_ const float *pValue ) +{ + return _mm_broadcast_ss( pValue ); +} + +inline XMVECTOR XM_CALLCONV XMVectorSplatX( FXMVECTOR V ) +{ + return _mm_broadcastss_ps( V ); +} + +inline XMVECTOR XM_CALLCONV XMVectorSplatY( FXMVECTOR V ) +{ + return _mm_permute_ps( V, _MM_SHUFFLE(1, 1, 1, 1) ); +} + +inline XMVECTOR XM_CALLCONV XMVectorSplatZ( FXMVECTOR V ) +{ + return _mm_permute_ps( V, _MM_SHUFFLE(2, 2, 2, 2) ); +} + +inline XMVECTOR XM_CALLCONV XMVectorSplatW( FXMVECTOR V ) +{ + return _mm_permute_ps( V, _MM_SHUFFLE(3, 3, 3, 3) ); +} + +inline XMVECTOR XM_CALLCONV XMVectorMultiplyAdd +( + FXMVECTOR V1, + FXMVECTOR V2, + FXMVECTOR V3 +) +{ + return _mm_fmadd_ps( V1, V2, V3 ); +} + +inline XMVECTOR XM_CALLCONV XMVectorNegativeMultiplySubtract +( + FXMVECTOR V1, + FXMVECTOR V2, + FXMVECTOR V3 +) +{ + return _mm_fnmadd_ps( V1, V2, V3 ); +} + +inline XMVECTOR XM_CALLCONV XMVectorSwizzle( FXMVECTOR V, uint32_t E0, uint32_t E1, uint32_t E2, uint32_t E3 ) +{ + assert( (E0 < 4) && (E1 < 4) && (E2 < 4) && (E3 < 4) ); + _Analysis_assume_( (E0 < 4) && (E1 < 4) && (E2 < 4) && (E3 < 4) ); + + unsigned int elem[4] = { E0, E1, E2, E3 }; + __m128i vControl = _mm_loadu_si128( reinterpret_cast(&elem[0]) ); + return _mm_permutevar_ps( V, vControl ); +} + +inline XMVECTOR XM_CALLCONV XMVectorPermute( FXMVECTOR V1, FXMVECTOR V2, uint32_t PermuteX, uint32_t PermuteY, uint32_t PermuteZ, uint32_t PermuteW ) +{ + assert( PermuteX <= 7 && PermuteY <= 7 && PermuteZ <= 7 && PermuteW <= 7 ); + _Analysis_assume_( PermuteX <= 7 && PermuteY <= 7 && PermuteZ <= 7 && PermuteW <= 7 ); + + static const XMVECTORU32 three = { { { 3, 3, 3, 3 } } }; + + XM_ALIGNED_DATA(16) unsigned int elem[4] = { PermuteX, PermuteY, PermuteZ, PermuteW }; + __m128i vControl = _mm_load_si128( reinterpret_cast(&elem[0]) ); + + __m128i vSelect = _mm_cmpgt_epi32( vControl, three ); + vControl = _mm_castps_si128( _mm_and_ps( _mm_castsi128_ps( vControl 
), three ) ); + + __m128 shuffled1 = _mm_permutevar_ps( V1, vControl ); + __m128 shuffled2 = _mm_permutevar_ps( V2, vControl ); + + __m128 masked1 = _mm_andnot_ps( _mm_castsi128_ps( vSelect ), shuffled1 ); + __m128 masked2 = _mm_and_ps( _mm_castsi128_ps( vSelect ), shuffled2 ); + + return _mm_or_ps( masked1, masked2 ); +} + +inline XMVECTOR XM_CALLCONV XMVectorShiftLeft(FXMVECTOR V1, FXMVECTOR V2, uint32_t Elements) +{ + assert( Elements < 4 ); + _Analysis_assume_( Elements < 4 ); + return AVX2::XMVectorPermute(V1, V2, Elements, ((Elements) + 1), ((Elements) + 2), ((Elements) + 3)); +} + +inline XMVECTOR XM_CALLCONV XMVectorRotateLeft(FXMVECTOR V, uint32_t Elements) +{ + assert( Elements < 4 ); + _Analysis_assume_( Elements < 4 ); + return AVX2::XMVectorSwizzle( V, Elements & 3, (Elements + 1) & 3, (Elements + 2) & 3, (Elements + 3) & 3 ); +} + +inline XMVECTOR XM_CALLCONV XMVectorRotateRight(FXMVECTOR V, uint32_t Elements) +{ + assert( Elements < 4 ); + _Analysis_assume_( Elements < 4 ); + return AVX2::XMVectorSwizzle( V, (4 - (Elements)) & 3, (5 - (Elements)) & 3, (6 - (Elements)) & 3, (7 - (Elements)) & 3 ); +} + + +//------------------------------------------------------------------------------------- +// Vector2 +//------------------------------------------------------------------------------------- + +inline XMVECTOR XM_CALLCONV XMVector2Transform +( + FXMVECTOR V, + CXMMATRIX M +) +{ + XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y + vResult = _mm_fmadd_ps( vResult, M.r[1], M.r[3] ); + XMVECTOR vTemp = _mm_broadcastss_ps(V); // X + vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult ); + return vResult; +} + +inline XMVECTOR XM_CALLCONV XMVector2TransformCoord +( + FXMVECTOR V, + CXMMATRIX M +) +{ + XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y + vResult = _mm_fmadd_ps( vResult, M.r[1], M.r[3] ); + XMVECTOR vTemp = _mm_broadcastss_ps(V); // X + vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult ); + XMVECTOR W = _mm_permute_ps(vResult,_MM_SHUFFLE(3,3,3,3)); + vResult = _mm_div_ps( vResult, W ); + return vResult; +} + +inline XMVECTOR XM_CALLCONV XMVector2TransformNormal +( + FXMVECTOR V, + CXMMATRIX M +) +{ + XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y + vResult = _mm_mul_ps( vResult, M.r[1] ); + XMVECTOR vTemp = _mm_broadcastss_ps(V); // X + vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult ); + return vResult; +} + + +//------------------------------------------------------------------------------------- +// Vector3 +//------------------------------------------------------------------------------------- + +inline XMVECTOR XM_CALLCONV XMVector3Transform +( + FXMVECTOR V, + CXMMATRIX M +) +{ + XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(2,2,2,2)); // Z + vResult = _mm_fmadd_ps( vResult, M.r[2], M.r[3] ); + XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y + vResult = _mm_fmadd_ps( vTemp, M.r[1], vResult ); + vTemp = _mm_broadcastss_ps(V); // X + vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult ); + return vResult; +} + +inline XMVECTOR XM_CALLCONV XMVector3TransformCoord +( + FXMVECTOR V, + CXMMATRIX M +) +{ + XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(2,2,2,2)); // Z + vResult = _mm_fmadd_ps( vResult, M.r[2], M.r[3] ); + XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y + vResult = _mm_fmadd_ps( vTemp, M.r[1], vResult ); + vTemp = _mm_broadcastss_ps(V); // X + vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult ); + XMVECTOR W = _mm_permute_ps(vResult,_MM_SHUFFLE(3,3,3,3)); + vResult = _mm_div_ps( vResult, W ); + 
return vResult; +} + +inline XMVECTOR XM_CALLCONV XMVector3TransformNormal +( + FXMVECTOR V, + CXMMATRIX M +) +{ + XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(2,2,2,2)); // Z + vResult = _mm_mul_ps( vResult, M.r[2] ); + XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y + vResult = _mm_fmadd_ps( vTemp, M.r[1], vResult ); + vTemp = _mm_broadcastss_ps(V); // X + vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult ); + return vResult; +} + +XMMATRIX XM_CALLCONV XMMatrixMultiply(CXMMATRIX M1, CXMMATRIX M2); + +inline XMVECTOR XM_CALLCONV XMVector3Project +( + FXMVECTOR V, + float ViewportX, + float ViewportY, + float ViewportWidth, + float ViewportHeight, + float ViewportMinZ, + float ViewportMaxZ, + CXMMATRIX Projection, + CXMMATRIX View, + CXMMATRIX World +) +{ + const float HalfViewportWidth = ViewportWidth * 0.5f; + const float HalfViewportHeight = ViewportHeight * 0.5f; + + XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight, ViewportMaxZ - ViewportMinZ, 0.0f); + XMVECTOR Offset = XMVectorSet(ViewportX + HalfViewportWidth, ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f); + + XMMATRIX Transform = AVX2::XMMatrixMultiply(World, View); + Transform = AVX2::XMMatrixMultiply(Transform, Projection); + + XMVECTOR Result = AVX2::XMVector3TransformCoord(V, Transform); + + Result = AVX2::XMVectorMultiplyAdd(Result, Scale, Offset); + + return Result; +} + +inline XMVECTOR XM_CALLCONV XMVector3Unproject +( + FXMVECTOR V, + float ViewportX, + float ViewportY, + float ViewportWidth, + float ViewportHeight, + float ViewportMinZ, + float ViewportMaxZ, + CXMMATRIX Projection, + CXMMATRIX View, + CXMMATRIX World +) +{ + static const XMVECTORF32 D = { { { -1.0f, 1.0f, 0.0f, 0.0f } } }; + + XMVECTOR Scale = XMVectorSet(ViewportWidth * 0.5f, -ViewportHeight * 0.5f, ViewportMaxZ - ViewportMinZ, 1.0f); + Scale = XMVectorReciprocal(Scale); + + XMVECTOR Offset = XMVectorSet(-ViewportX, -ViewportY, -ViewportMinZ, 0.0f); + Offset = AVX2::XMVectorMultiplyAdd(Scale, Offset, D.v); + + XMMATRIX Transform = AVX2::XMMatrixMultiply(World, View); + Transform = AVX2::XMMatrixMultiply(Transform, Projection); + Transform = XMMatrixInverse(nullptr, Transform); + + XMVECTOR Result = AVX2::XMVectorMultiplyAdd(V, Scale, Offset); + + return AVX2::XMVector3TransformCoord(Result, Transform); +} + + +//------------------------------------------------------------------------------------- +// Vector4 +//------------------------------------------------------------------------------------- + +inline XMVECTOR XM_CALLCONV XMVector4Transform +( + FXMVECTOR V, + CXMMATRIX M +) +{ + XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(3,3,3,3)); // W + vResult = _mm_mul_ps( vResult, M.r[3] ); + XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(2,2,2,2)); // Z + vResult = _mm_fmadd_ps( vTemp, M.r[2], vResult ); + vTemp = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y + vResult = _mm_fmadd_ps( vTemp, M.r[1], vResult ); + vTemp = _mm_broadcastss_ps(V); // X + vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult ); + return vResult; +} + + +//------------------------------------------------------------------------------------- +// Matrix +//------------------------------------------------------------------------------------- + +inline XMMATRIX XM_CALLCONV XMMatrixMultiply +( + CXMMATRIX M1, + CXMMATRIX M2 +) +{ + XMMATRIX mResult; + // Use vW to hold the original row + XMVECTOR vW = M1.r[0]; + // Splat the component X,Y,Z then W + XMVECTOR vX = _mm_broadcastss_ps(vW); + XMVECTOR vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR 
vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + // Perform the operation on the first row + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_fmadd_ps(vY,M2.r[1],vX); + vX = _mm_fmadd_ps(vZ,M2.r[2],vX); + vX = _mm_fmadd_ps(vW,M2.r[3],vX); + mResult.r[0] = vX; + // Repeat for the other 3 rows + vW = M1.r[1]; + vX = _mm_broadcastss_ps(vW); + vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_fmadd_ps(vY,M2.r[1],vX); + vX = _mm_fmadd_ps(vZ,M2.r[2],vX); + vX = _mm_fmadd_ps(vW,M2.r[3],vX); + mResult.r[1] = vX; + vW = M1.r[2]; + vX = _mm_broadcastss_ps(vW); + vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_fmadd_ps(vY,M2.r[1],vX); + vX = _mm_fmadd_ps(vZ,M2.r[2],vX); + vX = _mm_fmadd_ps(vW,M2.r[3],vX); + mResult.r[2] = vX; + vW = M1.r[3]; + vX = _mm_broadcastss_ps(vW); + vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_fmadd_ps(vY,M2.r[1],vX); + vX = _mm_fmadd_ps(vZ,M2.r[2],vX); + vX = _mm_fmadd_ps(vW,M2.r[3],vX); + mResult.r[3] = vX; + return mResult; +} + +inline XMMATRIX XM_CALLCONV XMMatrixMultiplyTranspose +( + FXMMATRIX M1, + CXMMATRIX M2 +) +{ + // Use vW to hold the original row + XMVECTOR vW = M1.r[0]; + // Splat the component X,Y,Z then W + XMVECTOR vX = _mm_broadcastss_ps(vW); + XMVECTOR vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + // Perform the operation on the first row + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_fmadd_ps(vY,M2.r[1],vX); + vX = _mm_fmadd_ps(vZ,M2.r[2],vX); + vX = _mm_fmadd_ps(vW,M2.r[3],vX); + __m128 r0 = vX; + // Repeat for the other 3 rows + vW = M1.r[1]; + vX = _mm_broadcastss_ps(vW); + vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_fmadd_ps(vY,M2.r[1],vX); + vX = _mm_fmadd_ps(vZ,M2.r[2],vX); + vX = _mm_fmadd_ps(vW,M2.r[3],vX); + __m128 r1 = vX; + vW = M1.r[2]; + vX = _mm_broadcastss_ps(vW); + vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_fmadd_ps(vY,M2.r[1],vX); + vX = _mm_fmadd_ps(vZ,M2.r[2],vX); + vX = _mm_fmadd_ps(vW,M2.r[3],vX); + __m128 r2 = vX; + vW = M1.r[3]; + vX = _mm_broadcastss_ps(vW); + vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_fmadd_ps(vY,M2.r[1],vX); + vX = _mm_fmadd_ps(vZ,M2.r[2],vX); + vX = _mm_fmadd_ps(vW,M2.r[3],vX); + __m128 r3 = vX; + + // x.x,x.y,y.x,y.y + XMVECTOR vTemp1 = _mm_shuffle_ps(r0,r1,_MM_SHUFFLE(1,0,1,0)); + // x.z,x.w,y.z,y.w + XMVECTOR vTemp3 = _mm_shuffle_ps(r0,r1,_MM_SHUFFLE(3,2,3,2)); + // z.x,z.y,w.x,w.y + XMVECTOR vTemp2 = _mm_shuffle_ps(r2,r3,_MM_SHUFFLE(1,0,1,0)); + // z.z,z.w,w.z,w.w + XMVECTOR vTemp4 = _mm_shuffle_ps(r2,r3,_MM_SHUFFLE(3,2,3,2)); + + XMMATRIX mResult; + // x.x,y.x,z.x,w.x + mResult.r[0] = _mm_shuffle_ps(vTemp1, vTemp2,_MM_SHUFFLE(2,0,2,0)); + // 
x.y,y.y,z.y,w.y + mResult.r[1] = _mm_shuffle_ps(vTemp1, vTemp2,_MM_SHUFFLE(3,1,3,1)); + // x.z,y.z,z.z,w.z + mResult.r[2] = _mm_shuffle_ps(vTemp3, vTemp4,_MM_SHUFFLE(2,0,2,0)); + // x.w,y.w,z.w,w.w + mResult.r[3] = _mm_shuffle_ps(vTemp3, vTemp4,_MM_SHUFFLE(3,1,3,1)); + return mResult; +} + + +//------------------------------------------------------------------------------------- +// Permute Templates +//------------------------------------------------------------------------------------- + +namespace Internal +{ + // Slow path fallback for permutes that do not map to a single SSE opcode. + template struct PermuteHelper + { + static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) + { + static const XMVECTORU32 selectMask = + { + WhichX ? 0xFFFFFFFF : 0, + WhichY ? 0xFFFFFFFF : 0, + WhichZ ? 0xFFFFFFFF : 0, + WhichW ? 0xFFFFFFFF : 0, + }; + + XMVECTOR shuffled1 = _mm_permute_ps(v1, Shuffle); + XMVECTOR shuffled2 = _mm_permute_ps(v2, Shuffle); + + XMVECTOR masked1 = _mm_andnot_ps(selectMask, shuffled1); + XMVECTOR masked2 = _mm_and_ps(selectMask, shuffled2); + + return _mm_or_ps(masked1, masked2); + } + }; + + // Fast path for permutes that only read from the first vector. + template struct PermuteHelper + { + static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) { (v2); return _mm_permute_ps(v1, Shuffle); } + }; + + // Fast path for permutes that only read from the second vector. + template struct PermuteHelper + { + static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2){ (v1); return _mm_permute_ps(v2, Shuffle); } + }; + + // Fast path for permutes that read XY from the first vector, ZW from the second. + template struct PermuteHelper + { + static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) { return _mm_shuffle_ps(v1, v2, Shuffle); } + }; + + // Fast path for permutes that read XY from the second vector, ZW from the first. 
+ template struct PermuteHelper + { + static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) { return _mm_shuffle_ps(v2, v1, Shuffle); } + }; +}; + +// General permute template +template + inline XMVECTOR XM_CALLCONV XMVectorPermute(FXMVECTOR V1, FXMVECTOR V2) +{ + static_assert(PermuteX <= 7, "PermuteX template parameter out of range"); + static_assert(PermuteY <= 7, "PermuteY template parameter out of range"); + static_assert(PermuteZ <= 7, "PermuteZ template parameter out of range"); + static_assert(PermuteW <= 7, "PermuteW template parameter out of range"); + + const uint32_t Shuffle = _MM_SHUFFLE(PermuteW & 3, PermuteZ & 3, PermuteY & 3, PermuteX & 3); + + const bool WhichX = PermuteX > 3; + const bool WhichY = PermuteY > 3; + const bool WhichZ = PermuteZ > 3; + const bool WhichW = PermuteW > 3; + + return AVX2::Internal::PermuteHelper::Permute(V1, V2); +} + +// Special-case permute templates +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,2,3>(FXMVECTOR V1, FXMVECTOR) { return V1; } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,6,7>(FXMVECTOR, FXMVECTOR V2) { return V2; } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x1); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x2); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x3); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x4); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x5); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x6); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x7); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x8); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x9); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xA); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xB); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xC); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xD); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xE); } + + +//------------------------------------------------------------------------------------- +// Swizzle Templates +//------------------------------------------------------------------------------------- + +// General swizzle template +template + inline XMVECTOR XM_CALLCONV XMVectorSwizzle(FXMVECTOR V) +{ + static_assert(SwizzleX <= 3, "SwizzleX template parameter out of range"); + static_assert(SwizzleY <= 3, "SwizzleY template parameter out of range"); + static_assert(SwizzleZ <= 3, "SwizzleZ template parameter out of range"); + static_assert(SwizzleW <= 3, "SwizzleW template parameter out of range"); + + 
return _mm_permute_ps( V, _MM_SHUFFLE( SwizzleW, SwizzleZ, SwizzleY, SwizzleX ) ); +} + +// Specialized swizzles +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,1,2,3>(FXMVECTOR V) { return V; } +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,0,0,0>(FXMVECTOR V) { return _mm_broadcastss_ps(V); } +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,0,2,2>(FXMVECTOR V) { return _mm_moveldup_ps(V); } +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1,1,3,3>(FXMVECTOR V) { return _mm_movehdup_ps(V); } + + +//------------------------------------------------------------------------------------- +// Other Templates +//------------------------------------------------------------------------------------- + +template + inline XMVECTOR XM_CALLCONV XMVectorShiftLeft(FXMVECTOR V1, FXMVECTOR V2) +{ + static_assert( Elements < 4, "Elements template parameter out of range" ); + return AVX2::XMVectorPermute(V1, V2); +} + +template + inline XMVECTOR XM_CALLCONV XMVectorRotateLeft(FXMVECTOR V) +{ + static_assert( Elements < 4, "Elements template parameter out of range" ); + return AVX2::XMVectorSwizzle(V); +} + +template + inline XMVECTOR XM_CALLCONV XMVectorRotateRight(FXMVECTOR V) +{ + static_assert( Elements < 4, "Elements template parameter out of range" ); + return AVX2::XMVectorSwizzle<(4 - Elements) & 3, (5 - Elements) & 3, (6 - Elements) & 3, (7 - Elements) & 3>(V); +} + +//------------------------------------------------------------------------------------- +// Data conversion +//------------------------------------------------------------------------------------- + +inline float XMConvertHalfToFloat( PackedVector::HALF Value ) +{ + __m128i V1 = _mm_cvtsi32_si128( static_cast(Value) ); + __m128 V2 = _mm_cvtph_ps( V1 ); + return _mm_cvtss_f32( V2 ); +} + +inline PackedVector::HALF XMConvertFloatToHalf( float Value ) +{ + __m128 V1 = _mm_set_ss( Value ); + __m128i V2 = _mm_cvtps_ph( V1, 0 ); + return static_cast( _mm_cvtsi128_si32(V2) ); +} + +inline float* XMConvertHalfToFloatStream +( + _Out_writes_bytes_(sizeof(float)+OutputStride*(HalfCount-1)) float* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(2+InputStride*(HalfCount-1)) const PackedVector::HALF* pInputStream, + _In_ size_t InputStride, + _In_ size_t HalfCount +) +{ + using namespace PackedVector; + + assert(pOutputStream); + assert(pInputStream); + + assert(InputStride >= sizeof(HALF)); + assert(OutputStride >= sizeof(float)); + + auto pHalf = reinterpret_cast(pInputStream); + auto pFloat = reinterpret_cast(pOutputStream); + + size_t i = 0; + size_t four = HalfCount >> 2; + if (four > 0) + { + if (InputStride == sizeof(HALF)) + { + if (OutputStride == sizeof(float)) + { + if ((reinterpret_cast(pFloat) & 0xF) == 0) + { + // Packed input, aligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m128i HV = _mm_loadl_epi64(reinterpret_cast(pHalf)); + pHalf += InputStride * 4; + + __m128 FV = _mm_cvtph_ps(HV); + + _mm_stream_ps(reinterpret_cast(pFloat), FV); + pFloat += OutputStride * 4; + i += 4; + } + } + else + { + // Packed input, packed output + for (size_t j = 0; j < four; ++j) + { + __m128i HV = _mm_loadl_epi64(reinterpret_cast(pHalf)); + pHalf += InputStride * 4; + + __m128 FV = _mm_cvtph_ps(HV); + + _mm_storeu_ps(reinterpret_cast(pFloat), FV); + pFloat += OutputStride * 4; + i += 4; + } + } + } + else + { + // Packed input, scattered output + for (size_t j = 0; j < four; ++j) + { + __m128i HV = _mm_loadl_epi64(reinterpret_cast(pHalf)); + pHalf += InputStride * 4; + + __m128 FV = 
_mm_cvtph_ps(HV); + + _mm_store_ss(reinterpret_cast(pFloat), FV); + pFloat += OutputStride; + *reinterpret_cast(pFloat) = _mm_extract_ps(FV, 1); + pFloat += OutputStride; + *reinterpret_cast(pFloat) = _mm_extract_ps(FV, 2); + pFloat += OutputStride; + *reinterpret_cast(pFloat) = _mm_extract_ps(FV, 3); + pFloat += OutputStride; + i += 4; + } + } + } + else if (OutputStride == sizeof(float)) + { + if ((reinterpret_cast(pFloat) & 0xF) == 0) + { + // Scattered input, aligned & packed output + for (size_t j = 0; j < four; ++j) + { + uint16_t H1 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H2 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H3 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H4 = *reinterpret_cast(pHalf); + pHalf += InputStride; + + __m128i HV = _mm_setzero_si128(); + HV = _mm_insert_epi16(HV, H1, 0); + HV = _mm_insert_epi16(HV, H2, 1); + HV = _mm_insert_epi16(HV, H3, 2); + HV = _mm_insert_epi16(HV, H4, 3); + __m128 FV = _mm_cvtph_ps(HV); + + _mm_stream_ps(reinterpret_cast(pFloat), FV); + pFloat += OutputStride * 4; + i += 4; + } + } + else + { + // Scattered input, packed output + for (size_t j = 0; j < four; ++j) + { + uint16_t H1 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H2 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H3 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H4 = *reinterpret_cast(pHalf); + pHalf += InputStride; + + __m128i HV = _mm_setzero_si128(); + HV = _mm_insert_epi16(HV, H1, 0); + HV = _mm_insert_epi16(HV, H2, 1); + HV = _mm_insert_epi16(HV, H3, 2); + HV = _mm_insert_epi16(HV, H4, 3); + __m128 FV = _mm_cvtph_ps(HV); + + _mm_storeu_ps(reinterpret_cast(pFloat), FV); + pFloat += OutputStride * 4; + i += 4; + } + + } + } + else + { + // Scattered input, scattered output + for (size_t j = 0; j < four; ++j) + { + uint16_t H1 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H2 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H3 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H4 = *reinterpret_cast(pHalf); + pHalf += InputStride; + + __m128i HV = _mm_setzero_si128(); + HV = _mm_insert_epi16(HV, H1, 0); + HV = _mm_insert_epi16(HV, H2, 1); + HV = _mm_insert_epi16(HV, H3, 2); + HV = _mm_insert_epi16(HV, H4, 3); + __m128 FV = _mm_cvtph_ps(HV); + + _mm_store_ss(reinterpret_cast(pFloat), FV); + pFloat += OutputStride; + *reinterpret_cast(pFloat) = _mm_extract_ps(FV, 1); + pFloat += OutputStride; + *reinterpret_cast(pFloat) = _mm_extract_ps(FV, 2); + pFloat += OutputStride; + *reinterpret_cast(pFloat) = _mm_extract_ps(FV, 3); + pFloat += OutputStride; + i += 4; + } + } + } + + for (; i < HalfCount; ++i) + { + *reinterpret_cast(pFloat) = XMConvertHalfToFloat(reinterpret_cast(pHalf)[0]); + pHalf += InputStride; + pFloat += OutputStride; + } + + return pOutputStream; +} + + +inline PackedVector::HALF* XMConvertFloatToHalfStream +( + _Out_writes_bytes_(2+OutputStride*(FloatCount-1)) PackedVector::HALF* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(float)+InputStride*(FloatCount-1)) const float* pInputStream, + _In_ size_t InputStride, + _In_ size_t FloatCount +) +{ + using namespace PackedVector; + + assert(pOutputStream); + assert(pInputStream); + + assert(InputStride >= sizeof(float)); + assert(OutputStride >= sizeof(HALF)); + + auto pFloat = reinterpret_cast(pInputStream); + auto pHalf = reinterpret_cast(pOutputStream); + + size_t i = 0; + size_t four = FloatCount >> 2; + if (four > 0) + { + if (InputStride == 
sizeof(float)) + { + if (OutputStride == sizeof(HALF)) + { + if ((reinterpret_cast(pFloat) & 0xF) == 0) + { + // Aligned and packed input, packed output + for (size_t j = 0; j < four; ++j) + { + __m128 FV = _mm_load_ps(reinterpret_cast(pFloat)); + pFloat += InputStride * 4; + + __m128i HV = _mm_cvtps_ph(FV, 0); + + _mm_storel_epi64(reinterpret_cast<__m128i*>(pHalf), HV); + pHalf += OutputStride * 4; + i += 4; + } + } + else + { + // Packed input, packed output + for (size_t j = 0; j < four; ++j) + { + __m128 FV = _mm_loadu_ps(reinterpret_cast(pFloat)); + pFloat += InputStride * 4; + + __m128i HV = _mm_cvtps_ph(FV, 0); + + _mm_storel_epi64(reinterpret_cast<__m128i*>(pHalf), HV); + pHalf += OutputStride * 4; + i += 4; + } + } + } + else + { + if ((reinterpret_cast(pFloat) & 0xF) == 0) + { + // Aligned & packed input, scattered output + for (size_t j = 0; j < four; ++j) + { + __m128 FV = _mm_load_ps(reinterpret_cast(pFloat)); + pFloat += InputStride * 4; + + __m128i HV = _mm_cvtps_ph(FV, 0); + + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 0)); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 1)); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 2)); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 3)); + pHalf += OutputStride; + i += 4; + } + } + else + { + // Packed input, scattered output + for (size_t j = 0; j < four; ++j) + { + __m128 FV = _mm_loadu_ps(reinterpret_cast(pFloat)); + pFloat += InputStride * 4; + + __m128i HV = _mm_cvtps_ph(FV, 0); + + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 0)); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 1)); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 2)); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 3)); + pHalf += OutputStride; + i += 4; + } + } + } + } + else if (OutputStride == sizeof(HALF)) + { + // Scattered input, packed output + for (size_t j = 0; j < four; ++j) + { + __m128 FV1 = _mm_load_ss(reinterpret_cast(pFloat)); + pFloat += InputStride; + + __m128 FV2 = _mm_broadcast_ss(reinterpret_cast(pFloat)); + pFloat += InputStride; + + __m128 FV3 = _mm_broadcast_ss(reinterpret_cast(pFloat)); + pFloat += InputStride; + + __m128 FV4 = _mm_broadcast_ss(reinterpret_cast(pFloat)); + pFloat += InputStride; + + __m128 FV = _mm_blend_ps(FV1, FV2, 0x2); + __m128 FT = _mm_blend_ps(FV3, FV4, 0x8); + FV = _mm_blend_ps(FV, FT, 0xC); + + __m128i HV = _mm_cvtps_ph(FV, 0); + + _mm_storel_epi64(reinterpret_cast<__m128i*>(pHalf), HV); + pHalf += OutputStride * 4; + i += 4; + } + } + else + { + // Scattered input, scattered output + for (size_t j = 0; j < four; ++j) + { + __m128 FV1 = _mm_load_ss(reinterpret_cast(pFloat)); + pFloat += InputStride; + + __m128 FV2 = _mm_broadcast_ss(reinterpret_cast(pFloat)); + pFloat += InputStride; + + __m128 FV3 = _mm_broadcast_ss(reinterpret_cast(pFloat)); + pFloat += InputStride; + + __m128 FV4 = _mm_broadcast_ss(reinterpret_cast(pFloat)); + pFloat += InputStride; + + __m128 FV = _mm_blend_ps(FV1, FV2, 0x2); + __m128 FT = _mm_blend_ps(FV3, FV4, 0x8); + FV = _mm_blend_ps(FV, FT, 0xC); + + __m128i HV = _mm_cvtps_ph(FV, 0); + + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 0)); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 1)); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = 
static_cast(_mm_extract_epi16(HV, 2)); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 3)); + pHalf += OutputStride; + i += 4; + } + } + } + + for (; i < FloatCount; ++i) + { + *reinterpret_cast(pHalf) = XMConvertFloatToHalf(reinterpret_cast(pFloat)[0]); + pFloat += InputStride; + pHalf += OutputStride; + } + + return pOutputStream; +} + + +//------------------------------------------------------------------------------------- +// Half2 +//------------------------------------------------------------------------------------- + +inline XMVECTOR XM_CALLCONV XMLoadHalf2( _In_ const PackedVector::XMHALF2* pSource ) +{ + assert(pSource); + __m128 V = _mm_load_ss( reinterpret_cast(pSource) ); + return _mm_cvtph_ps( _mm_castps_si128( V ) ); +} + +inline void XM_CALLCONV XMStoreHalf2( _Out_ PackedVector::XMHALF2* pDestination, _In_ FXMVECTOR V ) +{ + assert(pDestination); + __m128i V1 = _mm_cvtps_ph( V, 0 ); + _mm_store_ss( reinterpret_cast(pDestination), _mm_castsi128_ps(V1) ); +} + + +//------------------------------------------------------------------------------------- +// Half4 +//------------------------------------------------------------------------------------- + +inline XMVECTOR XM_CALLCONV XMLoadHalf4( _In_ const PackedVector::XMHALF4* pSource ) +{ + assert(pSource); + __m128i V = _mm_loadl_epi64( reinterpret_cast(pSource) ); + return _mm_cvtph_ps( V ); +} + +inline void XM_CALLCONV XMStoreHalf4( _Out_ PackedVector::XMHALF4* pDestination, _In_ FXMVECTOR V ) +{ + assert(pDestination); + __m128i V1 = _mm_cvtps_ph( V, 0 ); + _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), V1 ); +} + +} // namespace AVX2 + +} // namespace DirectX; diff --git a/src/thirdparty/DirectXMath-dec2022/Extensions/DirectXMathBE.h b/src/thirdparty/DirectXMath-dec2022/Extensions/DirectXMathBE.h new file mode 100644 index 000000000..e5a0f85f7 --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/Extensions/DirectXMathBE.h @@ -0,0 +1,95 @@ +//------------------------------------------------------------------------------------- +// DirectXMathBE.h -- Big-endian swap extensions for SIMD C++ Math library +// +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
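+//
+// NOTE: XMVectorEndian byte-swaps each 32-bit lane of a vector. The SSSE3
+// specialization further below should only be used after XMVerifySSSE3Support()
+// has returned true on the current CPU; otherwise use the portable version.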
+// +// http://go.microsoft.com/fwlink/?LinkID=615560 +//------------------------------------------------------------------------------------- + +#pragma once + +#if (defined(_M_IX86) || defined(_M_X64) || __i386__ || __x86_64__) && !defined(_M_HYBRID_X86_ARM64) +#include +#endif + +#include + +namespace DirectX +{ + +inline XMVECTOR XM_CALLCONV XMVectorEndian +( + FXMVECTOR V +) +{ +#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + static const XMVECTORU32 idx = { { { 0x00010203u, 0x04050607u, 0x08090A0Bu, 0x0C0D0E0Fu } } }; + + uint8x8x2_t tbl; + tbl.val[0] = vreinterpret_u8_f32(vget_low_f32(V)); + tbl.val[1] = vreinterpret_u8_f32(vget_high_f32(V)); + + const uint8x8_t rL = vtbl2_u8(tbl, vget_low_u32(idx)); + const uint8x8_t rH = vtbl2_u8(tbl, vget_high_u32(idx)); + return vcombine_f32(vreinterpret_f32_u8(rL), vreinterpret_f32_u8(rH)); +#else + XMVECTORU32 E; + E.v = V; + uint32_t value = E.u[0]; + E.u[0] = ( (value << 24) | ((value & 0xFF00) << 8) | ((value & 0xFF0000) >> 8) | (value >> 24) ); + value = E.u[1]; + E.u[1] = ( (value << 24) | ((value & 0xFF00) << 8) | ((value & 0xFF0000) >> 8) | (value >> 24) ); + value = E.u[2]; + E.u[2] = ( (value << 24) | ((value & 0xFF00) << 8) | ((value & 0xFF0000) >> 8) | (value >> 24) ); + value = E.u[3]; + E.u[3] = ( (value << 24) | ((value & 0xFF00) << 8) | ((value & 0xFF0000) >> 8) | (value >> 24) ); + return E.v; +#endif +} + + +#if (defined(_M_IX86) || defined(_M_X64) || __i386__ || __x86_64__) && !defined(_M_HYBRID_X86_ARM64) +namespace SSSE3 +{ + +inline bool XMVerifySSSE3Support() +{ + // Should return true on AMD Bulldozer, Intel Core i7/i5/i3, Intel Atom, or later processors + + // See http://msdn.microsoft.com/en-us/library/hskdteyh.aspx + int CPUInfo[4] = { -1 }; +#if defined(__clang__) || defined(__GNUC__) + __cpuid(0, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]); +#else + __cpuid(CPUInfo, 0); +#endif + + if ( CPUInfo[0] < 1 ) + return false; + +#if defined(__clang__) || defined(__GNUC__) + __cpuid(1, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]); +#else + __cpuid(CPUInfo, 1); +#endif + + // Check for SSSE3 instruction set. + return ( (CPUInfo[2] & 0x200) != 0 ); +} + +inline XMVECTOR XM_CALLCONV XMVectorEndian +( + FXMVECTOR V +) +{ + static const XMVECTORU32 idx = { { { 0x00010203u, 0x04050607u, 0x08090A0Bu, 0x0C0D0E0Fu } } }; + + __m128i Result = _mm_shuffle_epi8( _mm_castps_si128(V), idx ); + return _mm_castsi128_ps( Result ); +} + +} // namespace SSSE3 +#endif // X86 || X64 + +} // namespace DirectX diff --git a/src/thirdparty/DirectXMath-dec2022/Extensions/DirectXMathF16C.h b/src/thirdparty/DirectXMath-dec2022/Extensions/DirectXMathF16C.h new file mode 100644 index 000000000..5802be68e --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/Extensions/DirectXMathF16C.h @@ -0,0 +1,471 @@ +//------------------------------------------------------------------------------------- +// DirectXMathF16C.h -- F16C/CVT16 extensions for SIMD C++ Math library +// +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
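+//
+// NOTE: every routine in this header requires F16C support. One possible usage
+// pattern (illustrative only, not part of the library) is a one-time CPU check
+// with a scalar fallback:
+//
+//   static const bool s_hasF16C = DirectX::F16C::XMVerifyF16CSupport();
+//   inline float HalfToFloat(DirectX::PackedVector::HALF h)
+//   {
+//       return s_hasF16C ? DirectX::F16C::XMConvertHalfToFloat(h)
+//                        : DirectX::PackedVector::XMConvertHalfToFloat(h);
+//   }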
+// +// http://go.microsoft.com/fwlink/?LinkID=615560 +//------------------------------------------------------------------------------------- + +#pragma once + +#if defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __arm__ || __aarch64__ +#error F16C not supported on ARM platform +#endif + +#include +#include + +namespace DirectX +{ + +namespace F16C +{ + +inline bool XMVerifyF16CSupport() +{ + // Should return true for AMD "Piledriver" and Intel "Ivy Bridge" processors + // with OS support for AVX (Windows 7 Service Pack 1, Windows Server 2008 R2 Service Pack 1, Windows 8, Windows Server 2012) + + // See http://msdn.microsoft.com/en-us/library/hskdteyh.aspx + int CPUInfo[4] = { -1 }; +#if defined(__clang__) || defined(__GNUC__) + __cpuid(0, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]); +#else + __cpuid(CPUInfo, 0); +#endif + + if ( CPUInfo[0] < 1 ) + return false; + +#if defined(__clang__) || defined(__GNUC__) + __cpuid(1, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]); +#else + __cpuid(CPUInfo, 1); +#endif + + // We check for F16C, AVX, OSXSAVE, and SSE4.1 + return ( (CPUInfo[2] & 0x38080000 ) == 0x38080000 ); +} + + +//------------------------------------------------------------------------------------- +// Data conversion +//------------------------------------------------------------------------------------- + +inline float XMConvertHalfToFloat( PackedVector::HALF Value ) +{ + __m128i V1 = _mm_cvtsi32_si128( static_cast(Value) ); + __m128 V2 = _mm_cvtph_ps( V1 ); + return _mm_cvtss_f32( V2 ); +} + +inline PackedVector::HALF XMConvertFloatToHalf( float Value ) +{ + __m128 V1 = _mm_set_ss( Value ); + __m128i V2 = _mm_cvtps_ph( V1, 0 ); + return static_cast( _mm_cvtsi128_si32(V2) ); +} + +inline float* XMConvertHalfToFloatStream +( + _Out_writes_bytes_(sizeof(float) + OutputStride * (HalfCount - 1)) float* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(2 + InputStride * (HalfCount - 1)) const PackedVector::HALF* pInputStream, + _In_ size_t InputStride, + _In_ size_t HalfCount +) +{ + using namespace PackedVector; + + assert(pOutputStream); + assert(pInputStream); + + assert(InputStride >= sizeof(HALF)); + assert(OutputStride >= sizeof(float)); + + auto pHalf = reinterpret_cast(pInputStream); + auto pFloat = reinterpret_cast(pOutputStream); + + size_t i = 0; + size_t four = HalfCount >> 2; + if (four > 0) + { + if (InputStride == sizeof(HALF)) + { + if (OutputStride == sizeof(float)) + { + if ((reinterpret_cast(pFloat) & 0xF) == 0) + { + // Packed input, aligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m128i HV = _mm_loadl_epi64(reinterpret_cast(pHalf)); + pHalf += InputStride * 4; + + __m128 FV = _mm_cvtph_ps(HV); + + _mm_stream_ps(reinterpret_cast(pFloat), FV); + pFloat += OutputStride * 4; + i += 4; + } + } + else + { + // Packed input, packed output + for (size_t j = 0; j < four; ++j) + { + __m128i HV = _mm_loadl_epi64(reinterpret_cast(pHalf)); + pHalf += InputStride * 4; + + __m128 FV = _mm_cvtph_ps(HV); + + _mm_storeu_ps(reinterpret_cast(pFloat), FV); + pFloat += OutputStride * 4; + i += 4; + } + } + } + else + { + // Packed input, scattered output + for (size_t j = 0; j < four; ++j) + { + __m128i HV = _mm_loadl_epi64(reinterpret_cast(pHalf)); + pHalf += InputStride * 4; + + __m128 FV = _mm_cvtph_ps(HV); + + _mm_store_ss(reinterpret_cast(pFloat), FV); + pFloat += OutputStride; + *reinterpret_cast(pFloat) = _mm_extract_ps(FV, 1); + pFloat += OutputStride; + *reinterpret_cast(pFloat) = _mm_extract_ps(FV, 2); + pFloat += 
OutputStride; + *reinterpret_cast(pFloat) = _mm_extract_ps(FV, 3); + pFloat += OutputStride; + i += 4; + } + } + } + else if (OutputStride == sizeof(float)) + { + if ((reinterpret_cast(pFloat) & 0xF) == 0) + { + // Scattered input, aligned & packed output + for (size_t j = 0; j < four; ++j) + { + uint16_t H1 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H2 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H3 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H4 = *reinterpret_cast(pHalf); + pHalf += InputStride; + + __m128i HV = _mm_setzero_si128(); + HV = _mm_insert_epi16(HV, H1, 0); + HV = _mm_insert_epi16(HV, H2, 1); + HV = _mm_insert_epi16(HV, H3, 2); + HV = _mm_insert_epi16(HV, H4, 3); + __m128 FV = _mm_cvtph_ps(HV); + + _mm_stream_ps(reinterpret_cast(pFloat), FV); + pFloat += OutputStride * 4; + i += 4; + } + } + else + { + // Scattered input, packed output + for (size_t j = 0; j < four; ++j) + { + uint16_t H1 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H2 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H3 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H4 = *reinterpret_cast(pHalf); + pHalf += InputStride; + + __m128i HV = _mm_setzero_si128(); + HV = _mm_insert_epi16(HV, H1, 0); + HV = _mm_insert_epi16(HV, H2, 1); + HV = _mm_insert_epi16(HV, H3, 2); + HV = _mm_insert_epi16(HV, H4, 3); + __m128 FV = _mm_cvtph_ps(HV); + + _mm_storeu_ps(reinterpret_cast(pFloat), FV); + pFloat += OutputStride * 4; + i += 4; + } + + } + } + else + { + // Scattered input, scattered output + for (size_t j = 0; j < four; ++j) + { + uint16_t H1 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H2 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H3 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H4 = *reinterpret_cast(pHalf); + pHalf += InputStride; + + __m128i HV = _mm_setzero_si128(); + HV = _mm_insert_epi16(HV, H1, 0); + HV = _mm_insert_epi16(HV, H2, 1); + HV = _mm_insert_epi16(HV, H3, 2); + HV = _mm_insert_epi16(HV, H4, 3); + __m128 FV = _mm_cvtph_ps(HV); + + _mm_store_ss(reinterpret_cast(pFloat), FV); + pFloat += OutputStride; + *reinterpret_cast(pFloat) = _mm_extract_ps(FV, 1); + pFloat += OutputStride; + *reinterpret_cast(pFloat) = _mm_extract_ps(FV, 2); + pFloat += OutputStride; + *reinterpret_cast(pFloat) = _mm_extract_ps(FV, 3); + pFloat += OutputStride; + i += 4; + } + } + } + + for (; i < HalfCount; ++i) + { + *reinterpret_cast(pFloat) = XMConvertHalfToFloat(reinterpret_cast(pHalf)[0]); + pHalf += InputStride; + pFloat += OutputStride; + } + + return pOutputStream; +} + + +inline PackedVector::HALF* XMConvertFloatToHalfStream +( + _Out_writes_bytes_(2 + OutputStride * (FloatCount - 1)) PackedVector::HALF* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(float) + InputStride * (FloatCount - 1)) const float* pInputStream, + _In_ size_t InputStride, + _In_ size_t FloatCount +) +{ + using namespace PackedVector; + + assert(pOutputStream); + assert(pInputStream); + + assert(InputStride >= sizeof(float)); + assert(OutputStride >= sizeof(HALF)); + + auto pFloat = reinterpret_cast(pInputStream); + auto pHalf = reinterpret_cast(pOutputStream); + + size_t i = 0; + size_t four = FloatCount >> 2; + if (four > 0) + { + if (InputStride == sizeof(float)) + { + if (OutputStride == sizeof(HALF)) + { + if ((reinterpret_cast(pFloat) & 0xF) == 0) + { + // Aligned and packed input, packed output + for (size_t j = 0; j < four; ++j) + { + __m128 FV = 
_mm_load_ps(reinterpret_cast(pFloat)); + pFloat += InputStride * 4; + + __m128i HV = _mm_cvtps_ph(FV, 0); + + _mm_storel_epi64(reinterpret_cast<__m128i*>(pHalf), HV); + pHalf += OutputStride * 4; + i += 4; + } + } + else + { + // Packed input, packed output + for (size_t j = 0; j < four; ++j) + { + __m128 FV = _mm_loadu_ps(reinterpret_cast(pFloat)); + pFloat += InputStride * 4; + + __m128i HV = _mm_cvtps_ph(FV, 0); + + _mm_storel_epi64(reinterpret_cast<__m128i*>(pHalf), HV); + pHalf += OutputStride * 4; + i += 4; + } + } + } + else + { + if ((reinterpret_cast(pFloat) & 0xF) == 0) + { + // Aligned & packed input, scattered output + for (size_t j = 0; j < four; ++j) + { + __m128 FV = _mm_load_ps(reinterpret_cast(pFloat)); + pFloat += InputStride * 4; + + __m128i HV = _mm_cvtps_ph(FV, 0); + + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 0)); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 1)); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 2)); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 3)); + pHalf += OutputStride; + i += 4; + } + } + else + { + // Packed input, scattered output + for (size_t j = 0; j < four; ++j) + { + __m128 FV = _mm_loadu_ps(reinterpret_cast(pFloat)); + pFloat += InputStride * 4; + + __m128i HV = _mm_cvtps_ph(FV, 0); + + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 0)); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 1)); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 2)); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 3)); + pHalf += OutputStride; + i += 4; + } + } + } + } + else if (OutputStride == sizeof(HALF)) + { + // Scattered input, packed output + for (size_t j = 0; j < four; ++j) + { + __m128 FV1 = _mm_load_ss(reinterpret_cast(pFloat)); + pFloat += InputStride; + + __m128 FV2 = _mm_broadcast_ss(reinterpret_cast(pFloat)); + pFloat += InputStride; + + __m128 FV3 = _mm_broadcast_ss(reinterpret_cast(pFloat)); + pFloat += InputStride; + + __m128 FV4 = _mm_broadcast_ss(reinterpret_cast(pFloat)); + pFloat += InputStride; + + __m128 FV = _mm_blend_ps(FV1, FV2, 0x2); + __m128 FT = _mm_blend_ps(FV3, FV4, 0x8); + FV = _mm_blend_ps(FV, FT, 0xC); + + __m128i HV = _mm_cvtps_ph(FV, 0); + + _mm_storel_epi64(reinterpret_cast<__m128i*>(pHalf), HV); + pHalf += OutputStride * 4; + i += 4; + } + } + else + { + // Scattered input, scattered output + for (size_t j = 0; j < four; ++j) + { + __m128 FV1 = _mm_load_ss(reinterpret_cast(pFloat)); + pFloat += InputStride; + + __m128 FV2 = _mm_broadcast_ss(reinterpret_cast(pFloat)); + pFloat += InputStride; + + __m128 FV3 = _mm_broadcast_ss(reinterpret_cast(pFloat)); + pFloat += InputStride; + + __m128 FV4 = _mm_broadcast_ss(reinterpret_cast(pFloat)); + pFloat += InputStride; + + __m128 FV = _mm_blend_ps(FV1, FV2, 0x2); + __m128 FT = _mm_blend_ps(FV3, FV4, 0x8); + FV = _mm_blend_ps(FV, FT, 0xC); + + __m128i HV = _mm_cvtps_ph(FV, 0); + + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 0)); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 1)); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 2)); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 3)); + pHalf += OutputStride; + i += 4; + } + } + } + + for (; i < FloatCount; ++i) + { + 
*reinterpret_cast(pHalf) = XMConvertFloatToHalf(reinterpret_cast(pFloat)[0]); + pFloat += InputStride; + pHalf += OutputStride; + } + + return pOutputStream; +} + + +//------------------------------------------------------------------------------------- +// Half2 +//------------------------------------------------------------------------------------- + +inline XMVECTOR XM_CALLCONV XMLoadHalf2( _In_ const PackedVector::XMHALF2* pSource ) +{ + assert(pSource); + __m128 V = _mm_load_ss( reinterpret_cast(pSource) ); + return _mm_cvtph_ps( _mm_castps_si128( V ) ); +} + +inline void XM_CALLCONV XMStoreHalf2( _Out_ PackedVector::XMHALF2* pDestination, _In_ FXMVECTOR V ) +{ + assert(pDestination); + __m128i V1 = _mm_cvtps_ph( V, 0 ); + _mm_store_ss( reinterpret_cast(pDestination), _mm_castsi128_ps(V1) ); +} + + +//------------------------------------------------------------------------------------- +// Half4 +//------------------------------------------------------------------------------------- + +inline XMVECTOR XM_CALLCONV XMLoadHalf4( _In_ const PackedVector::XMHALF4* pSource ) +{ + assert(pSource); + __m128i V = _mm_loadl_epi64( reinterpret_cast(pSource) ); + return _mm_cvtph_ps( V ); +} + +inline void XM_CALLCONV XMStoreHalf4( _Out_ PackedVector::XMHALF4* pDestination, _In_ FXMVECTOR V ) +{ + assert(pDestination); + __m128i V1 = _mm_cvtps_ph( V, 0 ); + _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), V1 ); +} + +} // namespace F16C + +} // namespace DirectX diff --git a/src/thirdparty/DirectXMath-dec2022/Extensions/DirectXMathFMA3.h b/src/thirdparty/DirectXMath-dec2022/Extensions/DirectXMathFMA3.h new file mode 100644 index 000000000..8fae18e0b --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/Extensions/DirectXMathFMA3.h @@ -0,0 +1,391 @@ +//------------------------------------------------------------------------------------- +// DirectXMathFMA3.h -- FMA3 extensions for SIMD C++ Math library +// +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
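+//
+// NOTE: the overloads below assume FMA3 and AVX are available; call
+// XMVerifyFMA3Support() once at startup and only route math through the
+// DirectX::FMA3 namespace when it returns true.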
+// +// http://go.microsoft.com/fwlink/?LinkID=615560 +//------------------------------------------------------------------------------------- + +#pragma once + +#if defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __arm__ || __aarch64__ +#error FMA3 not supported on ARM platform +#endif + +#include + +namespace DirectX +{ + +namespace FMA3 +{ + +inline bool XMVerifyFMA3Support() +{ + // Should return true for AMD "Pildriver" and Intel "Haswell" processors + // with OS support for AVX (Windows 7 Service Pack 1, Windows Server 2008 R2 Service Pack 1, Windows 8, Windows Server 2012) + + // See http://msdn.microsoft.com/en-us/library/hskdteyh.aspx + int CPUInfo[4] = {-1}; +#if defined(__clang__) || defined(__GNUC__) + __cpuid(0, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]); +#else + __cpuid(CPUInfo, 0); +#endif + + if ( CPUInfo[0] < 1 ) + return false; + +#if defined(__clang__) || defined(__GNUC__) + __cpuid(1, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]); +#else + __cpuid(CPUInfo, 1); +#endif + + // We check for FMA3, AVX, OSXSAVE + return ( (CPUInfo[2] & 0x18001000) == 0x18001000 ); +} + + +//------------------------------------------------------------------------------------- +// Vector +//------------------------------------------------------------------------------------- + +inline XMVECTOR XM_CALLCONV XMVectorMultiplyAdd +( + FXMVECTOR V1, + FXMVECTOR V2, + FXMVECTOR V3 +) +{ + return _mm_fmadd_ps( V1, V2, V3 ); +} + +inline XMVECTOR XM_CALLCONV XMVectorNegativeMultiplySubtract +( + FXMVECTOR V1, + FXMVECTOR V2, + FXMVECTOR V3 +) +{ + return _mm_fnmadd_ps( V1, V2, V3 ); +} + + +//------------------------------------------------------------------------------------- +// Vector2 +//------------------------------------------------------------------------------------- + +inline XMVECTOR XM_CALLCONV XMVector2Transform +( + FXMVECTOR V, + CXMMATRIX M +) +{ + XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y + vResult = _mm_fmadd_ps( vResult, M.r[1], M.r[3] ); + XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X + vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult ); + return vResult; +} + +inline XMVECTOR XM_CALLCONV XMVector2TransformCoord +( + FXMVECTOR V, + CXMMATRIX M +) +{ + XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y + vResult = _mm_fmadd_ps( vResult, M.r[1], M.r[3] ); + XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X + vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult ); + XMVECTOR W = _mm_permute_ps(vResult,_MM_SHUFFLE(3,3,3,3)); + vResult = _mm_div_ps( vResult, W ); + return vResult; +} + +inline XMVECTOR XM_CALLCONV XMVector2TransformNormal +( + FXMVECTOR V, + CXMMATRIX M +) +{ + XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y + vResult = _mm_mul_ps( vResult, M.r[1] ); + XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X + vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult ); + return vResult; +} + + +//------------------------------------------------------------------------------------- +// Vector3 +//------------------------------------------------------------------------------------- + +inline XMVECTOR XM_CALLCONV XMVector3Transform +( + FXMVECTOR V, + CXMMATRIX M +) +{ + XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(2,2,2,2)); // Z + vResult = _mm_fmadd_ps( vResult, M.r[2], M.r[3] ); + XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y + vResult = _mm_fmadd_ps( vTemp, M.r[1], vResult ); + vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X + vResult = 
_mm_fmadd_ps( vTemp, M.r[0], vResult ); + return vResult; +} + +inline XMVECTOR XM_CALLCONV XMVector3TransformCoord +( + FXMVECTOR V, + CXMMATRIX M +) +{ + XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(2,2,2,2)); // Z + vResult = _mm_fmadd_ps( vResult, M.r[2], M.r[3] ); + XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y + vResult = _mm_fmadd_ps( vTemp, M.r[1], vResult ); + vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X + vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult ); + XMVECTOR W = _mm_permute_ps(vResult,_MM_SHUFFLE(3,3,3,3)); + vResult = _mm_div_ps( vResult, W ); + return vResult; +} + +inline XMVECTOR XM_CALLCONV XMVector3TransformNormal +( + FXMVECTOR V, + CXMMATRIX M +) +{ + XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(2,2,2,2)); // Z + vResult = _mm_mul_ps( vResult, M.r[2] ); + XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y + vResult = _mm_fmadd_ps( vTemp, M.r[1], vResult ); + vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X + vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult ); + return vResult; +} + +XMMATRIX XM_CALLCONV XMMatrixMultiply(CXMMATRIX M1, CXMMATRIX M2); + +inline XMVECTOR XM_CALLCONV XMVector3Project +( + FXMVECTOR V, + float ViewportX, + float ViewportY, + float ViewportWidth, + float ViewportHeight, + float ViewportMinZ, + float ViewportMaxZ, + CXMMATRIX Projection, + CXMMATRIX View, + CXMMATRIX World +) +{ + const float HalfViewportWidth = ViewportWidth * 0.5f; + const float HalfViewportHeight = ViewportHeight * 0.5f; + + XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight, ViewportMaxZ - ViewportMinZ, 0.0f); + XMVECTOR Offset = XMVectorSet(ViewportX + HalfViewportWidth, ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f); + + XMMATRIX Transform = FMA3::XMMatrixMultiply(World, View); + Transform = FMA3::XMMatrixMultiply(Transform, Projection); + + XMVECTOR Result = FMA3::XMVector3TransformCoord(V, Transform); + + Result = FMA3::XMVectorMultiplyAdd(Result, Scale, Offset); + + return Result; +} + +inline XMVECTOR XM_CALLCONV XMVector3Unproject +( + FXMVECTOR V, + float ViewportX, + float ViewportY, + float ViewportWidth, + float ViewportHeight, + float ViewportMinZ, + float ViewportMaxZ, + CXMMATRIX Projection, + CXMMATRIX View, + CXMMATRIX World +) +{ + static const XMVECTORF32 D = { { { -1.0f, 1.0f, 0.0f, 0.0f } } }; + + XMVECTOR Scale = XMVectorSet(ViewportWidth * 0.5f, -ViewportHeight * 0.5f, ViewportMaxZ - ViewportMinZ, 1.0f); + Scale = XMVectorReciprocal(Scale); + + XMVECTOR Offset = XMVectorSet(-ViewportX, -ViewportY, -ViewportMinZ, 0.0f); + Offset = FMA3::XMVectorMultiplyAdd(Scale, Offset, D.v); + + XMMATRIX Transform = FMA3::XMMatrixMultiply(World, View); + Transform = FMA3::XMMatrixMultiply(Transform, Projection); + Transform = XMMatrixInverse(nullptr, Transform); + + XMVECTOR Result = FMA3::XMVectorMultiplyAdd(V, Scale, Offset); + + return FMA3::XMVector3TransformCoord(Result, Transform); +} + + +//------------------------------------------------------------------------------------- +// Vector4 +//------------------------------------------------------------------------------------- + +inline XMVECTOR XM_CALLCONV XMVector4Transform +( + FXMVECTOR V, + CXMMATRIX M +) +{ + XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(3,3,3,3)); // W + vResult = _mm_mul_ps( vResult, M.r[3] ); + XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(2,2,2,2)); // Z + vResult = _mm_fmadd_ps( vTemp, M.r[2], vResult ); + vTemp = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y + vResult = _mm_fmadd_ps( vTemp, M.r[1], vResult ); + 
vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X + vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult ); + return vResult; +} + + +//------------------------------------------------------------------------------------- +// Matrix +//------------------------------------------------------------------------------------- + +inline XMMATRIX XM_CALLCONV XMMatrixMultiply +( + CXMMATRIX M1, + CXMMATRIX M2 +) +{ + XMMATRIX mResult; + // Use vW to hold the original row + XMVECTOR vW = M1.r[0]; + // Splat the component X,Y,Z then W + XMVECTOR vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); + XMVECTOR vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + // Perform the operation on the first row + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_fmadd_ps(vY,M2.r[1],vX); + vX = _mm_fmadd_ps(vZ,M2.r[2],vX); + vX = _mm_fmadd_ps(vW,M2.r[3],vX); + mResult.r[0] = vX; + // Repeat for the other 3 rows + vW = M1.r[1]; + vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); + vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_fmadd_ps(vY,M2.r[1],vX); + vX = _mm_fmadd_ps(vZ,M2.r[2],vX); + vX = _mm_fmadd_ps(vW,M2.r[3],vX); + mResult.r[1] = vX; + vW = M1.r[2]; + vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); + vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_fmadd_ps(vY,M2.r[1],vX); + vX = _mm_fmadd_ps(vZ,M2.r[2],vX); + vX = _mm_fmadd_ps(vW,M2.r[3],vX); + mResult.r[2] = vX; + vW = M1.r[3]; + vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); + vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_fmadd_ps(vY,M2.r[1],vX); + vX = _mm_fmadd_ps(vZ,M2.r[2],vX); + vX = _mm_fmadd_ps(vW,M2.r[3],vX); + mResult.r[3] = vX; + return mResult; +} + +inline XMMATRIX XM_CALLCONV XMMatrixMultiplyTranspose +( + FXMMATRIX M1, + CXMMATRIX M2 +) +{ + // Use vW to hold the original row + XMVECTOR vW = M1.r[0]; + // Splat the component X,Y,Z then W + XMVECTOR vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); + XMVECTOR vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + // Perform the operation on the first row + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_fmadd_ps(vY,M2.r[1],vX); + vX = _mm_fmadd_ps(vZ,M2.r[2],vX); + vX = _mm_fmadd_ps(vW,M2.r[3],vX); + __m128 r0 = vX; + // Repeat for the other 3 rows + vW = M1.r[1]; + vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); + vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_fmadd_ps(vY,M2.r[1],vX); + vX = _mm_fmadd_ps(vZ,M2.r[2],vX); + vX = _mm_fmadd_ps(vW,M2.r[3],vX); + __m128 r1 = vX; + vW = M1.r[2]; + vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); + vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_fmadd_ps(vY,M2.r[1],vX); + vX = _mm_fmadd_ps(vZ,M2.r[2],vX); + vX = _mm_fmadd_ps(vW,M2.r[3],vX); + __m128 r2 = vX; + vW = M1.r[3]; + vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); + 
vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_fmadd_ps(vY,M2.r[1],vX); + vX = _mm_fmadd_ps(vZ,M2.r[2],vX); + vX = _mm_fmadd_ps(vW,M2.r[3],vX); + __m128 r3 = vX; + + // x.x,x.y,y.x,y.y + XMVECTOR vTemp1 = _mm_shuffle_ps(r0,r1,_MM_SHUFFLE(1,0,1,0)); + // x.z,x.w,y.z,y.w + XMVECTOR vTemp3 = _mm_shuffle_ps(r0,r1,_MM_SHUFFLE(3,2,3,2)); + // z.x,z.y,w.x,w.y + XMVECTOR vTemp2 = _mm_shuffle_ps(r2,r3,_MM_SHUFFLE(1,0,1,0)); + // z.z,z.w,w.z,w.w + XMVECTOR vTemp4 = _mm_shuffle_ps(r2,r3,_MM_SHUFFLE(3,2,3,2)); + + XMMATRIX mResult; + // x.x,y.x,z.x,w.x + mResult.r[0] = _mm_shuffle_ps(vTemp1, vTemp2,_MM_SHUFFLE(2,0,2,0)); + // x.y,y.y,z.y,w.y + mResult.r[1] = _mm_shuffle_ps(vTemp1, vTemp2,_MM_SHUFFLE(3,1,3,1)); + // x.z,y.z,z.z,w.z + mResult.r[2] = _mm_shuffle_ps(vTemp3, vTemp4,_MM_SHUFFLE(2,0,2,0)); + // x.w,y.w,z.w,w.w + mResult.r[3] = _mm_shuffle_ps(vTemp3, vTemp4,_MM_SHUFFLE(3,1,3,1)); + return mResult; +} + +} // namespace FMA3 + +} // namespace DirectX; diff --git a/src/thirdparty/DirectXMath-dec2022/Extensions/DirectXMathFMA4.h b/src/thirdparty/DirectXMath-dec2022/Extensions/DirectXMathFMA4.h new file mode 100644 index 000000000..2cec13e38 --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/Extensions/DirectXMathFMA4.h @@ -0,0 +1,415 @@ +//------------------------------------------------------------------------------------- +// DirectXMathFMA4.h -- FMA4 extensions for SIMD C++ Math library +// +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. +// +// http://go.microsoft.com/fwlink/?LinkID=615560 +//------------------------------------------------------------------------------------- + +#pragma once + +#if defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __arm__ || __aarch64__ +#error FMA4 not supported on ARM platform +#endif + +#include +#include + +#ifdef __GNUC__ +#include +#endif + +namespace DirectX +{ + +namespace FMA4 +{ + +inline bool XMVerifyFMA4Support() +{ + // Should return true for AMD Bulldozer processors + // with OS support for AVX (Windows 7 Service Pack 1, Windows Server 2008 R2 Service Pack 1, Windows 8, Windows Server 2012) + + // See http://msdn.microsoft.com/en-us/library/hskdteyh.aspx + int CPUInfo[4] = {-1}; +#if defined(__clang__) || defined(__GNUC__) + __cpuid(0, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]); +#else + __cpuid(CPUInfo, 0); +#endif + + if ( CPUInfo[0] < 1 ) + return false; + +#if defined(__clang__) || defined(__GNUC__) + __cpuid(1, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]); +#else + __cpuid(CPUInfo, 1); +#endif + + // We check for AVX, OSXSAVE (required to access FMA4) + if ( (CPUInfo[2] & 0x18000000) != 0x18000000 ) + return false; + +#if defined(__clang__) || defined(__GNUC__) + __cpuid(0x80000000, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]); +#else + __cpuid(CPUInfo, 0x80000000); +#endif + + if ( uint32_t(CPUInfo[0]) < 0x80000001u ) + return false; + + // We check for FMA4 +#if defined(__clang__) || defined(__GNUC__) + __cpuid(0x80000001, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]); +#else + __cpuid(CPUInfo, 0x80000001); +#endif + + return ( CPUInfo[2] & 0x10000 ); +} + + +//------------------------------------------------------------------------------------- +// Vector +//------------------------------------------------------------------------------------- + +inline XMVECTOR XM_CALLCONV XMVectorMultiplyAdd +( + FXMVECTOR V1, + FXMVECTOR V2, + 
FXMVECTOR V3 +) +{ + return _mm_macc_ps( V1, V2, V3 ); +} + +inline XMVECTOR XM_CALLCONV XMVectorNegativeMultiplySubtract +( + FXMVECTOR V1, + FXMVECTOR V2, + FXMVECTOR V3 +) +{ + return _mm_nmacc_ps( V1, V2, V3 ); +} + + +//------------------------------------------------------------------------------------- +// Vector2 +//------------------------------------------------------------------------------------- + +inline XMVECTOR XM_CALLCONV XMVector2Transform +( + FXMVECTOR V, + CXMMATRIX M +) +{ + XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y + vResult = _mm_macc_ps( vResult, M.r[1], M.r[3] ); + XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X + vResult = _mm_macc_ps( vTemp, M.r[0], vResult ); + return vResult; +} + +inline XMVECTOR XM_CALLCONV XMVector2TransformCoord +( + FXMVECTOR V, + CXMMATRIX M +) +{ + XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y + vResult = _mm_macc_ps( vResult, M.r[1], M.r[3] ); + XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X + vResult = _mm_macc_ps( vTemp, M.r[0], vResult ); + XMVECTOR W = _mm_permute_ps(vResult,_MM_SHUFFLE(3,3,3,3)); + vResult = _mm_div_ps( vResult, W ); + return vResult; +} + +inline XMVECTOR XM_CALLCONV XMVector2TransformNormal +( + FXMVECTOR V, + CXMMATRIX M +) +{ + XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y + vResult = _mm_mul_ps( vResult, M.r[1] ); + XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X + vResult = _mm_macc_ps( vTemp, M.r[0], vResult ); + return vResult; +} + + +//------------------------------------------------------------------------------------- +// Vector3 +//------------------------------------------------------------------------------------- + +inline XMVECTOR XM_CALLCONV XMVector3Transform +( + FXMVECTOR V, + CXMMATRIX M +) +{ + XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(2,2,2,2)); // Z + vResult = _mm_macc_ps( vResult, M.r[2], M.r[3] ); + XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y + vResult = _mm_macc_ps( vTemp, M.r[1], vResult ); + vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X + vResult = _mm_macc_ps( vTemp, M.r[0], vResult ); + return vResult; +} + +inline XMVECTOR XM_CALLCONV XMVector3TransformCoord +( + FXMVECTOR V, + CXMMATRIX M +) +{ + XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(2,2,2,2)); // Z + vResult = _mm_macc_ps( vResult, M.r[2], M.r[3] ); + XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y + vResult = _mm_macc_ps( vTemp, M.r[1], vResult ); + vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X + vResult = _mm_macc_ps( vTemp, M.r[0], vResult ); + XMVECTOR W = _mm_permute_ps(vResult,_MM_SHUFFLE(3,3,3,3)); + vResult = _mm_div_ps( vResult, W ); + return vResult; +} + +inline XMVECTOR XM_CALLCONV XMVector3TransformNormal +( + FXMVECTOR V, + CXMMATRIX M +) +{ + XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(2,2,2,2)); // Z + vResult = _mm_mul_ps( vResult, M.r[2] ); + XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y + vResult = _mm_macc_ps( vTemp, M.r[1], vResult ); + vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X + vResult = _mm_macc_ps( vTemp, M.r[0], vResult ); + return vResult; +} + +XMMATRIX XM_CALLCONV XMMatrixMultiply(CXMMATRIX M1, CXMMATRIX M2); + +inline XMVECTOR XM_CALLCONV XMVector3Project +( + FXMVECTOR V, + float ViewportX, + float ViewportY, + float ViewportWidth, + float ViewportHeight, + float ViewportMinZ, + float ViewportMaxZ, + CXMMATRIX Projection, + CXMMATRIX View, + CXMMATRIX World +) +{ + const float HalfViewportWidth = 
ViewportWidth * 0.5f; + const float HalfViewportHeight = ViewportHeight * 0.5f; + + XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight, ViewportMaxZ - ViewportMinZ, 0.0f); + XMVECTOR Offset = XMVectorSet(ViewportX + HalfViewportWidth, ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f); + + XMMATRIX Transform = FMA4::XMMatrixMultiply(World, View); + Transform = FMA4::XMMatrixMultiply(Transform, Projection); + + XMVECTOR Result = FMA4::XMVector3TransformCoord(V, Transform); + + Result = FMA4::XMVectorMultiplyAdd(Result, Scale, Offset); + + return Result; +} + +inline XMVECTOR XM_CALLCONV XMVector3Unproject +( + FXMVECTOR V, + float ViewportX, + float ViewportY, + float ViewportWidth, + float ViewportHeight, + float ViewportMinZ, + float ViewportMaxZ, + CXMMATRIX Projection, + CXMMATRIX View, + CXMMATRIX World +) +{ + static const XMVECTORF32 D = { { { -1.0f, 1.0f, 0.0f, 0.0f } } }; + + XMVECTOR Scale = XMVectorSet(ViewportWidth * 0.5f, -ViewportHeight * 0.5f, ViewportMaxZ - ViewportMinZ, 1.0f); + Scale = XMVectorReciprocal(Scale); + + XMVECTOR Offset = XMVectorSet(-ViewportX, -ViewportY, -ViewportMinZ, 0.0f); + Offset = FMA4::XMVectorMultiplyAdd(Scale, Offset, D.v); + + XMMATRIX Transform = FMA4::XMMatrixMultiply(World, View); + Transform = FMA4::XMMatrixMultiply(Transform, Projection); + Transform = XMMatrixInverse(nullptr, Transform); + + XMVECTOR Result = FMA4::XMVectorMultiplyAdd(V, Scale, Offset); + + return FMA4::XMVector3TransformCoord(Result, Transform); +} + + +//------------------------------------------------------------------------------------- +// Vector4 +//------------------------------------------------------------------------------------- + +inline XMVECTOR XM_CALLCONV XMVector4Transform +( + FXMVECTOR V, + CXMMATRIX M +) +{ + XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(3,3,3,3)); // W + vResult = _mm_mul_ps( vResult, M.r[3] ); + XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(2,2,2,2)); // Z + vResult = _mm_macc_ps( vTemp, M.r[2], vResult ); + vTemp = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y + vResult = _mm_macc_ps( vTemp, M.r[1], vResult ); + vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X + vResult = _mm_macc_ps( vTemp, M.r[0], vResult ); + return vResult; +} + + +//------------------------------------------------------------------------------------- +// Matrix +//------------------------------------------------------------------------------------- + +inline XMMATRIX XM_CALLCONV XMMatrixMultiply +( + CXMMATRIX M1, + CXMMATRIX M2 +) +{ + XMMATRIX mResult; + // Use vW to hold the original row + XMVECTOR vW = M1.r[0]; + // Splat the component X,Y,Z then W + XMVECTOR vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); + XMVECTOR vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + // Perform the operation on the first row + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_macc_ps(vY,M2.r[1],vX); + vX = _mm_macc_ps(vZ,M2.r[2],vX); + vX = _mm_macc_ps(vW,M2.r[3],vX); + mResult.r[0] = vX; + // Repeat for the other 3 rows + vW = M1.r[1]; + vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); + vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_macc_ps(vY,M2.r[1],vX); + vX = _mm_macc_ps(vZ,M2.r[2],vX); + vX = _mm_macc_ps(vW,M2.r[3],vX); + mResult.r[1] = vX; + vW = M1.r[2]; + vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); + vY = 
_mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_macc_ps(vY,M2.r[1],vX); + vX = _mm_macc_ps(vZ,M2.r[2],vX); + vX = _mm_macc_ps(vW,M2.r[3],vX); + mResult.r[2] = vX; + vW = M1.r[3]; + vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); + vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_macc_ps(vY,M2.r[1],vX); + vX = _mm_macc_ps(vZ,M2.r[2],vX); + vX = _mm_macc_ps(vW,M2.r[3],vX); + mResult.r[3] = vX; + return mResult; +} + +inline XMMATRIX XM_CALLCONV XMMatrixMultiplyTranspose +( + FXMMATRIX M1, + CXMMATRIX M2 +) +{ + // Use vW to hold the original row + XMVECTOR vW = M1.r[0]; + // Splat the component X,Y,Z then W + XMVECTOR vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); + XMVECTOR vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + // Perform the operation on the first row + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_macc_ps(vY,M2.r[1],vX); + vX = _mm_macc_ps(vZ,M2.r[2],vX); + vX = _mm_macc_ps(vW,M2.r[3],vX); + __m128 r0 = vX; + // Repeat for the other 3 rows + vW = M1.r[1]; + vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); + vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_macc_ps(vY,M2.r[1],vX); + vX = _mm_macc_ps(vZ,M2.r[2],vX); + vX = _mm_macc_ps(vW,M2.r[3],vX); + __m128 r1 = vX; + vW = M1.r[2]; + vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); + vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_macc_ps(vY,M2.r[1],vX); + vX = _mm_macc_ps(vZ,M2.r[2],vX); + vX = _mm_macc_ps(vW,M2.r[3],vX); + __m128 r2 = vX; + vW = M1.r[3]; + vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); + vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_macc_ps(vY,M2.r[1],vX); + vX = _mm_macc_ps(vZ,M2.r[2],vX); + vX = _mm_macc_ps(vW,M2.r[3],vX); + __m128 r3 = vX; + + // x.x,x.y,y.x,y.y + XMVECTOR vTemp1 = _mm_shuffle_ps(r0,r1,_MM_SHUFFLE(1,0,1,0)); + // x.z,x.w,y.z,y.w + XMVECTOR vTemp3 = _mm_shuffle_ps(r0,r1,_MM_SHUFFLE(3,2,3,2)); + // z.x,z.y,w.x,w.y + XMVECTOR vTemp2 = _mm_shuffle_ps(r2,r3,_MM_SHUFFLE(1,0,1,0)); + // z.z,z.w,w.z,w.w + XMVECTOR vTemp4 = _mm_shuffle_ps(r2,r3,_MM_SHUFFLE(3,2,3,2)); + + XMMATRIX mResult; + // x.x,y.x,z.x,w.x + mResult.r[0] = _mm_shuffle_ps(vTemp1, vTemp2,_MM_SHUFFLE(2,0,2,0)); + // x.y,y.y,z.y,w.y + mResult.r[1] = _mm_shuffle_ps(vTemp1, vTemp2,_MM_SHUFFLE(3,1,3,1)); + // x.z,y.z,z.z,w.z + mResult.r[2] = _mm_shuffle_ps(vTemp3, vTemp4,_MM_SHUFFLE(2,0,2,0)); + // x.w,y.w,z.w,w.w + mResult.r[3] = _mm_shuffle_ps(vTemp3, vTemp4,_MM_SHUFFLE(3,1,3,1)); + return mResult; +} + +} // namespace FMA4 + +} // namespace DirectX; diff --git a/src/thirdparty/DirectXMath-dec2022/Extensions/DirectXMathSSE3.h b/src/thirdparty/DirectXMath-dec2022/Extensions/DirectXMathSSE3.h new file mode 100644 index 000000000..926de4a9b --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/Extensions/DirectXMathSSE3.h @@ -0,0 +1,111 @@ 
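+// NOTE: this header relies only on SSE3 horizontal adds (haddps) for its dot
+// product and length helpers; verify availability with XMVerifySSE3Support()
+// before preferring these over the baseline implementations.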
+//------------------------------------------------------------------------------------- +// DirectXMathSSE3.h -- SSE3 extensions for SIMD C++ Math library +// +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. +// +// http://go.microsoft.com/fwlink/?LinkID=615560 +//------------------------------------------------------------------------------------- + +#pragma once + +#if defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __arm__ || __aarch64__ +#error SSE3 not supported on ARM platform +#endif + +#include + +#include + +namespace DirectX +{ + +namespace SSE3 +{ + +inline bool XMVerifySSE3Support() +{ + // Should return true on AMD Athlon 64, AMD Phenom, and Intel Pentium 4 or later processors + + // See http://msdn.microsoft.com/en-us/library/hskdteyh.aspx + int CPUInfo[4] = { -1 }; +#if defined(__clang__) || defined(__GNUC__) + __cpuid(0, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]); +#else + __cpuid(CPUInfo, 0); +#endif + if ( CPUInfo[0] < 1 ) + return false; + +#if defined(__clang__) || defined(__GNUC__) + __cpuid(1, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]); +#else + __cpuid(CPUInfo, 1); +#endif + + // We only check for SSE3 instruction set. SSSE3 instructions are not used. + return ( (CPUInfo[2] & 0x1) != 0 ); +} + +inline XMVECTOR XM_CALLCONV XMVector2Dot +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ + XMVECTOR vTemp = _mm_mul_ps(V1,V2); + vTemp = _mm_hadd_ps(vTemp,vTemp); + return _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(0,0,0,0)); +} + +inline XMVECTOR XM_CALLCONV XMVector2LengthSq( FXMVECTOR V ) +{ + return SSE3::XMVector2Dot(V, V); +} + +inline XMVECTOR XM_CALLCONV XMVector3Dot +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ + XMVECTOR vTemp = _mm_mul_ps(V1,V2); + vTemp = _mm_and_ps( vTemp, g_XMMask3 ); + vTemp = _mm_hadd_ps(vTemp,vTemp); + return _mm_hadd_ps(vTemp,vTemp); +} + +inline XMVECTOR XM_CALLCONV XMVector3LengthSq( FXMVECTOR V ) +{ + return SSE3::XMVector3Dot(V, V); +} + +inline XMVECTOR XM_CALLCONV XMVector4Dot +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ + XMVECTOR vTemp = _mm_mul_ps(V1,V2); + vTemp = _mm_hadd_ps( vTemp, vTemp ); + return _mm_hadd_ps( vTemp, vTemp ); +} + +inline XMVECTOR XM_CALLCONV XMVector4LengthSq( FXMVECTOR V ) +{ + return SSE3::XMVector4Dot(V, V); +} + +inline XMVECTOR XM_CALLCONV XMVectorSwizzle_0022( FXMVECTOR V ) +{ + return _mm_moveldup_ps(V); +} + +inline XMVECTOR XM_CALLCONV XMVectorSwizzle_1133( FXMVECTOR V ) +{ + return _mm_movehdup_ps(V); +} + +} // namespace SSE3 + +} // namespace DirectX diff --git a/src/thirdparty/DirectXMath-dec2022/Extensions/DirectXMathSSE4.h b/src/thirdparty/DirectXMath-dec2022/Extensions/DirectXMathSSE4.h new file mode 100644 index 000000000..4e432a986 --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/Extensions/DirectXMathSSE4.h @@ -0,0 +1,417 @@ +//------------------------------------------------------------------------------------- +// DirectXMathSSE4.h -- SSE4.1 extensions for SIMD C++ Math library +// +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
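+//
+// NOTE: only SSE4.1 instructions are used here (dpps dot products, rounding,
+// lane insert/extract); SSE4.2 is not required. Gate callers on
+// XMVerifySSE4Support().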
+//
+// http://go.microsoft.com/fwlink/?LinkID=615560
+//-------------------------------------------------------------------------------------
+
+#pragma once
+
+#if defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __arm__ || __aarch64__
+#error SSE4 not supported on ARM platform
+#endif
+
+#include <DirectXMath.h>
+
+#include <smmintrin.h>
+
+namespace DirectX
+{
+
+namespace SSE4
+{
+
+inline bool XMVerifySSE4Support()
+{
+    // Should return true on AMD Bulldozer, Intel Core 2 ("Penryn"), and Intel Core i7 ("Nehalem") or later processors
+
+    // See http://msdn.microsoft.com/en-us/library/hskdteyh.aspx
+    int CPUInfo[4] = { -1 };
+#if defined(__clang__) || defined(__GNUC__)
+    __cpuid(0, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]);
+#else
+    __cpuid(CPUInfo, 0);
+#endif
+    if ( CPUInfo[0] < 1 )
+        return false;
+
+#if defined(__clang__) || defined(__GNUC__)
+    __cpuid(1, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]);
+#else
+    __cpuid(CPUInfo, 1);
+#endif
+
+    // We only check for SSE4.1 instruction set. SSE4.2 instructions are not used.
+    return ( (CPUInfo[2] & 0x80000) == 0x80000 );
+}
+
+
+//-------------------------------------------------------------------------------------
+// Vector
+//-------------------------------------------------------------------------------------
+
+#ifdef __clang__
+#pragma clang diagnostic ignored "-Wundefined-reinterpret-cast"
+#endif
+
+inline void XM_CALLCONV XMVectorGetYPtr(_Out_ float *y, _In_ FXMVECTOR V)
+{
+    assert( y != nullptr );
+    *reinterpret_cast<int*>(y) = _mm_extract_ps( V, 1 );
+}
+
+inline void XM_CALLCONV XMVectorGetZPtr(_Out_ float *z, _In_ FXMVECTOR V)
+{
+    assert( z != nullptr );
+    *reinterpret_cast<int*>(z) = _mm_extract_ps( V, 2 );
+}
+
+inline void XM_CALLCONV XMVectorGetWPtr(_Out_ float *w, _In_ FXMVECTOR V)
+{
+    assert( w != nullptr );
+    *reinterpret_cast<int*>(w) = _mm_extract_ps( V, 3 );
+}
+
+inline uint32_t XM_CALLCONV XMVectorGetIntY(FXMVECTOR V)
+{
+    __m128i V1 = _mm_castps_si128( V );
+    return static_cast<uint32_t>( _mm_extract_epi32( V1, 1 ) );
+}
+
+inline uint32_t XM_CALLCONV XMVectorGetIntZ(FXMVECTOR V)
+{
+    __m128i V1 = _mm_castps_si128( V );
+    return static_cast<uint32_t>( _mm_extract_epi32( V1, 2 ) );
+}
+
+inline uint32_t XM_CALLCONV XMVectorGetIntW(FXMVECTOR V)
+{
+    __m128i V1 = _mm_castps_si128( V );
+    return static_cast<uint32_t>( _mm_extract_epi32( V1, 3 ) );
+}
+
+inline void XM_CALLCONV XMVectorGetIntYPtr(_Out_ uint32_t *y, _In_ FXMVECTOR V)
+{
+    assert( y != nullptr );
+    __m128i V1 = _mm_castps_si128( V );
+    *y = static_cast<uint32_t>( _mm_extract_epi32( V1, 1 ) );
+}
+
+inline void XM_CALLCONV XMVectorGetIntZPtr(_Out_ uint32_t *z, _In_ FXMVECTOR V)
+{
+    assert( z != nullptr );
+    __m128i V1 = _mm_castps_si128( V );
+    *z = static_cast<uint32_t>( _mm_extract_epi32( V1, 2 ) );
+}
+
+inline void XM_CALLCONV XMVectorGetIntWPtr(_Out_ uint32_t *w, _In_ FXMVECTOR V)
+{
+    assert( w != nullptr );
+    __m128i V1 = _mm_castps_si128( V );
+    *w = static_cast<uint32_t>( _mm_extract_epi32( V1, 3 ) );
+}
+
+inline XMVECTOR XM_CALLCONV XMVectorSetY(FXMVECTOR V, float y)
+{
+    XMVECTOR vResult = _mm_set_ss(y);
+    vResult = _mm_insert_ps( V, vResult, 0x10 );
+    return vResult;
+}
+
+inline XMVECTOR XM_CALLCONV XMVectorSetZ(FXMVECTOR V, float z)
+{
+    XMVECTOR vResult = _mm_set_ss(z);
+    vResult = _mm_insert_ps( V, vResult, 0x20 );
+    return vResult;
+}
+
+inline XMVECTOR XM_CALLCONV XMVectorSetW(FXMVECTOR V, float w)
+{
+    XMVECTOR vResult = _mm_set_ss(w);
+    vResult = _mm_insert_ps( V, vResult, 0x30 );
+    return vResult;
+}
+
+inline XMVECTOR XM_CALLCONV XMVectorSetIntY(FXMVECTOR V, uint32_t y)
+{
+    __m128i vResult = _mm_castps_si128( V );
+    vResult = _mm_insert_epi32( vResult, static_cast<int>(y), 1 );
+    return _mm_castsi128_ps( vResult );
+}
+
+inline XMVECTOR XM_CALLCONV XMVectorSetIntZ(FXMVECTOR V, uint32_t z)
+{
+    __m128i vResult = _mm_castps_si128( V );
+    vResult = _mm_insert_epi32( vResult, static_cast<int>(z), 2 );
+    return _mm_castsi128_ps( vResult );
+}
+
+inline XMVECTOR XM_CALLCONV XMVectorSetIntW(FXMVECTOR V, uint32_t w)
+{
+    __m128i vResult = _mm_castps_si128( V );
+    vResult = _mm_insert_epi32( vResult, static_cast<int>(w), 3 );
+    return _mm_castsi128_ps( vResult );
+}
+
+inline XMVECTOR XM_CALLCONV XMVectorRound( FXMVECTOR V )
+{
+    return _mm_round_ps( V, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC );
+}
+
+inline XMVECTOR XM_CALLCONV XMVectorTruncate( FXMVECTOR V )
+{
+    return _mm_round_ps( V, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC );
+}
+
+inline XMVECTOR XM_CALLCONV XMVectorFloor( FXMVECTOR V )
+{
+    return _mm_floor_ps( V );
+}
+
+inline XMVECTOR XM_CALLCONV XMVectorCeiling( FXMVECTOR V )
+{
+    return _mm_ceil_ps( V );
+}
+
+
+//-------------------------------------------------------------------------------------
+// Vector2
+//-------------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVector2Dot( FXMVECTOR V1, FXMVECTOR V2 )
+{
+    return _mm_dp_ps( V1, V2, 0x3f );
+}
+
+inline XMVECTOR XM_CALLCONV XMVector2LengthSq( FXMVECTOR V )
+{
+    return SSE4::XMVector2Dot(V, V);
+}
+
+inline XMVECTOR XM_CALLCONV XMVector2ReciprocalLengthEst( FXMVECTOR V )
+{
+    XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f );
+    return _mm_rsqrt_ps( vTemp );
+}
+
+inline XMVECTOR XM_CALLCONV XMVector2ReciprocalLength( FXMVECTOR V )
+{
+    XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f );
+    XMVECTOR vLengthSq = _mm_sqrt_ps( vTemp );
+    return _mm_div_ps( g_XMOne, vLengthSq );
+}
+
+inline XMVECTOR XM_CALLCONV XMVector2LengthEst( FXMVECTOR V )
+{
+    XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f );
+    return _mm_sqrt_ps( vTemp );
+}
+
+inline XMVECTOR XM_CALLCONV XMVector2Length( FXMVECTOR V )
+{
+    XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f );
+    return _mm_sqrt_ps( vTemp );
+}
+
+inline XMVECTOR XM_CALLCONV XMVector2NormalizeEst( FXMVECTOR V )
+{
+    XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f );
+    XMVECTOR vResult = _mm_rsqrt_ps( vTemp );
+    return _mm_mul_ps(vResult, V);
+}
+
+inline XMVECTOR XM_CALLCONV XMVector2Normalize( FXMVECTOR V )
+{
+    XMVECTOR vLengthSq = _mm_dp_ps( V, V, 0x3f );
+    // Prepare for the division
+    XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
+    // Create zero with a single instruction
+    XMVECTOR vZeroMask = _mm_setzero_ps();
+    // Test for a divide by zero (Must be FP to detect -0.0)
+    vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult);
+    // Failsafe on zero (Or epsilon) length planes
+    // If the length is infinity, set the elements to zero
+    vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity);
+    // Reciprocal mul to perform the normalization
+    vResult = _mm_div_ps(V,vResult);
+    // Any that are infinity, set to zero
+    vResult = _mm_and_ps(vResult,vZeroMask);
+    // Select qnan or result based on infinite length
+    XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN);
+    XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq);
+    vResult = _mm_or_ps(vTemp1,vTemp2);
+    return vResult;
+}
+
+
+//-------------------------------------------------------------------------------------
+// Vector3
+//-------------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVector3Dot( FXMVECTOR V1, FXMVECTOR V2 )
+{
+    return _mm_dp_ps( V1, V2, 0x7f );
+}
+
+inline XMVECTOR
XM_CALLCONV XMVector3LengthSq( FXMVECTOR V ) +{ + return SSE4::XMVector3Dot(V, V); +} + +inline XMVECTOR XM_CALLCONV XMVector3ReciprocalLengthEst( FXMVECTOR V ) +{ + XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f ); + return _mm_rsqrt_ps( vTemp ); +} + +inline XMVECTOR XM_CALLCONV XMVector3ReciprocalLength( FXMVECTOR V ) +{ + XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f ); + XMVECTOR vLengthSq = _mm_sqrt_ps( vTemp ); + return _mm_div_ps( g_XMOne, vLengthSq ); +} + +inline XMVECTOR XM_CALLCONV XMVector3LengthEst( FXMVECTOR V ) +{ + XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f ); + return _mm_sqrt_ps( vTemp ); +} + +inline XMVECTOR XM_CALLCONV XMVector3Length( FXMVECTOR V ) +{ + XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f ); + return _mm_sqrt_ps( vTemp ); +} + +inline XMVECTOR XM_CALLCONV XMVector3NormalizeEst( FXMVECTOR V ) +{ + XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f ); + XMVECTOR vResult = _mm_rsqrt_ps( vTemp ); + return _mm_mul_ps(vResult, V); +} + +inline XMVECTOR XM_CALLCONV XMVector3Normalize( FXMVECTOR V ) +{ + XMVECTOR vLengthSq = _mm_dp_ps( V, V, 0x7f ); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Create zero with a single instruction + XMVECTOR vZeroMask = _mm_setzero_ps(); + // Test for a divide by zero (Must be FP to detect -0.0) + vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); + // Divide to perform the normalization + vResult = _mm_div_ps(V,vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult,vZeroMask); + // Select qnan or result based on infinite length + XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN); + XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq); + vResult = _mm_or_ps(vTemp1,vTemp2); + return vResult; +} + + +//------------------------------------------------------------------------------------- +// Vector4 +//------------------------------------------------------------------------------------- + +inline XMVECTOR XM_CALLCONV XMVector4Dot( FXMVECTOR V1, FXMVECTOR V2 ) +{ + return _mm_dp_ps( V1, V2, 0xff ); +} + +inline XMVECTOR XM_CALLCONV XMVector4LengthSq( FXMVECTOR V ) +{ + return SSE4::XMVector4Dot(V, V); +} + +inline XMVECTOR XM_CALLCONV XMVector4ReciprocalLengthEst( FXMVECTOR V ) +{ + XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff ); + return _mm_rsqrt_ps( vTemp ); +} + +inline XMVECTOR XM_CALLCONV XMVector4ReciprocalLength( FXMVECTOR V ) +{ + XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff ); + XMVECTOR vLengthSq = _mm_sqrt_ps( vTemp ); + return _mm_div_ps( g_XMOne, vLengthSq ); +} + +inline XMVECTOR XM_CALLCONV XMVector4LengthEst( FXMVECTOR V ) +{ + XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff ); + return _mm_sqrt_ps( vTemp ); +} + +inline XMVECTOR XM_CALLCONV XMVector4Length( FXMVECTOR V ) +{ + XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff ); + return _mm_sqrt_ps( vTemp ); +} + +inline XMVECTOR XM_CALLCONV XMVector4NormalizeEst( FXMVECTOR V ) +{ + XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff ); + XMVECTOR vResult = _mm_rsqrt_ps( vTemp ); + return _mm_mul_ps(vResult, V); +} + +inline XMVECTOR XM_CALLCONV XMVector4Normalize( FXMVECTOR V ) +{ + XMVECTOR vLengthSq = _mm_dp_ps( V, V, 0xff ); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Create zero with a single instruction + XMVECTOR vZeroMask = _mm_setzero_ps(); + // Test for a divide by zero (Must be FP to detect -0.0) + vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult); + // Failsafe on zero (Or epsilon) length planes + 
// If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); + // Divide to perform the normalization + vResult = _mm_div_ps(V,vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult,vZeroMask); + // Select qnan or result based on infinite length + XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN); + XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq); + vResult = _mm_or_ps(vTemp1,vTemp2); + return vResult; +} + + +//------------------------------------------------------------------------------------- +// Plane +//------------------------------------------------------------------------------------- + +inline XMVECTOR XM_CALLCONV XMPlaneNormalizeEst( FXMVECTOR P ) +{ + XMVECTOR vTemp = _mm_dp_ps( P, P, 0x7f ); + XMVECTOR vResult = _mm_rsqrt_ps( vTemp ); + return _mm_mul_ps(vResult, P); +} + +inline XMVECTOR XM_CALLCONV XMPlaneNormalize( FXMVECTOR P ) +{ + XMVECTOR vLengthSq = _mm_dp_ps( P, P, 0x7f ); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); + // Reciprocal mul to perform the normalization + vResult = _mm_div_ps(P,vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult,vLengthSq); + return vResult; +} + +} // namespace SSE4 + +} // namespace DirectX diff --git a/src/thirdparty/DirectXMath-dec2022/HISTORY.md b/src/thirdparty/DirectXMath-dec2022/HISTORY.md new file mode 100644 index 000000000..776291ab7 --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/HISTORY.md @@ -0,0 +1,198 @@ +# DirectXMath + +https://github.com/Microsoft/DirectXMath + +Release available for download on [GitHub](https://github.com/microsoft/DirectXMath/releases) + +## Release History + +### December 2022 (3.18) +* C++20 spaceship operators for XMFLOAT2, XMFLOAT3, etc. when building with ``/std:c++20 /Zc:_cplusplus`` +* Improved conformance for ARM64 when using `/Zc:arm64-aliased-neon-types-` +* Minor code review +* CMake project updated to require 3.20 or later +* Added Azure Dev Ops Pipeline YAML files + +### May 2022 (3.17b) +* Hot-fix to address ``-Wreserved-identifier`` warnings with clang v13 +* C++20 spaceship operators for XMFLOAT2, XMFLOAT3, etc. when building with ``/std:c++20 /Zc:_cplusplus`` +* Minor CMake project update + +### January 2022 (3.17) +* Added ColorsLinear namespace to DirectXColors.h with linear versions of .NET colors +* Optimized the ``XMMatrixRotationRollPitchYaw(FromVector)`` functions +* Fixed overread problem for 16bpp GPU types Load functions: + * ``XMUNIBBLE4``, ``XMU555``, ``XMU565``, ``XMBYTEN2``, ``XMBYTE2``, ``XMUBYTEN2``, ``XMUBYTE2`` +* ``XM_CACHE_LINE_SIZE`` updated for ARM/ARM64 targets to 128 bytes +* A few comments added to improve IntelliSense experience +* Conformance improvements for GNU compiler +* Minor code cleanup + +### January 2021 (3.16b) +* Hot-fixes to resolve build breaks for clang/LLVM and GCC on ARM64 +* ``XM_ALIGNED_DATA`` and ``XM_ALIGNED_STRUCT`` macros updated to use C++17 ``alignas`` when available + +### December 2020 (3.16) +* Added ``XMVectorLog10`` / ``XMVectorExp10`` +* Added ``XMColorRGBToYUV_UHD`` / ``XMColorYUVToRGB_UHD`` for Rec. 
2020 YUV +* Added optional ``rhcoords`` parameter for BoundingFrustum ``CreateFromMatrix`` +* Added use of Intel® Short Vector Matrix Library (SVML) supported by VS 2019 + * Opt-in with ``_XM_SVML_INTRINSICS_``; opt-out with ``_XM_DISABLE_INTEL_SVML_`` +* Fixed denorm handling for ``XMConvertFloatToHalf`` +* Fixed flush (too small for denorm) handling for ``XMStoreFloat3PK`` +* Fixed clamping bug in ``XMStoreByteN4`` +* Cleaned up ARM-NEON intrinsics type issues for improved portability on GNUC +* Fixed ``GXMVECTOR`` for x86 ``__vectorcall`` +* Code review + +### April 2020 (3.15) +* Added ``XMMatrixVectorTensorProduct`` for creating a matrix from two vectors +* Use of m256 registers and FMA3 with ``/arch:AVX2`` for stream and some matrix functions +* Optimized load/stores for SSE2 float2 & float3 functions +* Optimized some instruction choices for better AMD CPU support +* Improved conformance for clang/LLVM, GCC, and MinGW compilers +* Code review (``constexpr`` / ``noexcept`` usage) +* Retired VS 2015 support + +### August 2019 (3.14) +* Added float control around IsNan functions to resolve issue with VS 2019 with ``/fp:fast`` +* XMVerifyCPUSupport updated for clang/LLVM cpuid implementation on x86/x64 +* Added support for clang/LLVM built-in platform defines as well as the MSVC ones +* Cleaned up ARM-NEON intrinsics type issues for improved portability +* Removed unneeded malloc.h include in DirectXMath.h +* Whitespace cleanup + +### July 2018 (3.13) +* ``XMFLOAT3X4``, ``XMFLOAT3X4A``, and associated Load/Store functions +* Move/copy constructors and assignment operators for C++ types +* Minor fix for XMVectorClamp behavior with NaN +* Fixed compilation warnings with VS 2017 (15.7 update), Intel C++ 18.0 compiler, and clang 6 +* Retired VS 2013 support +* Minor code cleanup + +### February 2018 (3.12) +* ARM64 use of fused multiply-accumulate intriniscs +* Conformance fix for XMConvertFloatToHalf +* Minor code cleanup + +### June 2017 (3.11) +* AVX optimization of XMMatrixMultiply and XMMatrixMultiplyTranspose +* AVX2 optimization for XMVectorSplatX +* FMA3 optimization of XMVectorMultiplyAdd and XMVectorNegativeMultiplySubtract (implied by /arch:AVX2) +* Conformance fixes to support compilation with Clang 3.7 + +### January 2017 (3.10) +* Added XMVectorSum for horizontal adds +* ARMv8 intrinsics use for ARM64 platform (division, rounding, half-precision conversion) +* Added SSE3 codepaths using opt-in ``_XM_SSE3_INTRINSICS_`` +* XMVectorRound fix for no-intrinsics to match round to nearest (even) +* XMStoreFloat3SE fix when max channel isn't a perfect power of 2 +* constexpr conformance fix and workaround for compiler bug in VS 2015 RTM +* Remove support for VS 2012 compilers +* Remove ``__vector4i`` deprecated type + +### June 2016 (3.09) +* Includes support for additional optimizations when built with /arch:AVX or /arch:AVX2 +* Added use of constexpr for type constructors, XMConvertToRadians, and XMConvertToDegrees +* Marked ``__vector4i``, ``XMXDEC4``, ``XMDECN4``, ``XMDEC4``, and associated Load & Store functions as deprecated. 
+ + These are vestiges of Xbox 360 support and will be removed in a future release +* Renamed parameter in XMMatrixPerspectiveFov* to reduce user confusion when relying on IntelliSense +* XMU565, XMUNIBBLE4 constructors take uint8_t instead of int8_t + +### May 2016 +* DirectXMath 3.08 released under the MIT license + +### November 2015 (3.08) +* Added use of ``_mm_sfence`` for Stream methods +* Fixed bug with non-uniform scaling transforms for BoundingOrientedBox +* Added asserts for Near/FarZ in XMMatrix* methods +* Added use of ``=default`` for PODs with VS 2013/2015 +* Additional SSE and ARM-NEON optimizations for PackedVector functions + +### April 2015 (3.07) +* Fix customer reported bugs in BoundingBox methods +* Fix customer reported bug in XMStoreFloat3SE +* Fix customer reported bug in XMVectorATan2, XMVectorATan2Est +* Fix customer reported bug in XMVectorRound + +### October 2013 (3.06) +* Fixed load/store of XMFLOAT3SE to properly match the ``DXGI_FORMAT_R9G9B9E5_SHAREDEXP`` +* Added ``XMLoadUDecN4_XR`` and ``XMStoreUDecN4_XR`` to match ``DXGI_FORMAT_R10G10B10_XR_BIAS_A2_UNORM`` +* Added ``XMColorRGBToSRGB`` and ``XMColorSRGBToRGB`` to convert linear RGB <-> sRGB + +### July 2013 (3.05) +* Use x86/x64 ``__vectorcall`` calling-convention when available (``XM_CALLCONV``, ``HXMVECTOR``, ``FXMMATRIX`` introduced) +* Fixed bug with XMVectorFloor and XMVectorCeiling when given whole odd numbers (i.e. 105.0) +* Improved XMVectorRound algorithm +* ARM-NEON optimizations for XMVectorExp2, XMVectorLog2, XMVectorExpE, and XMVectorLogE +* ARM-NEON code paths use multiply-by-scalar intrinsics when supported +* Additional optimizations for ARM-NEON Stream functions +* Fixed potential warning C4723 using ``operator/`` or ``operator/=`` + +### March 2013 (3.04) +* ``XMVectorExp2``, ``XMVectorLog2``, ``XMVectorExpE``, and ``XMVectorLogE`` functions added to provide base-e support in addition to the existing base-2 support +* ``XMVectorExp`` and ``XMVectorLog`` are now aliases for XMVectorExp2 and XMVectorLog2 +* Additional optimizations for Stream functions +* XMVector3Cross now ensures w component is zero on ARM +* XMConvertHalfToFloat and XMConvertFloatToHalf now use IEEE 754 standard float16 behavior for INF/QNAN +* Updated matrix version Transform for BoundingOrientedBox and BoundingFrustum to handle scaling + +### March 2012 (3.03) +* *breaking change* Removed union members from XMMATRIX type to make it a fully 'opaque' type +* Marked single-parameter C++ constructors for XMFLOAT2, XMFLOAT2A, XMFLOAT3, XMFLOAT3A, XMFLOAT4, and XMFLOAT4A explicit + +### February 2012 (3.02) +* ARM-NEON intrinsics (selected by default for the ARM platform) +* Reworked XMVectorPermute, change of ``XM_PERMUTE_`` defines, removal of XMVectorPermuteControl +* Addition of ``XM_SWIZZLE_`` defines +* Optimizations for transcendental functions +* Template forms for permute, swizzle, shift-left, rotate-left, rotation-right, and insert +* Removal of deprecated types and functions + + ``XM_CACHE_LINE_SIZE`` define, XMVectorExpEst, XMVectorLogEst, XMVectorPowEst, XMVectorSinHEs, XMVectorCosHEst, XMVectorTanHEst, XMVector2InBoundsR, XMVector3InBoundsR, XMVector4InBoundsR +* Removed ``XM_STRICT_VECTOR4``; XMVECTOR in NO-INTRINSICS always defined without .x, .y, .z, .w, .v, or .u +* Additional bounding types +* SAL fixes and improvements + +### September 2011 (3.00) +* Renamed and reorganized the headers +* Introduced C++ namespaces +* Removed the Xbox 360-specific GPU types + + HENDN3, XMHEND3, XMUHENDN3, XMUHEND3, 
XMDHENN3, XMDHEN3, XMUDHENN3, XMUDHEN3, XMXICON4, XMXICO4, XMICON4, XMICO4, XMUICON4, XMUICO4 + +### July 2012 (XNAMath 2.05) +* Template forms have been added for `XMVectorPermute`, `XMVectorSwizzle`, `XMVectorShiftLeft`, `XMVectorRotateLeft`, `XMVectorRotateRight`, and `XMVectorInsert` +* The `XM_STRICT_XMMATRIX` compilation define has been added for opaque `XMMATRIX`. +* Stream stride and count arguments have been changed to `size_t` +* The ``pDeterminant`` parameter of `XMMatrixInverse` is now optional +* Additional operator= overloads for `XMBYTEN4`, `XMBYTE4`, `XMUBYTEN4`, and `XMUBYTE4` types are now available + +### February 2011 (XNAMath 2.04) +* Addition of new data types and associated load-store functions: + + `XMBYTEN2, XMBYTE2, XMUBYTEN2, XMUBYTE2` + + `XMLoadByteN2, XMLoadByte2, XMLoadUByteN2, XMLoadUByte2` + + `XMStoreByteN2, XMStoreByte2, XMStoreUByteN2, XMStoreUByte2` + + `XMINT2, XMUINT2, XMINT3, XMUINT3, XMINT4, XMUINT4` + + `XMLoadSInt2, XMLoadUInt2, XMLoadSInt3, XMLoadUInt3, XMLoadSInt4, XMLoadUInt4` + + `XMStoreSInt2, XMStoreUInt2, XMStoreSInt3, XMStoreUInt3, XMStoreSInt4, XMStoreUInt4` +* Marked most single-parameter C++ constructors with `explicit` keyword +* Corrected range issues with SSE implementations of `XMVectorFloor` and `XMVectorCeiling` + + +### June 2010 (XNAMath 2.03) +* Addition of ``XMVectorDivide`` to optimize SSE2 vector division operations +* Unified handling of floating-point specials between the Windows SSE2 and no-intrinsics implementations +* Use of Visual Studio style SAL annotations +* Modifications to the C++ declarations for `XMFLOAT2A/3A/4A/4X3A/4X4A` to better support these types in C++ templates + +### February 2010 (XNAMath 2.02) +* Fixes to `XMStoreColor`, `XMQuaternionRotationMatrix`, `XMVectorATan2`, and `XMVectorATan2Est` + +### August 2009 (XNAMath 2.01) +* Adds ``XM_STRICT_VECTOR4``. This opt-in directive disallows the usage of XboxMath-like member accessors such as .x, .y, and .z. This makes it easier to write portable XNA Math code. +* Added conversion support for the following Windows graphics formats: + + 16-bit color formats (565, 555X, 5551) + + 4-bits per channel color formats (4444) + + Unique Direct3D 10/11 formats (``DXGI_FORMAT_R9G9B9E5_SHAREDEXP`` and ``DXGI_FORMAT_R11G11B10_FLOAT``) + +### March 2009 (XNAMath 2.00) +* Initial release (based on the Xbox 360 Xbox math library) diff --git a/src/thirdparty/DirectXMath-dec2022/Inc/DirectXCollision.h b/src/thirdparty/DirectXMath-dec2022/Inc/DirectXCollision.h new file mode 100644 index 000000000..989c469f3 --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/Inc/DirectXCollision.h @@ -0,0 +1,359 @@ +//------------------------------------------------------------------------------------- +// DirectXCollision.h -- C++ Collision Math library +// +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
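A brief, hypothetical usage sketch for the collision types declared below (illustrative only, not part of the imported header; it assumes the DirectXMath Inc directory is on the include path and that the helper names are placeholders):

    #include "DirectXCollision.h"

    using namespace DirectX;

    // Classify a sphere against an axis-aligned box: DISJOINT, INTERSECTS, or CONTAINS.
    inline ContainmentType ClassifySphere(const BoundingBox& box, const BoundingSphere& sphere)
    {
        return box.Contains(sphere);
    }

    // Boolean overlap test between two spheres.
    inline bool SpheresOverlap(const BoundingSphere& a, const BoundingSphere& b)
    {
        return a.Intersects(b);
    }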
+// +// http://go.microsoft.com/fwlink/?LinkID=615560 +//------------------------------------------------------------------------------------- + +#pragma once + +#include "DirectXMath.h" + +namespace DirectX +{ + + enum ContainmentType + { + DISJOINT = 0, + INTERSECTS = 1, + CONTAINS = 2 + }; + + enum PlaneIntersectionType + { + FRONT = 0, + INTERSECTING = 1, + BACK = 2 + }; + + struct BoundingBox; + struct BoundingOrientedBox; + struct BoundingFrustum; + +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable:4324 4820) + // C4324: alignment padding warnings + // C4820: Off by default noise +#endif + + //------------------------------------------------------------------------------------- + // Bounding sphere + //------------------------------------------------------------------------------------- + struct BoundingSphere + { + XMFLOAT3 Center; // Center of the sphere. + float Radius; // Radius of the sphere. + + // Creators + BoundingSphere() noexcept : Center(0, 0, 0), Radius(1.f) {} + + BoundingSphere(const BoundingSphere&) = default; + BoundingSphere& operator=(const BoundingSphere&) = default; + + BoundingSphere(BoundingSphere&&) = default; + BoundingSphere& operator=(BoundingSphere&&) = default; + + constexpr BoundingSphere(_In_ const XMFLOAT3& center, _In_ float radius) noexcept + : Center(center), Radius(radius) {} + + // Methods + void XM_CALLCONV Transform(_Out_ BoundingSphere& Out, _In_ FXMMATRIX M) const noexcept; + void XM_CALLCONV Transform(_Out_ BoundingSphere& Out, _In_ float Scale, _In_ FXMVECTOR Rotation, _In_ FXMVECTOR Translation) const noexcept; + // Transform the sphere + + ContainmentType XM_CALLCONV Contains(_In_ FXMVECTOR Point) const noexcept; + ContainmentType XM_CALLCONV Contains(_In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2) const noexcept; + ContainmentType Contains(_In_ const BoundingSphere& sh) const noexcept; + ContainmentType Contains(_In_ const BoundingBox& box) const noexcept; + ContainmentType Contains(_In_ const BoundingOrientedBox& box) const noexcept; + ContainmentType Contains(_In_ const BoundingFrustum& fr) const noexcept; + + bool Intersects(_In_ const BoundingSphere& sh) const noexcept; + bool Intersects(_In_ const BoundingBox& box) const noexcept; + bool Intersects(_In_ const BoundingOrientedBox& box) const noexcept; + bool Intersects(_In_ const BoundingFrustum& fr) const noexcept; + + bool XM_CALLCONV Intersects(_In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2) const noexcept; + // Triangle-sphere test + + PlaneIntersectionType XM_CALLCONV Intersects(_In_ FXMVECTOR Plane) const noexcept; + // Plane-sphere test + + bool XM_CALLCONV Intersects(_In_ FXMVECTOR Origin, _In_ FXMVECTOR Direction, _Out_ float& Dist) const noexcept; + // Ray-sphere test + + ContainmentType XM_CALLCONV ContainedBy(_In_ FXMVECTOR Plane0, _In_ FXMVECTOR Plane1, _In_ FXMVECTOR Plane2, + _In_ GXMVECTOR Plane3, _In_ HXMVECTOR Plane4, _In_ HXMVECTOR Plane5) const noexcept; + // Test sphere against six planes (see BoundingFrustum::GetPlanes) + + // Static methods + static void CreateMerged(_Out_ BoundingSphere& Out, _In_ const BoundingSphere& S1, _In_ const BoundingSphere& S2) noexcept; + + static void CreateFromBoundingBox(_Out_ BoundingSphere& Out, _In_ const BoundingBox& box) noexcept; + static void CreateFromBoundingBox(_Out_ BoundingSphere& Out, _In_ const BoundingOrientedBox& box) noexcept; + + static void CreateFromPoints(_Out_ BoundingSphere& Out, _In_ size_t Count, + _In_reads_bytes_(sizeof(XMFLOAT3) + Stride * (Count - 1)) const XMFLOAT3* 
pPoints, _In_ size_t Stride) noexcept; + + static void CreateFromFrustum(_Out_ BoundingSphere& Out, _In_ const BoundingFrustum& fr) noexcept; + }; + + //------------------------------------------------------------------------------------- + // Axis-aligned bounding box + //------------------------------------------------------------------------------------- + struct BoundingBox + { + static constexpr size_t CORNER_COUNT = 8; + + XMFLOAT3 Center; // Center of the box. + XMFLOAT3 Extents; // Distance from the center to each side. + + // Creators + BoundingBox() noexcept : Center(0, 0, 0), Extents(1.f, 1.f, 1.f) {} + + BoundingBox(const BoundingBox&) = default; + BoundingBox& operator=(const BoundingBox&) = default; + + BoundingBox(BoundingBox&&) = default; + BoundingBox& operator=(BoundingBox&&) = default; + + constexpr BoundingBox(_In_ const XMFLOAT3& center, _In_ const XMFLOAT3& extents) noexcept + : Center(center), Extents(extents) {} + + // Methods + void XM_CALLCONV Transform(_Out_ BoundingBox& Out, _In_ FXMMATRIX M) const noexcept; + void XM_CALLCONV Transform(_Out_ BoundingBox& Out, _In_ float Scale, _In_ FXMVECTOR Rotation, _In_ FXMVECTOR Translation) const noexcept; + + void GetCorners(_Out_writes_(8) XMFLOAT3* Corners) const noexcept; + // Gets the 8 corners of the box + + ContainmentType XM_CALLCONV Contains(_In_ FXMVECTOR Point) const noexcept; + ContainmentType XM_CALLCONV Contains(_In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2) const noexcept; + ContainmentType Contains(_In_ const BoundingSphere& sh) const noexcept; + ContainmentType Contains(_In_ const BoundingBox& box) const noexcept; + ContainmentType Contains(_In_ const BoundingOrientedBox& box) const noexcept; + ContainmentType Contains(_In_ const BoundingFrustum& fr) const noexcept; + + bool Intersects(_In_ const BoundingSphere& sh) const noexcept; + bool Intersects(_In_ const BoundingBox& box) const noexcept; + bool Intersects(_In_ const BoundingOrientedBox& box) const noexcept; + bool Intersects(_In_ const BoundingFrustum& fr) const noexcept; + + bool XM_CALLCONV Intersects(_In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2) const noexcept; + // Triangle-Box test + + PlaneIntersectionType XM_CALLCONV Intersects(_In_ FXMVECTOR Plane) const noexcept; + // Plane-box test + + bool XM_CALLCONV Intersects(_In_ FXMVECTOR Origin, _In_ FXMVECTOR Direction, _Out_ float& Dist) const noexcept; + // Ray-Box test + + ContainmentType XM_CALLCONV ContainedBy(_In_ FXMVECTOR Plane0, _In_ FXMVECTOR Plane1, _In_ FXMVECTOR Plane2, + _In_ GXMVECTOR Plane3, _In_ HXMVECTOR Plane4, _In_ HXMVECTOR Plane5) const noexcept; + // Test box against six planes (see BoundingFrustum::GetPlanes) + + // Static methods + static void CreateMerged(_Out_ BoundingBox& Out, _In_ const BoundingBox& b1, _In_ const BoundingBox& b2) noexcept; + + static void CreateFromSphere(_Out_ BoundingBox& Out, _In_ const BoundingSphere& sh) noexcept; + + static void XM_CALLCONV CreateFromPoints(_Out_ BoundingBox& Out, _In_ FXMVECTOR pt1, _In_ FXMVECTOR pt2) noexcept; + static void CreateFromPoints(_Out_ BoundingBox& Out, _In_ size_t Count, + _In_reads_bytes_(sizeof(XMFLOAT3) + Stride * (Count - 1)) const XMFLOAT3* pPoints, _In_ size_t Stride) noexcept; + }; + + //------------------------------------------------------------------------------------- + // Oriented bounding box + //------------------------------------------------------------------------------------- + struct BoundingOrientedBox + { + static constexpr size_t CORNER_COUNT = 8; + + XMFLOAT3 Center; // 
Center of the box. + XMFLOAT3 Extents; // Distance from the center to each side. + XMFLOAT4 Orientation; // Unit quaternion representing rotation (box -> world). + + // Creators + BoundingOrientedBox() noexcept : Center(0, 0, 0), Extents(1.f, 1.f, 1.f), Orientation(0, 0, 0, 1.f) {} + + BoundingOrientedBox(const BoundingOrientedBox&) = default; + BoundingOrientedBox& operator=(const BoundingOrientedBox&) = default; + + BoundingOrientedBox(BoundingOrientedBox&&) = default; + BoundingOrientedBox& operator=(BoundingOrientedBox&&) = default; + + constexpr BoundingOrientedBox(_In_ const XMFLOAT3& center, _In_ const XMFLOAT3& extents, _In_ const XMFLOAT4& orientation) noexcept + : Center(center), Extents(extents), Orientation(orientation) {} + + // Methods + void XM_CALLCONV Transform(_Out_ BoundingOrientedBox& Out, _In_ FXMMATRIX M) const noexcept; + void XM_CALLCONV Transform(_Out_ BoundingOrientedBox& Out, _In_ float Scale, _In_ FXMVECTOR Rotation, _In_ FXMVECTOR Translation) const noexcept; + + void GetCorners(_Out_writes_(8) XMFLOAT3* Corners) const noexcept; + // Gets the 8 corners of the box + + ContainmentType XM_CALLCONV Contains(_In_ FXMVECTOR Point) const noexcept; + ContainmentType XM_CALLCONV Contains(_In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2) const noexcept; + ContainmentType Contains(_In_ const BoundingSphere& sh) const noexcept; + ContainmentType Contains(_In_ const BoundingBox& box) const noexcept; + ContainmentType Contains(_In_ const BoundingOrientedBox& box) const noexcept; + ContainmentType Contains(_In_ const BoundingFrustum& fr) const noexcept; + + bool Intersects(_In_ const BoundingSphere& sh) const noexcept; + bool Intersects(_In_ const BoundingBox& box) const noexcept; + bool Intersects(_In_ const BoundingOrientedBox& box) const noexcept; + bool Intersects(_In_ const BoundingFrustum& fr) const noexcept; + + bool XM_CALLCONV Intersects(_In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2) const noexcept; + // Triangle-OrientedBox test + + PlaneIntersectionType XM_CALLCONV Intersects(_In_ FXMVECTOR Plane) const noexcept; + // Plane-OrientedBox test + + bool XM_CALLCONV Intersects(_In_ FXMVECTOR Origin, _In_ FXMVECTOR Direction, _Out_ float& Dist) const noexcept; + // Ray-OrientedBox test + + ContainmentType XM_CALLCONV ContainedBy(_In_ FXMVECTOR Plane0, _In_ FXMVECTOR Plane1, _In_ FXMVECTOR Plane2, + _In_ GXMVECTOR Plane3, _In_ HXMVECTOR Plane4, _In_ HXMVECTOR Plane5) const noexcept; + // Test OrientedBox against six planes (see BoundingFrustum::GetPlanes) + + // Static methods + static void CreateFromBoundingBox(_Out_ BoundingOrientedBox& Out, _In_ const BoundingBox& box) noexcept; + + static void CreateFromPoints(_Out_ BoundingOrientedBox& Out, _In_ size_t Count, + _In_reads_bytes_(sizeof(XMFLOAT3) + Stride * (Count - 1)) const XMFLOAT3* pPoints, _In_ size_t Stride) noexcept; + }; + + //------------------------------------------------------------------------------------- + // Bounding frustum + //------------------------------------------------------------------------------------- + struct BoundingFrustum + { + static constexpr size_t CORNER_COUNT = 8; + + XMFLOAT3 Origin; // Origin of the frustum (and projection). + XMFLOAT4 Orientation; // Quaternion representing rotation. + + float RightSlope; // Positive X (X/Z) + float LeftSlope; // Negative X + float TopSlope; // Positive Y (Y/Z) + float BottomSlope; // Negative Y + float Near, Far; // Z of the near plane and far plane. 
+ + // Creators + BoundingFrustum() noexcept : + Origin(0, 0, 0), Orientation(0, 0, 0, 1.f), RightSlope(1.f), LeftSlope(-1.f), + TopSlope(1.f), BottomSlope(-1.f), Near(0), Far(1.f) {} + + BoundingFrustum(const BoundingFrustum&) = default; + BoundingFrustum& operator=(const BoundingFrustum&) = default; + + BoundingFrustum(BoundingFrustum&&) = default; + BoundingFrustum& operator=(BoundingFrustum&&) = default; + + constexpr BoundingFrustum(_In_ const XMFLOAT3& origin, _In_ const XMFLOAT4& orientation, + _In_ float rightSlope, _In_ float leftSlope, _In_ float topSlope, _In_ float bottomSlope, + _In_ float nearPlane, _In_ float farPlane) noexcept + : Origin(origin), Orientation(orientation), + RightSlope(rightSlope), LeftSlope(leftSlope), TopSlope(topSlope), BottomSlope(bottomSlope), + Near(nearPlane), Far(farPlane) {} + BoundingFrustum(_In_ CXMMATRIX Projection, bool rhcoords = false) noexcept; + + // Methods + void XM_CALLCONV Transform(_Out_ BoundingFrustum& Out, _In_ FXMMATRIX M) const noexcept; + void XM_CALLCONV Transform(_Out_ BoundingFrustum& Out, _In_ float Scale, _In_ FXMVECTOR Rotation, _In_ FXMVECTOR Translation) const noexcept; + + void GetCorners(_Out_writes_(8) XMFLOAT3* Corners) const noexcept; + // Gets the 8 corners of the frustum + + ContainmentType XM_CALLCONV Contains(_In_ FXMVECTOR Point) const noexcept; + ContainmentType XM_CALLCONV Contains(_In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2) const noexcept; + ContainmentType Contains(_In_ const BoundingSphere& sp) const noexcept; + ContainmentType Contains(_In_ const BoundingBox& box) const noexcept; + ContainmentType Contains(_In_ const BoundingOrientedBox& box) const noexcept; + ContainmentType Contains(_In_ const BoundingFrustum& fr) const noexcept; + // Frustum-Frustum test + + bool Intersects(_In_ const BoundingSphere& sh) const noexcept; + bool Intersects(_In_ const BoundingBox& box) const noexcept; + bool Intersects(_In_ const BoundingOrientedBox& box) const noexcept; + bool Intersects(_In_ const BoundingFrustum& fr) const noexcept; + + bool XM_CALLCONV Intersects(_In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2) const noexcept; + // Triangle-Frustum test + + PlaneIntersectionType XM_CALLCONV Intersects(_In_ FXMVECTOR Plane) const noexcept; + // Plane-Frustum test + + bool XM_CALLCONV Intersects(_In_ FXMVECTOR rayOrigin, _In_ FXMVECTOR Direction, _Out_ float& Dist) const noexcept; + // Ray-Frustum test + + ContainmentType XM_CALLCONV ContainedBy(_In_ FXMVECTOR Plane0, _In_ FXMVECTOR Plane1, _In_ FXMVECTOR Plane2, + _In_ GXMVECTOR Plane3, _In_ HXMVECTOR Plane4, _In_ HXMVECTOR Plane5) const noexcept; + // Test frustum against six planes (see BoundingFrustum::GetPlanes) + + void GetPlanes(_Out_opt_ XMVECTOR* NearPlane, _Out_opt_ XMVECTOR* FarPlane, _Out_opt_ XMVECTOR* RightPlane, + _Out_opt_ XMVECTOR* LeftPlane, _Out_opt_ XMVECTOR* TopPlane, _Out_opt_ XMVECTOR* BottomPlane) const noexcept; + // Create 6 Planes representation of Frustum + + // Static methods + static void XM_CALLCONV CreateFromMatrix(_Out_ BoundingFrustum& Out, _In_ FXMMATRIX Projection, bool rhcoords = false) noexcept; + }; + + //----------------------------------------------------------------------------- + // Triangle intersection testing routines. 
+ //----------------------------------------------------------------------------- + namespace TriangleTests + { + bool XM_CALLCONV Intersects(_In_ FXMVECTOR Origin, _In_ FXMVECTOR Direction, _In_ FXMVECTOR V0, _In_ GXMVECTOR V1, _In_ HXMVECTOR V2, _Out_ float& Dist) noexcept; + // Ray-Triangle + + bool XM_CALLCONV Intersects(_In_ FXMVECTOR A0, _In_ FXMVECTOR A1, _In_ FXMVECTOR A2, _In_ GXMVECTOR B0, _In_ HXMVECTOR B1, _In_ HXMVECTOR B2) noexcept; + // Triangle-Triangle + + PlaneIntersectionType XM_CALLCONV Intersects(_In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2, _In_ GXMVECTOR Plane) noexcept; + // Plane-Triangle + + ContainmentType XM_CALLCONV ContainedBy(_In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2, + _In_ GXMVECTOR Plane0, _In_ HXMVECTOR Plane1, _In_ HXMVECTOR Plane2, + _In_ CXMVECTOR Plane3, _In_ CXMVECTOR Plane4, _In_ CXMVECTOR Plane5) noexcept; + // Test a triangle against six planes at once (see BoundingFrustum::GetPlanes) + } + +#ifdef _MSC_VER +#pragma warning(pop) +#endif + + /**************************************************************************** + * + * Implementation + * + ****************************************************************************/ + +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4068 4365 4616 6001) + // C4068/4616: ignore unknown pragmas + // C4365: Off by default noise + // C6001: False positives +#endif +#ifdef _PREFAST_ +#pragma prefast(push) +#pragma prefast(disable : 25000, "FXMVECTOR is 16 bytes") +#pragma prefast(disable : 26495, "Union initialization confuses /analyze") +#endif + +#include "DirectXCollision.inl" + +#ifdef _PREFAST_ +#pragma prefast(pop) +#endif +#ifdef _MSC_VER +#pragma warning(pop) +#endif + +} // namespace DirectX + diff --git a/src/thirdparty/DirectXMath-dec2022/Inc/DirectXCollision.inl b/src/thirdparty/DirectXMath-dec2022/Inc/DirectXCollision.inl new file mode 100644 index 000000000..51d37926a --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/Inc/DirectXCollision.inl @@ -0,0 +1,4816 @@ +//------------------------------------------------------------------------------------- +// DirectXCollision.inl -- C++ Collision Math library +// +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. +// +// http://go.microsoft.com/fwlink/?LinkID=615560 +//------------------------------------------------------------------------------------- + +#pragma once + +XMGLOBALCONST XMVECTORF32 g_BoxOffset[8] = +{ + { { { -1.0f, -1.0f, 1.0f, 0.0f } } }, + { { { 1.0f, -1.0f, 1.0f, 0.0f } } }, + { { { 1.0f, 1.0f, 1.0f, 0.0f } } }, + { { { -1.0f, 1.0f, 1.0f, 0.0f } } }, + { { { -1.0f, -1.0f, -1.0f, 0.0f } } }, + { { { 1.0f, -1.0f, -1.0f, 0.0f } } }, + { { { 1.0f, 1.0f, -1.0f, 0.0f } } }, + { { { -1.0f, 1.0f, -1.0f, 0.0f } } }, +}; + +XMGLOBALCONST XMVECTORF32 g_RayEpsilon = { { { 1e-20f, 1e-20f, 1e-20f, 1e-20f } } }; +XMGLOBALCONST XMVECTORF32 g_RayNegEpsilon = { { { -1e-20f, -1e-20f, -1e-20f, -1e-20f } } }; +XMGLOBALCONST XMVECTORF32 g_FltMin = { { { -FLT_MAX, -FLT_MAX, -FLT_MAX, -FLT_MAX } } }; +XMGLOBALCONST XMVECTORF32 g_FltMax = { { { FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX } } }; + +namespace Internal +{ + + //----------------------------------------------------------------------------- + // Return true if any of the elements of a 3 vector are equal to 0xffffffff. + // Slightly more efficient than using XMVector3EqualInt. 
+    //-----------------------------------------------------------------------------
+    inline bool XMVector3AnyTrue(_In_ FXMVECTOR V) noexcept
+    {
+        // Duplicate the fourth element from the first element.
+        XMVECTOR C = XMVectorSwizzle<XM_SWIZZLE_X, XM_SWIZZLE_Y, XM_SWIZZLE_Z, XM_SWIZZLE_X>(V);
+
+        return XMComparisonAnyTrue(XMVector4EqualIntR(C, XMVectorTrueInt()));
+    }
+
+
+    //-----------------------------------------------------------------------------
+    // Return true if all of the elements of a 3 vector are equal to 0xffffffff.
+    // Slightly more efficient than using XMVector3EqualInt.
+    //-----------------------------------------------------------------------------
+    inline bool XMVector3AllTrue(_In_ FXMVECTOR V) noexcept
+    {
+        // Duplicate the fourth element from the first element.
+        XMVECTOR C = XMVectorSwizzle<XM_SWIZZLE_X, XM_SWIZZLE_Y, XM_SWIZZLE_Z, XM_SWIZZLE_X>(V);
+
+        return XMComparisonAllTrue(XMVector4EqualIntR(C, XMVectorTrueInt()));
+    }
+
+#if defined(_PREFAST_) || !defined(NDEBUG)
+
+    XMGLOBALCONST XMVECTORF32 g_UnitVectorEpsilon = { { { 1.0e-4f, 1.0e-4f, 1.0e-4f, 1.0e-4f } } };
+    XMGLOBALCONST XMVECTORF32 g_UnitQuaternionEpsilon = { { { 1.0e-4f, 1.0e-4f, 1.0e-4f, 1.0e-4f } } };
+    XMGLOBALCONST XMVECTORF32 g_UnitPlaneEpsilon = { { { 1.0e-4f, 1.0e-4f, 1.0e-4f, 1.0e-4f } } };
+
+    //-----------------------------------------------------------------------------
+    // Return true if the vector is a unit vector (length == 1).
+    //-----------------------------------------------------------------------------
+    inline bool XMVector3IsUnit(_In_ FXMVECTOR V) noexcept
+    {
+        XMVECTOR Difference = XMVectorSubtract(XMVector3Length(V), XMVectorSplatOne());
+        return XMVector4Less(XMVectorAbs(Difference), g_UnitVectorEpsilon);
+    }
+
+    //-----------------------------------------------------------------------------
+    // Return true if the quaterion is a unit quaternion.
+    //-----------------------------------------------------------------------------
+    inline bool XMQuaternionIsUnit(_In_ FXMVECTOR Q) noexcept
+    {
+        XMVECTOR Difference = XMVectorSubtract(XMVector4Length(Q), XMVectorSplatOne());
+        return XMVector4Less(XMVectorAbs(Difference), g_UnitQuaternionEpsilon);
+    }
+
+    //-----------------------------------------------------------------------------
+    // Return true if the plane is a unit plane.
+    //-----------------------------------------------------------------------------
+    inline bool XMPlaneIsUnit(_In_ FXMVECTOR Plane) noexcept
+    {
+        XMVECTOR Difference = XMVectorSubtract(XMVector3Length(Plane), XMVectorSplatOne());
+        return XMVector4Less(XMVectorAbs(Difference), g_UnitPlaneEpsilon);
+    }
+
+#endif // _PREFAST_ || !NDEBUG
+
+    //-----------------------------------------------------------------------------
+    inline XMVECTOR XMPlaneTransform(_In_ FXMVECTOR Plane, _In_ FXMVECTOR Rotation, _In_ FXMVECTOR Translation) noexcept
+    {
+        XMVECTOR vNormal = XMVector3Rotate(Plane, Rotation);
+        XMVECTOR vD = XMVectorSubtract(XMVectorSplatW(Plane), XMVector3Dot(vNormal, Translation));
+
+        return XMVectorInsert<0, 0, 0, 0, 1>(vNormal, vD);
+    }
+
+    //-----------------------------------------------------------------------------
+    // Return the point on the line segement (S1, S2) nearest the point P.
+ //----------------------------------------------------------------------------- + inline XMVECTOR PointOnLineSegmentNearestPoint(_In_ FXMVECTOR S1, _In_ FXMVECTOR S2, _In_ FXMVECTOR P) noexcept + { + XMVECTOR Dir = XMVectorSubtract(S2, S1); + XMVECTOR Projection = XMVectorSubtract(XMVector3Dot(P, Dir), XMVector3Dot(S1, Dir)); + XMVECTOR LengthSq = XMVector3Dot(Dir, Dir); + + XMVECTOR t = XMVectorMultiply(Projection, XMVectorReciprocal(LengthSq)); + XMVECTOR Point = XMVectorMultiplyAdd(t, Dir, S1); + + // t < 0 + XMVECTOR SelectS1 = XMVectorLess(Projection, XMVectorZero()); + Point = XMVectorSelect(Point, S1, SelectS1); + + // t > 1 + XMVECTOR SelectS2 = XMVectorGreater(Projection, LengthSq); + Point = XMVectorSelect(Point, S2, SelectS2); + + return Point; + } + + //----------------------------------------------------------------------------- + // Test if the point (P) on the plane of the triangle is inside the triangle + // (V0, V1, V2). + //----------------------------------------------------------------------------- + inline XMVECTOR XM_CALLCONV PointOnPlaneInsideTriangle(_In_ FXMVECTOR P, _In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ GXMVECTOR V2) noexcept + { + // Compute the triangle normal. + XMVECTOR N = XMVector3Cross(XMVectorSubtract(V2, V0), XMVectorSubtract(V1, V0)); + + // Compute the cross products of the vector from the base of each edge to + // the point with each edge vector. + XMVECTOR C0 = XMVector3Cross(XMVectorSubtract(P, V0), XMVectorSubtract(V1, V0)); + XMVECTOR C1 = XMVector3Cross(XMVectorSubtract(P, V1), XMVectorSubtract(V2, V1)); + XMVECTOR C2 = XMVector3Cross(XMVectorSubtract(P, V2), XMVectorSubtract(V0, V2)); + + // If the cross product points in the same direction as the normal the the + // point is inside the edge (it is zero if is on the edge). + XMVECTOR Zero = XMVectorZero(); + XMVECTOR Inside0 = XMVectorGreaterOrEqual(XMVector3Dot(C0, N), Zero); + XMVECTOR Inside1 = XMVectorGreaterOrEqual(XMVector3Dot(C1, N), Zero); + XMVECTOR Inside2 = XMVectorGreaterOrEqual(XMVector3Dot(C2, N), Zero); + + // If the point inside all of the edges it is inside. 
+        return XMVectorAndInt(XMVectorAndInt(Inside0, Inside1), Inside2);
+    }
+
+    //-----------------------------------------------------------------------------
+    inline bool SolveCubic(_In_ float e, _In_ float f, _In_ float g, _Out_ float* t, _Out_ float* u, _Out_ float* v) noexcept
+    {
+        float p, q, h, rc, d, theta, costh3, sinth3;
+
+        p = f - e * e / 3.0f;
+        q = g - e * f / 3.0f + e * e * e * 2.0f / 27.0f;
+        h = q * q / 4.0f + p * p * p / 27.0f;
+
+        if (h > 0)
+        {
+            *t = *u = *v = 0.f;
+            return false; // only one real root
+        }
+
+        if ((h == 0) && (q == 0)) // all the same root
+        {
+            *t = -e / 3;
+            *u = -e / 3;
+            *v = -e / 3;
+
+            return true;
+        }
+
+        d = sqrtf(q * q / 4.0f - h);
+        if (d < 0)
+            rc = -powf(-d, 1.0f / 3.0f);
+        else
+            rc = powf(d, 1.0f / 3.0f);
+
+        theta = XMScalarACos(-q / (2.0f * d));
+        costh3 = XMScalarCos(theta / 3.0f);
+        sinth3 = sqrtf(3.0f) * XMScalarSin(theta / 3.0f);
+        *t = 2.0f * rc * costh3 - e / 3.0f;
+        *u = -rc * (costh3 + sinth3) - e / 3.0f;
+        *v = -rc * (costh3 - sinth3) - e / 3.0f;
+
+        return true;
+    }
+
+    //-----------------------------------------------------------------------------
+    inline XMVECTOR CalculateEigenVector(_In_ float m11, _In_ float m12, _In_ float m13,
+        _In_ float m22, _In_ float m23, _In_ float m33, _In_ float e) noexcept
+    {
+        float fTmp[3];
+        fTmp[0] = m12 * m23 - m13 * (m22 - e);
+        fTmp[1] = m13 * m12 - m23 * (m11 - e);
+        fTmp[2] = (m11 - e) * (m22 - e) - m12 * m12;
+
+        XMVECTOR vTmp = XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(fTmp));
+
+        if (XMVector3Equal(vTmp, XMVectorZero())) // planar or linear
+        {
+            float f1, f2, f3;
+
+            // we only have one equation - find a valid one
+            if ((m11 - e != 0) || (m12 != 0) || (m13 != 0))
+            {
+                f1 = m11 - e; f2 = m12; f3 = m13;
+            }
+            else if ((m12 != 0) || (m22 - e != 0) || (m23 != 0))
+            {
+                f1 = m12; f2 = m22 - e; f3 = m23;
+            }
+            else if ((m13 != 0) || (m23 != 0) || (m33 - e != 0))
+            {
+                f1 = m13; f2 = m23; f3 = m33 - e;
+            }
+            else
+            {
+                // error, we'll just make something up - we have NO context
+                f1 = 1.0f; f2 = 0.0f; f3 = 0.0f;
+            }
+
+            if (f1 == 0)
+                vTmp = XMVectorSetX(vTmp, 0.0f);
+            else
+                vTmp = XMVectorSetX(vTmp, 1.0f);
+
+            if (f2 == 0)
+                vTmp = XMVectorSetY(vTmp, 0.0f);
+            else
+                vTmp = XMVectorSetY(vTmp, 1.0f);
+
+            if (f3 == 0)
+            {
+                vTmp = XMVectorSetZ(vTmp, 0.0f);
+                // recalculate y to make equation work
+                if (m12 != 0)
+                    vTmp = XMVectorSetY(vTmp, -f1 / f2);
+            }
+            else
+            {
+                vTmp = XMVectorSetZ(vTmp, (f2 - f1) / f3);
+            }
+        }
+
+        if (XMVectorGetX(XMVector3LengthSq(vTmp)) > 1e-5f)
+        {
+            return XMVector3Normalize(vTmp);
+        }
+        else
+        {
+            // Multiply by a value large enough to make the vector non-zero.
+ vTmp = XMVectorScale(vTmp, 1e5f); + return XMVector3Normalize(vTmp); + } + } + + //----------------------------------------------------------------------------- + inline bool CalculateEigenVectors(_In_ float m11, _In_ float m12, _In_ float m13, + _In_ float m22, _In_ float m23, _In_ float m33, + _In_ float e1, _In_ float e2, _In_ float e3, + _Out_ XMVECTOR* pV1, _Out_ XMVECTOR* pV2, _Out_ XMVECTOR* pV3) noexcept + { + *pV1 = DirectX::Internal::CalculateEigenVector(m11, m12, m13, m22, m23, m33, e1); + *pV2 = DirectX::Internal::CalculateEigenVector(m11, m12, m13, m22, m23, m33, e2); + *pV3 = DirectX::Internal::CalculateEigenVector(m11, m12, m13, m22, m23, m33, e3); + + bool v1z = false; + bool v2z = false; + bool v3z = false; + + XMVECTOR Zero = XMVectorZero(); + + if (XMVector3Equal(*pV1, Zero)) + v1z = true; + + if (XMVector3Equal(*pV2, Zero)) + v2z = true; + + if (XMVector3Equal(*pV3, Zero)) + v3z = true; + + bool e12 = (fabsf(XMVectorGetX(XMVector3Dot(*pV1, *pV2))) > 0.1f); // check for non-orthogonal vectors + bool e13 = (fabsf(XMVectorGetX(XMVector3Dot(*pV1, *pV3))) > 0.1f); + bool e23 = (fabsf(XMVectorGetX(XMVector3Dot(*pV2, *pV3))) > 0.1f); + + if ((v1z && v2z && v3z) || (e12 && e13 && e23) || + (e12 && v3z) || (e13 && v2z) || (e23 && v1z)) // all eigenvectors are 0- any basis set + { + *pV1 = g_XMIdentityR0.v; + *pV2 = g_XMIdentityR1.v; + *pV3 = g_XMIdentityR2.v; + return true; + } + + if (v1z && v2z) + { + XMVECTOR vTmp = XMVector3Cross(g_XMIdentityR1, *pV3); + if (XMVectorGetX(XMVector3LengthSq(vTmp)) < 1e-5f) + { + vTmp = XMVector3Cross(g_XMIdentityR0, *pV3); + } + *pV1 = XMVector3Normalize(vTmp); + *pV2 = XMVector3Cross(*pV3, *pV1); + return true; + } + + if (v3z && v1z) + { + XMVECTOR vTmp = XMVector3Cross(g_XMIdentityR1, *pV2); + if (XMVectorGetX(XMVector3LengthSq(vTmp)) < 1e-5f) + { + vTmp = XMVector3Cross(g_XMIdentityR0, *pV2); + } + *pV3 = XMVector3Normalize(vTmp); + *pV1 = XMVector3Cross(*pV2, *pV3); + return true; + } + + if (v2z && v3z) + { + XMVECTOR vTmp = XMVector3Cross(g_XMIdentityR1, *pV1); + if (XMVectorGetX(XMVector3LengthSq(vTmp)) < 1e-5f) + { + vTmp = XMVector3Cross(g_XMIdentityR0, *pV1); + } + *pV2 = XMVector3Normalize(vTmp); + *pV3 = XMVector3Cross(*pV1, *pV2); + return true; + } + + if ((v1z) || e12) + { + *pV1 = XMVector3Cross(*pV2, *pV3); + return true; + } + + if ((v2z) || e23) + { + *pV2 = XMVector3Cross(*pV3, *pV1); + return true; + } + + if ((v3z) || e13) + { + *pV3 = XMVector3Cross(*pV1, *pV2); + return true; + } + + return true; + } + + //----------------------------------------------------------------------------- + inline bool CalculateEigenVectorsFromCovarianceMatrix(_In_ float Cxx, _In_ float Cyy, _In_ float Czz, + _In_ float Cxy, _In_ float Cxz, _In_ float Cyz, + _Out_ XMVECTOR* pV1, _Out_ XMVECTOR* pV2, _Out_ XMVECTOR* pV3) noexcept + { + // Calculate the eigenvalues by solving a cubic equation. 
+ float e = -(Cxx + Cyy + Czz); + float f = Cxx * Cyy + Cyy * Czz + Czz * Cxx - Cxy * Cxy - Cxz * Cxz - Cyz * Cyz; + float g = Cxy * Cxy * Czz + Cxz * Cxz * Cyy + Cyz * Cyz * Cxx - Cxy * Cyz * Cxz * 2.0f - Cxx * Cyy * Czz; + + float ev1, ev2, ev3; + if (!DirectX::Internal::SolveCubic(e, f, g, &ev1, &ev2, &ev3)) + { + // set them to arbitrary orthonormal basis set + *pV1 = g_XMIdentityR0.v; + *pV2 = g_XMIdentityR1.v; + *pV3 = g_XMIdentityR2.v; + return false; + } + + return DirectX::Internal::CalculateEigenVectors(Cxx, Cxy, Cxz, Cyy, Cyz, Czz, ev1, ev2, ev3, pV1, pV2, pV3); + } + + //----------------------------------------------------------------------------- + inline void XM_CALLCONV FastIntersectTrianglePlane( + FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2, + GXMVECTOR Plane, + XMVECTOR& Outside, XMVECTOR& Inside) noexcept + { + // Plane0 + XMVECTOR Dist0 = XMVector4Dot(V0, Plane); + XMVECTOR Dist1 = XMVector4Dot(V1, Plane); + XMVECTOR Dist2 = XMVector4Dot(V2, Plane); + + XMVECTOR MinDist = XMVectorMin(Dist0, Dist1); + MinDist = XMVectorMin(MinDist, Dist2); + + XMVECTOR MaxDist = XMVectorMax(Dist0, Dist1); + MaxDist = XMVectorMax(MaxDist, Dist2); + + XMVECTOR Zero = XMVectorZero(); + + // Outside the plane? + Outside = XMVectorGreater(MinDist, Zero); + + // Fully inside the plane? + Inside = XMVectorLess(MaxDist, Zero); + } + + //----------------------------------------------------------------------------- + inline void FastIntersectSpherePlane(_In_ FXMVECTOR Center, _In_ FXMVECTOR Radius, _In_ FXMVECTOR Plane, + _Out_ XMVECTOR& Outside, _Out_ XMVECTOR& Inside) noexcept + { + XMVECTOR Dist = XMVector4Dot(Center, Plane); + + // Outside the plane? + Outside = XMVectorGreater(Dist, Radius); + + // Fully inside the plane? + Inside = XMVectorLess(Dist, XMVectorNegate(Radius)); + } + + //----------------------------------------------------------------------------- + inline void FastIntersectAxisAlignedBoxPlane(_In_ FXMVECTOR Center, _In_ FXMVECTOR Extents, _In_ FXMVECTOR Plane, + _Out_ XMVECTOR& Outside, _Out_ XMVECTOR& Inside) noexcept + { + // Compute the distance to the center of the box. + XMVECTOR Dist = XMVector4Dot(Center, Plane); + + // Project the axes of the box onto the normal of the plane. Half the + // length of the projection (sometime called the "radius") is equal to + // h(u) * abs(n dot b(u))) + h(v) * abs(n dot b(v)) + h(w) * abs(n dot b(w)) + // where h(i) are extents of the box, n is the plane normal, and b(i) are the + // axes of the box. In this case b(i) = [(1,0,0), (0,1,0), (0,0,1)]. + XMVECTOR Radius = XMVector3Dot(Extents, XMVectorAbs(Plane)); + + // Outside the plane? + Outside = XMVectorGreater(Dist, Radius); + + // Fully inside the plane? + Inside = XMVectorLess(Dist, XMVectorNegate(Radius)); + } + + //----------------------------------------------------------------------------- + inline void XM_CALLCONV FastIntersectOrientedBoxPlane( + _In_ FXMVECTOR Center, _In_ FXMVECTOR Extents, _In_ FXMVECTOR Axis0, + _In_ GXMVECTOR Axis1, + _In_ HXMVECTOR Axis2, _In_ HXMVECTOR Plane, + _Out_ XMVECTOR& Outside, _Out_ XMVECTOR& Inside) noexcept + { + // Compute the distance to the center of the box. + XMVECTOR Dist = XMVector4Dot(Center, Plane); + + // Project the axes of the box onto the normal of the plane. Half the + // length of the projection (sometime called the "radius") is equal to + // h(u) * abs(n dot b(u))) + h(v) * abs(n dot b(v)) + h(w) * abs(n dot b(w)) + // where h(i) are extents of the box, n is the plane normal, and b(i) are the + // axes of the box. 
+ XMVECTOR Radius = XMVector3Dot(Plane, Axis0); + Radius = XMVectorInsert<0, 0, 1, 0, 0>(Radius, XMVector3Dot(Plane, Axis1)); + Radius = XMVectorInsert<0, 0, 0, 1, 0>(Radius, XMVector3Dot(Plane, Axis2)); + Radius = XMVector3Dot(Extents, XMVectorAbs(Radius)); + + // Outside the plane? + Outside = XMVectorGreater(Dist, Radius); + + // Fully inside the plane? + Inside = XMVectorLess(Dist, XMVectorNegate(Radius)); + } + + //----------------------------------------------------------------------------- + inline void XM_CALLCONV FastIntersectFrustumPlane( + _In_ FXMVECTOR Point0, _In_ FXMVECTOR Point1, _In_ FXMVECTOR Point2, + _In_ GXMVECTOR Point3, + _In_ HXMVECTOR Point4, _In_ HXMVECTOR Point5, + _In_ CXMVECTOR Point6, _In_ CXMVECTOR Point7, _In_ CXMVECTOR Plane, + _Out_ XMVECTOR& Outside, _Out_ XMVECTOR& Inside) noexcept + { + // Find the min/max projection of the frustum onto the plane normal. + XMVECTOR Min, Max, Dist; + + Min = Max = XMVector3Dot(Plane, Point0); + + Dist = XMVector3Dot(Plane, Point1); + Min = XMVectorMin(Min, Dist); + Max = XMVectorMax(Max, Dist); + + Dist = XMVector3Dot(Plane, Point2); + Min = XMVectorMin(Min, Dist); + Max = XMVectorMax(Max, Dist); + + Dist = XMVector3Dot(Plane, Point3); + Min = XMVectorMin(Min, Dist); + Max = XMVectorMax(Max, Dist); + + Dist = XMVector3Dot(Plane, Point4); + Min = XMVectorMin(Min, Dist); + Max = XMVectorMax(Max, Dist); + + Dist = XMVector3Dot(Plane, Point5); + Min = XMVectorMin(Min, Dist); + Max = XMVectorMax(Max, Dist); + + Dist = XMVector3Dot(Plane, Point6); + Min = XMVectorMin(Min, Dist); + Max = XMVectorMax(Max, Dist); + + Dist = XMVector3Dot(Plane, Point7); + Min = XMVectorMin(Min, Dist); + Max = XMVectorMax(Max, Dist); + + XMVECTOR PlaneDist = XMVectorNegate(XMVectorSplatW(Plane)); + + // Outside the plane? + Outside = XMVectorGreater(Min, PlaneDist); + + // Fully inside the plane? + Inside = XMVectorLess(Max, PlaneDist); + } + +} // namespace Internal + + +/**************************************************************************** + * + * BoundingSphere + * + ****************************************************************************/ + + //----------------------------------------------------------------------------- + // Transform a sphere by an angle preserving transform. + //----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline void XM_CALLCONV BoundingSphere::Transform(BoundingSphere& Out, FXMMATRIX M) const noexcept +{ + // Load the center of the sphere. + XMVECTOR vCenter = XMLoadFloat3(&Center); + + // Transform the center of the sphere. + XMVECTOR C = XMVector3Transform(vCenter, M); + + XMVECTOR dX = XMVector3Dot(M.r[0], M.r[0]); + XMVECTOR dY = XMVector3Dot(M.r[1], M.r[1]); + XMVECTOR dZ = XMVector3Dot(M.r[2], M.r[2]); + + XMVECTOR d = XMVectorMax(dX, XMVectorMax(dY, dZ)); + + // Store the center sphere. + XMStoreFloat3(&Out.Center, C); + + // Scale the radius of the pshere. + float Scale = sqrtf(XMVectorGetX(d)); + Out.Radius = Radius * Scale; +} + +_Use_decl_annotations_ +inline void XM_CALLCONV BoundingSphere::Transform(BoundingSphere& Out, float Scale, FXMVECTOR Rotation, FXMVECTOR Translation) const noexcept +{ + // Load the center of the sphere. + XMVECTOR vCenter = XMLoadFloat3(&Center); + + // Transform the center of the sphere. + vCenter = XMVectorAdd(XMVector3Rotate(XMVectorScale(vCenter, Scale), Rotation), Translation); + + // Store the center sphere. + XMStoreFloat3(&Out.Center, vCenter); + + // Scale the radius of the pshere. 
+ Out.Radius = Radius * Scale; +} + + +//----------------------------------------------------------------------------- +// Point in sphere test. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType XM_CALLCONV BoundingSphere::Contains(FXMVECTOR Point) const noexcept +{ + XMVECTOR vCenter = XMLoadFloat3(&Center); + XMVECTOR vRadius = XMVectorReplicatePtr(&Radius); + + XMVECTOR DistanceSquared = XMVector3LengthSq(XMVectorSubtract(Point, vCenter)); + XMVECTOR RadiusSquared = XMVectorMultiply(vRadius, vRadius); + + return XMVector3LessOrEqual(DistanceSquared, RadiusSquared) ? CONTAINS : DISJOINT; +} + + +//----------------------------------------------------------------------------- +// Triangle in sphere test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType XM_CALLCONV BoundingSphere::Contains(FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2) const noexcept +{ + if (!Intersects(V0, V1, V2)) + return DISJOINT; + + XMVECTOR vCenter = XMLoadFloat3(&Center); + XMVECTOR vRadius = XMVectorReplicatePtr(&Radius); + XMVECTOR RadiusSquared = XMVectorMultiply(vRadius, vRadius); + + XMVECTOR DistanceSquared = XMVector3LengthSq(XMVectorSubtract(V0, vCenter)); + XMVECTOR Inside = XMVectorLessOrEqual(DistanceSquared, RadiusSquared); + + DistanceSquared = XMVector3LengthSq(XMVectorSubtract(V1, vCenter)); + Inside = XMVectorAndInt(Inside, XMVectorLessOrEqual(DistanceSquared, RadiusSquared)); + + DistanceSquared = XMVector3LengthSq(XMVectorSubtract(V2, vCenter)); + Inside = XMVectorAndInt(Inside, XMVectorLessOrEqual(DistanceSquared, RadiusSquared)); + + return (XMVector3EqualInt(Inside, XMVectorTrueInt())) ? CONTAINS : INTERSECTS; +} + + +//----------------------------------------------------------------------------- +// Sphere in sphere test. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingSphere::Contains(const BoundingSphere& sh) const noexcept +{ + XMVECTOR Center1 = XMLoadFloat3(&Center); + float r1 = Radius; + + XMVECTOR Center2 = XMLoadFloat3(&sh.Center); + float r2 = sh.Radius; + + XMVECTOR V = XMVectorSubtract(Center2, Center1); + + XMVECTOR Dist = XMVector3Length(V); + + float d = XMVectorGetX(Dist); + + return (r1 + r2 >= d) ? ((r1 - r2 >= d) ? CONTAINS : INTERSECTS) : DISJOINT; +} + + +//----------------------------------------------------------------------------- +// Axis-aligned box in sphere test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingSphere::Contains(const BoundingBox& box) const noexcept +{ + if (!box.Intersects(*this)) + return DISJOINT; + + XMVECTOR vCenter = XMLoadFloat3(&Center); + XMVECTOR vRadius = XMVectorReplicatePtr(&Radius); + XMVECTOR RadiusSq = XMVectorMultiply(vRadius, vRadius); + + XMVECTOR boxCenter = XMLoadFloat3(&box.Center); + XMVECTOR boxExtents = XMLoadFloat3(&box.Extents); + + XMVECTOR InsideAll = XMVectorTrueInt(); + + XMVECTOR offset = XMVectorSubtract(boxCenter, vCenter); + + for (size_t i = 0; i < BoundingBox::CORNER_COUNT; ++i) + { + XMVECTOR C = XMVectorMultiplyAdd(boxExtents, g_BoxOffset[i], offset); + XMVECTOR d = XMVector3LengthSq(C); + InsideAll = XMVectorAndInt(InsideAll, XMVectorLessOrEqual(d, RadiusSq)); + } + + return (XMVector3EqualInt(InsideAll, XMVectorTrueInt())) ? 
CONTAINS : INTERSECTS; +} + + +//----------------------------------------------------------------------------- +// Oriented box in sphere test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingSphere::Contains(const BoundingOrientedBox& box) const noexcept +{ + if (!box.Intersects(*this)) + return DISJOINT; + + XMVECTOR vCenter = XMLoadFloat3(&Center); + XMVECTOR vRadius = XMVectorReplicatePtr(&Radius); + XMVECTOR RadiusSq = XMVectorMultiply(vRadius, vRadius); + + XMVECTOR boxCenter = XMLoadFloat3(&box.Center); + XMVECTOR boxExtents = XMLoadFloat3(&box.Extents); + XMVECTOR boxOrientation = XMLoadFloat4(&box.Orientation); + + assert(DirectX::Internal::XMQuaternionIsUnit(boxOrientation)); + + XMVECTOR InsideAll = XMVectorTrueInt(); + + for (size_t i = 0; i < BoundingOrientedBox::CORNER_COUNT; ++i) + { + XMVECTOR C = XMVectorAdd(XMVector3Rotate(XMVectorMultiply(boxExtents, g_BoxOffset[i]), boxOrientation), boxCenter); + XMVECTOR d = XMVector3LengthSq(XMVectorSubtract(vCenter, C)); + InsideAll = XMVectorAndInt(InsideAll, XMVectorLessOrEqual(d, RadiusSq)); + } + + return (XMVector3EqualInt(InsideAll, XMVectorTrueInt())) ? CONTAINS : INTERSECTS; + +} + + +//----------------------------------------------------------------------------- +// Frustum in sphere test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingSphere::Contains(const BoundingFrustum& fr) const noexcept +{ + if (!fr.Intersects(*this)) + return DISJOINT; + + XMVECTOR vCenter = XMLoadFloat3(&Center); + XMVECTOR vRadius = XMVectorReplicatePtr(&Radius); + XMVECTOR RadiusSq = XMVectorMultiply(vRadius, vRadius); + + XMVECTOR vOrigin = XMLoadFloat3(&fr.Origin); + XMVECTOR vOrientation = XMLoadFloat4(&fr.Orientation); + + assert(DirectX::Internal::XMQuaternionIsUnit(vOrientation)); + + // Build the corners of the frustum. + XMVECTOR vRightTop = XMVectorSet(fr.RightSlope, fr.TopSlope, 1.0f, 0.0f); + XMVECTOR vRightBottom = XMVectorSet(fr.RightSlope, fr.BottomSlope, 1.0f, 0.0f); + XMVECTOR vLeftTop = XMVectorSet(fr.LeftSlope, fr.TopSlope, 1.0f, 0.0f); + XMVECTOR vLeftBottom = XMVectorSet(fr.LeftSlope, fr.BottomSlope, 1.0f, 0.0f); + XMVECTOR vNear = XMVectorReplicatePtr(&fr.Near); + XMVECTOR vFar = XMVectorReplicatePtr(&fr.Far); + + XMVECTOR Corners[BoundingFrustum::CORNER_COUNT]; + Corners[0] = XMVectorMultiply(vRightTop, vNear); + Corners[1] = XMVectorMultiply(vRightBottom, vNear); + Corners[2] = XMVectorMultiply(vLeftTop, vNear); + Corners[3] = XMVectorMultiply(vLeftBottom, vNear); + Corners[4] = XMVectorMultiply(vRightTop, vFar); + Corners[5] = XMVectorMultiply(vRightBottom, vFar); + Corners[6] = XMVectorMultiply(vLeftTop, vFar); + Corners[7] = XMVectorMultiply(vLeftBottom, vFar); + + XMVECTOR InsideAll = XMVectorTrueInt(); + for (size_t i = 0; i < BoundingFrustum::CORNER_COUNT; ++i) + { + XMVECTOR C = XMVectorAdd(XMVector3Rotate(Corners[i], vOrientation), vOrigin); + XMVECTOR d = XMVector3LengthSq(XMVectorSubtract(vCenter, C)); + InsideAll = XMVectorAndInt(InsideAll, XMVectorLessOrEqual(d, RadiusSq)); + } + + return (XMVector3EqualInt(InsideAll, XMVectorTrueInt())) ? CONTAINS : INTERSECTS; +} + + +//----------------------------------------------------------------------------- +// Sphere vs. sphere test. 
+//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingSphere::Intersects(const BoundingSphere& sh) const noexcept +{ + // Load A. + XMVECTOR vCenterA = XMLoadFloat3(&Center); + XMVECTOR vRadiusA = XMVectorReplicatePtr(&Radius); + + // Load B. + XMVECTOR vCenterB = XMLoadFloat3(&sh.Center); + XMVECTOR vRadiusB = XMVectorReplicatePtr(&sh.Radius); + + // Distance squared between centers. + XMVECTOR Delta = XMVectorSubtract(vCenterB, vCenterA); + XMVECTOR DistanceSquared = XMVector3LengthSq(Delta); + + // Sum of the radii squared. + XMVECTOR RadiusSquared = XMVectorAdd(vRadiusA, vRadiusB); + RadiusSquared = XMVectorMultiply(RadiusSquared, RadiusSquared); + + return XMVector3LessOrEqual(DistanceSquared, RadiusSquared); +} + + +//----------------------------------------------------------------------------- +// Box vs. sphere test. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingSphere::Intersects(const BoundingBox& box) const noexcept +{ + return box.Intersects(*this); +} + +_Use_decl_annotations_ +inline bool BoundingSphere::Intersects(const BoundingOrientedBox& box) const noexcept +{ + return box.Intersects(*this); +} + + +//----------------------------------------------------------------------------- +// Frustum vs. sphere test. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingSphere::Intersects(const BoundingFrustum& fr) const noexcept +{ + return fr.Intersects(*this); +} + + +//----------------------------------------------------------------------------- +// Triangle vs sphere test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool XM_CALLCONV BoundingSphere::Intersects(FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2) const noexcept +{ + // Load the sphere. + XMVECTOR vCenter = XMLoadFloat3(&Center); + XMVECTOR vRadius = XMVectorReplicatePtr(&Radius); + + // Compute the plane of the triangle (has to be normalized). + XMVECTOR N = XMVector3Normalize(XMVector3Cross(XMVectorSubtract(V1, V0), XMVectorSubtract(V2, V0))); + + // Assert that the triangle is not degenerate. + assert(!XMVector3Equal(N, XMVectorZero())); + + // Find the nearest feature on the triangle to the sphere. + XMVECTOR Dist = XMVector3Dot(XMVectorSubtract(vCenter, V0), N); + + // If the center of the sphere is farther from the plane of the triangle than + // the radius of the sphere, then there cannot be an intersection. + XMVECTOR NoIntersection = XMVectorLess(Dist, XMVectorNegate(vRadius)); + NoIntersection = XMVectorOrInt(NoIntersection, XMVectorGreater(Dist, vRadius)); + + // Project the center of the sphere onto the plane of the triangle. + XMVECTOR Point = XMVectorNegativeMultiplySubtract(N, Dist, vCenter); + + // Is it inside all the edges? If so we intersect because the distance + // to the plane is less than the radius. + XMVECTOR Intersection = DirectX::Internal::PointOnPlaneInsideTriangle(Point, V0, V1, V2); + + // Find the nearest point on each edge. + XMVECTOR RadiusSq = XMVectorMultiply(vRadius, vRadius); + + // Edge 0,1 + Point = DirectX::Internal::PointOnLineSegmentNearestPoint(V0, V1, vCenter); + + // If the distance to the center of the sphere to the point is less than + // the radius of the sphere then it must intersect. 
+ Intersection = XMVectorOrInt(Intersection, XMVectorLessOrEqual(XMVector3LengthSq(XMVectorSubtract(vCenter, Point)), RadiusSq)); + + // Edge 1,2 + Point = DirectX::Internal::PointOnLineSegmentNearestPoint(V1, V2, vCenter); + + // If the distance to the center of the sphere to the point is less than + // the radius of the sphere then it must intersect. + Intersection = XMVectorOrInt(Intersection, XMVectorLessOrEqual(XMVector3LengthSq(XMVectorSubtract(vCenter, Point)), RadiusSq)); + + // Edge 2,0 + Point = DirectX::Internal::PointOnLineSegmentNearestPoint(V2, V0, vCenter); + + // If the distance to the center of the sphere to the point is less than + // the radius of the sphere then it must intersect. + Intersection = XMVectorOrInt(Intersection, XMVectorLessOrEqual(XMVector3LengthSq(XMVectorSubtract(vCenter, Point)), RadiusSq)); + + return XMVector4EqualInt(XMVectorAndCInt(Intersection, NoIntersection), XMVectorTrueInt()); +} + + +//----------------------------------------------------------------------------- +// Sphere-plane intersection +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline PlaneIntersectionType XM_CALLCONV BoundingSphere::Intersects(FXMVECTOR Plane) const noexcept +{ + assert(DirectX::Internal::XMPlaneIsUnit(Plane)); + + // Load the sphere. + XMVECTOR vCenter = XMLoadFloat3(&Center); + XMVECTOR vRadius = XMVectorReplicatePtr(&Radius); + + // Set w of the center to one so we can dot4 with a plane. + vCenter = XMVectorInsert<0, 0, 0, 0, 1>(vCenter, XMVectorSplatOne()); + + XMVECTOR Outside, Inside; + DirectX::Internal::FastIntersectSpherePlane(vCenter, vRadius, Plane, Outside, Inside); + + // If the sphere is outside any plane it is outside. + if (XMVector4EqualInt(Outside, XMVectorTrueInt())) + return FRONT; + + // If the sphere is inside all planes it is inside. + if (XMVector4EqualInt(Inside, XMVectorTrueInt())) + return BACK; + + // The sphere is not inside all planes or outside a plane it intersects. + return INTERSECTING; +} + + +//----------------------------------------------------------------------------- +// Compute the intersection of a ray (Origin, Direction) with a sphere. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool XM_CALLCONV BoundingSphere::Intersects(FXMVECTOR Origin, FXMVECTOR Direction, float& Dist) const noexcept +{ + assert(DirectX::Internal::XMVector3IsUnit(Direction)); + + XMVECTOR vCenter = XMLoadFloat3(&Center); + XMVECTOR vRadius = XMVectorReplicatePtr(&Radius); + + // l is the vector from the ray origin to the center of the sphere. + XMVECTOR l = XMVectorSubtract(vCenter, Origin); + + // s is the projection of the l onto the ray direction. + XMVECTOR s = XMVector3Dot(l, Direction); + + XMVECTOR l2 = XMVector3Dot(l, l); + + XMVECTOR r2 = XMVectorMultiply(vRadius, vRadius); + + // m2 is squared distance from the center of the sphere to the projection. + XMVECTOR m2 = XMVectorNegativeMultiplySubtract(s, s, l2); + + XMVECTOR NoIntersection; + + // If the ray origin is outside the sphere and the center of the sphere is + // behind the ray origin there is no intersection. + NoIntersection = XMVectorAndInt(XMVectorLess(s, XMVectorZero()), XMVectorGreater(l2, r2)); + + // If the squared distance from the center of the sphere to the projection + // is greater than the radius squared the ray will miss the sphere. 
+ NoIntersection = XMVectorOrInt(NoIntersection, XMVectorGreater(m2, r2)); + + // The ray hits the sphere, compute the nearest intersection point. + XMVECTOR q = XMVectorSqrt(XMVectorSubtract(r2, m2)); + XMVECTOR t1 = XMVectorSubtract(s, q); + XMVECTOR t2 = XMVectorAdd(s, q); + + XMVECTOR OriginInside = XMVectorLessOrEqual(l2, r2); + XMVECTOR t = XMVectorSelect(t1, t2, OriginInside); + + if (XMVector4NotEqualInt(NoIntersection, XMVectorTrueInt())) + { + // Store the x-component to *pDist. + XMStoreFloat(&Dist, t); + return true; + } + + Dist = 0.f; + return false; +} + + +//----------------------------------------------------------------------------- +// Test a sphere vs 6 planes (typically forming a frustum). +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType XM_CALLCONV BoundingSphere::ContainedBy( + FXMVECTOR Plane0, FXMVECTOR Plane1, FXMVECTOR Plane2, + GXMVECTOR Plane3, + HXMVECTOR Plane4, HXMVECTOR Plane5) const noexcept +{ + // Load the sphere. + XMVECTOR vCenter = XMLoadFloat3(&Center); + XMVECTOR vRadius = XMVectorReplicatePtr(&Radius); + + // Set w of the center to one so we can dot4 with a plane. + vCenter = XMVectorInsert<0, 0, 0, 0, 1>(vCenter, XMVectorSplatOne()); + + XMVECTOR Outside, Inside; + + // Test against each plane. + DirectX::Internal::FastIntersectSpherePlane(vCenter, vRadius, Plane0, Outside, Inside); + + XMVECTOR AnyOutside = Outside; + XMVECTOR AllInside = Inside; + + DirectX::Internal::FastIntersectSpherePlane(vCenter, vRadius, Plane1, Outside, Inside); + AnyOutside = XMVectorOrInt(AnyOutside, Outside); + AllInside = XMVectorAndInt(AllInside, Inside); + + DirectX::Internal::FastIntersectSpherePlane(vCenter, vRadius, Plane2, Outside, Inside); + AnyOutside = XMVectorOrInt(AnyOutside, Outside); + AllInside = XMVectorAndInt(AllInside, Inside); + + DirectX::Internal::FastIntersectSpherePlane(vCenter, vRadius, Plane3, Outside, Inside); + AnyOutside = XMVectorOrInt(AnyOutside, Outside); + AllInside = XMVectorAndInt(AllInside, Inside); + + DirectX::Internal::FastIntersectSpherePlane(vCenter, vRadius, Plane4, Outside, Inside); + AnyOutside = XMVectorOrInt(AnyOutside, Outside); + AllInside = XMVectorAndInt(AllInside, Inside); + + DirectX::Internal::FastIntersectSpherePlane(vCenter, vRadius, Plane5, Outside, Inside); + AnyOutside = XMVectorOrInt(AnyOutside, Outside); + AllInside = XMVectorAndInt(AllInside, Inside); + + // If the sphere is outside any plane it is outside. + if (XMVector4EqualInt(AnyOutside, XMVectorTrueInt())) + return DISJOINT; + + // If the sphere is inside all planes it is inside. + if (XMVector4EqualInt(AllInside, XMVectorTrueInt())) + return CONTAINS; + + // The sphere is not inside all planes or outside a plane, it may intersect. 
+ return INTERSECTS;
+}
+
+
+//-----------------------------------------------------------------------------
+// Creates a bounding sphere that contains two other bounding spheres
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void BoundingSphere::CreateMerged(BoundingSphere& Out, const BoundingSphere& S1, const BoundingSphere& S2) noexcept
+{
+ XMVECTOR Center1 = XMLoadFloat3(&S1.Center);
+ float r1 = S1.Radius;
+
+ XMVECTOR Center2 = XMLoadFloat3(&S2.Center);
+ float r2 = S2.Radius;
+
+ XMVECTOR V = XMVectorSubtract(Center2, Center1);
+
+ XMVECTOR Dist = XMVector3Length(V);
+
+ float d = XMVectorGetX(Dist);
+
+ if (r1 + r2 >= d)
+ {
+ if (r1 - r2 >= d)
+ {
+ Out = S1;
+ return;
+ }
+ else if (r2 - r1 >= d)
+ {
+ Out = S2;
+ return;
+ }
+ }
+
+ XMVECTOR N = XMVectorDivide(V, Dist);
+
+ float t1 = XMMin(-r1, d - r2);
+ float t2 = XMMax(r1, d + r2);
+ float t_5 = (t2 - t1) * 0.5f;
+
+ XMVECTOR NCenter = XMVectorAdd(Center1, XMVectorMultiply(N, XMVectorReplicate(t_5 + t1)));
+
+ XMStoreFloat3(&Out.Center, NCenter);
+ Out.Radius = t_5;
+}
+
+
+//-----------------------------------------------------------------------------
+// Create sphere enscribing bounding box
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void BoundingSphere::CreateFromBoundingBox(BoundingSphere& Out, const BoundingBox& box) noexcept
+{
+ Out.Center = box.Center;
+ XMVECTOR vExtents = XMLoadFloat3(&box.Extents);
+ Out.Radius = XMVectorGetX(XMVector3Length(vExtents));
+}
+
+_Use_decl_annotations_
+inline void BoundingSphere::CreateFromBoundingBox(BoundingSphere& Out, const BoundingOrientedBox& box) noexcept
+{
+ // Bounding box orientation is irrelevant because a sphere is rotationally invariant
+ Out.Center = box.Center;
+ XMVECTOR vExtents = XMLoadFloat3(&box.Extents);
+ Out.Radius = XMVectorGetX(XMVector3Length(vExtents));
+}
+
+
+//-----------------------------------------------------------------------------
+// Find the approximate smallest enclosing bounding sphere for a set of
+// points. Exact computation of the smallest enclosing bounding sphere is
+// possible but is slower and requires a more complex algorithm.
+// The algorithm is based on Jack Ritter, "An Efficient Bounding Sphere",
+// Graphics Gems.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void BoundingSphere::CreateFromPoints(BoundingSphere& Out, size_t Count, const XMFLOAT3* pPoints, size_t Stride) noexcept
+{
+ assert(Count > 0);
+ assert(pPoints);
+
+ // Find the points with minimum and maximum x, y, and z
+ XMVECTOR MinX, MaxX, MinY, MaxY, MinZ, MaxZ;
+
+ MinX = MaxX = MinY = MaxY = MinZ = MaxZ = XMLoadFloat3(pPoints);
+
+ for (size_t i = 1; i < Count; ++i)
+ {
+ XMVECTOR Point = XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(reinterpret_cast<const uint8_t*>(pPoints) + i * Stride));
+
+ float px = XMVectorGetX(Point);
+ float py = XMVectorGetY(Point);
+ float pz = XMVectorGetZ(Point);
+
+ if (px < XMVectorGetX(MinX))
+ MinX = Point;
+
+ if (px > XMVectorGetX(MaxX))
+ MaxX = Point;
+
+ if (py < XMVectorGetY(MinY))
+ MinY = Point;
+
+ if (py > XMVectorGetY(MaxY))
+ MaxY = Point;
+
+ if (pz < XMVectorGetZ(MinZ))
+ MinZ = Point;
+
+ if (pz > XMVectorGetZ(MaxZ))
+ MaxZ = Point;
+ }
+
+ // Use the min/max pair that are farthest apart to form the initial sphere.
+ XMVECTOR DeltaX = XMVectorSubtract(MaxX, MinX);
+ XMVECTOR DistX = XMVector3Length(DeltaX);
+
+ XMVECTOR DeltaY = XMVectorSubtract(MaxY, MinY);
+ XMVECTOR DistY = XMVector3Length(DeltaY);
+
+ XMVECTOR DeltaZ = XMVectorSubtract(MaxZ, MinZ);
+ XMVECTOR DistZ = XMVector3Length(DeltaZ);
+
+ XMVECTOR vCenter;
+ XMVECTOR vRadius;
+
+ if (XMVector3Greater(DistX, DistY))
+ {
+ if (XMVector3Greater(DistX, DistZ))
+ {
+ // Use min/max x.
+ vCenter = XMVectorLerp(MaxX, MinX, 0.5f);
+ vRadius = XMVectorScale(DistX, 0.5f);
+ }
+ else
+ {
+ // Use min/max z.
+ vCenter = XMVectorLerp(MaxZ, MinZ, 0.5f);
+ vRadius = XMVectorScale(DistZ, 0.5f);
+ }
+ }
+ else // Y >= X
+ {
+ if (XMVector3Greater(DistY, DistZ))
+ {
+ // Use min/max y.
+ vCenter = XMVectorLerp(MaxY, MinY, 0.5f);
+ vRadius = XMVectorScale(DistY, 0.5f);
+ }
+ else
+ {
+ // Use min/max z.
+ vCenter = XMVectorLerp(MaxZ, MinZ, 0.5f);
+ vRadius = XMVectorScale(DistZ, 0.5f);
+ }
+ }
+
+ // Add any points not inside the sphere.
+ for (size_t i = 0; i < Count; ++i)
+ {
+ XMVECTOR Point = XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(reinterpret_cast<const uint8_t*>(pPoints) + i * Stride));
+
+ XMVECTOR Delta = XMVectorSubtract(Point, vCenter);
+
+ XMVECTOR Dist = XMVector3Length(Delta);
+
+ if (XMVector3Greater(Dist, vRadius))
+ {
+ // Adjust sphere to include the new point.
+ vRadius = XMVectorScale(XMVectorAdd(vRadius, Dist), 0.5f);
+ vCenter = XMVectorAdd(vCenter, XMVectorMultiply(XMVectorSubtract(XMVectorReplicate(1.0f), XMVectorDivide(vRadius, Dist)), Delta));
+ }
+ }
+
+ XMStoreFloat3(&Out.Center, vCenter);
+ XMStoreFloat(&Out.Radius, vRadius);
+}
+
+
+//-----------------------------------------------------------------------------
+// Create sphere containing frustum
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void BoundingSphere::CreateFromFrustum(BoundingSphere& Out, const BoundingFrustum& fr) noexcept
+{
+ XMFLOAT3 Corners[BoundingFrustum::CORNER_COUNT];
+ fr.GetCorners(Corners);
+ CreateFromPoints(Out, BoundingFrustum::CORNER_COUNT, Corners, sizeof(XMFLOAT3));
+}
+
+
+/****************************************************************************
+ *
+ * BoundingBox
+ *
+ ****************************************************************************/
+
+ //-----------------------------------------------------------------------------
+ // Transform an axis aligned box by an angle preserving transform.
+ //-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV BoundingBox::Transform(BoundingBox& Out, FXMMATRIX M) const noexcept
+{
+ // Load center and extents.
+ XMVECTOR vCenter = XMLoadFloat3(&Center);
+ XMVECTOR vExtents = XMLoadFloat3(&Extents);
+
+ // Compute and transform the corners and find new min/max bounds.
+ XMVECTOR Corner = XMVectorMultiplyAdd(vExtents, g_BoxOffset[0], vCenter);
+ Corner = XMVector3Transform(Corner, M);
+
+ XMVECTOR Min, Max;
+ Min = Max = Corner;
+
+ for (size_t i = 1; i < CORNER_COUNT; ++i)
+ {
+ Corner = XMVectorMultiplyAdd(vExtents, g_BoxOffset[i], vCenter);
+ Corner = XMVector3Transform(Corner, M);
+
+ Min = XMVectorMin(Min, Corner);
+ Max = XMVectorMax(Max, Corner);
+ }
+
+ // Store center and extents.
+ XMStoreFloat3(&Out.Center, XMVectorScale(XMVectorAdd(Min, Max), 0.5f)); + XMStoreFloat3(&Out.Extents, XMVectorScale(XMVectorSubtract(Max, Min), 0.5f)); +} + +_Use_decl_annotations_ +inline void XM_CALLCONV BoundingBox::Transform(BoundingBox& Out, float Scale, FXMVECTOR Rotation, FXMVECTOR Translation) const noexcept +{ + assert(DirectX::Internal::XMQuaternionIsUnit(Rotation)); + + // Load center and extents. + XMVECTOR vCenter = XMLoadFloat3(&Center); + XMVECTOR vExtents = XMLoadFloat3(&Extents); + + XMVECTOR VectorScale = XMVectorReplicate(Scale); + + // Compute and transform the corners and find new min/max bounds. + XMVECTOR Corner = XMVectorMultiplyAdd(vExtents, g_BoxOffset[0], vCenter); + Corner = XMVectorAdd(XMVector3Rotate(XMVectorMultiply(Corner, VectorScale), Rotation), Translation); + + XMVECTOR Min, Max; + Min = Max = Corner; + + for (size_t i = 1; i < CORNER_COUNT; ++i) + { + Corner = XMVectorMultiplyAdd(vExtents, g_BoxOffset[i], vCenter); + Corner = XMVectorAdd(XMVector3Rotate(XMVectorMultiply(Corner, VectorScale), Rotation), Translation); + + Min = XMVectorMin(Min, Corner); + Max = XMVectorMax(Max, Corner); + } + + // Store center and extents. + XMStoreFloat3(&Out.Center, XMVectorScale(XMVectorAdd(Min, Max), 0.5f)); + XMStoreFloat3(&Out.Extents, XMVectorScale(XMVectorSubtract(Max, Min), 0.5f)); +} + + +//----------------------------------------------------------------------------- +// Get the corner points of the box +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline void BoundingBox::GetCorners(XMFLOAT3* Corners) const noexcept +{ + assert(Corners != nullptr); + + // Load the box + XMVECTOR vCenter = XMLoadFloat3(&Center); + XMVECTOR vExtents = XMLoadFloat3(&Extents); + + for (size_t i = 0; i < CORNER_COUNT; ++i) + { + XMVECTOR C = XMVectorMultiplyAdd(vExtents, g_BoxOffset[i], vCenter); + XMStoreFloat3(&Corners[i], C); + } +} + + +//----------------------------------------------------------------------------- +// Point in axis-aligned box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType XM_CALLCONV BoundingBox::Contains(FXMVECTOR Point) const noexcept +{ + XMVECTOR vCenter = XMLoadFloat3(&Center); + XMVECTOR vExtents = XMLoadFloat3(&Extents); + + return XMVector3InBounds(XMVectorSubtract(Point, vCenter), vExtents) ? CONTAINS : DISJOINT; +} + + +//----------------------------------------------------------------------------- +// Triangle in axis-aligned box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType XM_CALLCONV BoundingBox::Contains(FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2) const noexcept +{ + if (!Intersects(V0, V1, V2)) + return DISJOINT; + + XMVECTOR vCenter = XMLoadFloat3(&Center); + XMVECTOR vExtents = XMLoadFloat3(&Extents); + + XMVECTOR d = XMVectorAbs(XMVectorSubtract(V0, vCenter)); + XMVECTOR Inside = XMVectorLessOrEqual(d, vExtents); + + d = XMVectorAbs(XMVectorSubtract(V1, vCenter)); + Inside = XMVectorAndInt(Inside, XMVectorLessOrEqual(d, vExtents)); + + d = XMVectorAbs(XMVectorSubtract(V2, vCenter)); + Inside = XMVectorAndInt(Inside, XMVectorLessOrEqual(d, vExtents)); + + return (XMVector3EqualInt(Inside, XMVectorTrueInt())) ? 
CONTAINS : INTERSECTS; +} + + +//----------------------------------------------------------------------------- +// Sphere in axis-aligned box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingBox::Contains(const BoundingSphere& sh) const noexcept +{ + XMVECTOR SphereCenter = XMLoadFloat3(&sh.Center); + XMVECTOR SphereRadius = XMVectorReplicatePtr(&sh.Radius); + + XMVECTOR BoxCenter = XMLoadFloat3(&Center); + XMVECTOR BoxExtents = XMLoadFloat3(&Extents); + + XMVECTOR BoxMin = XMVectorSubtract(BoxCenter, BoxExtents); + XMVECTOR BoxMax = XMVectorAdd(BoxCenter, BoxExtents); + + // Find the distance to the nearest point on the box. + // for each i in (x, y, z) + // if (SphereCenter(i) < BoxMin(i)) d2 += (SphereCenter(i) - BoxMin(i)) ^ 2 + // else if (SphereCenter(i) > BoxMax(i)) d2 += (SphereCenter(i) - BoxMax(i)) ^ 2 + + XMVECTOR d = XMVectorZero(); + + // Compute d for each dimension. + XMVECTOR LessThanMin = XMVectorLess(SphereCenter, BoxMin); + XMVECTOR GreaterThanMax = XMVectorGreater(SphereCenter, BoxMax); + + XMVECTOR MinDelta = XMVectorSubtract(SphereCenter, BoxMin); + XMVECTOR MaxDelta = XMVectorSubtract(SphereCenter, BoxMax); + + // Choose value for each dimension based on the comparison. + d = XMVectorSelect(d, MinDelta, LessThanMin); + d = XMVectorSelect(d, MaxDelta, GreaterThanMax); + + // Use a dot-product to square them and sum them together. + XMVECTOR d2 = XMVector3Dot(d, d); + + if (XMVector3Greater(d2, XMVectorMultiply(SphereRadius, SphereRadius))) + return DISJOINT; + + XMVECTOR InsideAll = XMVectorLessOrEqual(XMVectorAdd(BoxMin, SphereRadius), SphereCenter); + InsideAll = XMVectorAndInt(InsideAll, XMVectorLessOrEqual(SphereCenter, XMVectorSubtract(BoxMax, SphereRadius))); + InsideAll = XMVectorAndInt(InsideAll, XMVectorGreater(XMVectorSubtract(BoxMax, BoxMin), SphereRadius)); + + return (XMVector3EqualInt(InsideAll, XMVectorTrueInt())) ? CONTAINS : INTERSECTS; +} + + +//----------------------------------------------------------------------------- +// Axis-aligned box in axis-aligned box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingBox::Contains(const BoundingBox& box) const noexcept +{ + XMVECTOR CenterA = XMLoadFloat3(&Center); + XMVECTOR ExtentsA = XMLoadFloat3(&Extents); + + XMVECTOR CenterB = XMLoadFloat3(&box.Center); + XMVECTOR ExtentsB = XMLoadFloat3(&box.Extents); + + XMVECTOR MinA = XMVectorSubtract(CenterA, ExtentsA); + XMVECTOR MaxA = XMVectorAdd(CenterA, ExtentsA); + + XMVECTOR MinB = XMVectorSubtract(CenterB, ExtentsB); + XMVECTOR MaxB = XMVectorAdd(CenterB, ExtentsB); + + // for each i in (x, y, z) if a_min(i) > b_max(i) or b_min(i) > a_max(i) then return false + XMVECTOR Disjoint = XMVectorOrInt(XMVectorGreater(MinA, MaxB), XMVectorGreater(MinB, MaxA)); + + if (DirectX::Internal::XMVector3AnyTrue(Disjoint)) + return DISJOINT; + + // for each i in (x, y, z) if a_min(i) <= b_min(i) and b_max(i) <= a_max(i) then A contains B + XMVECTOR Inside = XMVectorAndInt(XMVectorLessOrEqual(MinA, MinB), XMVectorLessOrEqual(MaxB, MaxA)); + + return DirectX::Internal::XMVector3AllTrue(Inside) ? 
CONTAINS : INTERSECTS; +} + + +//----------------------------------------------------------------------------- +// Oriented box in axis-aligned box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingBox::Contains(const BoundingOrientedBox& box) const noexcept +{ + if (!box.Intersects(*this)) + return DISJOINT; + + XMVECTOR vCenter = XMLoadFloat3(&Center); + XMVECTOR vExtents = XMLoadFloat3(&Extents); + + // Subtract off the AABB center to remove a subtract below + XMVECTOR oCenter = XMVectorSubtract(XMLoadFloat3(&box.Center), vCenter); + + XMVECTOR oExtents = XMLoadFloat3(&box.Extents); + XMVECTOR oOrientation = XMLoadFloat4(&box.Orientation); + + assert(DirectX::Internal::XMQuaternionIsUnit(oOrientation)); + + XMVECTOR Inside = XMVectorTrueInt(); + + for (size_t i = 0; i < BoundingOrientedBox::CORNER_COUNT; ++i) + { + XMVECTOR C = XMVectorAdd(XMVector3Rotate(XMVectorMultiply(oExtents, g_BoxOffset[i]), oOrientation), oCenter); + XMVECTOR d = XMVectorAbs(C); + Inside = XMVectorAndInt(Inside, XMVectorLessOrEqual(d, vExtents)); + } + + return (XMVector3EqualInt(Inside, XMVectorTrueInt())) ? CONTAINS : INTERSECTS; +} + + +//----------------------------------------------------------------------------- +// Frustum in axis-aligned box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingBox::Contains(const BoundingFrustum& fr) const noexcept +{ + if (!fr.Intersects(*this)) + return DISJOINT; + + XMFLOAT3 Corners[BoundingFrustum::CORNER_COUNT]; + fr.GetCorners(Corners); + + XMVECTOR vCenter = XMLoadFloat3(&Center); + XMVECTOR vExtents = XMLoadFloat3(&Extents); + + XMVECTOR Inside = XMVectorTrueInt(); + + for (size_t i = 0; i < BoundingFrustum::CORNER_COUNT; ++i) + { + XMVECTOR Point = XMLoadFloat3(&Corners[i]); + XMVECTOR d = XMVectorAbs(XMVectorSubtract(Point, vCenter)); + Inside = XMVectorAndInt(Inside, XMVectorLessOrEqual(d, vExtents)); + } + + return (XMVector3EqualInt(Inside, XMVectorTrueInt())) ? CONTAINS : INTERSECTS; +} + + +//----------------------------------------------------------------------------- +// Sphere vs axis-aligned box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingBox::Intersects(const BoundingSphere& sh) const noexcept +{ + XMVECTOR SphereCenter = XMLoadFloat3(&sh.Center); + XMVECTOR SphereRadius = XMVectorReplicatePtr(&sh.Radius); + + XMVECTOR BoxCenter = XMLoadFloat3(&Center); + XMVECTOR BoxExtents = XMLoadFloat3(&Extents); + + XMVECTOR BoxMin = XMVectorSubtract(BoxCenter, BoxExtents); + XMVECTOR BoxMax = XMVectorAdd(BoxCenter, BoxExtents); + + // Find the distance to the nearest point on the box. + // for each i in (x, y, z) + // if (SphereCenter(i) < BoxMin(i)) d2 += (SphereCenter(i) - BoxMin(i)) ^ 2 + // else if (SphereCenter(i) > BoxMax(i)) d2 += (SphereCenter(i) - BoxMax(i)) ^ 2 + + XMVECTOR d = XMVectorZero(); + + // Compute d for each dimension. + XMVECTOR LessThanMin = XMVectorLess(SphereCenter, BoxMin); + XMVECTOR GreaterThanMax = XMVectorGreater(SphereCenter, BoxMax); + + XMVECTOR MinDelta = XMVectorSubtract(SphereCenter, BoxMin); + XMVECTOR MaxDelta = XMVectorSubtract(SphereCenter, BoxMax); + + // Choose value for each dimension based on the comparison. 
+ d = XMVectorSelect(d, MinDelta, LessThanMin); + d = XMVectorSelect(d, MaxDelta, GreaterThanMax); + + // Use a dot-product to square them and sum them together. + XMVECTOR d2 = XMVector3Dot(d, d); + + return XMVector3LessOrEqual(d2, XMVectorMultiply(SphereRadius, SphereRadius)); +} + + +//----------------------------------------------------------------------------- +// Axis-aligned box vs. axis-aligned box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingBox::Intersects(const BoundingBox& box) const noexcept +{ + XMVECTOR CenterA = XMLoadFloat3(&Center); + XMVECTOR ExtentsA = XMLoadFloat3(&Extents); + + XMVECTOR CenterB = XMLoadFloat3(&box.Center); + XMVECTOR ExtentsB = XMLoadFloat3(&box.Extents); + + XMVECTOR MinA = XMVectorSubtract(CenterA, ExtentsA); + XMVECTOR MaxA = XMVectorAdd(CenterA, ExtentsA); + + XMVECTOR MinB = XMVectorSubtract(CenterB, ExtentsB); + XMVECTOR MaxB = XMVectorAdd(CenterB, ExtentsB); + + // for each i in (x, y, z) if a_min(i) > b_max(i) or b_min(i) > a_max(i) then return false + XMVECTOR Disjoint = XMVectorOrInt(XMVectorGreater(MinA, MaxB), XMVectorGreater(MinB, MaxA)); + + return !DirectX::Internal::XMVector3AnyTrue(Disjoint); +} + + +//----------------------------------------------------------------------------- +// Oriented box vs. axis-aligned box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingBox::Intersects(const BoundingOrientedBox& box) const noexcept +{ + return box.Intersects(*this); +} + + +//----------------------------------------------------------------------------- +// Frustum vs. axis-aligned box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingBox::Intersects(const BoundingFrustum& fr) const noexcept +{ + return fr.Intersects(*this); +} + + +//----------------------------------------------------------------------------- +// Triangle vs. axis aligned box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool XM_CALLCONV BoundingBox::Intersects(FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2) const noexcept +{ + XMVECTOR Zero = XMVectorZero(); + + // Load the box. + XMVECTOR vCenter = XMLoadFloat3(&Center); + XMVECTOR vExtents = XMLoadFloat3(&Extents); + + XMVECTOR BoxMin = XMVectorSubtract(vCenter, vExtents); + XMVECTOR BoxMax = XMVectorAdd(vCenter, vExtents); + + // Test the axes of the box (in effect test the AAB against the minimal AAB + // around the triangle). + XMVECTOR TriMin = XMVectorMin(XMVectorMin(V0, V1), V2); + XMVECTOR TriMax = XMVectorMax(XMVectorMax(V0, V1), V2); + + // for each i in (x, y, z) if a_min(i) > b_max(i) or b_min(i) > a_max(i) then disjoint + XMVECTOR Disjoint = XMVectorOrInt(XMVectorGreater(TriMin, BoxMax), XMVectorGreater(BoxMin, TriMax)); + if (DirectX::Internal::XMVector3AnyTrue(Disjoint)) + return false; + + // Test the plane of the triangle. + XMVECTOR Normal = XMVector3Cross(XMVectorSubtract(V1, V0), XMVectorSubtract(V2, V0)); + XMVECTOR Dist = XMVector3Dot(Normal, V0); + + // Assert that the triangle is not degenerate. 
+ assert(!XMVector3Equal(Normal, Zero));
+
+ // for each i in (x, y, z) if n(i) >= 0 then v_min(i)=b_min(i), v_max(i)=b_max(i)
+ // else v_min(i)=b_max(i), v_max(i)=b_min(i)
+ XMVECTOR NormalSelect = XMVectorGreater(Normal, Zero);
+ XMVECTOR V_Min = XMVectorSelect(BoxMax, BoxMin, NormalSelect);
+ XMVECTOR V_Max = XMVectorSelect(BoxMin, BoxMax, NormalSelect);
+
+ // if n dot v_min + d > 0 || n dot v_max + d < 0 then disjoint
+ XMVECTOR MinDist = XMVector3Dot(V_Min, Normal);
+ XMVECTOR MaxDist = XMVector3Dot(V_Max, Normal);
+
+ XMVECTOR NoIntersection = XMVectorGreater(MinDist, Dist);
+ NoIntersection = XMVectorOrInt(NoIntersection, XMVectorLess(MaxDist, Dist));
+
+ // Move the box center to zero to simplify the following tests.
+ XMVECTOR TV0 = XMVectorSubtract(V0, vCenter);
+ XMVECTOR TV1 = XMVectorSubtract(V1, vCenter);
+ XMVECTOR TV2 = XMVectorSubtract(V2, vCenter);
+
+ // Test the edge/edge axes (3*3).
+ XMVECTOR e0 = XMVectorSubtract(TV1, TV0);
+ XMVECTOR e1 = XMVectorSubtract(TV2, TV1);
+ XMVECTOR e2 = XMVectorSubtract(TV0, TV2);
+
+ // Make w zero.
+ e0 = XMVectorInsert<0, 0, 0, 0, 1>(e0, Zero);
+ e1 = XMVectorInsert<0, 0, 0, 0, 1>(e1, Zero);
+ e2 = XMVectorInsert<0, 0, 0, 0, 1>(e2, Zero);
+
+ XMVECTOR Axis;
+ XMVECTOR p0, p1, p2;
+ XMVECTOR Min, Max;
+ XMVECTOR Radius;
+
+ // Axis == (1,0,0) x e0 = (0, -e0.z, e0.y)
+ Axis = XMVectorPermute<XM_PERMUTE_0W, XM_PERMUTE_1Z, XM_PERMUTE_0Y, XM_PERMUTE_0X>(e0, XMVectorNegate(e0));
+ p0 = XMVector3Dot(TV0, Axis);
+ // p1 = XMVector3Dot( V1, Axis ); // p1 = p0;
+ p2 = XMVector3Dot(TV2, Axis);
+ Min = XMVectorMin(p0, p2);
+ Max = XMVectorMax(p0, p2);
+ Radius = XMVector3Dot(vExtents, XMVectorAbs(Axis));
+ NoIntersection = XMVectorOrInt(NoIntersection, XMVectorGreater(Min, Radius));
+ NoIntersection = XMVectorOrInt(NoIntersection, XMVectorLess(Max, XMVectorNegate(Radius)));
+
+ // Axis == (1,0,0) x e1 = (0, -e1.z, e1.y)
+ Axis = XMVectorPermute<XM_PERMUTE_0W, XM_PERMUTE_1Z, XM_PERMUTE_0Y, XM_PERMUTE_0X>(e1, XMVectorNegate(e1));
+ p0 = XMVector3Dot(TV0, Axis);
+ p1 = XMVector3Dot(TV1, Axis);
+ // p2 = XMVector3Dot( V2, Axis ); // p2 = p1;
+ Min = XMVectorMin(p0, p1);
+ Max = XMVectorMax(p0, p1);
+ Radius = XMVector3Dot(vExtents, XMVectorAbs(Axis));
+ NoIntersection = XMVectorOrInt(NoIntersection, XMVectorGreater(Min, Radius));
+ NoIntersection = XMVectorOrInt(NoIntersection, XMVectorLess(Max, XMVectorNegate(Radius)));
+
+ // Axis == (1,0,0) x e2 = (0, -e2.z, e2.y)
+ Axis = XMVectorPermute<XM_PERMUTE_0W, XM_PERMUTE_1Z, XM_PERMUTE_0Y, XM_PERMUTE_0X>(e2, XMVectorNegate(e2));
+ p0 = XMVector3Dot(TV0, Axis);
+ p1 = XMVector3Dot(TV1, Axis);
+ // p2 = XMVector3Dot( V2, Axis ); // p2 = p0;
+ Min = XMVectorMin(p0, p1);
+ Max = XMVectorMax(p0, p1);
+ Radius = XMVector3Dot(vExtents, XMVectorAbs(Axis));
+ NoIntersection = XMVectorOrInt(NoIntersection, XMVectorGreater(Min, Radius));
+ NoIntersection = XMVectorOrInt(NoIntersection, XMVectorLess(Max, XMVectorNegate(Radius)));
+
+ // Axis == (0,1,0) x e0 = (e0.z, 0, -e0.x)
+ Axis = XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_0W, XM_PERMUTE_1X, XM_PERMUTE_0Y>(e0, XMVectorNegate(e0));
+ p0 = XMVector3Dot(TV0, Axis);
+ // p1 = XMVector3Dot( V1, Axis ); // p1 = p0;
+ p2 = XMVector3Dot(TV2, Axis);
+ Min = XMVectorMin(p0, p2);
+ Max = XMVectorMax(p0, p2);
+ Radius = XMVector3Dot(vExtents, XMVectorAbs(Axis));
+ NoIntersection = XMVectorOrInt(NoIntersection, XMVectorGreater(Min, Radius));
+ NoIntersection = XMVectorOrInt(NoIntersection, XMVectorLess(Max, XMVectorNegate(Radius)));
+
+ // Axis == (0,1,0) x e1 = (e1.z, 0, -e1.x)
+ Axis = XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_0W, XM_PERMUTE_1X, XM_PERMUTE_0Y>(e1, XMVectorNegate(e1));
+ p0 = XMVector3Dot(TV0, Axis);
+ p1 = XMVector3Dot(TV1, Axis);
+ // p2 = XMVector3Dot( V2, Axis ); // p2 = p1;
+ Min = XMVectorMin(p0, p1);
+ Max = XMVectorMax(p0, p1);
+ Radius =
XMVector3Dot(vExtents, XMVectorAbs(Axis));
+ NoIntersection = XMVectorOrInt(NoIntersection, XMVectorGreater(Min, Radius));
+ NoIntersection = XMVectorOrInt(NoIntersection, XMVectorLess(Max, XMVectorNegate(Radius)));
+
+ // Axis == (0,1,0) x e2 = (e2.z, 0, -e2.x)
+ Axis = XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_0W, XM_PERMUTE_1X, XM_PERMUTE_0Y>(e2, XMVectorNegate(e2));
+ p0 = XMVector3Dot(TV0, Axis);
+ p1 = XMVector3Dot(TV1, Axis);
+ // p2 = XMVector3Dot( V2, Axis ); // p2 = p0;
+ Min = XMVectorMin(p0, p1);
+ Max = XMVectorMax(p0, p1);
+ Radius = XMVector3Dot(vExtents, XMVectorAbs(Axis));
+ NoIntersection = XMVectorOrInt(NoIntersection, XMVectorGreater(Min, Radius));
+ NoIntersection = XMVectorOrInt(NoIntersection, XMVectorLess(Max, XMVectorNegate(Radius)));
+
+ // Axis == (0,0,1) x e0 = (-e0.y, e0.x, 0)
+ Axis = XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0X, XM_PERMUTE_0W, XM_PERMUTE_0Z>(e0, XMVectorNegate(e0));
+ p0 = XMVector3Dot(TV0, Axis);
+ // p1 = XMVector3Dot( V1, Axis ); // p1 = p0;
+ p2 = XMVector3Dot(TV2, Axis);
+ Min = XMVectorMin(p0, p2);
+ Max = XMVectorMax(p0, p2);
+ Radius = XMVector3Dot(vExtents, XMVectorAbs(Axis));
+ NoIntersection = XMVectorOrInt(NoIntersection, XMVectorGreater(Min, Radius));
+ NoIntersection = XMVectorOrInt(NoIntersection, XMVectorLess(Max, XMVectorNegate(Radius)));
+
+ // Axis == (0,0,1) x e1 = (-e1.y, e1.x, 0)
+ Axis = XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0X, XM_PERMUTE_0W, XM_PERMUTE_0Z>(e1, XMVectorNegate(e1));
+ p0 = XMVector3Dot(TV0, Axis);
+ p1 = XMVector3Dot(TV1, Axis);
+ // p2 = XMVector3Dot( V2, Axis ); // p2 = p1;
+ Min = XMVectorMin(p0, p1);
+ Max = XMVectorMax(p0, p1);
+ Radius = XMVector3Dot(vExtents, XMVectorAbs(Axis));
+ NoIntersection = XMVectorOrInt(NoIntersection, XMVectorGreater(Min, Radius));
+ NoIntersection = XMVectorOrInt(NoIntersection, XMVectorLess(Max, XMVectorNegate(Radius)));
+
+ // Axis == (0,0,1) x e2 = (-e2.y, e2.x, 0)
+ Axis = XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0X, XM_PERMUTE_0W, XM_PERMUTE_0Z>(e2, XMVectorNegate(e2));
+ p0 = XMVector3Dot(TV0, Axis);
+ p1 = XMVector3Dot(TV1, Axis);
+ // p2 = XMVector3Dot( V2, Axis ); // p2 = p0;
+ Min = XMVectorMin(p0, p1);
+ Max = XMVectorMax(p0, p1);
+ Radius = XMVector3Dot(vExtents, XMVectorAbs(Axis));
+ NoIntersection = XMVectorOrInt(NoIntersection, XMVectorGreater(Min, Radius));
+ NoIntersection = XMVectorOrInt(NoIntersection, XMVectorLess(Max, XMVectorNegate(Radius)));
+
+ return XMVector4NotEqualInt(NoIntersection, XMVectorTrueInt());
+}
+
+
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline PlaneIntersectionType XM_CALLCONV BoundingBox::Intersects(FXMVECTOR Plane) const noexcept
+{
+ assert(DirectX::Internal::XMPlaneIsUnit(Plane));
+
+ // Load the box.
+ XMVECTOR vCenter = XMLoadFloat3(&Center);
+ XMVECTOR vExtents = XMLoadFloat3(&Extents);
+
+ // Set w of the center to one so we can dot4 with a plane.
+ vCenter = XMVectorInsert<0, 0, 0, 0, 1>(vCenter, XMVectorSplatOne());
+
+ XMVECTOR Outside, Inside;
+ DirectX::Internal::FastIntersectAxisAlignedBoxPlane(vCenter, vExtents, Plane, Outside, Inside);
+
+ // If the box is outside any plane it is outside.
+ if (XMVector4EqualInt(Outside, XMVectorTrueInt()))
+ return FRONT;
+
+ // If the box is inside all planes it is inside.
+ if (XMVector4EqualInt(Inside, XMVectorTrueInt()))
+ return BACK;
+
+ // The box is not inside all planes or outside a plane it intersects.
+ return INTERSECTING;
+}
+
+
+//-----------------------------------------------------------------------------
+// Compute the intersection of a ray (Origin, Direction) with an axis aligned
+// box using the slabs method.
+//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool XM_CALLCONV BoundingBox::Intersects(FXMVECTOR Origin, FXMVECTOR Direction, float& Dist) const noexcept +{ + assert(DirectX::Internal::XMVector3IsUnit(Direction)); + + // Load the box. + XMVECTOR vCenter = XMLoadFloat3(&Center); + XMVECTOR vExtents = XMLoadFloat3(&Extents); + + // Adjust ray origin to be relative to center of the box. + XMVECTOR TOrigin = XMVectorSubtract(vCenter, Origin); + + // Compute the dot product againt each axis of the box. + // Since the axii are (1,0,0), (0,1,0), (0,0,1) no computation is necessary. + XMVECTOR AxisDotOrigin = TOrigin; + XMVECTOR AxisDotDirection = Direction; + + // if (fabs(AxisDotDirection) <= Epsilon) the ray is nearly parallel to the slab. + XMVECTOR IsParallel = XMVectorLessOrEqual(XMVectorAbs(AxisDotDirection), g_RayEpsilon); + + // Test against all three axii simultaneously. + XMVECTOR InverseAxisDotDirection = XMVectorReciprocal(AxisDotDirection); + XMVECTOR t1 = XMVectorMultiply(XMVectorSubtract(AxisDotOrigin, vExtents), InverseAxisDotDirection); + XMVECTOR t2 = XMVectorMultiply(XMVectorAdd(AxisDotOrigin, vExtents), InverseAxisDotDirection); + + // Compute the max of min(t1,t2) and the min of max(t1,t2) ensuring we don't + // use the results from any directions parallel to the slab. + XMVECTOR t_min = XMVectorSelect(XMVectorMin(t1, t2), g_FltMin, IsParallel); + XMVECTOR t_max = XMVectorSelect(XMVectorMax(t1, t2), g_FltMax, IsParallel); + + // t_min.x = maximum( t_min.x, t_min.y, t_min.z ); + // t_max.x = minimum( t_max.x, t_max.y, t_max.z ); + t_min = XMVectorMax(t_min, XMVectorSplatY(t_min)); // x = max(x,y) + t_min = XMVectorMax(t_min, XMVectorSplatZ(t_min)); // x = max(max(x,y),z) + t_max = XMVectorMin(t_max, XMVectorSplatY(t_max)); // x = min(x,y) + t_max = XMVectorMin(t_max, XMVectorSplatZ(t_max)); // x = min(min(x,y),z) + + // if ( t_min > t_max ) return false; + XMVECTOR NoIntersection = XMVectorGreater(XMVectorSplatX(t_min), XMVectorSplatX(t_max)); + + // if ( t_max < 0.0f ) return false; + NoIntersection = XMVectorOrInt(NoIntersection, XMVectorLess(XMVectorSplatX(t_max), XMVectorZero())); + + // if (IsParallel && (-Extents > AxisDotOrigin || Extents < AxisDotOrigin)) return false; + XMVECTOR ParallelOverlap = XMVectorInBounds(AxisDotOrigin, vExtents); + NoIntersection = XMVectorOrInt(NoIntersection, XMVectorAndCInt(IsParallel, ParallelOverlap)); + + if (!DirectX::Internal::XMVector3AnyTrue(NoIntersection)) + { + // Store the x-component to *pDist + XMStoreFloat(&Dist, t_min); + return true; + } + + Dist = 0.f; + return false; +} + + +//----------------------------------------------------------------------------- +// Test an axis alinged box vs 6 planes (typically forming a frustum). +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType XM_CALLCONV BoundingBox::ContainedBy( + FXMVECTOR Plane0, FXMVECTOR Plane1, FXMVECTOR Plane2, + GXMVECTOR Plane3, + HXMVECTOR Plane4, HXMVECTOR Plane5) const noexcept +{ + // Load the box. + XMVECTOR vCenter = XMLoadFloat3(&Center); + XMVECTOR vExtents = XMLoadFloat3(&Extents); + + // Set w of the center to one so we can dot4 with a plane. + vCenter = XMVectorInsert<0, 0, 0, 0, 1>(vCenter, XMVectorSplatOne()); + + XMVECTOR Outside, Inside; + + // Test against each plane. 
+ DirectX::Internal::FastIntersectAxisAlignedBoxPlane(vCenter, vExtents, Plane0, Outside, Inside); + + XMVECTOR AnyOutside = Outside; + XMVECTOR AllInside = Inside; + + DirectX::Internal::FastIntersectAxisAlignedBoxPlane(vCenter, vExtents, Plane1, Outside, Inside); + AnyOutside = XMVectorOrInt(AnyOutside, Outside); + AllInside = XMVectorAndInt(AllInside, Inside); + + DirectX::Internal::FastIntersectAxisAlignedBoxPlane(vCenter, vExtents, Plane2, Outside, Inside); + AnyOutside = XMVectorOrInt(AnyOutside, Outside); + AllInside = XMVectorAndInt(AllInside, Inside); + + DirectX::Internal::FastIntersectAxisAlignedBoxPlane(vCenter, vExtents, Plane3, Outside, Inside); + AnyOutside = XMVectorOrInt(AnyOutside, Outside); + AllInside = XMVectorAndInt(AllInside, Inside); + + DirectX::Internal::FastIntersectAxisAlignedBoxPlane(vCenter, vExtents, Plane4, Outside, Inside); + AnyOutside = XMVectorOrInt(AnyOutside, Outside); + AllInside = XMVectorAndInt(AllInside, Inside); + + DirectX::Internal::FastIntersectAxisAlignedBoxPlane(vCenter, vExtents, Plane5, Outside, Inside); + AnyOutside = XMVectorOrInt(AnyOutside, Outside); + AllInside = XMVectorAndInt(AllInside, Inside); + + // If the box is outside any plane it is outside. + if (XMVector4EqualInt(AnyOutside, XMVectorTrueInt())) + return DISJOINT; + + // If the box is inside all planes it is inside. + if (XMVector4EqualInt(AllInside, XMVectorTrueInt())) + return CONTAINS; + + // The box is not inside all planes or outside a plane, it may intersect. + return INTERSECTS; +} + + +//----------------------------------------------------------------------------- +// Create axis-aligned box that contains two other bounding boxes +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline void BoundingBox::CreateMerged(BoundingBox& Out, const BoundingBox& b1, const BoundingBox& b2) noexcept +{ + XMVECTOR b1Center = XMLoadFloat3(&b1.Center); + XMVECTOR b1Extents = XMLoadFloat3(&b1.Extents); + + XMVECTOR b2Center = XMLoadFloat3(&b2.Center); + XMVECTOR b2Extents = XMLoadFloat3(&b2.Extents); + + XMVECTOR Min = XMVectorSubtract(b1Center, b1Extents); + Min = XMVectorMin(Min, XMVectorSubtract(b2Center, b2Extents)); + + XMVECTOR Max = XMVectorAdd(b1Center, b1Extents); + Max = XMVectorMax(Max, XMVectorAdd(b2Center, b2Extents)); + + assert(XMVector3LessOrEqual(Min, Max)); + + XMStoreFloat3(&Out.Center, XMVectorScale(XMVectorAdd(Min, Max), 0.5f)); + XMStoreFloat3(&Out.Extents, XMVectorScale(XMVectorSubtract(Max, Min), 0.5f)); +} + + +//----------------------------------------------------------------------------- +// Create axis-aligned box that contains a bounding sphere +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline void BoundingBox::CreateFromSphere(BoundingBox& Out, const BoundingSphere& sh) noexcept +{ + XMVECTOR spCenter = XMLoadFloat3(&sh.Center); + XMVECTOR shRadius = XMVectorReplicatePtr(&sh.Radius); + + XMVECTOR Min = XMVectorSubtract(spCenter, shRadius); + XMVECTOR Max = XMVectorAdd(spCenter, shRadius); + + assert(XMVector3LessOrEqual(Min, Max)); + + XMStoreFloat3(&Out.Center, XMVectorScale(XMVectorAdd(Min, Max), 0.5f)); + XMStoreFloat3(&Out.Extents, XMVectorScale(XMVectorSubtract(Max, Min), 0.5f)); +} + + +//----------------------------------------------------------------------------- +// Create axis-aligned box from min/max points +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline void 
XM_CALLCONV BoundingBox::CreateFromPoints(BoundingBox& Out, FXMVECTOR pt1, FXMVECTOR pt2) noexcept
+{
+ XMVECTOR Min = XMVectorMin(pt1, pt2);
+ XMVECTOR Max = XMVectorMax(pt1, pt2);
+
+ // Store center and extents.
+ XMStoreFloat3(&Out.Center, XMVectorScale(XMVectorAdd(Min, Max), 0.5f));
+ XMStoreFloat3(&Out.Extents, XMVectorScale(XMVectorSubtract(Max, Min), 0.5f));
+}
+
+
+//-----------------------------------------------------------------------------
+// Find the minimum axis aligned bounding box containing a set of points.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void BoundingBox::CreateFromPoints(BoundingBox& Out, size_t Count, const XMFLOAT3* pPoints, size_t Stride) noexcept
+{
+ assert(Count > 0);
+ assert(pPoints);
+
+ // Find the minimum and maximum x, y, and z
+ XMVECTOR vMin, vMax;
+
+ vMin = vMax = XMLoadFloat3(pPoints);
+
+ for (size_t i = 1; i < Count; ++i)
+ {
+ XMVECTOR Point = XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(reinterpret_cast<const uint8_t*>(pPoints) + i * Stride));
+
+ vMin = XMVectorMin(vMin, Point);
+ vMax = XMVectorMax(vMax, Point);
+ }
+
+ // Store center and extents.
+ XMStoreFloat3(&Out.Center, XMVectorScale(XMVectorAdd(vMin, vMax), 0.5f));
+ XMStoreFloat3(&Out.Extents, XMVectorScale(XMVectorSubtract(vMax, vMin), 0.5f));
+}
+
+
+/****************************************************************************
+ *
+ * BoundingOrientedBox
+ *
+ ****************************************************************************/
+
+ //-----------------------------------------------------------------------------
+ // Transform an oriented box by an angle preserving transform.
+ //-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV BoundingOrientedBox::Transform(BoundingOrientedBox& Out, FXMMATRIX M) const noexcept
+{
+ // Load the box.
+ XMVECTOR vCenter = XMLoadFloat3(&Center);
+ XMVECTOR vExtents = XMLoadFloat3(&Extents);
+ XMVECTOR vOrientation = XMLoadFloat4(&Orientation);
+
+ assert(DirectX::Internal::XMQuaternionIsUnit(vOrientation));
+
+ // Composite the box rotation and the transform rotation.
+ XMMATRIX nM;
+ nM.r[0] = XMVector3Normalize(M.r[0]);
+ nM.r[1] = XMVector3Normalize(M.r[1]);
+ nM.r[2] = XMVector3Normalize(M.r[2]);
+ nM.r[3] = g_XMIdentityR3;
+ XMVECTOR Rotation = XMQuaternionRotationMatrix(nM);
+ vOrientation = XMQuaternionMultiply(vOrientation, Rotation);
+
+ // Transform the center.
+ vCenter = XMVector3Transform(vCenter, M);
+
+ // Scale the box extents.
+ XMVECTOR dX = XMVector3Length(M.r[0]);
+ XMVECTOR dY = XMVector3Length(M.r[1]);
+ XMVECTOR dZ = XMVector3Length(M.r[2]);
+
+ XMVECTOR VectorScale = XMVectorSelect(dY, dX, g_XMSelect1000);
+ VectorScale = XMVectorSelect(dZ, VectorScale, g_XMSelect1100);
+ vExtents = XMVectorMultiply(vExtents, VectorScale);
+
+ // Store the box.
+ XMStoreFloat3(&Out.Center, vCenter);
+ XMStoreFloat3(&Out.Extents, vExtents);
+ XMStoreFloat4(&Out.Orientation, vOrientation);
+}
+
+_Use_decl_annotations_
+inline void XM_CALLCONV BoundingOrientedBox::Transform(BoundingOrientedBox& Out, float Scale, FXMVECTOR Rotation, FXMVECTOR Translation) const noexcept
+{
+ assert(DirectX::Internal::XMQuaternionIsUnit(Rotation));
+
+ // Load the box.
+ XMVECTOR vCenter = XMLoadFloat3(&Center);
+ XMVECTOR vExtents = XMLoadFloat3(&Extents);
+ XMVECTOR vOrientation = XMLoadFloat4(&Orientation);
+
+ assert(DirectX::Internal::XMQuaternionIsUnit(vOrientation));
+
+ // Composite the box rotation and the transform rotation.
+ vOrientation = XMQuaternionMultiply(vOrientation, Rotation); + + // Transform the center. + XMVECTOR VectorScale = XMVectorReplicate(Scale); + vCenter = XMVectorAdd(XMVector3Rotate(XMVectorMultiply(vCenter, VectorScale), Rotation), Translation); + + // Scale the box extents. + vExtents = XMVectorMultiply(vExtents, VectorScale); + + // Store the box. + XMStoreFloat3(&Out.Center, vCenter); + XMStoreFloat3(&Out.Extents, vExtents); + XMStoreFloat4(&Out.Orientation, vOrientation); +} + + +//----------------------------------------------------------------------------- +// Get the corner points of the box +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline void BoundingOrientedBox::GetCorners(XMFLOAT3* Corners) const noexcept +{ + assert(Corners != nullptr); + + // Load the box + XMVECTOR vCenter = XMLoadFloat3(&Center); + XMVECTOR vExtents = XMLoadFloat3(&Extents); + XMVECTOR vOrientation = XMLoadFloat4(&Orientation); + + assert(DirectX::Internal::XMQuaternionIsUnit(vOrientation)); + + for (size_t i = 0; i < CORNER_COUNT; ++i) + { + XMVECTOR C = XMVectorAdd(XMVector3Rotate(XMVectorMultiply(vExtents, g_BoxOffset[i]), vOrientation), vCenter); + XMStoreFloat3(&Corners[i], C); + } +} + + +//----------------------------------------------------------------------------- +// Point in oriented box test. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType XM_CALLCONV BoundingOrientedBox::Contains(FXMVECTOR Point) const noexcept +{ + XMVECTOR vCenter = XMLoadFloat3(&Center); + XMVECTOR vExtents = XMLoadFloat3(&Extents); + XMVECTOR vOrientation = XMLoadFloat4(&Orientation); + + // Transform the point to be local to the box. + XMVECTOR TPoint = XMVector3InverseRotate(XMVectorSubtract(Point, vCenter), vOrientation); + + return XMVector3InBounds(TPoint, vExtents) ? CONTAINS : DISJOINT; +} + + +//----------------------------------------------------------------------------- +// Triangle in oriented bounding box +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType XM_CALLCONV BoundingOrientedBox::Contains(FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2) const noexcept +{ + // Load the box center & orientation. + XMVECTOR vCenter = XMLoadFloat3(&Center); + XMVECTOR vOrientation = XMLoadFloat4(&Orientation); + + // Transform the triangle vertices into the space of the box. + XMVECTOR TV0 = XMVector3InverseRotate(XMVectorSubtract(V0, vCenter), vOrientation); + XMVECTOR TV1 = XMVector3InverseRotate(XMVectorSubtract(V1, vCenter), vOrientation); + XMVECTOR TV2 = XMVector3InverseRotate(XMVectorSubtract(V2, vCenter), vOrientation); + + BoundingBox box; + box.Center = XMFLOAT3(0.0f, 0.0f, 0.0f); + box.Extents = Extents; + + // Use the triangle vs axis aligned box intersection routine. 
+ return box.Contains(TV0, TV1, TV2); +} + + +//----------------------------------------------------------------------------- +// Sphere in oriented bounding box +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingOrientedBox::Contains(const BoundingSphere& sh) const noexcept +{ + XMVECTOR SphereCenter = XMLoadFloat3(&sh.Center); + XMVECTOR SphereRadius = XMVectorReplicatePtr(&sh.Radius); + + XMVECTOR BoxCenter = XMLoadFloat3(&Center); + XMVECTOR BoxExtents = XMLoadFloat3(&Extents); + XMVECTOR BoxOrientation = XMLoadFloat4(&Orientation); + + assert(DirectX::Internal::XMQuaternionIsUnit(BoxOrientation)); + + // Transform the center of the sphere to be local to the box. + // BoxMin = -BoxExtents + // BoxMax = +BoxExtents + SphereCenter = XMVector3InverseRotate(XMVectorSubtract(SphereCenter, BoxCenter), BoxOrientation); + + // Find the distance to the nearest point on the box. + // for each i in (x, y, z) + // if (SphereCenter(i) < BoxMin(i)) d2 += (SphereCenter(i) - BoxMin(i)) ^ 2 + // else if (SphereCenter(i) > BoxMax(i)) d2 += (SphereCenter(i) - BoxMax(i)) ^ 2 + + XMVECTOR d = XMVectorZero(); + + // Compute d for each dimension. + XMVECTOR LessThanMin = XMVectorLess(SphereCenter, XMVectorNegate(BoxExtents)); + XMVECTOR GreaterThanMax = XMVectorGreater(SphereCenter, BoxExtents); + + XMVECTOR MinDelta = XMVectorAdd(SphereCenter, BoxExtents); + XMVECTOR MaxDelta = XMVectorSubtract(SphereCenter, BoxExtents); + + // Choose value for each dimension based on the comparison. + d = XMVectorSelect(d, MinDelta, LessThanMin); + d = XMVectorSelect(d, MaxDelta, GreaterThanMax); + + // Use a dot-product to square them and sum them together. + XMVECTOR d2 = XMVector3Dot(d, d); + XMVECTOR SphereRadiusSq = XMVectorMultiply(SphereRadius, SphereRadius); + + if (XMVector4Greater(d2, SphereRadiusSq)) + return DISJOINT; + + // See if we are completely inside the box + XMVECTOR SMin = XMVectorSubtract(SphereCenter, SphereRadius); + XMVECTOR SMax = XMVectorAdd(SphereCenter, SphereRadius); + + return (XMVector3InBounds(SMin, BoxExtents) && XMVector3InBounds(SMax, BoxExtents)) ? CONTAINS : INTERSECTS; +} + + +//----------------------------------------------------------------------------- +// Axis aligned box vs. oriented box. Constructs an oriented box and uses +// the oriented box vs. oriented box test. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingOrientedBox::Contains(const BoundingBox& box) const noexcept +{ + // Make the axis aligned box oriented and do an OBB vs OBB test. 
+ BoundingOrientedBox obox(box.Center, box.Extents, XMFLOAT4(0.f, 0.f, 0.f, 1.f)); + return Contains(obox); +} + + +//----------------------------------------------------------------------------- +// Oriented bounding box in oriented bounding box +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingOrientedBox::Contains(const BoundingOrientedBox& box) const noexcept +{ + if (!Intersects(box)) + return DISJOINT; + + // Load the boxes + XMVECTOR aCenter = XMLoadFloat3(&Center); + XMVECTOR aExtents = XMLoadFloat3(&Extents); + XMVECTOR aOrientation = XMLoadFloat4(&Orientation); + + assert(DirectX::Internal::XMQuaternionIsUnit(aOrientation)); + + XMVECTOR bCenter = XMLoadFloat3(&box.Center); + XMVECTOR bExtents = XMLoadFloat3(&box.Extents); + XMVECTOR bOrientation = XMLoadFloat4(&box.Orientation); + + assert(DirectX::Internal::XMQuaternionIsUnit(bOrientation)); + + XMVECTOR offset = XMVectorSubtract(bCenter, aCenter); + + for (size_t i = 0; i < CORNER_COUNT; ++i) + { + // Cb = rotate( bExtents * corneroffset[i], bOrientation ) + bcenter + // Ca = invrotate( Cb - aCenter, aOrientation ) + + XMVECTOR C = XMVectorAdd(XMVector3Rotate(XMVectorMultiply(bExtents, g_BoxOffset[i]), bOrientation), offset); + C = XMVector3InverseRotate(C, aOrientation); + + if (!XMVector3InBounds(C, aExtents)) + return INTERSECTS; + } + + return CONTAINS; +} + + +//----------------------------------------------------------------------------- +// Frustum in oriented bounding box +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingOrientedBox::Contains(const BoundingFrustum& fr) const noexcept +{ + if (!fr.Intersects(*this)) + return DISJOINT; + + XMFLOAT3 Corners[BoundingFrustum::CORNER_COUNT]; + fr.GetCorners(Corners); + + // Load the box + XMVECTOR vCenter = XMLoadFloat3(&Center); + XMVECTOR vExtents = XMLoadFloat3(&Extents); + XMVECTOR vOrientation = XMLoadFloat4(&Orientation); + + assert(DirectX::Internal::XMQuaternionIsUnit(vOrientation)); + + for (size_t i = 0; i < BoundingFrustum::CORNER_COUNT; ++i) + { + XMVECTOR C = XMVector3InverseRotate(XMVectorSubtract(XMLoadFloat3(&Corners[i]), vCenter), vOrientation); + + if (!XMVector3InBounds(C, vExtents)) + return INTERSECTS; + } + + return CONTAINS; +} + + +//----------------------------------------------------------------------------- +// Sphere vs. oriented box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingOrientedBox::Intersects(const BoundingSphere& sh) const noexcept +{ + XMVECTOR SphereCenter = XMLoadFloat3(&sh.Center); + XMVECTOR SphereRadius = XMVectorReplicatePtr(&sh.Radius); + + XMVECTOR BoxCenter = XMLoadFloat3(&Center); + XMVECTOR BoxExtents = XMLoadFloat3(&Extents); + XMVECTOR BoxOrientation = XMLoadFloat4(&Orientation); + + assert(DirectX::Internal::XMQuaternionIsUnit(BoxOrientation)); + + // Transform the center of the sphere to be local to the box. + // BoxMin = -BoxExtents + // BoxMax = +BoxExtents + SphereCenter = XMVector3InverseRotate(XMVectorSubtract(SphereCenter, BoxCenter), BoxOrientation); + + // Find the distance to the nearest point on the box. 
+ // for each i in (x, y, z) + // if (SphereCenter(i) < BoxMin(i)) d2 += (SphereCenter(i) - BoxMin(i)) ^ 2 + // else if (SphereCenter(i) > BoxMax(i)) d2 += (SphereCenter(i) - BoxMax(i)) ^ 2 + + XMVECTOR d = XMVectorZero(); + + // Compute d for each dimension. + XMVECTOR LessThanMin = XMVectorLess(SphereCenter, XMVectorNegate(BoxExtents)); + XMVECTOR GreaterThanMax = XMVectorGreater(SphereCenter, BoxExtents); + + XMVECTOR MinDelta = XMVectorAdd(SphereCenter, BoxExtents); + XMVECTOR MaxDelta = XMVectorSubtract(SphereCenter, BoxExtents); + + // Choose value for each dimension based on the comparison. + d = XMVectorSelect(d, MinDelta, LessThanMin); + d = XMVectorSelect(d, MaxDelta, GreaterThanMax); + + // Use a dot-product to square them and sum them together. + XMVECTOR d2 = XMVector3Dot(d, d); + + return XMVector4LessOrEqual(d2, XMVectorMultiply(SphereRadius, SphereRadius)) ? true : false; +} + + +//----------------------------------------------------------------------------- +// Axis aligned box vs. oriented box. Constructs an oriented box and uses +// the oriented box vs. oriented box test. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingOrientedBox::Intersects(const BoundingBox& box) const noexcept +{ + // Make the axis aligned box oriented and do an OBB vs OBB test. + BoundingOrientedBox obox(box.Center, box.Extents, XMFLOAT4(0.f, 0.f, 0.f, 1.f)); + return Intersects(obox); +} + + +//----------------------------------------------------------------------------- +// Fast oriented box / oriented box intersection test using the separating axis +// theorem. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingOrientedBox::Intersects(const BoundingOrientedBox& box) const noexcept +{ + // Build the 3x3 rotation matrix that defines the orientation of B relative to A. + XMVECTOR A_quat = XMLoadFloat4(&Orientation); + XMVECTOR B_quat = XMLoadFloat4(&box.Orientation); + + assert(DirectX::Internal::XMQuaternionIsUnit(A_quat)); + assert(DirectX::Internal::XMQuaternionIsUnit(B_quat)); + + XMVECTOR Q = XMQuaternionMultiply(A_quat, XMQuaternionConjugate(B_quat)); + XMMATRIX R = XMMatrixRotationQuaternion(Q); + + // Compute the translation of B relative to A. + XMVECTOR A_cent = XMLoadFloat3(&Center); + XMVECTOR B_cent = XMLoadFloat3(&box.Center); + XMVECTOR t = XMVector3InverseRotate(XMVectorSubtract(B_cent, A_cent), A_quat); + + // + // h(A) = extents of A. + // h(B) = extents of B. + // + // a(u) = axes of A = (1,0,0), (0,1,0), (0,0,1) + // b(u) = axes of B relative to A = (r00,r10,r20), (r01,r11,r21), (r02,r12,r22) + // + // For each possible separating axis l: + // d(A) = sum (for i = u,v,w) h(A)(i) * abs( a(i) dot l ) + // d(B) = sum (for i = u,v,w) h(B)(i) * abs( b(i) dot l ) + // if abs( t dot l ) > d(A) + d(B) then disjoint + // + + // Load extents of A and B. + XMVECTOR h_A = XMLoadFloat3(&Extents); + XMVECTOR h_B = XMLoadFloat3(&box.Extents); + + // Rows. Note R[0,1,2]X.w = 0. + XMVECTOR R0X = R.r[0]; + XMVECTOR R1X = R.r[1]; + XMVECTOR R2X = R.r[2]; + + R = XMMatrixTranspose(R); + + // Columns. Note RX[0,1,2].w = 0. + XMVECTOR RX0 = R.r[0]; + XMVECTOR RX1 = R.r[1]; + XMVECTOR RX2 = R.r[2]; + + // Absolute value of rows. + XMVECTOR AR0X = XMVectorAbs(R0X); + XMVECTOR AR1X = XMVectorAbs(R1X); + XMVECTOR AR2X = XMVectorAbs(R2X); + + // Absolute value of columns. 
+ XMVECTOR ARX0 = XMVectorAbs(RX0);
+ XMVECTOR ARX1 = XMVectorAbs(RX1);
+ XMVECTOR ARX2 = XMVectorAbs(RX2);
+
+ // Test each of the 15 possible separating axes.
+ XMVECTOR d, d_A, d_B;
+
+ // l = a(u) = (1, 0, 0)
+ // t dot l = t.x
+ // d(A) = h(A).x
+ // d(B) = h(B) dot abs(r00, r01, r02)
+ d = XMVectorSplatX(t);
+ d_A = XMVectorSplatX(h_A);
+ d_B = XMVector3Dot(h_B, AR0X);
+ XMVECTOR NoIntersection = XMVectorGreater(XMVectorAbs(d), XMVectorAdd(d_A, d_B));
+
+ // l = a(v) = (0, 1, 0)
+ // t dot l = t.y
+ // d(A) = h(A).y
+ // d(B) = h(B) dot abs(r10, r11, r12)
+ d = XMVectorSplatY(t);
+ d_A = XMVectorSplatY(h_A);
+ d_B = XMVector3Dot(h_B, AR1X);
+ NoIntersection = XMVectorOrInt(NoIntersection,
+ XMVectorGreater(XMVectorAbs(d), XMVectorAdd(d_A, d_B)));
+
+ // l = a(w) = (0, 0, 1)
+ // t dot l = t.z
+ // d(A) = h(A).z
+ // d(B) = h(B) dot abs(r20, r21, r22)
+ d = XMVectorSplatZ(t);
+ d_A = XMVectorSplatZ(h_A);
+ d_B = XMVector3Dot(h_B, AR2X);
+ NoIntersection = XMVectorOrInt(NoIntersection,
+ XMVectorGreater(XMVectorAbs(d), XMVectorAdd(d_A, d_B)));
+
+ // l = b(u) = (r00, r10, r20)
+ // d(A) = h(A) dot abs(r00, r10, r20)
+ // d(B) = h(B).x
+ d = XMVector3Dot(t, RX0);
+ d_A = XMVector3Dot(h_A, ARX0);
+ d_B = XMVectorSplatX(h_B);
+ NoIntersection = XMVectorOrInt(NoIntersection,
+ XMVectorGreater(XMVectorAbs(d), XMVectorAdd(d_A, d_B)));
+
+ // l = b(v) = (r01, r11, r21)
+ // d(A) = h(A) dot abs(r01, r11, r21)
+ // d(B) = h(B).y
+ d = XMVector3Dot(t, RX1);
+ d_A = XMVector3Dot(h_A, ARX1);
+ d_B = XMVectorSplatY(h_B);
+ NoIntersection = XMVectorOrInt(NoIntersection,
+ XMVectorGreater(XMVectorAbs(d), XMVectorAdd(d_A, d_B)));
+
+ // l = b(w) = (r02, r12, r22)
+ // d(A) = h(A) dot abs(r02, r12, r22)
+ // d(B) = h(B).z
+ d = XMVector3Dot(t, RX2);
+ d_A = XMVector3Dot(h_A, ARX2);
+ d_B = XMVectorSplatZ(h_B);
+ NoIntersection = XMVectorOrInt(NoIntersection,
+ XMVectorGreater(XMVectorAbs(d), XMVectorAdd(d_A, d_B)));
+
+ // l = a(u) x b(u) = (0, -r20, r10)
+ // d(A) = h(A) dot abs(0, r20, r10)
+ // d(B) = h(B) dot abs(0, r02, r01)
+ d = XMVector3Dot(t, XMVectorPermute<XM_PERMUTE_0W, XM_PERMUTE_1Z, XM_PERMUTE_0Y, XM_PERMUTE_0X>(RX0, XMVectorNegate(RX0)));
+ d_A = XMVector3Dot(h_A, XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_X>(ARX0));
+ d_B = XMVector3Dot(h_B, XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_X>(AR0X));
+ NoIntersection = XMVectorOrInt(NoIntersection,
+ XMVectorGreater(XMVectorAbs(d), XMVectorAdd(d_A, d_B)));
+
+ // l = a(u) x b(v) = (0, -r21, r11)
+ // d(A) = h(A) dot abs(0, r21, r11)
+ // d(B) = h(B) dot abs(r02, 0, r00)
+ d = XMVector3Dot(t, XMVectorPermute<XM_PERMUTE_0W, XM_PERMUTE_1Z, XM_PERMUTE_0Y, XM_PERMUTE_0X>(RX1, XMVectorNegate(RX1)));
+ d_A = XMVector3Dot(h_A, XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_X>(ARX1));
+ d_B = XMVector3Dot(h_B, XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_Y>(AR0X));
+ NoIntersection = XMVectorOrInt(NoIntersection,
+ XMVectorGreater(XMVectorAbs(d), XMVectorAdd(d_A, d_B)));
+
+ // l = a(u) x b(w) = (0, -r22, r12)
+ // d(A) = h(A) dot abs(0, r22, r12)
+ // d(B) = h(B) dot abs(r01, r00, 0)
+ d = XMVector3Dot(t, XMVectorPermute<XM_PERMUTE_0W, XM_PERMUTE_1Z, XM_PERMUTE_0Y, XM_PERMUTE_0X>(RX2, XMVectorNegate(RX2)));
+ d_A = XMVector3Dot(h_A, XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_X>(ARX2));
+ d_B = XMVector3Dot(h_B, XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_X, XM_SWIZZLE_W, XM_SWIZZLE_Z>(AR0X));
+ NoIntersection = XMVectorOrInt(NoIntersection,
+ XMVectorGreater(XMVectorAbs(d), XMVectorAdd(d_A, d_B)));
+
+ // l = a(v) x b(u) = (r20, 0, -r00)
+ // d(A) = h(A) dot abs(r20, 0, r00)
+ // d(B) = h(B) dot abs(0, r12, r11)
+ d = XMVector3Dot(t, XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_0W, XM_PERMUTE_1X, XM_PERMUTE_0Y>(RX0, XMVectorNegate(RX0)));
+ d_A = XMVector3Dot(h_A, XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_Y>(ARX0));
+ d_B = XMVector3Dot(h_B, XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_X>(AR1X));
+ NoIntersection = XMVectorOrInt(NoIntersection,
+ XMVectorGreater(XMVectorAbs(d), XMVectorAdd(d_A, d_B)));
+
+ // l = a(v) x b(v) = (r21, 0, -r01)
+ // d(A) = h(A) dot abs(r21, 0, r01)
+ // d(B) = h(B) dot abs(r12, 0, r10)
+ d = XMVector3Dot(t, XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_0W, XM_PERMUTE_1X, XM_PERMUTE_0Y>(RX1, XMVectorNegate(RX1)));
+ d_A = XMVector3Dot(h_A, XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_Y>(ARX1));
+ d_B = XMVector3Dot(h_B, XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_Y>(AR1X));
+ NoIntersection = XMVectorOrInt(NoIntersection,
+ XMVectorGreater(XMVectorAbs(d), XMVectorAdd(d_A, d_B)));
+
+ // l = a(v) x b(w) = (r22, 0, -r02)
+ // d(A) = h(A) dot abs(r22, 0, r02)
+ // d(B) = h(B) dot abs(r11, r10, 0)
+ d = XMVector3Dot(t, XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_0W, XM_PERMUTE_1X, XM_PERMUTE_0Y>(RX2, XMVectorNegate(RX2)));
+ d_A = XMVector3Dot(h_A, XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_Y>(ARX2));
+ d_B = XMVector3Dot(h_B, XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_X, XM_SWIZZLE_W, XM_SWIZZLE_Z>(AR1X));
+ NoIntersection = XMVectorOrInt(NoIntersection,
+ XMVectorGreater(XMVectorAbs(d), XMVectorAdd(d_A, d_B)));
+
+ // l = a(w) x b(u) = (-r10, r00, 0)
+ // d(A) = h(A) dot abs(r10, r00, 0)
+ // d(B) = h(B) dot abs(0, r22, r21)
+ d = XMVector3Dot(t, XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0X, XM_PERMUTE_0W, XM_PERMUTE_0Z>(RX0, XMVectorNegate(RX0)));
+ d_A = XMVector3Dot(h_A, XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_X, XM_SWIZZLE_W, XM_SWIZZLE_Z>(ARX0));
+ d_B = XMVector3Dot(h_B, XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_X>(AR2X));
+ NoIntersection = XMVectorOrInt(NoIntersection,
+ XMVectorGreater(XMVectorAbs(d), XMVectorAdd(d_A, d_B)));
+
+ // l = a(w) x b(v) = (-r11, r01, 0)
+ // d(A) = h(A) dot abs(r11, r01, 0)
+ // d(B) = h(B) dot abs(r22, 0, r20)
+ d = XMVector3Dot(t, XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0X, XM_PERMUTE_0W, XM_PERMUTE_0Z>(RX1, XMVectorNegate(RX1)));
+ d_A = XMVector3Dot(h_A, XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_X, XM_SWIZZLE_W, XM_SWIZZLE_Z>(ARX1));
+ d_B = XMVector3Dot(h_B, XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_Y>(AR2X));
+ NoIntersection = XMVectorOrInt(NoIntersection,
+ XMVectorGreater(XMVectorAbs(d), XMVectorAdd(d_A, d_B)));
+
+ // l = a(w) x b(w) = (-r12, r02, 0)
+ // d(A) = h(A) dot abs(r12, r02, 0)
+ // d(B) = h(B) dot abs(r21, r20, 0)
+ d = XMVector3Dot(t, XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0X, XM_PERMUTE_0W, XM_PERMUTE_0Z>(RX2, XMVectorNegate(RX2)));
+ d_A = XMVector3Dot(h_A, XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_X, XM_SWIZZLE_W, XM_SWIZZLE_Z>(ARX2));
+ d_B = XMVector3Dot(h_B, XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_X, XM_SWIZZLE_W, XM_SWIZZLE_Z>(AR2X));
+ NoIntersection = XMVectorOrInt(NoIntersection,
+ XMVectorGreater(XMVectorAbs(d), XMVectorAdd(d_A, d_B)));
+
+ // No separating axis found, boxes must intersect.
+ return XMVector4NotEqualInt(NoIntersection, XMVectorTrueInt()) ? true : false;
+}
+
+
+//-----------------------------------------------------------------------------
+// Frustum vs. oriented box test
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline bool BoundingOrientedBox::Intersects(const BoundingFrustum& fr) const noexcept
+{
+ return fr.Intersects(*this);
+}
+
+
+//-----------------------------------------------------------------------------
+// Triangle vs. oriented box test.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline bool XM_CALLCONV BoundingOrientedBox::Intersects(FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2) const noexcept
+{
+ // Load the box center & orientation.
+ XMVECTOR vCenter = XMLoadFloat3(&Center);
+ XMVECTOR vOrientation = XMLoadFloat4(&Orientation);
+
+ // Transform the triangle vertices into the space of the box.
+ XMVECTOR TV0 = XMVector3InverseRotate(XMVectorSubtract(V0, vCenter), vOrientation);
+ XMVECTOR TV1 = XMVector3InverseRotate(XMVectorSubtract(V1, vCenter), vOrientation);
+ XMVECTOR TV2 = XMVector3InverseRotate(XMVectorSubtract(V2, vCenter), vOrientation);
+
+ BoundingBox box;
+ box.Center = XMFLOAT3(0.0f, 0.0f, 0.0f);
+ box.Extents = Extents;
+
+ // Use the triangle vs axis aligned box intersection routine.
+ return box.Intersects(TV0, TV1, TV2); +} + + +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline PlaneIntersectionType XM_CALLCONV BoundingOrientedBox::Intersects(FXMVECTOR Plane) const noexcept +{ + assert(DirectX::Internal::XMPlaneIsUnit(Plane)); + + // Load the box. + XMVECTOR vCenter = XMLoadFloat3(&Center); + XMVECTOR vExtents = XMLoadFloat3(&Extents); + XMVECTOR BoxOrientation = XMLoadFloat4(&Orientation); + + assert(DirectX::Internal::XMQuaternionIsUnit(BoxOrientation)); + + // Set w of the center to one so we can dot4 with a plane. + vCenter = XMVectorInsert<0, 0, 0, 0, 1>(vCenter, XMVectorSplatOne()); + + // Build the 3x3 rotation matrix that defines the box axes. + XMMATRIX R = XMMatrixRotationQuaternion(BoxOrientation); + + XMVECTOR Outside, Inside; + DirectX::Internal::FastIntersectOrientedBoxPlane(vCenter, vExtents, R.r[0], R.r[1], R.r[2], Plane, Outside, Inside); + + // If the box is outside any plane it is outside. + if (XMVector4EqualInt(Outside, XMVectorTrueInt())) + return FRONT; + + // If the box is inside all planes it is inside. + if (XMVector4EqualInt(Inside, XMVectorTrueInt())) + return BACK; + + // The box is not inside all planes or outside a plane it intersects. + return INTERSECTING; +} + + +//----------------------------------------------------------------------------- +// Compute the intersection of a ray (Origin, Direction) with an oriented box +// using the slabs method. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool XM_CALLCONV BoundingOrientedBox::Intersects(FXMVECTOR Origin, FXMVECTOR Direction, float& Dist) const noexcept +{ + assert(DirectX::Internal::XMVector3IsUnit(Direction)); + + static const XMVECTORU32 SelectY = { { { XM_SELECT_0, XM_SELECT_1, XM_SELECT_0, XM_SELECT_0 } } }; + static const XMVECTORU32 SelectZ = { { { XM_SELECT_0, XM_SELECT_0, XM_SELECT_1, XM_SELECT_0 } } }; + + // Load the box. + XMVECTOR vCenter = XMLoadFloat3(&Center); + XMVECTOR vExtents = XMLoadFloat3(&Extents); + XMVECTOR vOrientation = XMLoadFloat4(&Orientation); + + assert(DirectX::Internal::XMQuaternionIsUnit(vOrientation)); + + // Get the boxes normalized side directions. + XMMATRIX R = XMMatrixRotationQuaternion(vOrientation); + + // Adjust ray origin to be relative to center of the box. + XMVECTOR TOrigin = XMVectorSubtract(vCenter, Origin); + + // Compute the dot product againt each axis of the box. + XMVECTOR AxisDotOrigin = XMVector3Dot(R.r[0], TOrigin); + AxisDotOrigin = XMVectorSelect(AxisDotOrigin, XMVector3Dot(R.r[1], TOrigin), SelectY); + AxisDotOrigin = XMVectorSelect(AxisDotOrigin, XMVector3Dot(R.r[2], TOrigin), SelectZ); + + XMVECTOR AxisDotDirection = XMVector3Dot(R.r[0], Direction); + AxisDotDirection = XMVectorSelect(AxisDotDirection, XMVector3Dot(R.r[1], Direction), SelectY); + AxisDotDirection = XMVectorSelect(AxisDotDirection, XMVector3Dot(R.r[2], Direction), SelectZ); + + // if (fabs(AxisDotDirection) <= Epsilon) the ray is nearly parallel to the slab. + XMVECTOR IsParallel = XMVectorLessOrEqual(XMVectorAbs(AxisDotDirection), g_RayEpsilon); + + // Test against all three axes simultaneously. 
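+ // For slab i, AxisDotOrigin holds the box center relative to the ray origin
+ // projected onto box axis i, so the two slab faces are crossed at the ray
+ // parameters t1 = (AxisDotOrigin - Extents) / AxisDotDirection and
+ // t2 = (AxisDotOrigin + Extents) / AxisDotDirection. The ray hits the box only
+ // if the largest per-axis entry parameter does not exceed the smallest exit
+ // parameter and that exit parameter is non-negative; near-parallel axes are
+ // excluded here and handled by the in-bounds check further below.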
+ XMVECTOR InverseAxisDotDirection = XMVectorReciprocal(AxisDotDirection); + XMVECTOR t1 = XMVectorMultiply(XMVectorSubtract(AxisDotOrigin, vExtents), InverseAxisDotDirection); + XMVECTOR t2 = XMVectorMultiply(XMVectorAdd(AxisDotOrigin, vExtents), InverseAxisDotDirection); + + // Compute the max of min(t1,t2) and the min of max(t1,t2) ensuring we don't + // use the results from any directions parallel to the slab. + XMVECTOR t_min = XMVectorSelect(XMVectorMin(t1, t2), g_FltMin, IsParallel); + XMVECTOR t_max = XMVectorSelect(XMVectorMax(t1, t2), g_FltMax, IsParallel); + + // t_min.x = maximum( t_min.x, t_min.y, t_min.z ); + // t_max.x = minimum( t_max.x, t_max.y, t_max.z ); + t_min = XMVectorMax(t_min, XMVectorSplatY(t_min)); // x = max(x,y) + t_min = XMVectorMax(t_min, XMVectorSplatZ(t_min)); // x = max(max(x,y),z) + t_max = XMVectorMin(t_max, XMVectorSplatY(t_max)); // x = min(x,y) + t_max = XMVectorMin(t_max, XMVectorSplatZ(t_max)); // x = min(min(x,y),z) + + // if ( t_min > t_max ) return false; + XMVECTOR NoIntersection = XMVectorGreater(XMVectorSplatX(t_min), XMVectorSplatX(t_max)); + + // if ( t_max < 0.0f ) return false; + NoIntersection = XMVectorOrInt(NoIntersection, XMVectorLess(XMVectorSplatX(t_max), XMVectorZero())); + + // if (IsParallel && (-Extents > AxisDotOrigin || Extents < AxisDotOrigin)) return false; + XMVECTOR ParallelOverlap = XMVectorInBounds(AxisDotOrigin, vExtents); + NoIntersection = XMVectorOrInt(NoIntersection, XMVectorAndCInt(IsParallel, ParallelOverlap)); + + if (!DirectX::Internal::XMVector3AnyTrue(NoIntersection)) + { + // Store the x-component to *pDist + XMStoreFloat(&Dist, t_min); + return true; + } + + Dist = 0.f; + return false; +} + + +//----------------------------------------------------------------------------- +// Test an oriented box vs 6 planes (typically forming a frustum). +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType XM_CALLCONV BoundingOrientedBox::ContainedBy( + FXMVECTOR Plane0, FXMVECTOR Plane1, FXMVECTOR Plane2, + GXMVECTOR Plane3, + HXMVECTOR Plane4, HXMVECTOR Plane5) const noexcept +{ + // Load the box. + XMVECTOR vCenter = XMLoadFloat3(&Center); + XMVECTOR vExtents = XMLoadFloat3(&Extents); + XMVECTOR BoxOrientation = XMLoadFloat4(&Orientation); + + assert(DirectX::Internal::XMQuaternionIsUnit(BoxOrientation)); + + // Set w of the center to one so we can dot4 with a plane. + vCenter = XMVectorInsert<0, 0, 0, 0, 1>(vCenter, XMVectorSplatOne()); + + // Build the 3x3 rotation matrix that defines the box axes. + XMMATRIX R = XMMatrixRotationQuaternion(BoxOrientation); + + XMVECTOR Outside, Inside; + + // Test against each plane. 
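+ // The six planes are expected to have outward-facing normals: the helper
+ // reports Outside when the box lies entirely on the positive side of a plane
+ // and Inside when it lies entirely on the negative side. AnyOutside ORs and
+ // AllInside ANDs those results across all six planes.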
+ DirectX::Internal::FastIntersectOrientedBoxPlane(vCenter, vExtents, R.r[0], R.r[1], R.r[2], Plane0, Outside, Inside);
+
+ XMVECTOR AnyOutside = Outside;
+ XMVECTOR AllInside = Inside;
+
+ DirectX::Internal::FastIntersectOrientedBoxPlane(vCenter, vExtents, R.r[0], R.r[1], R.r[2], Plane1, Outside, Inside);
+ AnyOutside = XMVectorOrInt(AnyOutside, Outside);
+ AllInside = XMVectorAndInt(AllInside, Inside);
+
+ DirectX::Internal::FastIntersectOrientedBoxPlane(vCenter, vExtents, R.r[0], R.r[1], R.r[2], Plane2, Outside, Inside);
+ AnyOutside = XMVectorOrInt(AnyOutside, Outside);
+ AllInside = XMVectorAndInt(AllInside, Inside);
+
+ DirectX::Internal::FastIntersectOrientedBoxPlane(vCenter, vExtents, R.r[0], R.r[1], R.r[2], Plane3, Outside, Inside);
+ AnyOutside = XMVectorOrInt(AnyOutside, Outside);
+ AllInside = XMVectorAndInt(AllInside, Inside);
+
+ DirectX::Internal::FastIntersectOrientedBoxPlane(vCenter, vExtents, R.r[0], R.r[1], R.r[2], Plane4, Outside, Inside);
+ AnyOutside = XMVectorOrInt(AnyOutside, Outside);
+ AllInside = XMVectorAndInt(AllInside, Inside);
+
+ DirectX::Internal::FastIntersectOrientedBoxPlane(vCenter, vExtents, R.r[0], R.r[1], R.r[2], Plane5, Outside, Inside);
+ AnyOutside = XMVectorOrInt(AnyOutside, Outside);
+ AllInside = XMVectorAndInt(AllInside, Inside);
+
+ // If the box is outside any plane it is outside.
+ if (XMVector4EqualInt(AnyOutside, XMVectorTrueInt()))
+ return DISJOINT;
+
+ // If the box is inside all planes it is inside.
+ if (XMVector4EqualInt(AllInside, XMVectorTrueInt()))
+ return CONTAINS;
+
+ // The box is not inside all planes or outside a plane, it may intersect.
+ return INTERSECTS;
+}
+
+
+//-----------------------------------------------------------------------------
+// Create oriented bounding box from axis-aligned bounding box
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void BoundingOrientedBox::CreateFromBoundingBox(BoundingOrientedBox& Out, const BoundingBox& box) noexcept
+{
+ Out.Center = box.Center;
+ Out.Extents = box.Extents;
+ Out.Orientation = XMFLOAT4(0.f, 0.f, 0.f, 1.f);
+}
+
+
+//-----------------------------------------------------------------------------
+// Find the approximate minimum oriented bounding box containing a set of
+// points. Exact computation of the minimum oriented bounding box is possible
+// but is slower and requires a more complex algorithm.
+// The algorithm works by computing the inertia tensor of the points and then
+// using the eigenvectors of the inertia tensor as the axes of the box.
+// Computing the inertia tensor of the convex hull of the points will usually
+// result in a better bounding box but the computation is more complex.
+// Exact computation of the minimum oriented bounding box is possible but the
+// best known algorithm is O(N^3) and is significantly more complex to implement.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void BoundingOrientedBox::CreateFromPoints(BoundingOrientedBox& Out, size_t Count, const XMFLOAT3* pPoints, size_t Stride) noexcept
+{
+ assert(Count > 0);
+ assert(pPoints != nullptr);
+
+ XMVECTOR CenterOfMass = XMVectorZero();
+
+ // Compute the center of mass and inertia tensor of the points.
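+ // The two loops below accumulate the six unique entries of the covariance
+ // matrix of the points about their mean: XX_YY_ZZ gathers (sum dx*dx,
+ // sum dy*dy, sum dz*dz) and XY_XZ_YZ gathers (sum dx*dy, sum dx*dz, sum dy*dz),
+ // where (dx, dy, dz) is a point minus the center of mass. The eigenvectors of
+ // that symmetric 3x3 matrix become the candidate box axes.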
+ for (size_t i = 0; i < Count; ++i)
+ {
+ XMVECTOR Point = XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(reinterpret_cast<const uint8_t*>(pPoints) + i * Stride));
+
+ CenterOfMass = XMVectorAdd(CenterOfMass, Point);
+ }
+
+ CenterOfMass = XMVectorMultiply(CenterOfMass, XMVectorReciprocal(XMVectorReplicate(float(Count))));
+
+ // Compute the inertia tensor of the points around the center of mass.
+ // Using the center of mass is not strictly necessary, but will hopefully
+ // improve the stability of finding the eigenvectors.
+ XMVECTOR XX_YY_ZZ = XMVectorZero();
+ XMVECTOR XY_XZ_YZ = XMVectorZero();
+
+ for (size_t i = 0; i < Count; ++i)
+ {
+ XMVECTOR Point = XMVectorSubtract(XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(reinterpret_cast<const uint8_t*>(pPoints) + i * Stride)), CenterOfMass);
+
+ XX_YY_ZZ = XMVectorAdd(XX_YY_ZZ, XMVectorMultiply(Point, Point));
+
+ XMVECTOR XXY = XMVectorSwizzle<XM_SWIZZLE_X, XM_SWIZZLE_X, XM_SWIZZLE_Y, XM_SWIZZLE_W>(Point);
+ XMVECTOR YZZ = XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_Z, XM_SWIZZLE_Z, XM_SWIZZLE_W>(Point);
+
+ XY_XZ_YZ = XMVectorAdd(XY_XZ_YZ, XMVectorMultiply(XXY, YZZ));
+ }
+
+ XMVECTOR v1, v2, v3;
+
+ // Compute the eigenvectors of the inertia tensor.
+ DirectX::Internal::CalculateEigenVectorsFromCovarianceMatrix(XMVectorGetX(XX_YY_ZZ), XMVectorGetY(XX_YY_ZZ),
+ XMVectorGetZ(XX_YY_ZZ),
+ XMVectorGetX(XY_XZ_YZ), XMVectorGetY(XY_XZ_YZ),
+ XMVectorGetZ(XY_XZ_YZ),
+ &v1, &v2, &v3);
+
+ // Put them in a matrix.
+ XMMATRIX R;
+
+ R.r[0] = XMVectorSetW(v1, 0.f);
+ R.r[1] = XMVectorSetW(v2, 0.f);
+ R.r[2] = XMVectorSetW(v3, 0.f);
+ R.r[3] = g_XMIdentityR3.v;
+
+ // Multiply by -1 to convert the matrix into a right handed coordinate
+ // system (Det ~= 1) in case the eigenvectors form a left handed
+ // coordinate system (Det ~= -1) because XMQuaternionRotationMatrix only
+ // works on right handed matrices.
+ XMVECTOR Det = XMMatrixDeterminant(R);
+
+ if (XMVector4Less(Det, XMVectorZero()))
+ {
+ R.r[0] = XMVectorMultiply(R.r[0], g_XMNegativeOne.v);
+ R.r[1] = XMVectorMultiply(R.r[1], g_XMNegativeOne.v);
+ R.r[2] = XMVectorMultiply(R.r[2], g_XMNegativeOne.v);
+ }
+
+ // Get the rotation quaternion from the matrix.
+ XMVECTOR vOrientation = XMQuaternionRotationMatrix(R);
+
+ // Make sure it is normal (in case the vectors are slightly non-orthogonal).
+ vOrientation = XMQuaternionNormalize(vOrientation);
+
+ // Rebuild the rotation matrix from the quaternion.
+ R = XMMatrixRotationQuaternion(vOrientation);
+
+ // Build the rotation into the rotated space.
+ XMMATRIX InverseR = XMMatrixTranspose(R);
+
+ // Find the minimum OBB using the eigenvectors as the axes.
+ XMVECTOR vMin, vMax;
+
+ vMin = vMax = XMVector3TransformNormal(XMLoadFloat3(pPoints), InverseR);
+
+ for (size_t i = 1; i < Count; ++i)
+ {
+ XMVECTOR Point = XMVector3TransformNormal(XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(reinterpret_cast<const uint8_t*>(pPoints) + i * Stride)),
+ InverseR);
+
+ vMin = XMVectorMin(vMin, Point);
+ vMax = XMVectorMax(vMax, Point);
+ }
+
+ // Rotate the center into world space.
+ XMVECTOR vCenter = XMVectorScale(XMVectorAdd(vMin, vMax), 0.5f);
+ vCenter = XMVector3TransformNormal(vCenter, R);
+
+ // Store center, extents, and orientation.
+ XMStoreFloat3(&Out.Center, vCenter); + XMStoreFloat3(&Out.Extents, XMVectorScale(XMVectorSubtract(vMax, vMin), 0.5f)); + XMStoreFloat4(&Out.Orientation, vOrientation); +} + + +/**************************************************************************** + * + * BoundingFrustum + * + ****************************************************************************/ + +_Use_decl_annotations_ +inline BoundingFrustum::BoundingFrustum(CXMMATRIX Projection, bool rhcoords) noexcept +{ + CreateFromMatrix(*this, Projection, rhcoords); +} + + +//----------------------------------------------------------------------------- +// Transform a frustum by an angle preserving transform. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline void XM_CALLCONV BoundingFrustum::Transform(BoundingFrustum& Out, FXMMATRIX M) const noexcept +{ + // Load the frustum. + XMVECTOR vOrigin = XMLoadFloat3(&Origin); + XMVECTOR vOrientation = XMLoadFloat4(&Orientation); + + assert(DirectX::Internal::XMQuaternionIsUnit(vOrientation)); + + // Composite the frustum rotation and the transform rotation + XMMATRIX nM; + nM.r[0] = XMVector3Normalize(M.r[0]); + nM.r[1] = XMVector3Normalize(M.r[1]); + nM.r[2] = XMVector3Normalize(M.r[2]); + nM.r[3] = g_XMIdentityR3; + XMVECTOR Rotation = XMQuaternionRotationMatrix(nM); + vOrientation = XMQuaternionMultiply(vOrientation, Rotation); + + // Transform the center. + vOrigin = XMVector3Transform(vOrigin, M); + + // Store the frustum. + XMStoreFloat3(&Out.Origin, vOrigin); + XMStoreFloat4(&Out.Orientation, vOrientation); + + // Scale the near and far distances (the slopes remain the same). + XMVECTOR dX = XMVector3Dot(M.r[0], M.r[0]); + XMVECTOR dY = XMVector3Dot(M.r[1], M.r[1]); + XMVECTOR dZ = XMVector3Dot(M.r[2], M.r[2]); + + XMVECTOR d = XMVectorMax(dX, XMVectorMax(dY, dZ)); + float Scale = sqrtf(XMVectorGetX(d)); + + Out.Near = Near * Scale; + Out.Far = Far * Scale; + + // Copy the slopes. + Out.RightSlope = RightSlope; + Out.LeftSlope = LeftSlope; + Out.TopSlope = TopSlope; + Out.BottomSlope = BottomSlope; +} + +_Use_decl_annotations_ +inline void XM_CALLCONV BoundingFrustum::Transform(BoundingFrustum& Out, float Scale, FXMVECTOR Rotation, FXMVECTOR Translation) const noexcept +{ + assert(DirectX::Internal::XMQuaternionIsUnit(Rotation)); + + // Load the frustum. + XMVECTOR vOrigin = XMLoadFloat3(&Origin); + XMVECTOR vOrientation = XMLoadFloat4(&Orientation); + + assert(DirectX::Internal::XMQuaternionIsUnit(vOrientation)); + + // Composite the frustum rotation and the transform rotation. + vOrientation = XMQuaternionMultiply(vOrientation, Rotation); + + // Transform the origin. + vOrigin = XMVectorAdd(XMVector3Rotate(XMVectorScale(vOrigin, Scale), Rotation), Translation); + + // Store the frustum. + XMStoreFloat3(&Out.Origin, vOrigin); + XMStoreFloat4(&Out.Orientation, vOrientation); + + // Scale the near and far distances (the slopes remain the same). + Out.Near = Near * Scale; + Out.Far = Far * Scale; + + // Copy the slopes. + Out.RightSlope = RightSlope; + Out.LeftSlope = LeftSlope; + Out.TopSlope = TopSlope; + Out.BottomSlope = BottomSlope; +} + + +//----------------------------------------------------------------------------- +// Get the corner points of the frustum +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline void BoundingFrustum::GetCorners(XMFLOAT3* Corners) const noexcept +{ + assert(Corners != nullptr); + + // Load origin and orientation of the frustum. 
+ XMVECTOR vOrigin = XMLoadFloat3(&Origin); + XMVECTOR vOrientation = XMLoadFloat4(&Orientation); + + assert(DirectX::Internal::XMQuaternionIsUnit(vOrientation)); + + // Build the corners of the frustum. + XMVECTOR vRightTop = XMVectorSet(RightSlope, TopSlope, 1.0f, 0.0f); + XMVECTOR vRightBottom = XMVectorSet(RightSlope, BottomSlope, 1.0f, 0.0f); + XMVECTOR vLeftTop = XMVectorSet(LeftSlope, TopSlope, 1.0f, 0.0f); + XMVECTOR vLeftBottom = XMVectorSet(LeftSlope, BottomSlope, 1.0f, 0.0f); + XMVECTOR vNear = XMVectorReplicatePtr(&Near); + XMVECTOR vFar = XMVectorReplicatePtr(&Far); + + // Returns 8 corners position of bounding frustum. + // Near Far + // 0----1 4----5 + // | | | | + // | | | | + // 3----2 7----6 + + XMVECTOR vCorners[CORNER_COUNT]; + vCorners[0] = XMVectorMultiply(vLeftTop, vNear); + vCorners[1] = XMVectorMultiply(vRightTop, vNear); + vCorners[2] = XMVectorMultiply(vRightBottom, vNear); + vCorners[3] = XMVectorMultiply(vLeftBottom, vNear); + vCorners[4] = XMVectorMultiply(vLeftTop, vFar); + vCorners[5] = XMVectorMultiply(vRightTop, vFar); + vCorners[6] = XMVectorMultiply(vRightBottom, vFar); + vCorners[7] = XMVectorMultiply(vLeftBottom, vFar); + + for (size_t i = 0; i < CORNER_COUNT; ++i) + { + XMVECTOR C = XMVectorAdd(XMVector3Rotate(vCorners[i], vOrientation), vOrigin); + XMStoreFloat3(&Corners[i], C); + } +} + + +//----------------------------------------------------------------------------- +// Point in frustum test. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType XM_CALLCONV BoundingFrustum::Contains(FXMVECTOR Point) const noexcept +{ + // Build frustum planes. + XMVECTOR Planes[6]; + Planes[0] = XMVectorSet(0.0f, 0.0f, -1.0f, Near); + Planes[1] = XMVectorSet(0.0f, 0.0f, 1.0f, -Far); + Planes[2] = XMVectorSet(1.0f, 0.0f, -RightSlope, 0.0f); + Planes[3] = XMVectorSet(-1.0f, 0.0f, LeftSlope, 0.0f); + Planes[4] = XMVectorSet(0.0f, 1.0f, -TopSlope, 0.0f); + Planes[5] = XMVectorSet(0.0f, -1.0f, BottomSlope, 0.0f); + + // Load origin and orientation. + XMVECTOR vOrigin = XMLoadFloat3(&Origin); + XMVECTOR vOrientation = XMLoadFloat4(&Orientation); + + assert(DirectX::Internal::XMQuaternionIsUnit(vOrientation)); + + // Transform point into local space of frustum. + XMVECTOR TPoint = XMVector3InverseRotate(XMVectorSubtract(Point, vOrigin), vOrientation); + + // Set w to one. + TPoint = XMVectorInsert<0, 0, 0, 0, 1>(TPoint, XMVectorSplatOne()); + + XMVECTOR Zero = XMVectorZero(); + XMVECTOR Outside = Zero; + + // Test point against each plane of the frustum. + for (size_t i = 0; i < 6; ++i) + { + XMVECTOR Dot = XMVector4Dot(TPoint, Planes[i]); + Outside = XMVectorOrInt(Outside, XMVectorGreater(Dot, Zero)); + } + + return XMVector4NotEqualInt(Outside, XMVectorTrueInt()) ? CONTAINS : DISJOINT; +} + + +//----------------------------------------------------------------------------- +// Triangle vs frustum test. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType XM_CALLCONV BoundingFrustum::Contains(FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2) const noexcept +{ + // Load origin and orientation of the frustum. 
+ XMVECTOR vOrigin = XMLoadFloat3(&Origin); + XMVECTOR vOrientation = XMLoadFloat4(&Orientation); + + // Create 6 planes (do it inline to encourage use of registers) + XMVECTOR NearPlane = XMVectorSet(0.0f, 0.0f, -1.0f, Near); + NearPlane = DirectX::Internal::XMPlaneTransform(NearPlane, vOrientation, vOrigin); + NearPlane = XMPlaneNormalize(NearPlane); + + XMVECTOR FarPlane = XMVectorSet(0.0f, 0.0f, 1.0f, -Far); + FarPlane = DirectX::Internal::XMPlaneTransform(FarPlane, vOrientation, vOrigin); + FarPlane = XMPlaneNormalize(FarPlane); + + XMVECTOR RightPlane = XMVectorSet(1.0f, 0.0f, -RightSlope, 0.0f); + RightPlane = DirectX::Internal::XMPlaneTransform(RightPlane, vOrientation, vOrigin); + RightPlane = XMPlaneNormalize(RightPlane); + + XMVECTOR LeftPlane = XMVectorSet(-1.0f, 0.0f, LeftSlope, 0.0f); + LeftPlane = DirectX::Internal::XMPlaneTransform(LeftPlane, vOrientation, vOrigin); + LeftPlane = XMPlaneNormalize(LeftPlane); + + XMVECTOR TopPlane = XMVectorSet(0.0f, 1.0f, -TopSlope, 0.0f); + TopPlane = DirectX::Internal::XMPlaneTransform(TopPlane, vOrientation, vOrigin); + TopPlane = XMPlaneNormalize(TopPlane); + + XMVECTOR BottomPlane = XMVectorSet(0.0f, -1.0f, BottomSlope, 0.0f); + BottomPlane = DirectX::Internal::XMPlaneTransform(BottomPlane, vOrientation, vOrigin); + BottomPlane = XMPlaneNormalize(BottomPlane); + + return TriangleTests::ContainedBy(V0, V1, V2, NearPlane, FarPlane, RightPlane, LeftPlane, TopPlane, BottomPlane); +} + + +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingFrustum::Contains(const BoundingSphere& sh) const noexcept +{ + // Load origin and orientation of the frustum. + XMVECTOR vOrigin = XMLoadFloat3(&Origin); + XMVECTOR vOrientation = XMLoadFloat4(&Orientation); + + // Create 6 planes (do it inline to encourage use of registers) + XMVECTOR NearPlane = XMVectorSet(0.0f, 0.0f, -1.0f, Near); + NearPlane = DirectX::Internal::XMPlaneTransform(NearPlane, vOrientation, vOrigin); + NearPlane = XMPlaneNormalize(NearPlane); + + XMVECTOR FarPlane = XMVectorSet(0.0f, 0.0f, 1.0f, -Far); + FarPlane = DirectX::Internal::XMPlaneTransform(FarPlane, vOrientation, vOrigin); + FarPlane = XMPlaneNormalize(FarPlane); + + XMVECTOR RightPlane = XMVectorSet(1.0f, 0.0f, -RightSlope, 0.0f); + RightPlane = DirectX::Internal::XMPlaneTransform(RightPlane, vOrientation, vOrigin); + RightPlane = XMPlaneNormalize(RightPlane); + + XMVECTOR LeftPlane = XMVectorSet(-1.0f, 0.0f, LeftSlope, 0.0f); + LeftPlane = DirectX::Internal::XMPlaneTransform(LeftPlane, vOrientation, vOrigin); + LeftPlane = XMPlaneNormalize(LeftPlane); + + XMVECTOR TopPlane = XMVectorSet(0.0f, 1.0f, -TopSlope, 0.0f); + TopPlane = DirectX::Internal::XMPlaneTransform(TopPlane, vOrientation, vOrigin); + TopPlane = XMPlaneNormalize(TopPlane); + + XMVECTOR BottomPlane = XMVectorSet(0.0f, -1.0f, BottomSlope, 0.0f); + BottomPlane = DirectX::Internal::XMPlaneTransform(BottomPlane, vOrientation, vOrigin); + BottomPlane = XMPlaneNormalize(BottomPlane); + + return sh.ContainedBy(NearPlane, FarPlane, RightPlane, LeftPlane, TopPlane, BottomPlane); +} + + +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingFrustum::Contains(const BoundingBox& box) const noexcept +{ + // Load origin and orientation of the frustum. 
+ XMVECTOR vOrigin = XMLoadFloat3(&Origin); + XMVECTOR vOrientation = XMLoadFloat4(&Orientation); + + // Create 6 planes (do it inline to encourage use of registers) + XMVECTOR NearPlane = XMVectorSet(0.0f, 0.0f, -1.0f, Near); + NearPlane = DirectX::Internal::XMPlaneTransform(NearPlane, vOrientation, vOrigin); + NearPlane = XMPlaneNormalize(NearPlane); + + XMVECTOR FarPlane = XMVectorSet(0.0f, 0.0f, 1.0f, -Far); + FarPlane = DirectX::Internal::XMPlaneTransform(FarPlane, vOrientation, vOrigin); + FarPlane = XMPlaneNormalize(FarPlane); + + XMVECTOR RightPlane = XMVectorSet(1.0f, 0.0f, -RightSlope, 0.0f); + RightPlane = DirectX::Internal::XMPlaneTransform(RightPlane, vOrientation, vOrigin); + RightPlane = XMPlaneNormalize(RightPlane); + + XMVECTOR LeftPlane = XMVectorSet(-1.0f, 0.0f, LeftSlope, 0.0f); + LeftPlane = DirectX::Internal::XMPlaneTransform(LeftPlane, vOrientation, vOrigin); + LeftPlane = XMPlaneNormalize(LeftPlane); + + XMVECTOR TopPlane = XMVectorSet(0.0f, 1.0f, -TopSlope, 0.0f); + TopPlane = DirectX::Internal::XMPlaneTransform(TopPlane, vOrientation, vOrigin); + TopPlane = XMPlaneNormalize(TopPlane); + + XMVECTOR BottomPlane = XMVectorSet(0.0f, -1.0f, BottomSlope, 0.0f); + BottomPlane = DirectX::Internal::XMPlaneTransform(BottomPlane, vOrientation, vOrigin); + BottomPlane = XMPlaneNormalize(BottomPlane); + + return box.ContainedBy(NearPlane, FarPlane, RightPlane, LeftPlane, TopPlane, BottomPlane); +} + + +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingFrustum::Contains(const BoundingOrientedBox& box) const noexcept +{ + // Load origin and orientation of the frustum. + XMVECTOR vOrigin = XMLoadFloat3(&Origin); + XMVECTOR vOrientation = XMLoadFloat4(&Orientation); + + // Create 6 planes (do it inline to encourage use of registers) + XMVECTOR NearPlane = XMVectorSet(0.0f, 0.0f, -1.0f, Near); + NearPlane = DirectX::Internal::XMPlaneTransform(NearPlane, vOrientation, vOrigin); + NearPlane = XMPlaneNormalize(NearPlane); + + XMVECTOR FarPlane = XMVectorSet(0.0f, 0.0f, 1.0f, -Far); + FarPlane = DirectX::Internal::XMPlaneTransform(FarPlane, vOrientation, vOrigin); + FarPlane = XMPlaneNormalize(FarPlane); + + XMVECTOR RightPlane = XMVectorSet(1.0f, 0.0f, -RightSlope, 0.0f); + RightPlane = DirectX::Internal::XMPlaneTransform(RightPlane, vOrientation, vOrigin); + RightPlane = XMPlaneNormalize(RightPlane); + + XMVECTOR LeftPlane = XMVectorSet(-1.0f, 0.0f, LeftSlope, 0.0f); + LeftPlane = DirectX::Internal::XMPlaneTransform(LeftPlane, vOrientation, vOrigin); + LeftPlane = XMPlaneNormalize(LeftPlane); + + XMVECTOR TopPlane = XMVectorSet(0.0f, 1.0f, -TopSlope, 0.0f); + TopPlane = DirectX::Internal::XMPlaneTransform(TopPlane, vOrientation, vOrigin); + TopPlane = XMPlaneNormalize(TopPlane); + + XMVECTOR BottomPlane = XMVectorSet(0.0f, -1.0f, BottomSlope, 0.0f); + BottomPlane = DirectX::Internal::XMPlaneTransform(BottomPlane, vOrientation, vOrigin); + BottomPlane = XMPlaneNormalize(BottomPlane); + + return box.ContainedBy(NearPlane, FarPlane, RightPlane, LeftPlane, TopPlane, BottomPlane); +} + + +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingFrustum::Contains(const BoundingFrustum& fr) const noexcept +{ + // Load origin and orientation of the frustum. 
+ XMVECTOR vOrigin = XMLoadFloat3(&Origin); + XMVECTOR vOrientation = XMLoadFloat4(&Orientation); + + // Create 6 planes (do it inline to encourage use of registers) + XMVECTOR NearPlane = XMVectorSet(0.0f, 0.0f, -1.0f, Near); + NearPlane = DirectX::Internal::XMPlaneTransform(NearPlane, vOrientation, vOrigin); + NearPlane = XMPlaneNormalize(NearPlane); + + XMVECTOR FarPlane = XMVectorSet(0.0f, 0.0f, 1.0f, -Far); + FarPlane = DirectX::Internal::XMPlaneTransform(FarPlane, vOrientation, vOrigin); + FarPlane = XMPlaneNormalize(FarPlane); + + XMVECTOR RightPlane = XMVectorSet(1.0f, 0.0f, -RightSlope, 0.0f); + RightPlane = DirectX::Internal::XMPlaneTransform(RightPlane, vOrientation, vOrigin); + RightPlane = XMPlaneNormalize(RightPlane); + + XMVECTOR LeftPlane = XMVectorSet(-1.0f, 0.0f, LeftSlope, 0.0f); + LeftPlane = DirectX::Internal::XMPlaneTransform(LeftPlane, vOrientation, vOrigin); + LeftPlane = XMPlaneNormalize(LeftPlane); + + XMVECTOR TopPlane = XMVectorSet(0.0f, 1.0f, -TopSlope, 0.0f); + TopPlane = DirectX::Internal::XMPlaneTransform(TopPlane, vOrientation, vOrigin); + TopPlane = XMPlaneNormalize(TopPlane); + + XMVECTOR BottomPlane = XMVectorSet(0.0f, -1.0f, BottomSlope, 0.0f); + BottomPlane = DirectX::Internal::XMPlaneTransform(BottomPlane, vOrientation, vOrigin); + BottomPlane = XMPlaneNormalize(BottomPlane); + + return fr.ContainedBy(NearPlane, FarPlane, RightPlane, LeftPlane, TopPlane, BottomPlane); +} + + +//----------------------------------------------------------------------------- +// Exact sphere vs frustum test. The algorithm first checks the sphere against +// the planes of the frustum, then if the plane checks were indeterminate finds +// the nearest feature (plane, line, point) on the frustum to the center of the +// sphere and compares the distance to the nearest feature to the radius of the +// sphere +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingFrustum::Intersects(const BoundingSphere& sh) const noexcept +{ + XMVECTOR Zero = XMVectorZero(); + + // Build the frustum planes. + XMVECTOR Planes[6]; + Planes[0] = XMVectorSet(0.0f, 0.0f, -1.0f, Near); + Planes[1] = XMVectorSet(0.0f, 0.0f, 1.0f, -Far); + Planes[2] = XMVectorSet(1.0f, 0.0f, -RightSlope, 0.0f); + Planes[3] = XMVectorSet(-1.0f, 0.0f, LeftSlope, 0.0f); + Planes[4] = XMVectorSet(0.0f, 1.0f, -TopSlope, 0.0f); + Planes[5] = XMVectorSet(0.0f, -1.0f, BottomSlope, 0.0f); + + // Normalize the planes so we can compare to the sphere radius. + Planes[2] = XMVector3Normalize(Planes[2]); + Planes[3] = XMVector3Normalize(Planes[3]); + Planes[4] = XMVector3Normalize(Planes[4]); + Planes[5] = XMVector3Normalize(Planes[5]); + + // Load origin and orientation of the frustum. + XMVECTOR vOrigin = XMLoadFloat3(&Origin); + XMVECTOR vOrientation = XMLoadFloat4(&Orientation); + + assert(DirectX::Internal::XMQuaternionIsUnit(vOrientation)); + + // Load the sphere. + XMVECTOR vCenter = XMLoadFloat3(&sh.Center); + XMVECTOR vRadius = XMVectorReplicatePtr(&sh.Radius); + + // Transform the center of the sphere into the local space of frustum. + vCenter = XMVector3InverseRotate(XMVectorSubtract(vCenter, vOrigin), vOrientation); + + // Set w of the center to one so we can dot4 with the plane. + vCenter = XMVectorInsert<0, 0, 0, 0, 1>(vCenter, XMVectorSplatOne()); + + // Check against each plane of the frustum. 
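+ // With unit-length plane normals (the side planes were normalized above and
+ // the near/far normals are already unit length), each Dist[i] below is the
+ // signed distance from the sphere center to plane i, positive on the outside
+ // of the frustum: any Dist greater than the radius rejects, and Dist <= -radius
+ // on every plane means the sphere is completely inside.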
+ XMVECTOR Outside = XMVectorFalseInt(); + XMVECTOR InsideAll = XMVectorTrueInt(); + XMVECTOR CenterInsideAll = XMVectorTrueInt(); + + XMVECTOR Dist[6]; + + for (size_t i = 0; i < 6; ++i) + { + Dist[i] = XMVector4Dot(vCenter, Planes[i]); + + // Outside the plane? + Outside = XMVectorOrInt(Outside, XMVectorGreater(Dist[i], vRadius)); + + // Fully inside the plane? + InsideAll = XMVectorAndInt(InsideAll, XMVectorLessOrEqual(Dist[i], XMVectorNegate(vRadius))); + + // Check if the center is inside the plane. + CenterInsideAll = XMVectorAndInt(CenterInsideAll, XMVectorLessOrEqual(Dist[i], Zero)); + } + + // If the sphere is outside any of the planes it is outside. + if (XMVector4EqualInt(Outside, XMVectorTrueInt())) + return false; + + // If the sphere is inside all planes it is fully inside. + if (XMVector4EqualInt(InsideAll, XMVectorTrueInt())) + return true; + + // If the center of the sphere is inside all planes and the sphere intersects + // one or more planes then it must intersect. + if (XMVector4EqualInt(CenterInsideAll, XMVectorTrueInt())) + return true; + + // The sphere may be outside the frustum or intersecting the frustum. + // Find the nearest feature (face, edge, or corner) on the frustum + // to the sphere. + + // The faces adjacent to each face are: + static const size_t adjacent_faces[6][4] = + { + { 2, 3, 4, 5 }, // 0 + { 2, 3, 4, 5 }, // 1 + { 0, 1, 4, 5 }, // 2 + { 0, 1, 4, 5 }, // 3 + { 0, 1, 2, 3 }, // 4 + { 0, 1, 2, 3 } + }; // 5 + + XMVECTOR Intersects = XMVectorFalseInt(); + + // Check to see if the nearest feature is one of the planes. + for (size_t i = 0; i < 6; ++i) + { + // Find the nearest point on the plane to the center of the sphere. + XMVECTOR Point = XMVectorNegativeMultiplySubtract(Planes[i], Dist[i], vCenter); + + // Set w of the point to one. + Point = XMVectorInsert<0, 0, 0, 0, 1>(Point, XMVectorSplatOne()); + + // If the point is inside the face (inside the adjacent planes) then + // this plane is the nearest feature. + XMVECTOR InsideFace = XMVectorTrueInt(); + + for (size_t j = 0; j < 4; j++) + { + size_t plane_index = adjacent_faces[i][j]; + + InsideFace = XMVectorAndInt(InsideFace, + XMVectorLessOrEqual(XMVector4Dot(Point, Planes[plane_index]), Zero)); + } + + // Since we have already checked distance from the plane we know that the + // sphere must intersect if this plane is the nearest feature. + Intersects = XMVectorOrInt(Intersects, + XMVectorAndInt(XMVectorGreater(Dist[i], Zero), InsideFace)); + } + + if (XMVector4EqualInt(Intersects, XMVectorTrueInt())) + return true; + + // Build the corners of the frustum. 
+ XMVECTOR vRightTop = XMVectorSet(RightSlope, TopSlope, 1.0f, 0.0f); + XMVECTOR vRightBottom = XMVectorSet(RightSlope, BottomSlope, 1.0f, 0.0f); + XMVECTOR vLeftTop = XMVectorSet(LeftSlope, TopSlope, 1.0f, 0.0f); + XMVECTOR vLeftBottom = XMVectorSet(LeftSlope, BottomSlope, 1.0f, 0.0f); + XMVECTOR vNear = XMVectorReplicatePtr(&Near); + XMVECTOR vFar = XMVectorReplicatePtr(&Far); + + XMVECTOR Corners[CORNER_COUNT]; + Corners[0] = XMVectorMultiply(vRightTop, vNear); + Corners[1] = XMVectorMultiply(vRightBottom, vNear); + Corners[2] = XMVectorMultiply(vLeftTop, vNear); + Corners[3] = XMVectorMultiply(vLeftBottom, vNear); + Corners[4] = XMVectorMultiply(vRightTop, vFar); + Corners[5] = XMVectorMultiply(vRightBottom, vFar); + Corners[6] = XMVectorMultiply(vLeftTop, vFar); + Corners[7] = XMVectorMultiply(vLeftBottom, vFar); + + // The Edges are: + static const size_t edges[12][2] = + { + { 0, 1 }, { 2, 3 }, { 0, 2 }, { 1, 3 }, // Near plane + { 4, 5 }, { 6, 7 }, { 4, 6 }, { 5, 7 }, // Far plane + { 0, 4 }, { 1, 5 }, { 2, 6 }, { 3, 7 }, + }; // Near to far + + XMVECTOR RadiusSq = XMVectorMultiply(vRadius, vRadius); + + // Check to see if the nearest feature is one of the edges (or corners). + for (size_t i = 0; i < 12; ++i) + { + size_t ei0 = edges[i][0]; + size_t ei1 = edges[i][1]; + + // Find the nearest point on the edge to the center of the sphere. + // The corners of the frustum are included as the endpoints of the edges. + XMVECTOR Point = DirectX::Internal::PointOnLineSegmentNearestPoint(Corners[ei0], Corners[ei1], vCenter); + + XMVECTOR Delta = XMVectorSubtract(vCenter, Point); + + XMVECTOR DistSq = XMVector3Dot(Delta, Delta); + + // If the distance to the center of the sphere to the point is less than + // the radius of the sphere then it must intersect. + Intersects = XMVectorOrInt(Intersects, XMVectorLessOrEqual(DistSq, RadiusSq)); + } + + if (XMVector4EqualInt(Intersects, XMVectorTrueInt())) + return true; + + // The sphere must be outside the frustum. + return false; +} + + +//----------------------------------------------------------------------------- +// Exact axis aligned box vs frustum test. Constructs an oriented box and uses +// the oriented box vs frustum test. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingFrustum::Intersects(const BoundingBox& box) const noexcept +{ + // Make the axis aligned box oriented and do an OBB vs frustum test. + BoundingOrientedBox obox(box.Center, box.Extents, XMFLOAT4(0.f, 0.f, 0.f, 1.f)); + return Intersects(obox); +} + + +//----------------------------------------------------------------------------- +// Exact oriented box vs frustum test. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingFrustum::Intersects(const BoundingOrientedBox& box) const noexcept +{ + static const XMVECTORU32 SelectY = { { { XM_SELECT_0, XM_SELECT_1, XM_SELECT_0, XM_SELECT_0 } } }; + static const XMVECTORU32 SelectZ = { { { XM_SELECT_0, XM_SELECT_0, XM_SELECT_1, XM_SELECT_0 } } }; + + XMVECTOR Zero = XMVectorZero(); + + // Build the frustum planes. 
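+ // Unlike the sphere test above, these planes are deliberately left
+ // unnormalized: the center distance and the projected box "radius" computed
+ // below both use the same plane normal, so the comparisons are unaffected by
+ // the normal's length.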
+ XMVECTOR Planes[6]; + Planes[0] = XMVectorSet(0.0f, 0.0f, -1.0f, Near); + Planes[1] = XMVectorSet(0.0f, 0.0f, 1.0f, -Far); + Planes[2] = XMVectorSet(1.0f, 0.0f, -RightSlope, 0.0f); + Planes[3] = XMVectorSet(-1.0f, 0.0f, LeftSlope, 0.0f); + Planes[4] = XMVectorSet(0.0f, 1.0f, -TopSlope, 0.0f); + Planes[5] = XMVectorSet(0.0f, -1.0f, BottomSlope, 0.0f); + + // Load origin and orientation of the frustum. + XMVECTOR vOrigin = XMLoadFloat3(&Origin); + XMVECTOR FrustumOrientation = XMLoadFloat4(&Orientation); + + assert(DirectX::Internal::XMQuaternionIsUnit(FrustumOrientation)); + + // Load the box. + XMVECTOR Center = XMLoadFloat3(&box.Center); + XMVECTOR Extents = XMLoadFloat3(&box.Extents); + XMVECTOR BoxOrientation = XMLoadFloat4(&box.Orientation); + + assert(DirectX::Internal::XMQuaternionIsUnit(BoxOrientation)); + + // Transform the oriented box into the space of the frustum in order to + // minimize the number of transforms we have to do. + Center = XMVector3InverseRotate(XMVectorSubtract(Center, vOrigin), FrustumOrientation); + BoxOrientation = XMQuaternionMultiply(BoxOrientation, XMQuaternionConjugate(FrustumOrientation)); + + // Set w of the center to one so we can dot4 with the plane. + Center = XMVectorInsert<0, 0, 0, 0, 1>(Center, XMVectorSplatOne()); + + // Build the 3x3 rotation matrix that defines the box axes. + XMMATRIX R = XMMatrixRotationQuaternion(BoxOrientation); + + // Check against each plane of the frustum. + XMVECTOR Outside = XMVectorFalseInt(); + XMVECTOR InsideAll = XMVectorTrueInt(); + XMVECTOR CenterInsideAll = XMVectorTrueInt(); + + for (size_t i = 0; i < 6; ++i) + { + // Compute the distance to the center of the box. + XMVECTOR Dist = XMVector4Dot(Center, Planes[i]); + + // Project the axes of the box onto the normal of the plane. Half the + // length of the projection (sometime called the "radius") is equal to + // h(u) * abs(n dot b(u))) + h(v) * abs(n dot b(v)) + h(w) * abs(n dot b(w)) + // where h(i) are extents of the box, n is the plane normal, and b(i) are the + // axes of the box. + XMVECTOR Radius = XMVector3Dot(Planes[i], R.r[0]); + Radius = XMVectorSelect(Radius, XMVector3Dot(Planes[i], R.r[1]), SelectY); + Radius = XMVectorSelect(Radius, XMVector3Dot(Planes[i], R.r[2]), SelectZ); + Radius = XMVector3Dot(Extents, XMVectorAbs(Radius)); + + // Outside the plane? + Outside = XMVectorOrInt(Outside, XMVectorGreater(Dist, Radius)); + + // Fully inside the plane? + InsideAll = XMVectorAndInt(InsideAll, XMVectorLessOrEqual(Dist, XMVectorNegate(Radius))); + + // Check if the center is inside the plane. + CenterInsideAll = XMVectorAndInt(CenterInsideAll, XMVectorLessOrEqual(Dist, Zero)); + } + + // If the box is outside any of the planes it is outside. + if (XMVector4EqualInt(Outside, XMVectorTrueInt())) + return false; + + // If the box is inside all planes it is fully inside. + if (XMVector4EqualInt(InsideAll, XMVectorTrueInt())) + return true; + + // If the center of the box is inside all planes and the box intersects + // one or more planes then it must intersect. + if (XMVector4EqualInt(CenterInsideAll, XMVectorTrueInt())) + return true; + + // Build the corners of the frustum. 
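+ // Reaching this point means the plane tests were inconclusive, so the routine
+ // falls back to separating-axis style tests: first the box's three axes, then
+ // the cross products of the box axes with the frustum edge directions, in each
+ // case projecting the frustum corners built below onto the candidate axis.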
+ XMVECTOR vRightTop = XMVectorSet(RightSlope, TopSlope, 1.0f, 0.0f); + XMVECTOR vRightBottom = XMVectorSet(RightSlope, BottomSlope, 1.0f, 0.0f); + XMVECTOR vLeftTop = XMVectorSet(LeftSlope, TopSlope, 1.0f, 0.0f); + XMVECTOR vLeftBottom = XMVectorSet(LeftSlope, BottomSlope, 1.0f, 0.0f); + XMVECTOR vNear = XMVectorReplicatePtr(&Near); + XMVECTOR vFar = XMVectorReplicatePtr(&Far); + + XMVECTOR Corners[CORNER_COUNT]; + Corners[0] = XMVectorMultiply(vRightTop, vNear); + Corners[1] = XMVectorMultiply(vRightBottom, vNear); + Corners[2] = XMVectorMultiply(vLeftTop, vNear); + Corners[3] = XMVectorMultiply(vLeftBottom, vNear); + Corners[4] = XMVectorMultiply(vRightTop, vFar); + Corners[5] = XMVectorMultiply(vRightBottom, vFar); + Corners[6] = XMVectorMultiply(vLeftTop, vFar); + Corners[7] = XMVectorMultiply(vLeftBottom, vFar); + + // Test against box axes (3) + { + // Find the min/max values of the projection of the frustum onto each axis. + XMVECTOR FrustumMin, FrustumMax; + + FrustumMin = XMVector3Dot(Corners[0], R.r[0]); + FrustumMin = XMVectorSelect(FrustumMin, XMVector3Dot(Corners[0], R.r[1]), SelectY); + FrustumMin = XMVectorSelect(FrustumMin, XMVector3Dot(Corners[0], R.r[2]), SelectZ); + FrustumMax = FrustumMin; + + for (size_t i = 1; i < BoundingOrientedBox::CORNER_COUNT; ++i) + { + XMVECTOR Temp = XMVector3Dot(Corners[i], R.r[0]); + Temp = XMVectorSelect(Temp, XMVector3Dot(Corners[i], R.r[1]), SelectY); + Temp = XMVectorSelect(Temp, XMVector3Dot(Corners[i], R.r[2]), SelectZ); + + FrustumMin = XMVectorMin(FrustumMin, Temp); + FrustumMax = XMVectorMax(FrustumMax, Temp); + } + + // Project the center of the box onto the axes. + XMVECTOR BoxDist = XMVector3Dot(Center, R.r[0]); + BoxDist = XMVectorSelect(BoxDist, XMVector3Dot(Center, R.r[1]), SelectY); + BoxDist = XMVectorSelect(BoxDist, XMVector3Dot(Center, R.r[2]), SelectZ); + + // The projection of the box onto the axis is just its Center and Extents. + // if (min > box_max || max < box_min) reject; + XMVECTOR Result = XMVectorOrInt(XMVectorGreater(FrustumMin, XMVectorAdd(BoxDist, Extents)), + XMVectorLess(FrustumMax, XMVectorSubtract(BoxDist, Extents))); + + if (DirectX::Internal::XMVector3AnyTrue(Result)) + return false; + } + + // Test against edge/edge axes (3*6). + XMVECTOR FrustumEdgeAxis[6]; + + FrustumEdgeAxis[0] = vRightTop; + FrustumEdgeAxis[1] = vRightBottom; + FrustumEdgeAxis[2] = vLeftTop; + FrustumEdgeAxis[3] = vLeftBottom; + FrustumEdgeAxis[4] = XMVectorSubtract(vRightTop, vLeftTop); + FrustumEdgeAxis[5] = XMVectorSubtract(vLeftBottom, vLeftTop); + + for (size_t i = 0; i < 3; ++i) + { + for (size_t j = 0; j < 6; j++) + { + // Compute the axis we are going to test. + XMVECTOR Axis = XMVector3Cross(R.r[i], FrustumEdgeAxis[j]); + + // Find the min/max values of the projection of the frustum onto the axis. + XMVECTOR FrustumMin, FrustumMax; + + FrustumMin = FrustumMax = XMVector3Dot(Axis, Corners[0]); + + for (size_t k = 1; k < CORNER_COUNT; k++) + { + XMVECTOR Temp = XMVector3Dot(Axis, Corners[k]); + FrustumMin = XMVectorMin(FrustumMin, Temp); + FrustumMax = XMVectorMax(FrustumMax, Temp); + } + + // Project the center of the box onto the axis. + XMVECTOR Dist = XMVector3Dot(Center, Axis); + + // Project the axes of the box onto the axis to find the "radius" of the box. 
+ XMVECTOR Radius = XMVector3Dot(Axis, R.r[0]); + Radius = XMVectorSelect(Radius, XMVector3Dot(Axis, R.r[1]), SelectY); + Radius = XMVectorSelect(Radius, XMVector3Dot(Axis, R.r[2]), SelectZ); + Radius = XMVector3Dot(Extents, XMVectorAbs(Radius)); + + // if (center > max + radius || center < min - radius) reject; + Outside = XMVectorOrInt(Outside, XMVectorGreater(Dist, XMVectorAdd(FrustumMax, Radius))); + Outside = XMVectorOrInt(Outside, XMVectorLess(Dist, XMVectorSubtract(FrustumMin, Radius))); + } + } + + if (XMVector4EqualInt(Outside, XMVectorTrueInt())) + return false; + + // If we did not find a separating plane then the box must intersect the frustum. + return true; +} + + +//----------------------------------------------------------------------------- +// Exact frustum vs frustum test. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingFrustum::Intersects(const BoundingFrustum& fr) const noexcept +{ + // Load origin and orientation of frustum B. + XMVECTOR OriginB = XMLoadFloat3(&Origin); + XMVECTOR OrientationB = XMLoadFloat4(&Orientation); + + assert(DirectX::Internal::XMQuaternionIsUnit(OrientationB)); + + // Build the planes of frustum B. + XMVECTOR AxisB[6]; + AxisB[0] = XMVectorSet(0.0f, 0.0f, -1.0f, 0.0f); + AxisB[1] = XMVectorSet(0.0f, 0.0f, 1.0f, 0.0f); + AxisB[2] = XMVectorSet(1.0f, 0.0f, -RightSlope, 0.0f); + AxisB[3] = XMVectorSet(-1.0f, 0.0f, LeftSlope, 0.0f); + AxisB[4] = XMVectorSet(0.0f, 1.0f, -TopSlope, 0.0f); + AxisB[5] = XMVectorSet(0.0f, -1.0f, BottomSlope, 0.0f); + + XMVECTOR PlaneDistB[6]; + PlaneDistB[0] = XMVectorNegate(XMVectorReplicatePtr(&Near)); + PlaneDistB[1] = XMVectorReplicatePtr(&Far); + PlaneDistB[2] = XMVectorZero(); + PlaneDistB[3] = XMVectorZero(); + PlaneDistB[4] = XMVectorZero(); + PlaneDistB[5] = XMVectorZero(); + + // Load origin and orientation of frustum A. + XMVECTOR OriginA = XMLoadFloat3(&fr.Origin); + XMVECTOR OrientationA = XMLoadFloat4(&fr.Orientation); + + assert(DirectX::Internal::XMQuaternionIsUnit(OrientationA)); + + // Transform frustum A into the space of the frustum B in order to + // minimize the number of transforms we have to do. + OriginA = XMVector3InverseRotate(XMVectorSubtract(OriginA, OriginB), OrientationB); + OrientationA = XMQuaternionMultiply(OrientationA, XMQuaternionConjugate(OrientationB)); + + // Build the corners of frustum A (in the local space of B). 
+ XMVECTOR RightTopA = XMVectorSet(fr.RightSlope, fr.TopSlope, 1.0f, 0.0f); + XMVECTOR RightBottomA = XMVectorSet(fr.RightSlope, fr.BottomSlope, 1.0f, 0.0f); + XMVECTOR LeftTopA = XMVectorSet(fr.LeftSlope, fr.TopSlope, 1.0f, 0.0f); + XMVECTOR LeftBottomA = XMVectorSet(fr.LeftSlope, fr.BottomSlope, 1.0f, 0.0f); + XMVECTOR NearA = XMVectorReplicatePtr(&fr.Near); + XMVECTOR FarA = XMVectorReplicatePtr(&fr.Far); + + RightTopA = XMVector3Rotate(RightTopA, OrientationA); + RightBottomA = XMVector3Rotate(RightBottomA, OrientationA); + LeftTopA = XMVector3Rotate(LeftTopA, OrientationA); + LeftBottomA = XMVector3Rotate(LeftBottomA, OrientationA); + + XMVECTOR CornersA[CORNER_COUNT]; + CornersA[0] = XMVectorMultiplyAdd(RightTopA, NearA, OriginA); + CornersA[1] = XMVectorMultiplyAdd(RightBottomA, NearA, OriginA); + CornersA[2] = XMVectorMultiplyAdd(LeftTopA, NearA, OriginA); + CornersA[3] = XMVectorMultiplyAdd(LeftBottomA, NearA, OriginA); + CornersA[4] = XMVectorMultiplyAdd(RightTopA, FarA, OriginA); + CornersA[5] = XMVectorMultiplyAdd(RightBottomA, FarA, OriginA); + CornersA[6] = XMVectorMultiplyAdd(LeftTopA, FarA, OriginA); + CornersA[7] = XMVectorMultiplyAdd(LeftBottomA, FarA, OriginA); + + // Check frustum A against each plane of frustum B. + XMVECTOR Outside = XMVectorFalseInt(); + XMVECTOR InsideAll = XMVectorTrueInt(); + + for (size_t i = 0; i < 6; ++i) + { + // Find the min/max projection of the frustum onto the plane normal. + XMVECTOR Min, Max; + + Min = Max = XMVector3Dot(AxisB[i], CornersA[0]); + + for (size_t j = 1; j < CORNER_COUNT; j++) + { + XMVECTOR Temp = XMVector3Dot(AxisB[i], CornersA[j]); + Min = XMVectorMin(Min, Temp); + Max = XMVectorMax(Max, Temp); + } + + // Outside the plane? + Outside = XMVectorOrInt(Outside, XMVectorGreater(Min, PlaneDistB[i])); + + // Fully inside the plane? + InsideAll = XMVectorAndInt(InsideAll, XMVectorLessOrEqual(Max, PlaneDistB[i])); + } + + // If the frustum A is outside any of the planes of frustum B it is outside. + if (XMVector4EqualInt(Outside, XMVectorTrueInt())) + return false; + + // If frustum A is inside all planes of frustum B it is fully inside. + if (XMVector4EqualInt(InsideAll, XMVectorTrueInt())) + return true; + + // Build the corners of frustum B. + XMVECTOR RightTopB = XMVectorSet(RightSlope, TopSlope, 1.0f, 0.0f); + XMVECTOR RightBottomB = XMVectorSet(RightSlope, BottomSlope, 1.0f, 0.0f); + XMVECTOR LeftTopB = XMVectorSet(LeftSlope, TopSlope, 1.0f, 0.0f); + XMVECTOR LeftBottomB = XMVectorSet(LeftSlope, BottomSlope, 1.0f, 0.0f); + XMVECTOR NearB = XMVectorReplicatePtr(&Near); + XMVECTOR FarB = XMVectorReplicatePtr(&Far); + + XMVECTOR CornersB[BoundingFrustum::CORNER_COUNT]; + CornersB[0] = XMVectorMultiply(RightTopB, NearB); + CornersB[1] = XMVectorMultiply(RightBottomB, NearB); + CornersB[2] = XMVectorMultiply(LeftTopB, NearB); + CornersB[3] = XMVectorMultiply(LeftBottomB, NearB); + CornersB[4] = XMVectorMultiply(RightTopB, FarB); + CornersB[5] = XMVectorMultiply(RightBottomB, FarB); + CornersB[6] = XMVectorMultiply(LeftTopB, FarB); + CornersB[7] = XMVectorMultiply(LeftBottomB, FarB); + + // Build the planes of frustum A (in the local space of B). 
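+ // The normals are set up in frustum A's local frame and rotated into B's
+ // frame; the far-plane normal is simply the negated near-plane normal. Each
+ // plane offset is then recovered by dotting the normal with a point known to
+ // lie on that plane: a near corner, a far corner, or the apex OriginA, which
+ // lies on all four side planes.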
+ XMVECTOR AxisA[6]; + XMVECTOR PlaneDistA[6]; + + AxisA[0] = XMVectorSet(0.0f, 0.0f, -1.0f, 0.0f); + AxisA[1] = XMVectorSet(0.0f, 0.0f, 1.0f, 0.0f); + AxisA[2] = XMVectorSet(1.0f, 0.0f, -fr.RightSlope, 0.0f); + AxisA[3] = XMVectorSet(-1.0f, 0.0f, fr.LeftSlope, 0.0f); + AxisA[4] = XMVectorSet(0.0f, 1.0f, -fr.TopSlope, 0.0f); + AxisA[5] = XMVectorSet(0.0f, -1.0f, fr.BottomSlope, 0.0f); + + AxisA[0] = XMVector3Rotate(AxisA[0], OrientationA); + AxisA[1] = XMVectorNegate(AxisA[0]); + AxisA[2] = XMVector3Rotate(AxisA[2], OrientationA); + AxisA[3] = XMVector3Rotate(AxisA[3], OrientationA); + AxisA[4] = XMVector3Rotate(AxisA[4], OrientationA); + AxisA[5] = XMVector3Rotate(AxisA[5], OrientationA); + + PlaneDistA[0] = XMVector3Dot(AxisA[0], CornersA[0]); // Re-use corner on near plane. + PlaneDistA[1] = XMVector3Dot(AxisA[1], CornersA[4]); // Re-use corner on far plane. + PlaneDistA[2] = XMVector3Dot(AxisA[2], OriginA); + PlaneDistA[3] = XMVector3Dot(AxisA[3], OriginA); + PlaneDistA[4] = XMVector3Dot(AxisA[4], OriginA); + PlaneDistA[5] = XMVector3Dot(AxisA[5], OriginA); + + // Check each axis of frustum A for a seperating plane (5). + for (size_t i = 0; i < 6; ++i) + { + // Find the minimum projection of the frustum onto the plane normal. + XMVECTOR Min; + + Min = XMVector3Dot(AxisA[i], CornersB[0]); + + for (size_t j = 1; j < CORNER_COUNT; j++) + { + XMVECTOR Temp = XMVector3Dot(AxisA[i], CornersB[j]); + Min = XMVectorMin(Min, Temp); + } + + // Outside the plane? + Outside = XMVectorOrInt(Outside, XMVectorGreater(Min, PlaneDistA[i])); + } + + // If the frustum B is outside any of the planes of frustum A it is outside. + if (XMVector4EqualInt(Outside, XMVectorTrueInt())) + return false; + + // Check edge/edge axes (6 * 6). + XMVECTOR FrustumEdgeAxisA[6]; + FrustumEdgeAxisA[0] = RightTopA; + FrustumEdgeAxisA[1] = RightBottomA; + FrustumEdgeAxisA[2] = LeftTopA; + FrustumEdgeAxisA[3] = LeftBottomA; + FrustumEdgeAxisA[4] = XMVectorSubtract(RightTopA, LeftTopA); + FrustumEdgeAxisA[5] = XMVectorSubtract(LeftBottomA, LeftTopA); + + XMVECTOR FrustumEdgeAxisB[6]; + FrustumEdgeAxisB[0] = RightTopB; + FrustumEdgeAxisB[1] = RightBottomB; + FrustumEdgeAxisB[2] = LeftTopB; + FrustumEdgeAxisB[3] = LeftBottomB; + FrustumEdgeAxisB[4] = XMVectorSubtract(RightTopB, LeftTopB); + FrustumEdgeAxisB[5] = XMVectorSubtract(LeftBottomB, LeftTopB); + + for (size_t i = 0; i < 6; ++i) + { + for (size_t j = 0; j < 6; j++) + { + // Compute the axis we are going to test. + XMVECTOR Axis = XMVector3Cross(FrustumEdgeAxisA[i], FrustumEdgeAxisB[j]); + + // Find the min/max values of the projection of both frustums onto the axis. + XMVECTOR MinA, MaxA; + XMVECTOR MinB, MaxB; + + MinA = MaxA = XMVector3Dot(Axis, CornersA[0]); + MinB = MaxB = XMVector3Dot(Axis, CornersB[0]); + + for (size_t k = 1; k < CORNER_COUNT; k++) + { + XMVECTOR TempA = XMVector3Dot(Axis, CornersA[k]); + MinA = XMVectorMin(MinA, TempA); + MaxA = XMVectorMax(MaxA, TempA); + + XMVECTOR TempB = XMVector3Dot(Axis, CornersB[k]); + MinB = XMVectorMin(MinB, TempB); + MaxB = XMVectorMax(MaxB, TempB); + } + + // if (MinA > MaxB || MinB > MaxA) reject + Outside = XMVectorOrInt(Outside, XMVectorGreater(MinA, MaxB)); + Outside = XMVectorOrInt(Outside, XMVectorGreater(MinB, MaxA)); + } + } + + // If there is a seperating plane, then the frustums do not intersect. + if (XMVector4EqualInt(Outside, XMVectorTrueInt())) + return false; + + // If we did not find a separating plane then the frustums intersect. 
+ return true; +} + + +//----------------------------------------------------------------------------- +// Triangle vs frustum test. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool XM_CALLCONV BoundingFrustum::Intersects(FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2) const noexcept +{ + // Build the frustum planes (NOTE: D is negated from the usual). + XMVECTOR Planes[6]; + Planes[0] = XMVectorSet(0.0f, 0.0f, -1.0f, -Near); + Planes[1] = XMVectorSet(0.0f, 0.0f, 1.0f, Far); + Planes[2] = XMVectorSet(1.0f, 0.0f, -RightSlope, 0.0f); + Planes[3] = XMVectorSet(-1.0f, 0.0f, LeftSlope, 0.0f); + Planes[4] = XMVectorSet(0.0f, 1.0f, -TopSlope, 0.0f); + Planes[5] = XMVectorSet(0.0f, -1.0f, BottomSlope, 0.0f); + + // Load origin and orientation of the frustum. + XMVECTOR vOrigin = XMLoadFloat3(&Origin); + XMVECTOR vOrientation = XMLoadFloat4(&Orientation); + + assert(DirectX::Internal::XMQuaternionIsUnit(vOrientation)); + + // Transform triangle into the local space of frustum. + XMVECTOR TV0 = XMVector3InverseRotate(XMVectorSubtract(V0, vOrigin), vOrientation); + XMVECTOR TV1 = XMVector3InverseRotate(XMVectorSubtract(V1, vOrigin), vOrientation); + XMVECTOR TV2 = XMVector3InverseRotate(XMVectorSubtract(V2, vOrigin), vOrientation); + + // Test each vertex of the triangle against the frustum planes. + XMVECTOR Outside = XMVectorFalseInt(); + XMVECTOR InsideAll = XMVectorTrueInt(); + + for (size_t i = 0; i < 6; ++i) + { + XMVECTOR Dist0 = XMVector3Dot(TV0, Planes[i]); + XMVECTOR Dist1 = XMVector3Dot(TV1, Planes[i]); + XMVECTOR Dist2 = XMVector3Dot(TV2, Planes[i]); + + XMVECTOR MinDist = XMVectorMin(Dist0, Dist1); + MinDist = XMVectorMin(MinDist, Dist2); + XMVECTOR MaxDist = XMVectorMax(Dist0, Dist1); + MaxDist = XMVectorMax(MaxDist, Dist2); + + XMVECTOR PlaneDist = XMVectorSplatW(Planes[i]); + + // Outside the plane? + Outside = XMVectorOrInt(Outside, XMVectorGreater(MinDist, PlaneDist)); + + // Fully inside the plane? + InsideAll = XMVectorAndInt(InsideAll, XMVectorLessOrEqual(MaxDist, PlaneDist)); + } + + // If the triangle is outside any of the planes it is outside. + if (XMVector4EqualInt(Outside, XMVectorTrueInt())) + return false; + + // If the triangle is inside all planes it is fully inside. + if (XMVector4EqualInt(InsideAll, XMVectorTrueInt())) + return true; + + // Build the corners of the frustum. + XMVECTOR vRightTop = XMVectorSet(RightSlope, TopSlope, 1.0f, 0.0f); + XMVECTOR vRightBottom = XMVectorSet(RightSlope, BottomSlope, 1.0f, 0.0f); + XMVECTOR vLeftTop = XMVectorSet(LeftSlope, TopSlope, 1.0f, 0.0f); + XMVECTOR vLeftBottom = XMVectorSet(LeftSlope, BottomSlope, 1.0f, 0.0f); + XMVECTOR vNear = XMVectorReplicatePtr(&Near); + XMVECTOR vFar = XMVectorReplicatePtr(&Far); + + XMVECTOR Corners[CORNER_COUNT]; + Corners[0] = XMVectorMultiply(vRightTop, vNear); + Corners[1] = XMVectorMultiply(vRightBottom, vNear); + Corners[2] = XMVectorMultiply(vLeftTop, vNear); + Corners[3] = XMVectorMultiply(vLeftBottom, vNear); + Corners[4] = XMVectorMultiply(vRightTop, vFar); + Corners[5] = XMVectorMultiply(vRightBottom, vFar); + Corners[6] = XMVectorMultiply(vLeftTop, vFar); + Corners[7] = XMVectorMultiply(vLeftBottom, vFar); + + // Test the plane of the triangle. 
+ XMVECTOR Normal = XMVector3Cross(XMVectorSubtract(V1, V0), XMVectorSubtract(V2, V0)); + XMVECTOR Dist = XMVector3Dot(Normal, V0); + + XMVECTOR MinDist, MaxDist; + MinDist = MaxDist = XMVector3Dot(Corners[0], Normal); + for (size_t i = 1; i < CORNER_COUNT; ++i) + { + XMVECTOR Temp = XMVector3Dot(Corners[i], Normal); + MinDist = XMVectorMin(MinDist, Temp); + MaxDist = XMVectorMax(MaxDist, Temp); + } + + Outside = XMVectorOrInt(XMVectorGreater(MinDist, Dist), XMVectorLess(MaxDist, Dist)); + if (XMVector4EqualInt(Outside, XMVectorTrueInt())) + return false; + + // Check the edge/edge axes (3*6). + XMVECTOR TriangleEdgeAxis[3]; + TriangleEdgeAxis[0] = XMVectorSubtract(V1, V0); + TriangleEdgeAxis[1] = XMVectorSubtract(V2, V1); + TriangleEdgeAxis[2] = XMVectorSubtract(V0, V2); + + XMVECTOR FrustumEdgeAxis[6]; + FrustumEdgeAxis[0] = vRightTop; + FrustumEdgeAxis[1] = vRightBottom; + FrustumEdgeAxis[2] = vLeftTop; + FrustumEdgeAxis[3] = vLeftBottom; + FrustumEdgeAxis[4] = XMVectorSubtract(vRightTop, vLeftTop); + FrustumEdgeAxis[5] = XMVectorSubtract(vLeftBottom, vLeftTop); + + for (size_t i = 0; i < 3; ++i) + { + for (size_t j = 0; j < 6; j++) + { + // Compute the axis we are going to test. + XMVECTOR Axis = XMVector3Cross(TriangleEdgeAxis[i], FrustumEdgeAxis[j]); + + // Find the min/max of the projection of the triangle onto the axis. + XMVECTOR MinA, MaxA; + + XMVECTOR Dist0 = XMVector3Dot(V0, Axis); + XMVECTOR Dist1 = XMVector3Dot(V1, Axis); + XMVECTOR Dist2 = XMVector3Dot(V2, Axis); + + MinA = XMVectorMin(Dist0, Dist1); + MinA = XMVectorMin(MinA, Dist2); + MaxA = XMVectorMax(Dist0, Dist1); + MaxA = XMVectorMax(MaxA, Dist2); + + // Find the min/max of the projection of the frustum onto the axis. + XMVECTOR MinB, MaxB; + + MinB = MaxB = XMVector3Dot(Axis, Corners[0]); + + for (size_t k = 1; k < CORNER_COUNT; k++) + { + XMVECTOR Temp = XMVector3Dot(Axis, Corners[k]); + MinB = XMVectorMin(MinB, Temp); + MaxB = XMVectorMax(MaxB, Temp); + } + + // if (MinA > MaxB || MinB > MaxA) reject; + Outside = XMVectorOrInt(Outside, XMVectorGreater(MinA, MaxB)); + Outside = XMVectorOrInt(Outside, XMVectorGreater(MinB, MaxA)); + } + } + + if (XMVector4EqualInt(Outside, XMVectorTrueInt())) + return false; + + // If we did not find a separating plane then the triangle must intersect the frustum. + return true; +} + + +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline PlaneIntersectionType XM_CALLCONV BoundingFrustum::Intersects(FXMVECTOR Plane) const noexcept +{ + assert(DirectX::Internal::XMPlaneIsUnit(Plane)); + + // Load origin and orientation of the frustum. + XMVECTOR vOrigin = XMLoadFloat3(&Origin); + XMVECTOR vOrientation = XMLoadFloat4(&Orientation); + + assert(DirectX::Internal::XMQuaternionIsUnit(vOrientation)); + + // Set w of the origin to one so we can dot4 with a plane. + vOrigin = XMVectorInsert<0, 0, 0, 0, 1>(vOrigin, XMVectorSplatOne()); + + // Build the corners of the frustum (in world space). 
+ XMVECTOR RightTop = XMVectorSet(RightSlope, TopSlope, 1.0f, 0.0f); + XMVECTOR RightBottom = XMVectorSet(RightSlope, BottomSlope, 1.0f, 0.0f); + XMVECTOR LeftTop = XMVectorSet(LeftSlope, TopSlope, 1.0f, 0.0f); + XMVECTOR LeftBottom = XMVectorSet(LeftSlope, BottomSlope, 1.0f, 0.0f); + XMVECTOR vNear = XMVectorReplicatePtr(&Near); + XMVECTOR vFar = XMVectorReplicatePtr(&Far); + + RightTop = XMVector3Rotate(RightTop, vOrientation); + RightBottom = XMVector3Rotate(RightBottom, vOrientation); + LeftTop = XMVector3Rotate(LeftTop, vOrientation); + LeftBottom = XMVector3Rotate(LeftBottom, vOrientation); + + XMVECTOR Corners0 = XMVectorMultiplyAdd(RightTop, vNear, vOrigin); + XMVECTOR Corners1 = XMVectorMultiplyAdd(RightBottom, vNear, vOrigin); + XMVECTOR Corners2 = XMVectorMultiplyAdd(LeftTop, vNear, vOrigin); + XMVECTOR Corners3 = XMVectorMultiplyAdd(LeftBottom, vNear, vOrigin); + XMVECTOR Corners4 = XMVectorMultiplyAdd(RightTop, vFar, vOrigin); + XMVECTOR Corners5 = XMVectorMultiplyAdd(RightBottom, vFar, vOrigin); + XMVECTOR Corners6 = XMVectorMultiplyAdd(LeftTop, vFar, vOrigin); + XMVECTOR Corners7 = XMVectorMultiplyAdd(LeftBottom, vFar, vOrigin); + + XMVECTOR Outside, Inside; + DirectX::Internal::FastIntersectFrustumPlane(Corners0, Corners1, Corners2, Corners3, + Corners4, Corners5, Corners6, Corners7, + Plane, Outside, Inside); + + // If the frustum is outside any plane it is outside. + if (XMVector4EqualInt(Outside, XMVectorTrueInt())) + return FRONT; + + // If the frustum is inside all planes it is inside. + if (XMVector4EqualInt(Inside, XMVectorTrueInt())) + return BACK; + + // The frustum is not inside all planes or outside a plane it intersects. + return INTERSECTING; +} + + +//----------------------------------------------------------------------------- +// Ray vs. frustum test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool XM_CALLCONV BoundingFrustum::Intersects(FXMVECTOR rayOrigin, FXMVECTOR Direction, float& Dist) const noexcept +{ + // If ray starts inside the frustum, return a distance of 0 for the hit + if (Contains(rayOrigin) == CONTAINS) + { + Dist = 0.0f; + return true; + } + + // Build the frustum planes. + XMVECTOR Planes[6]; + Planes[0] = XMVectorSet(0.0f, 0.0f, -1.0f, Near); + Planes[1] = XMVectorSet(0.0f, 0.0f, 1.0f, -Far); + Planes[2] = XMVectorSet(1.0f, 0.0f, -RightSlope, 0.0f); + Planes[3] = XMVectorSet(-1.0f, 0.0f, LeftSlope, 0.0f); + Planes[4] = XMVectorSet(0.0f, 1.0f, -TopSlope, 0.0f); + Planes[5] = XMVectorSet(0.0f, -1.0f, BottomSlope, 0.0f); + + // Load origin and orientation of the frustum. + XMVECTOR frOrigin = XMLoadFloat3(&Origin); + XMVECTOR frOrientation = XMLoadFloat4(&Orientation); + + // This algorithm based on "Fast Ray-Convex Polyhedron Intersectin," in James Arvo, ed., Graphics Gems II pp. 247-250 + float tnear = -FLT_MAX; + float tfar = FLT_MAX; + + for (size_t i = 0; i < 6; ++i) + { + XMVECTOR Plane = DirectX::Internal::XMPlaneTransform(Planes[i], frOrientation, frOrigin); + Plane = XMPlaneNormalize(Plane); + + XMVECTOR AxisDotOrigin = XMPlaneDotCoord(Plane, rayOrigin); + XMVECTOR AxisDotDirection = XMVector3Dot(Plane, Direction); + + if (XMVector3LessOrEqual(XMVectorAbs(AxisDotDirection), g_RayEpsilon)) + { + // Ray is parallel to plane - check if ray origin is inside plane's + if (XMVector3Greater(AxisDotOrigin, g_XMZero)) + { + // Ray origin is outside half-space. + Dist = 0.f; + return false; + } + } + else + { + // Ray not parallel - get distance to plane. 
+ float vd = XMVectorGetX(AxisDotDirection); + float vn = XMVectorGetX(AxisDotOrigin); + float t = -vn / vd; + if (vd < 0.0f) + { + // Front face - T is a near point. + if (t > tfar) + { + Dist = 0.f; + return false; + } + if (t > tnear) + { + // Hit near face. + tnear = t; + } + } + else + { + // back face - T is far point. + if (t < tnear) + { + Dist = 0.f; + return false; + } + if (t < tfar) + { + // Hit far face. + tfar = t; + } + } + } + } + + // Survived all tests. + // Note: if ray originates on polyhedron, may want to change 0.0f to some + // epsilon to avoid intersecting the originating face. + float distance = (tnear >= 0.0f) ? tnear : tfar; + if (distance >= 0.0f) + { + Dist = distance; + return true; + } + + Dist = 0.f; + return false; +} + + +//----------------------------------------------------------------------------- +// Test a frustum vs 6 planes (typically forming another frustum). +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType XM_CALLCONV BoundingFrustum::ContainedBy( + FXMVECTOR Plane0, FXMVECTOR Plane1, FXMVECTOR Plane2, + GXMVECTOR Plane3, + HXMVECTOR Plane4, HXMVECTOR Plane5) const noexcept +{ + // Load origin and orientation of the frustum. + XMVECTOR vOrigin = XMLoadFloat3(&Origin); + XMVECTOR vOrientation = XMLoadFloat4(&Orientation); + + assert(DirectX::Internal::XMQuaternionIsUnit(vOrientation)); + + // Set w of the origin to one so we can dot4 with a plane. + vOrigin = XMVectorInsert<0, 0, 0, 0, 1>(vOrigin, XMVectorSplatOne()); + + // Build the corners of the frustum (in world space). + XMVECTOR RightTop = XMVectorSet(RightSlope, TopSlope, 1.0f, 0.0f); + XMVECTOR RightBottom = XMVectorSet(RightSlope, BottomSlope, 1.0f, 0.0f); + XMVECTOR LeftTop = XMVectorSet(LeftSlope, TopSlope, 1.0f, 0.0f); + XMVECTOR LeftBottom = XMVectorSet(LeftSlope, BottomSlope, 1.0f, 0.0f); + XMVECTOR vNear = XMVectorReplicatePtr(&Near); + XMVECTOR vFar = XMVectorReplicatePtr(&Far); + + RightTop = XMVector3Rotate(RightTop, vOrientation); + RightBottom = XMVector3Rotate(RightBottom, vOrientation); + LeftTop = XMVector3Rotate(LeftTop, vOrientation); + LeftBottom = XMVector3Rotate(LeftBottom, vOrientation); + + XMVECTOR Corners0 = XMVectorMultiplyAdd(RightTop, vNear, vOrigin); + XMVECTOR Corners1 = XMVectorMultiplyAdd(RightBottom, vNear, vOrigin); + XMVECTOR Corners2 = XMVectorMultiplyAdd(LeftTop, vNear, vOrigin); + XMVECTOR Corners3 = XMVectorMultiplyAdd(LeftBottom, vNear, vOrigin); + XMVECTOR Corners4 = XMVectorMultiplyAdd(RightTop, vFar, vOrigin); + XMVECTOR Corners5 = XMVectorMultiplyAdd(RightBottom, vFar, vOrigin); + XMVECTOR Corners6 = XMVectorMultiplyAdd(LeftTop, vFar, vOrigin); + XMVECTOR Corners7 = XMVectorMultiplyAdd(LeftBottom, vFar, vOrigin); + + XMVECTOR Outside, Inside; + + // Test against each plane. 
+ DirectX::Internal::FastIntersectFrustumPlane(Corners0, Corners1, Corners2, Corners3, + Corners4, Corners5, Corners6, Corners7, + Plane0, Outside, Inside); + + XMVECTOR AnyOutside = Outside; + XMVECTOR AllInside = Inside; + + DirectX::Internal::FastIntersectFrustumPlane(Corners0, Corners1, Corners2, Corners3, + Corners4, Corners5, Corners6, Corners7, + Plane1, Outside, Inside); + + AnyOutside = XMVectorOrInt(AnyOutside, Outside); + AllInside = XMVectorAndInt(AllInside, Inside); + + DirectX::Internal::FastIntersectFrustumPlane(Corners0, Corners1, Corners2, Corners3, + Corners4, Corners5, Corners6, Corners7, + Plane2, Outside, Inside); + + AnyOutside = XMVectorOrInt(AnyOutside, Outside); + AllInside = XMVectorAndInt(AllInside, Inside); + + DirectX::Internal::FastIntersectFrustumPlane(Corners0, Corners1, Corners2, Corners3, + Corners4, Corners5, Corners6, Corners7, + Plane3, Outside, Inside); + + AnyOutside = XMVectorOrInt(AnyOutside, Outside); + AllInside = XMVectorAndInt(AllInside, Inside); + + DirectX::Internal::FastIntersectFrustumPlane(Corners0, Corners1, Corners2, Corners3, + Corners4, Corners5, Corners6, Corners7, + Plane4, Outside, Inside); + + AnyOutside = XMVectorOrInt(AnyOutside, Outside); + AllInside = XMVectorAndInt(AllInside, Inside); + + DirectX::Internal::FastIntersectFrustumPlane(Corners0, Corners1, Corners2, Corners3, + Corners4, Corners5, Corners6, Corners7, + Plane5, Outside, Inside); + + AnyOutside = XMVectorOrInt(AnyOutside, Outside); + AllInside = XMVectorAndInt(AllInside, Inside); + + // If the frustum is outside any plane it is outside. + if (XMVector4EqualInt(AnyOutside, XMVectorTrueInt())) + return DISJOINT; + + // If the frustum is inside all planes it is inside. + if (XMVector4EqualInt(AllInside, XMVectorTrueInt())) + return CONTAINS; + + // The frustum is not inside all planes or outside a plane, it may intersect. + return INTERSECTS; +} + + +//----------------------------------------------------------------------------- +// Build the 6 frustum planes from a frustum. +// +// The intended use for these routines is for fast culling to a view frustum. +// When the volume being tested against a view frustum is small relative to the +// view frustum it is usually either inside all six planes of the frustum +// (CONTAINS) or outside one of the planes of the frustum (DISJOINT). If neither +// of these cases is true then it may or may not be intersecting the frustum +// (INTERSECTS) +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline void BoundingFrustum::GetPlanes(XMVECTOR* NearPlane, XMVECTOR* FarPlane, XMVECTOR* RightPlane, + XMVECTOR* LeftPlane, XMVECTOR* TopPlane, XMVECTOR* BottomPlane) const noexcept +{ + // Load origin and orientation of the frustum. 
+ XMVECTOR vOrigin = XMLoadFloat3(&Origin); + XMVECTOR vOrientation = XMLoadFloat4(&Orientation); + + if (NearPlane) + { + XMVECTOR vNearPlane = XMVectorSet(0.0f, 0.0f, -1.0f, Near); + vNearPlane = DirectX::Internal::XMPlaneTransform(vNearPlane, vOrientation, vOrigin); + *NearPlane = XMPlaneNormalize(vNearPlane); + } + + if (FarPlane) + { + XMVECTOR vFarPlane = XMVectorSet(0.0f, 0.0f, 1.0f, -Far); + vFarPlane = DirectX::Internal::XMPlaneTransform(vFarPlane, vOrientation, vOrigin); + *FarPlane = XMPlaneNormalize(vFarPlane); + } + + if (RightPlane) + { + XMVECTOR vRightPlane = XMVectorSet(1.0f, 0.0f, -RightSlope, 0.0f); + vRightPlane = DirectX::Internal::XMPlaneTransform(vRightPlane, vOrientation, vOrigin); + *RightPlane = XMPlaneNormalize(vRightPlane); + } + + if (LeftPlane) + { + XMVECTOR vLeftPlane = XMVectorSet(-1.0f, 0.0f, LeftSlope, 0.0f); + vLeftPlane = DirectX::Internal::XMPlaneTransform(vLeftPlane, vOrientation, vOrigin); + *LeftPlane = XMPlaneNormalize(vLeftPlane); + } + + if (TopPlane) + { + XMVECTOR vTopPlane = XMVectorSet(0.0f, 1.0f, -TopSlope, 0.0f); + vTopPlane = DirectX::Internal::XMPlaneTransform(vTopPlane, vOrientation, vOrigin); + *TopPlane = XMPlaneNormalize(vTopPlane); + } + + if (BottomPlane) + { + XMVECTOR vBottomPlane = XMVectorSet(0.0f, -1.0f, BottomSlope, 0.0f); + vBottomPlane = DirectX::Internal::XMPlaneTransform(vBottomPlane, vOrientation, vOrigin); + *BottomPlane = XMPlaneNormalize(vBottomPlane); + } +} + + +//----------------------------------------------------------------------------- +// Build a frustum from a persepective projection matrix. The matrix may only +// contain a projection; any rotation, translation or scale will cause the +// constructed frustum to be incorrect. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline void XM_CALLCONV BoundingFrustum::CreateFromMatrix(BoundingFrustum& Out, FXMMATRIX Projection, bool rhcoords) noexcept +{ + // Corners of the projection frustum in homogenous space. + static XMVECTORF32 HomogenousPoints[6] = + { + { { { 1.0f, 0.0f, 1.0f, 1.0f } } }, // right (at far plane) + { { { -1.0f, 0.0f, 1.0f, 1.0f } } }, // left + { { { 0.0f, 1.0f, 1.0f, 1.0f } } }, // top + { { { 0.0f, -1.0f, 1.0f, 1.0f } } }, // bottom + + { { { 0.0f, 0.0f, 0.0f, 1.0f } } }, // near + { { { 0.0f, 0.0f, 1.0f, 1.0f } } } // far + }; + + XMVECTOR Determinant; + XMMATRIX matInverse = XMMatrixInverse(&Determinant, Projection); + + // Compute the frustum corners in world space. + XMVECTOR Points[6]; + + for (size_t i = 0; i < 6; ++i) + { + // Transform point. + Points[i] = XMVector4Transform(HomogenousPoints[i], matInverse); + } + + Out.Origin = XMFLOAT3(0.0f, 0.0f, 0.0f); + Out.Orientation = XMFLOAT4(0.0f, 0.0f, 0.0f, 1.0f); + + // Compute the slopes. + Points[0] = XMVectorMultiply(Points[0], XMVectorReciprocal(XMVectorSplatZ(Points[0]))); + Points[1] = XMVectorMultiply(Points[1], XMVectorReciprocal(XMVectorSplatZ(Points[1]))); + Points[2] = XMVectorMultiply(Points[2], XMVectorReciprocal(XMVectorSplatZ(Points[2]))); + Points[3] = XMVectorMultiply(Points[3], XMVectorReciprocal(XMVectorSplatZ(Points[3]))); + + Out.RightSlope = XMVectorGetX(Points[0]); + Out.LeftSlope = XMVectorGetX(Points[1]); + Out.TopSlope = XMVectorGetY(Points[2]); + Out.BottomSlope = XMVectorGetY(Points[3]); + + // Compute near and far. 
+ Points[4] = XMVectorMultiply(Points[4], XMVectorReciprocal(XMVectorSplatW(Points[4]))); + Points[5] = XMVectorMultiply(Points[5], XMVectorReciprocal(XMVectorSplatW(Points[5]))); + + if (rhcoords) + { + Out.Near = XMVectorGetZ(Points[5]); + Out.Far = XMVectorGetZ(Points[4]); + } + else + { + Out.Near = XMVectorGetZ(Points[4]); + Out.Far = XMVectorGetZ(Points[5]); + } +} + + +/**************************************************************************** + * + * TriangleTests + * + ****************************************************************************/ + +namespace TriangleTests +{ + + //----------------------------------------------------------------------------- + // Compute the intersection of a ray (Origin, Direction) with a triangle + // (V0, V1, V2). Return true if there is an intersection and also set *pDist + // to the distance along the ray to the intersection. + // + // The algorithm is based on Moller, Tomas and Trumbore, "Fast, Minimum Storage + // Ray-Triangle Intersection", Journal of Graphics Tools, vol. 2, no. 1, + // pp 21-28, 1997. + //----------------------------------------------------------------------------- + _Use_decl_annotations_ + inline bool XM_CALLCONV Intersects( + FXMVECTOR Origin, FXMVECTOR Direction, FXMVECTOR V0, + GXMVECTOR V1, + HXMVECTOR V2, float& Dist) noexcept + { + assert(DirectX::Internal::XMVector3IsUnit(Direction)); + + XMVECTOR Zero = XMVectorZero(); + + XMVECTOR e1 = XMVectorSubtract(V1, V0); + XMVECTOR e2 = XMVectorSubtract(V2, V0); + + // p = Direction ^ e2; + XMVECTOR p = XMVector3Cross(Direction, e2); + + // det = e1 * p; + XMVECTOR det = XMVector3Dot(e1, p); + + XMVECTOR u, v, t; + + if (XMVector3GreaterOrEqual(det, g_RayEpsilon)) + { + // Determinate is positive (front side of the triangle). + XMVECTOR s = XMVectorSubtract(Origin, V0); + + // u = s * p; + u = XMVector3Dot(s, p); + + XMVECTOR NoIntersection = XMVectorLess(u, Zero); + NoIntersection = XMVectorOrInt(NoIntersection, XMVectorGreater(u, det)); + + // q = s ^ e1; + XMVECTOR q = XMVector3Cross(s, e1); + + // v = Direction * q; + v = XMVector3Dot(Direction, q); + + NoIntersection = XMVectorOrInt(NoIntersection, XMVectorLess(v, Zero)); + NoIntersection = XMVectorOrInt(NoIntersection, XMVectorGreater(XMVectorAdd(u, v), det)); + + // t = e2 * q; + t = XMVector3Dot(e2, q); + + NoIntersection = XMVectorOrInt(NoIntersection, XMVectorLess(t, Zero)); + + if (XMVector4EqualInt(NoIntersection, XMVectorTrueInt())) + { + Dist = 0.f; + return false; + } + } + else if (XMVector3LessOrEqual(det, g_RayNegEpsilon)) + { + // Determinate is negative (back side of the triangle). + XMVECTOR s = XMVectorSubtract(Origin, V0); + + // u = s * p; + u = XMVector3Dot(s, p); + + XMVECTOR NoIntersection = XMVectorGreater(u, Zero); + NoIntersection = XMVectorOrInt(NoIntersection, XMVectorLess(u, det)); + + // q = s ^ e1; + XMVECTOR q = XMVector3Cross(s, e1); + + // v = Direction * q; + v = XMVector3Dot(Direction, q); + + NoIntersection = XMVectorOrInt(NoIntersection, XMVectorGreater(v, Zero)); + NoIntersection = XMVectorOrInt(NoIntersection, XMVectorLess(XMVectorAdd(u, v), det)); + + // t = e2 * q; + t = XMVector3Dot(e2, q); + + NoIntersection = XMVectorOrInt(NoIntersection, XMVectorGreater(t, Zero)); + + if (XMVector4EqualInt(NoIntersection, XMVectorTrueInt())) + { + Dist = 0.f; + return false; + } + } + else + { + // Parallel ray. + Dist = 0.f; + return false; + } + + t = XMVectorDivide(t, det); + + // (u / det) and (v / dev) are the barycentric cooridinates of the intersection. 
+ + // Store the x-component to *pDist + XMStoreFloat(&Dist, t); + + return true; + } + + + //----------------------------------------------------------------------------- + // Test if two triangles intersect. + // + // The final test of algorithm is based on Shen, Heng, and Tang, "A Fast + // Triangle-Triangle Overlap Test Using Signed Distances", Journal of Graphics + // Tools, vol. 8, no. 1, pp 17-23, 2003 and Guigue and Devillers, "Fast and + // Robust Triangle-Triangle Overlap Test Using Orientation Predicates", Journal + // of Graphics Tools, vol. 8, no. 1, pp 25-32, 2003. + // + // The final test could be considered an edge-edge separating plane test with + // the 9 possible cases narrowed down to the only two pairs of edges that can + // actaully result in a seperation. + //----------------------------------------------------------------------------- + _Use_decl_annotations_ + inline bool XM_CALLCONV Intersects(FXMVECTOR A0, FXMVECTOR A1, FXMVECTOR A2, GXMVECTOR B0, HXMVECTOR B1, HXMVECTOR B2) noexcept + { + static const XMVECTORU32 SelectY = { { { XM_SELECT_0, XM_SELECT_1, XM_SELECT_0, XM_SELECT_0 } } }; + static const XMVECTORU32 SelectZ = { { { XM_SELECT_0, XM_SELECT_0, XM_SELECT_1, XM_SELECT_0 } } }; + static const XMVECTORU32 Select0111 = { { { XM_SELECT_0, XM_SELECT_1, XM_SELECT_1, XM_SELECT_1 } } }; + static const XMVECTORU32 Select1011 = { { { XM_SELECT_1, XM_SELECT_0, XM_SELECT_1, XM_SELECT_1 } } }; + static const XMVECTORU32 Select1101 = { { { XM_SELECT_1, XM_SELECT_1, XM_SELECT_0, XM_SELECT_1 } } }; + + XMVECTOR Zero = XMVectorZero(); + + // Compute the normal of triangle A. + XMVECTOR N1 = XMVector3Cross(XMVectorSubtract(A1, A0), XMVectorSubtract(A2, A0)); + + // Assert that the triangle is not degenerate. + assert(!XMVector3Equal(N1, Zero)); + + // Test points of B against the plane of A. + XMVECTOR BDist = XMVector3Dot(N1, XMVectorSubtract(B0, A0)); + BDist = XMVectorSelect(BDist, XMVector3Dot(N1, XMVectorSubtract(B1, A0)), SelectY); + BDist = XMVectorSelect(BDist, XMVector3Dot(N1, XMVectorSubtract(B2, A0)), SelectZ); + + // Ensure robustness with co-planar triangles by zeroing small distances. + uint32_t BDistIsZeroCR; + XMVECTOR BDistIsZero = XMVectorGreaterR(&BDistIsZeroCR, g_RayEpsilon, XMVectorAbs(BDist)); + BDist = XMVectorSelect(BDist, Zero, BDistIsZero); + + uint32_t BDistIsLessCR; + XMVECTOR BDistIsLess = XMVectorGreaterR(&BDistIsLessCR, Zero, BDist); + + uint32_t BDistIsGreaterCR; + XMVECTOR BDistIsGreater = XMVectorGreaterR(&BDistIsGreaterCR, BDist, Zero); + + // If all the points are on the same side we don't intersect. + if (XMComparisonAllTrue(BDistIsLessCR) || XMComparisonAllTrue(BDistIsGreaterCR)) + return false; + + // Compute the normal of triangle B. + XMVECTOR N2 = XMVector3Cross(XMVectorSubtract(B1, B0), XMVectorSubtract(B2, B0)); + + // Assert that the triangle is not degenerate. + assert(!XMVector3Equal(N2, Zero)); + + // Test points of A against the plane of B. + XMVECTOR ADist = XMVector3Dot(N2, XMVectorSubtract(A0, B0)); + ADist = XMVectorSelect(ADist, XMVector3Dot(N2, XMVectorSubtract(A1, B0)), SelectY); + ADist = XMVectorSelect(ADist, XMVector3Dot(N2, XMVectorSubtract(A2, B0)), SelectZ); + + // Ensure robustness with co-planar triangles by zeroing small distances. 
+ uint32_t ADistIsZeroCR; + XMVECTOR ADistIsZero = XMVectorGreaterR(&ADistIsZeroCR, g_RayEpsilon, XMVectorAbs(ADist)); + ADist = XMVectorSelect(ADist, Zero, ADistIsZero); + + uint32_t ADistIsLessCR; + XMVECTOR ADistIsLess = XMVectorGreaterR(&ADistIsLessCR, Zero, ADist); + + uint32_t ADistIsGreaterCR; + XMVECTOR ADistIsGreater = XMVectorGreaterR(&ADistIsGreaterCR, ADist, Zero); + + // If all the points are on the same side we don't intersect. + if (XMComparisonAllTrue(ADistIsLessCR) || XMComparisonAllTrue(ADistIsGreaterCR)) + return false; + + // Special case for co-planar triangles. + if (XMComparisonAllTrue(ADistIsZeroCR) || XMComparisonAllTrue(BDistIsZeroCR)) + { + XMVECTOR Axis, Dist, MinDist; + + // Compute an axis perpendicular to the edge (points out). + Axis = XMVector3Cross(N1, XMVectorSubtract(A1, A0)); + Dist = XMVector3Dot(Axis, A0); + + // Test points of B against the axis. + MinDist = XMVector3Dot(B0, Axis); + MinDist = XMVectorMin(MinDist, XMVector3Dot(B1, Axis)); + MinDist = XMVectorMin(MinDist, XMVector3Dot(B2, Axis)); + if (XMVector4GreaterOrEqual(MinDist, Dist)) + return false; + + // Edge (A1, A2) + Axis = XMVector3Cross(N1, XMVectorSubtract(A2, A1)); + Dist = XMVector3Dot(Axis, A1); + + MinDist = XMVector3Dot(B0, Axis); + MinDist = XMVectorMin(MinDist, XMVector3Dot(B1, Axis)); + MinDist = XMVectorMin(MinDist, XMVector3Dot(B2, Axis)); + if (XMVector4GreaterOrEqual(MinDist, Dist)) + return false; + + // Edge (A2, A0) + Axis = XMVector3Cross(N1, XMVectorSubtract(A0, A2)); + Dist = XMVector3Dot(Axis, A2); + + MinDist = XMVector3Dot(B0, Axis); + MinDist = XMVectorMin(MinDist, XMVector3Dot(B1, Axis)); + MinDist = XMVectorMin(MinDist, XMVector3Dot(B2, Axis)); + if (XMVector4GreaterOrEqual(MinDist, Dist)) + return false; + + // Edge (B0, B1) + Axis = XMVector3Cross(N2, XMVectorSubtract(B1, B0)); + Dist = XMVector3Dot(Axis, B0); + + MinDist = XMVector3Dot(A0, Axis); + MinDist = XMVectorMin(MinDist, XMVector3Dot(A1, Axis)); + MinDist = XMVectorMin(MinDist, XMVector3Dot(A2, Axis)); + if (XMVector4GreaterOrEqual(MinDist, Dist)) + return false; + + // Edge (B1, B2) + Axis = XMVector3Cross(N2, XMVectorSubtract(B2, B1)); + Dist = XMVector3Dot(Axis, B1); + + MinDist = XMVector3Dot(A0, Axis); + MinDist = XMVectorMin(MinDist, XMVector3Dot(A1, Axis)); + MinDist = XMVectorMin(MinDist, XMVector3Dot(A2, Axis)); + if (XMVector4GreaterOrEqual(MinDist, Dist)) + return false; + + // Edge (B2,B0) + Axis = XMVector3Cross(N2, XMVectorSubtract(B0, B2)); + Dist = XMVector3Dot(Axis, B2); + + MinDist = XMVector3Dot(A0, Axis); + MinDist = XMVectorMin(MinDist, XMVector3Dot(A1, Axis)); + MinDist = XMVectorMin(MinDist, XMVector3Dot(A2, Axis)); + if (XMVector4GreaterOrEqual(MinDist, Dist)) + return false; + + return true; + } + + // + // Find the single vertex of A and B (ie the vertex on the opposite side + // of the plane from the other two) and reorder the edges so we can compute + // the signed edge/edge distances. + // + // if ( (V0 >= 0 && V1 < 0 && V2 < 0) || + // (V0 > 0 && V1 <= 0 && V2 <= 0) || + // (V0 <= 0 && V1 > 0 && V2 > 0) || + // (V0 < 0 && V1 >= 0 && V2 >= 0) ) then V0 is singular; + // + // If our singular vertex is not on the positive side of the plane we reverse + // the triangle winding so that the overlap comparisons will compare the + // correct edges with the correct signs. 
+ // + XMVECTOR ADistIsLessEqual = XMVectorOrInt(ADistIsLess, ADistIsZero); + XMVECTOR ADistIsGreaterEqual = XMVectorOrInt(ADistIsGreater, ADistIsZero); + + XMVECTOR AA0, AA1, AA2; + bool bPositiveA; + + if (DirectX::Internal::XMVector3AllTrue(XMVectorSelect(ADistIsGreaterEqual, ADistIsLess, Select0111)) || + DirectX::Internal::XMVector3AllTrue(XMVectorSelect(ADistIsGreater, ADistIsLessEqual, Select0111))) + { + // A0 is singular, crossing from positive to negative. + AA0 = A0; AA1 = A1; AA2 = A2; + bPositiveA = true; + } + else if (DirectX::Internal::XMVector3AllTrue(XMVectorSelect(ADistIsLessEqual, ADistIsGreater, Select0111)) || + DirectX::Internal::XMVector3AllTrue(XMVectorSelect(ADistIsLess, ADistIsGreaterEqual, Select0111))) + { + // A0 is singular, crossing from negative to positive. + AA0 = A0; AA1 = A2; AA2 = A1; + bPositiveA = false; + } + else if (DirectX::Internal::XMVector3AllTrue(XMVectorSelect(ADistIsGreaterEqual, ADistIsLess, Select1011)) || + DirectX::Internal::XMVector3AllTrue(XMVectorSelect(ADistIsGreater, ADistIsLessEqual, Select1011))) + { + // A1 is singular, crossing from positive to negative. + AA0 = A1; AA1 = A2; AA2 = A0; + bPositiveA = true; + } + else if (DirectX::Internal::XMVector3AllTrue(XMVectorSelect(ADistIsLessEqual, ADistIsGreater, Select1011)) || + DirectX::Internal::XMVector3AllTrue(XMVectorSelect(ADistIsLess, ADistIsGreaterEqual, Select1011))) + { + // A1 is singular, crossing from negative to positive. + AA0 = A1; AA1 = A0; AA2 = A2; + bPositiveA = false; + } + else if (DirectX::Internal::XMVector3AllTrue(XMVectorSelect(ADistIsGreaterEqual, ADistIsLess, Select1101)) || + DirectX::Internal::XMVector3AllTrue(XMVectorSelect(ADistIsGreater, ADistIsLessEqual, Select1101))) + { + // A2 is singular, crossing from positive to negative. + AA0 = A2; AA1 = A0; AA2 = A1; + bPositiveA = true; + } + else if (DirectX::Internal::XMVector3AllTrue(XMVectorSelect(ADistIsLessEqual, ADistIsGreater, Select1101)) || + DirectX::Internal::XMVector3AllTrue(XMVectorSelect(ADistIsLess, ADistIsGreaterEqual, Select1101))) + { + // A2 is singular, crossing from negative to positive. + AA0 = A2; AA1 = A1; AA2 = A0; + bPositiveA = false; + } + else + { + assert(false); + return false; + } + + XMVECTOR BDistIsLessEqual = XMVectorOrInt(BDistIsLess, BDistIsZero); + XMVECTOR BDistIsGreaterEqual = XMVectorOrInt(BDistIsGreater, BDistIsZero); + + XMVECTOR BB0, BB1, BB2; + bool bPositiveB; + + if (DirectX::Internal::XMVector3AllTrue(XMVectorSelect(BDistIsGreaterEqual, BDistIsLess, Select0111)) || + DirectX::Internal::XMVector3AllTrue(XMVectorSelect(BDistIsGreater, BDistIsLessEqual, Select0111))) + { + // B0 is singular, crossing from positive to negative. + BB0 = B0; BB1 = B1; BB2 = B2; + bPositiveB = true; + } + else if (DirectX::Internal::XMVector3AllTrue(XMVectorSelect(BDistIsLessEqual, BDistIsGreater, Select0111)) || + DirectX::Internal::XMVector3AllTrue(XMVectorSelect(BDistIsLess, BDistIsGreaterEqual, Select0111))) + { + // B0 is singular, crossing from negative to positive. + BB0 = B0; BB1 = B2; BB2 = B1; + bPositiveB = false; + } + else if (DirectX::Internal::XMVector3AllTrue(XMVectorSelect(BDistIsGreaterEqual, BDistIsLess, Select1011)) || + DirectX::Internal::XMVector3AllTrue(XMVectorSelect(BDistIsGreater, BDistIsLessEqual, Select1011))) + { + // B1 is singular, crossing from positive to negative. 
+ BB0 = B1; BB1 = B2; BB2 = B0; + bPositiveB = true; + } + else if (DirectX::Internal::XMVector3AllTrue(XMVectorSelect(BDistIsLessEqual, BDistIsGreater, Select1011)) || + DirectX::Internal::XMVector3AllTrue(XMVectorSelect(BDistIsLess, BDistIsGreaterEqual, Select1011))) + { + // B1 is singular, crossing from negative to positive. + BB0 = B1; BB1 = B0; BB2 = B2; + bPositiveB = false; + } + else if (DirectX::Internal::XMVector3AllTrue(XMVectorSelect(BDistIsGreaterEqual, BDistIsLess, Select1101)) || + DirectX::Internal::XMVector3AllTrue(XMVectorSelect(BDistIsGreater, BDistIsLessEqual, Select1101))) + { + // B2 is singular, crossing from positive to negative. + BB0 = B2; BB1 = B0; BB2 = B1; + bPositiveB = true; + } + else if (DirectX::Internal::XMVector3AllTrue(XMVectorSelect(BDistIsLessEqual, BDistIsGreater, Select1101)) || + DirectX::Internal::XMVector3AllTrue(XMVectorSelect(BDistIsLess, BDistIsGreaterEqual, Select1101))) + { + // B2 is singular, crossing from negative to positive. + BB0 = B2; BB1 = B1; BB2 = B0; + bPositiveB = false; + } + else + { + assert(false); + return false; + } + + XMVECTOR Delta0, Delta1; + + // Reverse the direction of the test depending on whether the singular vertices are + // the same sign or different signs. + if (bPositiveA ^ bPositiveB) + { + Delta0 = XMVectorSubtract(BB0, AA0); + Delta1 = XMVectorSubtract(AA0, BB0); + } + else + { + Delta0 = XMVectorSubtract(AA0, BB0); + Delta1 = XMVectorSubtract(BB0, AA0); + } + + // Check if the triangles overlap on the line of intersection between the + // planes of the two triangles by finding the signed line distances. + XMVECTOR Dist0 = XMVector3Dot(Delta0, XMVector3Cross(XMVectorSubtract(BB2, BB0), XMVectorSubtract(AA2, AA0))); + if (XMVector4Greater(Dist0, Zero)) + return false; + + XMVECTOR Dist1 = XMVector3Dot(Delta1, XMVector3Cross(XMVectorSubtract(BB1, BB0), XMVectorSubtract(AA1, AA0))); + if (XMVector4Greater(Dist1, Zero)) + return false; + + return true; + } + + + //----------------------------------------------------------------------------- + // Ray-triangle test + //----------------------------------------------------------------------------- + _Use_decl_annotations_ + inline PlaneIntersectionType XM_CALLCONV Intersects(FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2, GXMVECTOR Plane) noexcept + { + XMVECTOR One = XMVectorSplatOne(); + + assert(DirectX::Internal::XMPlaneIsUnit(Plane)); + + // Set w of the points to one so we can dot4 with a plane. + XMVECTOR TV0 = XMVectorInsert<0, 0, 0, 0, 1>(V0, One); + XMVECTOR TV1 = XMVectorInsert<0, 0, 0, 0, 1>(V1, One); + XMVECTOR TV2 = XMVectorInsert<0, 0, 0, 0, 1>(V2, One); + + XMVECTOR Outside, Inside; + DirectX::Internal::FastIntersectTrianglePlane(TV0, TV1, TV2, Plane, Outside, Inside); + + // If the triangle is outside any plane it is outside. + if (XMVector4EqualInt(Outside, XMVectorTrueInt())) + return FRONT; + + // If the triangle is inside all planes it is inside. + if (XMVector4EqualInt(Inside, XMVectorTrueInt())) + return BACK; + + // The triangle is not inside all planes or outside a plane it intersects. + return INTERSECTING; + } + + + //----------------------------------------------------------------------------- + // Test a triangle vs 6 planes (typically forming a frustum). 
+ //----------------------------------------------------------------------------- + _Use_decl_annotations_ + inline ContainmentType XM_CALLCONV ContainedBy( + FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2, + GXMVECTOR Plane0, + HXMVECTOR Plane1, HXMVECTOR Plane2, + CXMVECTOR Plane3, CXMVECTOR Plane4, CXMVECTOR Plane5) noexcept + { + XMVECTOR One = XMVectorSplatOne(); + + // Set w of the points to one so we can dot4 with a plane. + XMVECTOR TV0 = XMVectorInsert<0, 0, 0, 0, 1>(V0, One); + XMVECTOR TV1 = XMVectorInsert<0, 0, 0, 0, 1>(V1, One); + XMVECTOR TV2 = XMVectorInsert<0, 0, 0, 0, 1>(V2, One); + + XMVECTOR Outside, Inside; + + // Test against each plane. + DirectX::Internal::FastIntersectTrianglePlane(TV0, TV1, TV2, Plane0, Outside, Inside); + + XMVECTOR AnyOutside = Outside; + XMVECTOR AllInside = Inside; + + DirectX::Internal::FastIntersectTrianglePlane(TV0, TV1, TV2, Plane1, Outside, Inside); + AnyOutside = XMVectorOrInt(AnyOutside, Outside); + AllInside = XMVectorAndInt(AllInside, Inside); + + DirectX::Internal::FastIntersectTrianglePlane(TV0, TV1, TV2, Plane2, Outside, Inside); + AnyOutside = XMVectorOrInt(AnyOutside, Outside); + AllInside = XMVectorAndInt(AllInside, Inside); + + DirectX::Internal::FastIntersectTrianglePlane(TV0, TV1, TV2, Plane3, Outside, Inside); + AnyOutside = XMVectorOrInt(AnyOutside, Outside); + AllInside = XMVectorAndInt(AllInside, Inside); + + DirectX::Internal::FastIntersectTrianglePlane(TV0, TV1, TV2, Plane4, Outside, Inside); + AnyOutside = XMVectorOrInt(AnyOutside, Outside); + AllInside = XMVectorAndInt(AllInside, Inside); + + DirectX::Internal::FastIntersectTrianglePlane(TV0, TV1, TV2, Plane5, Outside, Inside); + AnyOutside = XMVectorOrInt(AnyOutside, Outside); + AllInside = XMVectorAndInt(AllInside, Inside); + + // If the triangle is outside any plane it is outside. + if (XMVector4EqualInt(AnyOutside, XMVectorTrueInt())) + return DISJOINT; + + // If the triangle is inside all planes it is inside. + if (XMVector4EqualInt(AllInside, XMVectorTrueInt())) + return CONTAINS; + + // The triangle is not inside all planes or outside a plane, it may intersect. + return INTERSECTS; + } + +} // namespace TriangleTests + diff --git a/src/thirdparty/DirectXMath-dec2022/Inc/DirectXColors.h b/src/thirdparty/DirectXMath-dec2022/Inc/DirectXColors.h new file mode 100644 index 000000000..83fa21093 --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/Inc/DirectXColors.h @@ -0,0 +1,312 @@ +//------------------------------------------------------------------------------------- +// DirectXColors.h -- C++ Color Math library +// +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
+// +// http://go.microsoft.com/fwlink/?LinkID=615560 +//------------------------------------------------------------------------------------- + +#pragma once + +#include "DirectXMath.h" + +namespace DirectX +{ + + namespace Colors + { + // Standard colors (Red/Green/Blue/Alpha) in sRGB colorspace + XMGLOBALCONST XMVECTORF32 AliceBlue = { { { 0.941176534f, 0.972549081f, 1.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 AntiqueWhite = { { { 0.980392218f, 0.921568692f, 0.843137324f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Aqua = { { { 0.f, 1.f, 1.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Aquamarine = { { { 0.498039246f, 1.f, 0.831372619f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Azure = { { { 0.941176534f, 1.f, 1.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Beige = { { { 0.960784376f, 0.960784376f, 0.862745166f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Bisque = { { { 1.f, 0.894117713f, 0.768627524f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Black = { { { 0.f, 0.f, 0.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 BlanchedAlmond = { { { 1.f, 0.921568692f, 0.803921640f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Blue = { { { 0.f, 0.f, 1.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 BlueViolet = { { { 0.541176498f, 0.168627456f, 0.886274576f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Brown = { { { 0.647058845f, 0.164705887f, 0.164705887f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 BurlyWood = { { { 0.870588303f, 0.721568644f, 0.529411793f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 CadetBlue = { { { 0.372549027f, 0.619607866f, 0.627451003f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Chartreuse = { { { 0.498039246f, 1.f, 0.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Chocolate = { { { 0.823529482f, 0.411764741f, 0.117647067f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Coral = { { { 1.f, 0.498039246f, 0.313725501f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 CornflowerBlue = { { { 0.392156899f, 0.584313750f, 0.929411829f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Cornsilk = { { { 1.f, 0.972549081f, 0.862745166f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Crimson = { { { 0.862745166f, 0.078431375f, 0.235294133f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Cyan = { { { 0.f, 1.f, 1.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DarkBlue = { { { 0.f, 0.f, 0.545098066f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DarkCyan = { { { 0.f, 0.545098066f, 0.545098066f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DarkGoldenrod = { { { 0.721568644f, 0.525490224f, 0.043137256f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DarkGray = { { { 0.662745118f, 0.662745118f, 0.662745118f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DarkGreen = { { { 0.f, 0.392156899f, 0.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DarkKhaki = { { { 0.741176486f, 0.717647076f, 0.419607878f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DarkMagenta = { { { 0.545098066f, 0.f, 0.545098066f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DarkOliveGreen = { { { 0.333333343f, 0.419607878f, 0.184313729f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DarkOrange = { { { 1.f, 0.549019635f, 0.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DarkOrchid = { { { 0.600000024f, 0.196078449f, 0.800000072f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DarkRed = { { { 0.545098066f, 0.f, 0.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DarkSalmon = { { { 0.913725555f, 0.588235319f, 0.478431404f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DarkSeaGreen = { { { 0.560784340f, 0.737254918f, 0.545098066f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DarkSlateBlue = { { { 0.282352954f, 0.239215702f, 0.545098066f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DarkSlateGray = { { { 0.184313729f, 0.309803933f, 
0.309803933f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DarkTurquoise = { { { 0.f, 0.807843208f, 0.819607913f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DarkViolet = { { { 0.580392182f, 0.f, 0.827451050f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DeepPink = { { { 1.f, 0.078431375f, 0.576470613f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DeepSkyBlue = { { { 0.f, 0.749019623f, 1.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DimGray = { { { 0.411764741f, 0.411764741f, 0.411764741f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DodgerBlue = { { { 0.117647067f, 0.564705908f, 1.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Firebrick = { { { 0.698039234f, 0.133333340f, 0.133333340f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 FloralWhite = { { { 1.f, 0.980392218f, 0.941176534f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 ForestGreen = { { { 0.133333340f, 0.545098066f, 0.133333340f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Fuchsia = { { { 1.f, 0.f, 1.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Gainsboro = { { { 0.862745166f, 0.862745166f, 0.862745166f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 GhostWhite = { { { 0.972549081f, 0.972549081f, 1.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Gold = { { { 1.f, 0.843137324f, 0.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Goldenrod = { { { 0.854902029f, 0.647058845f, 0.125490203f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Gray = { { { 0.501960814f, 0.501960814f, 0.501960814f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Green = { { { 0.f, 0.501960814f, 0.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 GreenYellow = { { { 0.678431392f, 1.f, 0.184313729f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Honeydew = { { { 0.941176534f, 1.f, 0.941176534f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 HotPink = { { { 1.f, 0.411764741f, 0.705882370f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 IndianRed = { { { 0.803921640f, 0.360784322f, 0.360784322f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Indigo = { { { 0.294117659f, 0.f, 0.509803951f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Ivory = { { { 1.f, 1.f, 0.941176534f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Khaki = { { { 0.941176534f, 0.901960850f, 0.549019635f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Lavender = { { { 0.901960850f, 0.901960850f, 0.980392218f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 LavenderBlush = { { { 1.f, 0.941176534f, 0.960784376f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 LawnGreen = { { { 0.486274540f, 0.988235354f, 0.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 LemonChiffon = { { { 1.f, 0.980392218f, 0.803921640f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 LightBlue = { { { 0.678431392f, 0.847058892f, 0.901960850f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 LightCoral = { { { 0.941176534f, 0.501960814f, 0.501960814f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 LightCyan = { { { 0.878431439f, 1.f, 1.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 LightGoldenrodYellow = { { { 0.980392218f, 0.980392218f, 0.823529482f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 LightGray = { { { 0.827451050f, 0.827451050f, 0.827451050f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 LightGreen = { { { 0.564705908f, 0.933333397f, 0.564705908f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 LightPink = { { { 1.f, 0.713725507f, 0.756862819f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 LightSalmon = { { { 1.f, 0.627451003f, 0.478431404f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 LightSeaGreen = { { { 0.125490203f, 0.698039234f, 0.666666687f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 LightSkyBlue = { { { 0.529411793f, 0.807843208f, 0.980392218f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 LightSlateGray = { { { 0.466666698f, 0.533333361f, 0.600000024f, 1.f } } }; + 
XMGLOBALCONST XMVECTORF32 LightSteelBlue = { { { 0.690196097f, 0.768627524f, 0.870588303f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 LightYellow = { { { 1.f, 1.f, 0.878431439f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Lime = { { { 0.f, 1.f, 0.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 LimeGreen = { { { 0.196078449f, 0.803921640f, 0.196078449f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Linen = { { { 0.980392218f, 0.941176534f, 0.901960850f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Magenta = { { { 1.f, 0.f, 1.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Maroon = { { { 0.501960814f, 0.f, 0.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 MediumAquamarine = { { { 0.400000036f, 0.803921640f, 0.666666687f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 MediumBlue = { { { 0.f, 0.f, 0.803921640f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 MediumOrchid = { { { 0.729411781f, 0.333333343f, 0.827451050f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 MediumPurple = { { { 0.576470613f, 0.439215720f, 0.858823597f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 MediumSeaGreen = { { { 0.235294133f, 0.701960802f, 0.443137288f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 MediumSlateBlue = { { { 0.482352972f, 0.407843173f, 0.933333397f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 MediumSpringGreen = { { { 0.f, 0.980392218f, 0.603921592f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 MediumTurquoise = { { { 0.282352954f, 0.819607913f, 0.800000072f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 MediumVioletRed = { { { 0.780392230f, 0.082352944f, 0.521568656f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 MidnightBlue = { { { 0.098039225f, 0.098039225f, 0.439215720f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 MintCream = { { { 0.960784376f, 1.f, 0.980392218f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 MistyRose = { { { 1.f, 0.894117713f, 0.882353008f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Moccasin = { { { 1.f, 0.894117713f, 0.709803939f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 NavajoWhite = { { { 1.f, 0.870588303f, 0.678431392f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Navy = { { { 0.f, 0.f, 0.501960814f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 OldLace = { { { 0.992156923f, 0.960784376f, 0.901960850f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Olive = { { { 0.501960814f, 0.501960814f, 0.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 OliveDrab = { { { 0.419607878f, 0.556862772f, 0.137254909f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Orange = { { { 1.f, 0.647058845f, 0.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 OrangeRed = { { { 1.f, 0.270588249f, 0.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Orchid = { { { 0.854902029f, 0.439215720f, 0.839215755f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 PaleGoldenrod = { { { 0.933333397f, 0.909803987f, 0.666666687f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 PaleGreen = { { { 0.596078455f, 0.984313786f, 0.596078455f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 PaleTurquoise = { { { 0.686274529f, 0.933333397f, 0.933333397f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 PaleVioletRed = { { { 0.858823597f, 0.439215720f, 0.576470613f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 PapayaWhip = { { { 1.f, 0.937254965f, 0.835294187f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 PeachPuff = { { { 1.f, 0.854902029f, 0.725490212f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Peru = { { { 0.803921640f, 0.521568656f, 0.247058839f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Pink = { { { 1.f, 0.752941251f, 0.796078503f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Plum = { { { 0.866666734f, 0.627451003f, 0.866666734f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 PowderBlue = { { { 0.690196097f, 0.878431439f, 0.901960850f, 1.f } } }; + 
XMGLOBALCONST XMVECTORF32 Purple = { { { 0.501960814f, 0.f, 0.501960814f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Red = { { { 1.f, 0.f, 0.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 RosyBrown = { { { 0.737254918f, 0.560784340f, 0.560784340f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 RoyalBlue = { { { 0.254901975f, 0.411764741f, 0.882353008f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 SaddleBrown = { { { 0.545098066f, 0.270588249f, 0.074509807f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Salmon = { { { 0.980392218f, 0.501960814f, 0.447058856f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 SandyBrown = { { { 0.956862807f, 0.643137276f, 0.376470625f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 SeaGreen = { { { 0.180392161f, 0.545098066f, 0.341176480f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 SeaShell = { { { 1.f, 0.960784376f, 0.933333397f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Sienna = { { { 0.627451003f, 0.321568638f, 0.176470593f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Silver = { { { 0.752941251f, 0.752941251f, 0.752941251f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 SkyBlue = { { { 0.529411793f, 0.807843208f, 0.921568692f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 SlateBlue = { { { 0.415686309f, 0.352941185f, 0.803921640f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 SlateGray = { { { 0.439215720f, 0.501960814f, 0.564705908f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Snow = { { { 1.f, 0.980392218f, 0.980392218f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 SpringGreen = { { { 0.f, 1.f, 0.498039246f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 SteelBlue = { { { 0.274509817f, 0.509803951f, 0.705882370f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Tan = { { { 0.823529482f, 0.705882370f, 0.549019635f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Teal = { { { 0.f, 0.501960814f, 0.501960814f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Thistle = { { { 0.847058892f, 0.749019623f, 0.847058892f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Tomato = { { { 1.f, 0.388235331f, 0.278431386f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Transparent = { { { 0.f, 0.f, 0.f, 0.f } } }; + XMGLOBALCONST XMVECTORF32 Turquoise = { { { 0.250980407f, 0.878431439f, 0.815686345f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Violet = { { { 0.933333397f, 0.509803951f, 0.933333397f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Wheat = { { { 0.960784376f, 0.870588303f, 0.701960802f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 White = { { { 1.f, 1.f, 1.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 WhiteSmoke = { { { 0.960784376f, 0.960784376f, 0.960784376f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Yellow = { { { 1.f, 1.f, 0.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 YellowGreen = { { { 0.603921592f, 0.803921640f, 0.196078449f, 1.f } } }; + + } // namespace Colors + + namespace ColorsLinear + { + // Standard colors (Red/Green/Blue/Alpha) in linear colorspace + XMGLOBALCONST XMVECTORF32 AliceBlue = { { { 0.871367335f, 0.938685894f, 1.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 AntiqueWhite = { { { 0.955973506f, 0.830770075f, 0.679542601f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Aqua = { { { 0.f, 1.f, 1.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Aquamarine = { { { 0.212230787f, 1.f, 0.658374965f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Azure = { { { 0.871367335f, 1.f, 1.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Beige = { { { 0.913098991f, 0.913098991f, 0.715693772f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Bisque = { { { 1.f, 0.775822461f, 0.552011609f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Black = { { { 0.f, 0.f, 0.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 BlanchedAlmond = { { { 1.f, 0.830770075f, 0.610495746f, 1.f } } }; + 
XMGLOBALCONST XMVECTORF32 Blue = { { { 0.f, 0.f, 1.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 BlueViolet = { { { 0.254152179f, 0.024157630f, 0.760524750f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Brown = { { { 0.376262218f, 0.023153365f, 0.023153365f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 BurlyWood = { { { 0.730461001f, 0.479320228f, 0.242281199f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 CadetBlue = { { { 0.114435382f, 0.341914445f, 0.351532698f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Chartreuse = { { { 0.212230787f, 1.f, 0.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Chocolate = { { { 0.644479871f, 0.141263321f, 0.012983031f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Coral = { { { 1.f, 0.212230787f, 0.080219828f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 CornflowerBlue = { { { 0.127437726f, 0.300543845f, 0.846873462f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Cornsilk = { { { 1.f, 0.938685894f, 0.715693772f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Crimson = { { { 0.715693772f, 0.006995410f, 0.045186214f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Cyan = { { { 0.f, 1.f, 1.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DarkBlue = { { { 0.f, 0.f, 0.258182913f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DarkCyan = { { { 0.f, 0.258182913f, 0.258182913f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DarkGoldenrod = { { { 0.479320228f, 0.238397658f, 0.003346536f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DarkGray = { { { 0.396755308f, 0.396755308f, 0.396755308f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DarkGreen = { { { 0.f, 0.127437726f, 0.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DarkKhaki = { { { 0.508881450f, 0.473531544f, 0.147027299f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DarkMagenta = { { { 0.258182913f, 0.f, 0.258182913f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DarkOliveGreen = { { { 0.090841733f, 0.147027299f, 0.028426038f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DarkOrange = { { { 1.f, 0.262250721f, 0.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DarkOrchid = { { { 0.318546832f, 0.031896040f, 0.603827536f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DarkRed = { { { 0.258182913f, 0.f, 0.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DarkSalmon = { { { 0.814846814f, 0.304987371f, 0.194617867f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DarkSeaGreen = { { { 0.274677366f, 0.502886593f, 0.258182913f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DarkSlateBlue = { { { 0.064803280f, 0.046665095f, 0.258182913f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DarkSlateGray = { { { 0.028426038f, 0.078187428f, 0.078187428f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DarkTurquoise = { { { 0.f, 0.617206752f, 0.637597024f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DarkViolet = { { { 0.296138316f, 0.f, 0.651405811f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DeepPink = { { { 1.f, 0.006995410f, 0.291770697f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DeepSkyBlue = { { { 0.f, 0.520995677f, 1.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DimGray = { { { 0.141263321f, 0.141263321f, 0.141263321f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 DodgerBlue = { { { 0.012983031f, 0.278894335f, 1.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Firebrick = { { { 0.445201248f, 0.015996292f, 0.015996292f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 FloralWhite = { { { 1.f, 0.955973506f, 0.871367335f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 ForestGreen = { { { 0.015996292f, 0.258182913f, 0.015996292f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Fuchsia = { { { 1.f, 0.f, 1.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Gainsboro = { { { 0.715693772f, 0.715693772f, 0.715693772f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 GhostWhite = { { { 
0.938685894f, 0.938685894f, 1.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Gold = { { { 1.f, 0.679542601f, 0.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Goldenrod = { { { 0.701102138f, 0.376262218f, 0.014443844f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Gray = { { { 0.215860531f, 0.215860531f, 0.215860531f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Green = { { { 0.f, 0.215860531f, 0.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 GreenYellow = { { { 0.417885154f, 1.f, 0.028426038f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Honeydew = { { { 0.871367335f, 1.f, 0.871367335f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 HotPink = { { { 1.f, 0.141263321f, 0.456411064f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 IndianRed = { { { 0.610495746f, 0.107023112f, 0.107023112f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Indigo = { { { 0.070360109f, 0.f, 0.223227978f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Ivory = { { { 1.f, 1.f, 0.871367335f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Khaki = { { { 0.871367335f, 0.791298151f, 0.262250721f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Lavender = { { { 0.791298151f, 0.791298151f, 0.955973506f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 LavenderBlush = { { { 1.f, 0.871367335f, 0.913098991f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 LawnGreen = { { { 0.201556295f, 0.973445475f, 0.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 LemonChiffon = { { { 1.f, 0.955973506f, 0.610495746f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 LightBlue = { { { 0.417885154f, 0.686685443f, 0.791298151f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 LightCoral = { { { 0.871367335f, 0.215860531f, 0.215860531f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 LightCyan = { { { 0.745404482f, 1.f, 1.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 LightGoldenrodYellow = { { { 0.955973506f, 0.955973506f, 0.644479871f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 LightGray = { { { 0.651405811f, 0.651405811f, 0.651405811f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 LightGreen = { { { 0.278894335f, 0.854992807f, 0.278894335f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 LightPink = { { { 1.f, 0.467783839f, 0.533276618f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 LightSalmon = { { { 1.f, 0.351532698f, 0.194617867f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 LightSeaGreen = { { { 0.014443844f, 0.445201248f, 0.401977867f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 LightSkyBlue = { { { 0.242281199f, 0.617206752f, 0.955973506f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 LightSlateGray = { { { 0.184475034f, 0.246201396f, 0.318546832f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 LightSteelBlue = { { { 0.434153706f, 0.552011609f, 0.730461001f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 LightYellow = { { { 1.f, 1.f, 0.745404482f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Lime = { { { 0.f, 1.f, 0.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 LimeGreen = { { { 0.031896040f, 0.610495746f, 0.031896040f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Linen = { { { 0.955973506f, 0.871367335f, 0.791298151f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Magenta = { { { 1.f, 0.f, 1.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Maroon = { { { 0.215860531f, 0.f, 0.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 MediumAquamarine = { { { 0.132868364f, 0.610495746f, 0.401977867f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 MediumBlue = { { { 0.f, 0.f, 0.610495746f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 MediumOrchid = { { { 0.491020888f, 0.090841733f, 0.651405811f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 MediumPurple = { { { 0.291770697f, 0.162029430f, 0.708376050f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 MediumSeaGreen = { { { 0.045186214f, 0.450785846f, 0.165132239f, 
1.f } } }; + XMGLOBALCONST XMVECTORF32 MediumSlateBlue = { { { 0.198069349f, 0.138431653f, 0.854992807f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 MediumSpringGreen = { { { 0.f, 0.955973506f, 0.323143244f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 MediumTurquoise = { { { 0.064803280f, 0.637597024f, 0.603827536f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 MediumVioletRed = { { { 0.571125031f, 0.007499032f, 0.234550655f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 MidnightBlue = { { { 0.009721218f, 0.009721218f, 0.162029430f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 MintCream = { { { 0.913098991f, 1.f, 0.955973506f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 MistyRose = { { { 1.f, 0.775822461f, 0.752942443f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Moccasin = { { { 1.f, 0.775822461f, 0.462077051f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 NavajoWhite = { { { 1.f, 0.730461001f, 0.417885154f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Navy = { { { 0.f, 0.f, 0.215860531f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 OldLace = { { { 0.982250869f, 0.913098991f, 0.791298151f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Olive = { { { 0.215860531f, 0.215860531f, 0.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 OliveDrab = { { { 0.147027299f, 0.270497859f, 0.016807375f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Orange = { { { 1.f, 0.376262218f, 0.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 OrangeRed = { { { 1.f, 0.059511241f, 0.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Orchid = { { { 0.701102138f, 0.162029430f, 0.672443330f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 PaleGoldenrod = { { { 0.854992807f, 0.806952477f, 0.401977867f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 PaleGreen = { { { 0.313988745f, 0.964686573f, 0.313988745f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 PaleTurquoise = { { { 0.428690553f, 0.854992807f, 0.854992807f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 PaleVioletRed = { { { 0.708376050f, 0.162029430f, 0.291770697f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 PapayaWhip = { { { 1.f, 0.863157392f, 0.665387452f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 PeachPuff = { { { 1.f, 0.701102138f, 0.485149980f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Peru = { { { 0.610495746f, 0.234550655f, 0.049706575f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Pink = { { { 1.f, 0.527115345f, 0.597202003f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Plum = { { { 0.723055363f, 0.351532698f, 0.723055363f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 PowderBlue = { { { 0.434153706f, 0.745404482f, 0.791298151f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Purple = { { { 0.215860531f, 0.f, 0.215860531f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Red = { { { 1.f, 0.f, 0.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 RosyBrown = { { { 0.502886593f, 0.274677366f, 0.274677366f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 RoyalBlue = { { { 0.052860655f, 0.141263321f, 0.752942443f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 SaddleBrown = { { { 0.258182913f, 0.059511241f, 0.006512091f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Salmon = { { { 0.955973506f, 0.215860531f, 0.168269455f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 SandyBrown = { { { 0.904661357f, 0.371237785f, 0.116970696f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 SeaGreen = { { { 0.027320892f, 0.258182913f, 0.095307484f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 SeaShell = { { { 1.f, 0.913098991f, 0.854992807f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Sienna = { { { 0.351532698f, 0.084376216f, 0.026241222f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Silver = { { { 0.527115345f, 0.527115345f, 0.527115345f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 SkyBlue = { { { 0.242281199f, 
0.617206752f, 0.830770075f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 SlateBlue = { { { 0.144128501f, 0.102241747f, 0.610495746f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 SlateGray = { { { 0.162029430f, 0.215860531f, 0.278894335f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Snow = { { { 1.f, 0.955973506f, 0.955973506f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 SpringGreen = { { { 0.f, 1.f, 0.212230787f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 SteelBlue = { { { 0.061246071f, 0.223227978f, 0.456411064f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Tan = { { { 0.644479871f, 0.456411064f, 0.262250721f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Teal = { { { 0.f, 0.215860531f, 0.215860531f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Thistle = { { { 0.686685443f, 0.520995677f, 0.686685443f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Tomato = { { { 1.f, 0.124771863f, 0.063010029f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Transparent = { { { 0.f, 0.f, 0.f, 0.f } } }; + XMGLOBALCONST XMVECTORF32 Turquoise = { { { 0.051269468f, 0.745404482f, 0.630757332f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Violet = { { { 0.854992807f, 0.223227978f, 0.854992807f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Wheat = { { { 0.913098991f, 0.730461001f, 0.450785846f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 White = { { { 1.f, 1.f, 1.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 WhiteSmoke = { { { 0.913098991f, 0.913098991f, 0.913098991f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 Yellow = { { { 1.f, 1.f, 0.f, 1.f } } }; + XMGLOBALCONST XMVECTORF32 YellowGreen = { { { 0.323143244f, 0.610495746f, 0.031896040f, 1.f } } }; + + } // namespace ColorsLinear + +} // namespace DirectX + diff --git a/src/thirdparty/DirectXMath-dec2022/Inc/DirectXMath.h b/src/thirdparty/DirectXMath-dec2022/Inc/DirectXMath.h new file mode 100644 index 000000000..593aead5b --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/Inc/DirectXMath.h @@ -0,0 +1,2280 @@ +//------------------------------------------------------------------------------------- +// DirectXMath.h -- SIMD C++ Math library +// +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. +// +// http://go.microsoft.com/fwlink/?LinkID=615560 +//------------------------------------------------------------------------------------- + +#pragma once + +#ifndef __cplusplus +#error DirectX Math requires C++ +#endif + +#define DIRECTX_MATH_VERSION 318 + +#if defined(_MSC_VER) && (_MSC_VER < 1910) +#error DirectX Math requires Visual C++ 2017 or later. 
+#endif + +#if defined(_MSC_VER) && !defined(_M_ARM) && !defined(_M_ARM64) && !defined(_M_HYBRID_X86_ARM64) && !defined(_M_ARM64EC) && (!_MANAGED) && (!_M_CEE) && (!defined(_M_IX86_FP) || (_M_IX86_FP > 1)) && !defined(_XM_NO_INTRINSICS_) && !defined(_XM_VECTORCALL_) +#define _XM_VECTORCALL_ 1 +#endif + +#if _XM_VECTORCALL_ +#define XM_CALLCONV __vectorcall +#elif defined(__GNUC__) +#define XM_CALLCONV +#else +#define XM_CALLCONV __fastcall +#endif + +#ifndef XM_DEPRECATED +#ifdef __GNUC__ +#define XM_DEPRECATED __attribute__ ((deprecated)) +#else +#define XM_DEPRECATED __declspec(deprecated("This is deprecated and will be removed in a future version.")) +#endif +#endif + +#if !defined(_XM_AVX2_INTRINSICS_) && defined(__AVX2__) && !defined(_XM_NO_INTRINSICS_) +#define _XM_AVX2_INTRINSICS_ +#endif + +#if !defined(_XM_FMA3_INTRINSICS_) && defined(_XM_AVX2_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) +#define _XM_FMA3_INTRINSICS_ +#endif + +#if !defined(_XM_F16C_INTRINSICS_) && defined(_XM_AVX2_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) +#define _XM_F16C_INTRINSICS_ +#endif + +#if !defined(_XM_F16C_INTRINSICS_) && defined(__F16C__) && !defined(_XM_NO_INTRINSICS_) +#define _XM_F16C_INTRINSICS_ +#endif + +#if defined(_XM_FMA3_INTRINSICS_) && !defined(_XM_AVX_INTRINSICS_) +#define _XM_AVX_INTRINSICS_ +#endif + +#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_AVX_INTRINSICS_) +#define _XM_AVX_INTRINSICS_ +#endif + +#if !defined(_XM_AVX_INTRINSICS_) && defined(__AVX__) && !defined(_XM_NO_INTRINSICS_) +#define _XM_AVX_INTRINSICS_ +#endif + +#if defined(_XM_AVX_INTRINSICS_) && !defined(_XM_SSE4_INTRINSICS_) +#define _XM_SSE4_INTRINSICS_ +#endif + +#if defined(_XM_SSE4_INTRINSICS_) && !defined(_XM_SSE3_INTRINSICS_) +#define _XM_SSE3_INTRINSICS_ +#endif + +#if defined(_XM_SSE3_INTRINSICS_) && !defined(_XM_SSE_INTRINSICS_) +#define _XM_SSE_INTRINSICS_ +#endif + +#if !defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) +#if (defined(_M_IX86) || defined(_M_X64) || __i386__ || __x86_64__) && !defined(_M_HYBRID_X86_ARM64) && !defined(_M_ARM64EC) +#define _XM_SSE_INTRINSICS_ +#elif defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __arm__ || __aarch64__ +#define _XM_ARM_NEON_INTRINSICS_ +#elif !defined(_XM_NO_INTRINSICS_) +#error DirectX Math does not support this target +#endif +#endif // !_XM_ARM_NEON_INTRINSICS_ && !_XM_SSE_INTRINSICS_ && !_XM_NO_INTRINSICS_ + +#if defined(_XM_SSE_INTRINSICS_) && defined(_MSC_VER) && (_MSC_VER >= 1920) && !defined(__clang__) && !defined(_XM_SVML_INTRINSICS_) && !defined(_XM_DISABLE_INTEL_SVML_) +#define _XM_SVML_INTRINSICS_ +#endif + +#if !defined(_XM_NO_XMVECTOR_OVERLOADS_) && (defined(__clang__) || defined(__GNUC__)) && !defined(_XM_NO_INTRINSICS_) +#define _XM_NO_XMVECTOR_OVERLOADS_ +#endif + +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable:4514 4820) +// C4514/4820: Off by default noise +#endif +#include <math.h> +#include <float.h> +#ifdef _MSC_VER +#pragma warning(pop) +#endif + +#ifndef _XM_NO_INTRINSICS_ + +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4987) +// C4987: Off by default noise +#include <intrin.h> +#pragma warning(pop) +#endif + +#if (defined(__clang__) || defined(__GNUC__)) && (__x86_64__ || __i386__) +#include <cpuid.h> +#endif + +#ifdef _XM_SSE_INTRINSICS_ +#include <xmmintrin.h> +#include <emmintrin.h> + +#ifdef _XM_SSE3_INTRINSICS_ +#include <pmmintrin.h> +#endif + +#ifdef _XM_SSE4_INTRINSICS_ +#include <smmintrin.h> +#endif + +#ifdef _XM_AVX_INTRINSICS_ +#include <immintrin.h> +#endif + +#elif 
defined(_XM_ARM_NEON_INTRINSICS_) +#if defined(_MSC_VER) && !defined(__clang__) && (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC)) +#include <arm64_neon.h> +#else +#include <arm_neon.h> +#endif +#endif +#endif // !_XM_NO_INTRINSICS_ + +#include "sal.h" +#include <assert.h> + +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4005 4668) +// C4005/4668: Old header issue +#endif +#include <stdint.h> +#ifdef _MSC_VER +#pragma warning(pop) +#endif + +#if __cplusplus >= 201703L +#define XM_ALIGNED_DATA(x) alignas(x) +#define XM_ALIGNED_STRUCT(x) struct alignas(x) +#elif defined(__GNUC__) +#define XM_ALIGNED_DATA(x) __attribute__ ((aligned(x))) +#define XM_ALIGNED_STRUCT(x) struct __attribute__ ((aligned(x))) +#else +#define XM_ALIGNED_DATA(x) __declspec(align(x)) +#define XM_ALIGNED_STRUCT(x) __declspec(align(x)) struct +#endif + +#if (__cplusplus >= 202002L) +#include <compare> +#endif + +/**************************************************************************** + * + * Conditional intrinsics + * + ****************************************************************************/ + +#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + +#if defined(_XM_NO_MOVNT_) +#define XM_STREAM_PS( p, a ) _mm_store_ps((p), (a)) +#define XM256_STREAM_PS( p, a ) _mm256_store_ps((p), (a)) +#define XM_SFENCE() +#else +#define XM_STREAM_PS( p, a ) _mm_stream_ps((p), (a)) +#define XM256_STREAM_PS( p, a ) _mm256_stream_ps((p), (a)) +#define XM_SFENCE() _mm_sfence() +#endif + +#if defined(_XM_FMA3_INTRINSICS_) +#define XM_FMADD_PS( a, b, c ) _mm_fmadd_ps((a), (b), (c)) +#define XM_FNMADD_PS( a, b, c ) _mm_fnmadd_ps((a), (b), (c)) +#else +#define XM_FMADD_PS( a, b, c ) _mm_add_ps(_mm_mul_ps((a), (b)), (c)) +#define XM_FNMADD_PS( a, b, c ) _mm_sub_ps((c), _mm_mul_ps((a), (b))) +#endif + +#if defined(_XM_AVX_INTRINSICS_) && defined(_XM_FAVOR_INTEL_) +#define XM_PERMUTE_PS( v, c ) _mm_permute_ps((v), c ) +#else +#define XM_PERMUTE_PS( v, c ) _mm_shuffle_ps((v), (v), c ) +#endif + +#if defined(__GNUC__) && !defined(__clang__) && (__GNUC__ < 11) +#define XM_LOADU_SI16( p ) _mm_cvtsi32_si128(*reinterpret_cast<unsigned short const*>(p)) +#else +#define XM_LOADU_SI16( p ) _mm_loadu_si16(p) +#endif + +#endif // _XM_SSE_INTRINSICS_ && !_XM_NO_INTRINSICS_ + +#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + +#if defined(__clang__) || defined(__GNUC__) +#define XM_PREFETCH( a ) __builtin_prefetch(a) +#elif defined(_MSC_VER) +#define XM_PREFETCH( a ) __prefetch(a) +#else +#define XM_PREFETCH( a ) +#endif + +#endif // _XM_ARM_NEON_INTRINSICS_ && !_XM_NO_INTRINSICS_ + +namespace DirectX +{ + + /**************************************************************************** + * + * Constant definitions + * + ****************************************************************************/ + +#if defined(__XNAMATH_H__) && defined(XM_PI) +#undef XM_PI +#undef XM_2PI +#undef XM_1DIVPI +#undef XM_1DIV2PI +#undef XM_PIDIV2 +#undef XM_PIDIV4 +#undef XM_SELECT_0 +#undef XM_SELECT_1 +#undef XM_PERMUTE_0X +#undef XM_PERMUTE_0Y +#undef XM_PERMUTE_0Z +#undef XM_PERMUTE_0W +#undef XM_PERMUTE_1X +#undef XM_PERMUTE_1Y +#undef XM_PERMUTE_1Z +#undef XM_PERMUTE_1W +#undef XM_CRMASK_CR6 +#undef XM_CRMASK_CR6TRUE +#undef XM_CRMASK_CR6FALSE +#undef XM_CRMASK_CR6BOUNDS +#undef XM_CACHE_LINE_SIZE +#endif + + constexpr float XM_PI = 3.141592654f; + constexpr float XM_2PI = 6.283185307f; + constexpr float XM_1DIVPI = 0.318309886f; + constexpr float XM_1DIV2PI = 0.159154943f; + constexpr float XM_PIDIV2 = 1.570796327f; + constexpr float XM_PIDIV4 = 0.785398163f; + + 
constexpr uint32_t XM_SELECT_0 = 0x00000000; + constexpr uint32_t XM_SELECT_1 = 0xFFFFFFFF; + + constexpr uint32_t XM_PERMUTE_0X = 0; + constexpr uint32_t XM_PERMUTE_0Y = 1; + constexpr uint32_t XM_PERMUTE_0Z = 2; + constexpr uint32_t XM_PERMUTE_0W = 3; + constexpr uint32_t XM_PERMUTE_1X = 4; + constexpr uint32_t XM_PERMUTE_1Y = 5; + constexpr uint32_t XM_PERMUTE_1Z = 6; + constexpr uint32_t XM_PERMUTE_1W = 7; + + constexpr uint32_t XM_SWIZZLE_X = 0; + constexpr uint32_t XM_SWIZZLE_Y = 1; + constexpr uint32_t XM_SWIZZLE_Z = 2; + constexpr uint32_t XM_SWIZZLE_W = 3; + + constexpr uint32_t XM_CRMASK_CR6 = 0x000000F0; + constexpr uint32_t XM_CRMASK_CR6TRUE = 0x00000080; + constexpr uint32_t XM_CRMASK_CR6FALSE = 0x00000020; + constexpr uint32_t XM_CRMASK_CR6BOUNDS = XM_CRMASK_CR6FALSE; + +#if defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __arm__ || __aarch64__ + constexpr size_t XM_CACHE_LINE_SIZE = 128; +#else + constexpr size_t XM_CACHE_LINE_SIZE = 64; +#endif + + + /**************************************************************************** + * + * Macros + * + ****************************************************************************/ + +#if defined(__XNAMATH_H__) && defined(XMComparisonAllTrue) +#undef XMComparisonAllTrue +#undef XMComparisonAnyTrue +#undef XMComparisonAllFalse +#undef XMComparisonAnyFalse +#undef XMComparisonMixed +#undef XMComparisonAllInBounds +#undef XMComparisonAnyOutOfBounds +#endif + + // Unit conversion + + constexpr float XMConvertToRadians(float fDegrees) noexcept { return fDegrees * (XM_PI / 180.0f); } + constexpr float XMConvertToDegrees(float fRadians) noexcept { return fRadians * (180.0f / XM_PI); } + + // Condition register evaluation proceeding a recording (R) comparison + + constexpr bool XMComparisonAllTrue(uint32_t CR) noexcept { return (CR & XM_CRMASK_CR6TRUE) == XM_CRMASK_CR6TRUE; } + constexpr bool XMComparisonAnyTrue(uint32_t CR) noexcept { return (CR & XM_CRMASK_CR6FALSE) != XM_CRMASK_CR6FALSE; } + constexpr bool XMComparisonAllFalse(uint32_t CR) noexcept { return (CR & XM_CRMASK_CR6FALSE) == XM_CRMASK_CR6FALSE; } + constexpr bool XMComparisonAnyFalse(uint32_t CR) noexcept { return (CR & XM_CRMASK_CR6TRUE) != XM_CRMASK_CR6TRUE; } + constexpr bool XMComparisonMixed(uint32_t CR) noexcept { return (CR & XM_CRMASK_CR6) == 0; } + constexpr bool XMComparisonAllInBounds(uint32_t CR) noexcept { return (CR & XM_CRMASK_CR6BOUNDS) == XM_CRMASK_CR6BOUNDS; } + constexpr bool XMComparisonAnyOutOfBounds(uint32_t CR) noexcept { return (CR & XM_CRMASK_CR6BOUNDS) != XM_CRMASK_CR6BOUNDS; } + + + /**************************************************************************** + * + * Data types + * + ****************************************************************************/ + +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable:4068 4201 4365 4324 4820) + // C4068: ignore unknown pragmas + // C4201: nonstandard extension used : nameless struct/union + // C4365: Off by default noise + // C4324/4820: padding warnings +#endif + +#ifdef _PREFAST_ +#pragma prefast(push) +#pragma prefast(disable : 25000, "FXMVECTOR is 16 bytes") +#endif + +//------------------------------------------------------------------------------ +#if defined(_XM_NO_INTRINSICS_) + struct __vector4 + { + union + { + float vector4_f32[4]; + uint32_t vector4_u32[4]; + }; + }; +#endif // _XM_NO_INTRINSICS_ + + //------------------------------------------------------------------------------ + // Vector intrinsic: Four 32 bit floating point 
components aligned on a 16 byte + // boundary and mapped to hardware vector registers +#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + using XMVECTOR = __m128; +#elif defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + using XMVECTOR = float32x4_t; +#else + using XMVECTOR = __vector4; +#endif + + // Fix-up for (1st-3rd) XMVECTOR parameters that are pass-in-register for x86, ARM, ARM64, and vector call; by reference otherwise +#if ( defined(_M_IX86) || defined(_M_ARM) || defined(_M_ARM64) || _XM_VECTORCALL_ || __i386__ || __arm__ || __aarch64__ ) && !defined(_XM_NO_INTRINSICS_) + typedef const XMVECTOR FXMVECTOR; +#else + typedef const XMVECTOR& FXMVECTOR; +#endif + + // Fix-up for (4th) XMVECTOR parameter to pass in-register for ARM, ARM64, and vector call; by reference otherwise +#if ( defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || _XM_VECTORCALL_ || __arm__ || __aarch64__ ) && !defined(_XM_NO_INTRINSICS_) + typedef const XMVECTOR GXMVECTOR; +#else + typedef const XMVECTOR& GXMVECTOR; +#endif + + // Fix-up for (5th & 6th) XMVECTOR parameter to pass in-register for ARM64 and vector call; by reference otherwise +#if ( defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || _XM_VECTORCALL_ || __aarch64__ ) && !defined(_XM_NO_INTRINSICS_) + typedef const XMVECTOR HXMVECTOR; +#else + typedef const XMVECTOR& HXMVECTOR; +#endif + + // Fix-up for (7th+) XMVECTOR parameters to pass by reference + typedef const XMVECTOR& CXMVECTOR; + + //------------------------------------------------------------------------------ + // Conversion types for constants + XM_ALIGNED_STRUCT(16) XMVECTORF32 + { + union + { + float f[4]; + XMVECTOR v; + }; + + inline operator XMVECTOR() const noexcept { return v; } + inline operator const float* () const noexcept { return f; } +#ifdef _XM_NO_INTRINSICS_ +#elif defined(_XM_SSE_INTRINSICS_) + inline operator __m128i() const noexcept { return _mm_castps_si128(v); } + inline operator __m128d() const noexcept { return _mm_castps_pd(v); } +#elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(__GNUC__) || defined(_ARM64_DISTINCT_NEON_TYPES)) + inline operator int32x4_t() const noexcept { return vreinterpretq_s32_f32(v); } + inline operator uint32x4_t() const noexcept { return vreinterpretq_u32_f32(v); } +#endif + }; + + XM_ALIGNED_STRUCT(16) XMVECTORI32 + { + union + { + int32_t i[4]; + XMVECTOR v; + }; + + inline operator XMVECTOR() const noexcept { return v; } +#ifdef _XM_NO_INTRINSICS_ +#elif defined(_XM_SSE_INTRINSICS_) + inline operator __m128i() const noexcept { return _mm_castps_si128(v); } + inline operator __m128d() const noexcept { return _mm_castps_pd(v); } +#elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(__GNUC__) || defined(_ARM64_DISTINCT_NEON_TYPES)) + inline operator int32x4_t() const noexcept { return vreinterpretq_s32_f32(v); } + inline operator uint32x4_t() const noexcept { return vreinterpretq_u32_f32(v); } +#endif + }; + + XM_ALIGNED_STRUCT(16) XMVECTORU8 + { + union + { + uint8_t u[16]; + XMVECTOR v; + }; + + inline operator XMVECTOR() const noexcept { return v; } +#ifdef _XM_NO_INTRINSICS_ +#elif defined(_XM_SSE_INTRINSICS_) + inline operator __m128i() const noexcept { return _mm_castps_si128(v); } + inline operator __m128d() const noexcept { return _mm_castps_pd(v); } +#elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(__GNUC__) || defined(_ARM64_DISTINCT_NEON_TYPES)) + inline operator int32x4_t() const noexcept { return vreinterpretq_s32_f32(v); } 
+ inline operator uint32x4_t() const noexcept { return vreinterpretq_u32_f32(v); } +#endif + }; + + XM_ALIGNED_STRUCT(16) XMVECTORU32 + { + union + { + uint32_t u[4]; + XMVECTOR v; + }; + + inline operator XMVECTOR() const noexcept { return v; } +#ifdef _XM_NO_INTRINSICS_ +#elif defined(_XM_SSE_INTRINSICS_) + inline operator __m128i() const noexcept { return _mm_castps_si128(v); } + inline operator __m128d() const noexcept { return _mm_castps_pd(v); } +#elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(__GNUC__) || defined(_ARM64_DISTINCT_NEON_TYPES)) + inline operator int32x4_t() const noexcept { return vreinterpretq_s32_f32(v); } + inline operator uint32x4_t() const noexcept { return vreinterpretq_u32_f32(v); } +#endif + }; + + //------------------------------------------------------------------------------ + // Vector operators + +#ifndef _XM_NO_XMVECTOR_OVERLOADS_ + XMVECTOR XM_CALLCONV operator+ (FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV operator- (FXMVECTOR V) noexcept; + + XMVECTOR& XM_CALLCONV operator+= (XMVECTOR& V1, FXMVECTOR V2) noexcept; + XMVECTOR& XM_CALLCONV operator-= (XMVECTOR& V1, FXMVECTOR V2) noexcept; + XMVECTOR& XM_CALLCONV operator*= (XMVECTOR& V1, FXMVECTOR V2) noexcept; + XMVECTOR& XM_CALLCONV operator/= (XMVECTOR& V1, FXMVECTOR V2) noexcept; + + XMVECTOR& operator*= (XMVECTOR& V, float S) noexcept; + XMVECTOR& operator/= (XMVECTOR& V, float S) noexcept; + + XMVECTOR XM_CALLCONV operator+ (FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV operator- (FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV operator* (FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV operator/ (FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV operator* (FXMVECTOR V, float S) noexcept; + XMVECTOR XM_CALLCONV operator* (float S, FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV operator/ (FXMVECTOR V, float S) noexcept; +#endif /* !_XM_NO_XMVECTOR_OVERLOADS_ */ + + //------------------------------------------------------------------------------ + // Matrix type: Sixteen 32 bit floating point components aligned on a + // 16 byte boundary and mapped to four hardware vector registers + + struct XMMATRIX; + + // Fix-up for (1st) XMMATRIX parameter to pass in-register for ARM64 and vector call; by reference otherwise +#if ( defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || _XM_VECTORCALL_ || __aarch64__ ) && !defined(_XM_NO_INTRINSICS_) + typedef const XMMATRIX FXMMATRIX; +#else + typedef const XMMATRIX& FXMMATRIX; +#endif + + // Fix-up for (2nd+) XMMATRIX parameters to pass by reference + typedef const XMMATRIX& CXMMATRIX; + +#ifdef _XM_NO_INTRINSICS_ + struct XMMATRIX +#else + XM_ALIGNED_STRUCT(16) XMMATRIX +#endif + { +#ifdef _XM_NO_INTRINSICS_ + union + { + XMVECTOR r[4]; + struct + { + float _11, _12, _13, _14; + float _21, _22, _23, _24; + float _31, _32, _33, _34; + float _41, _42, _43, _44; + }; + float m[4][4]; + }; +#else + XMVECTOR r[4]; +#endif + + XMMATRIX() = default; + + XMMATRIX(const XMMATRIX&) = default; + +#if defined(_MSC_VER) && (_MSC_FULL_VER < 191426431) + XMMATRIX& operator= (const XMMATRIX& M) noexcept { r[0] = M.r[0]; r[1] = M.r[1]; r[2] = M.r[2]; r[3] = M.r[3]; return *this; } +#else + XMMATRIX& operator=(const XMMATRIX&) = default; + + XMMATRIX(XMMATRIX&&) = default; + XMMATRIX& operator=(XMMATRIX&&) = default; +#endif + + constexpr XMMATRIX(FXMVECTOR R0, FXMVECTOR R1, FXMVECTOR R2, CXMVECTOR R3) noexcept : r{ R0,R1,R2,R3 } {} + XMMATRIX(float m00, float m01, float m02, float m03, + float m10, 
float m11, float m12, float m13, + float m20, float m21, float m22, float m23, + float m30, float m31, float m32, float m33) noexcept; + explicit XMMATRIX(_In_reads_(16) const float* pArray) noexcept; + +#ifdef _XM_NO_INTRINSICS_ + float operator() (size_t Row, size_t Column) const noexcept { return m[Row][Column]; } + float& operator() (size_t Row, size_t Column) noexcept { return m[Row][Column]; } +#endif + + XMMATRIX operator+ () const noexcept { return *this; } + XMMATRIX operator- () const noexcept; + + XMMATRIX& XM_CALLCONV operator+= (FXMMATRIX M) noexcept; + XMMATRIX& XM_CALLCONV operator-= (FXMMATRIX M) noexcept; + XMMATRIX& XM_CALLCONV operator*= (FXMMATRIX M) noexcept; + XMMATRIX& operator*= (float S) noexcept; + XMMATRIX& operator/= (float S) noexcept; + + XMMATRIX XM_CALLCONV operator+ (FXMMATRIX M) const noexcept; + XMMATRIX XM_CALLCONV operator- (FXMMATRIX M) const noexcept; + XMMATRIX XM_CALLCONV operator* (FXMMATRIX M) const noexcept; + XMMATRIX operator* (float S) const noexcept; + XMMATRIX operator/ (float S) const noexcept; + + friend XMMATRIX XM_CALLCONV operator* (float S, FXMMATRIX M) noexcept; + }; + + //------------------------------------------------------------------------------ + // 2D Vector; 32 bit floating point components + struct XMFLOAT2 + { + float x; + float y; + + XMFLOAT2() = default; + + XMFLOAT2(const XMFLOAT2&) = default; + XMFLOAT2& operator=(const XMFLOAT2&) = default; + + XMFLOAT2(XMFLOAT2&&) = default; + XMFLOAT2& operator=(XMFLOAT2&&) = default; + + constexpr XMFLOAT2(float _x, float _y) noexcept : x(_x), y(_y) {} + explicit XMFLOAT2(_In_reads_(2) const float* pArray) noexcept : x(pArray[0]), y(pArray[1]) {} + +#if (__cplusplus >= 202002L) + bool operator == (const XMFLOAT2&) const = default; + auto operator <=> (const XMFLOAT2&) const = default; +#endif + }; + + // 2D Vector; 32 bit floating point components aligned on a 16 byte boundary + XM_ALIGNED_STRUCT(16) XMFLOAT2A : public XMFLOAT2 + { + using XMFLOAT2::XMFLOAT2; + }; + + //------------------------------------------------------------------------------ + // 2D Vector; 32 bit signed integer components + struct XMINT2 + { + int32_t x; + int32_t y; + + XMINT2() = default; + + XMINT2(const XMINT2&) = default; + XMINT2& operator=(const XMINT2&) = default; + + XMINT2(XMINT2&&) = default; + XMINT2& operator=(XMINT2&&) = default; + + constexpr XMINT2(int32_t _x, int32_t _y) noexcept : x(_x), y(_y) {} + explicit XMINT2(_In_reads_(2) const int32_t* pArray) noexcept : x(pArray[0]), y(pArray[1]) {} + +#if (__cplusplus >= 202002L) + bool operator == (const XMINT2&) const = default; + auto operator <=> (const XMINT2&) const = default; +#endif + }; + + // 2D Vector; 32 bit unsigned integer components + struct XMUINT2 + { + uint32_t x; + uint32_t y; + + XMUINT2() = default; + + XMUINT2(const XMUINT2&) = default; + XMUINT2& operator=(const XMUINT2&) = default; + + XMUINT2(XMUINT2&&) = default; + XMUINT2& operator=(XMUINT2&&) = default; + + constexpr XMUINT2(uint32_t _x, uint32_t _y) noexcept : x(_x), y(_y) {} + explicit XMUINT2(_In_reads_(2) const uint32_t* pArray) noexcept : x(pArray[0]), y(pArray[1]) {} + +#if (__cplusplus >= 202002L) + bool operator == (const XMUINT2&) const = default; + auto operator <=> (const XMUINT2&) const = default; +#endif + }; + + //------------------------------------------------------------------------------ + // 3D Vector; 32 bit floating point components + struct XMFLOAT3 + { + float x; + float y; + float z; + + XMFLOAT3() = default; + + XMFLOAT3(const XMFLOAT3&) = 
default; + XMFLOAT3& operator=(const XMFLOAT3&) = default; + + XMFLOAT3(XMFLOAT3&&) = default; + XMFLOAT3& operator=(XMFLOAT3&&) = default; + + constexpr XMFLOAT3(float _x, float _y, float _z) noexcept : x(_x), y(_y), z(_z) {} + explicit XMFLOAT3(_In_reads_(3) const float* pArray) noexcept : x(pArray[0]), y(pArray[1]), z(pArray[2]) {} + }; + + // 3D Vector; 32 bit floating point components aligned on a 16 byte boundary + XM_ALIGNED_STRUCT(16) XMFLOAT3A : public XMFLOAT3 + { + using XMFLOAT3::XMFLOAT3; + }; + + //------------------------------------------------------------------------------ + // 3D Vector; 32 bit signed integer components + struct XMINT3 + { + int32_t x; + int32_t y; + int32_t z; + + XMINT3() = default; + + XMINT3(const XMINT3&) = default; + XMINT3& operator=(const XMINT3&) = default; + + XMINT3(XMINT3&&) = default; + XMINT3& operator=(XMINT3&&) = default; + + constexpr XMINT3(int32_t _x, int32_t _y, int32_t _z) noexcept : x(_x), y(_y), z(_z) {} + explicit XMINT3(_In_reads_(3) const int32_t* pArray) noexcept : x(pArray[0]), y(pArray[1]), z(pArray[2]) {} + +#if (__cplusplus >= 202002L) + bool operator == (const XMINT3&) const = default; + auto operator <=> (const XMINT3&) const = default; +#endif + }; + + // 3D Vector; 32 bit unsigned integer components + struct XMUINT3 + { + uint32_t x; + uint32_t y; + uint32_t z; + + XMUINT3() = default; + + XMUINT3(const XMUINT3&) = default; + XMUINT3& operator=(const XMUINT3&) = default; + + XMUINT3(XMUINT3&&) = default; + XMUINT3& operator=(XMUINT3&&) = default; + + constexpr XMUINT3(uint32_t _x, uint32_t _y, uint32_t _z) noexcept : x(_x), y(_y), z(_z) {} + explicit XMUINT3(_In_reads_(3) const uint32_t* pArray) noexcept : x(pArray[0]), y(pArray[1]), z(pArray[2]) {} + +#if (__cplusplus >= 202002L) + bool operator == (const XMUINT3&) const = default; + auto operator <=> (const XMUINT3&) const = default; +#endif + }; + + //------------------------------------------------------------------------------ + // 4D Vector; 32 bit floating point components + struct XMFLOAT4 + { + float x; + float y; + float z; + float w; + + XMFLOAT4() = default; + + XMFLOAT4(const XMFLOAT4&) = default; + XMFLOAT4& operator=(const XMFLOAT4&) = default; + + XMFLOAT4(XMFLOAT4&&) = default; + XMFLOAT4& operator=(XMFLOAT4&&) = default; + + constexpr XMFLOAT4(float _x, float _y, float _z, float _w) noexcept : x(_x), y(_y), z(_z), w(_w) {} + explicit XMFLOAT4(_In_reads_(4) const float* pArray) noexcept : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + +#if (__cplusplus >= 202002L) + bool operator == (const XMFLOAT4&) const = default; + auto operator <=> (const XMFLOAT4&) const = default; +#endif + }; + + // 4D Vector; 32 bit floating point components aligned on a 16 byte boundary + XM_ALIGNED_STRUCT(16) XMFLOAT4A : public XMFLOAT4 + { + using XMFLOAT4::XMFLOAT4; + }; + + //------------------------------------------------------------------------------ + // 4D Vector; 32 bit signed integer components + struct XMINT4 + { + int32_t x; + int32_t y; + int32_t z; + int32_t w; + + XMINT4() = default; + + XMINT4(const XMINT4&) = default; + XMINT4& operator=(const XMINT4&) = default; + + XMINT4(XMINT4&&) = default; + XMINT4& operator=(XMINT4&&) = default; + + constexpr XMINT4(int32_t _x, int32_t _y, int32_t _z, int32_t _w) noexcept : x(_x), y(_y), z(_z), w(_w) {} + explicit XMINT4(_In_reads_(4) const int32_t* pArray) noexcept : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + +#if (__cplusplus >= 202002L) + bool operator == (const XMINT4&) const = default; 
+ auto operator <=> (const XMINT4&) const = default; +#endif + }; + + // 4D Vector; 32 bit unsigned integer components + struct XMUINT4 + { + uint32_t x; + uint32_t y; + uint32_t z; + uint32_t w; + + XMUINT4() = default; + + XMUINT4(const XMUINT4&) = default; + XMUINT4& operator=(const XMUINT4&) = default; + + XMUINT4(XMUINT4&&) = default; + XMUINT4& operator=(XMUINT4&&) = default; + + constexpr XMUINT4(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w) noexcept : x(_x), y(_y), z(_z), w(_w) {} + explicit XMUINT4(_In_reads_(4) const uint32_t* pArray) noexcept : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + +#if (__cplusplus >= 202002L) + bool operator == (const XMUINT4&) const = default; + auto operator <=> (const XMUINT4&) const = default; +#endif + }; + +#ifdef __clang__ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wgnu-anonymous-struct" +#pragma clang diagnostic ignored "-Wnested-anon-types" +#endif + + //------------------------------------------------------------------------------ + // 3x3 Matrix: 32 bit floating point components + struct XMFLOAT3X3 + { + union + { + struct + { + float _11, _12, _13; + float _21, _22, _23; + float _31, _32, _33; + }; + float m[3][3]; + }; + + XMFLOAT3X3() = default; + + XMFLOAT3X3(const XMFLOAT3X3&) = default; + XMFLOAT3X3& operator=(const XMFLOAT3X3&) = default; + + XMFLOAT3X3(XMFLOAT3X3&&) = default; + XMFLOAT3X3& operator=(XMFLOAT3X3&&) = default; + + constexpr XMFLOAT3X3(float m00, float m01, float m02, + float m10, float m11, float m12, + float m20, float m21, float m22) noexcept + : _11(m00), _12(m01), _13(m02), + _21(m10), _22(m11), _23(m12), + _31(m20), _32(m21), _33(m22) {} + explicit XMFLOAT3X3(_In_reads_(9) const float* pArray) noexcept; + + float operator() (size_t Row, size_t Column) const noexcept { return m[Row][Column]; } + float& operator() (size_t Row, size_t Column) noexcept { return m[Row][Column]; } + +#if (__cplusplus >= 202002L) + bool operator == (const XMFLOAT3X3&) const = default; + auto operator <=> (const XMFLOAT3X3&) const = default; +#endif + }; + + //------------------------------------------------------------------------------ + // 4x3 Row-major Matrix: 32 bit floating point components + struct XMFLOAT4X3 + { + union + { + struct + { + float _11, _12, _13; + float _21, _22, _23; + float _31, _32, _33; + float _41, _42, _43; + }; + float m[4][3]; + float f[12]; + }; + + XMFLOAT4X3() = default; + + XMFLOAT4X3(const XMFLOAT4X3&) = default; + XMFLOAT4X3& operator=(const XMFLOAT4X3&) = default; + + XMFLOAT4X3(XMFLOAT4X3&&) = default; + XMFLOAT4X3& operator=(XMFLOAT4X3&&) = default; + + constexpr XMFLOAT4X3(float m00, float m01, float m02, + float m10, float m11, float m12, + float m20, float m21, float m22, + float m30, float m31, float m32) noexcept + : _11(m00), _12(m01), _13(m02), + _21(m10), _22(m11), _23(m12), + _31(m20), _32(m21), _33(m22), + _41(m30), _42(m31), _43(m32) {} + explicit XMFLOAT4X3(_In_reads_(12) const float* pArray) noexcept; + + float operator() (size_t Row, size_t Column) const noexcept { return m[Row][Column]; } + float& operator() (size_t Row, size_t Column) noexcept { return m[Row][Column]; } + +#if (__cplusplus >= 202002L) + bool operator == (const XMFLOAT4X3&) const = default; + auto operator <=> (const XMFLOAT4X3&) const = default; +#endif + }; + + // 4x3 Row-major Matrix: 32 bit floating point components aligned on a 16 byte boundary + XM_ALIGNED_STRUCT(16) XMFLOAT4X3A : public XMFLOAT4X3 + { + using XMFLOAT4X3::XMFLOAT4X3; + }; + + 
//------------------------------------------------------------------------------ + // 3x4 Column-major Matrix: 32 bit floating point components + struct XMFLOAT3X4 + { + union + { + struct + { + float _11, _12, _13, _14; + float _21, _22, _23, _24; + float _31, _32, _33, _34; + }; + float m[3][4]; + float f[12]; + }; + + XMFLOAT3X4() = default; + + XMFLOAT3X4(const XMFLOAT3X4&) = default; + XMFLOAT3X4& operator=(const XMFLOAT3X4&) = default; + + XMFLOAT3X4(XMFLOAT3X4&&) = default; + XMFLOAT3X4& operator=(XMFLOAT3X4&&) = default; + + constexpr XMFLOAT3X4(float m00, float m01, float m02, float m03, + float m10, float m11, float m12, float m13, + float m20, float m21, float m22, float m23) noexcept + : _11(m00), _12(m01), _13(m02), _14(m03), + _21(m10), _22(m11), _23(m12), _24(m13), + _31(m20), _32(m21), _33(m22), _34(m23) {} + explicit XMFLOAT3X4(_In_reads_(12) const float* pArray) noexcept; + + float operator() (size_t Row, size_t Column) const noexcept { return m[Row][Column]; } + float& operator() (size_t Row, size_t Column) noexcept { return m[Row][Column]; } + +#if (__cplusplus >= 202002L) + bool operator == (const XMFLOAT3X4&) const = default; + auto operator <=> (const XMFLOAT3X4&) const = default; +#endif + }; + + // 3x4 Column-major Matrix: 32 bit floating point components aligned on a 16 byte boundary + XM_ALIGNED_STRUCT(16) XMFLOAT3X4A : public XMFLOAT3X4 + { + using XMFLOAT3X4::XMFLOAT3X4; + }; + + //------------------------------------------------------------------------------ + // 4x4 Matrix: 32 bit floating point components + struct XMFLOAT4X4 + { + union + { + struct + { + float _11, _12, _13, _14; + float _21, _22, _23, _24; + float _31, _32, _33, _34; + float _41, _42, _43, _44; + }; + float m[4][4]; + }; + + XMFLOAT4X4() = default; + + XMFLOAT4X4(const XMFLOAT4X4&) = default; + XMFLOAT4X4& operator=(const XMFLOAT4X4&) = default; + + XMFLOAT4X4(XMFLOAT4X4&&) = default; + XMFLOAT4X4& operator=(XMFLOAT4X4&&) = default; + + constexpr XMFLOAT4X4(float m00, float m01, float m02, float m03, + float m10, float m11, float m12, float m13, + float m20, float m21, float m22, float m23, + float m30, float m31, float m32, float m33) noexcept + : _11(m00), _12(m01), _13(m02), _14(m03), + _21(m10), _22(m11), _23(m12), _24(m13), + _31(m20), _32(m21), _33(m22), _34(m23), + _41(m30), _42(m31), _43(m32), _44(m33) {} + explicit XMFLOAT4X4(_In_reads_(16) const float* pArray) noexcept; + + float operator() (size_t Row, size_t Column) const noexcept { return m[Row][Column]; } + float& operator() (size_t Row, size_t Column) noexcept { return m[Row][Column]; } + +#if (__cplusplus >= 202002L) + bool operator == (const XMFLOAT4X4&) const = default; + auto operator <=> (const XMFLOAT4X4&) const = default; +#endif + }; + + // 4x4 Matrix: 32 bit floating point components aligned on a 16 byte boundary + XM_ALIGNED_STRUCT(16) XMFLOAT4X4A : public XMFLOAT4X4 + { + using XMFLOAT4X4::XMFLOAT4X4; + }; + + //////////////////////////////////////////////////////////////////////////////// + +#ifdef __clang__ +#pragma clang diagnostic pop +#endif +#ifdef _PREFAST_ +#pragma prefast(pop) +#endif +#ifdef _MSC_VER +#pragma warning(pop) +#endif + +/**************************************************************************** + * + * Data conversion operations + * + ****************************************************************************/ + + XMVECTOR XM_CALLCONV XMConvertVectorIntToFloat(FXMVECTOR VInt, uint32_t DivExponent) noexcept; + XMVECTOR XM_CALLCONV XMConvertVectorFloatToInt(FXMVECTOR VFloat, uint32_t 
MulExponent) noexcept; + XMVECTOR XM_CALLCONV XMConvertVectorUIntToFloat(FXMVECTOR VUInt, uint32_t DivExponent) noexcept; + XMVECTOR XM_CALLCONV XMConvertVectorFloatToUInt(FXMVECTOR VFloat, uint32_t MulExponent) noexcept; + +#if defined(__XNAMATH_H__) && defined(XMVectorSetBinaryConstant) +#undef XMVectorSetBinaryConstant +#undef XMVectorSplatConstant +#undef XMVectorSplatConstantInt +#endif + + XMVECTOR XM_CALLCONV XMVectorSetBinaryConstant(uint32_t C0, uint32_t C1, uint32_t C2, uint32_t C3) noexcept; + XMVECTOR XM_CALLCONV XMVectorSplatConstant(int32_t IntConstant, uint32_t DivExponent) noexcept; + XMVECTOR XM_CALLCONV XMVectorSplatConstantInt(int32_t IntConstant) noexcept; + + /**************************************************************************** + * + * Load operations + * + ****************************************************************************/ + + XMVECTOR XM_CALLCONV XMLoadInt(_In_ const uint32_t* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadFloat(_In_ const float* pSource) noexcept; + + XMVECTOR XM_CALLCONV XMLoadInt2(_In_reads_(2) const uint32_t* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadInt2A(_In_reads_(2) const uint32_t* PSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadFloat2(_In_ const XMFLOAT2* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadFloat2A(_In_ const XMFLOAT2A* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadSInt2(_In_ const XMINT2* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadUInt2(_In_ const XMUINT2* pSource) noexcept; + + XMVECTOR XM_CALLCONV XMLoadInt3(_In_reads_(3) const uint32_t* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadInt3A(_In_reads_(3) const uint32_t* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadFloat3(_In_ const XMFLOAT3* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadFloat3A(_In_ const XMFLOAT3A* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadSInt3(_In_ const XMINT3* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadUInt3(_In_ const XMUINT3* pSource) noexcept; + + XMVECTOR XM_CALLCONV XMLoadInt4(_In_reads_(4) const uint32_t* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadInt4A(_In_reads_(4) const uint32_t* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadFloat4(_In_ const XMFLOAT4* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadFloat4A(_In_ const XMFLOAT4A* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadSInt4(_In_ const XMINT4* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadUInt4(_In_ const XMUINT4* pSource) noexcept; + + XMMATRIX XM_CALLCONV XMLoadFloat3x3(_In_ const XMFLOAT3X3* pSource) noexcept; + XMMATRIX XM_CALLCONV XMLoadFloat4x3(_In_ const XMFLOAT4X3* pSource) noexcept; + XMMATRIX XM_CALLCONV XMLoadFloat4x3A(_In_ const XMFLOAT4X3A* pSource) noexcept; + XMMATRIX XM_CALLCONV XMLoadFloat3x4(_In_ const XMFLOAT3X4* pSource) noexcept; + XMMATRIX XM_CALLCONV XMLoadFloat3x4A(_In_ const XMFLOAT3X4A* pSource) noexcept; + XMMATRIX XM_CALLCONV XMLoadFloat4x4(_In_ const XMFLOAT4X4* pSource) noexcept; + XMMATRIX XM_CALLCONV XMLoadFloat4x4A(_In_ const XMFLOAT4X4A* pSource) noexcept; + + /**************************************************************************** + * + * Store operations + * + ****************************************************************************/ + + void XM_CALLCONV XMStoreInt(_Out_ uint32_t* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreFloat(_Out_ float* pDestination, _In_ FXMVECTOR V) noexcept; + + void XM_CALLCONV XMStoreInt2(_Out_writes_(2) uint32_t* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreInt2A(_Out_writes_(2) uint32_t* pDestination, _In_ 
FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreFloat2(_Out_ XMFLOAT2* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreFloat2A(_Out_ XMFLOAT2A* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreSInt2(_Out_ XMINT2* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreUInt2(_Out_ XMUINT2* pDestination, _In_ FXMVECTOR V) noexcept; + + void XM_CALLCONV XMStoreInt3(_Out_writes_(3) uint32_t* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreInt3A(_Out_writes_(3) uint32_t* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreFloat3(_Out_ XMFLOAT3* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreFloat3A(_Out_ XMFLOAT3A* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreSInt3(_Out_ XMINT3* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreUInt3(_Out_ XMUINT3* pDestination, _In_ FXMVECTOR V) noexcept; + + void XM_CALLCONV XMStoreInt4(_Out_writes_(4) uint32_t* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreInt4A(_Out_writes_(4) uint32_t* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreFloat4(_Out_ XMFLOAT4* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreFloat4A(_Out_ XMFLOAT4A* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreSInt4(_Out_ XMINT4* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreUInt4(_Out_ XMUINT4* pDestination, _In_ FXMVECTOR V) noexcept; + + void XM_CALLCONV XMStoreFloat3x3(_Out_ XMFLOAT3X3* pDestination, _In_ FXMMATRIX M) noexcept; + void XM_CALLCONV XMStoreFloat4x3(_Out_ XMFLOAT4X3* pDestination, _In_ FXMMATRIX M) noexcept; + void XM_CALLCONV XMStoreFloat4x3A(_Out_ XMFLOAT4X3A* pDestination, _In_ FXMMATRIX M) noexcept; + void XM_CALLCONV XMStoreFloat3x4(_Out_ XMFLOAT3X4* pDestination, _In_ FXMMATRIX M) noexcept; + void XM_CALLCONV XMStoreFloat3x4A(_Out_ XMFLOAT3X4A* pDestination, _In_ FXMMATRIX M) noexcept; + void XM_CALLCONV XMStoreFloat4x4(_Out_ XMFLOAT4X4* pDestination, _In_ FXMMATRIX M) noexcept; + void XM_CALLCONV XMStoreFloat4x4A(_Out_ XMFLOAT4X4A* pDestination, _In_ FXMMATRIX M) noexcept; + + /**************************************************************************** + * + * General vector operations + * + ****************************************************************************/ + + XMVECTOR XM_CALLCONV XMVectorZero() noexcept; + XMVECTOR XM_CALLCONV XMVectorSet(float x, float y, float z, float w) noexcept; + XMVECTOR XM_CALLCONV XMVectorSetInt(uint32_t x, uint32_t y, uint32_t z, uint32_t w) noexcept; + XMVECTOR XM_CALLCONV XMVectorReplicate(float Value) noexcept; + XMVECTOR XM_CALLCONV XMVectorReplicatePtr(_In_ const float* pValue) noexcept; + XMVECTOR XM_CALLCONV XMVectorReplicateInt(uint32_t Value) noexcept; + XMVECTOR XM_CALLCONV XMVectorReplicateIntPtr(_In_ const uint32_t* pValue) noexcept; + XMVECTOR XM_CALLCONV XMVectorTrueInt() noexcept; + XMVECTOR XM_CALLCONV XMVectorFalseInt() noexcept; + XMVECTOR XM_CALLCONV XMVectorSplatX(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorSplatY(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorSplatZ(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorSplatW(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorSplatOne() noexcept; + XMVECTOR XM_CALLCONV XMVectorSplatInfinity() noexcept; + XMVECTOR XM_CALLCONV XMVectorSplatQNaN() noexcept; + XMVECTOR XM_CALLCONV XMVectorSplatEpsilon() noexcept; + XMVECTOR XM_CALLCONV XMVectorSplatSignMask() noexcept; + + float XM_CALLCONV 
XMVectorGetByIndex(FXMVECTOR V, size_t i) noexcept; + float XM_CALLCONV XMVectorGetX(FXMVECTOR V) noexcept; + float XM_CALLCONV XMVectorGetY(FXMVECTOR V) noexcept; + float XM_CALLCONV XMVectorGetZ(FXMVECTOR V) noexcept; + float XM_CALLCONV XMVectorGetW(FXMVECTOR V) noexcept; + + void XM_CALLCONV XMVectorGetByIndexPtr(_Out_ float* f, _In_ FXMVECTOR V, _In_ size_t i) noexcept; + void XM_CALLCONV XMVectorGetXPtr(_Out_ float* x, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMVectorGetYPtr(_Out_ float* y, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMVectorGetZPtr(_Out_ float* z, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMVectorGetWPtr(_Out_ float* w, _In_ FXMVECTOR V) noexcept; + + uint32_t XM_CALLCONV XMVectorGetIntByIndex(FXMVECTOR V, size_t i) noexcept; + uint32_t XM_CALLCONV XMVectorGetIntX(FXMVECTOR V) noexcept; + uint32_t XM_CALLCONV XMVectorGetIntY(FXMVECTOR V) noexcept; + uint32_t XM_CALLCONV XMVectorGetIntZ(FXMVECTOR V) noexcept; + uint32_t XM_CALLCONV XMVectorGetIntW(FXMVECTOR V) noexcept; + + void XM_CALLCONV XMVectorGetIntByIndexPtr(_Out_ uint32_t* x, _In_ FXMVECTOR V, _In_ size_t i) noexcept; + void XM_CALLCONV XMVectorGetIntXPtr(_Out_ uint32_t* x, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMVectorGetIntYPtr(_Out_ uint32_t* y, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMVectorGetIntZPtr(_Out_ uint32_t* z, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMVectorGetIntWPtr(_Out_ uint32_t* w, _In_ FXMVECTOR V) noexcept; + + XMVECTOR XM_CALLCONV XMVectorSetByIndex(FXMVECTOR V, float f, size_t i) noexcept; + XMVECTOR XM_CALLCONV XMVectorSetX(FXMVECTOR V, float x) noexcept; + XMVECTOR XM_CALLCONV XMVectorSetY(FXMVECTOR V, float y) noexcept; + XMVECTOR XM_CALLCONV XMVectorSetZ(FXMVECTOR V, float z) noexcept; + XMVECTOR XM_CALLCONV XMVectorSetW(FXMVECTOR V, float w) noexcept; + + XMVECTOR XM_CALLCONV XMVectorSetByIndexPtr(_In_ FXMVECTOR V, _In_ const float* f, _In_ size_t i) noexcept; + XMVECTOR XM_CALLCONV XMVectorSetXPtr(_In_ FXMVECTOR V, _In_ const float* x) noexcept; + XMVECTOR XM_CALLCONV XMVectorSetYPtr(_In_ FXMVECTOR V, _In_ const float* y) noexcept; + XMVECTOR XM_CALLCONV XMVectorSetZPtr(_In_ FXMVECTOR V, _In_ const float* z) noexcept; + XMVECTOR XM_CALLCONV XMVectorSetWPtr(_In_ FXMVECTOR V, _In_ const float* w) noexcept; + + XMVECTOR XM_CALLCONV XMVectorSetIntByIndex(FXMVECTOR V, uint32_t x, size_t i) noexcept; + XMVECTOR XM_CALLCONV XMVectorSetIntX(FXMVECTOR V, uint32_t x) noexcept; + XMVECTOR XM_CALLCONV XMVectorSetIntY(FXMVECTOR V, uint32_t y) noexcept; + XMVECTOR XM_CALLCONV XMVectorSetIntZ(FXMVECTOR V, uint32_t z) noexcept; + XMVECTOR XM_CALLCONV XMVectorSetIntW(FXMVECTOR V, uint32_t w) noexcept; + + XMVECTOR XM_CALLCONV XMVectorSetIntByIndexPtr(_In_ FXMVECTOR V, _In_ const uint32_t* x, _In_ size_t i) noexcept; + XMVECTOR XM_CALLCONV XMVectorSetIntXPtr(_In_ FXMVECTOR V, _In_ const uint32_t* x) noexcept; + XMVECTOR XM_CALLCONV XMVectorSetIntYPtr(_In_ FXMVECTOR V, _In_ const uint32_t* y) noexcept; + XMVECTOR XM_CALLCONV XMVectorSetIntZPtr(_In_ FXMVECTOR V, _In_ const uint32_t* z) noexcept; + XMVECTOR XM_CALLCONV XMVectorSetIntWPtr(_In_ FXMVECTOR V, _In_ const uint32_t* w) noexcept; + +#if defined(__XNAMATH_H__) && defined(XMVectorSwizzle) +#undef XMVectorSwizzle +#endif + + XMVECTOR XM_CALLCONV XMVectorSwizzle(FXMVECTOR V, uint32_t E0, uint32_t E1, uint32_t E2, uint32_t E3) noexcept; + XMVECTOR XM_CALLCONV XMVectorPermute(FXMVECTOR V1, FXMVECTOR V2, uint32_t PermuteX, uint32_t PermuteY, uint32_t PermuteZ, uint32_t PermuteW) noexcept; + XMVECTOR XM_CALLCONV 
XMVectorSelectControl(uint32_t VectorIndex0, uint32_t VectorIndex1, uint32_t VectorIndex2, uint32_t VectorIndex3) noexcept; + XMVECTOR XM_CALLCONV XMVectorSelect(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR Control) noexcept; + XMVECTOR XM_CALLCONV XMVectorMergeXY(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorMergeZW(FXMVECTOR V1, FXMVECTOR V2) noexcept; + +#if defined(__XNAMATH_H__) && defined(XMVectorShiftLeft) +#undef XMVectorShiftLeft +#undef XMVectorRotateLeft +#undef XMVectorRotateRight +#undef XMVectorInsert +#endif + + XMVECTOR XM_CALLCONV XMVectorShiftLeft(FXMVECTOR V1, FXMVECTOR V2, uint32_t Elements) noexcept; + XMVECTOR XM_CALLCONV XMVectorRotateLeft(FXMVECTOR V, uint32_t Elements) noexcept; + XMVECTOR XM_CALLCONV XMVectorRotateRight(FXMVECTOR V, uint32_t Elements) noexcept; + XMVECTOR XM_CALLCONV XMVectorInsert(FXMVECTOR VD, FXMVECTOR VS, uint32_t VSLeftRotateElements, + uint32_t Select0, uint32_t Select1, uint32_t Select2, uint32_t Select3) noexcept; + + XMVECTOR XM_CALLCONV XMVectorEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorEqualR(_Out_ uint32_t* pCR, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorEqualInt(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorEqualIntR(_Out_ uint32_t* pCR, _In_ FXMVECTOR V, _In_ FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorNearEqual(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR Epsilon) noexcept; + XMVECTOR XM_CALLCONV XMVectorNotEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorNotEqualInt(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorGreater(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorGreaterR(_Out_ uint32_t* pCR, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorGreaterOrEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorGreaterOrEqualR(_Out_ uint32_t* pCR, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorLess(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorLessOrEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorInBounds(FXMVECTOR V, FXMVECTOR Bounds) noexcept; + XMVECTOR XM_CALLCONV XMVectorInBoundsR(_Out_ uint32_t* pCR, _In_ FXMVECTOR V, _In_ FXMVECTOR Bounds) noexcept; + + XMVECTOR XM_CALLCONV XMVectorIsNaN(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorIsInfinite(FXMVECTOR V) noexcept; + + XMVECTOR XM_CALLCONV XMVectorMin(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorMax(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorRound(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorTruncate(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorFloor(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorCeiling(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorClamp(FXMVECTOR V, FXMVECTOR Min, FXMVECTOR Max) noexcept; + XMVECTOR XM_CALLCONV XMVectorSaturate(FXMVECTOR V) noexcept; + + XMVECTOR XM_CALLCONV XMVectorAndInt(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorAndCInt(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorOrInt(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorNorInt(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorXorInt(FXMVECTOR V1, FXMVECTOR V2) noexcept; + + XMVECTOR XM_CALLCONV XMVectorNegate(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorAdd(FXMVECTOR V1, FXMVECTOR V2) noexcept; + 
XMVECTOR XM_CALLCONV XMVectorSum(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorAddAngles(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorSubtract(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorSubtractAngles(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorMultiply(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorMultiplyAdd(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR V3) noexcept; + XMVECTOR XM_CALLCONV XMVectorDivide(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorNegativeMultiplySubtract(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR V3) noexcept; + XMVECTOR XM_CALLCONV XMVectorScale(FXMVECTOR V, float ScaleFactor) noexcept; + XMVECTOR XM_CALLCONV XMVectorReciprocalEst(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorReciprocal(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorSqrtEst(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorSqrt(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorReciprocalSqrtEst(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorReciprocalSqrt(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorExp2(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorExp10(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorExpE(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorExp(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorLog2(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorLog10(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorLogE(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorLog(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorPow(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorAbs(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorMod(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVectorModAngles(FXMVECTOR Angles) noexcept; + XMVECTOR XM_CALLCONV XMVectorSin(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorSinEst(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorCos(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorCosEst(FXMVECTOR V) noexcept; + void XM_CALLCONV XMVectorSinCos(_Out_ XMVECTOR* pSin, _Out_ XMVECTOR* pCos, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMVectorSinCosEst(_Out_ XMVECTOR* pSin, _Out_ XMVECTOR* pCos, _In_ FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorTan(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorTanEst(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorSinH(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorCosH(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorTanH(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorASin(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorASinEst(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorACos(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorACosEst(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorATan(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorATanEst(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVectorATan2(FXMVECTOR Y, FXMVECTOR X) noexcept; + XMVECTOR XM_CALLCONV XMVectorATan2Est(FXMVECTOR Y, FXMVECTOR X) noexcept; + XMVECTOR XM_CALLCONV XMVectorLerp(FXMVECTOR V0, FXMVECTOR V1, float t) noexcept; + XMVECTOR XM_CALLCONV XMVectorLerpV(FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR T) noexcept; + XMVECTOR XM_CALLCONV XMVectorHermite(FXMVECTOR Position0, FXMVECTOR Tangent0, FXMVECTOR Position1, GXMVECTOR Tangent1, float t) noexcept; + XMVECTOR XM_CALLCONV XMVectorHermiteV(FXMVECTOR Position0, FXMVECTOR Tangent0, FXMVECTOR Position1, 
GXMVECTOR Tangent1, HXMVECTOR T) noexcept; + XMVECTOR XM_CALLCONV XMVectorCatmullRom(FXMVECTOR Position0, FXMVECTOR Position1, FXMVECTOR Position2, GXMVECTOR Position3, float t) noexcept; + XMVECTOR XM_CALLCONV XMVectorCatmullRomV(FXMVECTOR Position0, FXMVECTOR Position1, FXMVECTOR Position2, GXMVECTOR Position3, HXMVECTOR T) noexcept; + XMVECTOR XM_CALLCONV XMVectorBaryCentric(FXMVECTOR Position0, FXMVECTOR Position1, FXMVECTOR Position2, float f, float g) noexcept; + XMVECTOR XM_CALLCONV XMVectorBaryCentricV(FXMVECTOR Position0, FXMVECTOR Position1, FXMVECTOR Position2, GXMVECTOR F, HXMVECTOR G) noexcept; + + /**************************************************************************** + * + * 2D vector operations + * + ****************************************************************************/ + + bool XM_CALLCONV XMVector2Equal(FXMVECTOR V1, FXMVECTOR V2) noexcept; + uint32_t XM_CALLCONV XMVector2EqualR(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector2EqualInt(FXMVECTOR V1, FXMVECTOR V2) noexcept; + uint32_t XM_CALLCONV XMVector2EqualIntR(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector2NearEqual(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR Epsilon) noexcept; + bool XM_CALLCONV XMVector2NotEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector2NotEqualInt(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector2Greater(FXMVECTOR V1, FXMVECTOR V2) noexcept; + uint32_t XM_CALLCONV XMVector2GreaterR(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector2GreaterOrEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept; + uint32_t XM_CALLCONV XMVector2GreaterOrEqualR(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector2Less(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector2LessOrEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector2InBounds(FXMVECTOR V, FXMVECTOR Bounds) noexcept; + + bool XM_CALLCONV XMVector2IsNaN(FXMVECTOR V) noexcept; + bool XM_CALLCONV XMVector2IsInfinite(FXMVECTOR V) noexcept; + + XMVECTOR XM_CALLCONV XMVector2Dot(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVector2Cross(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVector2LengthSq(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector2ReciprocalLengthEst(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector2ReciprocalLength(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector2LengthEst(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector2Length(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector2NormalizeEst(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector2Normalize(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector2ClampLength(FXMVECTOR V, float LengthMin, float LengthMax) noexcept; + XMVECTOR XM_CALLCONV XMVector2ClampLengthV(FXMVECTOR V, FXMVECTOR LengthMin, FXMVECTOR LengthMax) noexcept; + XMVECTOR XM_CALLCONV XMVector2Reflect(FXMVECTOR Incident, FXMVECTOR Normal) noexcept; + XMVECTOR XM_CALLCONV XMVector2Refract(FXMVECTOR Incident, FXMVECTOR Normal, float RefractionIndex) noexcept; + XMVECTOR XM_CALLCONV XMVector2RefractV(FXMVECTOR Incident, FXMVECTOR Normal, FXMVECTOR RefractionIndex) noexcept; + XMVECTOR XM_CALLCONV XMVector2Orthogonal(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector2AngleBetweenNormalsEst(FXMVECTOR N1, FXMVECTOR N2) noexcept; + XMVECTOR XM_CALLCONV XMVector2AngleBetweenNormals(FXMVECTOR N1, FXMVECTOR N2) noexcept; + XMVECTOR XM_CALLCONV XMVector2AngleBetweenVectors(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV 
XMVector2LinePointDistance(FXMVECTOR LinePoint1, FXMVECTOR LinePoint2, FXMVECTOR Point) noexcept; + XMVECTOR XM_CALLCONV XMVector2IntersectLine(FXMVECTOR Line1Point1, FXMVECTOR Line1Point2, FXMVECTOR Line2Point1, GXMVECTOR Line2Point2) noexcept; + XMVECTOR XM_CALLCONV XMVector2Transform(FXMVECTOR V, FXMMATRIX M) noexcept; + XMFLOAT4* XM_CALLCONV XMVector2TransformStream(_Out_writes_bytes_(sizeof(XMFLOAT4) + OutputStride * (VectorCount - 1)) XMFLOAT4* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(XMFLOAT2) + InputStride * (VectorCount - 1)) const XMFLOAT2* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, _In_ FXMMATRIX M) noexcept; + XMVECTOR XM_CALLCONV XMVector2TransformCoord(FXMVECTOR V, FXMMATRIX M) noexcept; + XMFLOAT2* XM_CALLCONV XMVector2TransformCoordStream(_Out_writes_bytes_(sizeof(XMFLOAT2) + OutputStride * (VectorCount - 1)) XMFLOAT2* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(XMFLOAT2) + InputStride * (VectorCount - 1)) const XMFLOAT2* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, _In_ FXMMATRIX M) noexcept; + XMVECTOR XM_CALLCONV XMVector2TransformNormal(FXMVECTOR V, FXMMATRIX M) noexcept; + XMFLOAT2* XM_CALLCONV XMVector2TransformNormalStream(_Out_writes_bytes_(sizeof(XMFLOAT2) + OutputStride * (VectorCount - 1)) XMFLOAT2* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(XMFLOAT2) + InputStride * (VectorCount - 1)) const XMFLOAT2* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, _In_ FXMMATRIX M) noexcept; + + /**************************************************************************** + * + * 3D vector operations + * + ****************************************************************************/ + + bool XM_CALLCONV XMVector3Equal(FXMVECTOR V1, FXMVECTOR V2) noexcept; + uint32_t XM_CALLCONV XMVector3EqualR(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector3EqualInt(FXMVECTOR V1, FXMVECTOR V2) noexcept; + uint32_t XM_CALLCONV XMVector3EqualIntR(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector3NearEqual(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR Epsilon) noexcept; + bool XM_CALLCONV XMVector3NotEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector3NotEqualInt(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector3Greater(FXMVECTOR V1, FXMVECTOR V2) noexcept; + uint32_t XM_CALLCONV XMVector3GreaterR(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector3GreaterOrEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept; + uint32_t XM_CALLCONV XMVector3GreaterOrEqualR(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector3Less(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector3LessOrEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector3InBounds(FXMVECTOR V, FXMVECTOR Bounds) noexcept; + + bool XM_CALLCONV XMVector3IsNaN(FXMVECTOR V) noexcept; + bool XM_CALLCONV XMVector3IsInfinite(FXMVECTOR V) noexcept; + + XMVECTOR XM_CALLCONV XMVector3Dot(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVector3Cross(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVector3LengthSq(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector3ReciprocalLengthEst(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector3ReciprocalLength(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector3LengthEst(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector3Length(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector3NormalizeEst(FXMVECTOR V) noexcept; + XMVECTOR 
XM_CALLCONV XMVector3Normalize(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector3ClampLength(FXMVECTOR V, float LengthMin, float LengthMax) noexcept; + XMVECTOR XM_CALLCONV XMVector3ClampLengthV(FXMVECTOR V, FXMVECTOR LengthMin, FXMVECTOR LengthMax) noexcept; + XMVECTOR XM_CALLCONV XMVector3Reflect(FXMVECTOR Incident, FXMVECTOR Normal) noexcept; + XMVECTOR XM_CALLCONV XMVector3Refract(FXMVECTOR Incident, FXMVECTOR Normal, float RefractionIndex) noexcept; + XMVECTOR XM_CALLCONV XMVector3RefractV(FXMVECTOR Incident, FXMVECTOR Normal, FXMVECTOR RefractionIndex) noexcept; + XMVECTOR XM_CALLCONV XMVector3Orthogonal(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector3AngleBetweenNormalsEst(FXMVECTOR N1, FXMVECTOR N2) noexcept; + XMVECTOR XM_CALLCONV XMVector3AngleBetweenNormals(FXMVECTOR N1, FXMVECTOR N2) noexcept; + XMVECTOR XM_CALLCONV XMVector3AngleBetweenVectors(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVector3LinePointDistance(FXMVECTOR LinePoint1, FXMVECTOR LinePoint2, FXMVECTOR Point) noexcept; + void XM_CALLCONV XMVector3ComponentsFromNormal(_Out_ XMVECTOR* pParallel, _Out_ XMVECTOR* pPerpendicular, _In_ FXMVECTOR V, _In_ FXMVECTOR Normal) noexcept; + XMVECTOR XM_CALLCONV XMVector3Rotate(FXMVECTOR V, FXMVECTOR RotationQuaternion) noexcept; + XMVECTOR XM_CALLCONV XMVector3InverseRotate(FXMVECTOR V, FXMVECTOR RotationQuaternion) noexcept; + XMVECTOR XM_CALLCONV XMVector3Transform(FXMVECTOR V, FXMMATRIX M) noexcept; + XMFLOAT4* XM_CALLCONV XMVector3TransformStream(_Out_writes_bytes_(sizeof(XMFLOAT4) + OutputStride * (VectorCount - 1)) XMFLOAT4* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(XMFLOAT3) + InputStride * (VectorCount - 1)) const XMFLOAT3* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, _In_ FXMMATRIX M) noexcept; + XMVECTOR XM_CALLCONV XMVector3TransformCoord(FXMVECTOR V, FXMMATRIX M) noexcept; + XMFLOAT3* XM_CALLCONV XMVector3TransformCoordStream(_Out_writes_bytes_(sizeof(XMFLOAT3) + OutputStride * (VectorCount - 1)) XMFLOAT3* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(XMFLOAT3) + InputStride * (VectorCount - 1)) const XMFLOAT3* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, _In_ FXMMATRIX M) noexcept; + XMVECTOR XM_CALLCONV XMVector3TransformNormal(FXMVECTOR V, FXMMATRIX M) noexcept; + XMFLOAT3* XM_CALLCONV XMVector3TransformNormalStream(_Out_writes_bytes_(sizeof(XMFLOAT3) + OutputStride * (VectorCount - 1)) XMFLOAT3* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(XMFLOAT3) + InputStride * (VectorCount - 1)) const XMFLOAT3* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, _In_ FXMMATRIX M) noexcept; + XMVECTOR XM_CALLCONV XMVector3Project(FXMVECTOR V, float ViewportX, float ViewportY, float ViewportWidth, float ViewportHeight, float ViewportMinZ, float ViewportMaxZ, + FXMMATRIX Projection, CXMMATRIX View, CXMMATRIX World) noexcept; + XMFLOAT3* XM_CALLCONV XMVector3ProjectStream(_Out_writes_bytes_(sizeof(XMFLOAT3) + OutputStride * (VectorCount - 1)) XMFLOAT3* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(XMFLOAT3) + InputStride * (VectorCount - 1)) const XMFLOAT3* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, + _In_ float ViewportX, _In_ float ViewportY, _In_ float ViewportWidth, _In_ float ViewportHeight, _In_ float ViewportMinZ, _In_ float ViewportMaxZ, + _In_ FXMMATRIX Projection, _In_ CXMMATRIX View, _In_ CXMMATRIX World) noexcept; + XMVECTOR XM_CALLCONV 
XMVector3Unproject(FXMVECTOR V, float ViewportX, float ViewportY, float ViewportWidth, float ViewportHeight, float ViewportMinZ, float ViewportMaxZ, + FXMMATRIX Projection, CXMMATRIX View, CXMMATRIX World) noexcept; + XMFLOAT3* XM_CALLCONV XMVector3UnprojectStream(_Out_writes_bytes_(sizeof(XMFLOAT3) + OutputStride * (VectorCount - 1)) XMFLOAT3* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(XMFLOAT3) + InputStride * (VectorCount - 1)) const XMFLOAT3* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, + _In_ float ViewportX, _In_ float ViewportY, _In_ float ViewportWidth, _In_ float ViewportHeight, _In_ float ViewportMinZ, _In_ float ViewportMaxZ, + _In_ FXMMATRIX Projection, _In_ CXMMATRIX View, _In_ CXMMATRIX World) noexcept; + + /**************************************************************************** + * + * 4D vector operations + * + ****************************************************************************/ + + bool XM_CALLCONV XMVector4Equal(FXMVECTOR V1, FXMVECTOR V2) noexcept; + uint32_t XM_CALLCONV XMVector4EqualR(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector4EqualInt(FXMVECTOR V1, FXMVECTOR V2) noexcept; + uint32_t XM_CALLCONV XMVector4EqualIntR(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector4NearEqual(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR Epsilon) noexcept; + bool XM_CALLCONV XMVector4NotEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector4NotEqualInt(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector4Greater(FXMVECTOR V1, FXMVECTOR V2) noexcept; + uint32_t XM_CALLCONV XMVector4GreaterR(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector4GreaterOrEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept; + uint32_t XM_CALLCONV XMVector4GreaterOrEqualR(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector4Less(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector4LessOrEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept; + bool XM_CALLCONV XMVector4InBounds(FXMVECTOR V, FXMVECTOR Bounds) noexcept; + + bool XM_CALLCONV XMVector4IsNaN(FXMVECTOR V) noexcept; + bool XM_CALLCONV XMVector4IsInfinite(FXMVECTOR V) noexcept; + + XMVECTOR XM_CALLCONV XMVector4Dot(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVector4Cross(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR V3) noexcept; + XMVECTOR XM_CALLCONV XMVector4LengthSq(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector4ReciprocalLengthEst(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector4ReciprocalLength(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector4LengthEst(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector4Length(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector4NormalizeEst(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector4Normalize(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector4ClampLength(FXMVECTOR V, float LengthMin, float LengthMax) noexcept; + XMVECTOR XM_CALLCONV XMVector4ClampLengthV(FXMVECTOR V, FXMVECTOR LengthMin, FXMVECTOR LengthMax) noexcept; + XMVECTOR XM_CALLCONV XMVector4Reflect(FXMVECTOR Incident, FXMVECTOR Normal) noexcept; + XMVECTOR XM_CALLCONV XMVector4Refract(FXMVECTOR Incident, FXMVECTOR Normal, float RefractionIndex) noexcept; + XMVECTOR XM_CALLCONV XMVector4RefractV(FXMVECTOR Incident, FXMVECTOR Normal, FXMVECTOR RefractionIndex) noexcept; + XMVECTOR XM_CALLCONV XMVector4Orthogonal(FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMVector4AngleBetweenNormalsEst(FXMVECTOR N1, FXMVECTOR N2) noexcept; + XMVECTOR XM_CALLCONV 
XMVector4AngleBetweenNormals(FXMVECTOR N1, FXMVECTOR N2) noexcept; + XMVECTOR XM_CALLCONV XMVector4AngleBetweenVectors(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMVector4Transform(FXMVECTOR V, FXMMATRIX M) noexcept; + XMFLOAT4* XM_CALLCONV XMVector4TransformStream(_Out_writes_bytes_(sizeof(XMFLOAT4) + OutputStride * (VectorCount - 1)) XMFLOAT4* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(XMFLOAT4) + InputStride * (VectorCount - 1)) const XMFLOAT4* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, _In_ FXMMATRIX M) noexcept; + + /**************************************************************************** + * + * Matrix operations + * + ****************************************************************************/ + + bool XM_CALLCONV XMMatrixIsNaN(FXMMATRIX M) noexcept; + bool XM_CALLCONV XMMatrixIsInfinite(FXMMATRIX M) noexcept; + bool XM_CALLCONV XMMatrixIsIdentity(FXMMATRIX M) noexcept; + + XMMATRIX XM_CALLCONV XMMatrixMultiply(FXMMATRIX M1, CXMMATRIX M2) noexcept; + XMMATRIX XM_CALLCONV XMMatrixMultiplyTranspose(FXMMATRIX M1, CXMMATRIX M2) noexcept; + XMMATRIX XM_CALLCONV XMMatrixTranspose(FXMMATRIX M) noexcept; + XMMATRIX XM_CALLCONV XMMatrixInverse(_Out_opt_ XMVECTOR* pDeterminant, _In_ FXMMATRIX M) noexcept; + XMMATRIX XM_CALLCONV XMMatrixVectorTensorProduct(FXMVECTOR V1, FXMVECTOR V2) noexcept; + XMVECTOR XM_CALLCONV XMMatrixDeterminant(FXMMATRIX M) noexcept; + + _Success_(return) + bool XM_CALLCONV XMMatrixDecompose(_Out_ XMVECTOR* outScale, _Out_ XMVECTOR* outRotQuat, _Out_ XMVECTOR* outTrans, _In_ FXMMATRIX M) noexcept; + + XMMATRIX XM_CALLCONV XMMatrixIdentity() noexcept; + XMMATRIX XM_CALLCONV XMMatrixSet(float m00, float m01, float m02, float m03, + float m10, float m11, float m12, float m13, + float m20, float m21, float m22, float m23, + float m30, float m31, float m32, float m33) noexcept; + XMMATRIX XM_CALLCONV XMMatrixTranslation(float OffsetX, float OffsetY, float OffsetZ) noexcept; + XMMATRIX XM_CALLCONV XMMatrixTranslationFromVector(FXMVECTOR Offset) noexcept; + XMMATRIX XM_CALLCONV XMMatrixScaling(float ScaleX, float ScaleY, float ScaleZ) noexcept; + XMMATRIX XM_CALLCONV XMMatrixScalingFromVector(FXMVECTOR Scale) noexcept; + XMMATRIX XM_CALLCONV XMMatrixRotationX(float Angle) noexcept; + XMMATRIX XM_CALLCONV XMMatrixRotationY(float Angle) noexcept; + XMMATRIX XM_CALLCONV XMMatrixRotationZ(float Angle) noexcept; + + // Rotates about y-axis (Yaw), then x-axis (Pitch), then z-axis (Roll) + XMMATRIX XM_CALLCONV XMMatrixRotationRollPitchYaw(float Pitch, float Yaw, float Roll) noexcept; + + // Rotates about y-axis (Angles.y), then x-axis (Angles.x), then z-axis (Angles.z) + XMMATRIX XM_CALLCONV XMMatrixRotationRollPitchYawFromVector(FXMVECTOR Angles) noexcept; + + XMMATRIX XM_CALLCONV XMMatrixRotationNormal(FXMVECTOR NormalAxis, float Angle) noexcept; + XMMATRIX XM_CALLCONV XMMatrixRotationAxis(FXMVECTOR Axis, float Angle) noexcept; + XMMATRIX XM_CALLCONV XMMatrixRotationQuaternion(FXMVECTOR Quaternion) noexcept; + XMMATRIX XM_CALLCONV XMMatrixTransformation2D(FXMVECTOR ScalingOrigin, float ScalingOrientation, FXMVECTOR Scaling, + FXMVECTOR RotationOrigin, float Rotation, GXMVECTOR Translation) noexcept; + XMMATRIX XM_CALLCONV XMMatrixTransformation(FXMVECTOR ScalingOrigin, FXMVECTOR ScalingOrientationQuaternion, FXMVECTOR Scaling, + GXMVECTOR RotationOrigin, HXMVECTOR RotationQuaternion, HXMVECTOR Translation) noexcept; + XMMATRIX XM_CALLCONV XMMatrixAffineTransformation2D(FXMVECTOR Scaling, FXMVECTOR 
RotationOrigin, float Rotation, FXMVECTOR Translation) noexcept; + XMMATRIX XM_CALLCONV XMMatrixAffineTransformation(FXMVECTOR Scaling, FXMVECTOR RotationOrigin, FXMVECTOR RotationQuaternion, GXMVECTOR Translation) noexcept; + XMMATRIX XM_CALLCONV XMMatrixReflect(FXMVECTOR ReflectionPlane) noexcept; + XMMATRIX XM_CALLCONV XMMatrixShadow(FXMVECTOR ShadowPlane, FXMVECTOR LightPosition) noexcept; + + XMMATRIX XM_CALLCONV XMMatrixLookAtLH(FXMVECTOR EyePosition, FXMVECTOR FocusPosition, FXMVECTOR UpDirection) noexcept; + XMMATRIX XM_CALLCONV XMMatrixLookAtRH(FXMVECTOR EyePosition, FXMVECTOR FocusPosition, FXMVECTOR UpDirection) noexcept; + XMMATRIX XM_CALLCONV XMMatrixLookToLH(FXMVECTOR EyePosition, FXMVECTOR EyeDirection, FXMVECTOR UpDirection) noexcept; + XMMATRIX XM_CALLCONV XMMatrixLookToRH(FXMVECTOR EyePosition, FXMVECTOR EyeDirection, FXMVECTOR UpDirection) noexcept; + XMMATRIX XM_CALLCONV XMMatrixPerspectiveLH(float ViewWidth, float ViewHeight, float NearZ, float FarZ) noexcept; + XMMATRIX XM_CALLCONV XMMatrixPerspectiveRH(float ViewWidth, float ViewHeight, float NearZ, float FarZ) noexcept; + XMMATRIX XM_CALLCONV XMMatrixPerspectiveFovLH(float FovAngleY, float AspectRatio, float NearZ, float FarZ) noexcept; + XMMATRIX XM_CALLCONV XMMatrixPerspectiveFovRH(float FovAngleY, float AspectRatio, float NearZ, float FarZ) noexcept; + XMMATRIX XM_CALLCONV XMMatrixPerspectiveOffCenterLH(float ViewLeft, float ViewRight, float ViewBottom, float ViewTop, float NearZ, float FarZ) noexcept; + XMMATRIX XM_CALLCONV XMMatrixPerspectiveOffCenterRH(float ViewLeft, float ViewRight, float ViewBottom, float ViewTop, float NearZ, float FarZ) noexcept; + XMMATRIX XM_CALLCONV XMMatrixOrthographicLH(float ViewWidth, float ViewHeight, float NearZ, float FarZ) noexcept; + XMMATRIX XM_CALLCONV XMMatrixOrthographicRH(float ViewWidth, float ViewHeight, float NearZ, float FarZ) noexcept; + XMMATRIX XM_CALLCONV XMMatrixOrthographicOffCenterLH(float ViewLeft, float ViewRight, float ViewBottom, float ViewTop, float NearZ, float FarZ) noexcept; + XMMATRIX XM_CALLCONV XMMatrixOrthographicOffCenterRH(float ViewLeft, float ViewRight, float ViewBottom, float ViewTop, float NearZ, float FarZ) noexcept; + + + /**************************************************************************** + * + * Quaternion operations + * + ****************************************************************************/ + + bool XM_CALLCONV XMQuaternionEqual(FXMVECTOR Q1, FXMVECTOR Q2) noexcept; + bool XM_CALLCONV XMQuaternionNotEqual(FXMVECTOR Q1, FXMVECTOR Q2) noexcept; + + bool XM_CALLCONV XMQuaternionIsNaN(FXMVECTOR Q) noexcept; + bool XM_CALLCONV XMQuaternionIsInfinite(FXMVECTOR Q) noexcept; + bool XM_CALLCONV XMQuaternionIsIdentity(FXMVECTOR Q) noexcept; + + XMVECTOR XM_CALLCONV XMQuaternionDot(FXMVECTOR Q1, FXMVECTOR Q2) noexcept; + XMVECTOR XM_CALLCONV XMQuaternionMultiply(FXMVECTOR Q1, FXMVECTOR Q2) noexcept; + XMVECTOR XM_CALLCONV XMQuaternionLengthSq(FXMVECTOR Q) noexcept; + XMVECTOR XM_CALLCONV XMQuaternionReciprocalLength(FXMVECTOR Q) noexcept; + XMVECTOR XM_CALLCONV XMQuaternionLength(FXMVECTOR Q) noexcept; + XMVECTOR XM_CALLCONV XMQuaternionNormalizeEst(FXMVECTOR Q) noexcept; + XMVECTOR XM_CALLCONV XMQuaternionNormalize(FXMVECTOR Q) noexcept; + XMVECTOR XM_CALLCONV XMQuaternionConjugate(FXMVECTOR Q) noexcept; + XMVECTOR XM_CALLCONV XMQuaternionInverse(FXMVECTOR Q) noexcept; + XMVECTOR XM_CALLCONV XMQuaternionLn(FXMVECTOR Q) noexcept; + XMVECTOR XM_CALLCONV XMQuaternionExp(FXMVECTOR Q) noexcept; + XMVECTOR XM_CALLCONV 
XMQuaternionSlerp(FXMVECTOR Q0, FXMVECTOR Q1, float t) noexcept; + XMVECTOR XM_CALLCONV XMQuaternionSlerpV(FXMVECTOR Q0, FXMVECTOR Q1, FXMVECTOR T) noexcept; + XMVECTOR XM_CALLCONV XMQuaternionSquad(FXMVECTOR Q0, FXMVECTOR Q1, FXMVECTOR Q2, GXMVECTOR Q3, float t) noexcept; + XMVECTOR XM_CALLCONV XMQuaternionSquadV(FXMVECTOR Q0, FXMVECTOR Q1, FXMVECTOR Q2, GXMVECTOR Q3, HXMVECTOR T) noexcept; + void XM_CALLCONV XMQuaternionSquadSetup(_Out_ XMVECTOR* pA, _Out_ XMVECTOR* pB, _Out_ XMVECTOR* pC, _In_ FXMVECTOR Q0, _In_ FXMVECTOR Q1, _In_ FXMVECTOR Q2, _In_ GXMVECTOR Q3) noexcept; + XMVECTOR XM_CALLCONV XMQuaternionBaryCentric(FXMVECTOR Q0, FXMVECTOR Q1, FXMVECTOR Q2, float f, float g) noexcept; + XMVECTOR XM_CALLCONV XMQuaternionBaryCentricV(FXMVECTOR Q0, FXMVECTOR Q1, FXMVECTOR Q2, GXMVECTOR F, HXMVECTOR G) noexcept; + + XMVECTOR XM_CALLCONV XMQuaternionIdentity() noexcept; + + // Rotates about y-axis (Yaw), then x-axis (Pitch), then z-axis (Roll) + XMVECTOR XM_CALLCONV XMQuaternionRotationRollPitchYaw(float Pitch, float Yaw, float Roll) noexcept; + + // Rotates about y-axis (Angles.y), then x-axis (Angles.x), then z-axis (Angles.z) + XMVECTOR XM_CALLCONV XMQuaternionRotationRollPitchYawFromVector(FXMVECTOR Angles) noexcept; + + XMVECTOR XM_CALLCONV XMQuaternionRotationNormal(FXMVECTOR NormalAxis, float Angle) noexcept; + XMVECTOR XM_CALLCONV XMQuaternionRotationAxis(FXMVECTOR Axis, float Angle) noexcept; + XMVECTOR XM_CALLCONV XMQuaternionRotationMatrix(FXMMATRIX M) noexcept; + + void XM_CALLCONV XMQuaternionToAxisAngle(_Out_ XMVECTOR* pAxis, _Out_ float* pAngle, _In_ FXMVECTOR Q) noexcept; + + /**************************************************************************** + * + * Plane operations + * + ****************************************************************************/ + + bool XM_CALLCONV XMPlaneEqual(FXMVECTOR P1, FXMVECTOR P2) noexcept; + bool XM_CALLCONV XMPlaneNearEqual(FXMVECTOR P1, FXMVECTOR P2, FXMVECTOR Epsilon) noexcept; + bool XM_CALLCONV XMPlaneNotEqual(FXMVECTOR P1, FXMVECTOR P2) noexcept; + + bool XM_CALLCONV XMPlaneIsNaN(FXMVECTOR P) noexcept; + bool XM_CALLCONV XMPlaneIsInfinite(FXMVECTOR P) noexcept; + + XMVECTOR XM_CALLCONV XMPlaneDot(FXMVECTOR P, FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMPlaneDotCoord(FXMVECTOR P, FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMPlaneDotNormal(FXMVECTOR P, FXMVECTOR V) noexcept; + XMVECTOR XM_CALLCONV XMPlaneNormalizeEst(FXMVECTOR P) noexcept; + XMVECTOR XM_CALLCONV XMPlaneNormalize(FXMVECTOR P) noexcept; + XMVECTOR XM_CALLCONV XMPlaneIntersectLine(FXMVECTOR P, FXMVECTOR LinePoint1, FXMVECTOR LinePoint2) noexcept; + void XM_CALLCONV XMPlaneIntersectPlane(_Out_ XMVECTOR* pLinePoint1, _Out_ XMVECTOR* pLinePoint2, _In_ FXMVECTOR P1, _In_ FXMVECTOR P2) noexcept; + + // Transforms a plane given an inverse transpose matrix + XMVECTOR XM_CALLCONV XMPlaneTransform(FXMVECTOR P, FXMMATRIX ITM) noexcept; + + // Transforms an array of planes given an inverse transpose matrix + XMFLOAT4* XM_CALLCONV XMPlaneTransformStream(_Out_writes_bytes_(sizeof(XMFLOAT4) + OutputStride * (PlaneCount - 1)) XMFLOAT4* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(XMFLOAT4) + InputStride * (PlaneCount - 1)) const XMFLOAT4* pInputStream, + _In_ size_t InputStride, _In_ size_t PlaneCount, _In_ FXMMATRIX ITM) noexcept; + + XMVECTOR XM_CALLCONV XMPlaneFromPointNormal(FXMVECTOR Point, FXMVECTOR Normal) noexcept; + XMVECTOR XM_CALLCONV XMPlaneFromPoints(FXMVECTOR Point1, FXMVECTOR Point2, FXMVECTOR Point3) noexcept; + + 
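(Reviewer note, not part of the vendored header: a minimal usage sketch of the plane API declared above, assuming only the functions in this hunk plus XMVectorSet/XMVectorGetX declared earlier in the same header; the helper name and calling pattern are illustrative, not part of DirectXMath.)

    #include <DirectXMath.h>
    using namespace DirectX;

    // Signed distance from 'point' to the plane through p0, p1, p2.
    // XMPlaneFromPoints builds the plane, XMPlaneNormalize guarantees a unit
    // normal, and XMPlaneDotCoord then returns the signed distance in every lane.
    inline float XM_CALLCONV PlaneSignedDistance(FXMVECTOR p0, FXMVECTOR p1, FXMVECTOR p2, GXMVECTOR point)
    {
        XMVECTOR plane = XMPlaneNormalize(XMPlaneFromPoints(p0, p1, p2));
        return XMVectorGetX(XMPlaneDotCoord(plane, point));
    }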
/**************************************************************************** + * + * Color operations + * + ****************************************************************************/ + + bool XM_CALLCONV XMColorEqual(FXMVECTOR C1, FXMVECTOR C2) noexcept; + bool XM_CALLCONV XMColorNotEqual(FXMVECTOR C1, FXMVECTOR C2) noexcept; + bool XM_CALLCONV XMColorGreater(FXMVECTOR C1, FXMVECTOR C2) noexcept; + bool XM_CALLCONV XMColorGreaterOrEqual(FXMVECTOR C1, FXMVECTOR C2) noexcept; + bool XM_CALLCONV XMColorLess(FXMVECTOR C1, FXMVECTOR C2) noexcept; + bool XM_CALLCONV XMColorLessOrEqual(FXMVECTOR C1, FXMVECTOR C2) noexcept; + + bool XM_CALLCONV XMColorIsNaN(FXMVECTOR C) noexcept; + bool XM_CALLCONV XMColorIsInfinite(FXMVECTOR C) noexcept; + + XMVECTOR XM_CALLCONV XMColorNegative(FXMVECTOR C) noexcept; + XMVECTOR XM_CALLCONV XMColorModulate(FXMVECTOR C1, FXMVECTOR C2) noexcept; + XMVECTOR XM_CALLCONV XMColorAdjustSaturation(FXMVECTOR C, float Saturation) noexcept; + XMVECTOR XM_CALLCONV XMColorAdjustContrast(FXMVECTOR C, float Contrast) noexcept; + + XMVECTOR XM_CALLCONV XMColorRGBToHSL(FXMVECTOR rgb) noexcept; + XMVECTOR XM_CALLCONV XMColorHSLToRGB(FXMVECTOR hsl) noexcept; + + XMVECTOR XM_CALLCONV XMColorRGBToHSV(FXMVECTOR rgb) noexcept; + XMVECTOR XM_CALLCONV XMColorHSVToRGB(FXMVECTOR hsv) noexcept; + + XMVECTOR XM_CALLCONV XMColorRGBToYUV(FXMVECTOR rgb) noexcept; + XMVECTOR XM_CALLCONV XMColorYUVToRGB(FXMVECTOR yuv) noexcept; + + XMVECTOR XM_CALLCONV XMColorRGBToYUV_HD(FXMVECTOR rgb) noexcept; + XMVECTOR XM_CALLCONV XMColorYUVToRGB_HD(FXMVECTOR yuv) noexcept; + + XMVECTOR XM_CALLCONV XMColorRGBToYUV_UHD(FXMVECTOR rgb) noexcept; + XMVECTOR XM_CALLCONV XMColorYUVToRGB_UHD(FXMVECTOR yuv) noexcept; + + XMVECTOR XM_CALLCONV XMColorRGBToXYZ(FXMVECTOR rgb) noexcept; + XMVECTOR XM_CALLCONV XMColorXYZToRGB(FXMVECTOR xyz) noexcept; + + XMVECTOR XM_CALLCONV XMColorXYZToSRGB(FXMVECTOR xyz) noexcept; + XMVECTOR XM_CALLCONV XMColorSRGBToXYZ(FXMVECTOR srgb) noexcept; + + XMVECTOR XM_CALLCONV XMColorRGBToSRGB(FXMVECTOR rgb) noexcept; + XMVECTOR XM_CALLCONV XMColorSRGBToRGB(FXMVECTOR srgb) noexcept; + + + /**************************************************************************** + * + * Miscellaneous operations + * + ****************************************************************************/ + + bool XMVerifyCPUSupport() noexcept; + + XMVECTOR XM_CALLCONV XMFresnelTerm(FXMVECTOR CosIncidentAngle, FXMVECTOR RefractionIndex) noexcept; + + bool XMScalarNearEqual(float S1, float S2, float Epsilon) noexcept; + float XMScalarModAngle(float Value) noexcept; + + float XMScalarSin(float Value) noexcept; + float XMScalarSinEst(float Value) noexcept; + + float XMScalarCos(float Value) noexcept; + float XMScalarCosEst(float Value) noexcept; + + void XMScalarSinCos(_Out_ float* pSin, _Out_ float* pCos, float Value) noexcept; + void XMScalarSinCosEst(_Out_ float* pSin, _Out_ float* pCos, float Value) noexcept; + + float XMScalarASin(float Value) noexcept; + float XMScalarASinEst(float Value) noexcept; + + float XMScalarACos(float Value) noexcept; + float XMScalarACosEst(float Value) noexcept; + + /**************************************************************************** + * + * Templates + * + ****************************************************************************/ + +#if defined(__XNAMATH_H__) && defined(XMMin) +#undef XMMin +#undef XMMax +#endif + + template<class T> inline T XMMin(T a, T b) noexcept { return (a < b) ? a : b; } + template<class T> inline T XMMax(T a, T b) noexcept { return (a > b) ?
a : b; } + + //------------------------------------------------------------------------------ + +#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + +// PermuteHelper internal template (SSE only) + namespace Internal + { + // Slow path fallback for permutes that do not map to a single SSE shuffle opcode. + template<uint32_t Shuffle, bool WhichX, bool WhichY, bool WhichZ, bool WhichW> struct PermuteHelper + { + static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) noexcept + { + static const XMVECTORU32 selectMask = + { { { + WhichX ? 0xFFFFFFFF : 0, + WhichY ? 0xFFFFFFFF : 0, + WhichZ ? 0xFFFFFFFF : 0, + WhichW ? 0xFFFFFFFF : 0, + } } }; + + XMVECTOR shuffled1 = XM_PERMUTE_PS(v1, Shuffle); + XMVECTOR shuffled2 = XM_PERMUTE_PS(v2, Shuffle); + + XMVECTOR masked1 = _mm_andnot_ps(selectMask, shuffled1); + XMVECTOR masked2 = _mm_and_ps(selectMask, shuffled2); + + return _mm_or_ps(masked1, masked2); + } + }; + + // Fast path for permutes that only read from the first vector. + template<uint32_t Shuffle> struct PermuteHelper<Shuffle, false, false, false, false> + { + static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR) noexcept { return XM_PERMUTE_PS(v1, Shuffle); } + }; + + // Fast path for permutes that only read from the second vector. + template<uint32_t Shuffle> struct PermuteHelper<Shuffle, true, true, true, true> + { + static XMVECTOR XM_CALLCONV Permute(FXMVECTOR, FXMVECTOR v2) noexcept { return XM_PERMUTE_PS(v2, Shuffle); } + }; + + // Fast path for permutes that read XY from the first vector, ZW from the second. + template<uint32_t Shuffle> struct PermuteHelper<Shuffle, false, false, true, true> + { + static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) noexcept { return _mm_shuffle_ps(v1, v2, Shuffle); } + }; + + // Fast path for permutes that read XY from the second vector, ZW from the first. + template<uint32_t Shuffle> struct PermuteHelper<Shuffle, true, true, false, false> + { + static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) noexcept { return _mm_shuffle_ps(v2, v1, Shuffle); } + }; + } + +#endif // _XM_SSE_INTRINSICS_ && !_XM_NO_INTRINSICS_ + + // General permute template + template<uint32_t PermuteX, uint32_t PermuteY, uint32_t PermuteZ, uint32_t PermuteW> + inline XMVECTOR XM_CALLCONV XMVectorPermute(FXMVECTOR V1, FXMVECTOR V2) noexcept + { + static_assert(PermuteX <= 7, "PermuteX template parameter out of range"); + static_assert(PermuteY <= 7, "PermuteY template parameter out of range"); + static_assert(PermuteZ <= 7, "PermuteZ template parameter out of range"); + static_assert(PermuteW <= 7, "PermuteW template parameter out of range"); + +#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + constexpr uint32_t Shuffle = _MM_SHUFFLE(PermuteW & 3, PermuteZ & 3, PermuteY & 3, PermuteX & 3); + + constexpr bool WhichX = PermuteX > 3; + constexpr bool WhichY = PermuteY > 3; + constexpr bool WhichZ = PermuteZ > 3; + constexpr bool WhichW = PermuteW > 3; + + return Internal::PermuteHelper<Shuffle, WhichX, WhichY, WhichZ, WhichW>::Permute(V1, V2); +#else + + return XMVectorPermute(V1, V2, PermuteX, PermuteY, PermuteZ, PermuteW); + +#endif + } + + // Special-case permute templates + template<> constexpr XMVECTOR XM_CALLCONV XMVectorPermute<0, 1, 2, 3>(FXMVECTOR V1, FXMVECTOR) noexcept { return V1; } + template<> constexpr XMVECTOR XM_CALLCONV XMVectorPermute<4, 5, 6, 7>(FXMVECTOR, FXMVECTOR V2) noexcept { return V2; } + +#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 1, 4, 5>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_movelh_ps(V1, V2); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<6, 7, 2, 3>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_movehl_ps(V1, V2); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 4, 1, 5>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_unpacklo_ps(V1, V2); } +
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<2, 6, 3, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_unpackhi_ps(V1, V2); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<2, 3, 6, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(V1), _mm_castps_pd(V2))); } +#endif + +#if defined(_XM_SSE4_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4, 1, 2, 3>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_blend_ps(V1, V2, 0x1); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 5, 2, 3>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_blend_ps(V1, V2, 0x2); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4, 5, 2, 3>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_blend_ps(V1, V2, 0x3); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 1, 6, 3>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_blend_ps(V1, V2, 0x4); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4, 1, 6, 3>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_blend_ps(V1, V2, 0x5); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 5, 6, 3>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_blend_ps(V1, V2, 0x6); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4, 5, 6, 3>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_blend_ps(V1, V2, 0x7); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 1, 2, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_blend_ps(V1, V2, 0x8); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4, 1, 2, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_blend_ps(V1, V2, 0x9); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 5, 2, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_blend_ps(V1, V2, 0xA); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4, 5, 2, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_blend_ps(V1, V2, 0xB); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 1, 6, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_blend_ps(V1, V2, 0xC); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4, 1, 6, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_blend_ps(V1, V2, 0xD); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 5, 6, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return _mm_blend_ps(V1, V2, 0xE); } +#endif + +#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + + // If the indices are all in the range 0-3 or 4-7, then use XMVectorSwizzle instead + // The mirror cases are not spelled out here as the programmer can always swap the arguments + // (i.e. 
prefer permutes where the X element comes from the V1 vector instead of the V2 vector) + + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 1, 4, 5>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vcombine_f32(vget_low_f32(V1), vget_low_f32(V2)); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<1, 0, 4, 5>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vcombine_f32(vrev64_f32(vget_low_f32(V1)), vget_low_f32(V2)); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 1, 5, 4>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vcombine_f32(vget_low_f32(V1), vrev64_f32(vget_low_f32(V2))); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<1, 0, 5, 4>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vcombine_f32(vrev64_f32(vget_low_f32(V1)), vrev64_f32(vget_low_f32(V2))); } + + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<2, 3, 6, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vcombine_f32(vget_high_f32(V1), vget_high_f32(V2)); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<3, 2, 6, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vcombine_f32(vrev64_f32(vget_high_f32(V1)), vget_high_f32(V2)); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<2, 3, 7, 6>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vcombine_f32(vget_high_f32(V1), vrev64_f32(vget_high_f32(V2))); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<3, 2, 7, 6>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vcombine_f32(vrev64_f32(vget_high_f32(V1)), vrev64_f32(vget_high_f32(V2))); } + + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 1, 6, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vcombine_f32(vget_low_f32(V1), vget_high_f32(V2)); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<1, 0, 6, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vcombine_f32(vrev64_f32(vget_low_f32(V1)), vget_high_f32(V2)); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 1, 7, 6>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vcombine_f32(vget_low_f32(V1), vrev64_f32(vget_high_f32(V2))); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<1, 0, 7, 6>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vcombine_f32(vrev64_f32(vget_low_f32(V1)), vrev64_f32(vget_high_f32(V2))); } + + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<3, 2, 4, 5>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vcombine_f32(vrev64_f32(vget_high_f32(V1)), vget_low_f32(V2)); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<2, 3, 5, 4>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vcombine_f32(vget_high_f32(V1), vrev64_f32(vget_low_f32(V2))); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<3, 2, 5, 4>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vcombine_f32(vrev64_f32(vget_high_f32(V1)), vrev64_f32(vget_low_f32(V2))); } + + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 4, 2, 6>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vtrnq_f32(V1, V2).val[0]; } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<1, 5, 3, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vtrnq_f32(V1, V2).val[1]; } + + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 4, 1, 5>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vzipq_f32(V1, V2).val[0]; } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<2, 6, 3, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vzipq_f32(V1, V2).val[1]; } + + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 2, 4, 6>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vuzpq_f32(V1, V2).val[0]; } + template<> inline 
XMVECTOR XM_CALLCONV XMVectorPermute<1, 3, 5, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vuzpq_f32(V1, V2).val[1]; } + + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<1, 2, 3, 4>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vextq_f32(V1, V2, 1); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<2, 3, 4, 5>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vextq_f32(V1, V2, 2); } + template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<3, 4, 5, 6>(FXMVECTOR V1, FXMVECTOR V2) noexcept { return vextq_f32(V1, V2, 3); } + +#endif // _XM_ARM_NEON_INTRINSICS_ && !_XM_NO_INTRINSICS_ + + //------------------------------------------------------------------------------ + + // General swizzle template + template<uint32_t SwizzleX, uint32_t SwizzleY, uint32_t SwizzleZ, uint32_t SwizzleW> + inline XMVECTOR XM_CALLCONV XMVectorSwizzle(FXMVECTOR V) noexcept + { + static_assert(SwizzleX <= 3, "SwizzleX template parameter out of range"); + static_assert(SwizzleY <= 3, "SwizzleY template parameter out of range"); + static_assert(SwizzleZ <= 3, "SwizzleZ template parameter out of range"); + static_assert(SwizzleW <= 3, "SwizzleW template parameter out of range"); + +#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + return XM_PERMUTE_PS(V, _MM_SHUFFLE(SwizzleW, SwizzleZ, SwizzleY, SwizzleX)); +#else + + return XMVectorSwizzle(V, SwizzleX, SwizzleY, SwizzleZ, SwizzleW); + +#endif + } + + // Specialized swizzles + template<> constexpr XMVECTOR XM_CALLCONV XMVectorSwizzle<0, 1, 2, 3>(FXMVECTOR V) noexcept { return V; } + +#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0, 1, 0, 1>(FXMVECTOR V) noexcept { return _mm_movelh_ps(V, V); } + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2, 3, 2, 3>(FXMVECTOR V) noexcept { return _mm_movehl_ps(V, V); } + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0, 0, 1, 1>(FXMVECTOR V) noexcept { return _mm_unpacklo_ps(V, V); } + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2, 2, 3, 3>(FXMVECTOR V) noexcept { return _mm_unpackhi_ps(V, V); } +#endif + +#if defined(_XM_SSE3_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0, 0, 2, 2>(FXMVECTOR V) noexcept { return _mm_moveldup_ps(V); } + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1, 1, 3, 3>(FXMVECTOR V) noexcept { return _mm_movehdup_ps(V); } +#endif + +#if defined(_XM_AVX2_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) && defined(_XM_FAVOR_INTEL_) + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0, 0, 0, 0>(FXMVECTOR V) noexcept { return _mm_broadcastss_ps(V); } +#endif + +#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0, 0, 0, 0>(FXMVECTOR V) noexcept { return vdupq_lane_f32(vget_low_f32(V), 0); } + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1, 1, 1, 1>(FXMVECTOR V) noexcept { return vdupq_lane_f32(vget_low_f32(V), 1); } + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2, 2, 2, 2>(FXMVECTOR V) noexcept { return vdupq_lane_f32(vget_high_f32(V), 0); } + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<3, 3, 3, 3>(FXMVECTOR V) noexcept { return vdupq_lane_f32(vget_high_f32(V), 1); } + + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1, 0, 3, 2>(FXMVECTOR V) noexcept { return vrev64q_f32(V); } + + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0, 1, 0, 1>(FXMVECTOR V) noexcept { float32x2_t vt = vget_low_f32(V); return vcombine_f32(vt, vt); } + template<> inline XMVECTOR
XM_CALLCONV XMVectorSwizzle<2, 3, 2, 3>(FXMVECTOR V) noexcept { float32x2_t vt = vget_high_f32(V); return vcombine_f32(vt, vt); } + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1, 0, 1, 0>(FXMVECTOR V) noexcept { float32x2_t vt = vrev64_f32(vget_low_f32(V)); return vcombine_f32(vt, vt); } + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<3, 2, 3, 2>(FXMVECTOR V) noexcept { float32x2_t vt = vrev64_f32(vget_high_f32(V)); return vcombine_f32(vt, vt); } + + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0, 1, 3, 2>(FXMVECTOR V) noexcept { return vcombine_f32(vget_low_f32(V), vrev64_f32(vget_high_f32(V))); } + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1, 0, 2, 3>(FXMVECTOR V) noexcept { return vcombine_f32(vrev64_f32(vget_low_f32(V)), vget_high_f32(V)); } + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2, 3, 1, 0>(FXMVECTOR V) noexcept { return vcombine_f32(vget_high_f32(V), vrev64_f32(vget_low_f32(V))); } + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<3, 2, 0, 1>(FXMVECTOR V) noexcept { return vcombine_f32(vrev64_f32(vget_high_f32(V)), vget_low_f32(V)); } + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<3, 2, 1, 0>(FXMVECTOR V) noexcept { return vcombine_f32(vrev64_f32(vget_high_f32(V)), vrev64_f32(vget_low_f32(V))); } + + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0, 0, 2, 2>(FXMVECTOR V) noexcept { return vtrnq_f32(V, V).val[0]; } + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1, 1, 3, 3>(FXMVECTOR V) noexcept { return vtrnq_f32(V, V).val[1]; } + + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0, 0, 1, 1>(FXMVECTOR V) noexcept { return vzipq_f32(V, V).val[0]; } + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2, 2, 3, 3>(FXMVECTOR V) noexcept { return vzipq_f32(V, V).val[1]; } + + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0, 2, 0, 2>(FXMVECTOR V) noexcept { return vuzpq_f32(V, V).val[0]; } + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1, 3, 1, 3>(FXMVECTOR V) noexcept { return vuzpq_f32(V, V).val[1]; } + + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1, 2, 3, 0>(FXMVECTOR V) noexcept { return vextq_f32(V, V, 1); } + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2, 3, 0, 1>(FXMVECTOR V) noexcept { return vextq_f32(V, V, 2); } + template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<3, 0, 1, 2>(FXMVECTOR V) noexcept { return vextq_f32(V, V, 3); } + +#endif // _XM_ARM_NEON_INTRINSICS_ && !_XM_NO_INTRINSICS_ + + //------------------------------------------------------------------------------ + + template<uint32_t Elements> + inline XMVECTOR XM_CALLCONV XMVectorShiftLeft(FXMVECTOR V1, FXMVECTOR V2) noexcept + { + static_assert(Elements < 4, "Elements template parameter out of range"); + return XMVectorPermute<Elements, (Elements + 1), (Elements + 2), (Elements + 3)>(V1, V2); + } + + template<uint32_t Elements> + inline XMVECTOR XM_CALLCONV XMVectorRotateLeft(FXMVECTOR V) noexcept + { + static_assert(Elements < 4, "Elements template parameter out of range"); + return XMVectorSwizzle<Elements & 3, (Elements + 1) & 3, (Elements + 2) & 3, (Elements + 3) & 3>(V); + } + + template<uint32_t Elements> + inline XMVECTOR XM_CALLCONV XMVectorRotateRight(FXMVECTOR V) noexcept + { + static_assert(Elements < 4, "Elements template parameter out of range"); + return XMVectorSwizzle<(4 - Elements) & 3, (5 - Elements) & 3, (6 - Elements) & 3, (7 - Elements) & 3>(V); + } + + template<uint32_t VSLeftRotateElements, uint32_t Select0, uint32_t Select1, uint32_t Select2, uint32_t Select3> + inline XMVECTOR XM_CALLCONV XMVectorInsert(FXMVECTOR VD, FXMVECTOR VS) noexcept + { + XMVECTOR Control = XMVectorSelectControl(Select0 & 1, Select1 & 1, Select2 & 1, Select3 & 1); + return XMVectorSelect(VD, XMVectorRotateLeft<VSLeftRotateElements>(VS), Control); + } + +
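(Reviewer note, not part of the vendored header: a short sketch of how the compile-time permute/swizzle templates above are typically used; the values and the function name are illustrative only. Because the element indices are template parameters, most combinations collapse to a single shuffle instruction via the specializations listed above, e.g. XMVectorPermute<0, 1, 4, 5> maps to _mm_movelh_ps on SSE.)

    #include <DirectXMath.h>
    using namespace DirectX;

    inline void PermuteSwizzleExample()
    {
        XMVECTOR a = XMVectorSet(1.0f, 2.0f, 3.0f, 4.0f);
        XMVECTOR b = XMVectorSet(5.0f, 6.0f, 7.0f, 8.0f);

        XMVECTOR reversed  = XMVectorSwizzle<3, 2, 1, 0>(a);    // (4, 3, 2, 1)
        XMVECTOR lowHalves = XMVectorPermute<0, 1, 4, 5>(a, b); // (1, 2, 5, 6): indices 0-3 address a, 4-7 address b
        XMVECTOR rotated   = XMVectorRotateLeft<1>(a);          // (2, 3, 4, 1)

        (void)reversed; (void)lowHalves; (void)rotated;
    }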
/**************************************************************************** + * + * Globals + * + ****************************************************************************/ + + // The purpose of the following global constants is to prevent redundant + // reloading of the constants when they are referenced by more than one + // separate inline math routine called within the same function. Declaring + // a constant locally within a routine is sufficient to prevent redundant + // reloads of that constant when that single routine is called multiple + // times in a function, but if the constant is used (and declared) in a + // separate math routine it would be reloaded. + +#ifndef XMGLOBALCONST +#if defined(__GNUC__) && !defined(__MINGW32__) +#define XMGLOBALCONST extern const __attribute__((weak)) +#else +#define XMGLOBALCONST extern const __declspec(selectany) +#endif +#endif + + XMGLOBALCONST XMVECTORF32 g_XMSinCoefficients0 = { { { -0.16666667f, +0.0083333310f, -0.00019840874f, +2.7525562e-06f } } }; + XMGLOBALCONST XMVECTORF32 g_XMSinCoefficients1 = { { { -2.3889859e-08f, -0.16665852f /*Est1*/, +0.0083139502f /*Est2*/, -0.00018524670f /*Est3*/ } } }; + XMGLOBALCONST XMVECTORF32 g_XMCosCoefficients0 = { { { -0.5f, +0.041666638f, -0.0013888378f, +2.4760495e-05f } } }; + XMGLOBALCONST XMVECTORF32 g_XMCosCoefficients1 = { { { -2.6051615e-07f, -0.49992746f /*Est1*/, +0.041493919f /*Est2*/, -0.0012712436f /*Est3*/ } } }; + XMGLOBALCONST XMVECTORF32 g_XMTanCoefficients0 = { { { 1.0f, 0.333333333f, 0.133333333f, 5.396825397e-2f } } }; + XMGLOBALCONST XMVECTORF32 g_XMTanCoefficients1 = { { { 2.186948854e-2f, 8.863235530e-3f, 3.592128167e-3f, 1.455834485e-3f } } }; + XMGLOBALCONST XMVECTORF32 g_XMTanCoefficients2 = { { { 5.900274264e-4f, 2.391290764e-4f, 9.691537707e-5f, 3.927832950e-5f } } }; + XMGLOBALCONST XMVECTORF32 g_XMArcCoefficients0 = { { { +1.5707963050f, -0.2145988016f, +0.0889789874f, -0.0501743046f } } }; + XMGLOBALCONST XMVECTORF32 g_XMArcCoefficients1 = { { { +0.0308918810f, -0.0170881256f, +0.0066700901f, -0.0012624911f } } }; + XMGLOBALCONST XMVECTORF32 g_XMATanCoefficients0 = { { { -0.3333314528f, +0.1999355085f, -0.1420889944f, +0.1065626393f } } }; + XMGLOBALCONST XMVECTORF32 g_XMATanCoefficients1 = { { { -0.0752896400f, +0.0429096138f, -0.0161657367f, +0.0028662257f } } }; + XMGLOBALCONST XMVECTORF32 g_XMATanEstCoefficients0 = { { { +0.999866f, +0.999866f, +0.999866f, +0.999866f } } }; + XMGLOBALCONST XMVECTORF32 g_XMATanEstCoefficients1 = { { { -0.3302995f, +0.180141f, -0.085133f, +0.0208351f } } }; + XMGLOBALCONST XMVECTORF32 g_XMTanEstCoefficients = { { { 2.484f, -1.954923183e-1f, 2.467401101f, XM_1DIVPI } } }; + XMGLOBALCONST XMVECTORF32 g_XMArcEstCoefficients = { { { +1.5707288f, -0.2121144f, +0.0742610f, -0.0187293f } } }; + XMGLOBALCONST XMVECTORF32 g_XMPiConstants0 = { { { XM_PI, XM_2PI, XM_1DIVPI, XM_1DIV2PI } } }; + XMGLOBALCONST XMVECTORF32 g_XMIdentityR0 = { { { 1.0f, 0.0f, 0.0f, 0.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMIdentityR1 = { { { 0.0f, 1.0f, 0.0f, 0.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMIdentityR2 = { { { 0.0f, 0.0f, 1.0f, 0.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMIdentityR3 = { { { 0.0f, 0.0f, 0.0f, 1.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMNegIdentityR0 = { { { -1.0f, 0.0f, 0.0f, 0.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMNegIdentityR1 = { { { 0.0f, -1.0f, 0.0f, 0.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMNegIdentityR2 = { { { 0.0f, 0.0f, -1.0f, 0.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMNegIdentityR3 = { { { 0.0f, 0.0f, 0.0f, -1.0f } } 
}; + XMGLOBALCONST XMVECTORU32 g_XMNegativeZero = { { { 0x80000000, 0x80000000, 0x80000000, 0x80000000 } } }; + XMGLOBALCONST XMVECTORU32 g_XMNegate3 = { { { 0x80000000, 0x80000000, 0x80000000, 0x00000000 } } }; + XMGLOBALCONST XMVECTORU32 g_XMMaskXY = { { { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 } } }; + XMGLOBALCONST XMVECTORU32 g_XMMask3 = { { { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 } } }; + XMGLOBALCONST XMVECTORU32 g_XMMaskX = { { { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 } } }; + XMGLOBALCONST XMVECTORU32 g_XMMaskY = { { { 0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000 } } }; + XMGLOBALCONST XMVECTORU32 g_XMMaskZ = { { { 0x00000000, 0x00000000, 0xFFFFFFFF, 0x00000000 } } }; + XMGLOBALCONST XMVECTORU32 g_XMMaskW = { { { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF } } }; + XMGLOBALCONST XMVECTORF32 g_XMOne = { { { 1.0f, 1.0f, 1.0f, 1.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMOne3 = { { { 1.0f, 1.0f, 1.0f, 0.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMZero = { { { 0.0f, 0.0f, 0.0f, 0.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMTwo = { { { 2.f, 2.f, 2.f, 2.f } } }; + XMGLOBALCONST XMVECTORF32 g_XMFour = { { { 4.f, 4.f, 4.f, 4.f } } }; + XMGLOBALCONST XMVECTORF32 g_XMSix = { { { 6.f, 6.f, 6.f, 6.f } } }; + XMGLOBALCONST XMVECTORF32 g_XMNegativeOne = { { { -1.0f, -1.0f, -1.0f, -1.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMOneHalf = { { { 0.5f, 0.5f, 0.5f, 0.5f } } }; + XMGLOBALCONST XMVECTORF32 g_XMNegativeOneHalf = { { { -0.5f, -0.5f, -0.5f, -0.5f } } }; + XMGLOBALCONST XMVECTORF32 g_XMNegativeTwoPi = { { { -XM_2PI, -XM_2PI, -XM_2PI, -XM_2PI } } }; + XMGLOBALCONST XMVECTORF32 g_XMNegativePi = { { { -XM_PI, -XM_PI, -XM_PI, -XM_PI } } }; + XMGLOBALCONST XMVECTORF32 g_XMHalfPi = { { { XM_PIDIV2, XM_PIDIV2, XM_PIDIV2, XM_PIDIV2 } } }; + XMGLOBALCONST XMVECTORF32 g_XMPi = { { { XM_PI, XM_PI, XM_PI, XM_PI } } }; + XMGLOBALCONST XMVECTORF32 g_XMReciprocalPi = { { { XM_1DIVPI, XM_1DIVPI, XM_1DIVPI, XM_1DIVPI } } }; + XMGLOBALCONST XMVECTORF32 g_XMTwoPi = { { { XM_2PI, XM_2PI, XM_2PI, XM_2PI } } }; + XMGLOBALCONST XMVECTORF32 g_XMReciprocalTwoPi = { { { XM_1DIV2PI, XM_1DIV2PI, XM_1DIV2PI, XM_1DIV2PI } } }; + XMGLOBALCONST XMVECTORF32 g_XMEpsilon = { { { 1.192092896e-7f, 1.192092896e-7f, 1.192092896e-7f, 1.192092896e-7f } } }; + XMGLOBALCONST XMVECTORI32 g_XMInfinity = { { { 0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000 } } }; + XMGLOBALCONST XMVECTORI32 g_XMQNaN = { { { 0x7FC00000, 0x7FC00000, 0x7FC00000, 0x7FC00000 } } }; + XMGLOBALCONST XMVECTORI32 g_XMQNaNTest = { { { 0x007FFFFF, 0x007FFFFF, 0x007FFFFF, 0x007FFFFF } } }; + XMGLOBALCONST XMVECTORI32 g_XMAbsMask = { { { 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF } } }; + XMGLOBALCONST XMVECTORI32 g_XMFltMin = { { { 0x00800000, 0x00800000, 0x00800000, 0x00800000 } } }; + XMGLOBALCONST XMVECTORI32 g_XMFltMax = { { { 0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF } } }; + XMGLOBALCONST XMVECTORU32 g_XMNegOneMask = { { { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF } } }; + XMGLOBALCONST XMVECTORU32 g_XMMaskA8R8G8B8 = { { { 0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000 } } }; + XMGLOBALCONST XMVECTORU32 g_XMFlipA8R8G8B8 = { { { 0x00000000, 0x00000000, 0x00000000, 0x80000000 } } }; + XMGLOBALCONST XMVECTORF32 g_XMFixAA8R8G8B8 = { { { 0.0f, 0.0f, 0.0f, float(0x80000000U) } } }; + XMGLOBALCONST XMVECTORF32 g_XMNormalizeA8R8G8B8 = { { { 1.0f / (255.0f * float(0x10000)), 1.0f / (255.0f * float(0x100)), 1.0f / 255.0f, 1.0f / (255.0f * float(0x1000000)) } } }; + XMGLOBALCONST XMVECTORU32 g_XMMaskA2B10G10R10 = { { { 0x000003FF, 
0x000FFC00, 0x3FF00000, 0xC0000000 } } }; + XMGLOBALCONST XMVECTORU32 g_XMFlipA2B10G10R10 = { { { 0x00000200, 0x00080000, 0x20000000, 0x80000000 } } }; + XMGLOBALCONST XMVECTORF32 g_XMFixAA2B10G10R10 = { { { -512.0f, -512.0f * float(0x400), -512.0f * float(0x100000), float(0x80000000U) } } }; + XMGLOBALCONST XMVECTORF32 g_XMNormalizeA2B10G10R10 = { { { 1.0f / 511.0f, 1.0f / (511.0f * float(0x400)), 1.0f / (511.0f * float(0x100000)), 1.0f / (3.0f * float(0x40000000)) } } }; + XMGLOBALCONST XMVECTORU32 g_XMMaskX16Y16 = { { { 0x0000FFFF, 0xFFFF0000, 0x00000000, 0x00000000 } } }; + XMGLOBALCONST XMVECTORI32 g_XMFlipX16Y16 = { { { 0x00008000, 0x00000000, 0x00000000, 0x00000000 } } }; + XMGLOBALCONST XMVECTORF32 g_XMFixX16Y16 = { { { -32768.0f, 0.0f, 0.0f, 0.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMNormalizeX16Y16 = { { { 1.0f / 32767.0f, 1.0f / (32767.0f * 65536.0f), 0.0f, 0.0f } } }; + XMGLOBALCONST XMVECTORU32 g_XMMaskX16Y16Z16W16 = { { { 0x0000FFFF, 0x0000FFFF, 0xFFFF0000, 0xFFFF0000 } } }; + XMGLOBALCONST XMVECTORI32 g_XMFlipX16Y16Z16W16 = { { { 0x00008000, 0x00008000, 0x00000000, 0x00000000 } } }; + XMGLOBALCONST XMVECTORF32 g_XMFixX16Y16Z16W16 = { { { -32768.0f, -32768.0f, 0.0f, 0.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMNormalizeX16Y16Z16W16 = { { { 1.0f / 32767.0f, 1.0f / 32767.0f, 1.0f / (32767.0f * 65536.0f), 1.0f / (32767.0f * 65536.0f) } } }; + XMGLOBALCONST XMVECTORF32 g_XMNoFraction = { { { 8388608.0f, 8388608.0f, 8388608.0f, 8388608.0f } } }; + XMGLOBALCONST XMVECTORI32 g_XMMaskByte = { { { 0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF } } }; + XMGLOBALCONST XMVECTORF32 g_XMNegateX = { { { -1.0f, 1.0f, 1.0f, 1.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMNegateY = { { { 1.0f, -1.0f, 1.0f, 1.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMNegateZ = { { { 1.0f, 1.0f, -1.0f, 1.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMNegateW = { { { 1.0f, 1.0f, 1.0f, -1.0f } } }; + XMGLOBALCONST XMVECTORU32 g_XMSelect0101 = { { { XM_SELECT_0, XM_SELECT_1, XM_SELECT_0, XM_SELECT_1 } } }; + XMGLOBALCONST XMVECTORU32 g_XMSelect1010 = { { { XM_SELECT_1, XM_SELECT_0, XM_SELECT_1, XM_SELECT_0 } } }; + XMGLOBALCONST XMVECTORI32 g_XMOneHalfMinusEpsilon = { { { 0x3EFFFFFD, 0x3EFFFFFD, 0x3EFFFFFD, 0x3EFFFFFD } } }; + XMGLOBALCONST XMVECTORU32 g_XMSelect1000 = { { { XM_SELECT_1, XM_SELECT_0, XM_SELECT_0, XM_SELECT_0 } } }; + XMGLOBALCONST XMVECTORU32 g_XMSelect1100 = { { { XM_SELECT_1, XM_SELECT_1, XM_SELECT_0, XM_SELECT_0 } } }; + XMGLOBALCONST XMVECTORU32 g_XMSelect1110 = { { { XM_SELECT_1, XM_SELECT_1, XM_SELECT_1, XM_SELECT_0 } } }; + XMGLOBALCONST XMVECTORU32 g_XMSelect1011 = { { { XM_SELECT_1, XM_SELECT_0, XM_SELECT_1, XM_SELECT_1 } } }; + XMGLOBALCONST XMVECTORF32 g_XMFixupY16 = { { { 1.0f, 1.0f / 65536.0f, 0.0f, 0.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMFixupY16W16 = { { { 1.0f, 1.0f, 1.0f / 65536.0f, 1.0f / 65536.0f } } }; + XMGLOBALCONST XMVECTORU32 g_XMFlipY = { { { 0, 0x80000000, 0, 0 } } }; + XMGLOBALCONST XMVECTORU32 g_XMFlipZ = { { { 0, 0, 0x80000000, 0 } } }; + XMGLOBALCONST XMVECTORU32 g_XMFlipW = { { { 0, 0, 0, 0x80000000 } } }; + XMGLOBALCONST XMVECTORU32 g_XMFlipYZ = { { { 0, 0x80000000, 0x80000000, 0 } } }; + XMGLOBALCONST XMVECTORU32 g_XMFlipZW = { { { 0, 0, 0x80000000, 0x80000000 } } }; + XMGLOBALCONST XMVECTORU32 g_XMFlipYW = { { { 0, 0x80000000, 0, 0x80000000 } } }; + XMGLOBALCONST XMVECTORI32 g_XMMaskDec4 = { { { 0x3FF, 0x3FF << 10, 0x3FF << 20, static_cast(0xC0000000) } } }; + XMGLOBALCONST XMVECTORI32 g_XMXorDec4 = { { { 0x200, 0x200 << 10, 0x200 << 20, 0 } } }; + XMGLOBALCONST 
XMVECTORF32 g_XMAddUDec4 = { { { 0, 0, 0, 32768.0f * 65536.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMAddDec4 = { { { -512.0f, -512.0f * 1024.0f, -512.0f * 1024.0f * 1024.0f, 0 } } }; + XMGLOBALCONST XMVECTORF32 g_XMMulDec4 = { { { 1.0f, 1.0f / 1024.0f, 1.0f / (1024.0f * 1024.0f), 1.0f / (1024.0f * 1024.0f * 1024.0f) } } }; + XMGLOBALCONST XMVECTORU32 g_XMMaskByte4 = { { { 0xFF, 0xFF00, 0xFF0000, 0xFF000000 } } }; + XMGLOBALCONST XMVECTORI32 g_XMXorByte4 = { { { 0x80, 0x8000, 0x800000, 0x00000000 } } }; + XMGLOBALCONST XMVECTORF32 g_XMAddByte4 = { { { -128.0f, -128.0f * 256.0f, -128.0f * 65536.0f, 0 } } }; + XMGLOBALCONST XMVECTORF32 g_XMFixUnsigned = { { { 32768.0f * 65536.0f, 32768.0f * 65536.0f, 32768.0f * 65536.0f, 32768.0f * 65536.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMMaxInt = { { { 65536.0f * 32768.0f - 128.0f, 65536.0f * 32768.0f - 128.0f, 65536.0f * 32768.0f - 128.0f, 65536.0f * 32768.0f - 128.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMMaxUInt = { { { 65536.0f * 65536.0f - 256.0f, 65536.0f * 65536.0f - 256.0f, 65536.0f * 65536.0f - 256.0f, 65536.0f * 65536.0f - 256.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMUnsignedFix = { { { 32768.0f * 65536.0f, 32768.0f * 65536.0f, 32768.0f * 65536.0f, 32768.0f * 65536.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMsrgbScale = { { { 12.92f, 12.92f, 12.92f, 1.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMsrgbA = { { { 0.055f, 0.055f, 0.055f, 0.0f } } }; + XMGLOBALCONST XMVECTORF32 g_XMsrgbA1 = { { { 1.055f, 1.055f, 1.055f, 1.0f } } }; + XMGLOBALCONST XMVECTORI32 g_XMExponentBias = { { { 127, 127, 127, 127 } } }; + XMGLOBALCONST XMVECTORI32 g_XMSubnormalExponent = { { { -126, -126, -126, -126 } } }; + XMGLOBALCONST XMVECTORI32 g_XMNumTrailing = { { { 23, 23, 23, 23 } } }; + XMGLOBALCONST XMVECTORI32 g_XMMinNormal = { { { 0x00800000, 0x00800000, 0x00800000, 0x00800000 } } }; + XMGLOBALCONST XMVECTORU32 g_XMNegInfinity = { { { 0xFF800000, 0xFF800000, 0xFF800000, 0xFF800000 } } }; + XMGLOBALCONST XMVECTORU32 g_XMNegQNaN = { { { 0xFFC00000, 0xFFC00000, 0xFFC00000, 0xFFC00000 } } }; + XMGLOBALCONST XMVECTORI32 g_XMBin128 = { { { 0x43000000, 0x43000000, 0x43000000, 0x43000000 } } }; + XMGLOBALCONST XMVECTORU32 g_XMBinNeg150 = { { { 0xC3160000, 0xC3160000, 0xC3160000, 0xC3160000 } } }; + XMGLOBALCONST XMVECTORI32 g_XM253 = { { { 253, 253, 253, 253 } } }; + XMGLOBALCONST XMVECTORF32 g_XMExpEst1 = { { { -6.93147182e-1f, -6.93147182e-1f, -6.93147182e-1f, -6.93147182e-1f } } }; + XMGLOBALCONST XMVECTORF32 g_XMExpEst2 = { { { +2.40226462e-1f, +2.40226462e-1f, +2.40226462e-1f, +2.40226462e-1f } } }; + XMGLOBALCONST XMVECTORF32 g_XMExpEst3 = { { { -5.55036440e-2f, -5.55036440e-2f, -5.55036440e-2f, -5.55036440e-2f } } }; + XMGLOBALCONST XMVECTORF32 g_XMExpEst4 = { { { +9.61597636e-3f, +9.61597636e-3f, +9.61597636e-3f, +9.61597636e-3f } } }; + XMGLOBALCONST XMVECTORF32 g_XMExpEst5 = { { { -1.32823968e-3f, -1.32823968e-3f, -1.32823968e-3f, -1.32823968e-3f } } }; + XMGLOBALCONST XMVECTORF32 g_XMExpEst6 = { { { +1.47491097e-4f, +1.47491097e-4f, +1.47491097e-4f, +1.47491097e-4f } } }; + XMGLOBALCONST XMVECTORF32 g_XMExpEst7 = { { { -1.08635004e-5f, -1.08635004e-5f, -1.08635004e-5f, -1.08635004e-5f } } }; + XMGLOBALCONST XMVECTORF32 g_XMLogEst0 = { { { +1.442693f, +1.442693f, +1.442693f, +1.442693f } } }; + XMGLOBALCONST XMVECTORF32 g_XMLogEst1 = { { { -0.721242f, -0.721242f, -0.721242f, -0.721242f } } }; + XMGLOBALCONST XMVECTORF32 g_XMLogEst2 = { { { +0.479384f, +0.479384f, +0.479384f, +0.479384f } } }; + XMGLOBALCONST XMVECTORF32 g_XMLogEst3 = { { { -0.350295f, -0.350295f, 
-0.350295f, -0.350295f } } }; + XMGLOBALCONST XMVECTORF32 g_XMLogEst4 = { { { +0.248590f, +0.248590f, +0.248590f, +0.248590f } } }; + XMGLOBALCONST XMVECTORF32 g_XMLogEst5 = { { { -0.145700f, -0.145700f, -0.145700f, -0.145700f } } }; + XMGLOBALCONST XMVECTORF32 g_XMLogEst6 = { { { +0.057148f, +0.057148f, +0.057148f, +0.057148f } } }; + XMGLOBALCONST XMVECTORF32 g_XMLogEst7 = { { { -0.010578f, -0.010578f, -0.010578f, -0.010578f } } }; + XMGLOBALCONST XMVECTORF32 g_XMLgE = { { { +1.442695f, +1.442695f, +1.442695f, +1.442695f } } }; + XMGLOBALCONST XMVECTORF32 g_XMInvLgE = { { { +6.93147182e-1f, +6.93147182e-1f, +6.93147182e-1f, +6.93147182e-1f } } }; + XMGLOBALCONST XMVECTORF32 g_XMLg10 = { { { +3.321928f, +3.321928f, +3.321928f, +3.321928f } } }; + XMGLOBALCONST XMVECTORF32 g_XMInvLg10 = { { { +3.010299956e-1f, +3.010299956e-1f, +3.010299956e-1f, +3.010299956e-1f } } }; + XMGLOBALCONST XMVECTORF32 g_UByteMax = { { { 255.0f, 255.0f, 255.0f, 255.0f } } }; + XMGLOBALCONST XMVECTORF32 g_ByteMin = { { { -127.0f, -127.0f, -127.0f, -127.0f } } }; + XMGLOBALCONST XMVECTORF32 g_ByteMax = { { { 127.0f, 127.0f, 127.0f, 127.0f } } }; + XMGLOBALCONST XMVECTORF32 g_ShortMin = { { { -32767.0f, -32767.0f, -32767.0f, -32767.0f } } }; + XMGLOBALCONST XMVECTORF32 g_ShortMax = { { { 32767.0f, 32767.0f, 32767.0f, 32767.0f } } }; + XMGLOBALCONST XMVECTORF32 g_UShortMax = { { { 65535.0f, 65535.0f, 65535.0f, 65535.0f } } }; + + /**************************************************************************** + * + * Implementation + * + ****************************************************************************/ + +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable:4068 4214 4204 4365 4616 4640 6001 6101) + // C4068/4616: ignore unknown pragmas + // C4214/4204: nonstandard extension used + // C4365/4640: Off by default noise + // C6001/6101: False positives +#endif + +#ifdef _PREFAST_ +#pragma prefast(push) +#pragma prefast(disable : 25000, "FXMVECTOR is 16 bytes") +#pragma prefast(disable : 26495, "Union initialization confuses /analyze") +#endif + +#ifdef __clang__ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wundefined-reinterpret-cast" +#endif + +//------------------------------------------------------------------------------ + + inline XMVECTOR XM_CALLCONV XMVectorSetBinaryConstant(uint32_t C0, uint32_t C1, uint32_t C2, uint32_t C3) noexcept + { +#if defined(_XM_NO_INTRINSICS_) + XMVECTORU32 vResult; + vResult.u[0] = (0 - (C0 & 1)) & 0x3F800000; + vResult.u[1] = (0 - (C1 & 1)) & 0x3F800000; + vResult.u[2] = (0 - (C2 & 1)) & 0x3F800000; + vResult.u[3] = (0 - (C3 & 1)) & 0x3F800000; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTORU32 vResult; + vResult.u[0] = (0 - (C0 & 1)) & 0x3F800000; + vResult.u[1] = (0 - (C1 & 1)) & 0x3F800000; + vResult.u[2] = (0 - (C2 & 1)) & 0x3F800000; + vResult.u[3] = (0 - (C3 & 1)) & 0x3F800000; + return vResult.v; +#else // XM_SSE_INTRINSICS_ + static const XMVECTORU32 g_vMask1 = { { { 1, 1, 1, 1 } } }; + // Move the parms to a vector + __m128i vTemp = _mm_set_epi32(static_cast(C3), static_cast(C2), static_cast(C1), static_cast(C0)); + // Mask off the low bits + vTemp = _mm_and_si128(vTemp, g_vMask1); + // 0xFFFFFFFF on true bits + vTemp = _mm_cmpeq_epi32(vTemp, g_vMask1); + // 0xFFFFFFFF -> 1.0f, 0x00000000 -> 0.0f + vTemp = _mm_and_si128(vTemp, g_XMOne); + return _mm_castsi128_ps(vTemp); +#endif + } + + //------------------------------------------------------------------------------ + + inline XMVECTOR XM_CALLCONV 
XMVectorSplatConstant(int32_t IntConstant, uint32_t DivExponent) noexcept
+    {
+        assert(IntConstant >= -16 && IntConstant <= 15);
+        assert(DivExponent < 32);
+#if defined(_XM_NO_INTRINSICS_)
+
+        using DirectX::XMConvertVectorIntToFloat;
+
+        XMVECTORI32 V = { { { IntConstant, IntConstant, IntConstant, IntConstant } } };
+        return XMConvertVectorIntToFloat(V.v, DivExponent);
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+        // Splat the int
+        int32x4_t vScale = vdupq_n_s32(IntConstant);
+        // Convert to a float
+        XMVECTOR vResult = vcvtq_f32_s32(vScale);
+        // Convert DivExponent into 1.0f/(1<<DivExponent)
+        uint32_t uScale = 0x3F800000U - (DivExponent << 23);
+        // Splat the scalar value (It's really a float)
+        vScale = vreinterpretq_s32_u32(vdupq_n_u32(uScale));
+        // Multiply by the reciprocal (Perform a right shift by DivExponent)
+        vResult = vmulq_f32(vResult, reinterpret_cast<const float32x4_t*>(&vScale)[0]);
+        return vResult;
+#else // XM_SSE_INTRINSICS_
+        // Splat the int
+        __m128i vScale = _mm_set1_epi32(IntConstant);
+        // Convert to a float
+        XMVECTOR vResult = _mm_cvtepi32_ps(vScale);
+        // Convert DivExponent into 1.0f/(1<<DivExponent)
+        uint32_t uScale = 0x3F800000U - (DivExponent << 23);
+        // Splat the scalar value (It's really a float)
+        vScale = _mm_set1_epi32(static_cast<int>(uScale));
+        // Multiply by the reciprocal (Perform a right shift by DivExponent)
+        vResult = _mm_mul_ps(vResult, _mm_castsi128_ps(vScale));
+        return vResult;
+#endif
+    }
+
+    //------------------------------------------------------------------------------
+
+    inline XMVECTOR XM_CALLCONV XMVectorSplatConstantInt(int32_t IntConstant) noexcept
+    {
+        assert(IntConstant >= -16 && IntConstant <= 15);
+#if defined(_XM_NO_INTRINSICS_)
+
+        XMVECTORI32 V = { { { IntConstant, IntConstant, IntConstant, IntConstant } } };
+        return V.v;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+        int32x4_t V = vdupq_n_s32(IntConstant);
+        return reinterpret_cast<float32x4_t*>(&V)[0];
+#else // XM_SSE_INTRINSICS_
+        __m128i V = _mm_set1_epi32(IntConstant);
+        return _mm_castsi128_ps(V);
+#endif
+    }
+
+#include "DirectXMathConvert.inl"
+#include "DirectXMathVector.inl"
+#include "DirectXMathMatrix.inl"
+#include "DirectXMathMisc.inl"
+
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+#ifdef _PREFAST_
+#pragma prefast(pop)
+#endif
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+} // namespace DirectX
+
diff --git a/src/thirdparty/DirectXMath-dec2022/Inc/DirectXMathConvert.inl b/src/thirdparty/DirectXMath-dec2022/Inc/DirectXMathConvert.inl
new file mode 100644
index 000000000..3ca86d5ff
--- /dev/null
+++ b/src/thirdparty/DirectXMath-dec2022/Inc/DirectXMathConvert.inl
@@ -0,0 +1,2191 @@
+//-------------------------------------------------------------------------------------
+// DirectXMathConvert.inl -- SIMD C++ Math library
+//
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
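The SSE paths above avoid a divide when they need 1.0f/(1 << DivExponent): 0x3F800000 is the bit pattern of 1.0f, and subtracting (DivExponent << 23) lowers the biased exponent field by DivExponent, halving the value that many times. A minimal scalar sketch of that trick, standalone and not part of the vendored file (the helper name ReciprocalPow2 is illustrative only):

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <cstring>

// Scalar equivalent of the exponent-bias trick used by XMVectorSplatConstant /
// XMConvertVectorIntToFloat: build 1.0f / (1 << DivExponent) without a division.
static float ReciprocalPow2(uint32_t DivExponent)
{
    assert(DivExponent < 32);
    uint32_t uScale = 0x3F800000U - (DivExponent << 23); // 1.0f with its exponent lowered by DivExponent
    float fScale;
    std::memcpy(&fScale, &uScale, sizeof(fScale));        // reinterpret the bits as a float
    return fScale;
}

int main()
{
    // Mirrors one lane of XMVectorSplatConstant(8, 3): 8 / 2^3 == 1.0
    std::printf("%f\n", 8.0f * ReciprocalPow2(3)); // prints 1.000000
    std::printf("%f\n", ReciprocalPow2(4));        // prints 0.062500
    return 0;
}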
+//
+// http://go.microsoft.com/fwlink/?LinkID=615560
+//-------------------------------------------------------------------------------------
+
+#pragma once
+
+/****************************************************************************
+ *
+ * Data conversion
+ *
+ ****************************************************************************/
+
+ //------------------------------------------------------------------------------
+
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable:4701)
+// C4701: false positives
+#endif
+
+inline XMVECTOR XM_CALLCONV XMConvertVectorIntToFloat
+(
+    FXMVECTOR VInt,
+    uint32_t DivExponent
+) noexcept
+{
+    assert(DivExponent < 32);
+#if defined(_XM_NO_INTRINSICS_)
+    float fScale = 1.0f / static_cast<float>(1U << DivExponent);
+    uint32_t ElementIndex = 0;
+    XMVECTOR Result;
+    do {
+        auto iTemp = static_cast<int32_t>(VInt.vector4_u32[ElementIndex]);
+        Result.vector4_f32[ElementIndex] = static_cast<float>(iTemp)* fScale;
+    } while (++ElementIndex < 4);
+    return Result;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float fScale = 1.0f / static_cast<float>(1U << DivExponent);
+    float32x4_t vResult = vcvtq_f32_s32(vreinterpretq_s32_f32(VInt));
+    return vmulq_n_f32(vResult, fScale);
+#else // _XM_SSE_INTRINSICS_
+    // Convert to floats
+    XMVECTOR vResult = _mm_cvtepi32_ps(_mm_castps_si128(VInt));
+    // Convert DivExponent into 1.0f/(1<<DivExponent)
+    uint32_t uScale = 0x3F800000U - (DivExponent << 23);
+    // Splat the scalar value
+    __m128i vScale = _mm_set1_epi32(static_cast<int>(uScale));
+    vResult = _mm_mul_ps(vResult, _mm_castsi128_ps(vScale));
+    return vResult;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMConvertVectorFloatToInt
+(
+    FXMVECTOR VFloat,
+    uint32_t MulExponent
+) noexcept
+{
+    assert(MulExponent < 32);
+#if defined(_XM_NO_INTRINSICS_)
+    // Get the scalar factor.
+    auto fScale = static_cast<float>(1U << MulExponent);
+    uint32_t ElementIndex = 0;
+    XMVECTOR Result;
+    do {
+        int32_t iResult;
+        float fTemp = VFloat.vector4_f32[ElementIndex] * fScale;
+        if (fTemp <= -(65536.0f * 32768.0f))
+        {
+            iResult = (-0x7FFFFFFF) - 1;
+        }
+        else if (fTemp > (65536.0f * 32768.0f) - 128.0f)
+        {
+            iResult = 0x7FFFFFFF;
+        }
+        else {
+            iResult = static_cast<int32_t>(fTemp);
+        }
+        Result.vector4_u32[ElementIndex] = static_cast<uint32_t>(iResult);
+    } while (++ElementIndex < 4);
+    return Result;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t vResult = vmulq_n_f32(VFloat, static_cast<float>(1U << MulExponent));
+    // In case of positive overflow, detect it
+    uint32x4_t vOverflow = vcgtq_f32(vResult, g_XMMaxInt);
+    // Float to int conversion
+    int32x4_t vResulti = vcvtq_s32_f32(vResult);
+    // If there was positive overflow, set to 0x7FFFFFFF
+    vResult = vreinterpretq_f32_u32(vandq_u32(vOverflow, g_XMAbsMask));
+    vOverflow = vbicq_u32(vreinterpretq_u32_s32(vResulti), vOverflow);
+    vOverflow = vorrq_u32(vOverflow, vreinterpretq_u32_f32(vResult));
+    return vreinterpretq_f32_u32(vOverflow);
+#else // _XM_SSE_INTRINSICS_
+    XMVECTOR vResult = _mm_set_ps1(static_cast<float>(1U << MulExponent));
+    vResult = _mm_mul_ps(vResult, VFloat);
+    // In case of positive overflow, detect it
+    XMVECTOR vOverflow = _mm_cmpgt_ps(vResult, g_XMMaxInt);
+    // Float to int conversion
+    __m128i vResulti = _mm_cvttps_epi32(vResult);
+    // If there was positive overflow, set to 0x7FFFFFFF
+    vResult = _mm_and_ps(vOverflow, g_XMAbsMask);
+    vOverflow = _mm_andnot_ps(vOverflow, _mm_castsi128_ps(vResulti));
+    vOverflow = _mm_or_ps(vOverflow, vResult);
+    return vOverflow;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMConvertVectorUIntToFloat
+(
+    FXMVECTOR VUInt,
+    uint32_t DivExponent
+) noexcept
+{
+    assert(DivExponent < 32);
+#if defined(_XM_NO_INTRINSICS_)
+    float fScale = 1.0f / static_cast<float>(1U << DivExponent);
+    uint32_t ElementIndex = 0;
+    XMVECTOR Result;
+    do {
+        Result.vector4_f32[ElementIndex] = static_cast<float>(VUInt.vector4_u32[ElementIndex])* fScale;
+    } while (++ElementIndex < 4);
+    return Result;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float fScale = 1.0f / static_cast<float>(1U << DivExponent);
+    float32x4_t vResult = vcvtq_f32_u32(vreinterpretq_u32_f32(VUInt));
+    return vmulq_n_f32(vResult, fScale);
+#else // _XM_SSE_INTRINSICS_
+    // For the values that are higher than 0x7FFFFFFF, a fixup is needed
+    // Determine which ones need the fix.
+    XMVECTOR vMask = _mm_and_ps(VUInt, g_XMNegativeZero);
+    // Force all values positive
+    XMVECTOR vResult = _mm_xor_ps(VUInt, vMask);
+    // Convert to floats
+    vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult));
+    // Convert 0x80000000 -> 0xFFFFFFFF
+    __m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask), 31);
+    // For only the ones that are too big, add the fixup
+    vMask = _mm_and_ps(_mm_castsi128_ps(iMask), g_XMFixUnsigned);
+    vResult = _mm_add_ps(vResult, vMask);
+    // Convert DivExponent into 1.0f/(1<<DivExponent)
+    uint32_t uScale = 0x3F800000U - (DivExponent << 23);
+    // Splat the scalar value
+    iMask = _mm_set1_epi32(static_cast<int>(uScale));
+    vResult = _mm_mul_ps(vResult, _mm_castsi128_ps(iMask));
+    return vResult;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMConvertVectorFloatToUInt
+(
+    FXMVECTOR VFloat,
+    uint32_t MulExponent
+) noexcept
+{
+    assert(MulExponent < 32);
+#if defined(_XM_NO_INTRINSICS_)
+    // Get the scalar factor.
+    auto fScale = static_cast<float>(1U << MulExponent);
+    uint32_t ElementIndex = 0;
+    XMVECTOR Result;
+    do {
+        uint32_t uResult;
+        float fTemp = VFloat.vector4_f32[ElementIndex] * fScale;
+        if (fTemp <= 0.0f)
+        {
+            uResult = 0;
+        }
+        else if (fTemp >= (65536.0f * 65536.0f))
+        {
+            uResult = 0xFFFFFFFFU;
+        }
+        else {
+            uResult = static_cast<uint32_t>(fTemp);
+        }
+        Result.vector4_u32[ElementIndex] = uResult;
+    } while (++ElementIndex < 4);
+    return Result;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t vResult = vmulq_n_f32(VFloat, static_cast<float>(1U << MulExponent));
+    // In case of overflow, detect it
+    uint32x4_t vOverflow = vcgtq_f32(vResult, g_XMMaxUInt);
+    // Float to int conversion
+    uint32x4_t vResulti = vcvtq_u32_f32(vResult);
+    // If there was overflow, set to 0xFFFFFFFFU
+    vResult = vreinterpretq_f32_u32(vbicq_u32(vResulti, vOverflow));
+    vOverflow = vorrq_u32(vOverflow, vreinterpretq_u32_f32(vResult));
+    return vreinterpretq_f32_u32(vOverflow);
+#else // _XM_SSE_INTRINSICS_
+    XMVECTOR vResult = _mm_set_ps1(static_cast<float>(1U << MulExponent));
+    vResult = _mm_mul_ps(vResult, VFloat);
+    // Clamp to >=0
+    vResult = _mm_max_ps(vResult, g_XMZero);
+    // Any numbers that are too big, set to 0xFFFFFFFFU
+    XMVECTOR vOverflow = _mm_cmpgt_ps(vResult, g_XMMaxUInt);
+    XMVECTOR vValue = g_XMUnsignedFix;
+    // Too large for a signed integer?
+ XMVECTOR vMask = _mm_cmpge_ps(vResult, vValue); + // Zero for number's lower than 0x80000000, 32768.0f*65536.0f otherwise + vValue = _mm_and_ps(vValue, vMask); + // Perform fixup only on numbers too large (Keeps low bit precision) + vResult = _mm_sub_ps(vResult, vValue); + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Convert from signed to unsigned pnly if greater than 0x80000000 + vMask = _mm_and_ps(vMask, g_XMNegativeZero); + vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti), vMask); + // On those that are too large, set to 0xFFFFFFFF + vResult = _mm_or_ps(vResult, vOverflow); + return vResult; +#endif +} + +#ifdef _MSC_VER +#pragma warning(pop) +#endif + +/**************************************************************************** + * + * Vector and matrix load operations + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadInt(const uint32_t* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_u32[0] = *pSource; + V.vector4_u32[1] = 0; + V.vector4_u32[2] = 0; + V.vector4_u32[3] = 0; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t zero = vdupq_n_u32(0); + return vreinterpretq_f32_u32(vld1q_lane_u32(pSource, zero, 0)); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_load_ss(reinterpret_cast(pSource)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadFloat(const float* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_f32[0] = *pSource; + V.vector4_f32[1] = 0.f; + V.vector4_f32[2] = 0.f; + V.vector4_f32[3] = 0.f; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t zero = vdupq_n_f32(0); + return vld1q_lane_f32(pSource, zero, 0); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_load_ss(pSource); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadInt2(const uint32_t* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_u32[0] = pSource[0]; + V.vector4_u32[1] = pSource[1]; + V.vector4_u32[2] = 0; + V.vector4_u32[3] = 0; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t x = vld1_u32(pSource); + uint32x2_t zero = vdup_n_u32(0); + return vreinterpretq_f32_u32(vcombine_u32(x, zero)); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_castpd_ps(_mm_load_sd(reinterpret_cast(pSource))); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadInt2A(const uint32_t* pSource) noexcept +{ + assert(pSource); + assert((reinterpret_cast(pSource) & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_u32[0] = pSource[0]; + V.vector4_u32[1] = pSource[1]; + V.vector4_u32[2] = 0; + V.vector4_u32[3] = 0; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) +#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES) + uint32x2_t x = vld1_u32_ex(pSource, 64); +#else + uint32x2_t x = vld1_u32(pSource); +#endif + uint32x2_t zero = vdup_n_u32(0); + return vreinterpretq_f32_u32(vcombine_u32(x, zero)); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_castpd_ps(_mm_load_sd(reinterpret_cast(pSource))); +#endif 
+} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadFloat2(const XMFLOAT2* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_f32[0] = pSource->x; + V.vector4_f32[1] = pSource->y; + V.vector4_f32[2] = 0.f; + V.vector4_f32[3] = 0.f; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t x = vld1_f32(reinterpret_cast(pSource)); + float32x2_t zero = vdup_n_f32(0); + return vcombine_f32(x, zero); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_castpd_ps(_mm_load_sd(reinterpret_cast(pSource))); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadFloat2A(const XMFLOAT2A* pSource) noexcept +{ + assert(pSource); + assert((reinterpret_cast(pSource) & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_f32[0] = pSource->x; + V.vector4_f32[1] = pSource->y; + V.vector4_f32[2] = 0.f; + V.vector4_f32[3] = 0.f; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) +#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES) + float32x2_t x = vld1_f32_ex(reinterpret_cast(pSource), 64); +#else + float32x2_t x = vld1_f32(reinterpret_cast(pSource)); +#endif + float32x2_t zero = vdup_n_f32(0); + return vcombine_f32(x, zero); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_castpd_ps(_mm_load_sd(reinterpret_cast(pSource))); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadSInt2(const XMINT2* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_f32[0] = static_cast(pSource->x); + V.vector4_f32[1] = static_cast(pSource->y); + V.vector4_f32[2] = 0.f; + V.vector4_f32[3] = 0.f; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + int32x2_t x = vld1_s32(reinterpret_cast(pSource)); + float32x2_t v = vcvt_f32_s32(x); + float32x2_t zero = vdup_n_f32(0); + return vcombine_f32(v, zero); +#elif defined(_XM_SSE_INTRINSICS_) + __m128 V = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(pSource))); + return _mm_cvtepi32_ps(_mm_castps_si128(V)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadUInt2(const XMUINT2* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_f32[0] = static_cast(pSource->x); + V.vector4_f32[1] = static_cast(pSource->y); + V.vector4_f32[2] = 0.f; + V.vector4_f32[3] = 0.f; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t x = vld1_u32(reinterpret_cast(pSource)); + float32x2_t v = vcvt_f32_u32(x); + float32x2_t zero = vdup_n_f32(0); + return vcombine_f32(v, zero); +#elif defined(_XM_SSE_INTRINSICS_) + __m128 V = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(pSource))); + // For the values that are higher than 0x7FFFFFFF, a fixup is needed + // Determine which ones need the fix. 
+ XMVECTOR vMask = _mm_and_ps(V, g_XMNegativeZero); + // Force all values positive + XMVECTOR vResult = _mm_xor_ps(V, vMask); + // Convert to floats + vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult)); + // Convert 0x80000000 -> 0xFFFFFFFF + __m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask), 31); + // For only the ones that are too big, add the fixup + vMask = _mm_and_ps(_mm_castsi128_ps(iMask), g_XMFixUnsigned); + vResult = _mm_add_ps(vResult, vMask); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadInt3(const uint32_t* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_u32[0] = pSource[0]; + V.vector4_u32[1] = pSource[1]; + V.vector4_u32[2] = pSource[2]; + V.vector4_u32[3] = 0; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t x = vld1_u32(pSource); + uint32x2_t zero = vdup_n_u32(0); + uint32x2_t y = vld1_lane_u32(pSource + 2, zero, 0); + return vreinterpretq_f32_u32(vcombine_u32(x, y)); +#elif defined(_XM_SSE4_INTRINSICS_) + __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(pSource))); + __m128 z = _mm_load_ss(reinterpret_cast(pSource + 2)); + return _mm_insert_ps(xy, z, 0x20); +#elif defined(_XM_SSE_INTRINSICS_) + __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(pSource))); + __m128 z = _mm_load_ss(reinterpret_cast(pSource + 2)); + return _mm_movelh_ps(xy, z); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadInt3A(const uint32_t* pSource) noexcept +{ + assert(pSource); + assert((reinterpret_cast(pSource) & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_u32[0] = pSource[0]; + V.vector4_u32[1] = pSource[1]; + V.vector4_u32[2] = pSource[2]; + V.vector4_u32[3] = 0; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Reads an extra integer which is zero'd +#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES) + uint32x4_t V = vld1q_u32_ex(pSource, 128); +#else + uint32x4_t V = vld1q_u32(pSource); +#endif + return vreinterpretq_f32_u32(vsetq_lane_u32(0, V, 3)); +#elif defined(_XM_SSE4_INTRINSICS_) + __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(pSource))); + __m128 z = _mm_load_ss(reinterpret_cast(pSource + 2)); + return _mm_insert_ps(xy, z, 0x20); +#elif defined(_XM_SSE_INTRINSICS_) + __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(pSource))); + __m128 z = _mm_load_ss(reinterpret_cast(pSource + 2)); + return _mm_movelh_ps(xy, z); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadFloat3(const XMFLOAT3* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_f32[0] = pSource->x; + V.vector4_f32[1] = pSource->y; + V.vector4_f32[2] = pSource->z; + V.vector4_f32[3] = 0.f; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t x = vld1_f32(reinterpret_cast(pSource)); + float32x2_t zero = vdup_n_f32(0); + float32x2_t y = vld1_lane_f32(reinterpret_cast(pSource) + 2, zero, 0); + return vcombine_f32(x, y); +#elif defined(_XM_SSE4_INTRINSICS_) + __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(pSource))); + __m128 z = _mm_load_ss(&pSource->z); + return _mm_insert_ps(xy, z, 0x20); +#elif defined(_XM_SSE_INTRINSICS_) + __m128 xy = 
_mm_castpd_ps(_mm_load_sd(reinterpret_cast(pSource))); + __m128 z = _mm_load_ss(&pSource->z); + return _mm_movelh_ps(xy, z); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadFloat3A(const XMFLOAT3A* pSource) noexcept +{ + assert(pSource); + assert((reinterpret_cast(pSource) & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_f32[0] = pSource->x; + V.vector4_f32[1] = pSource->y; + V.vector4_f32[2] = pSource->z; + V.vector4_f32[3] = 0.f; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Reads an extra float which is zero'd +#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES) + float32x4_t V = vld1q_f32_ex(reinterpret_cast(pSource), 128); +#else + float32x4_t V = vld1q_f32(reinterpret_cast(pSource)); +#endif + return vsetq_lane_f32(0, V, 3); +#elif defined(_XM_SSE_INTRINSICS_) + // Reads an extra float which is zero'd + __m128 V = _mm_load_ps(&pSource->x); + return _mm_and_ps(V, g_XMMask3); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadSInt3(const XMINT3* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V; + V.vector4_f32[0] = static_cast(pSource->x); + V.vector4_f32[1] = static_cast(pSource->y); + V.vector4_f32[2] = static_cast(pSource->z); + V.vector4_f32[3] = 0.f; + return V; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + int32x2_t x = vld1_s32(reinterpret_cast(pSource)); + int32x2_t zero = vdup_n_s32(0); + int32x2_t y = vld1_lane_s32(reinterpret_cast(pSource) + 2, zero, 0); + int32x4_t v = vcombine_s32(x, y); + return vcvtq_f32_s32(v); +#elif defined(_XM_SSE_INTRINSICS_) + __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(pSource))); + __m128 z = _mm_load_ss(reinterpret_cast(&pSource->z)); + __m128 V = _mm_movelh_ps(xy, z); + return _mm_cvtepi32_ps(_mm_castps_si128(V)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadUInt3(const XMUINT3* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_f32[0] = static_cast(pSource->x); + V.vector4_f32[1] = static_cast(pSource->y); + V.vector4_f32[2] = static_cast(pSource->z); + V.vector4_f32[3] = 0.f; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t x = vld1_u32(reinterpret_cast(pSource)); + uint32x2_t zero = vdup_n_u32(0); + uint32x2_t y = vld1_lane_u32(reinterpret_cast(pSource) + 2, zero, 0); + uint32x4_t v = vcombine_u32(x, y); + return vcvtq_f32_u32(v); +#elif defined(_XM_SSE_INTRINSICS_) + __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(pSource))); + __m128 z = _mm_load_ss(reinterpret_cast(&pSource->z)); + __m128 V = _mm_movelh_ps(xy, z); + // For the values that are higher than 0x7FFFFFFF, a fixup is needed + // Determine which ones need the fix. 
+ XMVECTOR vMask = _mm_and_ps(V, g_XMNegativeZero); + // Force all values positive + XMVECTOR vResult = _mm_xor_ps(V, vMask); + // Convert to floats + vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult)); + // Convert 0x80000000 -> 0xFFFFFFFF + __m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask), 31); + // For only the ones that are too big, add the fixup + vMask = _mm_and_ps(_mm_castsi128_ps(iMask), g_XMFixUnsigned); + vResult = _mm_add_ps(vResult, vMask); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadInt4(const uint32_t* pSource) noexcept +{ + assert(pSource); + +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_u32[0] = pSource[0]; + V.vector4_u32[1] = pSource[1]; + V.vector4_u32[2] = pSource[2]; + V.vector4_u32[3] = pSource[3]; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vreinterpretq_f32_u32(vld1q_u32(pSource)); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_loadu_si128(reinterpret_cast(pSource)); + return _mm_castsi128_ps(V); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadInt4A(const uint32_t* pSource) noexcept +{ + assert(pSource); + assert((reinterpret_cast(pSource) & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_u32[0] = pSource[0]; + V.vector4_u32[1] = pSource[1]; + V.vector4_u32[2] = pSource[2]; + V.vector4_u32[3] = pSource[3]; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) +#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES) + return vld1q_u32_ex(pSource, 128); +#else + return vreinterpretq_f32_u32(vld1q_u32(pSource)); +#endif +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_load_si128(reinterpret_cast(pSource)); + return _mm_castsi128_ps(V); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadFloat4(const XMFLOAT4* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_f32[0] = pSource->x; + V.vector4_f32[1] = pSource->y; + V.vector4_f32[2] = pSource->z; + V.vector4_f32[3] = pSource->w; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vld1q_f32(reinterpret_cast(pSource)); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_loadu_ps(&pSource->x); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadFloat4A(const XMFLOAT4A* pSource) noexcept +{ + assert(pSource); + assert((reinterpret_cast(pSource) & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_f32[0] = pSource->x; + V.vector4_f32[1] = pSource->y; + V.vector4_f32[2] = pSource->z; + V.vector4_f32[3] = pSource->w; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) +#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES) + return vld1q_f32_ex(reinterpret_cast(pSource), 128); +#else + return vld1q_f32(reinterpret_cast(pSource)); +#endif +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_load_ps(&pSource->x); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadSInt4(const XMINT4* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V; + V.vector4_f32[0] = 
static_cast(pSource->x); + V.vector4_f32[1] = static_cast(pSource->y); + V.vector4_f32[2] = static_cast(pSource->z); + V.vector4_f32[3] = static_cast(pSource->w); + return V; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + int32x4_t v = vld1q_s32(reinterpret_cast(pSource)); + return vcvtq_f32_s32(v); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_loadu_si128(reinterpret_cast(pSource)); + return _mm_cvtepi32_ps(V); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadUInt4(const XMUINT4* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_f32[0] = static_cast(pSource->x); + V.vector4_f32[1] = static_cast(pSource->y); + V.vector4_f32[2] = static_cast(pSource->z); + V.vector4_f32[3] = static_cast(pSource->w); + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t v = vld1q_u32(reinterpret_cast(pSource)); + return vcvtq_f32_u32(v); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_loadu_si128(reinterpret_cast(pSource)); + // For the values that are higher than 0x7FFFFFFF, a fixup is needed + // Determine which ones need the fix. + XMVECTOR vMask = _mm_and_ps(_mm_castsi128_ps(V), g_XMNegativeZero); + // Force all values positive + XMVECTOR vResult = _mm_xor_ps(_mm_castsi128_ps(V), vMask); + // Convert to floats + vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult)); + // Convert 0x80000000 -> 0xFFFFFFFF + __m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask), 31); + // For only the ones that are too big, add the fixup + vMask = _mm_and_ps(_mm_castsi128_ps(iMask), g_XMFixUnsigned); + vResult = _mm_add_ps(vResult, vMask); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMMATRIX XM_CALLCONV XMLoadFloat3x3(const XMFLOAT3X3* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.r[0].vector4_f32[0] = pSource->m[0][0]; + M.r[0].vector4_f32[1] = pSource->m[0][1]; + M.r[0].vector4_f32[2] = pSource->m[0][2]; + M.r[0].vector4_f32[3] = 0.0f; + + M.r[1].vector4_f32[0] = pSource->m[1][0]; + M.r[1].vector4_f32[1] = pSource->m[1][1]; + M.r[1].vector4_f32[2] = pSource->m[1][2]; + M.r[1].vector4_f32[3] = 0.0f; + + M.r[2].vector4_f32[0] = pSource->m[2][0]; + M.r[2].vector4_f32[1] = pSource->m[2][1]; + M.r[2].vector4_f32[2] = pSource->m[2][2]; + M.r[2].vector4_f32[3] = 0.0f; + M.r[3].vector4_f32[0] = 0.0f; + M.r[3].vector4_f32[1] = 0.0f; + M.r[3].vector4_f32[2] = 0.0f; + M.r[3].vector4_f32[3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t v0 = vld1q_f32(&pSource->m[0][0]); + float32x4_t v1 = vld1q_f32(&pSource->m[1][1]); + float32x2_t v2 = vcreate_f32(static_cast(*reinterpret_cast(&pSource->m[2][2]))); + float32x4_t T = vextq_f32(v0, v1, 3); + + XMMATRIX M; + M.r[0] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(v0), g_XMMask3)); + M.r[1] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T), g_XMMask3)); + M.r[2] = vcombine_f32(vget_high_f32(v1), v2); + M.r[3] = g_XMIdentityR3; + return M; +#elif defined(_XM_SSE_INTRINSICS_) + __m128 Z = _mm_setzero_ps(); + + __m128 V1 = _mm_loadu_ps(&pSource->m[0][0]); + __m128 V2 = _mm_loadu_ps(&pSource->m[1][1]); + __m128 V3 = _mm_load_ss(&pSource->m[2][2]); + + __m128 T1 = _mm_unpackhi_ps(V1, Z); + __m128 T2 = _mm_unpacklo_ps(V2, Z); + __m128 T3 = _mm_shuffle_ps(V3, T2, _MM_SHUFFLE(0, 1, 0, 0)); + __m128 T4 = _mm_movehl_ps(T2, T3); + 
__m128 T5 = _mm_movehl_ps(Z, T1); + + XMMATRIX M; + M.r[0] = _mm_movelh_ps(V1, T1); + M.r[1] = _mm_add_ps(T4, T5); + M.r[2] = _mm_shuffle_ps(V2, V3, _MM_SHUFFLE(1, 0, 3, 2)); + M.r[3] = g_XMIdentityR3; + return M; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMMATRIX XM_CALLCONV XMLoadFloat4x3(const XMFLOAT4X3* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.r[0].vector4_f32[0] = pSource->m[0][0]; + M.r[0].vector4_f32[1] = pSource->m[0][1]; + M.r[0].vector4_f32[2] = pSource->m[0][2]; + M.r[0].vector4_f32[3] = 0.0f; + + M.r[1].vector4_f32[0] = pSource->m[1][0]; + M.r[1].vector4_f32[1] = pSource->m[1][1]; + M.r[1].vector4_f32[2] = pSource->m[1][2]; + M.r[1].vector4_f32[3] = 0.0f; + + M.r[2].vector4_f32[0] = pSource->m[2][0]; + M.r[2].vector4_f32[1] = pSource->m[2][1]; + M.r[2].vector4_f32[2] = pSource->m[2][2]; + M.r[2].vector4_f32[3] = 0.0f; + + M.r[3].vector4_f32[0] = pSource->m[3][0]; + M.r[3].vector4_f32[1] = pSource->m[3][1]; + M.r[3].vector4_f32[2] = pSource->m[3][2]; + M.r[3].vector4_f32[3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t v0 = vld1q_f32(&pSource->m[0][0]); + float32x4_t v1 = vld1q_f32(&pSource->m[1][1]); + float32x4_t v2 = vld1q_f32(&pSource->m[2][2]); + + float32x4_t T1 = vextq_f32(v0, v1, 3); + float32x4_t T2 = vcombine_f32(vget_high_f32(v1), vget_low_f32(v2)); + float32x4_t T3 = vextq_f32(v2, v2, 1); + + XMMATRIX M; + M.r[0] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(v0), g_XMMask3)); + M.r[1] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T1), g_XMMask3)); + M.r[2] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T2), g_XMMask3)); + M.r[3] = vsetq_lane_f32(1.f, T3, 3); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + // Use unaligned load instructions to + // load the 12 floats + // vTemp1 = x1,y1,z1,x2 + XMVECTOR vTemp1 = _mm_loadu_ps(&pSource->m[0][0]); + // vTemp2 = y2,z2,x3,y3 + XMVECTOR vTemp2 = _mm_loadu_ps(&pSource->m[1][1]); + // vTemp4 = z3,x4,y4,z4 + XMVECTOR vTemp4 = _mm_loadu_ps(&pSource->m[2][2]); + // vTemp3 = x3,y3,z3,z3 + XMVECTOR vTemp3 = _mm_shuffle_ps(vTemp2, vTemp4, _MM_SHUFFLE(0, 0, 3, 2)); + // vTemp2 = y2,z2,x2,x2 + vTemp2 = _mm_shuffle_ps(vTemp2, vTemp1, _MM_SHUFFLE(3, 3, 1, 0)); + // vTemp2 = x2,y2,z2,z2 + vTemp2 = XM_PERMUTE_PS(vTemp2, _MM_SHUFFLE(1, 1, 0, 2)); + // vTemp1 = x1,y1,z1,0 + vTemp1 = _mm_and_ps(vTemp1, g_XMMask3); + // vTemp2 = x2,y2,z2,0 + vTemp2 = _mm_and_ps(vTemp2, g_XMMask3); + // vTemp3 = x3,y3,z3,0 + vTemp3 = _mm_and_ps(vTemp3, g_XMMask3); + // vTemp4i = x4,y4,z4,0 + __m128i vTemp4i = _mm_srli_si128(_mm_castps_si128(vTemp4), 32 / 8); + // vTemp4i = x4,y4,z4,1.0f + vTemp4i = _mm_or_si128(vTemp4i, g_XMIdentityR3); + XMMATRIX M(vTemp1, + vTemp2, + vTemp3, + _mm_castsi128_ps(vTemp4i)); + return M; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMMATRIX XM_CALLCONV XMLoadFloat4x3A(const XMFLOAT4X3A* pSource) noexcept +{ + assert(pSource); + assert((reinterpret_cast(pSource) & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.r[0].vector4_f32[0] = pSource->m[0][0]; + M.r[0].vector4_f32[1] = pSource->m[0][1]; + M.r[0].vector4_f32[2] = pSource->m[0][2]; + M.r[0].vector4_f32[3] = 0.0f; + + M.r[1].vector4_f32[0] = pSource->m[1][0]; + M.r[1].vector4_f32[1] = pSource->m[1][1]; + M.r[1].vector4_f32[2] = pSource->m[1][2]; + M.r[1].vector4_f32[3] = 0.0f; + + 
M.r[2].vector4_f32[0] = pSource->m[2][0]; + M.r[2].vector4_f32[1] = pSource->m[2][1]; + M.r[2].vector4_f32[2] = pSource->m[2][2]; + M.r[2].vector4_f32[3] = 0.0f; + + M.r[3].vector4_f32[0] = pSource->m[3][0]; + M.r[3].vector4_f32[1] = pSource->m[3][1]; + M.r[3].vector4_f32[2] = pSource->m[3][2]; + M.r[3].vector4_f32[3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) +#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES) + float32x4_t v0 = vld1q_f32_ex(&pSource->m[0][0], 128); + float32x4_t v1 = vld1q_f32_ex(&pSource->m[1][1], 128); + float32x4_t v2 = vld1q_f32_ex(&pSource->m[2][2], 128); +#else + float32x4_t v0 = vld1q_f32(&pSource->m[0][0]); + float32x4_t v1 = vld1q_f32(&pSource->m[1][1]); + float32x4_t v2 = vld1q_f32(&pSource->m[2][2]); +#endif + + float32x4_t T1 = vextq_f32(v0, v1, 3); + float32x4_t T2 = vcombine_f32(vget_high_f32(v1), vget_low_f32(v2)); + float32x4_t T3 = vextq_f32(v2, v2, 1); + + XMMATRIX M; + M.r[0] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(v0), g_XMMask3)); + M.r[1] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T1), g_XMMask3)); + M.r[2] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T2), g_XMMask3)); + M.r[3] = vsetq_lane_f32(1.f, T3, 3); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + // Use aligned load instructions to + // load the 12 floats + // vTemp1 = x1,y1,z1,x2 + XMVECTOR vTemp1 = _mm_load_ps(&pSource->m[0][0]); + // vTemp2 = y2,z2,x3,y3 + XMVECTOR vTemp2 = _mm_load_ps(&pSource->m[1][1]); + // vTemp4 = z3,x4,y4,z4 + XMVECTOR vTemp4 = _mm_load_ps(&pSource->m[2][2]); + // vTemp3 = x3,y3,z3,z3 + XMVECTOR vTemp3 = _mm_shuffle_ps(vTemp2, vTemp4, _MM_SHUFFLE(0, 0, 3, 2)); + // vTemp2 = y2,z2,x2,x2 + vTemp2 = _mm_shuffle_ps(vTemp2, vTemp1, _MM_SHUFFLE(3, 3, 1, 0)); + // vTemp2 = x2,y2,z2,z2 + vTemp2 = XM_PERMUTE_PS(vTemp2, _MM_SHUFFLE(1, 1, 0, 2)); + // vTemp1 = x1,y1,z1,0 + vTemp1 = _mm_and_ps(vTemp1, g_XMMask3); + // vTemp2 = x2,y2,z2,0 + vTemp2 = _mm_and_ps(vTemp2, g_XMMask3); + // vTemp3 = x3,y3,z3,0 + vTemp3 = _mm_and_ps(vTemp3, g_XMMask3); + // vTemp4i = x4,y4,z4,0 + __m128i vTemp4i = _mm_srli_si128(_mm_castps_si128(vTemp4), 32 / 8); + // vTemp4i = x4,y4,z4,1.0f + vTemp4i = _mm_or_si128(vTemp4i, g_XMIdentityR3); + XMMATRIX M(vTemp1, + vTemp2, + vTemp3, + _mm_castsi128_ps(vTemp4i)); + return M; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMMATRIX XM_CALLCONV XMLoadFloat3x4(const XMFLOAT3X4* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.r[0].vector4_f32[0] = pSource->m[0][0]; + M.r[0].vector4_f32[1] = pSource->m[1][0]; + M.r[0].vector4_f32[2] = pSource->m[2][0]; + M.r[0].vector4_f32[3] = 0.0f; + + M.r[1].vector4_f32[0] = pSource->m[0][1]; + M.r[1].vector4_f32[1] = pSource->m[1][1]; + M.r[1].vector4_f32[2] = pSource->m[2][1]; + M.r[1].vector4_f32[3] = 0.0f; + + M.r[2].vector4_f32[0] = pSource->m[0][2]; + M.r[2].vector4_f32[1] = pSource->m[1][2]; + M.r[2].vector4_f32[2] = pSource->m[2][2]; + M.r[2].vector4_f32[3] = 0.0f; + + M.r[3].vector4_f32[0] = pSource->m[0][3]; + M.r[3].vector4_f32[1] = pSource->m[1][3]; + M.r[3].vector4_f32[2] = pSource->m[2][3]; + M.r[3].vector4_f32[3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2x4_t vTemp0 = vld4_f32(&pSource->_11); + float32x4_t vTemp1 = vld1q_f32(&pSource->_31); + + float32x2_t l = vget_low_f32(vTemp1); + float32x4_t T0 = vcombine_f32(vTemp0.val[0], l); + float32x2_t rl = 
vrev64_f32(l); + float32x4_t T1 = vcombine_f32(vTemp0.val[1], rl); + + float32x2_t h = vget_high_f32(vTemp1); + float32x4_t T2 = vcombine_f32(vTemp0.val[2], h); + float32x2_t rh = vrev64_f32(h); + float32x4_t T3 = vcombine_f32(vTemp0.val[3], rh); + + XMMATRIX M = {}; + M.r[0] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T0), g_XMMask3)); + M.r[1] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T1), g_XMMask3)); + M.r[2] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T2), g_XMMask3)); + M.r[3] = vsetq_lane_f32(1.f, T3, 3); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + M.r[0] = _mm_loadu_ps(&pSource->_11); + M.r[1] = _mm_loadu_ps(&pSource->_21); + M.r[2] = _mm_loadu_ps(&pSource->_31); + M.r[3] = g_XMIdentityR3; + + // x.x,x.y,y.x,y.y + XMVECTOR vTemp1 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(1, 0, 1, 0)); + // x.z,x.w,y.z,y.w + XMVECTOR vTemp3 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(3, 2, 3, 2)); + // z.x,z.y,w.x,w.y + XMVECTOR vTemp2 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(1, 0, 1, 0)); + // z.z,z.w,w.z,w.w + XMVECTOR vTemp4 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(3, 2, 3, 2)); + XMMATRIX mResult; + + // x.x,y.x,z.x,w.x + mResult.r[0] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(2, 0, 2, 0)); + // x.y,y.y,z.y,w.y + mResult.r[1] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(3, 1, 3, 1)); + // x.z,y.z,z.z,w.z + mResult.r[2] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(2, 0, 2, 0)); + // x.w,y.w,z.w,w.w + mResult.r[3] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(3, 1, 3, 1)); + return mResult; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMMATRIX XM_CALLCONV XMLoadFloat3x4A(const XMFLOAT3X4A* pSource) noexcept +{ + assert(pSource); + assert((reinterpret_cast(pSource) & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.r[0].vector4_f32[0] = pSource->m[0][0]; + M.r[0].vector4_f32[1] = pSource->m[1][0]; + M.r[0].vector4_f32[2] = pSource->m[2][0]; + M.r[0].vector4_f32[3] = 0.0f; + + M.r[1].vector4_f32[0] = pSource->m[0][1]; + M.r[1].vector4_f32[1] = pSource->m[1][1]; + M.r[1].vector4_f32[2] = pSource->m[2][1]; + M.r[1].vector4_f32[3] = 0.0f; + + M.r[2].vector4_f32[0] = pSource->m[0][2]; + M.r[2].vector4_f32[1] = pSource->m[1][2]; + M.r[2].vector4_f32[2] = pSource->m[2][2]; + M.r[2].vector4_f32[3] = 0.0f; + + M.r[3].vector4_f32[0] = pSource->m[0][3]; + M.r[3].vector4_f32[1] = pSource->m[1][3]; + M.r[3].vector4_f32[2] = pSource->m[2][3]; + M.r[3].vector4_f32[3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) +#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES) + float32x2x4_t vTemp0 = vld4_f32_ex(&pSource->_11, 128); + float32x4_t vTemp1 = vld1q_f32_ex(&pSource->_31, 128); +#else + float32x2x4_t vTemp0 = vld4_f32(&pSource->_11); + float32x4_t vTemp1 = vld1q_f32(&pSource->_31); +#endif + + float32x2_t l = vget_low_f32(vTemp1); + float32x4_t T0 = vcombine_f32(vTemp0.val[0], l); + float32x2_t rl = vrev64_f32(l); + float32x4_t T1 = vcombine_f32(vTemp0.val[1], rl); + + float32x2_t h = vget_high_f32(vTemp1); + float32x4_t T2 = vcombine_f32(vTemp0.val[2], h); + float32x2_t rh = vrev64_f32(h); + float32x4_t T3 = vcombine_f32(vTemp0.val[3], rh); + + XMMATRIX M = {}; + M.r[0] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T0), g_XMMask3)); + M.r[1] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T1), g_XMMask3)); + M.r[2] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T2), 
g_XMMask3)); + M.r[3] = vsetq_lane_f32(1.f, T3, 3); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + M.r[0] = _mm_load_ps(&pSource->_11); + M.r[1] = _mm_load_ps(&pSource->_21); + M.r[2] = _mm_load_ps(&pSource->_31); + M.r[3] = g_XMIdentityR3; + + // x.x,x.y,y.x,y.y + XMVECTOR vTemp1 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(1, 0, 1, 0)); + // x.z,x.w,y.z,y.w + XMVECTOR vTemp3 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(3, 2, 3, 2)); + // z.x,z.y,w.x,w.y + XMVECTOR vTemp2 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(1, 0, 1, 0)); + // z.z,z.w,w.z,w.w + XMVECTOR vTemp4 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(3, 2, 3, 2)); + XMMATRIX mResult; + + // x.x,y.x,z.x,w.x + mResult.r[0] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(2, 0, 2, 0)); + // x.y,y.y,z.y,w.y + mResult.r[1] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(3, 1, 3, 1)); + // x.z,y.z,z.z,w.z + mResult.r[2] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(2, 0, 2, 0)); + // x.w,y.w,z.w,w.w + mResult.r[3] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(3, 1, 3, 1)); + return mResult; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMMATRIX XM_CALLCONV XMLoadFloat4x4(const XMFLOAT4X4* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.r[0].vector4_f32[0] = pSource->m[0][0]; + M.r[0].vector4_f32[1] = pSource->m[0][1]; + M.r[0].vector4_f32[2] = pSource->m[0][2]; + M.r[0].vector4_f32[3] = pSource->m[0][3]; + + M.r[1].vector4_f32[0] = pSource->m[1][0]; + M.r[1].vector4_f32[1] = pSource->m[1][1]; + M.r[1].vector4_f32[2] = pSource->m[1][2]; + M.r[1].vector4_f32[3] = pSource->m[1][3]; + + M.r[2].vector4_f32[0] = pSource->m[2][0]; + M.r[2].vector4_f32[1] = pSource->m[2][1]; + M.r[2].vector4_f32[2] = pSource->m[2][2]; + M.r[2].vector4_f32[3] = pSource->m[2][3]; + + M.r[3].vector4_f32[0] = pSource->m[3][0]; + M.r[3].vector4_f32[1] = pSource->m[3][1]; + M.r[3].vector4_f32[2] = pSource->m[3][2]; + M.r[3].vector4_f32[3] = pSource->m[3][3]; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMMATRIX M; + M.r[0] = vld1q_f32(reinterpret_cast(&pSource->_11)); + M.r[1] = vld1q_f32(reinterpret_cast(&pSource->_21)); + M.r[2] = vld1q_f32(reinterpret_cast(&pSource->_31)); + M.r[3] = vld1q_f32(reinterpret_cast(&pSource->_41)); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + M.r[0] = _mm_loadu_ps(&pSource->_11); + M.r[1] = _mm_loadu_ps(&pSource->_21); + M.r[2] = _mm_loadu_ps(&pSource->_31); + M.r[3] = _mm_loadu_ps(&pSource->_41); + return M; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMMATRIX XM_CALLCONV XMLoadFloat4x4A(const XMFLOAT4X4A* pSource) noexcept +{ + assert(pSource); + assert((reinterpret_cast(pSource) & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.r[0].vector4_f32[0] = pSource->m[0][0]; + M.r[0].vector4_f32[1] = pSource->m[0][1]; + M.r[0].vector4_f32[2] = pSource->m[0][2]; + M.r[0].vector4_f32[3] = pSource->m[0][3]; + + M.r[1].vector4_f32[0] = pSource->m[1][0]; + M.r[1].vector4_f32[1] = pSource->m[1][1]; + M.r[1].vector4_f32[2] = pSource->m[1][2]; + M.r[1].vector4_f32[3] = pSource->m[1][3]; + + M.r[2].vector4_f32[0] = pSource->m[2][0]; + M.r[2].vector4_f32[1] = pSource->m[2][1]; + M.r[2].vector4_f32[2] = pSource->m[2][2]; + M.r[2].vector4_f32[3] = pSource->m[2][3]; + + M.r[3].vector4_f32[0] = pSource->m[3][0]; + M.r[3].vector4_f32[1] = pSource->m[3][1]; + M.r[3].vector4_f32[2] = 
pSource->m[3][2]; + M.r[3].vector4_f32[3] = pSource->m[3][3]; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMMATRIX M; +#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES) + M.r[0] = vld1q_f32_ex(reinterpret_cast(&pSource->_11), 128); + M.r[1] = vld1q_f32_ex(reinterpret_cast(&pSource->_21), 128); + M.r[2] = vld1q_f32_ex(reinterpret_cast(&pSource->_31), 128); + M.r[3] = vld1q_f32_ex(reinterpret_cast(&pSource->_41), 128); +#else + M.r[0] = vld1q_f32(reinterpret_cast(&pSource->_11)); + M.r[1] = vld1q_f32(reinterpret_cast(&pSource->_21)); + M.r[2] = vld1q_f32(reinterpret_cast(&pSource->_31)); + M.r[3] = vld1q_f32(reinterpret_cast(&pSource->_41)); +#endif + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + M.r[0] = _mm_load_ps(&pSource->_11); + M.r[1] = _mm_load_ps(&pSource->_21); + M.r[2] = _mm_load_ps(&pSource->_31); + M.r[3] = _mm_load_ps(&pSource->_41); + return M; +#endif +} + +/**************************************************************************** + * + * Vector and matrix store operations + * + ****************************************************************************/ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreInt +( + uint32_t* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + *pDestination = XMVectorGetIntX(V); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + vst1q_lane_u32(pDestination, *reinterpret_cast(&V), 0); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_store_ss(reinterpret_cast(pDestination), V); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat +( + float* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + *pDestination = XMVectorGetX(V); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + vst1q_lane_f32(pDestination, V, 0); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_store_ss(pDestination, V); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreInt2 +( + uint32_t* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination[0] = V.vector4_u32[0]; + pDestination[1] = V.vector4_u32[1]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t VL = vget_low_u32(vreinterpretq_u32_f32(V)); + vst1_u32(pDestination, VL); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_store_sd(reinterpret_cast(pDestination), _mm_castps_pd(V)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreInt2A +( + uint32_t* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); + assert((reinterpret_cast(pDestination) & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + pDestination[0] = V.vector4_u32[0]; + pDestination[1] = V.vector4_u32[1]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t VL = vget_low_u32(vreinterpretq_u32_f32(V)); +#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES) + vst1_u32_ex(pDestination, VL, 64); +#else + vst1_u32(pDestination, VL); +#endif +#elif defined(_XM_SSE_INTRINSICS_) + _mm_store_sd(reinterpret_cast(pDestination), _mm_castps_pd(V)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat2 +( + XMFLOAT2* pDestination, + FXMVECTOR V 
+) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = V.vector4_f32[0]; + pDestination->y = V.vector4_f32[1]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); + vst1_f32(reinterpret_cast(pDestination), VL); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_store_sd(reinterpret_cast(pDestination), _mm_castps_pd(V)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat2A +( + XMFLOAT2A* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); + assert((reinterpret_cast(pDestination) & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = V.vector4_f32[0]; + pDestination->y = V.vector4_f32[1]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); +#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES) + vst1_f32_ex(reinterpret_cast(pDestination), VL, 64); +#else + vst1_f32(reinterpret_cast(pDestination), VL); +#endif +#elif defined(_XM_SSE_INTRINSICS_) + _mm_store_sd(reinterpret_cast(pDestination), _mm_castps_pd(V)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreSInt2 +( + XMINT2* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = static_cast(V.vector4_f32[0]); + pDestination->y = static_cast(V.vector4_f32[1]); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t v = vget_low_f32(V); + int32x2_t iv = vcvt_s32_f32(v); + vst1_s32(reinterpret_cast(pDestination), iv); +#elif defined(_XM_SSE_INTRINSICS_) + // In case of positive overflow, detect it + XMVECTOR vOverflow = _mm_cmpgt_ps(V, g_XMMaxInt); + // Float to int conversion + __m128i vResulti = _mm_cvttps_epi32(V); + // If there was positive overflow, set to 0x7FFFFFFF + XMVECTOR vResult = _mm_and_ps(vOverflow, g_XMAbsMask); + vOverflow = _mm_andnot_ps(vOverflow, _mm_castsi128_ps(vResulti)); + vOverflow = _mm_or_ps(vOverflow, vResult); + // Write two ints + _mm_store_sd(reinterpret_cast(pDestination), _mm_castps_pd(vOverflow)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreUInt2 +( + XMUINT2* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = static_cast(V.vector4_f32[0]); + pDestination->y = static_cast(V.vector4_f32[1]); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t v = vget_low_f32(V); + uint32x2_t iv = vcvt_u32_f32(v); + vst1_u32(reinterpret_cast(pDestination), iv); +#elif defined(_XM_SSE_INTRINSICS_) + // Clamp to >=0 + XMVECTOR vResult = _mm_max_ps(V, g_XMZero); + // Any numbers that are too big, set to 0xFFFFFFFFU + XMVECTOR vOverflow = _mm_cmpgt_ps(vResult, g_XMMaxUInt); + XMVECTOR vValue = g_XMUnsignedFix; + // Too large for a signed integer? 
+ XMVECTOR vMask = _mm_cmpge_ps(vResult, vValue); + // Zero for number's lower than 0x80000000, 32768.0f*65536.0f otherwise + vValue = _mm_and_ps(vValue, vMask); + // Perform fixup only on numbers too large (Keeps low bit precision) + vResult = _mm_sub_ps(vResult, vValue); + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Convert from signed to unsigned pnly if greater than 0x80000000 + vMask = _mm_and_ps(vMask, g_XMNegativeZero); + vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti), vMask); + // On those that are too large, set to 0xFFFFFFFF + vResult = _mm_or_ps(vResult, vOverflow); + // Write two uints + _mm_store_sd(reinterpret_cast(pDestination), _mm_castps_pd(vResult)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreInt3 +( + uint32_t* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination[0] = V.vector4_u32[0]; + pDestination[1] = V.vector4_u32[1]; + pDestination[2] = V.vector4_u32[2]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t VL = vget_low_u32(vreinterpretq_u32_f32(V)); + vst1_u32(pDestination, VL); + vst1q_lane_u32(pDestination + 2, *reinterpret_cast(&V), 2); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_store_sd(reinterpret_cast(pDestination), _mm_castps_pd(V)); + __m128 z = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); + _mm_store_ss(reinterpret_cast(&pDestination[2]), z); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreInt3A +( + uint32_t* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); + assert((reinterpret_cast(pDestination) & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + pDestination[0] = V.vector4_u32[0]; + pDestination[1] = V.vector4_u32[1]; + pDestination[2] = V.vector4_u32[2]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t VL = vget_low_u32(vreinterpretq_u32_f32(V)); +#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES) + vst1_u32_ex(pDestination, VL, 64); +#else + vst1_u32(pDestination, VL); +#endif + vst1q_lane_u32(pDestination + 2, *reinterpret_cast(&V), 2); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_store_sd(reinterpret_cast(pDestination), _mm_castps_pd(V)); + __m128 z = _mm_movehl_ps(V, V); + _mm_store_ss(reinterpret_cast(&pDestination[2]), z); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat3 +( + XMFLOAT3* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = V.vector4_f32[0]; + pDestination->y = V.vector4_f32[1]; + pDestination->z = V.vector4_f32[2]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); + vst1_f32(reinterpret_cast(pDestination), VL); + vst1q_lane_f32(reinterpret_cast(pDestination) + 2, V, 2); +#elif defined(_XM_SSE4_INTRINSICS_) + * reinterpret_cast(&pDestination->x) = _mm_extract_ps(V, 0); + *reinterpret_cast(&pDestination->y) = _mm_extract_ps(V, 1); + *reinterpret_cast(&pDestination->z) = _mm_extract_ps(V, 2); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_store_sd(reinterpret_cast(pDestination), _mm_castps_pd(V)); + __m128 z = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); + _mm_store_ss(&pDestination->z, z); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline 
void XM_CALLCONV XMStoreFloat3A +( + XMFLOAT3A* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); + assert((reinterpret_cast(pDestination) & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = V.vector4_f32[0]; + pDestination->y = V.vector4_f32[1]; + pDestination->z = V.vector4_f32[2]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); +#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES) + vst1_f32_ex(reinterpret_cast(pDestination), VL, 64); +#else + vst1_f32(reinterpret_cast(pDestination), VL); +#endif + vst1q_lane_f32(reinterpret_cast(pDestination) + 2, V, 2); +#elif defined(_XM_SSE4_INTRINSICS_) + _mm_store_sd(reinterpret_cast(pDestination), _mm_castps_pd(V)); + *reinterpret_cast(&pDestination->z) = _mm_extract_ps(V, 2); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_store_sd(reinterpret_cast(pDestination), _mm_castps_pd(V)); + __m128 z = _mm_movehl_ps(V, V); + _mm_store_ss(&pDestination->z, z); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreSInt3 +( + XMINT3* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = static_cast(V.vector4_f32[0]); + pDestination->y = static_cast(V.vector4_f32[1]); + pDestination->z = static_cast(V.vector4_f32[2]); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + int32x4_t v = vcvtq_s32_f32(V); + int32x2_t vL = vget_low_s32(v); + vst1_s32(reinterpret_cast(pDestination), vL); + vst1q_lane_s32(reinterpret_cast(pDestination) + 2, v, 2); +#elif defined(_XM_SSE_INTRINSICS_) + // In case of positive overflow, detect it + XMVECTOR vOverflow = _mm_cmpgt_ps(V, g_XMMaxInt); + // Float to int conversion + __m128i vResulti = _mm_cvttps_epi32(V); + // If there was positive overflow, set to 0x7FFFFFFF + XMVECTOR vResult = _mm_and_ps(vOverflow, g_XMAbsMask); + vOverflow = _mm_andnot_ps(vOverflow, _mm_castsi128_ps(vResulti)); + vOverflow = _mm_or_ps(vOverflow, vResult); + // Write 3 uints + _mm_store_sd(reinterpret_cast(pDestination), _mm_castps_pd(vOverflow)); + __m128 z = XM_PERMUTE_PS(vOverflow, _MM_SHUFFLE(2, 2, 2, 2)); + _mm_store_ss(reinterpret_cast(&pDestination->z), z); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreUInt3 +( + XMUINT3* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = static_cast(V.vector4_f32[0]); + pDestination->y = static_cast(V.vector4_f32[1]); + pDestination->z = static_cast(V.vector4_f32[2]); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t v = vcvtq_u32_f32(V); + uint32x2_t vL = vget_low_u32(v); + vst1_u32(reinterpret_cast(pDestination), vL); + vst1q_lane_u32(reinterpret_cast(pDestination) + 2, v, 2); +#elif defined(_XM_SSE_INTRINSICS_) + // Clamp to >=0 + XMVECTOR vResult = _mm_max_ps(V, g_XMZero); + // Any numbers that are too big, set to 0xFFFFFFFFU + XMVECTOR vOverflow = _mm_cmpgt_ps(vResult, g_XMMaxUInt); + XMVECTOR vValue = g_XMUnsignedFix; + // Too large for a signed integer? 
+ XMVECTOR vMask = _mm_cmpge_ps(vResult, vValue); + // Zero for number's lower than 0x80000000, 32768.0f*65536.0f otherwise + vValue = _mm_and_ps(vValue, vMask); + // Perform fixup only on numbers too large (Keeps low bit precision) + vResult = _mm_sub_ps(vResult, vValue); + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Convert from signed to unsigned pnly if greater than 0x80000000 + vMask = _mm_and_ps(vMask, g_XMNegativeZero); + vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti), vMask); + // On those that are too large, set to 0xFFFFFFFF + vResult = _mm_or_ps(vResult, vOverflow); + // Write 3 uints + _mm_store_sd(reinterpret_cast(pDestination), _mm_castps_pd(vResult)); + __m128 z = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(2, 2, 2, 2)); + _mm_store_ss(reinterpret_cast(&pDestination->z), z); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreInt4 +( + uint32_t* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination[0] = V.vector4_u32[0]; + pDestination[1] = V.vector4_u32[1]; + pDestination[2] = V.vector4_u32[2]; + pDestination[3] = V.vector4_u32[3]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + vst1q_u32(pDestination, vreinterpretq_u32_f32(V)); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_storeu_si128(reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreInt4A +( + uint32_t* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); + assert((reinterpret_cast(pDestination) & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + pDestination[0] = V.vector4_u32[0]; + pDestination[1] = V.vector4_u32[1]; + pDestination[2] = V.vector4_u32[2]; + pDestination[3] = V.vector4_u32[3]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) +#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES) + vst1q_u32_ex(pDestination, V, 128); +#else + vst1q_u32(pDestination, vreinterpretq_u32_f32(V)); +#endif +#elif defined(_XM_SSE_INTRINSICS_) + _mm_store_si128(reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat4 +( + XMFLOAT4* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = V.vector4_f32[0]; + pDestination->y = V.vector4_f32[1]; + pDestination->z = V.vector4_f32[2]; + pDestination->w = V.vector4_f32[3]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + vst1q_f32(reinterpret_cast(pDestination), V); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_storeu_ps(&pDestination->x, V); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat4A +( + XMFLOAT4A* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); + assert((reinterpret_cast(pDestination) & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = V.vector4_f32[0]; + pDestination->y = V.vector4_f32[1]; + pDestination->z = V.vector4_f32[2]; + pDestination->w = V.vector4_f32[3]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) +#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES) + vst1q_f32_ex(reinterpret_cast(pDestination), V, 128); +#else + 
vst1q_f32(reinterpret_cast(pDestination), V); +#endif +#elif defined(_XM_SSE_INTRINSICS_) + _mm_store_ps(&pDestination->x, V); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreSInt4 +( + XMINT4* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = static_cast(V.vector4_f32[0]); + pDestination->y = static_cast(V.vector4_f32[1]); + pDestination->z = static_cast(V.vector4_f32[2]); + pDestination->w = static_cast(V.vector4_f32[3]); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + int32x4_t v = vcvtq_s32_f32(V); + vst1q_s32(reinterpret_cast(pDestination), v); +#elif defined(_XM_SSE_INTRINSICS_) + // In case of positive overflow, detect it + XMVECTOR vOverflow = _mm_cmpgt_ps(V, g_XMMaxInt); + // Float to int conversion + __m128i vResulti = _mm_cvttps_epi32(V); + // If there was positive overflow, set to 0x7FFFFFFF + XMVECTOR vResult = _mm_and_ps(vOverflow, g_XMAbsMask); + vOverflow = _mm_andnot_ps(vOverflow, _mm_castsi128_ps(vResulti)); + vOverflow = _mm_or_ps(vOverflow, vResult); + _mm_storeu_si128(reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(vOverflow)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreUInt4 +( + XMUINT4* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = static_cast(V.vector4_f32[0]); + pDestination->y = static_cast(V.vector4_f32[1]); + pDestination->z = static_cast(V.vector4_f32[2]); + pDestination->w = static_cast(V.vector4_f32[3]); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t v = vcvtq_u32_f32(V); + vst1q_u32(reinterpret_cast(pDestination), v); +#elif defined(_XM_SSE_INTRINSICS_) + // Clamp to >=0 + XMVECTOR vResult = _mm_max_ps(V, g_XMZero); + // Any numbers that are too big, set to 0xFFFFFFFFU + XMVECTOR vOverflow = _mm_cmpgt_ps(vResult, g_XMMaxUInt); + XMVECTOR vValue = g_XMUnsignedFix; + // Too large for a signed integer? 
+ XMVECTOR vMask = _mm_cmpge_ps(vResult, vValue); + // Zero for number's lower than 0x80000000, 32768.0f*65536.0f otherwise + vValue = _mm_and_ps(vValue, vMask); + // Perform fixup only on numbers too large (Keeps low bit precision) + vResult = _mm_sub_ps(vResult, vValue); + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Convert from signed to unsigned pnly if greater than 0x80000000 + vMask = _mm_and_ps(vMask, g_XMNegativeZero); + vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti), vMask); + // On those that are too large, set to 0xFFFFFFFF + vResult = _mm_or_ps(vResult, vOverflow); + _mm_storeu_si128(reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(vResult)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat3x3 +( + XMFLOAT3X3* pDestination, + FXMMATRIX M +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + pDestination->m[0][0] = M.r[0].vector4_f32[0]; + pDestination->m[0][1] = M.r[0].vector4_f32[1]; + pDestination->m[0][2] = M.r[0].vector4_f32[2]; + + pDestination->m[1][0] = M.r[1].vector4_f32[0]; + pDestination->m[1][1] = M.r[1].vector4_f32[1]; + pDestination->m[1][2] = M.r[1].vector4_f32[2]; + + pDestination->m[2][0] = M.r[2].vector4_f32[0]; + pDestination->m[2][1] = M.r[2].vector4_f32[1]; + pDestination->m[2][2] = M.r[2].vector4_f32[2]; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t T1 = vextq_f32(M.r[0], M.r[1], 1); + float32x4_t T2 = vbslq_f32(g_XMMask3, M.r[0], T1); + vst1q_f32(&pDestination->m[0][0], T2); + + T1 = vextq_f32(M.r[1], M.r[1], 1); + T2 = vcombine_f32(vget_low_f32(T1), vget_low_f32(M.r[2])); + vst1q_f32(&pDestination->m[1][1], T2); + + vst1q_lane_f32(&pDestination->m[2][2], M.r[2], 2); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp1 = M.r[0]; + XMVECTOR vTemp2 = M.r[1]; + XMVECTOR vTemp3 = M.r[2]; + XMVECTOR vWork = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(0, 0, 2, 2)); + vTemp1 = _mm_shuffle_ps(vTemp1, vWork, _MM_SHUFFLE(2, 0, 1, 0)); + _mm_storeu_ps(&pDestination->m[0][0], vTemp1); + vTemp2 = _mm_shuffle_ps(vTemp2, vTemp3, _MM_SHUFFLE(1, 0, 2, 1)); + _mm_storeu_ps(&pDestination->m[1][1], vTemp2); + vTemp3 = XM_PERMUTE_PS(vTemp3, _MM_SHUFFLE(2, 2, 2, 2)); + _mm_store_ss(&pDestination->m[2][2], vTemp3); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat4x3 +( + XMFLOAT4X3* pDestination, + FXMMATRIX M +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + pDestination->m[0][0] = M.r[0].vector4_f32[0]; + pDestination->m[0][1] = M.r[0].vector4_f32[1]; + pDestination->m[0][2] = M.r[0].vector4_f32[2]; + + pDestination->m[1][0] = M.r[1].vector4_f32[0]; + pDestination->m[1][1] = M.r[1].vector4_f32[1]; + pDestination->m[1][2] = M.r[1].vector4_f32[2]; + + pDestination->m[2][0] = M.r[2].vector4_f32[0]; + pDestination->m[2][1] = M.r[2].vector4_f32[1]; + pDestination->m[2][2] = M.r[2].vector4_f32[2]; + + pDestination->m[3][0] = M.r[3].vector4_f32[0]; + pDestination->m[3][1] = M.r[3].vector4_f32[1]; + pDestination->m[3][2] = M.r[3].vector4_f32[2]; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t T1 = vextq_f32(M.r[0], M.r[1], 1); + float32x4_t T2 = vbslq_f32(g_XMMask3, M.r[0], T1); + vst1q_f32(&pDestination->m[0][0], T2); + + T1 = vextq_f32(M.r[1], M.r[1], 1); + T2 = vcombine_f32(vget_low_f32(T1), vget_low_f32(M.r[2])); + vst1q_f32(&pDestination->m[1][1], T2); + + T1 = 
vdupq_lane_f32(vget_high_f32(M.r[2]), 0); + T2 = vextq_f32(T1, M.r[3], 3); + vst1q_f32(&pDestination->m[2][2], T2); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp1 = M.r[0]; + XMVECTOR vTemp2 = M.r[1]; + XMVECTOR vTemp3 = M.r[2]; + XMVECTOR vTemp4 = M.r[3]; + XMVECTOR vTemp2x = _mm_shuffle_ps(vTemp2, vTemp3, _MM_SHUFFLE(1, 0, 2, 1)); + vTemp2 = _mm_shuffle_ps(vTemp2, vTemp1, _MM_SHUFFLE(2, 2, 0, 0)); + vTemp1 = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(0, 2, 1, 0)); + vTemp3 = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(0, 0, 2, 2)); + vTemp3 = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(2, 1, 2, 0)); + _mm_storeu_ps(&pDestination->m[0][0], vTemp1); + _mm_storeu_ps(&pDestination->m[1][1], vTemp2x); + _mm_storeu_ps(&pDestination->m[2][2], vTemp3); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat4x3A +( + XMFLOAT4X3A* pDestination, + FXMMATRIX M +) noexcept +{ + assert(pDestination); + assert((reinterpret_cast(pDestination) & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + + pDestination->m[0][0] = M.r[0].vector4_f32[0]; + pDestination->m[0][1] = M.r[0].vector4_f32[1]; + pDestination->m[0][2] = M.r[0].vector4_f32[2]; + + pDestination->m[1][0] = M.r[1].vector4_f32[0]; + pDestination->m[1][1] = M.r[1].vector4_f32[1]; + pDestination->m[1][2] = M.r[1].vector4_f32[2]; + + pDestination->m[2][0] = M.r[2].vector4_f32[0]; + pDestination->m[2][1] = M.r[2].vector4_f32[1]; + pDestination->m[2][2] = M.r[2].vector4_f32[2]; + + pDestination->m[3][0] = M.r[3].vector4_f32[0]; + pDestination->m[3][1] = M.r[3].vector4_f32[1]; + pDestination->m[3][2] = M.r[3].vector4_f32[2]; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) +#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES) + float32x4_t T1 = vextq_f32(M.r[0], M.r[1], 1); + float32x4_t T2 = vbslq_f32(g_XMMask3, M.r[0], T1); + vst1q_f32_ex(&pDestination->m[0][0], T2, 128); + + T1 = vextq_f32(M.r[1], M.r[1], 1); + T2 = vcombine_f32(vget_low_f32(T1), vget_low_f32(M.r[2])); + vst1q_f32_ex(&pDestination->m[1][1], T2, 128); + + T1 = vdupq_lane_f32(vget_high_f32(M.r[2]), 0); + T2 = vextq_f32(T1, M.r[3], 3); + vst1q_f32_ex(&pDestination->m[2][2], T2, 128); +#else + float32x4_t T1 = vextq_f32(M.r[0], M.r[1], 1); + float32x4_t T2 = vbslq_f32(g_XMMask3, M.r[0], T1); + vst1q_f32(&pDestination->m[0][0], T2); + + T1 = vextq_f32(M.r[1], M.r[1], 1); + T2 = vcombine_f32(vget_low_f32(T1), vget_low_f32(M.r[2])); + vst1q_f32(&pDestination->m[1][1], T2); + + T1 = vdupq_lane_f32(vget_high_f32(M.r[2]), 0); + T2 = vextq_f32(T1, M.r[3], 3); + vst1q_f32(&pDestination->m[2][2], T2); +#endif +#elif defined(_XM_SSE_INTRINSICS_) + // x1,y1,z1,w1 + XMVECTOR vTemp1 = M.r[0]; + // x2,y2,z2,w2 + XMVECTOR vTemp2 = M.r[1]; + // x3,y3,z3,w3 + XMVECTOR vTemp3 = M.r[2]; + // x4,y4,z4,w4 + XMVECTOR vTemp4 = M.r[3]; + // z1,z1,x2,y2 + XMVECTOR vTemp = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(1, 0, 2, 2)); + // y2,z2,x3,y3 (Final) + vTemp2 = _mm_shuffle_ps(vTemp2, vTemp3, _MM_SHUFFLE(1, 0, 2, 1)); + // x1,y1,z1,x2 (Final) + vTemp1 = _mm_shuffle_ps(vTemp1, vTemp, _MM_SHUFFLE(2, 0, 1, 0)); + // z3,z3,x4,x4 + vTemp3 = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(0, 0, 2, 2)); + // z3,x4,y4,z4 (Final) + vTemp3 = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(2, 1, 2, 0)); + // Store in 3 operations + _mm_store_ps(&pDestination->m[0][0], vTemp1); + _mm_store_ps(&pDestination->m[1][1], vTemp2); + _mm_store_ps(&pDestination->m[2][2], vTemp3); +#endif +} + 
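+// Usage sketch for the 4x3 store path above (illustrative only, not part of the upstream
+// header; 'angle', 'tx', 'ty' and 'tz' are placeholder values):
+//
+//   XMFLOAT4X3 packed;                                        // 12 floats; the w column is implied
+//   XMMATRIX world = XMMatrixRotationY(angle) * XMMatrixTranslation(tx, ty, tz);
+//   XMStoreFloat4x3(&packed, world);                          // stores x,y,z of each row, drops w
+//   XMMATRIX restored = XMLoadFloat4x3(&packed);              // reload; the w column becomes (0,0,0,1)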
+//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat3x4 +( + XMFLOAT3X4* pDestination, + FXMMATRIX M +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + pDestination->m[0][0] = M.r[0].vector4_f32[0]; + pDestination->m[0][1] = M.r[1].vector4_f32[0]; + pDestination->m[0][2] = M.r[2].vector4_f32[0]; + pDestination->m[0][3] = M.r[3].vector4_f32[0]; + + pDestination->m[1][0] = M.r[0].vector4_f32[1]; + pDestination->m[1][1] = M.r[1].vector4_f32[1]; + pDestination->m[1][2] = M.r[2].vector4_f32[1]; + pDestination->m[1][3] = M.r[3].vector4_f32[1]; + + pDestination->m[2][0] = M.r[0].vector4_f32[2]; + pDestination->m[2][1] = M.r[1].vector4_f32[2]; + pDestination->m[2][2] = M.r[2].vector4_f32[2]; + pDestination->m[2][3] = M.r[3].vector4_f32[2]; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4x2_t P0 = vzipq_f32(M.r[0], M.r[2]); + float32x4x2_t P1 = vzipq_f32(M.r[1], M.r[3]); + + float32x4x2_t T0 = vzipq_f32(P0.val[0], P1.val[0]); + float32x4x2_t T1 = vzipq_f32(P0.val[1], P1.val[1]); + + vst1q_f32(&pDestination->m[0][0], T0.val[0]); + vst1q_f32(&pDestination->m[1][0], T0.val[1]); + vst1q_f32(&pDestination->m[2][0], T1.val[0]); +#elif defined(_XM_SSE_INTRINSICS_) + // x.x,x.y,y.x,y.y + XMVECTOR vTemp1 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(1, 0, 1, 0)); + // x.z,x.w,y.z,y.w + XMVECTOR vTemp3 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(3, 2, 3, 2)); + // z.x,z.y,w.x,w.y + XMVECTOR vTemp2 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(1, 0, 1, 0)); + // z.z,z.w,w.z,w.w + XMVECTOR vTemp4 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(3, 2, 3, 2)); + + // x.x,y.x,z.x,w.x + XMVECTOR r0 = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(2, 0, 2, 0)); + // x.y,y.y,z.y,w.y + XMVECTOR r1 = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(3, 1, 3, 1)); + // x.z,y.z,z.z,w.z + XMVECTOR r2 = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(2, 0, 2, 0)); + + _mm_storeu_ps(&pDestination->m[0][0], r0); + _mm_storeu_ps(&pDestination->m[1][0], r1); + _mm_storeu_ps(&pDestination->m[2][0], r2); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat3x4A +( + XMFLOAT3X4A* pDestination, + FXMMATRIX M +) noexcept +{ + assert(pDestination); + assert((reinterpret_cast(pDestination) & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + + pDestination->m[0][0] = M.r[0].vector4_f32[0]; + pDestination->m[0][1] = M.r[1].vector4_f32[0]; + pDestination->m[0][2] = M.r[2].vector4_f32[0]; + pDestination->m[0][3] = M.r[3].vector4_f32[0]; + + pDestination->m[1][0] = M.r[0].vector4_f32[1]; + pDestination->m[1][1] = M.r[1].vector4_f32[1]; + pDestination->m[1][2] = M.r[2].vector4_f32[1]; + pDestination->m[1][3] = M.r[3].vector4_f32[1]; + + pDestination->m[2][0] = M.r[0].vector4_f32[2]; + pDestination->m[2][1] = M.r[1].vector4_f32[2]; + pDestination->m[2][2] = M.r[2].vector4_f32[2]; + pDestination->m[2][3] = M.r[3].vector4_f32[2]; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4x2_t P0 = vzipq_f32(M.r[0], M.r[2]); + float32x4x2_t P1 = vzipq_f32(M.r[1], M.r[3]); + + float32x4x2_t T0 = vzipq_f32(P0.val[0], P1.val[0]); + float32x4x2_t T1 = vzipq_f32(P0.val[1], P1.val[1]); + +#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES) + vst1q_f32_ex(&pDestination->m[0][0], T0.val[0], 128); + vst1q_f32_ex(&pDestination->m[1][0], T0.val[1], 128); + vst1q_f32_ex(&pDestination->m[2][0], T1.val[0], 128); +#else + 
vst1q_f32(&pDestination->m[0][0], T0.val[0]); + vst1q_f32(&pDestination->m[1][0], T0.val[1]); + vst1q_f32(&pDestination->m[2][0], T1.val[0]); +#endif +#elif defined(_XM_SSE_INTRINSICS_) + // x.x,x.y,y.x,y.y + XMVECTOR vTemp1 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(1, 0, 1, 0)); + // x.z,x.w,y.z,y.w + XMVECTOR vTemp3 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(3, 2, 3, 2)); + // z.x,z.y,w.x,w.y + XMVECTOR vTemp2 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(1, 0, 1, 0)); + // z.z,z.w,w.z,w.w + XMVECTOR vTemp4 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(3, 2, 3, 2)); + + // x.x,y.x,z.x,w.x + XMVECTOR r0 = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(2, 0, 2, 0)); + // x.y,y.y,z.y,w.y + XMVECTOR r1 = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(3, 1, 3, 1)); + // x.z,y.z,z.z,w.z + XMVECTOR r2 = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(2, 0, 2, 0)); + + _mm_store_ps(&pDestination->m[0][0], r0); + _mm_store_ps(&pDestination->m[1][0], r1); + _mm_store_ps(&pDestination->m[2][0], r2); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat4x4 +( + XMFLOAT4X4* pDestination, + FXMMATRIX M +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + pDestination->m[0][0] = M.r[0].vector4_f32[0]; + pDestination->m[0][1] = M.r[0].vector4_f32[1]; + pDestination->m[0][2] = M.r[0].vector4_f32[2]; + pDestination->m[0][3] = M.r[0].vector4_f32[3]; + + pDestination->m[1][0] = M.r[1].vector4_f32[0]; + pDestination->m[1][1] = M.r[1].vector4_f32[1]; + pDestination->m[1][2] = M.r[1].vector4_f32[2]; + pDestination->m[1][3] = M.r[1].vector4_f32[3]; + + pDestination->m[2][0] = M.r[2].vector4_f32[0]; + pDestination->m[2][1] = M.r[2].vector4_f32[1]; + pDestination->m[2][2] = M.r[2].vector4_f32[2]; + pDestination->m[2][3] = M.r[2].vector4_f32[3]; + + pDestination->m[3][0] = M.r[3].vector4_f32[0]; + pDestination->m[3][1] = M.r[3].vector4_f32[1]; + pDestination->m[3][2] = M.r[3].vector4_f32[2]; + pDestination->m[3][3] = M.r[3].vector4_f32[3]; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + vst1q_f32(reinterpret_cast(&pDestination->_11), M.r[0]); + vst1q_f32(reinterpret_cast(&pDestination->_21), M.r[1]); + vst1q_f32(reinterpret_cast(&pDestination->_31), M.r[2]); + vst1q_f32(reinterpret_cast(&pDestination->_41), M.r[3]); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_storeu_ps(&pDestination->_11, M.r[0]); + _mm_storeu_ps(&pDestination->_21, M.r[1]); + _mm_storeu_ps(&pDestination->_31, M.r[2]); + _mm_storeu_ps(&pDestination->_41, M.r[3]); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat4x4A +( + XMFLOAT4X4A* pDestination, + FXMMATRIX M +) noexcept +{ + assert(pDestination); + assert((reinterpret_cast(pDestination) & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + + pDestination->m[0][0] = M.r[0].vector4_f32[0]; + pDestination->m[0][1] = M.r[0].vector4_f32[1]; + pDestination->m[0][2] = M.r[0].vector4_f32[2]; + pDestination->m[0][3] = M.r[0].vector4_f32[3]; + + pDestination->m[1][0] = M.r[1].vector4_f32[0]; + pDestination->m[1][1] = M.r[1].vector4_f32[1]; + pDestination->m[1][2] = M.r[1].vector4_f32[2]; + pDestination->m[1][3] = M.r[1].vector4_f32[3]; + + pDestination->m[2][0] = M.r[2].vector4_f32[0]; + pDestination->m[2][1] = M.r[2].vector4_f32[1]; + pDestination->m[2][2] = M.r[2].vector4_f32[2]; + pDestination->m[2][3] = M.r[2].vector4_f32[3]; + + pDestination->m[3][0] = M.r[3].vector4_f32[0]; + 
pDestination->m[3][1] = M.r[3].vector4_f32[1]; + pDestination->m[3][2] = M.r[3].vector4_f32[2]; + pDestination->m[3][3] = M.r[3].vector4_f32[3]; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) +#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES) + vst1q_f32_ex(reinterpret_cast(&pDestination->_11), M.r[0], 128); + vst1q_f32_ex(reinterpret_cast(&pDestination->_21), M.r[1], 128); + vst1q_f32_ex(reinterpret_cast(&pDestination->_31), M.r[2], 128); + vst1q_f32_ex(reinterpret_cast(&pDestination->_41), M.r[3], 128); +#else + vst1q_f32(reinterpret_cast(&pDestination->_11), M.r[0]); + vst1q_f32(reinterpret_cast(&pDestination->_21), M.r[1]); + vst1q_f32(reinterpret_cast(&pDestination->_31), M.r[2]); + vst1q_f32(reinterpret_cast(&pDestination->_41), M.r[3]); +#endif +#elif defined(_XM_SSE_INTRINSICS_) + _mm_store_ps(&pDestination->_11, M.r[0]); + _mm_store_ps(&pDestination->_21, M.r[1]); + _mm_store_ps(&pDestination->_31, M.r[2]); + _mm_store_ps(&pDestination->_41, M.r[3]); +#endif +} + diff --git a/src/thirdparty/DirectXMath-dec2022/Inc/DirectXMathMatrix.inl b/src/thirdparty/DirectXMath-dec2022/Inc/DirectXMathMatrix.inl new file mode 100644 index 000000000..1c579a1ec --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/Inc/DirectXMathMatrix.inl @@ -0,0 +1,3550 @@ +//------------------------------------------------------------------------------------- +// DirectXMathMatrix.inl -- SIMD C++ Math library +// +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. +// +// http://go.microsoft.com/fwlink/?LinkID=615560 +//------------------------------------------------------------------------------------- + +#pragma once + +/**************************************************************************** + * + * Matrix + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ + // Comparison operations + //------------------------------------------------------------------------------ + + //------------------------------------------------------------------------------ + +#if !defined(_XM_NO_INTRINSICS_) && defined(_MSC_VER) && !defined(__clang__) && !defined(__INTEL_COMPILER) +#pragma float_control(push) +#pragma float_control(precise, on) +#endif + +// Return true if any entry in the matrix is NaN +inline bool XM_CALLCONV XMMatrixIsNaN(FXMMATRIX M) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + size_t i = 16; + auto pWork = reinterpret_cast(&M.m[0][0]); + do { + // Fetch value into integer unit + uint32_t uTest = pWork[0]; + // Remove sign + uTest &= 0x7FFFFFFFU; + // NaN is 0x7F800001 through 0x7FFFFFFF inclusive + uTest -= 0x7F800001U; + if (uTest < 0x007FFFFFU) + { + break; // NaN found + } + ++pWork; // Next entry + } while (--i); + return (i != 0); // i == 0 if nothing matched +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Load in registers + float32x4_t vX = M.r[0]; + float32x4_t vY = M.r[1]; + float32x4_t vZ = M.r[2]; + float32x4_t vW = M.r[3]; + // Test themselves to check for NaN + uint32x4_t xmask = vmvnq_u32(vceqq_f32(vX, vX)); + uint32x4_t ymask = vmvnq_u32(vceqq_f32(vY, vY)); + uint32x4_t zmask = vmvnq_u32(vceqq_f32(vZ, vZ)); + uint32x4_t wmask = vmvnq_u32(vceqq_f32(vW, vW)); + // Or all the results + xmask = vorrq_u32(xmask, zmask); + ymask = vorrq_u32(ymask, wmask); + xmask = vorrq_u32(xmask, ymask); + // If any tested true, return true + uint8x8x2_t vTemp = vzip_u8( + vget_low_u8(vreinterpretq_u8_u32(xmask)), + 
vget_high_u8(vreinterpretq_u8_u32(xmask))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1); + return (r != 0); +#elif defined(_XM_SSE_INTRINSICS_) + // Load in registers + XMVECTOR vX = M.r[0]; + XMVECTOR vY = M.r[1]; + XMVECTOR vZ = M.r[2]; + XMVECTOR vW = M.r[3]; + // Test themselves to check for NaN + vX = _mm_cmpneq_ps(vX, vX); + vY = _mm_cmpneq_ps(vY, vY); + vZ = _mm_cmpneq_ps(vZ, vZ); + vW = _mm_cmpneq_ps(vW, vW); + // Or all the results + vX = _mm_or_ps(vX, vZ); + vY = _mm_or_ps(vY, vW); + vX = _mm_or_ps(vX, vY); + // If any tested true, return true + return (_mm_movemask_ps(vX) != 0); +#else +#endif +} + +#if !defined(_XM_NO_INTRINSICS_) && defined(_MSC_VER) && !defined(__clang__) && !defined(__INTEL_COMPILER) +#pragma float_control(pop) +#endif + +//------------------------------------------------------------------------------ + +// Return true if any entry in the matrix is +/-INF +inline bool XM_CALLCONV XMMatrixIsInfinite(FXMMATRIX M) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + size_t i = 16; + auto pWork = reinterpret_cast(&M.m[0][0]); + do { + // Fetch value into integer unit + uint32_t uTest = pWork[0]; + // Remove sign + uTest &= 0x7FFFFFFFU; + // INF is 0x7F800000 + if (uTest == 0x7F800000U) + { + break; // INF found + } + ++pWork; // Next entry + } while (--i); + return (i != 0); // i == 0 if nothing matched +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Load in registers + float32x4_t vX = M.r[0]; + float32x4_t vY = M.r[1]; + float32x4_t vZ = M.r[2]; + float32x4_t vW = M.r[3]; + // Mask off the sign bits + vX = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(vX), g_XMAbsMask)); + vY = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(vY), g_XMAbsMask)); + vZ = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(vZ), g_XMAbsMask)); + vW = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(vW), g_XMAbsMask)); + // Compare to infinity + uint32x4_t xmask = vceqq_f32(vX, g_XMInfinity); + uint32x4_t ymask = vceqq_f32(vY, g_XMInfinity); + uint32x4_t zmask = vceqq_f32(vZ, g_XMInfinity); + uint32x4_t wmask = vceqq_f32(vW, g_XMInfinity); + // Or the answers together + xmask = vorrq_u32(xmask, zmask); + ymask = vorrq_u32(ymask, wmask); + xmask = vorrq_u32(xmask, ymask); + // If any tested true, return true + uint8x8x2_t vTemp = vzip_u8( + vget_low_u8(vreinterpretq_u8_u32(xmask)), + vget_high_u8(vreinterpretq_u8_u32(xmask))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1); + return (r != 0); +#elif defined(_XM_SSE_INTRINSICS_) + // Mask off the sign bits + XMVECTOR vTemp1 = _mm_and_ps(M.r[0], g_XMAbsMask); + XMVECTOR vTemp2 = _mm_and_ps(M.r[1], g_XMAbsMask); + XMVECTOR vTemp3 = _mm_and_ps(M.r[2], g_XMAbsMask); + XMVECTOR vTemp4 = _mm_and_ps(M.r[3], g_XMAbsMask); + // Compare to infinity + vTemp1 = _mm_cmpeq_ps(vTemp1, g_XMInfinity); + vTemp2 = _mm_cmpeq_ps(vTemp2, g_XMInfinity); + vTemp3 = _mm_cmpeq_ps(vTemp3, g_XMInfinity); + vTemp4 = _mm_cmpeq_ps(vTemp4, g_XMInfinity); + // Or the answers together + vTemp1 = _mm_or_ps(vTemp1, vTemp2); + vTemp3 = _mm_or_ps(vTemp3, vTemp4); + vTemp1 = _mm_or_ps(vTemp1, vTemp3); + // If any are infinity, the signs are true. 
+ return (_mm_movemask_ps(vTemp1) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +// Return true if the XMMatrix is equal to identity +inline bool XM_CALLCONV XMMatrixIsIdentity(FXMMATRIX M) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + // Use the integer pipeline to reduce branching to a minimum + auto pWork = reinterpret_cast(&M.m[0][0]); + // Convert 1.0f to zero and or them together + uint32_t uOne = pWork[0] ^ 0x3F800000U; + // Or all the 0.0f entries together + uint32_t uZero = pWork[1]; + uZero |= pWork[2]; + uZero |= pWork[3]; + // 2nd row + uZero |= pWork[4]; + uOne |= pWork[5] ^ 0x3F800000U; + uZero |= pWork[6]; + uZero |= pWork[7]; + // 3rd row + uZero |= pWork[8]; + uZero |= pWork[9]; + uOne |= pWork[10] ^ 0x3F800000U; + uZero |= pWork[11]; + // 4th row + uZero |= pWork[12]; + uZero |= pWork[13]; + uZero |= pWork[14]; + uOne |= pWork[15] ^ 0x3F800000U; + // If all zero entries are zero, the uZero==0 + uZero &= 0x7FFFFFFF; // Allow -0.0f + // If all 1.0f entries are 1.0f, then uOne==0 + uOne |= uZero; + return (uOne == 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t xmask = vceqq_f32(M.r[0], g_XMIdentityR0); + uint32x4_t ymask = vceqq_f32(M.r[1], g_XMIdentityR1); + uint32x4_t zmask = vceqq_f32(M.r[2], g_XMIdentityR2); + uint32x4_t wmask = vceqq_f32(M.r[3], g_XMIdentityR3); + xmask = vandq_u32(xmask, zmask); + ymask = vandq_u32(ymask, wmask); + xmask = vandq_u32(xmask, ymask); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(xmask)), vget_high_u8(vreinterpretq_u8_u32(xmask))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1); + return (r == 0xFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp1 = _mm_cmpeq_ps(M.r[0], g_XMIdentityR0); + XMVECTOR vTemp2 = _mm_cmpeq_ps(M.r[1], g_XMIdentityR1); + XMVECTOR vTemp3 = _mm_cmpeq_ps(M.r[2], g_XMIdentityR2); + XMVECTOR vTemp4 = _mm_cmpeq_ps(M.r[3], g_XMIdentityR3); + vTemp1 = _mm_and_ps(vTemp1, vTemp2); + vTemp3 = _mm_and_ps(vTemp3, vTemp4); + vTemp1 = _mm_and_ps(vTemp1, vTemp3); + return (_mm_movemask_ps(vTemp1) == 0x0f); +#endif +} + +//------------------------------------------------------------------------------ +// Computation operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +// Perform a 4x4 matrix multiply by a 4x4 matrix +inline XMMATRIX XM_CALLCONV XMMatrixMultiply +( + FXMMATRIX M1, + CXMMATRIX M2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMMATRIX mResult; + // Cache the invariants in registers + float x = M1.m[0][0]; + float y = M1.m[0][1]; + float z = M1.m[0][2]; + float w = M1.m[0][3]; + // Perform the operation on the first row + mResult.m[0][0] = (M2.m[0][0] * x) + (M2.m[1][0] * y) + (M2.m[2][0] * z) + (M2.m[3][0] * w); + mResult.m[0][1] = (M2.m[0][1] * x) + (M2.m[1][1] * y) + (M2.m[2][1] * z) + (M2.m[3][1] * w); + mResult.m[0][2] = (M2.m[0][2] * x) + (M2.m[1][2] * y) + (M2.m[2][2] * z) + (M2.m[3][2] * w); + mResult.m[0][3] = (M2.m[0][3] * x) + (M2.m[1][3] * y) + (M2.m[2][3] * z) + (M2.m[3][3] * w); + // Repeat for all the other rows + x = M1.m[1][0]; + y = M1.m[1][1]; + z = M1.m[1][2]; + w = M1.m[1][3]; + mResult.m[1][0] = (M2.m[0][0] * x) + (M2.m[1][0] * y) + (M2.m[2][0] * z) + (M2.m[3][0] * w); + mResult.m[1][1] = (M2.m[0][1] * x) + (M2.m[1][1] * y) + (M2.m[2][1] * z) + 
(M2.m[3][1] * w); + mResult.m[1][2] = (M2.m[0][2] * x) + (M2.m[1][2] * y) + (M2.m[2][2] * z) + (M2.m[3][2] * w); + mResult.m[1][3] = (M2.m[0][3] * x) + (M2.m[1][3] * y) + (M2.m[2][3] * z) + (M2.m[3][3] * w); + x = M1.m[2][0]; + y = M1.m[2][1]; + z = M1.m[2][2]; + w = M1.m[2][3]; + mResult.m[2][0] = (M2.m[0][0] * x) + (M2.m[1][0] * y) + (M2.m[2][0] * z) + (M2.m[3][0] * w); + mResult.m[2][1] = (M2.m[0][1] * x) + (M2.m[1][1] * y) + (M2.m[2][1] * z) + (M2.m[3][1] * w); + mResult.m[2][2] = (M2.m[0][2] * x) + (M2.m[1][2] * y) + (M2.m[2][2] * z) + (M2.m[3][2] * w); + mResult.m[2][3] = (M2.m[0][3] * x) + (M2.m[1][3] * y) + (M2.m[2][3] * z) + (M2.m[3][3] * w); + x = M1.m[3][0]; + y = M1.m[3][1]; + z = M1.m[3][2]; + w = M1.m[3][3]; + mResult.m[3][0] = (M2.m[0][0] * x) + (M2.m[1][0] * y) + (M2.m[2][0] * z) + (M2.m[3][0] * w); + mResult.m[3][1] = (M2.m[0][1] * x) + (M2.m[1][1] * y) + (M2.m[2][1] * z) + (M2.m[3][1] * w); + mResult.m[3][2] = (M2.m[0][2] * x) + (M2.m[1][2] * y) + (M2.m[2][2] * z) + (M2.m[3][2] * w); + mResult.m[3][3] = (M2.m[0][3] * x) + (M2.m[1][3] * y) + (M2.m[2][3] * z) + (M2.m[3][3] * w); + return mResult; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMMATRIX mResult; + float32x2_t VL = vget_low_f32(M1.r[0]); + float32x2_t VH = vget_high_f32(M1.r[0]); + // Perform the operation on the first row + float32x4_t vX = vmulq_lane_f32(M2.r[0], VL, 0); + float32x4_t vY = vmulq_lane_f32(M2.r[1], VL, 1); + float32x4_t vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0); + float32x4_t vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1); + mResult.r[0] = vaddq_f32(vZ, vW); + // Repeat for the other 3 rows + VL = vget_low_f32(M1.r[1]); + VH = vget_high_f32(M1.r[1]); + vX = vmulq_lane_f32(M2.r[0], VL, 0); + vY = vmulq_lane_f32(M2.r[1], VL, 1); + vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0); + vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1); + mResult.r[1] = vaddq_f32(vZ, vW); + VL = vget_low_f32(M1.r[2]); + VH = vget_high_f32(M1.r[2]); + vX = vmulq_lane_f32(M2.r[0], VL, 0); + vY = vmulq_lane_f32(M2.r[1], VL, 1); + vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0); + vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1); + mResult.r[2] = vaddq_f32(vZ, vW); + VL = vget_low_f32(M1.r[3]); + VH = vget_high_f32(M1.r[3]); + vX = vmulq_lane_f32(M2.r[0], VL, 0); + vY = vmulq_lane_f32(M2.r[1], VL, 1); + vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0); + vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1); + mResult.r[3] = vaddq_f32(vZ, vW); + return mResult; +#elif defined(_XM_AVX2_INTRINSICS_) + __m256 t0 = _mm256_castps128_ps256(M1.r[0]); + t0 = _mm256_insertf128_ps(t0, M1.r[1], 1); + __m256 t1 = _mm256_castps128_ps256(M1.r[2]); + t1 = _mm256_insertf128_ps(t1, M1.r[3], 1); + + __m256 u0 = _mm256_castps128_ps256(M2.r[0]); + u0 = _mm256_insertf128_ps(u0, M2.r[1], 1); + __m256 u1 = _mm256_castps128_ps256(M2.r[2]); + u1 = _mm256_insertf128_ps(u1, M2.r[3], 1); + + __m256 a0 = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(0, 0, 0, 0)); + __m256 a1 = _mm256_shuffle_ps(t1, t1, _MM_SHUFFLE(0, 0, 0, 0)); + __m256 b0 = _mm256_permute2f128_ps(u0, u0, 0x00); + __m256 c0 = _mm256_mul_ps(a0, b0); + __m256 c1 = _mm256_mul_ps(a1, b0); + + a0 = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(1, 1, 1, 1)); + a1 = _mm256_shuffle_ps(t1, t1, _MM_SHUFFLE(1, 1, 1, 1)); + b0 = _mm256_permute2f128_ps(u0, u0, 0x11); + __m256 c2 = _mm256_fmadd_ps(a0, b0, c0); + __m256 c3 = _mm256_fmadd_ps(a1, b0, c1); + + a0 = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(2, 2, 2, 2)); + a1 = _mm256_shuffle_ps(t1, t1, _MM_SHUFFLE(2, 2, 2, 2)); + __m256 b1 = _mm256_permute2f128_ps(u1, u1, 0x00); + __m256 c4 = _mm256_mul_ps(a0, b1); + __m256 c5 = 
_mm256_mul_ps(a1, b1); + + a0 = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(3, 3, 3, 3)); + a1 = _mm256_shuffle_ps(t1, t1, _MM_SHUFFLE(3, 3, 3, 3)); + b1 = _mm256_permute2f128_ps(u1, u1, 0x11); + __m256 c6 = _mm256_fmadd_ps(a0, b1, c4); + __m256 c7 = _mm256_fmadd_ps(a1, b1, c5); + + t0 = _mm256_add_ps(c2, c6); + t1 = _mm256_add_ps(c3, c7); + + XMMATRIX mResult; + mResult.r[0] = _mm256_castps256_ps128(t0); + mResult.r[1] = _mm256_extractf128_ps(t0, 1); + mResult.r[2] = _mm256_castps256_ps128(t1); + mResult.r[3] = _mm256_extractf128_ps(t1, 1); + return mResult; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX mResult; + // Splat the component X,Y,Z then W +#if defined(_XM_AVX_INTRINSICS_) + XMVECTOR vX = _mm_broadcast_ss(reinterpret_cast(&M1.r[0]) + 0); + XMVECTOR vY = _mm_broadcast_ss(reinterpret_cast(&M1.r[0]) + 1); + XMVECTOR vZ = _mm_broadcast_ss(reinterpret_cast(&M1.r[0]) + 2); + XMVECTOR vW = _mm_broadcast_ss(reinterpret_cast(&M1.r[0]) + 3); +#else + // Use vW to hold the original row + XMVECTOR vW = M1.r[0]; + XMVECTOR vX = XM_PERMUTE_PS(vW, _MM_SHUFFLE(0, 0, 0, 0)); + XMVECTOR vY = XM_PERMUTE_PS(vW, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR vZ = XM_PERMUTE_PS(vW, _MM_SHUFFLE(2, 2, 2, 2)); + vW = XM_PERMUTE_PS(vW, _MM_SHUFFLE(3, 3, 3, 3)); +#endif + // Perform the operation on the first row + vX = _mm_mul_ps(vX, M2.r[0]); + vY = _mm_mul_ps(vY, M2.r[1]); + vZ = _mm_mul_ps(vZ, M2.r[2]); + vW = _mm_mul_ps(vW, M2.r[3]); + // Perform a binary add to reduce cumulative errors + vX = _mm_add_ps(vX, vZ); + vY = _mm_add_ps(vY, vW); + vX = _mm_add_ps(vX, vY); + mResult.r[0] = vX; + // Repeat for the other 3 rows +#if defined(_XM_AVX_INTRINSICS_) + vX = _mm_broadcast_ss(reinterpret_cast(&M1.r[1]) + 0); + vY = _mm_broadcast_ss(reinterpret_cast(&M1.r[1]) + 1); + vZ = _mm_broadcast_ss(reinterpret_cast(&M1.r[1]) + 2); + vW = _mm_broadcast_ss(reinterpret_cast(&M1.r[1]) + 3); +#else + vW = M1.r[1]; + vX = XM_PERMUTE_PS(vW, _MM_SHUFFLE(0, 0, 0, 0)); + vY = XM_PERMUTE_PS(vW, _MM_SHUFFLE(1, 1, 1, 1)); + vZ = XM_PERMUTE_PS(vW, _MM_SHUFFLE(2, 2, 2, 2)); + vW = XM_PERMUTE_PS(vW, _MM_SHUFFLE(3, 3, 3, 3)); +#endif + vX = _mm_mul_ps(vX, M2.r[0]); + vY = _mm_mul_ps(vY, M2.r[1]); + vZ = _mm_mul_ps(vZ, M2.r[2]); + vW = _mm_mul_ps(vW, M2.r[3]); + vX = _mm_add_ps(vX, vZ); + vY = _mm_add_ps(vY, vW); + vX = _mm_add_ps(vX, vY); + mResult.r[1] = vX; +#if defined(_XM_AVX_INTRINSICS_) + vX = _mm_broadcast_ss(reinterpret_cast(&M1.r[2]) + 0); + vY = _mm_broadcast_ss(reinterpret_cast(&M1.r[2]) + 1); + vZ = _mm_broadcast_ss(reinterpret_cast(&M1.r[2]) + 2); + vW = _mm_broadcast_ss(reinterpret_cast(&M1.r[2]) + 3); +#else + vW = M1.r[2]; + vX = XM_PERMUTE_PS(vW, _MM_SHUFFLE(0, 0, 0, 0)); + vY = XM_PERMUTE_PS(vW, _MM_SHUFFLE(1, 1, 1, 1)); + vZ = XM_PERMUTE_PS(vW, _MM_SHUFFLE(2, 2, 2, 2)); + vW = XM_PERMUTE_PS(vW, _MM_SHUFFLE(3, 3, 3, 3)); +#endif + vX = _mm_mul_ps(vX, M2.r[0]); + vY = _mm_mul_ps(vY, M2.r[1]); + vZ = _mm_mul_ps(vZ, M2.r[2]); + vW = _mm_mul_ps(vW, M2.r[3]); + vX = _mm_add_ps(vX, vZ); + vY = _mm_add_ps(vY, vW); + vX = _mm_add_ps(vX, vY); + mResult.r[2] = vX; +#if defined(_XM_AVX_INTRINSICS_) + vX = _mm_broadcast_ss(reinterpret_cast(&M1.r[3]) + 0); + vY = _mm_broadcast_ss(reinterpret_cast(&M1.r[3]) + 1); + vZ = _mm_broadcast_ss(reinterpret_cast(&M1.r[3]) + 2); + vW = _mm_broadcast_ss(reinterpret_cast(&M1.r[3]) + 3); +#else + vW = M1.r[3]; + vX = XM_PERMUTE_PS(vW, _MM_SHUFFLE(0, 0, 0, 0)); + vY = XM_PERMUTE_PS(vW, _MM_SHUFFLE(1, 1, 1, 1)); + vZ = XM_PERMUTE_PS(vW, _MM_SHUFFLE(2, 2, 2, 2)); + vW = XM_PERMUTE_PS(vW, _MM_SHUFFLE(3, 
3, 3, 3)); +#endif + vX = _mm_mul_ps(vX, M2.r[0]); + vY = _mm_mul_ps(vY, M2.r[1]); + vZ = _mm_mul_ps(vZ, M2.r[2]); + vW = _mm_mul_ps(vW, M2.r[3]); + vX = _mm_add_ps(vX, vZ); + vY = _mm_add_ps(vY, vW); + vX = _mm_add_ps(vX, vY); + mResult.r[3] = vX; + return mResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixMultiplyTranspose +( + FXMMATRIX M1, + CXMMATRIX M2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMMATRIX mResult; + // Cache the invariants in registers + float x = M2.m[0][0]; + float y = M2.m[1][0]; + float z = M2.m[2][0]; + float w = M2.m[3][0]; + // Perform the operation on the first row + mResult.m[0][0] = (M1.m[0][0] * x) + (M1.m[0][1] * y) + (M1.m[0][2] * z) + (M1.m[0][3] * w); + mResult.m[0][1] = (M1.m[1][0] * x) + (M1.m[1][1] * y) + (M1.m[1][2] * z) + (M1.m[1][3] * w); + mResult.m[0][2] = (M1.m[2][0] * x) + (M1.m[2][1] * y) + (M1.m[2][2] * z) + (M1.m[2][3] * w); + mResult.m[0][3] = (M1.m[3][0] * x) + (M1.m[3][1] * y) + (M1.m[3][2] * z) + (M1.m[3][3] * w); + // Repeat for all the other rows + x = M2.m[0][1]; + y = M2.m[1][1]; + z = M2.m[2][1]; + w = M2.m[3][1]; + mResult.m[1][0] = (M1.m[0][0] * x) + (M1.m[0][1] * y) + (M1.m[0][2] * z) + (M1.m[0][3] * w); + mResult.m[1][1] = (M1.m[1][0] * x) + (M1.m[1][1] * y) + (M1.m[1][2] * z) + (M1.m[1][3] * w); + mResult.m[1][2] = (M1.m[2][0] * x) + (M1.m[2][1] * y) + (M1.m[2][2] * z) + (M1.m[2][3] * w); + mResult.m[1][3] = (M1.m[3][0] * x) + (M1.m[3][1] * y) + (M1.m[3][2] * z) + (M1.m[3][3] * w); + x = M2.m[0][2]; + y = M2.m[1][2]; + z = M2.m[2][2]; + w = M2.m[3][2]; + mResult.m[2][0] = (M1.m[0][0] * x) + (M1.m[0][1] * y) + (M1.m[0][2] * z) + (M1.m[0][3] * w); + mResult.m[2][1] = (M1.m[1][0] * x) + (M1.m[1][1] * y) + (M1.m[1][2] * z) + (M1.m[1][3] * w); + mResult.m[2][2] = (M1.m[2][0] * x) + (M1.m[2][1] * y) + (M1.m[2][2] * z) + (M1.m[2][3] * w); + mResult.m[2][3] = (M1.m[3][0] * x) + (M1.m[3][1] * y) + (M1.m[3][2] * z) + (M1.m[3][3] * w); + x = M2.m[0][3]; + y = M2.m[1][3]; + z = M2.m[2][3]; + w = M2.m[3][3]; + mResult.m[3][0] = (M1.m[0][0] * x) + (M1.m[0][1] * y) + (M1.m[0][2] * z) + (M1.m[0][3] * w); + mResult.m[3][1] = (M1.m[1][0] * x) + (M1.m[1][1] * y) + (M1.m[1][2] * z) + (M1.m[1][3] * w); + mResult.m[3][2] = (M1.m[2][0] * x) + (M1.m[2][1] * y) + (M1.m[2][2] * z) + (M1.m[2][3] * w); + mResult.m[3][3] = (M1.m[3][0] * x) + (M1.m[3][1] * y) + (M1.m[3][2] * z) + (M1.m[3][3] * w); + return mResult; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(M1.r[0]); + float32x2_t VH = vget_high_f32(M1.r[0]); + // Perform the operation on the first row + float32x4_t vX = vmulq_lane_f32(M2.r[0], VL, 0); + float32x4_t vY = vmulq_lane_f32(M2.r[1], VL, 1); + float32x4_t vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0); + float32x4_t vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1); + float32x4_t r0 = vaddq_f32(vZ, vW); + // Repeat for the other 3 rows + VL = vget_low_f32(M1.r[1]); + VH = vget_high_f32(M1.r[1]); + vX = vmulq_lane_f32(M2.r[0], VL, 0); + vY = vmulq_lane_f32(M2.r[1], VL, 1); + vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0); + vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1); + float32x4_t r1 = vaddq_f32(vZ, vW); + VL = vget_low_f32(M1.r[2]); + VH = vget_high_f32(M1.r[2]); + vX = vmulq_lane_f32(M2.r[0], VL, 0); + vY = vmulq_lane_f32(M2.r[1], VL, 1); + vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0); + vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1); + float32x4_t r2 = vaddq_f32(vZ, vW); + VL = vget_low_f32(M1.r[3]); + VH = vget_high_f32(M1.r[3]); + vX = 
vmulq_lane_f32(M2.r[0], VL, 0); + vY = vmulq_lane_f32(M2.r[1], VL, 1); + vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0); + vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1); + float32x4_t r3 = vaddq_f32(vZ, vW); + + // Transpose result + float32x4x2_t P0 = vzipq_f32(r0, r2); + float32x4x2_t P1 = vzipq_f32(r1, r3); + + float32x4x2_t T0 = vzipq_f32(P0.val[0], P1.val[0]); + float32x4x2_t T1 = vzipq_f32(P0.val[1], P1.val[1]); + + XMMATRIX mResult; + mResult.r[0] = T0.val[0]; + mResult.r[1] = T0.val[1]; + mResult.r[2] = T1.val[0]; + mResult.r[3] = T1.val[1]; + return mResult; +#elif defined(_XM_AVX2_INTRINSICS_) + __m256 t0 = _mm256_castps128_ps256(M1.r[0]); + t0 = _mm256_insertf128_ps(t0, M1.r[1], 1); + __m256 t1 = _mm256_castps128_ps256(M1.r[2]); + t1 = _mm256_insertf128_ps(t1, M1.r[3], 1); + + __m256 u0 = _mm256_castps128_ps256(M2.r[0]); + u0 = _mm256_insertf128_ps(u0, M2.r[1], 1); + __m256 u1 = _mm256_castps128_ps256(M2.r[2]); + u1 = _mm256_insertf128_ps(u1, M2.r[3], 1); + + __m256 a0 = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(0, 0, 0, 0)); + __m256 a1 = _mm256_shuffle_ps(t1, t1, _MM_SHUFFLE(0, 0, 0, 0)); + __m256 b0 = _mm256_permute2f128_ps(u0, u0, 0x00); + __m256 c0 = _mm256_mul_ps(a0, b0); + __m256 c1 = _mm256_mul_ps(a1, b0); + + a0 = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(1, 1, 1, 1)); + a1 = _mm256_shuffle_ps(t1, t1, _MM_SHUFFLE(1, 1, 1, 1)); + b0 = _mm256_permute2f128_ps(u0, u0, 0x11); + __m256 c2 = _mm256_fmadd_ps(a0, b0, c0); + __m256 c3 = _mm256_fmadd_ps(a1, b0, c1); + + a0 = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(2, 2, 2, 2)); + a1 = _mm256_shuffle_ps(t1, t1, _MM_SHUFFLE(2, 2, 2, 2)); + __m256 b1 = _mm256_permute2f128_ps(u1, u1, 0x00); + __m256 c4 = _mm256_mul_ps(a0, b1); + __m256 c5 = _mm256_mul_ps(a1, b1); + + a0 = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(3, 3, 3, 3)); + a1 = _mm256_shuffle_ps(t1, t1, _MM_SHUFFLE(3, 3, 3, 3)); + b1 = _mm256_permute2f128_ps(u1, u1, 0x11); + __m256 c6 = _mm256_fmadd_ps(a0, b1, c4); + __m256 c7 = _mm256_fmadd_ps(a1, b1, c5); + + t0 = _mm256_add_ps(c2, c6); + t1 = _mm256_add_ps(c3, c7); + + // Transpose result + __m256 vTemp = _mm256_unpacklo_ps(t0, t1); + __m256 vTemp2 = _mm256_unpackhi_ps(t0, t1); + __m256 vTemp3 = _mm256_permute2f128_ps(vTemp, vTemp2, 0x20); + __m256 vTemp4 = _mm256_permute2f128_ps(vTemp, vTemp2, 0x31); + vTemp = _mm256_unpacklo_ps(vTemp3, vTemp4); + vTemp2 = _mm256_unpackhi_ps(vTemp3, vTemp4); + t0 = _mm256_permute2f128_ps(vTemp, vTemp2, 0x20); + t1 = _mm256_permute2f128_ps(vTemp, vTemp2, 0x31); + + XMMATRIX mResult; + mResult.r[0] = _mm256_castps256_ps128(t0); + mResult.r[1] = _mm256_extractf128_ps(t0, 1); + mResult.r[2] = _mm256_castps256_ps128(t1); + mResult.r[3] = _mm256_extractf128_ps(t1, 1); + return mResult; +#elif defined(_XM_SSE_INTRINSICS_) + // Splat the component X,Y,Z then W +#if defined(_XM_AVX_INTRINSICS_) + XMVECTOR vX = _mm_broadcast_ss(reinterpret_cast(&M1.r[0]) + 0); + XMVECTOR vY = _mm_broadcast_ss(reinterpret_cast(&M1.r[0]) + 1); + XMVECTOR vZ = _mm_broadcast_ss(reinterpret_cast(&M1.r[0]) + 2); + XMVECTOR vW = _mm_broadcast_ss(reinterpret_cast(&M1.r[0]) + 3); +#else + // Use vW to hold the original row + XMVECTOR vW = M1.r[0]; + XMVECTOR vX = XM_PERMUTE_PS(vW, _MM_SHUFFLE(0, 0, 0, 0)); + XMVECTOR vY = XM_PERMUTE_PS(vW, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR vZ = XM_PERMUTE_PS(vW, _MM_SHUFFLE(2, 2, 2, 2)); + vW = XM_PERMUTE_PS(vW, _MM_SHUFFLE(3, 3, 3, 3)); +#endif + // Perform the operation on the first row + vX = _mm_mul_ps(vX, M2.r[0]); + vY = _mm_mul_ps(vY, M2.r[1]); + vZ = _mm_mul_ps(vZ, M2.r[2]); + vW = _mm_mul_ps(vW, M2.r[3]); + 
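+    // Note on the summation order used below: the four partial products are combined as
+    // (vX + vZ) + (vY + vW), a balanced pair-wise tree rather than a single running sum,
+    // which shortens the dependency chain and limits how rounding error accumulates.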
// Perform a binary add to reduce cumulative errors + vX = _mm_add_ps(vX, vZ); + vY = _mm_add_ps(vY, vW); + vX = _mm_add_ps(vX, vY); + XMVECTOR r0 = vX; + // Repeat for the other 3 rows +#if defined(_XM_AVX_INTRINSICS_) + vX = _mm_broadcast_ss(reinterpret_cast(&M1.r[1]) + 0); + vY = _mm_broadcast_ss(reinterpret_cast(&M1.r[1]) + 1); + vZ = _mm_broadcast_ss(reinterpret_cast(&M1.r[1]) + 2); + vW = _mm_broadcast_ss(reinterpret_cast(&M1.r[1]) + 3); +#else + vW = M1.r[1]; + vX = XM_PERMUTE_PS(vW, _MM_SHUFFLE(0, 0, 0, 0)); + vY = XM_PERMUTE_PS(vW, _MM_SHUFFLE(1, 1, 1, 1)); + vZ = XM_PERMUTE_PS(vW, _MM_SHUFFLE(2, 2, 2, 2)); + vW = XM_PERMUTE_PS(vW, _MM_SHUFFLE(3, 3, 3, 3)); +#endif + vX = _mm_mul_ps(vX, M2.r[0]); + vY = _mm_mul_ps(vY, M2.r[1]); + vZ = _mm_mul_ps(vZ, M2.r[2]); + vW = _mm_mul_ps(vW, M2.r[3]); + vX = _mm_add_ps(vX, vZ); + vY = _mm_add_ps(vY, vW); + vX = _mm_add_ps(vX, vY); + XMVECTOR r1 = vX; +#if defined(_XM_AVX_INTRINSICS_) + vX = _mm_broadcast_ss(reinterpret_cast(&M1.r[2]) + 0); + vY = _mm_broadcast_ss(reinterpret_cast(&M1.r[2]) + 1); + vZ = _mm_broadcast_ss(reinterpret_cast(&M1.r[2]) + 2); + vW = _mm_broadcast_ss(reinterpret_cast(&M1.r[2]) + 3); +#else + vW = M1.r[2]; + vX = XM_PERMUTE_PS(vW, _MM_SHUFFLE(0, 0, 0, 0)); + vY = XM_PERMUTE_PS(vW, _MM_SHUFFLE(1, 1, 1, 1)); + vZ = XM_PERMUTE_PS(vW, _MM_SHUFFLE(2, 2, 2, 2)); + vW = XM_PERMUTE_PS(vW, _MM_SHUFFLE(3, 3, 3, 3)); +#endif + vX = _mm_mul_ps(vX, M2.r[0]); + vY = _mm_mul_ps(vY, M2.r[1]); + vZ = _mm_mul_ps(vZ, M2.r[2]); + vW = _mm_mul_ps(vW, M2.r[3]); + vX = _mm_add_ps(vX, vZ); + vY = _mm_add_ps(vY, vW); + vX = _mm_add_ps(vX, vY); + XMVECTOR r2 = vX; +#if defined(_XM_AVX_INTRINSICS_) + vX = _mm_broadcast_ss(reinterpret_cast(&M1.r[3]) + 0); + vY = _mm_broadcast_ss(reinterpret_cast(&M1.r[3]) + 1); + vZ = _mm_broadcast_ss(reinterpret_cast(&M1.r[3]) + 2); + vW = _mm_broadcast_ss(reinterpret_cast(&M1.r[3]) + 3); +#else + vW = M1.r[3]; + vX = XM_PERMUTE_PS(vW, _MM_SHUFFLE(0, 0, 0, 0)); + vY = XM_PERMUTE_PS(vW, _MM_SHUFFLE(1, 1, 1, 1)); + vZ = XM_PERMUTE_PS(vW, _MM_SHUFFLE(2, 2, 2, 2)); + vW = XM_PERMUTE_PS(vW, _MM_SHUFFLE(3, 3, 3, 3)); +#endif + vX = _mm_mul_ps(vX, M2.r[0]); + vY = _mm_mul_ps(vY, M2.r[1]); + vZ = _mm_mul_ps(vZ, M2.r[2]); + vW = _mm_mul_ps(vW, M2.r[3]); + vX = _mm_add_ps(vX, vZ); + vY = _mm_add_ps(vY, vW); + vX = _mm_add_ps(vX, vY); + XMVECTOR r3 = vX; + + // Transpose result + // x.x,x.y,y.x,y.y + XMVECTOR vTemp1 = _mm_shuffle_ps(r0, r1, _MM_SHUFFLE(1, 0, 1, 0)); + // x.z,x.w,y.z,y.w + XMVECTOR vTemp3 = _mm_shuffle_ps(r0, r1, _MM_SHUFFLE(3, 2, 3, 2)); + // z.x,z.y,w.x,w.y + XMVECTOR vTemp2 = _mm_shuffle_ps(r2, r3, _MM_SHUFFLE(1, 0, 1, 0)); + // z.z,z.w,w.z,w.w + XMVECTOR vTemp4 = _mm_shuffle_ps(r2, r3, _MM_SHUFFLE(3, 2, 3, 2)); + + XMMATRIX mResult; + // x.x,y.x,z.x,w.x + mResult.r[0] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(2, 0, 2, 0)); + // x.y,y.y,z.y,w.y + mResult.r[1] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(3, 1, 3, 1)); + // x.z,y.z,z.z,w.z + mResult.r[2] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(2, 0, 2, 0)); + // x.w,y.w,z.w,w.w + mResult.r[3] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(3, 1, 3, 1)); + return mResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixTranspose(FXMMATRIX M) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + // Original matrix: + // + // m00m01m02m03 + // m10m11m12m13 + // m20m21m22m23 + // m30m31m32m33 + + XMMATRIX P; + P.r[0] = XMVectorMergeXY(M.r[0], M.r[2]); // m00m20m01m21 + 
P.r[1] = XMVectorMergeXY(M.r[1], M.r[3]); // m10m30m11m31 + P.r[2] = XMVectorMergeZW(M.r[0], M.r[2]); // m02m22m03m23 + P.r[3] = XMVectorMergeZW(M.r[1], M.r[3]); // m12m32m13m33 + + XMMATRIX MT; + MT.r[0] = XMVectorMergeXY(P.r[0], P.r[1]); // m00m10m20m30 + MT.r[1] = XMVectorMergeZW(P.r[0], P.r[1]); // m01m11m21m31 + MT.r[2] = XMVectorMergeXY(P.r[2], P.r[3]); // m02m12m22m32 + MT.r[3] = XMVectorMergeZW(P.r[2], P.r[3]); // m03m13m23m33 + return MT; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4x2_t P0 = vzipq_f32(M.r[0], M.r[2]); + float32x4x2_t P1 = vzipq_f32(M.r[1], M.r[3]); + + float32x4x2_t T0 = vzipq_f32(P0.val[0], P1.val[0]); + float32x4x2_t T1 = vzipq_f32(P0.val[1], P1.val[1]); + + XMMATRIX mResult; + mResult.r[0] = T0.val[0]; + mResult.r[1] = T0.val[1]; + mResult.r[2] = T1.val[0]; + mResult.r[3] = T1.val[1]; + return mResult; +#elif defined(_XM_AVX2_INTRINSICS_) + __m256 t0 = _mm256_castps128_ps256(M.r[0]); + t0 = _mm256_insertf128_ps(t0, M.r[1], 1); + __m256 t1 = _mm256_castps128_ps256(M.r[2]); + t1 = _mm256_insertf128_ps(t1, M.r[3], 1); + + __m256 vTemp = _mm256_unpacklo_ps(t0, t1); + __m256 vTemp2 = _mm256_unpackhi_ps(t0, t1); + __m256 vTemp3 = _mm256_permute2f128_ps(vTemp, vTemp2, 0x20); + __m256 vTemp4 = _mm256_permute2f128_ps(vTemp, vTemp2, 0x31); + vTemp = _mm256_unpacklo_ps(vTemp3, vTemp4); + vTemp2 = _mm256_unpackhi_ps(vTemp3, vTemp4); + t0 = _mm256_permute2f128_ps(vTemp, vTemp2, 0x20); + t1 = _mm256_permute2f128_ps(vTemp, vTemp2, 0x31); + + XMMATRIX mResult; + mResult.r[0] = _mm256_castps256_ps128(t0); + mResult.r[1] = _mm256_extractf128_ps(t0, 1); + mResult.r[2] = _mm256_castps256_ps128(t1); + mResult.r[3] = _mm256_extractf128_ps(t1, 1); + return mResult; +#elif defined(_XM_SSE_INTRINSICS_) + // x.x,x.y,y.x,y.y + XMVECTOR vTemp1 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(1, 0, 1, 0)); + // x.z,x.w,y.z,y.w + XMVECTOR vTemp3 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(3, 2, 3, 2)); + // z.x,z.y,w.x,w.y + XMVECTOR vTemp2 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(1, 0, 1, 0)); + // z.z,z.w,w.z,w.w + XMVECTOR vTemp4 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(3, 2, 3, 2)); + + XMMATRIX mResult; + // x.x,y.x,z.x,w.x + mResult.r[0] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(2, 0, 2, 0)); + // x.y,y.y,z.y,w.y + mResult.r[1] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(3, 1, 3, 1)); + // x.z,y.z,z.z,w.z + mResult.r[2] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(2, 0, 2, 0)); + // x.w,y.w,z.w,w.w + mResult.r[3] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(3, 1, 3, 1)); + return mResult; +#endif +} + +//------------------------------------------------------------------------------ +// Return the inverse and the determinant of a 4x4 matrix +_Use_decl_annotations_ +inline XMMATRIX XM_CALLCONV XMMatrixInverse +( + XMVECTOR* pDeterminant, + FXMMATRIX M +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + + XMMATRIX MT = XMMatrixTranspose(M); + + XMVECTOR V0[4], V1[4]; + V0[0] = XMVectorSwizzle(MT.r[2]); + V1[0] = XMVectorSwizzle(MT.r[3]); + V0[1] = XMVectorSwizzle(MT.r[0]); + V1[1] = XMVectorSwizzle(MT.r[1]); + V0[2] = XMVectorPermute(MT.r[2], MT.r[0]); + V1[2] = XMVectorPermute(MT.r[3], MT.r[1]); + + XMVECTOR D0 = XMVectorMultiply(V0[0], V1[0]); + XMVECTOR D1 = XMVectorMultiply(V0[1], V1[1]); + XMVECTOR D2 = XMVectorMultiply(V0[2], V1[2]); + + V0[0] = XMVectorSwizzle(MT.r[2]); + V1[0] = XMVectorSwizzle(MT.r[3]); + V0[1] = XMVectorSwizzle(MT.r[0]); + V1[1] = XMVectorSwizzle(MT.r[1]); + V0[2] = XMVectorPermute(MT.r[2], MT.r[0]); + V1[2] 
= XMVectorPermute(MT.r[3], MT.r[1]); + + D0 = XMVectorNegativeMultiplySubtract(V0[0], V1[0], D0); + D1 = XMVectorNegativeMultiplySubtract(V0[1], V1[1], D1); + D2 = XMVectorNegativeMultiplySubtract(V0[2], V1[2], D2); + + V0[0] = XMVectorSwizzle(MT.r[1]); + V1[0] = XMVectorPermute(D0, D2); + V0[1] = XMVectorSwizzle(MT.r[0]); + V1[1] = XMVectorPermute(D0, D2); + V0[2] = XMVectorSwizzle(MT.r[3]); + V1[2] = XMVectorPermute(D1, D2); + V0[3] = XMVectorSwizzle(MT.r[2]); + V1[3] = XMVectorPermute(D1, D2); + + XMVECTOR C0 = XMVectorMultiply(V0[0], V1[0]); + XMVECTOR C2 = XMVectorMultiply(V0[1], V1[1]); + XMVECTOR C4 = XMVectorMultiply(V0[2], V1[2]); + XMVECTOR C6 = XMVectorMultiply(V0[3], V1[3]); + + V0[0] = XMVectorSwizzle(MT.r[1]); + V1[0] = XMVectorPermute(D0, D2); + V0[1] = XMVectorSwizzle(MT.r[0]); + V1[1] = XMVectorPermute(D0, D2); + V0[2] = XMVectorSwizzle(MT.r[3]); + V1[2] = XMVectorPermute(D1, D2); + V0[3] = XMVectorSwizzle(MT.r[2]); + V1[3] = XMVectorPermute(D1, D2); + + C0 = XMVectorNegativeMultiplySubtract(V0[0], V1[0], C0); + C2 = XMVectorNegativeMultiplySubtract(V0[1], V1[1], C2); + C4 = XMVectorNegativeMultiplySubtract(V0[2], V1[2], C4); + C6 = XMVectorNegativeMultiplySubtract(V0[3], V1[3], C6); + + V0[0] = XMVectorSwizzle(MT.r[1]); + V1[0] = XMVectorPermute(D0, D2); + V0[1] = XMVectorSwizzle(MT.r[0]); + V1[1] = XMVectorPermute(D0, D2); + V0[2] = XMVectorSwizzle(MT.r[3]); + V1[2] = XMVectorPermute(D1, D2); + V0[3] = XMVectorSwizzle(MT.r[2]); + V1[3] = XMVectorPermute(D1, D2); + + XMVECTOR C1 = XMVectorNegativeMultiplySubtract(V0[0], V1[0], C0); + C0 = XMVectorMultiplyAdd(V0[0], V1[0], C0); + XMVECTOR C3 = XMVectorMultiplyAdd(V0[1], V1[1], C2); + C2 = XMVectorNegativeMultiplySubtract(V0[1], V1[1], C2); + XMVECTOR C5 = XMVectorNegativeMultiplySubtract(V0[2], V1[2], C4); + C4 = XMVectorMultiplyAdd(V0[2], V1[2], C4); + XMVECTOR C7 = XMVectorMultiplyAdd(V0[3], V1[3], C6); + C6 = XMVectorNegativeMultiplySubtract(V0[3], V1[3], C6); + + XMMATRIX R; + R.r[0] = XMVectorSelect(C0, C1, g_XMSelect0101.v); + R.r[1] = XMVectorSelect(C2, C3, g_XMSelect0101.v); + R.r[2] = XMVectorSelect(C4, C5, g_XMSelect0101.v); + R.r[3] = XMVectorSelect(C6, C7, g_XMSelect0101.v); + + XMVECTOR Determinant = XMVector4Dot(R.r[0], MT.r[0]); + + if (pDeterminant != nullptr) + *pDeterminant = Determinant; + + XMVECTOR Reciprocal = XMVectorReciprocal(Determinant); + + XMMATRIX Result; + Result.r[0] = XMVectorMultiply(R.r[0], Reciprocal); + Result.r[1] = XMVectorMultiply(R.r[1], Reciprocal); + Result.r[2] = XMVectorMultiply(R.r[2], Reciprocal); + Result.r[3] = XMVectorMultiply(R.r[3], Reciprocal); + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + // Transpose matrix + XMVECTOR vTemp1 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(1, 0, 1, 0)); + XMVECTOR vTemp3 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(3, 2, 3, 2)); + XMVECTOR vTemp2 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(1, 0, 1, 0)); + XMVECTOR vTemp4 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(3, 2, 3, 2)); + + XMMATRIX MT; + MT.r[0] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(2, 0, 2, 0)); + MT.r[1] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(3, 1, 3, 1)); + MT.r[2] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(2, 0, 2, 0)); + MT.r[3] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(3, 1, 3, 1)); + + XMVECTOR V00 = XM_PERMUTE_PS(MT.r[2], _MM_SHUFFLE(1, 1, 0, 0)); + XMVECTOR V10 = XM_PERMUTE_PS(MT.r[3], _MM_SHUFFLE(3, 2, 3, 2)); + XMVECTOR V01 = XM_PERMUTE_PS(MT.r[0], _MM_SHUFFLE(1, 1, 0, 0)); + XMVECTOR V11 = XM_PERMUTE_PS(MT.r[1], _MM_SHUFFLE(3, 2, 3, 2)); 
+ XMVECTOR V02 = _mm_shuffle_ps(MT.r[2], MT.r[0], _MM_SHUFFLE(2, 0, 2, 0)); + XMVECTOR V12 = _mm_shuffle_ps(MT.r[3], MT.r[1], _MM_SHUFFLE(3, 1, 3, 1)); + + XMVECTOR D0 = _mm_mul_ps(V00, V10); + XMVECTOR D1 = _mm_mul_ps(V01, V11); + XMVECTOR D2 = _mm_mul_ps(V02, V12); + + V00 = XM_PERMUTE_PS(MT.r[2], _MM_SHUFFLE(3, 2, 3, 2)); + V10 = XM_PERMUTE_PS(MT.r[3], _MM_SHUFFLE(1, 1, 0, 0)); + V01 = XM_PERMUTE_PS(MT.r[0], _MM_SHUFFLE(3, 2, 3, 2)); + V11 = XM_PERMUTE_PS(MT.r[1], _MM_SHUFFLE(1, 1, 0, 0)); + V02 = _mm_shuffle_ps(MT.r[2], MT.r[0], _MM_SHUFFLE(3, 1, 3, 1)); + V12 = _mm_shuffle_ps(MT.r[3], MT.r[1], _MM_SHUFFLE(2, 0, 2, 0)); + + D0 = XM_FNMADD_PS(V00, V10, D0); + D1 = XM_FNMADD_PS(V01, V11, D1); + D2 = XM_FNMADD_PS(V02, V12, D2); + // V11 = D0Y,D0W,D2Y,D2Y + V11 = _mm_shuffle_ps(D0, D2, _MM_SHUFFLE(1, 1, 3, 1)); + V00 = XM_PERMUTE_PS(MT.r[1], _MM_SHUFFLE(1, 0, 2, 1)); + V10 = _mm_shuffle_ps(V11, D0, _MM_SHUFFLE(0, 3, 0, 2)); + V01 = XM_PERMUTE_PS(MT.r[0], _MM_SHUFFLE(0, 1, 0, 2)); + V11 = _mm_shuffle_ps(V11, D0, _MM_SHUFFLE(2, 1, 2, 1)); + // V13 = D1Y,D1W,D2W,D2W + XMVECTOR V13 = _mm_shuffle_ps(D1, D2, _MM_SHUFFLE(3, 3, 3, 1)); + V02 = XM_PERMUTE_PS(MT.r[3], _MM_SHUFFLE(1, 0, 2, 1)); + V12 = _mm_shuffle_ps(V13, D1, _MM_SHUFFLE(0, 3, 0, 2)); + XMVECTOR V03 = XM_PERMUTE_PS(MT.r[2], _MM_SHUFFLE(0, 1, 0, 2)); + V13 = _mm_shuffle_ps(V13, D1, _MM_SHUFFLE(2, 1, 2, 1)); + + XMVECTOR C0 = _mm_mul_ps(V00, V10); + XMVECTOR C2 = _mm_mul_ps(V01, V11); + XMVECTOR C4 = _mm_mul_ps(V02, V12); + XMVECTOR C6 = _mm_mul_ps(V03, V13); + + // V11 = D0X,D0Y,D2X,D2X + V11 = _mm_shuffle_ps(D0, D2, _MM_SHUFFLE(0, 0, 1, 0)); + V00 = XM_PERMUTE_PS(MT.r[1], _MM_SHUFFLE(2, 1, 3, 2)); + V10 = _mm_shuffle_ps(D0, V11, _MM_SHUFFLE(2, 1, 0, 3)); + V01 = XM_PERMUTE_PS(MT.r[0], _MM_SHUFFLE(1, 3, 2, 3)); + V11 = _mm_shuffle_ps(D0, V11, _MM_SHUFFLE(0, 2, 1, 2)); + // V13 = D1X,D1Y,D2Z,D2Z + V13 = _mm_shuffle_ps(D1, D2, _MM_SHUFFLE(2, 2, 1, 0)); + V02 = XM_PERMUTE_PS(MT.r[3], _MM_SHUFFLE(2, 1, 3, 2)); + V12 = _mm_shuffle_ps(D1, V13, _MM_SHUFFLE(2, 1, 0, 3)); + V03 = XM_PERMUTE_PS(MT.r[2], _MM_SHUFFLE(1, 3, 2, 3)); + V13 = _mm_shuffle_ps(D1, V13, _MM_SHUFFLE(0, 2, 1, 2)); + + C0 = XM_FNMADD_PS(V00, V10, C0); + C2 = XM_FNMADD_PS(V01, V11, C2); + C4 = XM_FNMADD_PS(V02, V12, C4); + C6 = XM_FNMADD_PS(V03, V13, C6); + + V00 = XM_PERMUTE_PS(MT.r[1], _MM_SHUFFLE(0, 3, 0, 3)); + // V10 = D0Z,D0Z,D2X,D2Y + V10 = _mm_shuffle_ps(D0, D2, _MM_SHUFFLE(1, 0, 2, 2)); + V10 = XM_PERMUTE_PS(V10, _MM_SHUFFLE(0, 2, 3, 0)); + V01 = XM_PERMUTE_PS(MT.r[0], _MM_SHUFFLE(2, 0, 3, 1)); + // V11 = D0X,D0W,D2X,D2Y + V11 = _mm_shuffle_ps(D0, D2, _MM_SHUFFLE(1, 0, 3, 0)); + V11 = XM_PERMUTE_PS(V11, _MM_SHUFFLE(2, 1, 0, 3)); + V02 = XM_PERMUTE_PS(MT.r[3], _MM_SHUFFLE(0, 3, 0, 3)); + // V12 = D1Z,D1Z,D2Z,D2W + V12 = _mm_shuffle_ps(D1, D2, _MM_SHUFFLE(3, 2, 2, 2)); + V12 = XM_PERMUTE_PS(V12, _MM_SHUFFLE(0, 2, 3, 0)); + V03 = XM_PERMUTE_PS(MT.r[2], _MM_SHUFFLE(2, 0, 3, 1)); + // V13 = D1X,D1W,D2Z,D2W + V13 = _mm_shuffle_ps(D1, D2, _MM_SHUFFLE(3, 2, 3, 0)); + V13 = XM_PERMUTE_PS(V13, _MM_SHUFFLE(2, 1, 0, 3)); + + V00 = _mm_mul_ps(V00, V10); + V01 = _mm_mul_ps(V01, V11); + V02 = _mm_mul_ps(V02, V12); + V03 = _mm_mul_ps(V03, V13); + XMVECTOR C1 = _mm_sub_ps(C0, V00); + C0 = _mm_add_ps(C0, V00); + XMVECTOR C3 = _mm_add_ps(C2, V01); + C2 = _mm_sub_ps(C2, V01); + XMVECTOR C5 = _mm_sub_ps(C4, V02); + C4 = _mm_add_ps(C4, V02); + XMVECTOR C7 = _mm_add_ps(C6, V03); + C6 = _mm_sub_ps(C6, V03); + + C0 = _mm_shuffle_ps(C0, C1, _MM_SHUFFLE(3, 1, 2, 0)); + C2 = _mm_shuffle_ps(C2, 
C3, _MM_SHUFFLE(3, 1, 2, 0)); + C4 = _mm_shuffle_ps(C4, C5, _MM_SHUFFLE(3, 1, 2, 0)); + C6 = _mm_shuffle_ps(C6, C7, _MM_SHUFFLE(3, 1, 2, 0)); + C0 = XM_PERMUTE_PS(C0, _MM_SHUFFLE(3, 1, 2, 0)); + C2 = XM_PERMUTE_PS(C2, _MM_SHUFFLE(3, 1, 2, 0)); + C4 = XM_PERMUTE_PS(C4, _MM_SHUFFLE(3, 1, 2, 0)); + C6 = XM_PERMUTE_PS(C6, _MM_SHUFFLE(3, 1, 2, 0)); + // Get the determinant + XMVECTOR vTemp = XMVector4Dot(C0, MT.r[0]); + if (pDeterminant != nullptr) + *pDeterminant = vTemp; + vTemp = _mm_div_ps(g_XMOne, vTemp); + XMMATRIX mResult; + mResult.r[0] = _mm_mul_ps(C0, vTemp); + mResult.r[1] = _mm_mul_ps(C2, vTemp); + mResult.r[2] = _mm_mul_ps(C4, vTemp); + mResult.r[3] = _mm_mul_ps(C6, vTemp); + return mResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixVectorTensorProduct +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ + XMMATRIX mResult; + mResult.r[0] = XMVectorMultiply(XMVectorSwizzle<0, 0, 0, 0>(V1), V2); + mResult.r[1] = XMVectorMultiply(XMVectorSwizzle<1, 1, 1, 1>(V1), V2); + mResult.r[2] = XMVectorMultiply(XMVectorSwizzle<2, 2, 2, 2>(V1), V2); + mResult.r[3] = XMVectorMultiply(XMVectorSwizzle<3, 3, 3, 3>(V1), V2); + return mResult; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMMatrixDeterminant(FXMMATRIX M) noexcept +{ + static const XMVECTORF32 Sign = { { { 1.0f, -1.0f, 1.0f, -1.0f } } }; + + XMVECTOR V0 = XMVectorSwizzle(M.r[2]); + XMVECTOR V1 = XMVectorSwizzle(M.r[3]); + XMVECTOR V2 = XMVectorSwizzle(M.r[2]); + XMVECTOR V3 = XMVectorSwizzle(M.r[3]); + XMVECTOR V4 = XMVectorSwizzle(M.r[2]); + XMVECTOR V5 = XMVectorSwizzle(M.r[3]); + + XMVECTOR P0 = XMVectorMultiply(V0, V1); + XMVECTOR P1 = XMVectorMultiply(V2, V3); + XMVECTOR P2 = XMVectorMultiply(V4, V5); + + V0 = XMVectorSwizzle(M.r[2]); + V1 = XMVectorSwizzle(M.r[3]); + V2 = XMVectorSwizzle(M.r[2]); + V3 = XMVectorSwizzle(M.r[3]); + V4 = XMVectorSwizzle(M.r[2]); + V5 = XMVectorSwizzle(M.r[3]); + + P0 = XMVectorNegativeMultiplySubtract(V0, V1, P0); + P1 = XMVectorNegativeMultiplySubtract(V2, V3, P1); + P2 = XMVectorNegativeMultiplySubtract(V4, V5, P2); + + V0 = XMVectorSwizzle(M.r[1]); + V1 = XMVectorSwizzle(M.r[1]); + V2 = XMVectorSwizzle(M.r[1]); + + XMVECTOR S = XMVectorMultiply(M.r[0], Sign.v); + XMVECTOR R = XMVectorMultiply(V0, P0); + R = XMVectorNegativeMultiplySubtract(V1, P1, R); + R = XMVectorMultiplyAdd(V2, P2, R); + + return XMVector4Dot(S, R); +} + +#define XM3RANKDECOMPOSE(a, b, c, x, y, z) \ + if((x) < (y)) \ + { \ + if((y) < (z)) \ + { \ + (a) = 2; \ + (b) = 1; \ + (c) = 0; \ + } \ + else \ + { \ + (a) = 1; \ + \ + if((x) < (z)) \ + { \ + (b) = 2; \ + (c) = 0; \ + } \ + else \ + { \ + (b) = 0; \ + (c) = 2; \ + } \ + } \ + } \ + else \ + { \ + if((x) < (z)) \ + { \ + (a) = 2; \ + (b) = 0; \ + (c) = 1; \ + } \ + else \ + { \ + (a) = 0; \ + \ + if((y) < (z)) \ + { \ + (b) = 2; \ + (c) = 1; \ + } \ + else \ + { \ + (b) = 1; \ + (c) = 2; \ + } \ + } \ + } + +#define XM3_DECOMP_EPSILON 0.0001f + +_Use_decl_annotations_ +inline bool XM_CALLCONV XMMatrixDecompose +( + XMVECTOR* outScale, + XMVECTOR* outRotQuat, + XMVECTOR* outTrans, + FXMMATRIX M +) noexcept +{ + static const XMVECTOR* pvCanonicalBasis[3] = { + &g_XMIdentityR0.v, + &g_XMIdentityR1.v, + &g_XMIdentityR2.v + }; + + assert(outScale != nullptr); + assert(outRotQuat != nullptr); + assert(outTrans != nullptr); + + // Get the translation + outTrans[0] = M.r[3]; + + XMVECTOR* ppvBasis[3]; + XMMATRIX matTemp; + 
ppvBasis[0] = &matTemp.r[0]; + ppvBasis[1] = &matTemp.r[1]; + ppvBasis[2] = &matTemp.r[2]; + + matTemp.r[0] = M.r[0]; + matTemp.r[1] = M.r[1]; + matTemp.r[2] = M.r[2]; + matTemp.r[3] = g_XMIdentityR3.v; + + auto pfScales = reinterpret_cast<float*>(outScale); + + size_t a, b, c; + XMVectorGetXPtr(&pfScales[0], XMVector3Length(ppvBasis[0][0])); + XMVectorGetXPtr(&pfScales[1], XMVector3Length(ppvBasis[1][0])); + XMVectorGetXPtr(&pfScales[2], XMVector3Length(ppvBasis[2][0])); + pfScales[3] = 0.f; + + XM3RANKDECOMPOSE(a, b, c, pfScales[0], pfScales[1], pfScales[2]) + + if (pfScales[a] < XM3_DECOMP_EPSILON) + { + ppvBasis[a][0] = pvCanonicalBasis[a][0]; + } + ppvBasis[a][0] = XMVector3Normalize(ppvBasis[a][0]); + + if (pfScales[b] < XM3_DECOMP_EPSILON) + { + size_t aa, bb, cc; + float fAbsX, fAbsY, fAbsZ; + + fAbsX = fabsf(XMVectorGetX(ppvBasis[a][0])); + fAbsY = fabsf(XMVectorGetY(ppvBasis[a][0])); + fAbsZ = fabsf(XMVectorGetZ(ppvBasis[a][0])); + + XM3RANKDECOMPOSE(aa, bb, cc, fAbsX, fAbsY, fAbsZ) + + ppvBasis[b][0] = XMVector3Cross(ppvBasis[a][0], pvCanonicalBasis[cc][0]); + } + + ppvBasis[b][0] = XMVector3Normalize(ppvBasis[b][0]); + + if (pfScales[c] < XM3_DECOMP_EPSILON) + { + ppvBasis[c][0] = XMVector3Cross(ppvBasis[a][0], ppvBasis[b][0]); + } + + ppvBasis[c][0] = XMVector3Normalize(ppvBasis[c][0]); + + float fDet = XMVectorGetX(XMMatrixDeterminant(matTemp)); + + // use Cramer's rule to check for handedness of coordinate system + if (fDet < 0.0f) + { + // switch coordinate system by negating the scale and inverting the basis vector on the x-axis + pfScales[a] = -pfScales[a]; + ppvBasis[a][0] = XMVectorNegate(ppvBasis[a][0]); + + fDet = -fDet; + } + + fDet -= 1.0f; + fDet *= fDet; + + if (XM3_DECOMP_EPSILON < fDet) + { + // Non-SRT matrix encountered + return false; + } + + // generate the quaternion from the matrix + outRotQuat[0] = XMQuaternionRotationMatrix(matTemp); + return true; +} + +#undef XM3_DECOMP_EPSILON +#undef XM3RANKDECOMPOSE + +//------------------------------------------------------------------------------ +// Transformation operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixIdentity() noexcept +{ + XMMATRIX M; + M.r[0] = g_XMIdentityR0.v; + M.r[1] = g_XMIdentityR1.v; + M.r[2] = g_XMIdentityR2.v; + M.r[3] = g_XMIdentityR3.v; + return M; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixSet +( + float m00, float m01, float m02, float m03, + float m10, float m11, float m12, float m13, + float m20, float m21, float m22, float m23, + float m30, float m31, float m32, float m33 +) noexcept +{ + XMMATRIX M; +#if defined(_XM_NO_INTRINSICS_) + M.m[0][0] = m00; M.m[0][1] = m01; M.m[0][2] = m02; M.m[0][3] = m03; + M.m[1][0] = m10; M.m[1][1] = m11; M.m[1][2] = m12; M.m[1][3] = m13; + M.m[2][0] = m20; M.m[2][1] = m21; M.m[2][2] = m22; M.m[2][3] = m23; + M.m[3][0] = m30; M.m[3][1] = m31; M.m[3][2] = m32; M.m[3][3] = m33; +#else + M.r[0] = XMVectorSet(m00, m01, m02, m03); + M.r[1] = XMVectorSet(m10, m11, m12, m13); + M.r[2] = XMVectorSet(m20, m21, m22, m23); + M.r[3] = XMVectorSet(m30, m31, m32, m33); +#endif + return M; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixTranslation +( + float OffsetX, + float OffsetY, + float OffsetZ +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + 
XMMATRIX M; + M.m[0][0] = 1.0f; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = 1.0f; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = 1.0f; + M.m[2][3] = 0.0f; + + M.m[3][0] = OffsetX; + M.m[3][1] = OffsetY; + M.m[3][2] = OffsetZ; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + XMMATRIX M; + M.r[0] = g_XMIdentityR0.v; + M.r[1] = g_XMIdentityR1.v; + M.r[2] = g_XMIdentityR2.v; + M.r[3] = XMVectorSet(OffsetX, OffsetY, OffsetZ, 1.f); + return M; +#endif +} + + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixTranslationFromVector(FXMVECTOR Offset) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.m[0][0] = 1.0f; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = 1.0f; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = 1.0f; + M.m[2][3] = 0.0f; + + M.m[3][0] = Offset.vector4_f32[0]; + M.m[3][1] = Offset.vector4_f32[1]; + M.m[3][2] = Offset.vector4_f32[2]; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + XMMATRIX M; + M.r[0] = g_XMIdentityR0.v; + M.r[1] = g_XMIdentityR1.v; + M.r[2] = g_XMIdentityR2.v; + M.r[3] = XMVectorSelect(g_XMIdentityR3.v, Offset, g_XMSelect1110.v); + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixScaling +( + float ScaleX, + float ScaleY, + float ScaleZ +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.m[0][0] = ScaleX; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = ScaleY; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = ScaleZ; + M.m[2][3] = 0.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = 0.0f; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + const XMVECTOR Zero = vdupq_n_f32(0); + XMMATRIX M; + M.r[0] = vsetq_lane_f32(ScaleX, Zero, 0); + M.r[1] = vsetq_lane_f32(ScaleY, Zero, 1); + M.r[2] = vsetq_lane_f32(ScaleZ, Zero, 2); + M.r[3] = g_XMIdentityR3.v; + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + M.r[0] = _mm_set_ps(0, 0, 0, ScaleX); + M.r[1] = _mm_set_ps(0, 0, ScaleY, 0); + M.r[2] = _mm_set_ps(0, ScaleZ, 0, 0); + M.r[3] = g_XMIdentityR3.v; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixScalingFromVector(FXMVECTOR Scale) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.m[0][0] = Scale.vector4_f32[0]; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = Scale.vector4_f32[1]; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = Scale.vector4_f32[2]; + M.m[2][3] = 0.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = 0.0f; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMMATRIX M; + M.r[0] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(Scale), g_XMMaskX)); + M.r[1] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(Scale), g_XMMaskY)); + M.r[2] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(Scale), g_XMMaskZ)); + M.r[3] = g_XMIdentityR3.v; + return M; +#elif 
defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + M.r[0] = _mm_and_ps(Scale, g_XMMaskX); + M.r[1] = _mm_and_ps(Scale, g_XMMaskY); + M.r[2] = _mm_and_ps(Scale, g_XMMaskZ); + M.r[3] = g_XMIdentityR3.v; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixRotationX(float Angle) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + float fSinAngle; + float fCosAngle; + XMScalarSinCos(&fSinAngle, &fCosAngle, Angle); + + XMMATRIX M; + M.m[0][0] = 1.0f; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = fCosAngle; + M.m[1][2] = fSinAngle; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = -fSinAngle; + M.m[2][2] = fCosAngle; + M.m[2][3] = 0.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = 0.0f; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float fSinAngle; + float fCosAngle; + XMScalarSinCos(&fSinAngle, &fCosAngle, Angle); + + const float32x4_t Zero = vdupq_n_f32(0); + + float32x4_t T1 = vsetq_lane_f32(fCosAngle, Zero, 1); + T1 = vsetq_lane_f32(fSinAngle, T1, 2); + + float32x4_t T2 = vsetq_lane_f32(-fSinAngle, Zero, 1); + T2 = vsetq_lane_f32(fCosAngle, T2, 2); + + XMMATRIX M; + M.r[0] = g_XMIdentityR0.v; + M.r[1] = T1; + M.r[2] = T2; + M.r[3] = g_XMIdentityR3.v; + return M; +#elif defined(_XM_SSE_INTRINSICS_) + float SinAngle; + float CosAngle; + XMScalarSinCos(&SinAngle, &CosAngle, Angle); + + XMVECTOR vSin = _mm_set_ss(SinAngle); + XMVECTOR vCos = _mm_set_ss(CosAngle); + // x = 0,y = cos,z = sin, w = 0 + vCos = _mm_shuffle_ps(vCos, vSin, _MM_SHUFFLE(3, 0, 0, 3)); + XMMATRIX M; + M.r[0] = g_XMIdentityR0; + M.r[1] = vCos; + // x = 0,y = sin,z = cos, w = 0 + vCos = XM_PERMUTE_PS(vCos, _MM_SHUFFLE(3, 1, 2, 0)); + // x = 0,y = -sin,z = cos, w = 0 + vCos = _mm_mul_ps(vCos, g_XMNegateY); + M.r[2] = vCos; + M.r[3] = g_XMIdentityR3; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixRotationY(float Angle) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + float fSinAngle; + float fCosAngle; + XMScalarSinCos(&fSinAngle, &fCosAngle, Angle); + + XMMATRIX M; + M.m[0][0] = fCosAngle; + M.m[0][1] = 0.0f; + M.m[0][2] = -fSinAngle; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = 1.0f; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = fSinAngle; + M.m[2][1] = 0.0f; + M.m[2][2] = fCosAngle; + M.m[2][3] = 0.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = 0.0f; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float fSinAngle; + float fCosAngle; + XMScalarSinCos(&fSinAngle, &fCosAngle, Angle); + + const float32x4_t Zero = vdupq_n_f32(0); + + float32x4_t T0 = vsetq_lane_f32(fCosAngle, Zero, 0); + T0 = vsetq_lane_f32(-fSinAngle, T0, 2); + + float32x4_t T2 = vsetq_lane_f32(fSinAngle, Zero, 0); + T2 = vsetq_lane_f32(fCosAngle, T2, 2); + + XMMATRIX M; + M.r[0] = T0; + M.r[1] = g_XMIdentityR1.v; + M.r[2] = T2; + M.r[3] = g_XMIdentityR3.v; + return M; +#elif defined(_XM_SSE_INTRINSICS_) + float SinAngle; + float CosAngle; + XMScalarSinCos(&SinAngle, &CosAngle, Angle); + + XMVECTOR vSin = _mm_set_ss(SinAngle); + XMVECTOR vCos = _mm_set_ss(CosAngle); + // x = sin,y = 0,z = cos, w = 0 + vSin = _mm_shuffle_ps(vSin, vCos, _MM_SHUFFLE(3, 0, 3, 0)); + XMMATRIX M; + M.r[2] = vSin; + M.r[1] = g_XMIdentityR1; + // x = cos,y = 0,z = sin, w = 0 + vSin = XM_PERMUTE_PS(vSin, _MM_SHUFFLE(3, 0, 1, 2)); + // x = cos,y = 0,z 
= -sin, w = 0 + vSin = _mm_mul_ps(vSin, g_XMNegateZ); + M.r[0] = vSin; + M.r[3] = g_XMIdentityR3; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixRotationZ(float Angle) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + float fSinAngle; + float fCosAngle; + XMScalarSinCos(&fSinAngle, &fCosAngle, Angle); + + XMMATRIX M; + M.m[0][0] = fCosAngle; + M.m[0][1] = fSinAngle; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = -fSinAngle; + M.m[1][1] = fCosAngle; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = 1.0f; + M.m[2][3] = 0.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = 0.0f; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float fSinAngle; + float fCosAngle; + XMScalarSinCos(&fSinAngle, &fCosAngle, Angle); + + const float32x4_t Zero = vdupq_n_f32(0); + + float32x4_t T0 = vsetq_lane_f32(fCosAngle, Zero, 0); + T0 = vsetq_lane_f32(fSinAngle, T0, 1); + + float32x4_t T1 = vsetq_lane_f32(-fSinAngle, Zero, 0); + T1 = vsetq_lane_f32(fCosAngle, T1, 1); + + XMMATRIX M; + M.r[0] = T0; + M.r[1] = T1; + M.r[2] = g_XMIdentityR2.v; + M.r[3] = g_XMIdentityR3.v; + return M; +#elif defined(_XM_SSE_INTRINSICS_) + float SinAngle; + float CosAngle; + XMScalarSinCos(&SinAngle, &CosAngle, Angle); + + XMVECTOR vSin = _mm_set_ss(SinAngle); + XMVECTOR vCos = _mm_set_ss(CosAngle); + // x = cos,y = sin,z = 0, w = 0 + vCos = _mm_unpacklo_ps(vCos, vSin); + XMMATRIX M; + M.r[0] = vCos; + // x = sin,y = cos,z = 0, w = 0 + vCos = XM_PERMUTE_PS(vCos, _MM_SHUFFLE(3, 2, 0, 1)); + // x = cos,y = -sin,z = 0, w = 0 + vCos = _mm_mul_ps(vCos, g_XMNegateX); + M.r[1] = vCos; + M.r[2] = g_XMIdentityR2; + M.r[3] = g_XMIdentityR3; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixRotationRollPitchYaw +( + float Pitch, + float Yaw, + float Roll +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + float cp = cosf(Pitch); + float sp = sinf(Pitch); + + float cy = cosf(Yaw); + float sy = sinf(Yaw); + + float cr = cosf(Roll); + float sr = sinf(Roll); + + XMMATRIX M; + M.m[0][0] = cr * cy + sr * sp * sy; + M.m[0][1] = sr * cp; + M.m[0][2] = sr * sp * cy - cr * sy; + M.m[0][3] = 0.0f; + + M.m[1][0] = cr * sp * sy - sr * cy; + M.m[1][1] = cr * cp; + M.m[1][2] = sr * sy + cr * sp * cy; + M.m[1][3] = 0.0f; + + M.m[2][0] = cp * sy; + M.m[2][1] = -sp; + M.m[2][2] = cp * cy; + M.m[2][3] = 0.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = 0.0f; + M.m[3][3] = 1.0f; + return M; +#else + XMVECTOR Angles = XMVectorSet(Pitch, Yaw, Roll, 0.0f); + return XMMatrixRotationRollPitchYawFromVector(Angles); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixRotationRollPitchYawFromVector +( + FXMVECTOR Angles // +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + float cp = cosf(Angles.vector4_f32[0]); + float sp = sinf(Angles.vector4_f32[0]); + + float cy = cosf(Angles.vector4_f32[1]); + float sy = sinf(Angles.vector4_f32[1]); + + float cr = cosf(Angles.vector4_f32[2]); + float sr = sinf(Angles.vector4_f32[2]); + + XMMATRIX M; + M.m[0][0] = cr * cy + sr * sp * sy; + M.m[0][1] = sr * cp; + M.m[0][2] = sr * sp * cy - cr * sy; + M.m[0][3] = 0.0f; + + M.m[1][0] = cr * sp * sy - sr * cy; + M.m[1][1] = cr * cp; + M.m[1][2] = sr * sy + cr * sp * cy; + M.m[1][3] = 0.0f; + + M.m[2][0] = cp * sy; + 
M.m[2][1] = -sp; + M.m[2][2] = cp * cy; + M.m[2][3] = 0.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = 0.0f; + M.m[3][3] = 1.0f; + return M; +#else + static const XMVECTORF32 Sign = { { { 1.0f, -1.0f, -1.0f, 1.0f } } }; + + XMVECTOR SinAngles, CosAngles; + XMVectorSinCos(&SinAngles, &CosAngles, Angles); + + XMVECTOR P0 = XMVectorPermute(SinAngles, CosAngles); + XMVECTOR Y0 = XMVectorPermute(SinAngles, CosAngles); + XMVECTOR P1 = XMVectorPermute(SinAngles, CosAngles); + XMVECTOR Y1 = XMVectorPermute(SinAngles, CosAngles); + XMVECTOR P2 = XMVectorPermute(SinAngles, CosAngles); + XMVECTOR P3 = XMVectorPermute(SinAngles, CosAngles); + XMVECTOR Y2 = XMVectorSplatX(SinAngles); + XMVECTOR NS = XMVectorNegate(SinAngles); + + XMVECTOR Q0 = XMVectorMultiply(P0, Y0); + XMVECTOR Q1 = XMVectorMultiply(P1, Sign.v); + Q1 = XMVectorMultiply(Q1, Y1); + XMVECTOR Q2 = XMVectorMultiply(P2, Y2); + Q2 = XMVectorMultiplyAdd(Q2, P3, Q1); + + XMVECTOR V0 = XMVectorPermute(Q0, Q2); + XMVECTOR V1 = XMVectorPermute(Q0, Q2); + XMVECTOR V2 = XMVectorPermute(Q0, NS); + + XMMATRIX M; + M.r[0] = XMVectorSelect(g_XMZero, V0, g_XMSelect1110.v); + M.r[1] = XMVectorSelect(g_XMZero, V1, g_XMSelect1110.v); + M.r[2] = XMVectorSelect(g_XMZero, V2, g_XMSelect1110.v); + M.r[3] = g_XMIdentityR3; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixRotationNormal +( + FXMVECTOR NormalAxis, + float Angle +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + + float fSinAngle; + float fCosAngle; + XMScalarSinCos(&fSinAngle, &fCosAngle, Angle); + + XMVECTOR A = XMVectorSet(fSinAngle, fCosAngle, 1.0f - fCosAngle, 0.0f); + + XMVECTOR C2 = XMVectorSplatZ(A); + XMVECTOR C1 = XMVectorSplatY(A); + XMVECTOR C0 = XMVectorSplatX(A); + + XMVECTOR N0 = XMVectorSwizzle(NormalAxis); + XMVECTOR N1 = XMVectorSwizzle(NormalAxis); + + XMVECTOR V0 = XMVectorMultiply(C2, N0); + V0 = XMVectorMultiply(V0, N1); + + XMVECTOR R0 = XMVectorMultiply(C2, NormalAxis); + R0 = XMVectorMultiplyAdd(R0, NormalAxis, C1); + + XMVECTOR R1 = XMVectorMultiplyAdd(C0, NormalAxis, V0); + XMVECTOR R2 = XMVectorNegativeMultiplySubtract(C0, NormalAxis, V0); + + V0 = XMVectorSelect(A, R0, g_XMSelect1110.v); + XMVECTOR V1 = XMVectorPermute(R1, R2); + XMVECTOR V2 = XMVectorPermute(R1, R2); + + XMMATRIX M; + M.r[0] = XMVectorPermute(V0, V1); + M.r[1] = XMVectorPermute(V0, V1); + M.r[2] = XMVectorPermute(V0, V2); + M.r[3] = g_XMIdentityR3.v; + return M; + +#elif defined(_XM_SSE_INTRINSICS_) + float fSinAngle; + float fCosAngle; + XMScalarSinCos(&fSinAngle, &fCosAngle, Angle); + + XMVECTOR C2 = _mm_set_ps1(1.0f - fCosAngle); + XMVECTOR C1 = _mm_set_ps1(fCosAngle); + XMVECTOR C0 = _mm_set_ps1(fSinAngle); + + XMVECTOR N0 = XM_PERMUTE_PS(NormalAxis, _MM_SHUFFLE(3, 0, 2, 1)); + XMVECTOR N1 = XM_PERMUTE_PS(NormalAxis, _MM_SHUFFLE(3, 1, 0, 2)); + + XMVECTOR V0 = _mm_mul_ps(C2, N0); + V0 = _mm_mul_ps(V0, N1); + + XMVECTOR R0 = _mm_mul_ps(C2, NormalAxis); + R0 = _mm_mul_ps(R0, NormalAxis); + R0 = _mm_add_ps(R0, C1); + + XMVECTOR R1 = _mm_mul_ps(C0, NormalAxis); + R1 = _mm_add_ps(R1, V0); + XMVECTOR R2 = _mm_mul_ps(C0, NormalAxis); + R2 = _mm_sub_ps(V0, R2); + + V0 = _mm_and_ps(R0, g_XMMask3); + XMVECTOR V1 = _mm_shuffle_ps(R1, R2, _MM_SHUFFLE(2, 1, 2, 0)); + V1 = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 3, 2, 1)); + XMVECTOR V2 = _mm_shuffle_ps(R1, R2, _MM_SHUFFLE(0, 0, 1, 1)); + V2 = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 0, 2, 0)); + + R2 = _mm_shuffle_ps(V0, V1, 
_MM_SHUFFLE(1, 0, 3, 0)); + R2 = XM_PERMUTE_PS(R2, _MM_SHUFFLE(1, 3, 2, 0)); + + XMMATRIX M; + M.r[0] = R2; + + R2 = _mm_shuffle_ps(V0, V1, _MM_SHUFFLE(3, 2, 3, 1)); + R2 = XM_PERMUTE_PS(R2, _MM_SHUFFLE(1, 3, 0, 2)); + M.r[1] = R2; + + V2 = _mm_shuffle_ps(V2, V0, _MM_SHUFFLE(3, 2, 1, 0)); + M.r[2] = V2; + M.r[3] = g_XMIdentityR3.v; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixRotationAxis +( + FXMVECTOR Axis, + float Angle +) noexcept +{ + assert(!XMVector3Equal(Axis, XMVectorZero())); + assert(!XMVector3IsInfinite(Axis)); + + XMVECTOR Normal = XMVector3Normalize(Axis); + return XMMatrixRotationNormal(Normal, Angle); +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixRotationQuaternion(FXMVECTOR Quaternion) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + float qx = Quaternion.vector4_f32[0]; + float qxx = qx * qx; + + float qy = Quaternion.vector4_f32[1]; + float qyy = qy * qy; + + float qz = Quaternion.vector4_f32[2]; + float qzz = qz * qz; + + float qw = Quaternion.vector4_f32[3]; + + XMMATRIX M; + M.m[0][0] = 1.f - 2.f * qyy - 2.f * qzz; + M.m[0][1] = 2.f * qx * qy + 2.f * qz * qw; + M.m[0][2] = 2.f * qx * qz - 2.f * qy * qw; + M.m[0][3] = 0.f; + + M.m[1][0] = 2.f * qx * qy - 2.f * qz * qw; + M.m[1][1] = 1.f - 2.f * qxx - 2.f * qzz; + M.m[1][2] = 2.f * qy * qz + 2.f * qx * qw; + M.m[1][3] = 0.f; + + M.m[2][0] = 2.f * qx * qz + 2.f * qy * qw; + M.m[2][1] = 2.f * qy * qz - 2.f * qx * qw; + M.m[2][2] = 1.f - 2.f * qxx - 2.f * qyy; + M.m[2][3] = 0.f; + + M.m[3][0] = 0.f; + M.m[3][1] = 0.f; + M.m[3][2] = 0.f; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 Constant1110 = { { { 1.0f, 1.0f, 1.0f, 0.0f } } }; + + XMVECTOR Q0 = XMVectorAdd(Quaternion, Quaternion); + XMVECTOR Q1 = XMVectorMultiply(Quaternion, Q0); + + XMVECTOR V0 = XMVectorPermute(Q1, Constant1110.v); + XMVECTOR V1 = XMVectorPermute(Q1, Constant1110.v); + XMVECTOR R0 = XMVectorSubtract(Constant1110, V0); + R0 = XMVectorSubtract(R0, V1); + + V0 = XMVectorSwizzle(Quaternion); + V1 = XMVectorSwizzle(Q0); + V0 = XMVectorMultiply(V0, V1); + + V1 = XMVectorSplatW(Quaternion); + XMVECTOR V2 = XMVectorSwizzle(Q0); + V1 = XMVectorMultiply(V1, V2); + + XMVECTOR R1 = XMVectorAdd(V0, V1); + XMVECTOR R2 = XMVectorSubtract(V0, V1); + + V0 = XMVectorPermute(R1, R2); + V1 = XMVectorPermute(R1, R2); + + XMMATRIX M; + M.r[0] = XMVectorPermute(R0, V0); + M.r[1] = XMVectorPermute(R0, V0); + M.r[2] = XMVectorPermute(R0, V1); + M.r[3] = g_XMIdentityR3.v; + return M; + +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Constant1110 = { { { 1.0f, 1.0f, 1.0f, 0.0f } } }; + + XMVECTOR Q0 = _mm_add_ps(Quaternion, Quaternion); + XMVECTOR Q1 = _mm_mul_ps(Quaternion, Q0); + + XMVECTOR V0 = XM_PERMUTE_PS(Q1, _MM_SHUFFLE(3, 0, 0, 1)); + V0 = _mm_and_ps(V0, g_XMMask3); + XMVECTOR V1 = XM_PERMUTE_PS(Q1, _MM_SHUFFLE(3, 1, 2, 2)); + V1 = _mm_and_ps(V1, g_XMMask3); + XMVECTOR R0 = _mm_sub_ps(Constant1110, V0); + R0 = _mm_sub_ps(R0, V1); + + V0 = XM_PERMUTE_PS(Quaternion, _MM_SHUFFLE(3, 1, 0, 0)); + V1 = XM_PERMUTE_PS(Q0, _MM_SHUFFLE(3, 2, 1, 2)); + V0 = _mm_mul_ps(V0, V1); + + V1 = XM_PERMUTE_PS(Quaternion, _MM_SHUFFLE(3, 3, 3, 3)); + XMVECTOR V2 = XM_PERMUTE_PS(Q0, _MM_SHUFFLE(3, 0, 2, 1)); + V1 = _mm_mul_ps(V1, V2); + + XMVECTOR R1 = _mm_add_ps(V0, V1); + XMVECTOR R2 = _mm_sub_ps(V0, V1); + + V0 = _mm_shuffle_ps(R1, R2, 
_MM_SHUFFLE(1, 0, 2, 1)); + V0 = XM_PERMUTE_PS(V0, _MM_SHUFFLE(1, 3, 2, 0)); + V1 = _mm_shuffle_ps(R1, R2, _MM_SHUFFLE(2, 2, 0, 0)); + V1 = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 0, 2, 0)); + + Q1 = _mm_shuffle_ps(R0, V0, _MM_SHUFFLE(1, 0, 3, 0)); + Q1 = XM_PERMUTE_PS(Q1, _MM_SHUFFLE(1, 3, 2, 0)); + + XMMATRIX M; + M.r[0] = Q1; + + Q1 = _mm_shuffle_ps(R0, V0, _MM_SHUFFLE(3, 2, 3, 1)); + Q1 = XM_PERMUTE_PS(Q1, _MM_SHUFFLE(1, 3, 0, 2)); + M.r[1] = Q1; + + Q1 = _mm_shuffle_ps(V1, R0, _MM_SHUFFLE(3, 2, 1, 0)); + M.r[2] = Q1; + M.r[3] = g_XMIdentityR3; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixTransformation2D +( + FXMVECTOR ScalingOrigin, + float ScalingOrientation, + FXMVECTOR Scaling, + FXMVECTOR RotationOrigin, + float Rotation, + GXMVECTOR Translation +) noexcept +{ + // M = Inverse(MScalingOrigin) * Transpose(MScalingOrientation) * MScaling * MScalingOrientation * + // MScalingOrigin * Inverse(MRotationOrigin) * MRotation * MRotationOrigin * MTranslation; + + XMVECTOR VScalingOrigin = XMVectorSelect(g_XMSelect1100.v, ScalingOrigin, g_XMSelect1100.v); + XMVECTOR NegScalingOrigin = XMVectorNegate(VScalingOrigin); + + XMMATRIX MScalingOriginI = XMMatrixTranslationFromVector(NegScalingOrigin); + XMMATRIX MScalingOrientation = XMMatrixRotationZ(ScalingOrientation); + XMMATRIX MScalingOrientationT = XMMatrixTranspose(MScalingOrientation); + XMVECTOR VScaling = XMVectorSelect(g_XMOne.v, Scaling, g_XMSelect1100.v); + XMMATRIX MScaling = XMMatrixScalingFromVector(VScaling); + XMVECTOR VRotationOrigin = XMVectorSelect(g_XMSelect1100.v, RotationOrigin, g_XMSelect1100.v); + XMMATRIX MRotation = XMMatrixRotationZ(Rotation); + XMVECTOR VTranslation = XMVectorSelect(g_XMSelect1100.v, Translation, g_XMSelect1100.v); + + XMMATRIX M = XMMatrixMultiply(MScalingOriginI, MScalingOrientationT); + M = XMMatrixMultiply(M, MScaling); + M = XMMatrixMultiply(M, MScalingOrientation); + M.r[3] = XMVectorAdd(M.r[3], VScalingOrigin); + M.r[3] = XMVectorSubtract(M.r[3], VRotationOrigin); + M = XMMatrixMultiply(M, MRotation); + M.r[3] = XMVectorAdd(M.r[3], VRotationOrigin); + M.r[3] = XMVectorAdd(M.r[3], VTranslation); + + return M; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixTransformation +( + FXMVECTOR ScalingOrigin, + FXMVECTOR ScalingOrientationQuaternion, + FXMVECTOR Scaling, + GXMVECTOR RotationOrigin, + HXMVECTOR RotationQuaternion, + HXMVECTOR Translation +) noexcept +{ + // M = Inverse(MScalingOrigin) * Transpose(MScalingOrientation) * MScaling * MScalingOrientation * + // MScalingOrigin * Inverse(MRotationOrigin) * MRotation * MRotationOrigin * MTranslation; + + XMVECTOR VScalingOrigin = XMVectorSelect(g_XMSelect1110.v, ScalingOrigin, g_XMSelect1110.v); + XMVECTOR NegScalingOrigin = XMVectorNegate(ScalingOrigin); + + XMMATRIX MScalingOriginI = XMMatrixTranslationFromVector(NegScalingOrigin); + XMMATRIX MScalingOrientation = XMMatrixRotationQuaternion(ScalingOrientationQuaternion); + XMMATRIX MScalingOrientationT = XMMatrixTranspose(MScalingOrientation); + XMMATRIX MScaling = XMMatrixScalingFromVector(Scaling); + XMVECTOR VRotationOrigin = XMVectorSelect(g_XMSelect1110.v, RotationOrigin, g_XMSelect1110.v); + XMMATRIX MRotation = XMMatrixRotationQuaternion(RotationQuaternion); + XMVECTOR VTranslation = XMVectorSelect(g_XMSelect1110.v, Translation, g_XMSelect1110.v); + + XMMATRIX M; + M = XMMatrixMultiply(MScalingOriginI, 
MScalingOrientationT); + M = XMMatrixMultiply(M, MScaling); + M = XMMatrixMultiply(M, MScalingOrientation); + M.r[3] = XMVectorAdd(M.r[3], VScalingOrigin); + M.r[3] = XMVectorSubtract(M.r[3], VRotationOrigin); + M = XMMatrixMultiply(M, MRotation); + M.r[3] = XMVectorAdd(M.r[3], VRotationOrigin); + M.r[3] = XMVectorAdd(M.r[3], VTranslation); + return M; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixAffineTransformation2D +( + FXMVECTOR Scaling, + FXMVECTOR RotationOrigin, + float Rotation, + FXMVECTOR Translation +) noexcept +{ + // M = MScaling * Inverse(MRotationOrigin) * MRotation * MRotationOrigin * MTranslation; + + XMVECTOR VScaling = XMVectorSelect(g_XMOne.v, Scaling, g_XMSelect1100.v); + XMMATRIX MScaling = XMMatrixScalingFromVector(VScaling); + XMVECTOR VRotationOrigin = XMVectorSelect(g_XMSelect1100.v, RotationOrigin, g_XMSelect1100.v); + XMMATRIX MRotation = XMMatrixRotationZ(Rotation); + XMVECTOR VTranslation = XMVectorSelect(g_XMSelect1100.v, Translation, g_XMSelect1100.v); + + XMMATRIX M; + M = MScaling; + M.r[3] = XMVectorSubtract(M.r[3], VRotationOrigin); + M = XMMatrixMultiply(M, MRotation); + M.r[3] = XMVectorAdd(M.r[3], VRotationOrigin); + M.r[3] = XMVectorAdd(M.r[3], VTranslation); + return M; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixAffineTransformation +( + FXMVECTOR Scaling, + FXMVECTOR RotationOrigin, + FXMVECTOR RotationQuaternion, + GXMVECTOR Translation +) noexcept +{ + // M = MScaling * Inverse(MRotationOrigin) * MRotation * MRotationOrigin * MTranslation; + + XMMATRIX MScaling = XMMatrixScalingFromVector(Scaling); + XMVECTOR VRotationOrigin = XMVectorSelect(g_XMSelect1110.v, RotationOrigin, g_XMSelect1110.v); + XMMATRIX MRotation = XMMatrixRotationQuaternion(RotationQuaternion); + XMVECTOR VTranslation = XMVectorSelect(g_XMSelect1110.v, Translation, g_XMSelect1110.v); + + XMMATRIX M; + M = MScaling; + M.r[3] = XMVectorSubtract(M.r[3], VRotationOrigin); + M = XMMatrixMultiply(M, MRotation); + M.r[3] = XMVectorAdd(M.r[3], VRotationOrigin); + M.r[3] = XMVectorAdd(M.r[3], VTranslation); + return M; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixReflect(FXMVECTOR ReflectionPlane) noexcept +{ + assert(!XMVector3Equal(ReflectionPlane, XMVectorZero())); + assert(!XMPlaneIsInfinite(ReflectionPlane)); + + static const XMVECTORF32 NegativeTwo = { { { -2.0f, -2.0f, -2.0f, 0.0f } } }; + + XMVECTOR P = XMPlaneNormalize(ReflectionPlane); + XMVECTOR S = XMVectorMultiply(P, NegativeTwo); + + XMVECTOR A = XMVectorSplatX(P); + XMVECTOR B = XMVectorSplatY(P); + XMVECTOR C = XMVectorSplatZ(P); + XMVECTOR D = XMVectorSplatW(P); + + XMMATRIX M; + M.r[0] = XMVectorMultiplyAdd(A, S, g_XMIdentityR0.v); + M.r[1] = XMVectorMultiplyAdd(B, S, g_XMIdentityR1.v); + M.r[2] = XMVectorMultiplyAdd(C, S, g_XMIdentityR2.v); + M.r[3] = XMVectorMultiplyAdd(D, S, g_XMIdentityR3.v); + return M; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixShadow +( + FXMVECTOR ShadowPlane, + FXMVECTOR LightPosition +) noexcept +{ + static const XMVECTORU32 Select0001 = { { { XM_SELECT_0, XM_SELECT_0, XM_SELECT_0, XM_SELECT_1 } } }; + + assert(!XMVector3Equal(ShadowPlane, XMVectorZero())); + assert(!XMPlaneIsInfinite(ShadowPlane)); + + XMVECTOR P = XMPlaneNormalize(ShadowPlane); + XMVECTOR Dot = 
XMPlaneDot(P, LightPosition); + P = XMVectorNegate(P); + XMVECTOR D = XMVectorSplatW(P); + XMVECTOR C = XMVectorSplatZ(P); + XMVECTOR B = XMVectorSplatY(P); + XMVECTOR A = XMVectorSplatX(P); + Dot = XMVectorSelect(Select0001.v, Dot, Select0001.v); + + XMMATRIX M; + M.r[3] = XMVectorMultiplyAdd(D, LightPosition, Dot); + Dot = XMVectorRotateLeft(Dot, 1); + M.r[2] = XMVectorMultiplyAdd(C, LightPosition, Dot); + Dot = XMVectorRotateLeft(Dot, 1); + M.r[1] = XMVectorMultiplyAdd(B, LightPosition, Dot); + Dot = XMVectorRotateLeft(Dot, 1); + M.r[0] = XMVectorMultiplyAdd(A, LightPosition, Dot); + return M; +} + +//------------------------------------------------------------------------------ +// View and projection initialization operations +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixLookAtLH +( + FXMVECTOR EyePosition, + FXMVECTOR FocusPosition, + FXMVECTOR UpDirection +) noexcept +{ + XMVECTOR EyeDirection = XMVectorSubtract(FocusPosition, EyePosition); + return XMMatrixLookToLH(EyePosition, EyeDirection, UpDirection); +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixLookAtRH +( + FXMVECTOR EyePosition, + FXMVECTOR FocusPosition, + FXMVECTOR UpDirection +) noexcept +{ + XMVECTOR NegEyeDirection = XMVectorSubtract(EyePosition, FocusPosition); + return XMMatrixLookToLH(EyePosition, NegEyeDirection, UpDirection); +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixLookToLH +( + FXMVECTOR EyePosition, + FXMVECTOR EyeDirection, + FXMVECTOR UpDirection +) noexcept +{ + assert(!XMVector3Equal(EyeDirection, XMVectorZero())); + assert(!XMVector3IsInfinite(EyeDirection)); + assert(!XMVector3Equal(UpDirection, XMVectorZero())); + assert(!XMVector3IsInfinite(UpDirection)); + + XMVECTOR R2 = XMVector3Normalize(EyeDirection); + + XMVECTOR R0 = XMVector3Cross(UpDirection, R2); + R0 = XMVector3Normalize(R0); + + XMVECTOR R1 = XMVector3Cross(R2, R0); + + XMVECTOR NegEyePosition = XMVectorNegate(EyePosition); + + XMVECTOR D0 = XMVector3Dot(R0, NegEyePosition); + XMVECTOR D1 = XMVector3Dot(R1, NegEyePosition); + XMVECTOR D2 = XMVector3Dot(R2, NegEyePosition); + + XMMATRIX M; + M.r[0] = XMVectorSelect(D0, R0, g_XMSelect1110.v); + M.r[1] = XMVectorSelect(D1, R1, g_XMSelect1110.v); + M.r[2] = XMVectorSelect(D2, R2, g_XMSelect1110.v); + M.r[3] = g_XMIdentityR3.v; + + M = XMMatrixTranspose(M); + + return M; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixLookToRH +( + FXMVECTOR EyePosition, + FXMVECTOR EyeDirection, + FXMVECTOR UpDirection +) noexcept +{ + XMVECTOR NegEyeDirection = XMVectorNegate(EyeDirection); + return XMMatrixLookToLH(EyePosition, NegEyeDirection, UpDirection); +} + +//------------------------------------------------------------------------------ + +#ifdef _PREFAST_ +#pragma prefast(push) +#pragma prefast(disable:28931, "PREfast noise: Esp:1266") +#endif + +inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveLH +( + float ViewWidth, + float ViewHeight, + float NearZ, + float FarZ +) noexcept +{ + assert(NearZ > 0.f && FarZ > 0.f); + assert(!XMScalarNearEqual(ViewWidth, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(ViewHeight, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float TwoNearZ = NearZ + NearZ; + float fRange = FarZ / (FarZ - NearZ); + 
+ XMMATRIX M; + M.m[0][0] = TwoNearZ / ViewWidth; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = TwoNearZ / ViewHeight; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = fRange; + M.m[2][3] = 1.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = -fRange * NearZ; + M.m[3][3] = 0.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float TwoNearZ = NearZ + NearZ; + float fRange = FarZ / (FarZ - NearZ); + const float32x4_t Zero = vdupq_n_f32(0); + XMMATRIX M; + M.r[0] = vsetq_lane_f32(TwoNearZ / ViewWidth, Zero, 0); + M.r[1] = vsetq_lane_f32(TwoNearZ / ViewHeight, Zero, 1); + M.r[2] = vsetq_lane_f32(fRange, g_XMIdentityR3.v, 2); + M.r[3] = vsetq_lane_f32(-fRange * NearZ, Zero, 2); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + float TwoNearZ = NearZ + NearZ; + float fRange = FarZ / (FarZ - NearZ); + // Note: This is recorded on the stack + XMVECTOR rMem = { + TwoNearZ / ViewWidth, + TwoNearZ / ViewHeight, + fRange, + -fRange * NearZ + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp, vValues); + // TwoNearZ / ViewWidth,0,0,0 + M.r[0] = vTemp; + // 0,TwoNearZ / ViewHeight,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp, g_XMMaskY); + M.r[1] = vTemp; + // x=fRange,y=-fRange * NearZ,0,1.0f + vValues = _mm_shuffle_ps(vValues, g_XMIdentityR3, _MM_SHUFFLE(3, 2, 3, 2)); + // 0,0,fRange,1.0f + vTemp = _mm_setzero_ps(); + vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(3, 0, 0, 0)); + M.r[2] = vTemp; + // 0,0,-fRange * NearZ,0 + vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(2, 1, 0, 0)); + M.r[3] = vTemp; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveRH +( + float ViewWidth, + float ViewHeight, + float NearZ, + float FarZ +) noexcept +{ + assert(NearZ > 0.f && FarZ > 0.f); + assert(!XMScalarNearEqual(ViewWidth, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(ViewHeight, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float TwoNearZ = NearZ + NearZ; + float fRange = FarZ / (NearZ - FarZ); + + XMMATRIX M; + M.m[0][0] = TwoNearZ / ViewWidth; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = TwoNearZ / ViewHeight; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = fRange; + M.m[2][3] = -1.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = fRange * NearZ; + M.m[3][3] = 0.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float TwoNearZ = NearZ + NearZ; + float fRange = FarZ / (NearZ - FarZ); + const float32x4_t Zero = vdupq_n_f32(0); + + XMMATRIX M; + M.r[0] = vsetq_lane_f32(TwoNearZ / ViewWidth, Zero, 0); + M.r[1] = vsetq_lane_f32(TwoNearZ / ViewHeight, Zero, 1); + M.r[2] = vsetq_lane_f32(fRange, g_XMNegIdentityR3.v, 2); + M.r[3] = vsetq_lane_f32(fRange * NearZ, Zero, 2); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + float TwoNearZ = NearZ + NearZ; + float fRange = FarZ / (NearZ - FarZ); + // Note: This is recorded on the stack + XMVECTOR rMem = { + TwoNearZ / ViewWidth, + TwoNearZ / ViewHeight, + fRange, + fRange * NearZ + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp, 
vValues); + // TwoNearZ / ViewWidth,0,0,0 + M.r[0] = vTemp; + // 0,TwoNearZ / ViewHeight,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp, g_XMMaskY); + M.r[1] = vTemp; + // x=fRange,y=-fRange * NearZ,0,-1.0f + vValues = _mm_shuffle_ps(vValues, g_XMNegIdentityR3, _MM_SHUFFLE(3, 2, 3, 2)); + // 0,0,fRange,-1.0f + vTemp = _mm_setzero_ps(); + vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(3, 0, 0, 0)); + M.r[2] = vTemp; + // 0,0,-fRange * NearZ,0 + vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(2, 1, 0, 0)); + M.r[3] = vTemp; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveFovLH +( + float FovAngleY, + float AspectRatio, + float NearZ, + float FarZ +) noexcept +{ + assert(NearZ > 0.f && FarZ > 0.f); + assert(!XMScalarNearEqual(FovAngleY, 0.0f, 0.00001f * 2.0f)); + assert(!XMScalarNearEqual(AspectRatio, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float SinFov; + float CosFov; + XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY); + + float Height = CosFov / SinFov; + float Width = Height / AspectRatio; + float fRange = FarZ / (FarZ - NearZ); + + XMMATRIX M; + M.m[0][0] = Width; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = Height; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = fRange; + M.m[2][3] = 1.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = -fRange * NearZ; + M.m[3][3] = 0.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float SinFov; + float CosFov; + XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY); + + float fRange = FarZ / (FarZ - NearZ); + float Height = CosFov / SinFov; + float Width = Height / AspectRatio; + const float32x4_t Zero = vdupq_n_f32(0); + + XMMATRIX M; + M.r[0] = vsetq_lane_f32(Width, Zero, 0); + M.r[1] = vsetq_lane_f32(Height, Zero, 1); + M.r[2] = vsetq_lane_f32(fRange, g_XMIdentityR3.v, 2); + M.r[3] = vsetq_lane_f32(-fRange * NearZ, Zero, 2); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + float SinFov; + float CosFov; + XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY); + + float fRange = FarZ / (FarZ - NearZ); + // Note: This is recorded on the stack + float Height = CosFov / SinFov; + XMVECTOR rMem = { + Height / AspectRatio, + Height, + fRange, + -fRange * NearZ + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp, vValues); + // Height / AspectRatio,0,0,0 + XMMATRIX M; + M.r[0] = vTemp; + // 0,Height,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp, g_XMMaskY); + M.r[1] = vTemp; + // x=fRange,y=-fRange * NearZ,0,1.0f + vTemp = _mm_setzero_ps(); + vValues = _mm_shuffle_ps(vValues, g_XMIdentityR3, _MM_SHUFFLE(3, 2, 3, 2)); + // 0,0,fRange,1.0f + vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(3, 0, 0, 0)); + M.r[2] = vTemp; + // 0,0,-fRange * NearZ,0.0f + vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(2, 1, 0, 0)); + M.r[3] = vTemp; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveFovRH +( + float FovAngleY, + float AspectRatio, + float NearZ, + float FarZ +) noexcept +{ + assert(NearZ > 0.f && FarZ > 0.f); + assert(!XMScalarNearEqual(FovAngleY, 0.0f, 0.00001f * 2.0f)); + assert(!XMScalarNearEqual(AspectRatio, 0.0f, 0.00001f)); + 
assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float SinFov; + float CosFov; + XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY); + + float Height = CosFov / SinFov; + float Width = Height / AspectRatio; + float fRange = FarZ / (NearZ - FarZ); + + XMMATRIX M; + M.m[0][0] = Width; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = Height; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = fRange; + M.m[2][3] = -1.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = fRange * NearZ; + M.m[3][3] = 0.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float SinFov; + float CosFov; + XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY); + float fRange = FarZ / (NearZ - FarZ); + float Height = CosFov / SinFov; + float Width = Height / AspectRatio; + const float32x4_t Zero = vdupq_n_f32(0); + + XMMATRIX M; + M.r[0] = vsetq_lane_f32(Width, Zero, 0); + M.r[1] = vsetq_lane_f32(Height, Zero, 1); + M.r[2] = vsetq_lane_f32(fRange, g_XMNegIdentityR3.v, 2); + M.r[3] = vsetq_lane_f32(fRange * NearZ, Zero, 2); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + float SinFov; + float CosFov; + XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY); + float fRange = FarZ / (NearZ - FarZ); + // Note: This is recorded on the stack + float Height = CosFov / SinFov; + XMVECTOR rMem = { + Height / AspectRatio, + Height, + fRange, + fRange * NearZ + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp, vValues); + // Height / AspectRatio,0,0,0 + XMMATRIX M; + M.r[0] = vTemp; + // 0,Height,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp, g_XMMaskY); + M.r[1] = vTemp; + // x=fRange,y=-fRange * NearZ,0,-1.0f + vTemp = _mm_setzero_ps(); + vValues = _mm_shuffle_ps(vValues, g_XMNegIdentityR3, _MM_SHUFFLE(3, 2, 3, 2)); + // 0,0,fRange,-1.0f + vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(3, 0, 0, 0)); + M.r[2] = vTemp; + // 0,0,fRange * NearZ,0.0f + vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(2, 1, 0, 0)); + M.r[3] = vTemp; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveOffCenterLH +( + float ViewLeft, + float ViewRight, + float ViewBottom, + float ViewTop, + float NearZ, + float FarZ +) noexcept +{ + assert(NearZ > 0.f && FarZ > 0.f); + assert(!XMScalarNearEqual(ViewRight, ViewLeft, 0.00001f)); + assert(!XMScalarNearEqual(ViewTop, ViewBottom, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float TwoNearZ = NearZ + NearZ; + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = FarZ / (FarZ - NearZ); + + XMMATRIX M; + M.m[0][0] = TwoNearZ * ReciprocalWidth; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = TwoNearZ * ReciprocalHeight; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = -(ViewLeft + ViewRight) * ReciprocalWidth; + M.m[2][1] = -(ViewTop + ViewBottom) * ReciprocalHeight; + M.m[2][2] = fRange; + M.m[2][3] = 1.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = -fRange * NearZ; + M.m[3][3] = 0.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float TwoNearZ = NearZ + NearZ; + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 
1.0f / (ViewTop - ViewBottom); + float fRange = FarZ / (FarZ - NearZ); + const float32x4_t Zero = vdupq_n_f32(0); + + XMMATRIX M; + M.r[0] = vsetq_lane_f32(TwoNearZ * ReciprocalWidth, Zero, 0); + M.r[1] = vsetq_lane_f32(TwoNearZ * ReciprocalHeight, Zero, 1); + M.r[2] = XMVectorSet(-(ViewLeft + ViewRight) * ReciprocalWidth, + -(ViewTop + ViewBottom) * ReciprocalHeight, + fRange, + 1.0f); + M.r[3] = vsetq_lane_f32(-fRange * NearZ, Zero, 2); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + float TwoNearZ = NearZ + NearZ; + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = FarZ / (FarZ - NearZ); + // Note: This is recorded on the stack + XMVECTOR rMem = { + TwoNearZ * ReciprocalWidth, + TwoNearZ * ReciprocalHeight, + -fRange * NearZ, + 0 + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp, vValues); + // TwoNearZ*ReciprocalWidth,0,0,0 + M.r[0] = vTemp; + // 0,TwoNearZ*ReciprocalHeight,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp, g_XMMaskY); + M.r[1] = vTemp; + // 0,0,fRange,1.0f + M.r[2] = XMVectorSet(-(ViewLeft + ViewRight) * ReciprocalWidth, + -(ViewTop + ViewBottom) * ReciprocalHeight, + fRange, + 1.0f); + // 0,0,-fRange * NearZ,0.0f + vValues = _mm_and_ps(vValues, g_XMMaskZ); + M.r[3] = vValues; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveOffCenterRH +( + float ViewLeft, + float ViewRight, + float ViewBottom, + float ViewTop, + float NearZ, + float FarZ +) noexcept +{ + assert(NearZ > 0.f && FarZ > 0.f); + assert(!XMScalarNearEqual(ViewRight, ViewLeft, 0.00001f)); + assert(!XMScalarNearEqual(ViewTop, ViewBottom, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float TwoNearZ = NearZ + NearZ; + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = FarZ / (NearZ - FarZ); + + XMMATRIX M; + M.m[0][0] = TwoNearZ * ReciprocalWidth; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = TwoNearZ * ReciprocalHeight; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = (ViewLeft + ViewRight) * ReciprocalWidth; + M.m[2][1] = (ViewTop + ViewBottom) * ReciprocalHeight; + M.m[2][2] = fRange; + M.m[2][3] = -1.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = fRange * NearZ; + M.m[3][3] = 0.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float TwoNearZ = NearZ + NearZ; + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = FarZ / (NearZ - FarZ); + const float32x4_t Zero = vdupq_n_f32(0); + + XMMATRIX M; + M.r[0] = vsetq_lane_f32(TwoNearZ * ReciprocalWidth, Zero, 0); + M.r[1] = vsetq_lane_f32(TwoNearZ * ReciprocalHeight, Zero, 1); + M.r[2] = XMVectorSet((ViewLeft + ViewRight) * ReciprocalWidth, + (ViewTop + ViewBottom) * ReciprocalHeight, + fRange, + -1.0f); + M.r[3] = vsetq_lane_f32(fRange * NearZ, Zero, 2); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + float TwoNearZ = NearZ + NearZ; + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = FarZ / (NearZ - FarZ); + // Note: This is recorded on the stack + XMVECTOR rMem = { + TwoNearZ * 
ReciprocalWidth, + TwoNearZ * ReciprocalHeight, + fRange * NearZ, + 0 + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp, vValues); + // TwoNearZ*ReciprocalWidth,0,0,0 + M.r[0] = vTemp; + // 0,TwoNearZ*ReciprocalHeight,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp, g_XMMaskY); + M.r[1] = vTemp; + // 0,0,fRange,1.0f + M.r[2] = XMVectorSet((ViewLeft + ViewRight) * ReciprocalWidth, + (ViewTop + ViewBottom) * ReciprocalHeight, + fRange, + -1.0f); + // 0,0,-fRange * NearZ,0.0f + vValues = _mm_and_ps(vValues, g_XMMaskZ); + M.r[3] = vValues; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixOrthographicLH +( + float ViewWidth, + float ViewHeight, + float NearZ, + float FarZ +) noexcept +{ + assert(!XMScalarNearEqual(ViewWidth, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(ViewHeight, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float fRange = 1.0f / (FarZ - NearZ); + + XMMATRIX M; + M.m[0][0] = 2.0f / ViewWidth; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = 2.0f / ViewHeight; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = fRange; + M.m[2][3] = 0.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = -fRange * NearZ; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float fRange = 1.0f / (FarZ - NearZ); + + const float32x4_t Zero = vdupq_n_f32(0); + XMMATRIX M; + M.r[0] = vsetq_lane_f32(2.0f / ViewWidth, Zero, 0); + M.r[1] = vsetq_lane_f32(2.0f / ViewHeight, Zero, 1); + M.r[2] = vsetq_lane_f32(fRange, Zero, 2); + M.r[3] = vsetq_lane_f32(-fRange * NearZ, g_XMIdentityR3.v, 2); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + float fRange = 1.0f / (FarZ - NearZ); + // Note: This is recorded on the stack + XMVECTOR rMem = { + 2.0f / ViewWidth, + 2.0f / ViewHeight, + fRange, + -fRange * NearZ + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp, vValues); + // 2.0f / ViewWidth,0,0,0 + M.r[0] = vTemp; + // 0,2.0f / ViewHeight,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp, g_XMMaskY); + M.r[1] = vTemp; + // x=fRange,y=-fRange * NearZ,0,1.0f + vTemp = _mm_setzero_ps(); + vValues = _mm_shuffle_ps(vValues, g_XMIdentityR3, _MM_SHUFFLE(3, 2, 3, 2)); + // 0,0,fRange,0.0f + vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(2, 0, 0, 0)); + M.r[2] = vTemp; + // 0,0,-fRange * NearZ,1.0f + vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(3, 1, 0, 0)); + M.r[3] = vTemp; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixOrthographicRH +( + float ViewWidth, + float ViewHeight, + float NearZ, + float FarZ +) noexcept +{ + assert(!XMScalarNearEqual(ViewWidth, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(ViewHeight, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float fRange = 1.0f / (NearZ - FarZ); + + XMMATRIX M; + M.m[0][0] = 2.0f / ViewWidth; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = 2.0f / ViewHeight; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = 
fRange; + M.m[2][3] = 0.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = fRange * NearZ; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float fRange = 1.0f / (NearZ - FarZ); + + const float32x4_t Zero = vdupq_n_f32(0); + XMMATRIX M; + M.r[0] = vsetq_lane_f32(2.0f / ViewWidth, Zero, 0); + M.r[1] = vsetq_lane_f32(2.0f / ViewHeight, Zero, 1); + M.r[2] = vsetq_lane_f32(fRange, Zero, 2); + M.r[3] = vsetq_lane_f32(fRange * NearZ, g_XMIdentityR3.v, 2); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + float fRange = 1.0f / (NearZ - FarZ); + // Note: This is recorded on the stack + XMVECTOR rMem = { + 2.0f / ViewWidth, + 2.0f / ViewHeight, + fRange, + fRange * NearZ + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp, vValues); + // 2.0f / ViewWidth,0,0,0 + M.r[0] = vTemp; + // 0,2.0f / ViewHeight,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp, g_XMMaskY); + M.r[1] = vTemp; + // x=fRange,y=fRange * NearZ,0,1.0f + vTemp = _mm_setzero_ps(); + vValues = _mm_shuffle_ps(vValues, g_XMIdentityR3, _MM_SHUFFLE(3, 2, 3, 2)); + // 0,0,fRange,0.0f + vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(2, 0, 0, 0)); + M.r[2] = vTemp; + // 0,0,fRange * NearZ,1.0f + vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(3, 1, 0, 0)); + M.r[3] = vTemp; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixOrthographicOffCenterLH +( + float ViewLeft, + float ViewRight, + float ViewBottom, + float ViewTop, + float NearZ, + float FarZ +) noexcept +{ + assert(!XMScalarNearEqual(ViewRight, ViewLeft, 0.00001f)); + assert(!XMScalarNearEqual(ViewTop, ViewBottom, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = 1.0f / (FarZ - NearZ); + + XMMATRIX M; + M.m[0][0] = ReciprocalWidth + ReciprocalWidth; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = ReciprocalHeight + ReciprocalHeight; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = fRange; + M.m[2][3] = 0.0f; + + M.m[3][0] = -(ViewLeft + ViewRight) * ReciprocalWidth; + M.m[3][1] = -(ViewTop + ViewBottom) * ReciprocalHeight; + M.m[3][2] = -fRange * NearZ; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = 1.0f / (FarZ - NearZ); + const float32x4_t Zero = vdupq_n_f32(0); + XMMATRIX M; + M.r[0] = vsetq_lane_f32(ReciprocalWidth + ReciprocalWidth, Zero, 0); + M.r[1] = vsetq_lane_f32(ReciprocalHeight + ReciprocalHeight, Zero, 1); + M.r[2] = vsetq_lane_f32(fRange, Zero, 2); + M.r[3] = XMVectorSet(-(ViewLeft + ViewRight) * ReciprocalWidth, + -(ViewTop + ViewBottom) * ReciprocalHeight, + -fRange * NearZ, + 1.0f); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + float fReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float fReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = 1.0f / (FarZ - NearZ); + // Note: This is recorded on the stack + XMVECTOR rMem = { + fReciprocalWidth, + fReciprocalHeight, + fRange, + 1.0f + }; + XMVECTOR rMem2 = { + -(ViewLeft + ViewRight), + -(ViewTop + 
ViewBottom), + -NearZ, + 1.0f + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp, vValues); + // fReciprocalWidth*2,0,0,0 + vTemp = _mm_add_ss(vTemp, vTemp); + M.r[0] = vTemp; + // 0,fReciprocalHeight*2,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp, g_XMMaskY); + vTemp = _mm_add_ps(vTemp, vTemp); + M.r[1] = vTemp; + // 0,0,fRange,0.0f + vTemp = vValues; + vTemp = _mm_and_ps(vTemp, g_XMMaskZ); + M.r[2] = vTemp; + // -(ViewLeft + ViewRight)*fReciprocalWidth,-(ViewTop + ViewBottom)*fReciprocalHeight,fRange*-NearZ,1.0f + vValues = _mm_mul_ps(vValues, rMem2); + M.r[3] = vValues; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixOrthographicOffCenterRH +( + float ViewLeft, + float ViewRight, + float ViewBottom, + float ViewTop, + float NearZ, + float FarZ +) noexcept +{ + assert(!XMScalarNearEqual(ViewRight, ViewLeft, 0.00001f)); + assert(!XMScalarNearEqual(ViewTop, ViewBottom, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = 1.0f / (NearZ - FarZ); + + XMMATRIX M; + M.m[0][0] = ReciprocalWidth + ReciprocalWidth; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = ReciprocalHeight + ReciprocalHeight; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = fRange; + M.m[2][3] = 0.0f; + + M.r[3] = XMVectorSet(-(ViewLeft + ViewRight) * ReciprocalWidth, + -(ViewTop + ViewBottom) * ReciprocalHeight, + fRange * NearZ, + 1.0f); + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = 1.0f / (NearZ - FarZ); + const float32x4_t Zero = vdupq_n_f32(0); + XMMATRIX M; + M.r[0] = vsetq_lane_f32(ReciprocalWidth + ReciprocalWidth, Zero, 0); + M.r[1] = vsetq_lane_f32(ReciprocalHeight + ReciprocalHeight, Zero, 1); + M.r[2] = vsetq_lane_f32(fRange, Zero, 2); + M.r[3] = XMVectorSet(-(ViewLeft + ViewRight) * ReciprocalWidth, + -(ViewTop + ViewBottom) * ReciprocalHeight, + fRange * NearZ, + 1.0f); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + float fReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float fReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = 1.0f / (NearZ - FarZ); + // Note: This is recorded on the stack + XMVECTOR rMem = { + fReciprocalWidth, + fReciprocalHeight, + fRange, + 1.0f + }; + XMVECTOR rMem2 = { + -(ViewLeft + ViewRight), + -(ViewTop + ViewBottom), + NearZ, + 1.0f + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp, vValues); + // fReciprocalWidth*2,0,0,0 + vTemp = _mm_add_ss(vTemp, vTemp); + M.r[0] = vTemp; + // 0,fReciprocalHeight*2,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp, g_XMMaskY); + vTemp = _mm_add_ps(vTemp, vTemp); + M.r[1] = vTemp; + // 0,0,fRange,0.0f + vTemp = vValues; + vTemp = _mm_and_ps(vTemp, g_XMMaskZ); + M.r[2] = vTemp; + // -(ViewLeft + ViewRight)*fReciprocalWidth,-(ViewTop + ViewBottom)*fReciprocalHeight,fRange*-NearZ,1.0f + vValues = _mm_mul_ps(vValues, rMem2); + M.r[3] = vValues; + return M; +#endif +} + +#ifdef _PREFAST_ +#pragma prefast(pop) 
+#endif
+
+/****************************************************************************
+ *
+ * XMMATRIX operators and methods
+ *
+ ****************************************************************************/
+
+ //------------------------------------------------------------------------------
+
+inline XMMATRIX::XMMATRIX
+(
+    float m00, float m01, float m02, float m03,
+    float m10, float m11, float m12, float m13,
+    float m20, float m21, float m22, float m23,
+    float m30, float m31, float m32, float m33
+) noexcept
+{
+    r[0] = XMVectorSet(m00, m01, m02, m03);
+    r[1] = XMVectorSet(m10, m11, m12, m13);
+    r[2] = XMVectorSet(m20, m21, m22, m23);
+    r[3] = XMVectorSet(m30, m31, m32, m33);
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMMATRIX::XMMATRIX(const float* pArray) noexcept
+{
+    assert(pArray != nullptr);
+    r[0] = XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray));
+    r[1] = XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray + 4));
+    r[2] = XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray + 8));
+    r[3] = XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray + 12));
+}
+
+//------------------------------------------------------------------------------
+
+inline XMMATRIX XMMATRIX::operator- () const noexcept
+{
+    XMMATRIX R;
+    R.r[0] = XMVectorNegate(r[0]);
+    R.r[1] = XMVectorNegate(r[1]);
+    R.r[2] = XMVectorNegate(r[2]);
+    R.r[3] = XMVectorNegate(r[3]);
+    return R;
+}
+
+//------------------------------------------------------------------------------
+
+inline XMMATRIX& XM_CALLCONV XMMATRIX::operator+= (FXMMATRIX M) noexcept
+{
+    r[0] = XMVectorAdd(r[0], M.r[0]);
+    r[1] = XMVectorAdd(r[1], M.r[1]);
+    r[2] = XMVectorAdd(r[2], M.r[2]);
+    r[3] = XMVectorAdd(r[3], M.r[3]);
+    return *this;
+}
+
+//------------------------------------------------------------------------------
+
+inline XMMATRIX& XM_CALLCONV XMMATRIX::operator-= (FXMMATRIX M) noexcept
+{
+    r[0] = XMVectorSubtract(r[0], M.r[0]);
+    r[1] = XMVectorSubtract(r[1], M.r[1]);
+    r[2] = XMVectorSubtract(r[2], M.r[2]);
+    r[3] = XMVectorSubtract(r[3], M.r[3]);
+    return *this;
+}
+
+//------------------------------------------------------------------------------
+
+inline XMMATRIX& XM_CALLCONV XMMATRIX::operator*=(FXMMATRIX M) noexcept
+{
+    *this = XMMatrixMultiply(*this, M);
+    return *this;
+}
+
+//------------------------------------------------------------------------------
+
+inline XMMATRIX& XMMATRIX::operator*= (float S) noexcept
+{
+    r[0] = XMVectorScale(r[0], S);
+    r[1] = XMVectorScale(r[1], S);
+    r[2] = XMVectorScale(r[2], S);
+    r[3] = XMVectorScale(r[3], S);
+    return *this;
+}
+
+//------------------------------------------------------------------------------
+
+inline XMMATRIX& XMMATRIX::operator/= (float S) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR vS = XMVectorReplicate(S);
+    r[0] = XMVectorDivide(r[0], vS);
+    r[1] = XMVectorDivide(r[1], vS);
+    r[2] = XMVectorDivide(r[2], vS);
+    r[3] = XMVectorDivide(r[3], vS);
+    return *this;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__
+    float32x4_t vS = vdupq_n_f32(S);
+    r[0] = vdivq_f32(r[0], vS);
+    r[1] = vdivq_f32(r[1], vS);
+    r[2] = vdivq_f32(r[2], vS);
+    r[3] = vdivq_f32(r[3], vS);
+#else
+    // 2 iterations of Newton-Raphson refinement of reciprocal
+    float32x2_t vS = vdup_n_f32(S);
+    float32x2_t R0 = vrecpe_f32(vS);
+    float32x2_t S0 = vrecps_f32(R0, vS);
+    R0 = vmul_f32(S0, R0);
+    S0 = vrecps_f32(R0, vS);
+    R0 = vmul_f32(S0, R0);
+    float32x4_t Reciprocal =
vcombine_f32(R0, R0); + r[0] = vmulq_f32(r[0], Reciprocal); + r[1] = vmulq_f32(r[1], Reciprocal); + r[2] = vmulq_f32(r[2], Reciprocal); + r[3] = vmulq_f32(r[3], Reciprocal); +#endif + return *this; +#elif defined(_XM_SSE_INTRINSICS_) + __m128 vS = _mm_set_ps1(S); + r[0] = _mm_div_ps(r[0], vS); + r[1] = _mm_div_ps(r[1], vS); + r[2] = _mm_div_ps(r[2], vS); + r[3] = _mm_div_ps(r[3], vS); + return *this; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMATRIX::operator+ (FXMMATRIX M) const noexcept +{ + XMMATRIX R; + R.r[0] = XMVectorAdd(r[0], M.r[0]); + R.r[1] = XMVectorAdd(r[1], M.r[1]); + R.r[2] = XMVectorAdd(r[2], M.r[2]); + R.r[3] = XMVectorAdd(r[3], M.r[3]); + return R; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMATRIX::operator- (FXMMATRIX M) const noexcept +{ + XMMATRIX R; + R.r[0] = XMVectorSubtract(r[0], M.r[0]); + R.r[1] = XMVectorSubtract(r[1], M.r[1]); + R.r[2] = XMVectorSubtract(r[2], M.r[2]); + R.r[3] = XMVectorSubtract(r[3], M.r[3]); + return R; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMATRIX::operator*(FXMMATRIX M) const noexcept +{ + return XMMatrixMultiply(*this, M); +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XMMATRIX::operator* (float S) const noexcept +{ + XMMATRIX R; + R.r[0] = XMVectorScale(r[0], S); + R.r[1] = XMVectorScale(r[1], S); + R.r[2] = XMVectorScale(r[2], S); + R.r[3] = XMVectorScale(r[3], S); + return R; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XMMATRIX::operator/ (float S) const noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR vS = XMVectorReplicate(S); + XMMATRIX R; + R.r[0] = XMVectorDivide(r[0], vS); + R.r[1] = XMVectorDivide(r[1], vS); + R.r[2] = XMVectorDivide(r[2], vS); + R.r[3] = XMVectorDivide(r[3], vS); + return R; +#elif defined(_XM_ARM_NEON_INTRINSICS_) +#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__ + float32x4_t vS = vdupq_n_f32(S); + XMMATRIX R; + R.r[0] = vdivq_f32(r[0], vS); + R.r[1] = vdivq_f32(r[1], vS); + R.r[2] = vdivq_f32(r[2], vS); + R.r[3] = vdivq_f32(r[3], vS); +#else + // 2 iterations of Newton-Raphson refinement of reciprocal + float32x2_t vS = vdup_n_f32(S); + float32x2_t R0 = vrecpe_f32(vS); + float32x2_t S0 = vrecps_f32(R0, vS); + R0 = vmul_f32(S0, R0); + S0 = vrecps_f32(R0, vS); + R0 = vmul_f32(S0, R0); + float32x4_t Reciprocal = vcombine_f32(R0, R0); + XMMATRIX R; + R.r[0] = vmulq_f32(r[0], Reciprocal); + R.r[1] = vmulq_f32(r[1], Reciprocal); + R.r[2] = vmulq_f32(r[2], Reciprocal); + R.r[3] = vmulq_f32(r[3], Reciprocal); +#endif + return R; +#elif defined(_XM_SSE_INTRINSICS_) + __m128 vS = _mm_set_ps1(S); + XMMATRIX R; + R.r[0] = _mm_div_ps(r[0], vS); + R.r[1] = _mm_div_ps(r[1], vS); + R.r[2] = _mm_div_ps(r[2], vS); + R.r[3] = _mm_div_ps(r[3], vS); + return R; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV operator* +( + float S, + FXMMATRIX M +) noexcept +{ + XMMATRIX R; + R.r[0] = XMVectorScale(M.r[0], S); + R.r[1] = XMVectorScale(M.r[1], S); + R.r[2] = XMVectorScale(M.r[2], S); + R.r[3] = XMVectorScale(M.r[3], S); + return R; +} + +/**************************************************************************** + * + * XMFLOAT3X3 operators + * 
+ ****************************************************************************/ + + //------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMFLOAT3X3::XMFLOAT3X3(const float* pArray) noexcept +{ + assert(pArray != nullptr); + for (size_t Row = 0; Row < 3; Row++) + { + for (size_t Column = 0; Column < 3; Column++) + { + m[Row][Column] = pArray[Row * 3 + Column]; + } + } +} + +/**************************************************************************** + * + * XMFLOAT4X3 operators + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMFLOAT4X3::XMFLOAT4X3(const float* pArray) noexcept +{ + assert(pArray != nullptr); + + m[0][0] = pArray[0]; + m[0][1] = pArray[1]; + m[0][2] = pArray[2]; + + m[1][0] = pArray[3]; + m[1][1] = pArray[4]; + m[1][2] = pArray[5]; + + m[2][0] = pArray[6]; + m[2][1] = pArray[7]; + m[2][2] = pArray[8]; + + m[3][0] = pArray[9]; + m[3][1] = pArray[10]; + m[3][2] = pArray[11]; +} + +/**************************************************************************** +* +* XMFLOAT3X4 operators +* +****************************************************************************/ + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMFLOAT3X4::XMFLOAT3X4(const float* pArray) noexcept +{ + assert(pArray != nullptr); + + m[0][0] = pArray[0]; + m[0][1] = pArray[1]; + m[0][2] = pArray[2]; + m[0][3] = pArray[3]; + + m[1][0] = pArray[4]; + m[1][1] = pArray[5]; + m[1][2] = pArray[6]; + m[1][3] = pArray[7]; + + m[2][0] = pArray[8]; + m[2][1] = pArray[9]; + m[2][2] = pArray[10]; + m[2][3] = pArray[11]; +} + +/**************************************************************************** + * + * XMFLOAT4X4 operators + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMFLOAT4X4::XMFLOAT4X4(const float* pArray) noexcept +{ + assert(pArray != nullptr); + + m[0][0] = pArray[0]; + m[0][1] = pArray[1]; + m[0][2] = pArray[2]; + m[0][3] = pArray[3]; + + m[1][0] = pArray[4]; + m[1][1] = pArray[5]; + m[1][2] = pArray[6]; + m[1][3] = pArray[7]; + + m[2][0] = pArray[8]; + m[2][1] = pArray[9]; + m[2][2] = pArray[10]; + m[2][3] = pArray[11]; + + m[3][0] = pArray[12]; + m[3][1] = pArray[13]; + m[3][2] = pArray[14]; + m[3][3] = pArray[15]; +} + diff --git a/src/thirdparty/DirectXMath-dec2022/Inc/DirectXMathMisc.inl b/src/thirdparty/DirectXMath-dec2022/Inc/DirectXMathMisc.inl new file mode 100644 index 000000000..5f88da642 --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/Inc/DirectXMathMisc.inl @@ -0,0 +1,2493 @@ +//------------------------------------------------------------------------------------- +// DirectXMathMisc.inl -- SIMD C++ Math library +// +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
+// +// http://go.microsoft.com/fwlink/?LinkID=615560 +//------------------------------------------------------------------------------------- + +#pragma once + +/**************************************************************************** + * + * Quaternion + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ + // Comparison operations + //------------------------------------------------------------------------------ + + //------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMQuaternionEqual +( + FXMVECTOR Q1, + FXMVECTOR Q2 +) noexcept +{ + return XMVector4Equal(Q1, Q2); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMQuaternionNotEqual +( + FXMVECTOR Q1, + FXMVECTOR Q2 +) noexcept +{ + return XMVector4NotEqual(Q1, Q2); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMQuaternionIsNaN(FXMVECTOR Q) noexcept +{ + return XMVector4IsNaN(Q); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMQuaternionIsInfinite(FXMVECTOR Q) noexcept +{ + return XMVector4IsInfinite(Q); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMQuaternionIsIdentity(FXMVECTOR Q) noexcept +{ + return XMVector4Equal(Q, g_XMIdentityR3.v); +} + +//------------------------------------------------------------------------------ +// Computation operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionDot +( + FXMVECTOR Q1, + FXMVECTOR Q2 +) noexcept +{ + return XMVector4Dot(Q1, Q2); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionMultiply +( + FXMVECTOR Q1, + FXMVECTOR Q2 +) noexcept +{ + // Returns the product Q2*Q1 (which is the concatenation of a rotation Q1 followed by the rotation Q2) + + // [ (Q2.w * Q1.x) + (Q2.x * Q1.w) + (Q2.y * Q1.z) - (Q2.z * Q1.y), + // (Q2.w * Q1.y) - (Q2.x * Q1.z) + (Q2.y * Q1.w) + (Q2.z * Q1.x), + // (Q2.w * Q1.z) + (Q2.x * Q1.y) - (Q2.y * Q1.x) + (Q2.z * Q1.w), + // (Q2.w * Q1.w) - (Q2.x * Q1.x) - (Q2.y * Q1.y) - (Q2.z * Q1.z) ] + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + (Q2.vector4_f32[3] * Q1.vector4_f32[0]) + (Q2.vector4_f32[0] * Q1.vector4_f32[3]) + (Q2.vector4_f32[1] * Q1.vector4_f32[2]) - (Q2.vector4_f32[2] * Q1.vector4_f32[1]), + (Q2.vector4_f32[3] * Q1.vector4_f32[1]) - (Q2.vector4_f32[0] * Q1.vector4_f32[2]) + (Q2.vector4_f32[1] * Q1.vector4_f32[3]) + (Q2.vector4_f32[2] * Q1.vector4_f32[0]), + (Q2.vector4_f32[3] * Q1.vector4_f32[2]) + (Q2.vector4_f32[0] * Q1.vector4_f32[1]) - (Q2.vector4_f32[1] * Q1.vector4_f32[0]) + (Q2.vector4_f32[2] * Q1.vector4_f32[3]), + (Q2.vector4_f32[3] * Q1.vector4_f32[3]) - (Q2.vector4_f32[0] * Q1.vector4_f32[0]) - (Q2.vector4_f32[1] * Q1.vector4_f32[1]) - (Q2.vector4_f32[2] * Q1.vector4_f32[2]) + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 ControlWZYX = { { { 1.0f, -1.0f, 1.0f, -1.0f } } }; + static const XMVECTORF32 ControlZWXY = { { { 1.0f, 1.0f, -1.0f, -1.0f } } }; + static const XMVECTORF32 ControlYXWZ = { { { -1.0f, 
1.0f, 1.0f, -1.0f } } }; + + float32x2_t Q2L = vget_low_f32(Q2); + float32x2_t Q2H = vget_high_f32(Q2); + + float32x4_t Q2X = vdupq_lane_f32(Q2L, 0); + float32x4_t Q2Y = vdupq_lane_f32(Q2L, 1); + float32x4_t Q2Z = vdupq_lane_f32(Q2H, 0); + XMVECTOR vResult = vmulq_lane_f32(Q1, Q2H, 1); + + // Mul by Q1WZYX + float32x4_t vTemp = vrev64q_f32(Q1); + vTemp = vcombine_f32(vget_high_f32(vTemp), vget_low_f32(vTemp)); + Q2X = vmulq_f32(Q2X, vTemp); + vResult = vmlaq_f32(vResult, Q2X, ControlWZYX); + + // Mul by Q1ZWXY + vTemp = vreinterpretq_f32_u32(vrev64q_u32(vreinterpretq_u32_f32(vTemp))); + Q2Y = vmulq_f32(Q2Y, vTemp); + vResult = vmlaq_f32(vResult, Q2Y, ControlZWXY); + + // Mul by Q1YXWZ + vTemp = vreinterpretq_f32_u32(vrev64q_u32(vreinterpretq_u32_f32(vTemp))); + vTemp = vcombine_f32(vget_high_f32(vTemp), vget_low_f32(vTemp)); + Q2Z = vmulq_f32(Q2Z, vTemp); + vResult = vmlaq_f32(vResult, Q2Z, ControlYXWZ); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 ControlWZYX = { { { 1.0f, -1.0f, 1.0f, -1.0f } } }; + static const XMVECTORF32 ControlZWXY = { { { 1.0f, 1.0f, -1.0f, -1.0f } } }; + static const XMVECTORF32 ControlYXWZ = { { { -1.0f, 1.0f, 1.0f, -1.0f } } }; + // Copy to SSE registers and use as few as possible for x86 + XMVECTOR Q2X = Q2; + XMVECTOR Q2Y = Q2; + XMVECTOR Q2Z = Q2; + XMVECTOR vResult = Q2; + // Splat with one instruction + vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(3, 3, 3, 3)); + Q2X = XM_PERMUTE_PS(Q2X, _MM_SHUFFLE(0, 0, 0, 0)); + Q2Y = XM_PERMUTE_PS(Q2Y, _MM_SHUFFLE(1, 1, 1, 1)); + Q2Z = XM_PERMUTE_PS(Q2Z, _MM_SHUFFLE(2, 2, 2, 2)); + // Retire Q1 and perform Q1*Q2W + vResult = _mm_mul_ps(vResult, Q1); + XMVECTOR Q1Shuffle = Q1; + // Shuffle the copies of Q1 + Q1Shuffle = XM_PERMUTE_PS(Q1Shuffle, _MM_SHUFFLE(0, 1, 2, 3)); + // Mul by Q1WZYX + Q2X = _mm_mul_ps(Q2X, Q1Shuffle); + Q1Shuffle = XM_PERMUTE_PS(Q1Shuffle, _MM_SHUFFLE(2, 3, 0, 1)); + // Flip the signs on y and z + vResult = XM_FMADD_PS(Q2X, ControlWZYX, vResult); + // Mul by Q1ZWXY + Q2Y = _mm_mul_ps(Q2Y, Q1Shuffle); + Q1Shuffle = XM_PERMUTE_PS(Q1Shuffle, _MM_SHUFFLE(0, 1, 2, 3)); + // Flip the signs on z and w + Q2Y = _mm_mul_ps(Q2Y, ControlZWXY); + // Mul by Q1YXWZ + Q2Z = _mm_mul_ps(Q2Z, Q1Shuffle); + // Flip the signs on x and w + Q2Y = XM_FMADD_PS(Q2Z, ControlYXWZ, Q2Y); + vResult = _mm_add_ps(vResult, Q2Y); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionLengthSq(FXMVECTOR Q) noexcept +{ + return XMVector4LengthSq(Q); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionReciprocalLength(FXMVECTOR Q) noexcept +{ + return XMVector4ReciprocalLength(Q); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionLength(FXMVECTOR Q) noexcept +{ + return XMVector4Length(Q); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionNormalizeEst(FXMVECTOR Q) noexcept +{ + return XMVector4NormalizeEst(Q); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionNormalize(FXMVECTOR Q) noexcept +{ + return XMVector4Normalize(Q); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionConjugate(FXMVECTOR Q) noexcept +{ 
+#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + -Q.vector4_f32[0], + -Q.vector4_f32[1], + -Q.vector4_f32[2], + Q.vector4_f32[3] + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 NegativeOne3 = { { { -1.0f, -1.0f, -1.0f, 1.0f } } }; + return vmulq_f32(Q, NegativeOne3.v); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 NegativeOne3 = { { { -1.0f, -1.0f, -1.0f, 1.0f } } }; + return _mm_mul_ps(Q, NegativeOne3); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionInverse(FXMVECTOR Q) noexcept +{ + XMVECTOR L = XMVector4LengthSq(Q); + XMVECTOR Conjugate = XMQuaternionConjugate(Q); + + XMVECTOR Control = XMVectorLessOrEqual(L, g_XMEpsilon.v); + + XMVECTOR Result = XMVectorDivide(Conjugate, L); + + Result = XMVectorSelect(Result, g_XMZero, Control); + + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionLn(FXMVECTOR Q) noexcept +{ + static const XMVECTORF32 OneMinusEpsilon = { { { 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f } } }; + + XMVECTOR QW = XMVectorSplatW(Q); + XMVECTOR Q0 = XMVectorSelect(g_XMSelect1110.v, Q, g_XMSelect1110.v); + + XMVECTOR ControlW = XMVectorInBounds(QW, OneMinusEpsilon.v); + + XMVECTOR Theta = XMVectorACos(QW); + XMVECTOR SinTheta = XMVectorSin(Theta); + + XMVECTOR S = XMVectorDivide(Theta, SinTheta); + + XMVECTOR Result = XMVectorMultiply(Q0, S); + Result = XMVectorSelect(Q0, Result, ControlW); + + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionExp(FXMVECTOR Q) noexcept +{ + XMVECTOR Theta = XMVector3Length(Q); + + XMVECTOR SinTheta, CosTheta; + XMVectorSinCos(&SinTheta, &CosTheta, Theta); + + XMVECTOR S = XMVectorDivide(SinTheta, Theta); + + XMVECTOR Result = XMVectorMultiply(Q, S); + + const XMVECTOR Zero = XMVectorZero(); + XMVECTOR Control = XMVectorNearEqual(Theta, Zero, g_XMEpsilon.v); + Result = XMVectorSelect(Result, Q, Control); + + Result = XMVectorSelect(CosTheta, Result, g_XMSelect1110.v); + + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionSlerp +( + FXMVECTOR Q0, + FXMVECTOR Q1, + float t +) noexcept +{ + XMVECTOR T = XMVectorReplicate(t); + return XMQuaternionSlerpV(Q0, Q1, T); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionSlerpV +( + FXMVECTOR Q0, + FXMVECTOR Q1, + FXMVECTOR T +) noexcept +{ + assert((XMVectorGetY(T) == XMVectorGetX(T)) && (XMVectorGetZ(T) == XMVectorGetX(T)) && (XMVectorGetW(T) == XMVectorGetX(T))); + + // Result = Q0 * sin((1.0 - t) * Omega) / sin(Omega) + Q1 * sin(t * Omega) / sin(Omega) + +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + + const XMVECTORF32 OneMinusEpsilon = { { { 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f } } }; + + XMVECTOR CosOmega = XMQuaternionDot(Q0, Q1); + + const XMVECTOR Zero = XMVectorZero(); + XMVECTOR Control = XMVectorLess(CosOmega, Zero); + XMVECTOR Sign = XMVectorSelect(g_XMOne.v, g_XMNegativeOne.v, Control); + + CosOmega = XMVectorMultiply(CosOmega, Sign); + + Control = XMVectorLess(CosOmega, OneMinusEpsilon); + + XMVECTOR SinOmega = XMVectorNegativeMultiplySubtract(CosOmega, CosOmega, g_XMOne.v); + 
SinOmega = XMVectorSqrt(SinOmega); + + XMVECTOR Omega = XMVectorATan2(SinOmega, CosOmega); + + XMVECTOR SignMask = XMVectorSplatSignMask(); + XMVECTOR V01 = XMVectorShiftLeft(T, Zero, 2); + SignMask = XMVectorShiftLeft(SignMask, Zero, 3); + V01 = XMVectorXorInt(V01, SignMask); + V01 = XMVectorAdd(g_XMIdentityR0.v, V01); + + XMVECTOR InvSinOmega = XMVectorReciprocal(SinOmega); + + XMVECTOR S0 = XMVectorMultiply(V01, Omega); + S0 = XMVectorSin(S0); + S0 = XMVectorMultiply(S0, InvSinOmega); + + S0 = XMVectorSelect(V01, S0, Control); + + XMVECTOR S1 = XMVectorSplatY(S0); + S0 = XMVectorSplatX(S0); + + S1 = XMVectorMultiply(S1, Sign); + + XMVECTOR Result = XMVectorMultiply(Q0, S0); + Result = XMVectorMultiplyAdd(Q1, S1, Result); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 OneMinusEpsilon = { { { 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f } } }; + static const XMVECTORU32 SignMask2 = { { { 0x80000000, 0x00000000, 0x00000000, 0x00000000 } } }; + + XMVECTOR CosOmega = XMQuaternionDot(Q0, Q1); + + const XMVECTOR Zero = XMVectorZero(); + XMVECTOR Control = XMVectorLess(CosOmega, Zero); + XMVECTOR Sign = XMVectorSelect(g_XMOne, g_XMNegativeOne, Control); + + CosOmega = _mm_mul_ps(CosOmega, Sign); + + Control = XMVectorLess(CosOmega, OneMinusEpsilon); + + XMVECTOR SinOmega = _mm_mul_ps(CosOmega, CosOmega); + SinOmega = _mm_sub_ps(g_XMOne, SinOmega); + SinOmega = _mm_sqrt_ps(SinOmega); + + XMVECTOR Omega = XMVectorATan2(SinOmega, CosOmega); + + XMVECTOR V01 = XM_PERMUTE_PS(T, _MM_SHUFFLE(2, 3, 0, 1)); + V01 = _mm_and_ps(V01, g_XMMaskXY); + V01 = _mm_xor_ps(V01, SignMask2); + V01 = _mm_add_ps(g_XMIdentityR0, V01); + + XMVECTOR S0 = _mm_mul_ps(V01, Omega); + S0 = XMVectorSin(S0); + S0 = _mm_div_ps(S0, SinOmega); + + S0 = XMVectorSelect(V01, S0, Control); + + XMVECTOR S1 = XMVectorSplatY(S0); + S0 = XMVectorSplatX(S0); + + S1 = _mm_mul_ps(S1, Sign); + XMVECTOR Result = _mm_mul_ps(Q0, S0); + S1 = _mm_mul_ps(S1, Q1); + Result = _mm_add_ps(Result, S1); + return Result; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionSquad +( + FXMVECTOR Q0, + FXMVECTOR Q1, + FXMVECTOR Q2, + GXMVECTOR Q3, + float t +) noexcept +{ + XMVECTOR T = XMVectorReplicate(t); + return XMQuaternionSquadV(Q0, Q1, Q2, Q3, T); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionSquadV +( + FXMVECTOR Q0, + FXMVECTOR Q1, + FXMVECTOR Q2, + GXMVECTOR Q3, + HXMVECTOR T +) noexcept +{ + assert((XMVectorGetY(T) == XMVectorGetX(T)) && (XMVectorGetZ(T) == XMVectorGetX(T)) && (XMVectorGetW(T) == XMVectorGetX(T))); + + XMVECTOR TP = T; + const XMVECTOR Two = XMVectorSplatConstant(2, 0); + + XMVECTOR Q03 = XMQuaternionSlerpV(Q0, Q3, T); + XMVECTOR Q12 = XMQuaternionSlerpV(Q1, Q2, T); + + TP = XMVectorNegativeMultiplySubtract(TP, TP, TP); + TP = XMVectorMultiply(TP, Two); + + XMVECTOR Result = XMQuaternionSlerpV(Q03, Q12, TP); + + return Result; +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMQuaternionSquadSetup +( + XMVECTOR* pA, + XMVECTOR* pB, + XMVECTOR* pC, + FXMVECTOR Q0, + FXMVECTOR Q1, + FXMVECTOR Q2, + GXMVECTOR Q3 +) noexcept +{ + assert(pA); + assert(pB); + assert(pC); + + XMVECTOR LS12 = XMQuaternionLengthSq(XMVectorAdd(Q1, Q2)); + XMVECTOR LD12 = XMQuaternionLengthSq(XMVectorSubtract(Q1, Q2)); + XMVECTOR SQ2 = 
XMVectorNegate(Q2); + + XMVECTOR Control1 = XMVectorLess(LS12, LD12); + SQ2 = XMVectorSelect(Q2, SQ2, Control1); + + XMVECTOR LS01 = XMQuaternionLengthSq(XMVectorAdd(Q0, Q1)); + XMVECTOR LD01 = XMQuaternionLengthSq(XMVectorSubtract(Q0, Q1)); + XMVECTOR SQ0 = XMVectorNegate(Q0); + + XMVECTOR LS23 = XMQuaternionLengthSq(XMVectorAdd(SQ2, Q3)); + XMVECTOR LD23 = XMQuaternionLengthSq(XMVectorSubtract(SQ2, Q3)); + XMVECTOR SQ3 = XMVectorNegate(Q3); + + XMVECTOR Control0 = XMVectorLess(LS01, LD01); + XMVECTOR Control2 = XMVectorLess(LS23, LD23); + + SQ0 = XMVectorSelect(Q0, SQ0, Control0); + SQ3 = XMVectorSelect(Q3, SQ3, Control2); + + XMVECTOR InvQ1 = XMQuaternionInverse(Q1); + XMVECTOR InvQ2 = XMQuaternionInverse(SQ2); + + XMVECTOR LnQ0 = XMQuaternionLn(XMQuaternionMultiply(InvQ1, SQ0)); + XMVECTOR LnQ2 = XMQuaternionLn(XMQuaternionMultiply(InvQ1, SQ2)); + XMVECTOR LnQ1 = XMQuaternionLn(XMQuaternionMultiply(InvQ2, Q1)); + XMVECTOR LnQ3 = XMQuaternionLn(XMQuaternionMultiply(InvQ2, SQ3)); + + const XMVECTOR NegativeOneQuarter = XMVectorSplatConstant(-1, 2); + + XMVECTOR ExpQ02 = XMVectorMultiply(XMVectorAdd(LnQ0, LnQ2), NegativeOneQuarter); + XMVECTOR ExpQ13 = XMVectorMultiply(XMVectorAdd(LnQ1, LnQ3), NegativeOneQuarter); + ExpQ02 = XMQuaternionExp(ExpQ02); + ExpQ13 = XMQuaternionExp(ExpQ13); + + *pA = XMQuaternionMultiply(Q1, ExpQ02); + *pB = XMQuaternionMultiply(SQ2, ExpQ13); + *pC = SQ2; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionBaryCentric +( + FXMVECTOR Q0, + FXMVECTOR Q1, + FXMVECTOR Q2, + float f, + float g +) noexcept +{ + float s = f + g; + + XMVECTOR Result; + if ((s < 0.00001f) && (s > -0.00001f)) + { + Result = Q0; + } + else + { + XMVECTOR Q01 = XMQuaternionSlerp(Q0, Q1, s); + XMVECTOR Q02 = XMQuaternionSlerp(Q0, Q2, s); + + Result = XMQuaternionSlerp(Q01, Q02, g / s); + } + + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionBaryCentricV +( + FXMVECTOR Q0, + FXMVECTOR Q1, + FXMVECTOR Q2, + GXMVECTOR F, + HXMVECTOR G +) noexcept +{ + assert((XMVectorGetY(F) == XMVectorGetX(F)) && (XMVectorGetZ(F) == XMVectorGetX(F)) && (XMVectorGetW(F) == XMVectorGetX(F))); + assert((XMVectorGetY(G) == XMVectorGetX(G)) && (XMVectorGetZ(G) == XMVectorGetX(G)) && (XMVectorGetW(G) == XMVectorGetX(G))); + + const XMVECTOR Epsilon = XMVectorSplatConstant(1, 16); + + XMVECTOR S = XMVectorAdd(F, G); + + XMVECTOR Result; + if (XMVector4InBounds(S, Epsilon)) + { + Result = Q0; + } + else + { + XMVECTOR Q01 = XMQuaternionSlerpV(Q0, Q1, S); + XMVECTOR Q02 = XMQuaternionSlerpV(Q0, Q2, S); + XMVECTOR GS = XMVectorReciprocal(S); + GS = XMVectorMultiply(G, GS); + + Result = XMQuaternionSlerpV(Q01, Q02, GS); + } + + return Result; +} + +//------------------------------------------------------------------------------ +// Transformation operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionIdentity() noexcept +{ + return g_XMIdentityR3.v; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionRotationRollPitchYaw +( + float Pitch, + float Yaw, + float Roll +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + const float halfpitch = Pitch * 0.5f; + float cp = cosf(halfpitch); + float sp = sinf(halfpitch); 
+
+    const float halfyaw = Yaw * 0.5f;
+    float cy = cosf(halfyaw);
+    float sy = sinf(halfyaw);
+
+    const float halfroll = Roll * 0.5f;
+    float cr = cosf(halfroll);
+    float sr = sinf(halfroll);
+
+    XMVECTORF32 vResult = { { {
+            cr * sp * cy + sr * cp * sy,
+            cr * cp * sy - sr * sp * cy,
+            sr * cp * cy - cr * sp * sy,
+            cr * cp * cy + sr * sp * sy
+        } } };
+    return vResult;
+#else
+    XMVECTOR Angles = XMVectorSet(Pitch, Yaw, Roll, 0.0f);
+    return XMQuaternionRotationRollPitchYawFromVector(Angles);
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMQuaternionRotationRollPitchYawFromVector
+(
+    FXMVECTOR Angles // <Pitch, Yaw, Roll, 0>
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    const float halfpitch = Angles.vector4_f32[0] * 0.5f;
+    float cp = cosf(halfpitch);
+    float sp = sinf(halfpitch);
+
+    const float halfyaw = Angles.vector4_f32[1] * 0.5f;
+    float cy = cosf(halfyaw);
+    float sy = sinf(halfyaw);
+
+    const float halfroll = Angles.vector4_f32[2] * 0.5f;
+    float cr = cosf(halfroll);
+    float sr = sinf(halfroll);
+
+    XMVECTORF32 vResult = { { {
+            cr * sp * cy + sr * cp * sy,
+            cr * cp * sy - sr * sp * cy,
+            sr * cp * cy - cr * sp * sy,
+            cr * cp * cy + sr * sp * sy
+        } } };
+    return vResult;
+#else
+    static const XMVECTORF32 Sign = { { { 1.0f, -1.0f, -1.0f, 1.0f } } };
+
+    XMVECTOR HalfAngles = XMVectorMultiply(Angles, g_XMOneHalf.v);
+
+    XMVECTOR SinAngles, CosAngles;
+    XMVectorSinCos(&SinAngles, &CosAngles, HalfAngles);
+
+    XMVECTOR P0 = XMVectorPermute<XM_PERMUTE_0X, XM_PERMUTE_1X, XM_PERMUTE_1X, XM_PERMUTE_1X>(SinAngles, CosAngles);
+    XMVECTOR Y0 = XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0Y, XM_PERMUTE_1Y, XM_PERMUTE_1Y>(SinAngles, CosAngles);
+    XMVECTOR R0 = XMVectorPermute<XM_PERMUTE_1Z, XM_PERMUTE_1Z, XM_PERMUTE_0Z, XM_PERMUTE_1Z>(SinAngles, CosAngles);
+    XMVECTOR P1 = XMVectorPermute<XM_PERMUTE_0X, XM_PERMUTE_1X, XM_PERMUTE_1X, XM_PERMUTE_1X>(CosAngles, SinAngles);
+    XMVECTOR Y1 = XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0Y, XM_PERMUTE_1Y, XM_PERMUTE_1Y>(CosAngles, SinAngles);
+    XMVECTOR R1 = XMVectorPermute<XM_PERMUTE_1Z, XM_PERMUTE_1Z, XM_PERMUTE_0Z, XM_PERMUTE_1Z>(CosAngles, SinAngles);
+
+    XMVECTOR Q1 = XMVectorMultiply(P1, Sign.v);
+    XMVECTOR Q0 = XMVectorMultiply(P0, Y0);
+    Q1 = XMVectorMultiply(Q1, Y1);
+    Q0 = XMVectorMultiply(Q0, R0);
+    XMVECTOR Q = XMVectorMultiplyAdd(Q1, R1, Q0);
+
+    return Q;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMQuaternionRotationNormal
+(
+    FXMVECTOR NormalAxis,
+    float Angle
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
+
+    XMVECTOR N = XMVectorSelect(g_XMOne.v, NormalAxis, g_XMSelect1110.v);
+
+    float SinV, CosV;
+    XMScalarSinCos(&SinV, &CosV, 0.5f * Angle);
+
+    XMVECTOR Scale = XMVectorSet(SinV, SinV, SinV, CosV);
+    return XMVectorMultiply(N, Scale);
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR N = _mm_and_ps(NormalAxis, g_XMMask3);
+    N = _mm_or_ps(N, g_XMIdentityR3);
+    XMVECTOR Scale = _mm_set_ps1(0.5f * Angle);
+    XMVECTOR vSine;
+    XMVECTOR vCosine;
+    XMVectorSinCos(&vSine, &vCosine, Scale);
+    Scale = _mm_and_ps(vSine, g_XMMask3);
+    vCosine = _mm_and_ps(vCosine, g_XMMaskW);
+    Scale = _mm_or_ps(Scale, vCosine);
+    N = _mm_mul_ps(N, Scale);
+    return N;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMQuaternionRotationAxis
+(
+    FXMVECTOR Axis,
+    float Angle
+) noexcept
+{
+    assert(!XMVector3Equal(Axis, XMVectorZero()));
+    assert(!XMVector3IsInfinite(Axis));
+
+    XMVECTOR Normal = XMVector3Normalize(Axis);
+    XMVECTOR Q = XMQuaternionRotationNormal(Normal, Angle);
+    return Q;
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMQuaternionRotationMatrix(FXMMATRIX M) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_) + + XMVECTORF32 q; + float r22 = M.m[2][2]; + if (r22 <= 0.f) // x^2 + y^2 >= z^2 + w^2 + { + float dif10 = M.m[1][1] - M.m[0][0]; + float omr22 = 1.f - r22; + if (dif10 <= 0.f) // x^2 >= y^2 + { + float fourXSqr = omr22 - dif10; + float inv4x = 0.5f / sqrtf(fourXSqr); + q.f[0] = fourXSqr * inv4x; + q.f[1] = (M.m[0][1] + M.m[1][0]) * inv4x; + q.f[2] = (M.m[0][2] + M.m[2][0]) * inv4x; + q.f[3] = (M.m[1][2] - M.m[2][1]) * inv4x; + } + else // y^2 >= x^2 + { + float fourYSqr = omr22 + dif10; + float inv4y = 0.5f / sqrtf(fourYSqr); + q.f[0] = (M.m[0][1] + M.m[1][0]) * inv4y; + q.f[1] = fourYSqr * inv4y; + q.f[2] = (M.m[1][2] + M.m[2][1]) * inv4y; + q.f[3] = (M.m[2][0] - M.m[0][2]) * inv4y; + } + } + else // z^2 + w^2 >= x^2 + y^2 + { + float sum10 = M.m[1][1] + M.m[0][0]; + float opr22 = 1.f + r22; + if (sum10 <= 0.f) // z^2 >= w^2 + { + float fourZSqr = opr22 - sum10; + float inv4z = 0.5f / sqrtf(fourZSqr); + q.f[0] = (M.m[0][2] + M.m[2][0]) * inv4z; + q.f[1] = (M.m[1][2] + M.m[2][1]) * inv4z; + q.f[2] = fourZSqr * inv4z; + q.f[3] = (M.m[0][1] - M.m[1][0]) * inv4z; + } + else // w^2 >= z^2 + { + float fourWSqr = opr22 + sum10; + float inv4w = 0.5f / sqrtf(fourWSqr); + q.f[0] = (M.m[1][2] - M.m[2][1]) * inv4w; + q.f[1] = (M.m[2][0] - M.m[0][2]) * inv4w; + q.f[2] = (M.m[0][1] - M.m[1][0]) * inv4w; + q.f[3] = fourWSqr * inv4w; + } + } + return q.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 XMPMMP = { { { +1.0f, -1.0f, -1.0f, +1.0f } } }; + static const XMVECTORF32 XMMPMP = { { { -1.0f, +1.0f, -1.0f, +1.0f } } }; + static const XMVECTORF32 XMMMPP = { { { -1.0f, -1.0f, +1.0f, +1.0f } } }; + static const XMVECTORU32 Select0110 = { { { XM_SELECT_0, XM_SELECT_1, XM_SELECT_1, XM_SELECT_0 } } }; + static const XMVECTORU32 Select0010 = { { { XM_SELECT_0, XM_SELECT_0, XM_SELECT_1, XM_SELECT_0 } } }; + + float32x4_t r0 = M.r[0]; + float32x4_t r1 = M.r[1]; + float32x4_t r2 = M.r[2]; + + float32x4_t r00 = vdupq_lane_f32(vget_low_f32(r0), 0); + float32x4_t r11 = vdupq_lane_f32(vget_low_f32(r1), 1); + float32x4_t r22 = vdupq_lane_f32(vget_high_f32(r2), 0); + + // x^2 >= y^2 equivalent to r11 - r00 <= 0 + float32x4_t r11mr00 = vsubq_f32(r11, r00); + uint32x4_t x2gey2 = vcleq_f32(r11mr00, g_XMZero); + + // z^2 >= w^2 equivalent to r11 + r00 <= 0 + float32x4_t r11pr00 = vaddq_f32(r11, r00); + uint32x4_t z2gew2 = vcleq_f32(r11pr00, g_XMZero); + + // x^2 + y^2 >= z^2 + w^2 equivalent to r22 <= 0 + uint32x4_t x2py2gez2pw2 = vcleq_f32(r22, g_XMZero); + + // (4*x^2, 4*y^2, 4*z^2, 4*w^2) + float32x4_t t0 = vmulq_f32(XMPMMP, r00); + float32x4_t x2y2z2w2 = vmlaq_f32(t0, XMMPMP, r11); + x2y2z2w2 = vmlaq_f32(x2y2z2w2, XMMMPP, r22); + x2y2z2w2 = vaddq_f32(x2y2z2w2, g_XMOne); + + // (r01, r02, r12, r11) + t0 = vextq_f32(r0, r0, 1); + float32x4_t t1 = vextq_f32(r1, r1, 1); + t0 = vcombine_f32(vget_low_f32(t0), vrev64_f32(vget_low_f32(t1))); + + // (r10, r20, r21, r10) + t1 = vextq_f32(r2, r2, 3); + float32x4_t r10 = vdupq_lane_f32(vget_low_f32(r1), 0); + t1 = vbslq_f32(Select0110, t1, r10); + + // (4*x*y, 4*x*z, 4*y*z, unused) + float32x4_t xyxzyz = vaddq_f32(t0, t1); + + // (r21, r20, r10, r10) + t0 = vcombine_f32(vrev64_f32(vget_low_f32(r2)), vget_low_f32(r10)); + + // (r12, r02, r01, r12) + float32x4_t t2 = vcombine_f32(vrev64_f32(vget_high_f32(r0)), vrev64_f32(vget_low_f32(r0))); + float32x4_t t3 = vdupq_lane_f32(vget_high_f32(r1), 0); + t1 = vbslq_f32(Select0110, t2, t3); + + // (4*x*w, 4*y*w, 4*z*w, unused) + float32x4_t xwywzw = vsubq_f32(t0, t1); + xwywzw = 
vmulq_f32(XMMPMP, xwywzw); + + // (4*x*x, 4*x*y, 4*x*z, 4*x*w) + t0 = vextq_f32(xyxzyz, xyxzyz, 3); + t1 = vbslq_f32(Select0110, t0, x2y2z2w2); + t2 = vdupq_lane_f32(vget_low_f32(xwywzw), 0); + float32x4_t tensor0 = vbslq_f32(g_XMSelect1110, t1, t2); + + // (4*y*x, 4*y*y, 4*y*z, 4*y*w) + t0 = vbslq_f32(g_XMSelect1011, xyxzyz, x2y2z2w2); + t1 = vdupq_lane_f32(vget_low_f32(xwywzw), 1); + float32x4_t tensor1 = vbslq_f32(g_XMSelect1110, t0, t1); + + // (4*z*x, 4*z*y, 4*z*z, 4*z*w) + t0 = vextq_f32(xyxzyz, xyxzyz, 1); + t1 = vcombine_f32(vget_low_f32(t0), vrev64_f32(vget_high_f32(xwywzw))); + float32x4_t tensor2 = vbslq_f32(Select0010, x2y2z2w2, t1); + + // (4*w*x, 4*w*y, 4*w*z, 4*w*w) + float32x4_t tensor3 = vbslq_f32(g_XMSelect1110, xwywzw, x2y2z2w2); + + // Select the row of the tensor-product matrix that has the largest + // magnitude. + t0 = vbslq_f32(x2gey2, tensor0, tensor1); + t1 = vbslq_f32(z2gew2, tensor2, tensor3); + t2 = vbslq_f32(x2py2gez2pw2, t0, t1); + + // Normalize the row. No division by zero is possible because the + // quaternion is unit-length (and the row is a nonzero multiple of + // the quaternion). + t0 = XMVector4Length(t2); + return XMVectorDivide(t2, t0); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 XMPMMP = { { { +1.0f, -1.0f, -1.0f, +1.0f } } }; + static const XMVECTORF32 XMMPMP = { { { -1.0f, +1.0f, -1.0f, +1.0f } } }; + static const XMVECTORF32 XMMMPP = { { { -1.0f, -1.0f, +1.0f, +1.0f } } }; + + XMVECTOR r0 = M.r[0]; // (r00, r01, r02, 0) + XMVECTOR r1 = M.r[1]; // (r10, r11, r12, 0) + XMVECTOR r2 = M.r[2]; // (r20, r21, r22, 0) + + // (r00, r00, r00, r00) + XMVECTOR r00 = XM_PERMUTE_PS(r0, _MM_SHUFFLE(0, 0, 0, 0)); + // (r11, r11, r11, r11) + XMVECTOR r11 = XM_PERMUTE_PS(r1, _MM_SHUFFLE(1, 1, 1, 1)); + // (r22, r22, r22, r22) + XMVECTOR r22 = XM_PERMUTE_PS(r2, _MM_SHUFFLE(2, 2, 2, 2)); + + // x^2 >= y^2 equivalent to r11 - r00 <= 0 + // (r11 - r00, r11 - r00, r11 - r00, r11 - r00) + XMVECTOR r11mr00 = _mm_sub_ps(r11, r00); + XMVECTOR x2gey2 = _mm_cmple_ps(r11mr00, g_XMZero); + + // z^2 >= w^2 equivalent to r11 + r00 <= 0 + // (r11 + r00, r11 + r00, r11 + r00, r11 + r00) + XMVECTOR r11pr00 = _mm_add_ps(r11, r00); + XMVECTOR z2gew2 = _mm_cmple_ps(r11pr00, g_XMZero); + + // x^2 + y^2 >= z^2 + w^2 equivalent to r22 <= 0 + XMVECTOR x2py2gez2pw2 = _mm_cmple_ps(r22, g_XMZero); + + // (4*x^2, 4*y^2, 4*z^2, 4*w^2) + XMVECTOR t0 = XM_FMADD_PS(XMPMMP, r00, g_XMOne); + XMVECTOR t1 = _mm_mul_ps(XMMPMP, r11); + XMVECTOR t2 = XM_FMADD_PS(XMMMPP, r22, t0); + XMVECTOR x2y2z2w2 = _mm_add_ps(t1, t2); + + // (r01, r02, r12, r11) + t0 = _mm_shuffle_ps(r0, r1, _MM_SHUFFLE(1, 2, 2, 1)); + // (r10, r10, r20, r21) + t1 = _mm_shuffle_ps(r1, r2, _MM_SHUFFLE(1, 0, 0, 0)); + // (r10, r20, r21, r10) + t1 = XM_PERMUTE_PS(t1, _MM_SHUFFLE(1, 3, 2, 0)); + // (4*x*y, 4*x*z, 4*y*z, unused) + XMVECTOR xyxzyz = _mm_add_ps(t0, t1); + + // (r21, r20, r10, r10) + t0 = _mm_shuffle_ps(r2, r1, _MM_SHUFFLE(0, 0, 0, 1)); + // (r12, r12, r02, r01) + t1 = _mm_shuffle_ps(r1, r0, _MM_SHUFFLE(1, 2, 2, 2)); + // (r12, r02, r01, r12) + t1 = XM_PERMUTE_PS(t1, _MM_SHUFFLE(1, 3, 2, 0)); + // (4*x*w, 4*y*w, 4*z*w, unused) + XMVECTOR xwywzw = _mm_sub_ps(t0, t1); + xwywzw = _mm_mul_ps(XMMPMP, xwywzw); + + // (4*x^2, 4*y^2, 4*x*y, unused) + t0 = _mm_shuffle_ps(x2y2z2w2, xyxzyz, _MM_SHUFFLE(0, 0, 1, 0)); + // (4*z^2, 4*w^2, 4*z*w, unused) + t1 = _mm_shuffle_ps(x2y2z2w2, xwywzw, _MM_SHUFFLE(0, 2, 3, 2)); + // (4*x*z, 4*y*z, 4*x*w, 4*y*w) + t2 = _mm_shuffle_ps(xyxzyz, xwywzw, _MM_SHUFFLE(1, 0, 2, 1)); + + 
// (4*x*x, 4*x*y, 4*x*z, 4*x*w) + XMVECTOR tensor0 = _mm_shuffle_ps(t0, t2, _MM_SHUFFLE(2, 0, 2, 0)); + // (4*y*x, 4*y*y, 4*y*z, 4*y*w) + XMVECTOR tensor1 = _mm_shuffle_ps(t0, t2, _MM_SHUFFLE(3, 1, 1, 2)); + // (4*z*x, 4*z*y, 4*z*z, 4*z*w) + XMVECTOR tensor2 = _mm_shuffle_ps(t2, t1, _MM_SHUFFLE(2, 0, 1, 0)); + // (4*w*x, 4*w*y, 4*w*z, 4*w*w) + XMVECTOR tensor3 = _mm_shuffle_ps(t2, t1, _MM_SHUFFLE(1, 2, 3, 2)); + + // Select the row of the tensor-product matrix that has the largest + // magnitude. + t0 = _mm_and_ps(x2gey2, tensor0); + t1 = _mm_andnot_ps(x2gey2, tensor1); + t0 = _mm_or_ps(t0, t1); + t1 = _mm_and_ps(z2gew2, tensor2); + t2 = _mm_andnot_ps(z2gew2, tensor3); + t1 = _mm_or_ps(t1, t2); + t0 = _mm_and_ps(x2py2gez2pw2, t0); + t1 = _mm_andnot_ps(x2py2gez2pw2, t1); + t2 = _mm_or_ps(t0, t1); + + // Normalize the row. No division by zero is possible because the + // quaternion is unit-length (and the row is a nonzero multiple of + // the quaternion). + t0 = XMVector4Length(t2); + return _mm_div_ps(t2, t0); +#endif +} + +//------------------------------------------------------------------------------ +// Conversion operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMQuaternionToAxisAngle +( + XMVECTOR* pAxis, + float* pAngle, + FXMVECTOR Q +) noexcept +{ + assert(pAxis); + assert(pAngle); + + *pAxis = Q; + + *pAngle = 2.0f * XMScalarACos(XMVectorGetW(Q)); +} + +/**************************************************************************** + * + * Plane + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ + // Comparison operations + //------------------------------------------------------------------------------ + + //------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMPlaneEqual +( + FXMVECTOR P1, + FXMVECTOR P2 +) noexcept +{ + return XMVector4Equal(P1, P2); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMPlaneNearEqual +( + FXMVECTOR P1, + FXMVECTOR P2, + FXMVECTOR Epsilon +) noexcept +{ + XMVECTOR NP1 = XMPlaneNormalize(P1); + XMVECTOR NP2 = XMPlaneNormalize(P2); + return XMVector4NearEqual(NP1, NP2, Epsilon); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMPlaneNotEqual +( + FXMVECTOR P1, + FXMVECTOR P2 +) noexcept +{ + return XMVector4NotEqual(P1, P2); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMPlaneIsNaN(FXMVECTOR P) noexcept +{ + return XMVector4IsNaN(P); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMPlaneIsInfinite(FXMVECTOR P) noexcept +{ + return XMVector4IsInfinite(P); +} + +//------------------------------------------------------------------------------ +// Computation operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMPlaneDot +( + FXMVECTOR P, + FXMVECTOR V +) noexcept +{ + return XMVector4Dot(P, V); +} + +//------------------------------------------------------------------------------ + +inline 
XMVECTOR XM_CALLCONV XMPlaneDotCoord +( + FXMVECTOR P, + FXMVECTOR V +) noexcept +{ + // Result = P[0] * V[0] + P[1] * V[1] + P[2] * V[2] + P[3] + + XMVECTOR V3 = XMVectorSelect(g_XMOne.v, V, g_XMSelect1110.v); + XMVECTOR Result = XMVector4Dot(P, V3); + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMPlaneDotNormal +( + FXMVECTOR P, + FXMVECTOR V +) noexcept +{ + return XMVector3Dot(P, V); +} + +//------------------------------------------------------------------------------ +// XMPlaneNormalizeEst uses a reciprocal estimate and +// returns QNaN on zero and infinite vectors. + +inline XMVECTOR XM_CALLCONV XMPlaneNormalizeEst(FXMVECTOR P) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + + XMVECTOR Result = XMVector3ReciprocalLengthEst(P); + return XMVectorMultiply(P, Result); + +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps(P, P, 0x7f); + XMVECTOR vResult = _mm_rsqrt_ps(vTemp); + return _mm_mul_ps(vResult, P); +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product + XMVECTOR vDot = _mm_mul_ps(P, P); + // x=Dot.y, y=Dot.z + XMVECTOR vTemp = XM_PERMUTE_PS(vDot, _MM_SHUFFLE(2, 1, 2, 1)); + // Result.x = x+y + vDot = _mm_add_ss(vDot, vTemp); + // x=Dot.z + vTemp = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(1, 1, 1, 1)); + // Result.x = (x+y)+z + vDot = _mm_add_ss(vDot, vTemp); + // Splat x + vDot = XM_PERMUTE_PS(vDot, _MM_SHUFFLE(0, 0, 0, 0)); + // Get the reciprocal + vDot = _mm_rsqrt_ps(vDot); + // Get the reciprocal + vDot = _mm_mul_ps(vDot, P); + return vDot; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMPlaneNormalize(FXMVECTOR P) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + float fLengthSq = sqrtf((P.vector4_f32[0] * P.vector4_f32[0]) + (P.vector4_f32[1] * P.vector4_f32[1]) + (P.vector4_f32[2] * P.vector4_f32[2])); + // Prevent divide by zero + if (fLengthSq > 0) + { + fLengthSq = 1.0f / fLengthSq; + } + XMVECTORF32 vResult = { { { + P.vector4_f32[0] * fLengthSq, + P.vector4_f32[1] * fLengthSq, + P.vector4_f32[2] * fLengthSq, + P.vector4_f32[3] * fLengthSq + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR vLength = XMVector3ReciprocalLength(P); + return XMVectorMultiply(P, vLength); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vLengthSq = _mm_dp_ps(P, P, 0x7f); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity); + // Reciprocal mul to perform the normalization + vResult = _mm_div_ps(P, vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult, vLengthSq); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y and z only + XMVECTOR vLengthSq = _mm_mul_ps(P, P); + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(2, 1, 2, 1)); + vLengthSq = _mm_add_ss(vLengthSq, vTemp); + vTemp = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(1, 1, 1, 1)); + vLengthSq = _mm_add_ss(vLengthSq, vTemp); + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity); + // Reciprocal mul to perform 
the normalization + vResult = _mm_div_ps(P, vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult, vLengthSq); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMPlaneIntersectLine +( + FXMVECTOR P, + FXMVECTOR LinePoint1, + FXMVECTOR LinePoint2 +) noexcept +{ + XMVECTOR V1 = XMVector3Dot(P, LinePoint1); + XMVECTOR V2 = XMVector3Dot(P, LinePoint2); + XMVECTOR D = XMVectorSubtract(V1, V2); + + XMVECTOR VT = XMPlaneDotCoord(P, LinePoint1); + VT = XMVectorDivide(VT, D); + + XMVECTOR Point = XMVectorSubtract(LinePoint2, LinePoint1); + Point = XMVectorMultiplyAdd(Point, VT, LinePoint1); + + const XMVECTOR Zero = XMVectorZero(); + XMVECTOR Control = XMVectorNearEqual(D, Zero, g_XMEpsilon.v); + + return XMVectorSelect(Point, g_XMQNaN.v, Control); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMPlaneIntersectPlane +( + XMVECTOR* pLinePoint1, + XMVECTOR* pLinePoint2, + FXMVECTOR P1, + FXMVECTOR P2 +) noexcept +{ + assert(pLinePoint1); + assert(pLinePoint2); + + XMVECTOR V1 = XMVector3Cross(P2, P1); + + XMVECTOR LengthSq = XMVector3LengthSq(V1); + + XMVECTOR V2 = XMVector3Cross(P2, V1); + + XMVECTOR P1W = XMVectorSplatW(P1); + XMVECTOR Point = XMVectorMultiply(V2, P1W); + + XMVECTOR V3 = XMVector3Cross(V1, P1); + + XMVECTOR P2W = XMVectorSplatW(P2); + Point = XMVectorMultiplyAdd(V3, P2W, Point); + + XMVECTOR LinePoint1 = XMVectorDivide(Point, LengthSq); + + XMVECTOR LinePoint2 = XMVectorAdd(LinePoint1, V1); + + XMVECTOR Control = XMVectorLessOrEqual(LengthSq, g_XMEpsilon.v); + *pLinePoint1 = XMVectorSelect(LinePoint1, g_XMQNaN.v, Control); + *pLinePoint2 = XMVectorSelect(LinePoint2, g_XMQNaN.v, Control); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMPlaneTransform +( + FXMVECTOR P, + FXMMATRIX ITM +) noexcept +{ + XMVECTOR W = XMVectorSplatW(P); + XMVECTOR Z = XMVectorSplatZ(P); + XMVECTOR Y = XMVectorSplatY(P); + XMVECTOR X = XMVectorSplatX(P); + + XMVECTOR Result = XMVectorMultiply(W, ITM.r[3]); + Result = XMVectorMultiplyAdd(Z, ITM.r[2], Result); + Result = XMVectorMultiplyAdd(Y, ITM.r[1], Result); + Result = XMVectorMultiplyAdd(X, ITM.r[0], Result); + return Result; +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMFLOAT4* XM_CALLCONV XMPlaneTransformStream +( + XMFLOAT4* pOutputStream, + size_t OutputStride, + const XMFLOAT4* pInputStream, + size_t InputStride, + size_t PlaneCount, + FXMMATRIX ITM +) noexcept +{ + return XMVector4TransformStream(pOutputStream, + OutputStride, + pInputStream, + InputStride, + PlaneCount, + ITM); +} + +//------------------------------------------------------------------------------ +// Conversion operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMPlaneFromPointNormal +( + FXMVECTOR Point, + FXMVECTOR Normal +) noexcept +{ + XMVECTOR W = XMVector3Dot(Point, Normal); + W = XMVectorNegate(W); + return XMVectorSelect(W, Normal, g_XMSelect1110.v); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMPlaneFromPoints +( + FXMVECTOR Point1, + FXMVECTOR Point2, + FXMVECTOR Point3 +) noexcept +{ + 
XMVECTOR V21 = XMVectorSubtract(Point1, Point2); + XMVECTOR V31 = XMVectorSubtract(Point1, Point3); + + XMVECTOR N = XMVector3Cross(V21, V31); + N = XMVector3Normalize(N); + + XMVECTOR D = XMPlaneDotNormal(N, Point1); + D = XMVectorNegate(D); + + XMVECTOR Result = XMVectorSelect(D, N, g_XMSelect1110.v); + + return Result; +} + +/**************************************************************************** + * + * Color + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ + // Comparison operations + //------------------------------------------------------------------------------ + + //------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMColorEqual +( + FXMVECTOR C1, + FXMVECTOR C2 +) noexcept +{ + return XMVector4Equal(C1, C2); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMColorNotEqual +( + FXMVECTOR C1, + FXMVECTOR C2 +) noexcept +{ + return XMVector4NotEqual(C1, C2); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMColorGreater +( + FXMVECTOR C1, + FXMVECTOR C2 +) noexcept +{ + return XMVector4Greater(C1, C2); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMColorGreaterOrEqual +( + FXMVECTOR C1, + FXMVECTOR C2 +) noexcept +{ + return XMVector4GreaterOrEqual(C1, C2); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMColorLess +( + FXMVECTOR C1, + FXMVECTOR C2 +) noexcept +{ + return XMVector4Less(C1, C2); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMColorLessOrEqual +( + FXMVECTOR C1, + FXMVECTOR C2 +) noexcept +{ + return XMVector4LessOrEqual(C1, C2); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMColorIsNaN(FXMVECTOR C) noexcept +{ + return XMVector4IsNaN(C); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMColorIsInfinite(FXMVECTOR C) noexcept +{ + return XMVector4IsInfinite(C); +} + +//------------------------------------------------------------------------------ +// Computation operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorNegative(FXMVECTOR vColor) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { { { + 1.0f - vColor.vector4_f32[0], + 1.0f - vColor.vector4_f32[1], + 1.0f - vColor.vector4_f32[2], + vColor.vector4_f32[3] + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vTemp = veorq_u32(vreinterpretq_u32_f32(vColor), g_XMNegate3); + return vaddq_f32(vreinterpretq_f32_u32(vTemp), g_XMOne3); +#elif defined(_XM_SSE_INTRINSICS_) + // Negate only x,y and z. 
+ XMVECTOR vTemp = _mm_xor_ps(vColor, g_XMNegate3); + // Add 1,1,1,0 to -x,-y,-z,w + return _mm_add_ps(vTemp, g_XMOne3); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorModulate +( + FXMVECTOR C1, + FXMVECTOR C2 +) noexcept +{ + return XMVectorMultiply(C1, C2); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorAdjustSaturation +( + FXMVECTOR vColor, + float fSaturation +) noexcept +{ + // Luminance = 0.2125f * C[0] + 0.7154f * C[1] + 0.0721f * C[2]; + // Result = (C - Luminance) * Saturation + Luminance; + + const XMVECTORF32 gvLuminance = { { { 0.2125f, 0.7154f, 0.0721f, 0.0f } } }; +#if defined(_XM_NO_INTRINSICS_) + float fLuminance = (vColor.vector4_f32[0] * gvLuminance.f[0]) + (vColor.vector4_f32[1] * gvLuminance.f[1]) + (vColor.vector4_f32[2] * gvLuminance.f[2]); + XMVECTOR vResult; + vResult.vector4_f32[0] = ((vColor.vector4_f32[0] - fLuminance) * fSaturation) + fLuminance; + vResult.vector4_f32[1] = ((vColor.vector4_f32[1] - fLuminance) * fSaturation) + fLuminance; + vResult.vector4_f32[2] = ((vColor.vector4_f32[2] - fLuminance) * fSaturation) + fLuminance; + vResult.vector4_f32[3] = vColor.vector4_f32[3]; + return vResult; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR vLuminance = XMVector3Dot(vColor, gvLuminance); + XMVECTOR vResult = vsubq_f32(vColor, vLuminance); + vResult = vmlaq_n_f32(vLuminance, vResult, fSaturation); + return vbslq_f32(g_XMSelect1110, vResult, vColor); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vLuminance = XMVector3Dot(vColor, gvLuminance); + // Splat fSaturation + XMVECTOR vSaturation = _mm_set_ps1(fSaturation); + // vResult = ((vColor-vLuminance)*vSaturation)+vLuminance; + XMVECTOR vResult = _mm_sub_ps(vColor, vLuminance); + vResult = XM_FMADD_PS(vResult, vSaturation, vLuminance); + // Retain w from the source color + vLuminance = _mm_shuffle_ps(vResult, vColor, _MM_SHUFFLE(3, 2, 2, 2)); // x = vResult.z,y = vResult.z,z = vColor.z,w=vColor.w + vResult = _mm_shuffle_ps(vResult, vLuminance, _MM_SHUFFLE(3, 0, 1, 0)); // x = vResult.x,y = vResult.y,z = vResult.z,w=vColor.w + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorAdjustContrast +( + FXMVECTOR vColor, + float fContrast +) noexcept +{ + // Result = (vColor - 0.5f) * fContrast + 0.5f; + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { { { + ((vColor.vector4_f32[0] - 0.5f) * fContrast) + 0.5f, + ((vColor.vector4_f32[1] - 0.5f) * fContrast) + 0.5f, + ((vColor.vector4_f32[2] - 0.5f) * fContrast) + 0.5f, + vColor.vector4_f32[3] // Leave W untouched + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR vResult = vsubq_f32(vColor, g_XMOneHalf.v); + vResult = vmlaq_n_f32(g_XMOneHalf.v, vResult, fContrast); + return vbslq_f32(g_XMSelect1110, vResult, vColor); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vScale = _mm_set_ps1(fContrast); // Splat the scale + XMVECTOR vResult = _mm_sub_ps(vColor, g_XMOneHalf); // Subtract 0.5f from the source (Saving source) + vResult = XM_FMADD_PS(vResult, vScale, g_XMOneHalf); +// Retain w from the source color + vScale = _mm_shuffle_ps(vResult, vColor, _MM_SHUFFLE(3, 2, 2, 2)); // x = vResult.z,y = vResult.z,z = vColor.z,w=vColor.w + vResult = _mm_shuffle_ps(vResult, vScale, _MM_SHUFFLE(3, 0, 1, 0)); // x = vResult.x,y = vResult.y,z = vResult.z,w=vColor.w + return vResult; 
+#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorRGBToHSL(FXMVECTOR rgb) noexcept +{ + XMVECTOR r = XMVectorSplatX(rgb); + XMVECTOR g = XMVectorSplatY(rgb); + XMVECTOR b = XMVectorSplatZ(rgb); + + XMVECTOR min = XMVectorMin(r, XMVectorMin(g, b)); + XMVECTOR max = XMVectorMax(r, XMVectorMax(g, b)); + + XMVECTOR l = XMVectorMultiply(XMVectorAdd(min, max), g_XMOneHalf); + + XMVECTOR d = XMVectorSubtract(max, min); + + XMVECTOR la = XMVectorSelect(rgb, l, g_XMSelect1110); + + if (XMVector3Less(d, g_XMEpsilon)) + { + // Achromatic, assume H and S of 0 + return XMVectorSelect(la, g_XMZero, g_XMSelect1100); + } + else + { + XMVECTOR s, h; + + XMVECTOR d2 = XMVectorAdd(min, max); + + if (XMVector3Greater(l, g_XMOneHalf)) + { + // d / (2-max-min) + s = XMVectorDivide(d, XMVectorSubtract(g_XMTwo, d2)); + } + else + { + // d / (max+min) + s = XMVectorDivide(d, d2); + } + + if (XMVector3Equal(r, max)) + { + // Red is max + h = XMVectorDivide(XMVectorSubtract(g, b), d); + } + else if (XMVector3Equal(g, max)) + { + // Green is max + h = XMVectorDivide(XMVectorSubtract(b, r), d); + h = XMVectorAdd(h, g_XMTwo); + } + else + { + // Blue is max + h = XMVectorDivide(XMVectorSubtract(r, g), d); + h = XMVectorAdd(h, g_XMFour); + } + + h = XMVectorDivide(h, g_XMSix); + + if (XMVector3Less(h, g_XMZero)) + h = XMVectorAdd(h, g_XMOne); + + XMVECTOR lha = XMVectorSelect(la, h, g_XMSelect1100); + return XMVectorSelect(s, lha, g_XMSelect1011); + } +} + +//------------------------------------------------------------------------------ + +namespace Internal +{ + + inline XMVECTOR XM_CALLCONV XMColorHue2Clr(FXMVECTOR p, FXMVECTOR q, FXMVECTOR h) noexcept + { + static const XMVECTORF32 oneSixth = { { { 1.0f / 6.0f, 1.0f / 6.0f, 1.0f / 6.0f, 1.0f / 6.0f } } }; + static const XMVECTORF32 twoThirds = { { { 2.0f / 3.0f, 2.0f / 3.0f, 2.0f / 3.0f, 2.0f / 3.0f } } }; + + XMVECTOR t = h; + + if (XMVector3Less(t, g_XMZero)) + t = XMVectorAdd(t, g_XMOne); + + if (XMVector3Greater(t, g_XMOne)) + t = XMVectorSubtract(t, g_XMOne); + + if (XMVector3Less(t, oneSixth)) + { + // p + (q - p) * 6 * t + XMVECTOR t1 = XMVectorSubtract(q, p); + XMVECTOR t2 = XMVectorMultiply(g_XMSix, t); + return XMVectorMultiplyAdd(t1, t2, p); + } + + if (XMVector3Less(t, g_XMOneHalf)) + return q; + + if (XMVector3Less(t, twoThirds)) + { + // p + (q - p) * 6 * (2/3 - t) + XMVECTOR t1 = XMVectorSubtract(q, p); + XMVECTOR t2 = XMVectorMultiply(g_XMSix, XMVectorSubtract(twoThirds, t)); + return XMVectorMultiplyAdd(t1, t2, p); + } + + return p; + } + +} // namespace Internal + +inline XMVECTOR XM_CALLCONV XMColorHSLToRGB(FXMVECTOR hsl) noexcept +{ + static const XMVECTORF32 oneThird = { { { 1.0f / 3.0f, 1.0f / 3.0f, 1.0f / 3.0f, 1.0f / 3.0f } } }; + + XMVECTOR s = XMVectorSplatY(hsl); + XMVECTOR l = XMVectorSplatZ(hsl); + + if (XMVector3NearEqual(s, g_XMZero, g_XMEpsilon)) + { + // Achromatic + return XMVectorSelect(hsl, l, g_XMSelect1110); + } + else + { + XMVECTOR h = XMVectorSplatX(hsl); + + XMVECTOR q; + if (XMVector3Less(l, g_XMOneHalf)) + { + q = XMVectorMultiply(l, XMVectorAdd(g_XMOne, s)); + } + else + { + q = XMVectorSubtract(XMVectorAdd(l, s), XMVectorMultiply(l, s)); + } + + XMVECTOR p = XMVectorSubtract(XMVectorMultiply(g_XMTwo, l), q); + + XMVECTOR r = DirectX::Internal::XMColorHue2Clr(p, q, XMVectorAdd(h, oneThird)); + XMVECTOR g = DirectX::Internal::XMColorHue2Clr(p, q, h); + XMVECTOR b = DirectX::Internal::XMColorHue2Clr(p, q, XMVectorSubtract(h, oneThird)); + + 
XMVECTOR rg = XMVectorSelect(g, r, g_XMSelect1000); + XMVECTOR ba = XMVectorSelect(hsl, b, g_XMSelect1110); + + return XMVectorSelect(ba, rg, g_XMSelect1100); + } +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorRGBToHSV(FXMVECTOR rgb) noexcept +{ + XMVECTOR r = XMVectorSplatX(rgb); + XMVECTOR g = XMVectorSplatY(rgb); + XMVECTOR b = XMVectorSplatZ(rgb); + + XMVECTOR min = XMVectorMin(r, XMVectorMin(g, b)); + XMVECTOR v = XMVectorMax(r, XMVectorMax(g, b)); + + XMVECTOR d = XMVectorSubtract(v, min); + + XMVECTOR s = (XMVector3NearEqual(v, g_XMZero, g_XMEpsilon)) ? g_XMZero : XMVectorDivide(d, v); + + if (XMVector3Less(d, g_XMEpsilon)) + { + // Achromatic, assume H of 0 + XMVECTOR hv = XMVectorSelect(v, g_XMZero, g_XMSelect1000); + XMVECTOR hva = XMVectorSelect(rgb, hv, g_XMSelect1110); + return XMVectorSelect(s, hva, g_XMSelect1011); + } + else + { + XMVECTOR h; + + if (XMVector3Equal(r, v)) + { + // Red is max + h = XMVectorDivide(XMVectorSubtract(g, b), d); + + if (XMVector3Less(g, b)) + h = XMVectorAdd(h, g_XMSix); + } + else if (XMVector3Equal(g, v)) + { + // Green is max + h = XMVectorDivide(XMVectorSubtract(b, r), d); + h = XMVectorAdd(h, g_XMTwo); + } + else + { + // Blue is max + h = XMVectorDivide(XMVectorSubtract(r, g), d); + h = XMVectorAdd(h, g_XMFour); + } + + h = XMVectorDivide(h, g_XMSix); + + XMVECTOR hv = XMVectorSelect(v, h, g_XMSelect1000); + XMVECTOR hva = XMVectorSelect(rgb, hv, g_XMSelect1110); + return XMVectorSelect(s, hva, g_XMSelect1011); + } +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorHSVToRGB(FXMVECTOR hsv) noexcept +{ + XMVECTOR h = XMVectorSplatX(hsv); + XMVECTOR s = XMVectorSplatY(hsv); + XMVECTOR v = XMVectorSplatZ(hsv); + + XMVECTOR h6 = XMVectorMultiply(h, g_XMSix); + + XMVECTOR i = XMVectorFloor(h6); + XMVECTOR f = XMVectorSubtract(h6, i); + + // p = v* (1-s) + XMVECTOR p = XMVectorMultiply(v, XMVectorSubtract(g_XMOne, s)); + + // q = v*(1-f*s) + XMVECTOR q = XMVectorMultiply(v, XMVectorSubtract(g_XMOne, XMVectorMultiply(f, s))); + + // t = v*(1 - (1-f)*s) + XMVECTOR t = XMVectorMultiply(v, XMVectorSubtract(g_XMOne, XMVectorMultiply(XMVectorSubtract(g_XMOne, f), s))); + + auto ii = static_cast(XMVectorGetX(XMVectorMod(i, g_XMSix))); + + XMVECTOR _rgb; + + switch (ii) + { + case 0: // rgb = vtp + { + XMVECTOR vt = XMVectorSelect(t, v, g_XMSelect1000); + _rgb = XMVectorSelect(p, vt, g_XMSelect1100); + } + break; + case 1: // rgb = qvp + { + XMVECTOR qv = XMVectorSelect(v, q, g_XMSelect1000); + _rgb = XMVectorSelect(p, qv, g_XMSelect1100); + } + break; + case 2: // rgb = pvt + { + XMVECTOR pv = XMVectorSelect(v, p, g_XMSelect1000); + _rgb = XMVectorSelect(t, pv, g_XMSelect1100); + } + break; + case 3: // rgb = pqv + { + XMVECTOR pq = XMVectorSelect(q, p, g_XMSelect1000); + _rgb = XMVectorSelect(v, pq, g_XMSelect1100); + } + break; + case 4: // rgb = tpv + { + XMVECTOR tp = XMVectorSelect(p, t, g_XMSelect1000); + _rgb = XMVectorSelect(v, tp, g_XMSelect1100); + } + break; + default: // rgb = vpq + { + XMVECTOR vp = XMVectorSelect(p, v, g_XMSelect1000); + _rgb = XMVectorSelect(q, vp, g_XMSelect1100); + } + break; + } + + return XMVectorSelect(hsv, _rgb, g_XMSelect1110); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorRGBToYUV(FXMVECTOR rgb) noexcept +{ + static const XMVECTORF32 Scale0 = { { { 0.299f, -0.147f, 0.615f, 0.0f } } }; + 
static const XMVECTORF32 Scale1 = { { { 0.587f, -0.289f, -0.515f, 0.0f } } }; + static const XMVECTORF32 Scale2 = { { { 0.114f, 0.436f, -0.100f, 0.0f } } }; + + XMMATRIX M(Scale0, Scale1, Scale2, g_XMZero); + XMVECTOR clr = XMVector3Transform(rgb, M); + + return XMVectorSelect(rgb, clr, g_XMSelect1110); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorYUVToRGB(FXMVECTOR yuv) noexcept +{ + static const XMVECTORF32 Scale1 = { { { 0.0f, -0.395f, 2.032f, 0.0f } } }; + static const XMVECTORF32 Scale2 = { { { 1.140f, -0.581f, 0.0f, 0.0f } } }; + + XMMATRIX M(g_XMOne, Scale1, Scale2, g_XMZero); + XMVECTOR clr = XMVector3Transform(yuv, M); + + return XMVectorSelect(yuv, clr, g_XMSelect1110); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorRGBToYUV_HD(FXMVECTOR rgb) noexcept +{ + static const XMVECTORF32 Scale0 = { { { 0.2126f, -0.0997f, 0.6150f, 0.0f } } }; + static const XMVECTORF32 Scale1 = { { { 0.7152f, -0.3354f, -0.5586f, 0.0f } } }; + static const XMVECTORF32 Scale2 = { { { 0.0722f, 0.4351f, -0.0564f, 0.0f } } }; + + XMMATRIX M(Scale0, Scale1, Scale2, g_XMZero); + XMVECTOR clr = XMVector3Transform(rgb, M); + + return XMVectorSelect(rgb, clr, g_XMSelect1110); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorYUVToRGB_HD(FXMVECTOR yuv) noexcept +{ + static const XMVECTORF32 Scale1 = { { { 0.0f, -0.2153f, 2.1324f, 0.0f } } }; + static const XMVECTORF32 Scale2 = { { { 1.2803f, -0.3806f, 0.0f, 0.0f } } }; + + XMMATRIX M(g_XMOne, Scale1, Scale2, g_XMZero); + XMVECTOR clr = XMVector3Transform(yuv, M); + + return XMVectorSelect(yuv, clr, g_XMSelect1110); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorRGBToYUV_UHD(FXMVECTOR rgb) noexcept +{ + static const XMVECTORF32 Scale0 = { { { 0.2627f, -0.1215f, 0.6150f, 0.0f } } }; + static const XMVECTORF32 Scale1 = { { { 0.6780f, -0.3136f, -0.5655f, 0.0f } } }; + static const XMVECTORF32 Scale2 = { { { 0.0593f, 0.4351f, -0.0495f, 0.0f } } }; + + XMMATRIX M(Scale0, Scale1, Scale2, g_XMZero); + XMVECTOR clr = XMVector3Transform(rgb, M); + + return XMVectorSelect(rgb, clr, g_XMSelect1110); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorYUVToRGB_UHD(FXMVECTOR yuv) noexcept +{ + static const XMVECTORF32 Scale1 = { { { 0.0f, -0.1891f, 2.1620f, 0.0f } } }; + static const XMVECTORF32 Scale2 = { { { 1.1989f, -0.4645f, 0.0f, 0.0f } } }; + + XMMATRIX M(g_XMOne, Scale1, Scale2, g_XMZero); + XMVECTOR clr = XMVector3Transform(yuv, M); + + return XMVectorSelect(yuv, clr, g_XMSelect1110); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorRGBToXYZ(FXMVECTOR rgb) noexcept +{ + static const XMVECTORF32 Scale0 = { { { 0.4887180f, 0.1762044f, 0.0000000f, 0.0f } } }; + static const XMVECTORF32 Scale1 = { { { 0.3106803f, 0.8129847f, 0.0102048f, 0.0f } } }; + static const XMVECTORF32 Scale2 = { { { 0.2006017f, 0.0108109f, 0.9897952f, 0.0f } } }; + static const XMVECTORF32 Scale = { { { 1.f / 0.17697f, 1.f / 0.17697f, 1.f / 0.17697f, 0.0f } } }; + + XMMATRIX M(Scale0, Scale1, Scale2, g_XMZero); + XMVECTOR clr = XMVectorMultiply(XMVector3Transform(rgb, M), Scale); + + return XMVectorSelect(rgb, clr, g_XMSelect1110); +} + +inline 
XMVECTOR XM_CALLCONV XMColorXYZToRGB(FXMVECTOR xyz) noexcept +{ + static const XMVECTORF32 Scale0 = { { { 2.3706743f, -0.5138850f, 0.0052982f, 0.0f } } }; + static const XMVECTORF32 Scale1 = { { { -0.9000405f, 1.4253036f, -0.0146949f, 0.0f } } }; + static const XMVECTORF32 Scale2 = { { { -0.4706338f, 0.0885814f, 1.0093968f, 0.0f } } }; + static const XMVECTORF32 Scale = { { { 0.17697f, 0.17697f, 0.17697f, 0.0f } } }; + + XMMATRIX M(Scale0, Scale1, Scale2, g_XMZero); + XMVECTOR clr = XMVector3Transform(XMVectorMultiply(xyz, Scale), M); + + return XMVectorSelect(xyz, clr, g_XMSelect1110); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorXYZToSRGB(FXMVECTOR xyz) noexcept +{ + static const XMVECTORF32 Scale0 = { { { 3.2406f, -0.9689f, 0.0557f, 0.0f } } }; + static const XMVECTORF32 Scale1 = { { { -1.5372f, 1.8758f, -0.2040f, 0.0f } } }; + static const XMVECTORF32 Scale2 = { { { -0.4986f, 0.0415f, 1.0570f, 0.0f } } }; + static const XMVECTORF32 Cutoff = { { { 0.0031308f, 0.0031308f, 0.0031308f, 0.0f } } }; + static const XMVECTORF32 Exp = { { { 1.0f / 2.4f, 1.0f / 2.4f, 1.0f / 2.4f, 1.0f } } }; + + XMMATRIX M(Scale0, Scale1, Scale2, g_XMZero); + XMVECTOR lclr = XMVector3Transform(xyz, M); + + XMVECTOR sel = XMVectorGreater(lclr, Cutoff); + + // clr = 12.92 * lclr for lclr <= 0.0031308f + XMVECTOR smallC = XMVectorMultiply(lclr, g_XMsrgbScale); + + // clr = (1+a)*pow(lclr, 1/2.4) - a for lclr > 0.0031308 (where a = 0.055) + XMVECTOR largeC = XMVectorSubtract(XMVectorMultiply(g_XMsrgbA1, XMVectorPow(lclr, Exp)), g_XMsrgbA); + + XMVECTOR clr = XMVectorSelect(smallC, largeC, sel); + + return XMVectorSelect(xyz, clr, g_XMSelect1110); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorSRGBToXYZ(FXMVECTOR srgb) noexcept +{ + static const XMVECTORF32 Scale0 = { { { 0.4124f, 0.2126f, 0.0193f, 0.0f } } }; + static const XMVECTORF32 Scale1 = { { { 0.3576f, 0.7152f, 0.1192f, 0.0f } } }; + static const XMVECTORF32 Scale2 = { { { 0.1805f, 0.0722f, 0.9505f, 0.0f } } }; + static const XMVECTORF32 Cutoff = { { { 0.04045f, 0.04045f, 0.04045f, 0.0f } } }; + static const XMVECTORF32 Exp = { { { 2.4f, 2.4f, 2.4f, 1.0f } } }; + + XMVECTOR sel = XMVectorGreater(srgb, Cutoff); + + // lclr = clr / 12.92 + XMVECTOR smallC = XMVectorDivide(srgb, g_XMsrgbScale); + + // lclr = pow( (clr + a) / (1+a), 2.4 ) + XMVECTOR largeC = XMVectorPow(XMVectorDivide(XMVectorAdd(srgb, g_XMsrgbA), g_XMsrgbA1), Exp); + + XMVECTOR lclr = XMVectorSelect(smallC, largeC, sel); + + XMMATRIX M(Scale0, Scale1, Scale2, g_XMZero); + XMVECTOR clr = XMVector3Transform(lclr, M); + + return XMVectorSelect(srgb, clr, g_XMSelect1110); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorRGBToSRGB(FXMVECTOR rgb) noexcept +{ + static const XMVECTORF32 Cutoff = { { { 0.0031308f, 0.0031308f, 0.0031308f, 1.f } } }; + static const XMVECTORF32 Linear = { { { 12.92f, 12.92f, 12.92f, 1.f } } }; + static const XMVECTORF32 Scale = { { { 1.055f, 1.055f, 1.055f, 1.f } } }; + static const XMVECTORF32 Bias = { { { 0.055f, 0.055f, 0.055f, 0.f } } }; + static const XMVECTORF32 InvGamma = { { { 1.0f / 2.4f, 1.0f / 2.4f, 1.0f / 2.4f, 1.f } } }; + + XMVECTOR V = XMVectorSaturate(rgb); + XMVECTOR V0 = XMVectorMultiply(V, Linear); + XMVECTOR V1 = XMVectorSubtract(XMVectorMultiply(Scale, XMVectorPow(V, InvGamma)), Bias); + XMVECTOR select = XMVectorLess(V, 
Cutoff); + V = XMVectorSelect(V1, V0, select); + return XMVectorSelect(rgb, V, g_XMSelect1110); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorSRGBToRGB(FXMVECTOR srgb) noexcept +{ + static const XMVECTORF32 Cutoff = { { { 0.04045f, 0.04045f, 0.04045f, 1.f } } }; + static const XMVECTORF32 ILinear = { { { 1.f / 12.92f, 1.f / 12.92f, 1.f / 12.92f, 1.f } } }; + static const XMVECTORF32 Scale = { { { 1.f / 1.055f, 1.f / 1.055f, 1.f / 1.055f, 1.f } } }; + static const XMVECTORF32 Bias = { { { 0.055f, 0.055f, 0.055f, 0.f } } }; + static const XMVECTORF32 Gamma = { { { 2.4f, 2.4f, 2.4f, 1.f } } }; + + XMVECTOR V = XMVectorSaturate(srgb); + XMVECTOR V0 = XMVectorMultiply(V, ILinear); + XMVECTOR V1 = XMVectorPow(XMVectorMultiply(XMVectorAdd(V, Bias), Scale), Gamma); + XMVECTOR select = XMVectorGreater(V, Cutoff); + V = XMVectorSelect(V0, V1, select); + return XMVectorSelect(srgb, V, g_XMSelect1110); +} + +/**************************************************************************** + * + * Miscellaneous + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ + +inline bool XMVerifyCPUSupport() noexcept +{ +#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + int CPUInfo[4] = { -1 }; +#if defined(__clang__) || defined(__GNUC__) + __cpuid(0, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]); +#else + __cpuid(CPUInfo, 0); +#endif + +#ifdef __AVX2__ + if (CPUInfo[0] < 7) + return false; +#else + if (CPUInfo[0] < 1) + return false; +#endif + +#if defined(__clang__) || defined(__GNUC__) + __cpuid(1, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]); +#else + __cpuid(CPUInfo, 1); +#endif + +#if defined(__AVX2__) || defined(_XM_AVX2_INTRINSICS_) + // The compiler can emit FMA3 instructions even without explicit intrinsics use + if ((CPUInfo[2] & 0x38081001) != 0x38081001) + return false; // No F16C/AVX/OSXSAVE/SSE4.1/FMA3/SSE3 support +#elif defined(_XM_FMA3_INTRINSICS_) && defined(_XM_F16C_INTRINSICS_) + if ((CPUInfo[2] & 0x38081001) != 0x38081001) + return false; // No F16C/AVX/OSXSAVE/SSE4.1/FMA3/SSE3 support +#elif defined(_XM_FMA3_INTRINSICS_) + if ((CPUInfo[2] & 0x18081001) != 0x18081001) + return false; // No AVX/OSXSAVE/SSE4.1/FMA3/SSE3 support +#elif defined(_XM_F16C_INTRINSICS_) + if ((CPUInfo[2] & 0x38080001) != 0x38080001) + return false; // No F16C/AVX/OSXSAVE/SSE4.1/SSE3 support +#elif defined(__AVX__) || defined(_XM_AVX_INTRINSICS_) + if ((CPUInfo[2] & 0x18080001) != 0x18080001) + return false; // No AVX/OSXSAVE/SSE4.1/SSE3 support +#elif defined(_XM_SSE4_INTRINSICS_) + if ((CPUInfo[2] & 0x80001) != 0x80001) + return false; // No SSE3/SSE4.1 support +#elif defined(_XM_SSE3_INTRINSICS_) + if (!(CPUInfo[2] & 0x1)) + return false; // No SSE3 support +#endif + + // The x64 processor model requires SSE2 support, but no harm in checking + if ((CPUInfo[3] & 0x6000000) != 0x6000000) + return false; // No SSE2/SSE support + +#if defined(__AVX2__) || defined(_XM_AVX2_INTRINSICS_) +#if defined(__clang__) || defined(__GNUC__) + __cpuid_count(7, 0, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]); +#else + __cpuidex(CPUInfo, 7, 0); +#endif + if (!(CPUInfo[1] & 0x20)) + return false; // No AVX2 support +#endif + + return true; +#elif defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + // ARM-NEON support is required for the Windows on ARM platform + return true; +#else + // No intrinsics path 
always supported + return true; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMFresnelTerm +( + FXMVECTOR CosIncidentAngle, + FXMVECTOR RefractionIndex +) noexcept +{ + assert(!XMVector4IsInfinite(CosIncidentAngle)); + + // Result = 0.5f * (g - c)^2 / (g + c)^2 * ((c * (g + c) - 1)^2 / (c * (g - c) + 1)^2 + 1) where + // c = CosIncidentAngle + // g = sqrt(c^2 + RefractionIndex^2 - 1) + +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + + XMVECTOR G = XMVectorMultiplyAdd(RefractionIndex, RefractionIndex, g_XMNegativeOne.v); + G = XMVectorMultiplyAdd(CosIncidentAngle, CosIncidentAngle, G); + G = XMVectorAbs(G); + G = XMVectorSqrt(G); + + XMVECTOR S = XMVectorAdd(G, CosIncidentAngle); + XMVECTOR D = XMVectorSubtract(G, CosIncidentAngle); + + XMVECTOR V0 = XMVectorMultiply(D, D); + XMVECTOR V1 = XMVectorMultiply(S, S); + V1 = XMVectorReciprocal(V1); + V0 = XMVectorMultiply(g_XMOneHalf.v, V0); + V0 = XMVectorMultiply(V0, V1); + + XMVECTOR V2 = XMVectorMultiplyAdd(CosIncidentAngle, S, g_XMNegativeOne.v); + XMVECTOR V3 = XMVectorMultiplyAdd(CosIncidentAngle, D, g_XMOne.v); + V2 = XMVectorMultiply(V2, V2); + V3 = XMVectorMultiply(V3, V3); + V3 = XMVectorReciprocal(V3); + V2 = XMVectorMultiplyAdd(V2, V3, g_XMOne.v); + + XMVECTOR Result = XMVectorMultiply(V0, V2); + + Result = XMVectorSaturate(Result); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + // G = sqrt(abs((RefractionIndex^2-1) + CosIncidentAngle^2)) + XMVECTOR G = _mm_mul_ps(RefractionIndex, RefractionIndex); + XMVECTOR vTemp = _mm_mul_ps(CosIncidentAngle, CosIncidentAngle); + G = _mm_sub_ps(G, g_XMOne); + vTemp = _mm_add_ps(vTemp, G); + // max((0-vTemp),vTemp) == abs(vTemp) + // The abs is needed to deal with refraction and cosine being zero + G = _mm_setzero_ps(); + G = _mm_sub_ps(G, vTemp); + G = _mm_max_ps(G, vTemp); + // Last operation, the sqrt() + G = _mm_sqrt_ps(G); + + // Calc G-C and G+C + XMVECTOR GAddC = _mm_add_ps(G, CosIncidentAngle); + XMVECTOR GSubC = _mm_sub_ps(G, CosIncidentAngle); + // Perform the term (0.5f *(g - c)^2) / (g + c)^2 + XMVECTOR vResult = _mm_mul_ps(GSubC, GSubC); + vTemp = _mm_mul_ps(GAddC, GAddC); + vResult = _mm_mul_ps(vResult, g_XMOneHalf); + vResult = _mm_div_ps(vResult, vTemp); + // Perform the term ((c * (g + c) - 1)^2 / (c * (g - c) + 1)^2 + 1) + GAddC = _mm_mul_ps(GAddC, CosIncidentAngle); + GSubC = _mm_mul_ps(GSubC, CosIncidentAngle); + GAddC = _mm_sub_ps(GAddC, g_XMOne); + GSubC = _mm_add_ps(GSubC, g_XMOne); + GAddC = _mm_mul_ps(GAddC, GAddC); + GSubC = _mm_mul_ps(GSubC, GSubC); + GAddC = _mm_div_ps(GAddC, GSubC); + GAddC = _mm_add_ps(GAddC, g_XMOne); + // Multiply the two term parts + vResult = _mm_mul_ps(vResult, GAddC); + // Clamp to 0.0 - 1.0f + vResult = _mm_max_ps(vResult, g_XMZero); + vResult = _mm_min_ps(vResult, g_XMOne); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XMScalarNearEqual +( + float S1, + float S2, + float Epsilon +) noexcept +{ + float Delta = S1 - S2; + return (fabsf(Delta) <= Epsilon); +} + +//------------------------------------------------------------------------------ +// Modulo the range of the given angle such that -XM_PI <= Angle < XM_PI +inline float XMScalarModAngle(float Angle) noexcept +{ + // Note: The modulo is performed with unsigned math only to work + // around a precision error on numbers that are close to PI + + // Normalize the range from 0.0f to XM_2PI + Angle = 
Angle + XM_PI;
+    // Perform the modulo, unsigned
+    float fTemp = fabsf(Angle);
+    fTemp = fTemp - (XM_2PI * static_cast<float>(static_cast<uint32_t>(fTemp / XM_2PI)));
+    // Restore the number to the range of -XM_PI to XM_PI-epsilon
+    fTemp = fTemp - XM_PI;
+    // If the modulo'd value was negative, restore negation
+    if (Angle < 0.0f)
+    {
+        fTemp = -fTemp;
+    }
+    return fTemp;
+}
+
+//------------------------------------------------------------------------------
+
+inline float XMScalarSin(float Value) noexcept
+{
+    // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder.
+    float quotient = XM_1DIV2PI * Value;
+    if (Value >= 0.0f)
+    {
+        quotient = static_cast<float>(static_cast<int>(quotient + 0.5f));
+    }
+    else
+    {
+        quotient = static_cast<float>(static_cast<int>(quotient - 0.5f));
+    }
+    float y = Value - XM_2PI * quotient;
+
+    // Map y to [-pi/2,pi/2] with sin(y) = sin(Value).
+    if (y > XM_PIDIV2)
+    {
+        y = XM_PI - y;
+    }
+    else if (y < -XM_PIDIV2)
+    {
+        y = -XM_PI - y;
+    }
+
+    // 11-degree minimax approximation
+    float y2 = y * y;
+    return (((((-2.3889859e-08f * y2 + 2.7525562e-06f) * y2 - 0.00019840874f) * y2 + 0.0083333310f) * y2 - 0.16666667f) * y2 + 1.0f) * y;
+}
+
+//------------------------------------------------------------------------------
+
+inline float XMScalarSinEst(float Value) noexcept
+{
+    // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder.
+    float quotient = XM_1DIV2PI * Value;
+    if (Value >= 0.0f)
+    {
+        quotient = static_cast<float>(static_cast<int>(quotient + 0.5f));
+    }
+    else
+    {
+        quotient = static_cast<float>(static_cast<int>(quotient - 0.5f));
+    }
+    float y = Value - XM_2PI * quotient;
+
+    // Map y to [-pi/2,pi/2] with sin(y) = sin(Value).
+    if (y > XM_PIDIV2)
+    {
+        y = XM_PI - y;
+    }
+    else if (y < -XM_PIDIV2)
+    {
+        y = -XM_PI - y;
+    }
+
+    // 7-degree minimax approximation
+    float y2 = y * y;
+    return (((-0.00018524670f * y2 + 0.0083139502f) * y2 - 0.16665852f) * y2 + 1.0f) * y;
+}
+
+//------------------------------------------------------------------------------
+
+inline float XMScalarCos(float Value) noexcept
+{
+    // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder.
+    float quotient = XM_1DIV2PI * Value;
+    if (Value >= 0.0f)
+    {
+        quotient = static_cast<float>(static_cast<int>(quotient + 0.5f));
+    }
+    else
+    {
+        quotient = static_cast<float>(static_cast<int>(quotient - 0.5f));
+    }
+    float y = Value - XM_2PI * quotient;
+
+    // Map y to [-pi/2,pi/2] with cos(y) = sign*cos(x).
+    float sign;
+    if (y > XM_PIDIV2)
+    {
+        y = XM_PI - y;
+        sign = -1.0f;
+    }
+    else if (y < -XM_PIDIV2)
+    {
+        y = -XM_PI - y;
+        sign = -1.0f;
+    }
+    else
+    {
+        sign = +1.0f;
+    }
+
+    // 10-degree minimax approximation
+    float y2 = y * y;
+    float p = ((((-2.6051615e-07f * y2 + 2.4760495e-05f) * y2 - 0.0013888378f) * y2 + 0.041666638f) * y2 - 0.5f) * y2 + 1.0f;
+    return sign * p;
+}
+
+//------------------------------------------------------------------------------
+
+inline float XMScalarCosEst(float Value) noexcept
+{
+    // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder.
+    float quotient = XM_1DIV2PI * Value;
+    if (Value >= 0.0f)
+    {
+        quotient = static_cast<float>(static_cast<int>(quotient + 0.5f));
+    }
+    else
+    {
+        quotient = static_cast<float>(static_cast<int>(quotient - 0.5f));
+    }
+    float y = Value - XM_2PI * quotient;
+
+    // Map y to [-pi/2,pi/2] with cos(y) = sign*cos(x).
+ float sign; + if (y > XM_PIDIV2) + { + y = XM_PI - y; + sign = -1.0f; + } + else if (y < -XM_PIDIV2) + { + y = -XM_PI - y; + sign = -1.0f; + } + else + { + sign = +1.0f; + } + + // 6-degree minimax approximation + float y2 = y * y; + float p = ((-0.0012712436f * y2 + 0.041493919f) * y2 - 0.49992746f) * y2 + 1.0f; + return sign * p; +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline void XMScalarSinCos +( + float* pSin, + float* pCos, + float Value +) noexcept +{ + assert(pSin); + assert(pCos); + + // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder. + float quotient = XM_1DIV2PI * Value; + if (Value >= 0.0f) + { + quotient = static_cast(static_cast(quotient + 0.5f)); + } + else + { + quotient = static_cast(static_cast(quotient - 0.5f)); + } + float y = Value - XM_2PI * quotient; + + // Map y to [-pi/2,pi/2] with sin(y) = sin(Value). + float sign; + if (y > XM_PIDIV2) + { + y = XM_PI - y; + sign = -1.0f; + } + else if (y < -XM_PIDIV2) + { + y = -XM_PI - y; + sign = -1.0f; + } + else + { + sign = +1.0f; + } + + float y2 = y * y; + + // 11-degree minimax approximation + *pSin = (((((-2.3889859e-08f * y2 + 2.7525562e-06f) * y2 - 0.00019840874f) * y2 + 0.0083333310f) * y2 - 0.16666667f) * y2 + 1.0f) * y; + + // 10-degree minimax approximation + float p = ((((-2.6051615e-07f * y2 + 2.4760495e-05f) * y2 - 0.0013888378f) * y2 + 0.041666638f) * y2 - 0.5f) * y2 + 1.0f; + *pCos = sign * p; +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline void XMScalarSinCosEst +( + float* pSin, + float* pCos, + float Value +) noexcept +{ + assert(pSin); + assert(pCos); + + // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder. + float quotient = XM_1DIV2PI * Value; + if (Value >= 0.0f) + { + quotient = static_cast(static_cast(quotient + 0.5f)); + } + else + { + quotient = static_cast(static_cast(quotient - 0.5f)); + } + float y = Value - XM_2PI * quotient; + + // Map y to [-pi/2,pi/2] with sin(y) = sin(Value). + float sign; + if (y > XM_PIDIV2) + { + y = XM_PI - y; + sign = -1.0f; + } + else if (y < -XM_PIDIV2) + { + y = -XM_PI - y; + sign = -1.0f; + } + else + { + sign = +1.0f; + } + + float y2 = y * y; + + // 7-degree minimax approximation + *pSin = (((-0.00018524670f * y2 + 0.0083139502f) * y2 - 0.16665852f) * y2 + 1.0f) * y; + + // 6-degree minimax approximation + float p = ((-0.0012712436f * y2 + 0.041493919f) * y2 - 0.49992746f) * y2 + 1.0f; + *pCos = sign * p; +} + +//------------------------------------------------------------------------------ + +inline float XMScalarASin(float Value) noexcept +{ + // Clamp input to [-1,1]. + bool nonnegative = (Value >= 0.0f); + float x = fabsf(Value); + float omx = 1.0f - x; + if (omx < 0.0f) + { + omx = 0.0f; + } + float root = sqrtf(omx); + + // 7-degree minimax approximation + float result = ((((((-0.0012624911f * x + 0.0066700901f) * x - 0.0170881256f) * x + 0.0308918810f) * x - 0.0501743046f) * x + 0.0889789874f) * x - 0.2145988016f) * x + 1.5707963050f; + result *= root; // acos(|x|) + + // acos(x) = pi - acos(-x) when x < 0, asin(x) = pi/2 - acos(x) + return (nonnegative ? XM_PIDIV2 - result : result - XM_PIDIV2); +} + +//------------------------------------------------------------------------------ + +inline float XMScalarASinEst(float Value) noexcept +{ + // Clamp input to [-1,1]. 
+ bool nonnegative = (Value >= 0.0f); + float x = fabsf(Value); + float omx = 1.0f - x; + if (omx < 0.0f) + { + omx = 0.0f; + } + float root = sqrtf(omx); + + // 3-degree minimax approximation + float result = ((-0.0187293f * x + 0.0742610f) * x - 0.2121144f) * x + 1.5707288f; + result *= root; // acos(|x|) + + // acos(x) = pi - acos(-x) when x < 0, asin(x) = pi/2 - acos(x) + return (nonnegative ? XM_PIDIV2 - result : result - XM_PIDIV2); +} + +//------------------------------------------------------------------------------ + +inline float XMScalarACos(float Value) noexcept +{ + // Clamp input to [-1,1]. + bool nonnegative = (Value >= 0.0f); + float x = fabsf(Value); + float omx = 1.0f - x; + if (omx < 0.0f) + { + omx = 0.0f; + } + float root = sqrtf(omx); + + // 7-degree minimax approximation + float result = ((((((-0.0012624911f * x + 0.0066700901f) * x - 0.0170881256f) * x + 0.0308918810f) * x - 0.0501743046f) * x + 0.0889789874f) * x - 0.2145988016f) * x + 1.5707963050f; + result *= root; + + // acos(x) = pi - acos(-x) when x < 0 + return (nonnegative ? result : XM_PI - result); +} + +//------------------------------------------------------------------------------ + +inline float XMScalarACosEst(float Value) noexcept +{ + // Clamp input to [-1,1]. + bool nonnegative = (Value >= 0.0f); + float x = fabsf(Value); + float omx = 1.0f - x; + if (omx < 0.0f) + { + omx = 0.0f; + } + float root = sqrtf(omx); + + // 3-degree minimax approximation + float result = ((-0.0187293f * x + 0.0742610f) * x - 0.2121144f) * x + 1.5707288f; + result *= root; + + // acos(x) = pi - acos(-x) when x < 0 + return (nonnegative ? result : XM_PI - result); +} + diff --git a/src/thirdparty/DirectXMath-dec2022/Inc/DirectXMathVector.inl b/src/thirdparty/DirectXMath-dec2022/Inc/DirectXMathVector.inl new file mode 100644 index 000000000..e3db56a6c --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/Inc/DirectXMathVector.inl @@ -0,0 +1,14819 @@ +//------------------------------------------------------------------------------------- +// DirectXMathVector.inl -- SIMD C++ Math library +// +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
+// +// http://go.microsoft.com/fwlink/?LinkID=615560 +//------------------------------------------------------------------------------------- + +#pragma once + +#if defined(_XM_NO_INTRINSICS_) +#define XMISNAN(x) isnan(x) +#define XMISINF(x) isinf(x) +#endif + +#if defined(_XM_SSE_INTRINSICS_) + +#define XM3UNPACK3INTO4(l1, l2, l3) \ + XMVECTOR V3 = _mm_shuffle_ps(l2, l3, _MM_SHUFFLE(0, 0, 3, 2));\ + XMVECTOR V2 = _mm_shuffle_ps(l2, l1, _MM_SHUFFLE(3, 3, 1, 0));\ + V2 = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 0, 2));\ + XMVECTOR V4 = _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(L3), 32 / 8)) + +#define XM3PACK4INTO3(v2x) \ + v2x = _mm_shuffle_ps(V2, V3, _MM_SHUFFLE(1, 0, 2, 1));\ + V2 = _mm_shuffle_ps(V2, V1, _MM_SHUFFLE(2, 2, 0, 0));\ + V1 = _mm_shuffle_ps(V1, V2, _MM_SHUFFLE(0, 2, 1, 0));\ + V3 = _mm_shuffle_ps(V3, V4, _MM_SHUFFLE(0, 0, 2, 2));\ + V3 = _mm_shuffle_ps(V3, V4, _MM_SHUFFLE(2, 1, 2, 0)) + +#endif + +/**************************************************************************** + * + * General Vector + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ + // Assignment operations + //------------------------------------------------------------------------------ + + //------------------------------------------------------------------------------ + // Return a vector with all elements equaling zero +inline XMVECTOR XM_CALLCONV XMVectorZero() noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { { { 0.0f, 0.0f, 0.0f, 0.0f } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vdupq_n_f32(0); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_setzero_ps(); +#endif +} + +//------------------------------------------------------------------------------ +// Initialize a vector with four floating point values +inline XMVECTOR XM_CALLCONV XMVectorSet +( + float x, + float y, + float z, + float w +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { { { x, y, z, w } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t V0 = vcreate_f32( + static_cast(*reinterpret_cast(&x)) + | (static_cast(*reinterpret_cast(&y)) << 32)); + float32x2_t V1 = vcreate_f32( + static_cast(*reinterpret_cast(&z)) + | (static_cast(*reinterpret_cast(&w)) << 32)); + return vcombine_f32(V0, V1); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_set_ps(w, z, y, x); +#endif +} + +//------------------------------------------------------------------------------ +// Initialize a vector with four integer values +inline XMVECTOR XM_CALLCONV XMVectorSetInt +( + uint32_t x, + uint32_t y, + uint32_t z, + uint32_t w +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORU32 vResult = { { { x, y, z, w } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t V0 = vcreate_u32(static_cast(x) | (static_cast(y) << 32)); + uint32x2_t V1 = vcreate_u32(static_cast(z) | (static_cast(w) << 32)); + return vreinterpretq_f32_u32(vcombine_u32(V0, V1)); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_set_epi32(static_cast(w), static_cast(z), static_cast(y), static_cast(x)); + return _mm_castsi128_ps(V); +#endif +} + +//------------------------------------------------------------------------------ +// Initialize a vector with a replicated floating point value +inline XMVECTOR XM_CALLCONV XMVectorReplicate(float Value) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult; + vResult.f[0] = + vResult.f[1] 
= + vResult.f[2] = + vResult.f[3] = Value; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vdupq_n_f32(Value); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_set_ps1(Value); +#endif +} + +//------------------------------------------------------------------------------ +// Initialize a vector with a replicated floating point value passed by pointer +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMVectorReplicatePtr(const float* pValue) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + float Value = pValue[0]; + XMVECTORF32 vResult; + vResult.f[0] = + vResult.f[1] = + vResult.f[2] = + vResult.f[3] = Value; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vld1q_dup_f32(pValue); +#elif defined(_XM_AVX_INTRINSICS_) + return _mm_broadcast_ss(pValue); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_load_ps1(pValue); +#endif +} + +//------------------------------------------------------------------------------ +// Initialize a vector with a replicated integer value +inline XMVECTOR XM_CALLCONV XMVectorReplicateInt(uint32_t Value) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORU32 vResult; + vResult.u[0] = + vResult.u[1] = + vResult.u[2] = + vResult.u[3] = Value; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vreinterpretq_f32_u32(vdupq_n_u32(Value)); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_set1_epi32(static_cast(Value)); + return _mm_castsi128_ps(vTemp); +#endif +} + +//------------------------------------------------------------------------------ +// Initialize a vector with a replicated integer value passed by pointer +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMVectorReplicateIntPtr(const uint32_t* pValue) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + uint32_t Value = pValue[0]; + XMVECTORU32 vResult; + vResult.u[0] = + vResult.u[1] = + vResult.u[2] = + vResult.u[3] = Value; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vreinterpretq_f32_u32(vld1q_dup_u32(pValue)); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_load_ps1(reinterpret_cast(pValue)); +#endif +} + +//------------------------------------------------------------------------------ +// Initialize a vector with all bits set (true mask) +inline XMVECTOR XM_CALLCONV XMVectorTrueInt() noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORU32 vResult = { { { 0xFFFFFFFFU, 0xFFFFFFFFU, 0xFFFFFFFFU, 0xFFFFFFFFU } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vreinterpretq_f32_s32(vdupq_n_s32(-1)); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_set1_epi32(-1); + return _mm_castsi128_ps(V); +#endif +} + +//------------------------------------------------------------------------------ +// Initialize a vector with all bits clear (false mask) +inline XMVECTOR XM_CALLCONV XMVectorFalseInt() noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { { { 0.0f, 0.0f, 0.0f, 0.0f } } }; + return vResult; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vreinterpretq_f32_u32(vdupq_n_u32(0)); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_setzero_ps(); +#endif +} + +//------------------------------------------------------------------------------ +// Replicate the x component of the vector +inline XMVECTOR XM_CALLCONV XMVectorSplatX(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult; + vResult.f[0] = + vResult.f[1] = + vResult.f[2] = + vResult.f[3] = V.vector4_f32[0]; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return 
vdupq_lane_f32(vget_low_f32(V), 0); +#elif defined(_XM_AVX2_INTRINSICS_) && defined(_XM_FAVOR_INTEL_) + return _mm_broadcastss_ps(V); +#elif defined(_XM_SSE_INTRINSICS_) + return XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); +#endif +} + +//------------------------------------------------------------------------------ +// Replicate the y component of the vector +inline XMVECTOR XM_CALLCONV XMVectorSplatY(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult; + vResult.f[0] = + vResult.f[1] = + vResult.f[2] = + vResult.f[3] = V.vector4_f32[1]; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vdupq_lane_f32(vget_low_f32(V), 1); +#elif defined(_XM_SSE_INTRINSICS_) + return XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); +#endif +} + +//------------------------------------------------------------------------------ +// Replicate the z component of the vector +inline XMVECTOR XM_CALLCONV XMVectorSplatZ(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult; + vResult.f[0] = + vResult.f[1] = + vResult.f[2] = + vResult.f[3] = V.vector4_f32[2]; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vdupq_lane_f32(vget_high_f32(V), 0); +#elif defined(_XM_SSE_INTRINSICS_) + return XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); +#endif +} + +//------------------------------------------------------------------------------ +// Replicate the w component of the vector +inline XMVECTOR XM_CALLCONV XMVectorSplatW(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult; + vResult.f[0] = + vResult.f[1] = + vResult.f[2] = + vResult.f[3] = V.vector4_f32[3]; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vdupq_lane_f32(vget_high_f32(V), 1); +#elif defined(_XM_SSE_INTRINSICS_) + return XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3)); +#endif +} + +//------------------------------------------------------------------------------ +// Return a vector of 1.0f,1.0f,1.0f,1.0f +inline XMVECTOR XM_CALLCONV XMVectorSplatOne() noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult; + vResult.f[0] = + vResult.f[1] = + vResult.f[2] = + vResult.f[3] = 1.0f; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vdupq_n_f32(1.0f); +#elif defined(_XM_SSE_INTRINSICS_) + return g_XMOne; +#endif +} + +//------------------------------------------------------------------------------ +// Return a vector of INF,INF,INF,INF +inline XMVECTOR XM_CALLCONV XMVectorSplatInfinity() noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORU32 vResult; + vResult.u[0] = + vResult.u[1] = + vResult.u[2] = + vResult.u[3] = 0x7F800000; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vreinterpretq_f32_u32(vdupq_n_u32(0x7F800000)); +#elif defined(_XM_SSE_INTRINSICS_) + return g_XMInfinity; +#endif +} + +//------------------------------------------------------------------------------ +// Return a vector of Q_NAN,Q_NAN,Q_NAN,Q_NAN +inline XMVECTOR XM_CALLCONV XMVectorSplatQNaN() noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORU32 vResult; + vResult.u[0] = + vResult.u[1] = + vResult.u[2] = + vResult.u[3] = 0x7FC00000; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vreinterpretq_f32_u32(vdupq_n_u32(0x7FC00000)); +#elif defined(_XM_SSE_INTRINSICS_) + return g_XMQNaN; +#endif +} + +//------------------------------------------------------------------------------ +// Return a vector of 1.192092896e-7f,1.192092896e-7f,1.192092896e-7f,1.192092896e-7f +inline XMVECTOR 
XM_CALLCONV XMVectorSplatEpsilon() noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORU32 vResult; + vResult.u[0] = + vResult.u[1] = + vResult.u[2] = + vResult.u[3] = 0x34000000; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vreinterpretq_f32_u32(vdupq_n_u32(0x34000000)); +#elif defined(_XM_SSE_INTRINSICS_) + return g_XMEpsilon; +#endif +} + +//------------------------------------------------------------------------------ +// Return a vector of -0.0f (0x80000000),-0.0f,-0.0f,-0.0f +inline XMVECTOR XM_CALLCONV XMVectorSplatSignMask() noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORU32 vResult; + vResult.u[0] = + vResult.u[1] = + vResult.u[2] = + vResult.u[3] = 0x80000000U; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vreinterpretq_f32_u32(vdupq_n_u32(0x80000000U)); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_set1_epi32(static_cast(0x80000000)); + return _mm_castsi128_ps(V); +#endif +} + +//------------------------------------------------------------------------------ +// Return a floating point value via an index. This is not a recommended +// function to use due to performance loss. +inline float XM_CALLCONV XMVectorGetByIndex(FXMVECTOR V, size_t i) noexcept +{ + assert(i < 4); + _Analysis_assume_(i < 4); +#if defined(_XM_NO_INTRINSICS_) + return V.vector4_f32[i]; +#else + XMVECTORF32 U; + U.v = V; + return U.f[i]; +#endif +} + +//------------------------------------------------------------------------------ +// Return the X component in an FPU register. +inline float XM_CALLCONV XMVectorGetX(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return V.vector4_f32[0]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vgetq_lane_f32(V, 0); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_cvtss_f32(V); +#endif +} + +// Return the Y component in an FPU register. +inline float XM_CALLCONV XMVectorGetY(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return V.vector4_f32[1]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vgetq_lane_f32(V, 1); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); + return _mm_cvtss_f32(vTemp); +#endif +} + +// Return the Z component in an FPU register. +inline float XM_CALLCONV XMVectorGetZ(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return V.vector4_f32[2]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vgetq_lane_f32(V, 2); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); + return _mm_cvtss_f32(vTemp); +#endif +} + +// Return the W component in an FPU register. +inline float XM_CALLCONV XMVectorGetW(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return V.vector4_f32[3]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vgetq_lane_f32(V, 3); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3)); + return _mm_cvtss_f32(vTemp); +#endif +} + +//------------------------------------------------------------------------------ + +// Store a component indexed by i into a 32 bit float location in memory. 
+_Use_decl_annotations_ +inline void XM_CALLCONV XMVectorGetByIndexPtr(float* f, FXMVECTOR V, size_t i) noexcept +{ + assert(f != nullptr); + assert(i < 4); + _Analysis_assume_(i < 4); +#if defined(_XM_NO_INTRINSICS_) + *f = V.vector4_f32[i]; +#else + XMVECTORF32 U; + U.v = V; + *f = U.f[i]; +#endif +} + +//------------------------------------------------------------------------------ + +// Store the X component into a 32 bit float location in memory. +_Use_decl_annotations_ +inline void XM_CALLCONV XMVectorGetXPtr(float* x, FXMVECTOR V) noexcept +{ + assert(x != nullptr); +#if defined(_XM_NO_INTRINSICS_) + *x = V.vector4_f32[0]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + vst1q_lane_f32(x, V, 0); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_store_ss(x, V); +#endif +} + +// Store the Y component into a 32 bit float location in memory. +_Use_decl_annotations_ +inline void XM_CALLCONV XMVectorGetYPtr(float* y, FXMVECTOR V) noexcept +{ + assert(y != nullptr); +#if defined(_XM_NO_INTRINSICS_) + *y = V.vector4_f32[1]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + vst1q_lane_f32(y, V, 1); +#elif defined(_XM_SSE4_INTRINSICS_) + * (reinterpret_cast(y)) = _mm_extract_ps(V, 1); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); + _mm_store_ss(y, vResult); +#endif +} + +// Store the Z component into a 32 bit float location in memory. +_Use_decl_annotations_ +inline void XM_CALLCONV XMVectorGetZPtr(float* z, FXMVECTOR V) noexcept +{ + assert(z != nullptr); +#if defined(_XM_NO_INTRINSICS_) + *z = V.vector4_f32[2]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + vst1q_lane_f32(z, V, 2); +#elif defined(_XM_SSE4_INTRINSICS_) + * (reinterpret_cast(z)) = _mm_extract_ps(V, 2); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); + _mm_store_ss(z, vResult); +#endif +} + +// Store the W component into a 32 bit float location in memory. +_Use_decl_annotations_ +inline void XM_CALLCONV XMVectorGetWPtr(float* w, FXMVECTOR V) noexcept +{ + assert(w != nullptr); +#if defined(_XM_NO_INTRINSICS_) + *w = V.vector4_f32[3]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + vst1q_lane_f32(w, V, 3); +#elif defined(_XM_SSE4_INTRINSICS_) + * (reinterpret_cast(w)) = _mm_extract_ps(V, 3); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3)); + _mm_store_ss(w, vResult); +#endif +} + +//------------------------------------------------------------------------------ + +// Return an integer value via an index. This is not a recommended +// function to use due to performance loss. +inline uint32_t XM_CALLCONV XMVectorGetIntByIndex(FXMVECTOR V, size_t i) noexcept +{ + assert(i < 4); + _Analysis_assume_(i < 4); +#if defined(_XM_NO_INTRINSICS_) + return V.vector4_u32[i]; +#else + XMVECTORU32 U; + U.v = V; + return U.u[i]; +#endif +} + +//------------------------------------------------------------------------------ + +// Return the X component in an integer register. +inline uint32_t XM_CALLCONV XMVectorGetIntX(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return V.vector4_u32[0]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vgetq_lane_u32(vreinterpretq_u32_f32(V), 0); +#elif defined(_XM_SSE_INTRINSICS_) + return static_cast(_mm_cvtsi128_si32(_mm_castps_si128(V))); +#endif +} + +// Return the Y component in an integer register. 
+inline uint32_t XM_CALLCONV XMVectorGetIntY(FXMVECTOR V) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    return V.vector4_u32[1];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vgetq_lane_u32(vreinterpretq_u32_f32(V), 1);
+#elif defined(_XM_SSE4_INTRINSICS_)
+    __m128i V1 = _mm_castps_si128(V);
+    return static_cast<uint32_t>(_mm_extract_epi32(V1, 1));
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128i vResulti = _mm_shuffle_epi32(_mm_castps_si128(V), _MM_SHUFFLE(1, 1, 1, 1));
+    return static_cast<uint32_t>(_mm_cvtsi128_si32(vResulti));
+#endif
+}
+
+// Return the Z component in an integer register.
+inline uint32_t XM_CALLCONV XMVectorGetIntZ(FXMVECTOR V) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    return V.vector4_u32[2];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vgetq_lane_u32(vreinterpretq_u32_f32(V), 2);
+#elif defined(_XM_SSE4_INTRINSICS_)
+    __m128i V1 = _mm_castps_si128(V);
+    return static_cast<uint32_t>(_mm_extract_epi32(V1, 2));
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128i vResulti = _mm_shuffle_epi32(_mm_castps_si128(V), _MM_SHUFFLE(2, 2, 2, 2));
+    return static_cast<uint32_t>(_mm_cvtsi128_si32(vResulti));
+#endif
+}
+
+// Return the W component in an integer register.
+inline uint32_t XM_CALLCONV XMVectorGetIntW(FXMVECTOR V) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    return V.vector4_u32[3];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vgetq_lane_u32(vreinterpretq_u32_f32(V), 3);
+#elif defined(_XM_SSE4_INTRINSICS_)
+    __m128i V1 = _mm_castps_si128(V);
+    return static_cast<uint32_t>(_mm_extract_epi32(V1, 3));
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128i vResulti = _mm_shuffle_epi32(_mm_castps_si128(V), _MM_SHUFFLE(3, 3, 3, 3));
+    return static_cast<uint32_t>(_mm_cvtsi128_si32(vResulti));
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+// Store a component indexed by i into a 32 bit integer location in memory.
+_Use_decl_annotations_
+inline void XM_CALLCONV XMVectorGetIntByIndexPtr(uint32_t* x, FXMVECTOR V, size_t i) noexcept
+{
+    assert(x != nullptr);
+    assert(i < 4);
+    _Analysis_assume_(i < 4);
+#if defined(_XM_NO_INTRINSICS_)
+    *x = V.vector4_u32[i];
+#else
+    XMVECTORU32 U;
+    U.v = V;
+    *x = U.u[i];
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+// Store the X component into a 32 bit integer location in memory.
+_Use_decl_annotations_
+inline void XM_CALLCONV XMVectorGetIntXPtr(uint32_t* x, FXMVECTOR V) noexcept
+{
+    assert(x != nullptr);
+#if defined(_XM_NO_INTRINSICS_)
+    *x = V.vector4_u32[0];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    vst1q_lane_u32(x, *reinterpret_cast<const uint32x4_t*>(&V), 0);
+#elif defined(_XM_SSE_INTRINSICS_)
+    _mm_store_ss(reinterpret_cast<float*>(x), V);
+#endif
+}
+
+// Store the Y component into a 32 bit integer location in memory.
+_Use_decl_annotations_
+inline void XM_CALLCONV XMVectorGetIntYPtr(uint32_t* y, FXMVECTOR V) noexcept
+{
+    assert(y != nullptr);
+#if defined(_XM_NO_INTRINSICS_)
+    *y = V.vector4_u32[1];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    vst1q_lane_u32(y, *reinterpret_cast<const uint32x4_t*>(&V), 1);
+#elif defined(_XM_SSE4_INTRINSICS_)
+    __m128i V1 = _mm_castps_si128(V);
+    *y = static_cast<uint32_t>(_mm_extract_epi32(V1, 1));
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
+    _mm_store_ss(reinterpret_cast<float*>(y), vResult);
+#endif
+}
+
+// Store the Z component into a 32 bit integer location in memory.
+_Use_decl_annotations_ +inline void XM_CALLCONV XMVectorGetIntZPtr(uint32_t* z, FXMVECTOR V) noexcept +{ + assert(z != nullptr); +#if defined(_XM_NO_INTRINSICS_) + *z = V.vector4_u32[2]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + vst1q_lane_u32(z, *reinterpret_cast(&V), 2); +#elif defined(_XM_SSE4_INTRINSICS_) + __m128i V1 = _mm_castps_si128(V); + *z = static_cast(_mm_extract_epi32(V1, 2)); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); + _mm_store_ss(reinterpret_cast(z), vResult); +#endif +} + +// Store the W component into a 32 bit integer location in memory. +_Use_decl_annotations_ +inline void XM_CALLCONV XMVectorGetIntWPtr(uint32_t* w, FXMVECTOR V) noexcept +{ + assert(w != nullptr); +#if defined(_XM_NO_INTRINSICS_) + *w = V.vector4_u32[3]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + vst1q_lane_u32(w, *reinterpret_cast(&V), 3); +#elif defined(_XM_SSE4_INTRINSICS_) + __m128i V1 = _mm_castps_si128(V); + *w = static_cast(_mm_extract_epi32(V1, 3)); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3)); + _mm_store_ss(reinterpret_cast(w), vResult); +#endif +} + +//------------------------------------------------------------------------------ + +// Set a single indexed floating point component +inline XMVECTOR XM_CALLCONV XMVectorSetByIndex(FXMVECTOR V, float f, size_t i) noexcept +{ + assert(i < 4); + _Analysis_assume_(i < 4); + XMVECTORF32 U; + U.v = V; + U.f[i] = f; + return U.v; +} + +//------------------------------------------------------------------------------ + +// Sets the X component of a vector to a passed floating point value +inline XMVECTOR XM_CALLCONV XMVectorSetX(FXMVECTOR V, float x) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 U = { { { + x, + V.vector4_f32[1], + V.vector4_f32[2], + V.vector4_f32[3] + } } }; + return U.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vsetq_lane_f32(x, V, 0); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = _mm_set_ss(x); + vResult = _mm_move_ss(V, vResult); + return vResult; +#endif +} + +// Sets the Y component of a vector to a passed floating point value +inline XMVECTOR XM_CALLCONV XMVectorSetY(FXMVECTOR V, float y) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 U = { { { + V.vector4_f32[0], + y, + V.vector4_f32[2], + V.vector4_f32[3] + } } }; + return U.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vsetq_lane_f32(y, V, 1); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vResult = _mm_set_ss(y); + vResult = _mm_insert_ps(V, vResult, 0x10); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + // Swap y and x + XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 2, 0, 1)); + // Convert input to vector + XMVECTOR vTemp = _mm_set_ss(y); + // Replace the x component + vResult = _mm_move_ss(vResult, vTemp); + // Swap y and x again + vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(3, 2, 0, 1)); + return vResult; +#endif +} +// Sets the Z component of a vector to a passed floating point value +inline XMVECTOR XM_CALLCONV XMVectorSetZ(FXMVECTOR V, float z) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 U = { { { + V.vector4_f32[0], + V.vector4_f32[1], + z, + V.vector4_f32[3] + } } }; + return U.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vsetq_lane_f32(z, V, 2); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vResult = _mm_set_ss(z); + vResult = _mm_insert_ps(V, vResult, 0x20); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + // Swap z and x + XMVECTOR vResult = 
XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 0, 1, 2)); + // Convert input to vector + XMVECTOR vTemp = _mm_set_ss(z); + // Replace the x component + vResult = _mm_move_ss(vResult, vTemp); + // Swap z and x again + vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(3, 0, 1, 2)); + return vResult; +#endif +} + +// Sets the W component of a vector to a passed floating point value +inline XMVECTOR XM_CALLCONV XMVectorSetW(FXMVECTOR V, float w) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 U = { { { + V.vector4_f32[0], + V.vector4_f32[1], + V.vector4_f32[2], + w + } } }; + return U.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vsetq_lane_f32(w, V, 3); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vResult = _mm_set_ss(w); + vResult = _mm_insert_ps(V, vResult, 0x30); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + // Swap w and x + XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 2, 1, 3)); + // Convert input to vector + XMVECTOR vTemp = _mm_set_ss(w); + // Replace the x component + vResult = _mm_move_ss(vResult, vTemp); + // Swap w and x again + vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(0, 2, 1, 3)); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +// Sets a component of a vector to a floating point value passed by pointer +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMVectorSetByIndexPtr(FXMVECTOR V, const float* f, size_t i) noexcept +{ + assert(f != nullptr); + assert(i < 4); + _Analysis_assume_(i < 4); + XMVECTORF32 U; + U.v = V; + U.f[i] = *f; + return U.v; +} + +//------------------------------------------------------------------------------ + +// Sets the X component of a vector to a floating point value passed by pointer +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMVectorSetXPtr(FXMVECTOR V, const float* x) noexcept +{ + assert(x != nullptr); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 U = { { { + *x, + V.vector4_f32[1], + V.vector4_f32[2], + V.vector4_f32[3] + } } }; + return U.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vld1q_lane_f32(x, V, 0); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = _mm_load_ss(x); + vResult = _mm_move_ss(V, vResult); + return vResult; +#endif +} + +// Sets the Y component of a vector to a floating point value passed by pointer +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMVectorSetYPtr(FXMVECTOR V, const float* y) noexcept +{ + assert(y != nullptr); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 U = { { { + V.vector4_f32[0], + *y, + V.vector4_f32[2], + V.vector4_f32[3] + } } }; + return U.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vld1q_lane_f32(y, V, 1); +#elif defined(_XM_SSE_INTRINSICS_) + // Swap y and x + XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 2, 0, 1)); + // Convert input to vector + XMVECTOR vTemp = _mm_load_ss(y); + // Replace the x component + vResult = _mm_move_ss(vResult, vTemp); + // Swap y and x again + vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(3, 2, 0, 1)); + return vResult; +#endif +} + +// Sets the Z component of a vector to a floating point value passed by pointer +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMVectorSetZPtr(FXMVECTOR V, const float* z) noexcept +{ + assert(z != nullptr); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 U = { { { + V.vector4_f32[0], + V.vector4_f32[1], + *z, + V.vector4_f32[3] + } } }; + return U.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vld1q_lane_f32(z, V, 2); +#elif defined(_XM_SSE_INTRINSICS_) + // Swap z and x + XMVECTOR vResult = 
XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 0, 1, 2)); + // Convert input to vector + XMVECTOR vTemp = _mm_load_ss(z); + // Replace the x component + vResult = _mm_move_ss(vResult, vTemp); + // Swap z and x again + vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(3, 0, 1, 2)); + return vResult; +#endif +} + +// Sets the W component of a vector to a floating point value passed by pointer +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMVectorSetWPtr(FXMVECTOR V, const float* w) noexcept +{ + assert(w != nullptr); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 U = { { { + V.vector4_f32[0], + V.vector4_f32[1], + V.vector4_f32[2], + *w + } } }; + return U.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vld1q_lane_f32(w, V, 3); +#elif defined(_XM_SSE_INTRINSICS_) + // Swap w and x + XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 2, 1, 3)); + // Convert input to vector + XMVECTOR vTemp = _mm_load_ss(w); + // Replace the x component + vResult = _mm_move_ss(vResult, vTemp); + // Swap w and x again + vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(0, 2, 1, 3)); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +// Sets a component of a vector to an integer passed by value +inline XMVECTOR XM_CALLCONV XMVectorSetIntByIndex(FXMVECTOR V, uint32_t x, size_t i) noexcept +{ + assert(i < 4); + _Analysis_assume_(i < 4); + XMVECTORU32 tmp; + tmp.v = V; + tmp.u[i] = x; + return tmp; +} + +//------------------------------------------------------------------------------ + +// Sets the X component of a vector to an integer passed by value +inline XMVECTOR XM_CALLCONV XMVectorSetIntX(FXMVECTOR V, uint32_t x) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORU32 U = { { { + x, + V.vector4_u32[1], + V.vector4_u32[2], + V.vector4_u32[3] + } } }; + return U.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vreinterpretq_f32_u32(vsetq_lane_u32(x, vreinterpretq_u32_f32(V), 0)); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_cvtsi32_si128(static_cast(x)); + XMVECTOR vResult = _mm_move_ss(V, _mm_castsi128_ps(vTemp)); + return vResult; +#endif +} + +// Sets the Y component of a vector to an integer passed by value +inline XMVECTOR XM_CALLCONV XMVectorSetIntY(FXMVECTOR V, uint32_t y) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORU32 U = { { { + V.vector4_u32[0], + y, + V.vector4_u32[2], + V.vector4_u32[3] + } } }; + return U.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vreinterpretq_f32_u32(vsetq_lane_u32(y, vreinterpretq_u32_f32(V), 1)); +#elif defined(_XM_SSE4_INTRINSICS_) + __m128i vResult = _mm_castps_si128(V); + vResult = _mm_insert_epi32(vResult, static_cast(y), 1); + return _mm_castsi128_ps(vResult); +#elif defined(_XM_SSE_INTRINSICS_) + // Swap y and x + XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 2, 0, 1)); + // Convert input to vector + __m128i vTemp = _mm_cvtsi32_si128(static_cast(y)); + // Replace the x component + vResult = _mm_move_ss(vResult, _mm_castsi128_ps(vTemp)); + // Swap y and x again + vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(3, 2, 0, 1)); + return vResult; +#endif +} + +// Sets the Z component of a vector to an integer passed by value +inline XMVECTOR XM_CALLCONV XMVectorSetIntZ(FXMVECTOR V, uint32_t z) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORU32 U = { { { + V.vector4_u32[0], + V.vector4_u32[1], + z, + V.vector4_u32[3] + } } }; + return U.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vreinterpretq_f32_u32(vsetq_lane_u32(z, vreinterpretq_u32_f32(V), 2)); +#elif 
defined(_XM_SSE4_INTRINSICS_) + __m128i vResult = _mm_castps_si128(V); + vResult = _mm_insert_epi32(vResult, static_cast(z), 2); + return _mm_castsi128_ps(vResult); +#elif defined(_XM_SSE_INTRINSICS_) + // Swap z and x + XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 0, 1, 2)); + // Convert input to vector + __m128i vTemp = _mm_cvtsi32_si128(static_cast(z)); + // Replace the x component + vResult = _mm_move_ss(vResult, _mm_castsi128_ps(vTemp)); + // Swap z and x again + vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(3, 0, 1, 2)); + return vResult; +#endif +} + +// Sets the W component of a vector to an integer passed by value +inline XMVECTOR XM_CALLCONV XMVectorSetIntW(FXMVECTOR V, uint32_t w) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORU32 U = { { { + V.vector4_u32[0], + V.vector4_u32[1], + V.vector4_u32[2], + w + } } }; + return U.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vreinterpretq_f32_u32(vsetq_lane_u32(w, vreinterpretq_u32_f32(V), 3)); +#elif defined(_XM_SSE4_INTRINSICS_) + __m128i vResult = _mm_castps_si128(V); + vResult = _mm_insert_epi32(vResult, static_cast(w), 3); + return _mm_castsi128_ps(vResult); +#elif defined(_XM_SSE_INTRINSICS_) + // Swap w and x + XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 2, 1, 3)); + // Convert input to vector + __m128i vTemp = _mm_cvtsi32_si128(static_cast(w)); + // Replace the x component + vResult = _mm_move_ss(vResult, _mm_castsi128_ps(vTemp)); + // Swap w and x again + vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(0, 2, 1, 3)); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +// Sets a component of a vector to an integer value passed by pointer +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMVectorSetIntByIndexPtr(FXMVECTOR V, const uint32_t* x, size_t i) noexcept +{ + assert(x != nullptr); + assert(i < 4); + _Analysis_assume_(i < 4); + XMVECTORU32 tmp; + tmp.v = V; + tmp.u[i] = *x; + return tmp; +} + +//------------------------------------------------------------------------------ + +// Sets the X component of a vector to an integer value passed by pointer +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMVectorSetIntXPtr(FXMVECTOR V, const uint32_t* x) noexcept +{ + assert(x != nullptr); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORU32 U = { { { + *x, + V.vector4_u32[1], + V.vector4_u32[2], + V.vector4_u32[3] + } } }; + return U.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vreinterpretq_f32_u32(vld1q_lane_u32(x, *reinterpret_cast(&V), 0)); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_load_ss(reinterpret_cast(x)); + XMVECTOR vResult = _mm_move_ss(V, vTemp); + return vResult; +#endif +} + +// Sets the Y component of a vector to an integer value passed by pointer +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMVectorSetIntYPtr(FXMVECTOR V, const uint32_t* y) noexcept +{ + assert(y != nullptr); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORU32 U = { { { + V.vector4_u32[0], + *y, + V.vector4_u32[2], + V.vector4_u32[3] + } } }; + return U.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vreinterpretq_f32_u32(vld1q_lane_u32(y, *reinterpret_cast(&V), 1)); +#elif defined(_XM_SSE_INTRINSICS_) + // Swap y and x + XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 2, 0, 1)); + // Convert input to vector + XMVECTOR vTemp = _mm_load_ss(reinterpret_cast(y)); + // Replace the x component + vResult = _mm_move_ss(vResult, vTemp); + // Swap y and x again + vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(3, 2, 0, 1)); + return 
vResult;
+#endif
+}
+
+// Sets the Z component of a vector to an integer value passed by pointer
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMVectorSetIntZPtr(FXMVECTOR V, const uint32_t* z) noexcept
+{
+    assert(z != nullptr);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORU32 U = { { {
+            V.vector4_u32[0],
+            V.vector4_u32[1],
+            *z,
+            V.vector4_u32[3]
+        } } };
+    return U.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vreinterpretq_f32_u32(vld1q_lane_u32(z, *reinterpret_cast<const uint32x4_t*>(&V), 2));
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Swap z and x
+    XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 0, 1, 2));
+    // Convert input to vector
+    XMVECTOR vTemp = _mm_load_ss(reinterpret_cast<const float*>(z));
+    // Replace the x component
+    vResult = _mm_move_ss(vResult, vTemp);
+    // Swap z and x again
+    vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(3, 0, 1, 2));
+    return vResult;
+#endif
+}
+
+// Sets the W component of a vector to an integer value passed by pointer
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMVectorSetIntWPtr(FXMVECTOR V, const uint32_t* w) noexcept
+{
+    assert(w != nullptr);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORU32 U = { { {
+            V.vector4_u32[0],
+            V.vector4_u32[1],
+            V.vector4_u32[2],
+            *w
+        } } };
+    return U.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vreinterpretq_f32_u32(vld1q_lane_u32(w, *reinterpret_cast<const uint32x4_t*>(&V), 3));
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Swap w and x
+    XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 2, 1, 3));
+    // Convert input to vector
+    XMVECTOR vTemp = _mm_load_ss(reinterpret_cast<const float*>(w));
+    // Replace the x component
+    vResult = _mm_move_ss(vResult, vTemp);
+    // Swap w and x again
+    vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(0, 2, 1, 3));
+    return vResult;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorSwizzle
+(
+    FXMVECTOR V,
+    uint32_t E0,
+    uint32_t E1,
+    uint32_t E2,
+    uint32_t E3
+) noexcept
+{
+    assert((E0 < 4) && (E1 < 4) && (E2 < 4) && (E3 < 4));
+    _Analysis_assume_((E0 < 4) && (E1 < 4) && (E2 < 4) && (E3 < 4));
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTORF32 Result = { { {
+            V.vector4_f32[E0],
+            V.vector4_f32[E1],
+            V.vector4_f32[E2],
+            V.vector4_f32[E3]
+        } } };
+    return Result.v;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    static const uint32_t ControlElement[4] =
+    {
+        0x03020100, // XM_SWIZZLE_X
+        0x07060504, // XM_SWIZZLE_Y
+        0x0B0A0908, // XM_SWIZZLE_Z
+        0x0F0E0D0C, // XM_SWIZZLE_W
+    };
+
+    uint8x8x2_t tbl;
+    tbl.val[0] = vreinterpret_u8_f32(vget_low_f32(V));
+    tbl.val[1] = vreinterpret_u8_f32(vget_high_f32(V));
+
+    uint32x2_t idx = vcreate_u32(static_cast<uint64_t>(ControlElement[E0]) | (static_cast<uint64_t>(ControlElement[E1]) << 32));
+    const uint8x8_t rL = vtbl2_u8(tbl, vreinterpret_u8_u32(idx));
+
+    idx = vcreate_u32(static_cast<uint64_t>(ControlElement[E2]) | (static_cast<uint64_t>(ControlElement[E3]) << 32));
+    const uint8x8_t rH = vtbl2_u8(tbl, vreinterpret_u8_u32(idx));
+
+    return vcombine_f32(vreinterpret_f32_u8(rL), vreinterpret_f32_u8(rH));
+#elif defined(_XM_AVX_INTRINSICS_)
+    unsigned int elem[4] = { E0, E1, E2, E3 };
+    __m128i vControl = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&elem[0]));
+    return _mm_permutevar_ps(V, vControl);
+#else
+    auto aPtr = reinterpret_cast<const uint32_t*>(&V);
+
+    XMVECTOR Result;
+    auto pWork = reinterpret_cast<uint32_t*>(&Result);
+
+    pWork[0] = aPtr[E0];
+    pWork[1] = aPtr[E1];
+    pWork[2] = aPtr[E2];
+    pWork[3] = aPtr[E3];
+
+    return Result;
+#endif
+}
+
+//------------------------------------------------------------------------------
+inline XMVECTOR XM_CALLCONV XMVectorPermute
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2,
+    uint32_t PermuteX,
+    uint32_t PermuteY,
+    uint32_t PermuteZ,
+    uint32_t PermuteW
+) noexcept
+{
+    assert(PermuteX <= 7 && PermuteY <= 7 && PermuteZ <= 7 && PermuteW <= 7);
+    _Analysis_assume_(PermuteX <= 7 && PermuteY <= 7 && PermuteZ <= 7 && PermuteW <= 7);
+
+#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+    static const uint32_t ControlElement[8] =
+    {
+        0x03020100, // XM_PERMUTE_0X
+        0x07060504, // XM_PERMUTE_0Y
+        0x0B0A0908, // XM_PERMUTE_0Z
+        0x0F0E0D0C, // XM_PERMUTE_0W
+        0x13121110, // XM_PERMUTE_1X
+        0x17161514, // XM_PERMUTE_1Y
+        0x1B1A1918, // XM_PERMUTE_1Z
+        0x1F1E1D1C, // XM_PERMUTE_1W
+    };
+
+    uint8x8x4_t tbl;
+    tbl.val[0] = vreinterpret_u8_f32(vget_low_f32(V1));
+    tbl.val[1] = vreinterpret_u8_f32(vget_high_f32(V1));
+    tbl.val[2] = vreinterpret_u8_f32(vget_low_f32(V2));
+    tbl.val[3] = vreinterpret_u8_f32(vget_high_f32(V2));
+
+    uint32x2_t idx = vcreate_u32(static_cast<uint64_t>(ControlElement[PermuteX]) | (static_cast<uint64_t>(ControlElement[PermuteY]) << 32));
+    const uint8x8_t rL = vtbl4_u8(tbl, vreinterpret_u8_u32(idx));
+
+    idx = vcreate_u32(static_cast<uint64_t>(ControlElement[PermuteZ]) | (static_cast<uint64_t>(ControlElement[PermuteW]) << 32));
+    const uint8x8_t rH = vtbl4_u8(tbl, vreinterpret_u8_u32(idx));
+
+    return vcombine_f32(vreinterpret_f32_u8(rL), vreinterpret_f32_u8(rH));
+#elif defined(_XM_AVX_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+    static const XMVECTORU32 three = { { { 3, 3, 3, 3 } } };
+
+    XM_ALIGNED_DATA(16) unsigned int elem[4] = { PermuteX, PermuteY, PermuteZ, PermuteW };
+    __m128i vControl = _mm_load_si128(reinterpret_cast<const __m128i*>(&elem[0]));
+
+    __m128i vSelect = _mm_cmpgt_epi32(vControl, three);
+    vControl = _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps(vControl), three));
+
+    __m128 shuffled1 = _mm_permutevar_ps(V1, vControl);
+    __m128 shuffled2 = _mm_permutevar_ps(V2, vControl);
+
+    __m128 masked1 = _mm_andnot_ps(_mm_castsi128_ps(vSelect), shuffled1);
+    __m128 masked2 = _mm_and_ps(_mm_castsi128_ps(vSelect), shuffled2);
+
+    return _mm_or_ps(masked1, masked2);
+#else
+
+    const uint32_t* aPtr[2];
+    aPtr[0] = reinterpret_cast<const uint32_t*>(&V1);
+    aPtr[1] = reinterpret_cast<const uint32_t*>(&V2);
+
+    XMVECTOR Result;
+    auto pWork = reinterpret_cast<uint32_t*>(&Result);
+
+    const uint32_t i0 = PermuteX & 3;
+    const uint32_t vi0 = PermuteX >> 2;
+    pWork[0] = aPtr[vi0][i0];
+
+    const uint32_t i1 = PermuteY & 3;
+    const uint32_t vi1 = PermuteY >> 2;
+    pWork[1] = aPtr[vi1][i1];
+
+    const uint32_t i2 = PermuteZ & 3;
+    const uint32_t vi2 = PermuteZ >> 2;
+    pWork[2] = aPtr[vi2][i2];
+
+    const uint32_t i3 = PermuteW & 3;
+    const uint32_t vi3 = PermuteW >> 2;
+    pWork[3] = aPtr[vi3][i3];
+
+    return Result;
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Define a control vector to be used in XMVectorSelect
+// operations. The four integers specified in XMVectorSelectControl
+// serve as indices to select between components in two vectors.
+// The first index controls selection for the first component of
+// the vectors involved in a select operation, the second index
+// controls selection for the second component etc. A value of
+// zero for an index causes the corresponding component from the first
+// vector to be selected whereas a one causes the component from the
+// second vector to be selected instead.
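+//
+// For example, a control built with XMVectorSelectControl(0, 0, 1, 1)
+// makes XMVectorSelect take x and y from its first argument and z and w
+// from its second (illustrative sketch only, values chosen arbitrarily):
+//
+//     XMVECTOR a    = XMVectorSet(1.f, 2.f, 3.f, 4.f);
+//     XMVECTOR b    = XMVectorSet(5.f, 6.f, 7.f, 8.f);
+//     XMVECTOR ctrl = XMVectorSelectControl(0, 0, 1, 1);
+//     XMVECTOR r    = XMVectorSelect(a, b, ctrl); // r = (1, 2, 7, 8)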
+
+inline XMVECTOR XM_CALLCONV XMVectorSelectControl
+(
+    uint32_t VectorIndex0,
+    uint32_t VectorIndex1,
+    uint32_t VectorIndex2,
+    uint32_t VectorIndex3
+) noexcept
+{
+#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+    // x=Index0,y=Index1,z=Index2,w=Index3
+    __m128i vTemp = _mm_set_epi32(static_cast<int>(VectorIndex3), static_cast<int>(VectorIndex2), static_cast<int>(VectorIndex1), static_cast<int>(VectorIndex0));
+    // Any non-zero entries become 0xFFFFFFFF else 0
+    vTemp = _mm_cmpgt_epi32(vTemp, g_XMZero);
+    return _mm_castsi128_ps(vTemp);
+#elif defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+    int32x2_t V0 = vcreate_s32(static_cast<int64_t>(VectorIndex0) | (static_cast<int64_t>(VectorIndex1) << 32));
+    int32x2_t V1 = vcreate_s32(static_cast<int64_t>(VectorIndex2) | (static_cast<int64_t>(VectorIndex3) << 32));
+    int32x4_t vTemp = vcombine_s32(V0, V1);
+    // Any non-zero entries become 0xFFFFFFFF else 0
+    return vreinterpretq_f32_u32(vcgtq_s32(vTemp, g_XMZero));
+#else
+    XMVECTOR ControlVector;
+    const uint32_t ControlElement[] =
+    {
+        XM_SELECT_0,
+        XM_SELECT_1
+    };
+
+    assert(VectorIndex0 < 2);
+    assert(VectorIndex1 < 2);
+    assert(VectorIndex2 < 2);
+    assert(VectorIndex3 < 2);
+    _Analysis_assume_(VectorIndex0 < 2);
+    _Analysis_assume_(VectorIndex1 < 2);
+    _Analysis_assume_(VectorIndex2 < 2);
+    _Analysis_assume_(VectorIndex3 < 2);
+
+    ControlVector.vector4_u32[0] = ControlElement[VectorIndex0];
+    ControlVector.vector4_u32[1] = ControlElement[VectorIndex1];
+    ControlVector.vector4_u32[2] = ControlElement[VectorIndex2];
+    ControlVector.vector4_u32[3] = ControlElement[VectorIndex3];
+
+    return ControlVector;
+
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorSelect
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2,
+    FXMVECTOR Control
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTORU32 Result = { { {
+            (V1.vector4_u32[0] & ~Control.vector4_u32[0]) | (V2.vector4_u32[0] & Control.vector4_u32[0]),
+            (V1.vector4_u32[1] & ~Control.vector4_u32[1]) | (V2.vector4_u32[1] & Control.vector4_u32[1]),
+            (V1.vector4_u32[2] & ~Control.vector4_u32[2]) | (V2.vector4_u32[2] & Control.vector4_u32[2]),
+            (V1.vector4_u32[3] & ~Control.vector4_u32[3]) | (V2.vector4_u32[3] & Control.vector4_u32[3]),
+        } } };
+    return Result.v;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vbslq_f32(vreinterpretq_u32_f32(Control), V2, V1);
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vTemp1 = _mm_andnot_ps(Control, V1);
+    XMVECTOR vTemp2 = _mm_and_ps(V2, Control);
+    return _mm_or_ps(vTemp1, vTemp2);
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorMergeXY
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTORU32 Result = { { {
+            V1.vector4_u32[0],
+            V2.vector4_u32[0],
+            V1.vector4_u32[1],
+            V2.vector4_u32[1],
+        } } };
+    return Result.v;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vzipq_f32(V1, V2).val[0];
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_unpacklo_ps(V1, V2);
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorMergeZW
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTORU32 Result = { { {
+            V1.vector4_u32[2],
+            V2.vector4_u32[2],
+            V1.vector4_u32[3],
+            V2.vector4_u32[3]
+        } } };
+    return Result.v;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vzipq_f32(V1, V2).val[1];
+#elif
defined(_XM_SSE_INTRINSICS_) + return _mm_unpackhi_ps(V1, V2); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorShiftLeft(FXMVECTOR V1, FXMVECTOR V2, uint32_t Elements) noexcept +{ + assert(Elements < 4); + _Analysis_assume_(Elements < 4); + return XMVectorPermute(V1, V2, Elements, ((Elements)+1), ((Elements)+2), ((Elements)+3)); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorRotateLeft(FXMVECTOR V, uint32_t Elements) noexcept +{ + assert(Elements < 4); + _Analysis_assume_(Elements < 4); + return XMVectorSwizzle(V, Elements & 3, (Elements + 1) & 3, (Elements + 2) & 3, (Elements + 3) & 3); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorRotateRight(FXMVECTOR V, uint32_t Elements) noexcept +{ + assert(Elements < 4); + _Analysis_assume_(Elements < 4); + return XMVectorSwizzle(V, (4 - (Elements)) & 3, (5 - (Elements)) & 3, (6 - (Elements)) & 3, (7 - (Elements)) & 3); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorInsert( + FXMVECTOR VD, FXMVECTOR VS, + uint32_t VSLeftRotateElements, + uint32_t Select0, uint32_t Select1, uint32_t Select2, uint32_t Select3) noexcept +{ + XMVECTOR Control = XMVectorSelectControl(Select0 & 1, Select1 & 1, Select2 & 1, Select3 & 1); + return XMVectorSelect(VD, XMVectorRotateLeft(VS, VSLeftRotateElements), Control); +} + +//------------------------------------------------------------------------------ +// Comparison operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORU32 Control = { { { + (V1.vector4_f32[0] == V2.vector4_f32[0]) ? 0xFFFFFFFF : 0, + (V1.vector4_f32[1] == V2.vector4_f32[1]) ? 0xFFFFFFFF : 0, + (V1.vector4_f32[2] == V2.vector4_f32[2]) ? 0xFFFFFFFF : 0, + (V1.vector4_f32[3] == V2.vector4_f32[3]) ? 0xFFFFFFFF : 0, + } } }; + return Control.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vreinterpretq_f32_u32(vceqq_f32(V1, V2)); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_cmpeq_ps(V1, V2); +#endif +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMVectorEqualR +( + uint32_t* pCR, + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ + assert(pCR != nullptr); +#if defined(_XM_NO_INTRINSICS_) + uint32_t ux = (V1.vector4_f32[0] == V2.vector4_f32[0]) ? 0xFFFFFFFFU : 0; + uint32_t uy = (V1.vector4_f32[1] == V2.vector4_f32[1]) ? 0xFFFFFFFFU : 0; + uint32_t uz = (V1.vector4_f32[2] == V2.vector4_f32[2]) ? 0xFFFFFFFFU : 0; + uint32_t uw = (V1.vector4_f32[3] == V2.vector4_f32[3]) ? 
0xFFFFFFFFU : 0; + uint32_t CR = 0; + if (ux & uy & uz & uw) + { + // All elements are greater + CR = XM_CRMASK_CR6TRUE; + } + else if (!(ux | uy | uz | uw)) + { + // All elements are not greater + CR = XM_CRMASK_CR6FALSE; + } + *pCR = CR; + + XMVECTORU32 Control = { { { ux, uy, uz, uw } } }; + return Control; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vceqq_f32(V1, V2); + uint8x8x2_t vTemp = vzip_u8(vreinterpret_u8_u32(vget_low_u32(vResult)), vreinterpret_u8_u32(vget_high_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1); + uint32_t CR = 0; + if (r == 0xFFFFFFFFU) + { + // All elements are equal + CR = XM_CRMASK_CR6TRUE; + } + else if (!r) + { + // All elements are not equal + CR = XM_CRMASK_CR6FALSE; + } + *pCR = CR; + return vreinterpretq_f32_u32(vResult); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpeq_ps(V1, V2); + uint32_t CR = 0; + int iTest = _mm_movemask_ps(vTemp); + if (iTest == 0xf) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + // All elements are not greater + CR = XM_CRMASK_CR6FALSE; + } + *pCR = CR; + return vTemp; +#endif +} + +//------------------------------------------------------------------------------ +// Treat the components of the vectors as unsigned integers and +// compare individual bits between the two. This is useful for +// comparing control vectors and result vectors returned from +// other comparison operations. + +inline XMVECTOR XM_CALLCONV XMVectorEqualInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORU32 Control = { { { + (V1.vector4_u32[0] == V2.vector4_u32[0]) ? 0xFFFFFFFF : 0, + (V1.vector4_u32[1] == V2.vector4_u32[1]) ? 0xFFFFFFFF : 0, + (V1.vector4_u32[2] == V2.vector4_u32[2]) ? 0xFFFFFFFF : 0, + (V1.vector4_u32[3] == V2.vector4_u32[3]) ? 
0xFFFFFFFF : 0, + } } }; + return Control.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vreinterpretq_f32_u32(vceqq_s32(vreinterpretq_s32_f32(V1), vreinterpretq_s32_f32(V2))); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2)); + return _mm_castsi128_ps(V); +#endif +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMVectorEqualIntR +( + uint32_t* pCR, + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ + assert(pCR != nullptr); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Control = XMVectorEqualInt(V1, V2); + + *pCR = 0; + if (XMVector4EqualInt(Control, XMVectorTrueInt())) + { + // All elements are equal + *pCR |= XM_CRMASK_CR6TRUE; + } + else if (XMVector4EqualInt(Control, XMVectorFalseInt())) + { + // All elements are not equal + *pCR |= XM_CRMASK_CR6FALSE; + } + return Control; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vceqq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2)); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1); + uint32_t CR = 0; + if (r == 0xFFFFFFFFU) + { + // All elements are equal + CR = XM_CRMASK_CR6TRUE; + } + else if (!r) + { + // All elements are not equal + CR = XM_CRMASK_CR6FALSE; + } + *pCR = CR; + return vreinterpretq_f32_u32(vResult); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2)); + int iTemp = _mm_movemask_ps(_mm_castsi128_ps(V)); + uint32_t CR = 0; + if (iTemp == 0x0F) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTemp) + { + CR = XM_CRMASK_CR6FALSE; + } + *pCR = CR; + return _mm_castsi128_ps(V); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorNearEqual +( + FXMVECTOR V1, + FXMVECTOR V2, + FXMVECTOR Epsilon +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + float fDeltax = V1.vector4_f32[0] - V2.vector4_f32[0]; + float fDeltay = V1.vector4_f32[1] - V2.vector4_f32[1]; + float fDeltaz = V1.vector4_f32[2] - V2.vector4_f32[2]; + float fDeltaw = V1.vector4_f32[3] - V2.vector4_f32[3]; + + fDeltax = fabsf(fDeltax); + fDeltay = fabsf(fDeltay); + fDeltaz = fabsf(fDeltaz); + fDeltaw = fabsf(fDeltaw); + + XMVECTORU32 Control = { { { + (fDeltax <= Epsilon.vector4_f32[0]) ? 0xFFFFFFFFU : 0, + (fDeltay <= Epsilon.vector4_f32[1]) ? 0xFFFFFFFFU : 0, + (fDeltaz <= Epsilon.vector4_f32[2]) ? 0xFFFFFFFFU : 0, + (fDeltaw <= Epsilon.vector4_f32[3]) ? 
0xFFFFFFFFU : 0, + } } }; + return Control.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t vDelta = vsubq_f32(V1, V2); +#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES) + return vacleq_f32(vDelta, Epsilon); +#else + return vreinterpretq_f32_u32(vcleq_f32(vabsq_f32(vDelta), Epsilon)); +#endif +#elif defined(_XM_SSE_INTRINSICS_) + // Get the difference + XMVECTOR vDelta = _mm_sub_ps(V1, V2); + // Get the absolute value of the difference + XMVECTOR vTemp = _mm_setzero_ps(); + vTemp = _mm_sub_ps(vTemp, vDelta); + vTemp = _mm_max_ps(vTemp, vDelta); + vTemp = _mm_cmple_ps(vTemp, Epsilon); + return vTemp; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorNotEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORU32 Control = { { { + (V1.vector4_f32[0] != V2.vector4_f32[0]) ? 0xFFFFFFFF : 0, + (V1.vector4_f32[1] != V2.vector4_f32[1]) ? 0xFFFFFFFF : 0, + (V1.vector4_f32[2] != V2.vector4_f32[2]) ? 0xFFFFFFFF : 0, + (V1.vector4_f32[3] != V2.vector4_f32[3]) ? 0xFFFFFFFF : 0, + } } }; + return Control.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(V1, V2))); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_cmpneq_ps(V1, V2); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorNotEqualInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORU32 Control = { { { + (V1.vector4_u32[0] != V2.vector4_u32[0]) ? 0xFFFFFFFFU : 0, + (V1.vector4_u32[1] != V2.vector4_u32[1]) ? 0xFFFFFFFFU : 0, + (V1.vector4_u32[2] != V2.vector4_u32[2]) ? 0xFFFFFFFFU : 0, + (V1.vector4_u32[3] != V2.vector4_u32[3]) ? 0xFFFFFFFFU : 0 + } } }; + return Control.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vreinterpretq_f32_u32(vmvnq_u32( + vceqq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2)))); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2)); + return _mm_xor_ps(_mm_castsi128_ps(V), g_XMNegOneMask); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorGreater +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORU32 Control = { { { + (V1.vector4_f32[0] > V2.vector4_f32[0]) ? 0xFFFFFFFF : 0, + (V1.vector4_f32[1] > V2.vector4_f32[1]) ? 0xFFFFFFFF : 0, + (V1.vector4_f32[2] > V2.vector4_f32[2]) ? 0xFFFFFFFF : 0, + (V1.vector4_f32[3] > V2.vector4_f32[3]) ? 0xFFFFFFFF : 0 + } } }; + return Control.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vreinterpretq_f32_u32(vcgtq_f32(V1, V2)); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_cmpgt_ps(V1, V2); +#endif +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMVectorGreaterR +( + uint32_t* pCR, + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ + assert(pCR != nullptr); +#if defined(_XM_NO_INTRINSICS_) + + uint32_t ux = (V1.vector4_f32[0] > V2.vector4_f32[0]) ? 0xFFFFFFFFU : 0; + uint32_t uy = (V1.vector4_f32[1] > V2.vector4_f32[1]) ? 0xFFFFFFFFU : 0; + uint32_t uz = (V1.vector4_f32[2] > V2.vector4_f32[2]) ? 0xFFFFFFFFU : 0; + uint32_t uw = (V1.vector4_f32[3] > V2.vector4_f32[3]) ? 
0xFFFFFFFFU : 0; + uint32_t CR = 0; + if (ux & uy & uz & uw) + { + // All elements are greater + CR = XM_CRMASK_CR6TRUE; + } + else if (!(ux | uy | uz | uw)) + { + // All elements are not greater + CR = XM_CRMASK_CR6FALSE; + } + *pCR = CR; + + XMVECTORU32 Control = { { { ux, uy, uz, uw } } }; + return Control.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcgtq_f32(V1, V2); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1); + uint32_t CR = 0; + if (r == 0xFFFFFFFFU) + { + // All elements are greater + CR = XM_CRMASK_CR6TRUE; + } + else if (!r) + { + // All elements are not greater + CR = XM_CRMASK_CR6FALSE; + } + *pCR = CR; + return vreinterpretq_f32_u32(vResult); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpgt_ps(V1, V2); + uint32_t CR = 0; + int iTest = _mm_movemask_ps(vTemp); + if (iTest == 0xf) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + // All elements are not greater + CR = XM_CRMASK_CR6FALSE; + } + *pCR = CR; + return vTemp; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorGreaterOrEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORU32 Control = { { { + (V1.vector4_f32[0] >= V2.vector4_f32[0]) ? 0xFFFFFFFF : 0, + (V1.vector4_f32[1] >= V2.vector4_f32[1]) ? 0xFFFFFFFF : 0, + (V1.vector4_f32[2] >= V2.vector4_f32[2]) ? 0xFFFFFFFF : 0, + (V1.vector4_f32[3] >= V2.vector4_f32[3]) ? 0xFFFFFFFF : 0 + } } }; + return Control.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vreinterpretq_f32_u32(vcgeq_f32(V1, V2)); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_cmpge_ps(V1, V2); +#endif +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMVectorGreaterOrEqualR +( + uint32_t* pCR, + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ + assert(pCR != nullptr); +#if defined(_XM_NO_INTRINSICS_) + + uint32_t ux = (V1.vector4_f32[0] >= V2.vector4_f32[0]) ? 0xFFFFFFFFU : 0; + uint32_t uy = (V1.vector4_f32[1] >= V2.vector4_f32[1]) ? 0xFFFFFFFFU : 0; + uint32_t uz = (V1.vector4_f32[2] >= V2.vector4_f32[2]) ? 0xFFFFFFFFU : 0; + uint32_t uw = (V1.vector4_f32[3] >= V2.vector4_f32[3]) ? 
0xFFFFFFFFU : 0; + uint32_t CR = 0; + if (ux & uy & uz & uw) + { + // All elements are greater + CR = XM_CRMASK_CR6TRUE; + } + else if (!(ux | uy | uz | uw)) + { + // All elements are not greater + CR = XM_CRMASK_CR6FALSE; + } + *pCR = CR; + + XMVECTORU32 Control = { { { ux, uy, uz, uw } } }; + return Control.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcgeq_f32(V1, V2); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1); + uint32_t CR = 0; + if (r == 0xFFFFFFFFU) + { + // All elements are greater or equal + CR = XM_CRMASK_CR6TRUE; + } + else if (!r) + { + // All elements are not greater or equal + CR = XM_CRMASK_CR6FALSE; + } + *pCR = CR; + return vreinterpretq_f32_u32(vResult); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpge_ps(V1, V2); + uint32_t CR = 0; + int iTest = _mm_movemask_ps(vTemp); + if (iTest == 0xf) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + // All elements are not greater + CR = XM_CRMASK_CR6FALSE; + } + *pCR = CR; + return vTemp; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorLess +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORU32 Control = { { { + (V1.vector4_f32[0] < V2.vector4_f32[0]) ? 0xFFFFFFFF : 0, + (V1.vector4_f32[1] < V2.vector4_f32[1]) ? 0xFFFFFFFF : 0, + (V1.vector4_f32[2] < V2.vector4_f32[2]) ? 0xFFFFFFFF : 0, + (V1.vector4_f32[3] < V2.vector4_f32[3]) ? 0xFFFFFFFF : 0 + } } }; + return Control.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vreinterpretq_f32_u32(vcltq_f32(V1, V2)); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_cmplt_ps(V1, V2); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorLessOrEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORU32 Control = { { { + (V1.vector4_f32[0] <= V2.vector4_f32[0]) ? 0xFFFFFFFF : 0, + (V1.vector4_f32[1] <= V2.vector4_f32[1]) ? 0xFFFFFFFF : 0, + (V1.vector4_f32[2] <= V2.vector4_f32[2]) ? 0xFFFFFFFF : 0, + (V1.vector4_f32[3] <= V2.vector4_f32[3]) ? 0xFFFFFFFF : 0 + } } }; + return Control.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vreinterpretq_f32_u32(vcleq_f32(V1, V2)); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_cmple_ps(V1, V2); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorInBounds +( + FXMVECTOR V, + FXMVECTOR Bounds +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORU32 Control = { { { + (V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) ? 0xFFFFFFFF : 0, + (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1]) ? 0xFFFFFFFF : 0, + (V.vector4_f32[2] <= Bounds.vector4_f32[2] && V.vector4_f32[2] >= -Bounds.vector4_f32[2]) ? 0xFFFFFFFF : 0, + (V.vector4_f32[3] <= Bounds.vector4_f32[3] && V.vector4_f32[3] >= -Bounds.vector4_f32[3]) ? 
0xFFFFFFFF : 0 + } } }; + return Control.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Test if less than or equal + uint32x4_t vTemp1 = vcleq_f32(V, Bounds); + // Negate the bounds + uint32x4_t vTemp2 = vreinterpretq_u32_f32(vnegq_f32(Bounds)); + // Test if greater or equal (Reversed) + vTemp2 = vcleq_f32(vreinterpretq_f32_u32(vTemp2), V); + // Blend answers + vTemp1 = vandq_u32(vTemp1, vTemp2); + return vreinterpretq_f32_u32(vTemp1); +#elif defined(_XM_SSE_INTRINSICS_) + // Test if less than or equal + XMVECTOR vTemp1 = _mm_cmple_ps(V, Bounds); + // Negate the bounds + XMVECTOR vTemp2 = _mm_mul_ps(Bounds, g_XMNegativeOne); + // Test if greater or equal (Reversed) + vTemp2 = _mm_cmple_ps(vTemp2, V); + // Blend answers + vTemp1 = _mm_and_ps(vTemp1, vTemp2); + return vTemp1; +#endif +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMVectorInBoundsR +( + uint32_t* pCR, + FXMVECTOR V, + FXMVECTOR Bounds +) noexcept +{ + assert(pCR != nullptr); +#if defined(_XM_NO_INTRINSICS_) + + uint32_t ux = (V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) ? 0xFFFFFFFFU : 0; + uint32_t uy = (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1]) ? 0xFFFFFFFFU : 0; + uint32_t uz = (V.vector4_f32[2] <= Bounds.vector4_f32[2] && V.vector4_f32[2] >= -Bounds.vector4_f32[2]) ? 0xFFFFFFFFU : 0; + uint32_t uw = (V.vector4_f32[3] <= Bounds.vector4_f32[3] && V.vector4_f32[3] >= -Bounds.vector4_f32[3]) ? 0xFFFFFFFFU : 0; + + uint32_t CR = 0; + if (ux & uy & uz & uw) + { + // All elements are in bounds + CR = XM_CRMASK_CR6BOUNDS; + } + *pCR = CR; + + XMVECTORU32 Control = { { { ux, uy, uz, uw } } }; + return Control.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Test if less than or equal + uint32x4_t vTemp1 = vcleq_f32(V, Bounds); + // Negate the bounds + uint32x4_t vTemp2 = vreinterpretq_u32_f32(vnegq_f32(Bounds)); + // Test if greater or equal (Reversed) + vTemp2 = vcleq_f32(vreinterpretq_f32_u32(vTemp2), V); + // Blend answers + vTemp1 = vandq_u32(vTemp1, vTemp2); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vTemp1)), vget_high_u8(vreinterpretq_u8_u32(vTemp1))); + uint16x4x2_t vTemp3 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp3.val[1]), 1); + uint32_t CR = 0; + if (r == 0xFFFFFFFFU) + { + // All elements are in bounds + CR = XM_CRMASK_CR6BOUNDS; + } + *pCR = CR; + return vreinterpretq_f32_u32(vTemp1); +#elif defined(_XM_SSE_INTRINSICS_) + // Test if less than or equal + XMVECTOR vTemp1 = _mm_cmple_ps(V, Bounds); + // Negate the bounds + XMVECTOR vTemp2 = _mm_mul_ps(Bounds, g_XMNegativeOne); + // Test if greater or equal (Reversed) + vTemp2 = _mm_cmple_ps(vTemp2, V); + // Blend answers + vTemp1 = _mm_and_ps(vTemp1, vTemp2); + + uint32_t CR = 0; + if (_mm_movemask_ps(vTemp1) == 0xf) + { + // All elements are in bounds + CR = XM_CRMASK_CR6BOUNDS; + } + *pCR = CR; + return vTemp1; +#endif +} + +//------------------------------------------------------------------------------ + +#if !defined(_XM_NO_INTRINSICS_) && defined(_MSC_VER) && !defined(__clang__) && !defined(__INTEL_COMPILER) +#pragma float_control(push) +#pragma float_control(precise, on) +#endif + +inline XMVECTOR XM_CALLCONV XMVectorIsNaN(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORU32 Control = { { { + XMISNAN(V.vector4_f32[0]) ? 
0xFFFFFFFFU : 0, + XMISNAN(V.vector4_f32[1]) ? 0xFFFFFFFFU : 0, + XMISNAN(V.vector4_f32[2]) ? 0xFFFFFFFFU : 0, + XMISNAN(V.vector4_f32[3]) ? 0xFFFFFFFFU : 0 + } } }; + return Control.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Test against itself. NaN is always not equal + uint32x4_t vTempNan = vceqq_f32(V, V); + // Flip results + return vreinterpretq_f32_u32(vmvnq_u32(vTempNan)); +#elif defined(_XM_SSE_INTRINSICS_) + // Test against itself. NaN is always not equal + return _mm_cmpneq_ps(V, V); +#endif +} + +#if !defined(_XM_NO_INTRINSICS_) && defined(_MSC_VER) && !defined(__clang__) && !defined(__INTEL_COMPILER) +#pragma float_control(pop) +#endif + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorIsInfinite(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORU32 Control = { { { + XMISINF(V.vector4_f32[0]) ? 0xFFFFFFFFU : 0, + XMISINF(V.vector4_f32[1]) ? 0xFFFFFFFFU : 0, + XMISINF(V.vector4_f32[2]) ? 0xFFFFFFFFU : 0, + XMISINF(V.vector4_f32[3]) ? 0xFFFFFFFFU : 0 + } } }; + return Control.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Mask off the sign bit + uint32x4_t vTemp = vandq_u32(vreinterpretq_u32_f32(V), g_XMAbsMask); + // Compare to infinity + vTemp = vceqq_f32(vreinterpretq_f32_u32(vTemp), g_XMInfinity); + // If any are infinity, the signs are true. + return vreinterpretq_f32_u32(vTemp); +#elif defined(_XM_SSE_INTRINSICS_) + // Mask off the sign bit + __m128 vTemp = _mm_and_ps(V, g_XMAbsMask); + // Compare to infinity + vTemp = _mm_cmpeq_ps(vTemp, g_XMInfinity); + // If any are infinity, the signs are true. + return vTemp; +#endif +} + +//------------------------------------------------------------------------------ +// Rounding and clamping operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorMin +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORF32 Result = { { { + (V1.vector4_f32[0] < V2.vector4_f32[0]) ? V1.vector4_f32[0] : V2.vector4_f32[0], + (V1.vector4_f32[1] < V2.vector4_f32[1]) ? V1.vector4_f32[1] : V2.vector4_f32[1], + (V1.vector4_f32[2] < V2.vector4_f32[2]) ? V1.vector4_f32[2] : V2.vector4_f32[2], + (V1.vector4_f32[3] < V2.vector4_f32[3]) ? V1.vector4_f32[3] : V2.vector4_f32[3] + } } }; + return Result.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vminq_f32(V1, V2); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_min_ps(V1, V2); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorMax +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORF32 Result = { { { + (V1.vector4_f32[0] > V2.vector4_f32[0]) ? V1.vector4_f32[0] : V2.vector4_f32[0], + (V1.vector4_f32[1] > V2.vector4_f32[1]) ? V1.vector4_f32[1] : V2.vector4_f32[1], + (V1.vector4_f32[2] > V2.vector4_f32[2]) ? V1.vector4_f32[2] : V2.vector4_f32[2], + (V1.vector4_f32[3] > V2.vector4_f32[3]) ? V1.vector4_f32[3] : V2.vector4_f32[3] + } } }; + return Result.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vmaxq_f32(V1, V2); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_max_ps(V1, V2); +#endif +} + +//------------------------------------------------------------------------------ + +namespace Internal +{ + // Round to nearest (even) a.k.a. 
banker's rounding + inline float round_to_nearest(float x) noexcept + { + float i = floorf(x); + x -= i; + if (x < 0.5f) + return i; + if (x > 0.5f) + return i + 1.f; + + float int_part; + (void)modff(i / 2.f, &int_part); + if ((2.f * int_part) == i) + { + return i; + } + + return i + 1.f; + } +} + +#if !defined(_XM_NO_INTRINSICS_) && defined(_MSC_VER) && !defined(__clang__) && !defined(__INTEL_COMPILER) +#pragma float_control(push) +#pragma float_control(precise, on) +#endif + +inline XMVECTOR XM_CALLCONV XMVectorRound(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORF32 Result = { { { + Internal::round_to_nearest(V.vector4_f32[0]), + Internal::round_to_nearest(V.vector4_f32[1]), + Internal::round_to_nearest(V.vector4_f32[2]), + Internal::round_to_nearest(V.vector4_f32[3]) + } } }; + return Result.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) +#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__ + return vrndnq_f32(V); +#else + uint32x4_t sign = vandq_u32(vreinterpretq_u32_f32(V), g_XMNegativeZero); + float32x4_t sMagic = vreinterpretq_f32_u32(vorrq_u32(g_XMNoFraction, sign)); + float32x4_t R1 = vaddq_f32(V, sMagic); + R1 = vsubq_f32(R1, sMagic); + float32x4_t R2 = vabsq_f32(V); + uint32x4_t mask = vcleq_f32(R2, g_XMNoFraction); + return vbslq_f32(mask, R1, V); +#endif +#elif defined(_XM_SSE4_INTRINSICS_) + return _mm_round_ps(V, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); +#elif defined(_XM_SSE_INTRINSICS_) + __m128 sign = _mm_and_ps(V, g_XMNegativeZero); + __m128 sMagic = _mm_or_ps(g_XMNoFraction, sign); + __m128 R1 = _mm_add_ps(V, sMagic); + R1 = _mm_sub_ps(R1, sMagic); + __m128 R2 = _mm_and_ps(V, g_XMAbsMask); + __m128 mask = _mm_cmple_ps(R2, g_XMNoFraction); + R2 = _mm_andnot_ps(mask, V); + R1 = _mm_and_ps(R1, mask); + XMVECTOR vResult = _mm_xor_ps(R1, R2); + return vResult; +#endif +} + +#if !defined(_XM_NO_INTRINSICS_) && defined(_MSC_VER) && !defined(__clang__) && !defined(__INTEL_COMPILER) +#pragma float_control(pop) +#endif + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorTruncate(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + uint32_t i; + + // Avoid C4701 + Result.vector4_f32[0] = 0.0f; + + for (i = 0; i < 4; i++) + { + if (XMISNAN(V.vector4_f32[i])) + { + Result.vector4_u32[i] = 0x7FC00000; + } + else if (fabsf(V.vector4_f32[i]) < 8388608.0f) + { + Result.vector4_f32[i] = static_cast(static_cast(V.vector4_f32[i])); + } + else + { + Result.vector4_f32[i] = V.vector4_f32[i]; + } + } + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) +#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__ + return vrndq_f32(V); +#else + float32x4_t vTest = vabsq_f32(V); + vTest = vreinterpretq_f32_u32(vcltq_f32(vTest, g_XMNoFraction)); + + int32x4_t vInt = vcvtq_s32_f32(V); + float32x4_t vResult = vcvtq_f32_s32(vInt); + + // All numbers less than 8388608 will use the round to int + // All others, use the ORIGINAL value + return vbslq_f32(vreinterpretq_u32_f32(vTest), vResult, V); +#endif +#elif defined(_XM_SSE4_INTRINSICS_) + return _mm_round_ps(V, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +#elif defined(_XM_SSE_INTRINSICS_) + // To handle NAN, INF and numbers greater than 8388608, use masking + // Get the abs value + __m128i vTest = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask); + // Test for greater than 8388608 (All floats with NO fractionals, NAN and INF + vTest = 
_mm_cmplt_epi32(vTest, g_XMNoFraction); + // Convert to int and back to float for rounding with truncation + __m128i vInt = _mm_cvttps_epi32(V); + // Convert back to floats + XMVECTOR vResult = _mm_cvtepi32_ps(vInt); + // All numbers less than 8388608 will use the round to int + vResult = _mm_and_ps(vResult, _mm_castsi128_ps(vTest)); + // All others, use the ORIGINAL value + vTest = _mm_andnot_si128(vTest, _mm_castps_si128(V)); + vResult = _mm_or_ps(vResult, _mm_castsi128_ps(vTest)); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorFloor(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + floorf(V.vector4_f32[0]), + floorf(V.vector4_f32[1]), + floorf(V.vector4_f32[2]), + floorf(V.vector4_f32[3]) + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) +#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__ + return vrndmq_f32(V); +#else + float32x4_t vTest = vabsq_f32(V); + vTest = vreinterpretq_f32_u32(vcltq_f32(vTest, g_XMNoFraction)); + // Truncate + int32x4_t vInt = vcvtq_s32_f32(V); + float32x4_t vResult = vcvtq_f32_s32(vInt); + uint32x4_t vLargerMask = vcgtq_f32(vResult, V); + // 0 -> 0, 0xffffffff -> -1.0f + float32x4_t vLarger = vcvtq_f32_s32(vreinterpretq_s32_u32(vLargerMask)); + vResult = vaddq_f32(vResult, vLarger); + // All numbers less than 8388608 will use the round to int + // All others, use the ORIGINAL value + return vbslq_f32(vreinterpretq_u32_f32(vTest), vResult, V); +#endif +#elif defined(_XM_SSE4_INTRINSICS_) + return _mm_floor_ps(V); +#elif defined(_XM_SSE_INTRINSICS_) + // To handle NAN, INF and numbers greater than 8388608, use masking + __m128i vTest = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask); + vTest = _mm_cmplt_epi32(vTest, g_XMNoFraction); + // Truncate + __m128i vInt = _mm_cvttps_epi32(V); + XMVECTOR vResult = _mm_cvtepi32_ps(vInt); + __m128 vLarger = _mm_cmpgt_ps(vResult, V); + // 0 -> 0, 0xffffffff -> -1.0f + vLarger = _mm_cvtepi32_ps(_mm_castps_si128(vLarger)); + vResult = _mm_add_ps(vResult, vLarger); + // All numbers less than 8388608 will use the round to int + vResult = _mm_and_ps(vResult, _mm_castsi128_ps(vTest)); + // All others, use the ORIGINAL value + vTest = _mm_andnot_si128(vTest, _mm_castps_si128(V)); + vResult = _mm_or_ps(vResult, _mm_castsi128_ps(vTest)); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorCeiling(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + ceilf(V.vector4_f32[0]), + ceilf(V.vector4_f32[1]), + ceilf(V.vector4_f32[2]), + ceilf(V.vector4_f32[3]) + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) +#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__ + return vrndpq_f32(V); +#else + float32x4_t vTest = vabsq_f32(V); + vTest = vreinterpretq_f32_u32(vcltq_f32(vTest, g_XMNoFraction)); + // Truncate + int32x4_t vInt = vcvtq_s32_f32(V); + float32x4_t vResult = vcvtq_f32_s32(vInt); + uint32x4_t vSmallerMask = vcltq_f32(vResult, V); + // 0 -> 0, 0xffffffff -> -1.0f + float32x4_t vSmaller = vcvtq_f32_s32(vreinterpretq_s32_u32(vSmallerMask)); + vResult = vsubq_f32(vResult, vSmaller); + // All numbers less than 8388608 will use the round to int + // All others, use the ORIGINAL value + return vbslq_f32(vreinterpretq_u32_f32(vTest), vResult, V); 
+#endif +#elif defined(_XM_SSE4_INTRINSICS_) + return _mm_ceil_ps(V); +#elif defined(_XM_SSE_INTRINSICS_) + // To handle NAN, INF and numbers greater than 8388608, use masking + __m128i vTest = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask); + vTest = _mm_cmplt_epi32(vTest, g_XMNoFraction); + // Truncate + __m128i vInt = _mm_cvttps_epi32(V); + XMVECTOR vResult = _mm_cvtepi32_ps(vInt); + __m128 vSmaller = _mm_cmplt_ps(vResult, V); + // 0 -> 0, 0xffffffff -> -1.0f + vSmaller = _mm_cvtepi32_ps(_mm_castps_si128(vSmaller)); + vResult = _mm_sub_ps(vResult, vSmaller); + // All numbers less than 8388608 will use the round to int + vResult = _mm_and_ps(vResult, _mm_castsi128_ps(vTest)); + // All others, use the ORIGINAL value + vTest = _mm_andnot_si128(vTest, _mm_castps_si128(V)); + vResult = _mm_or_ps(vResult, _mm_castsi128_ps(vTest)); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorClamp +( + FXMVECTOR V, + FXMVECTOR Min, + FXMVECTOR Max +) noexcept +{ + assert(XMVector4LessOrEqual(Min, Max)); + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result = XMVectorMax(Min, V); + Result = XMVectorMin(Max, Result); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t vResult = vmaxq_f32(Min, V); + vResult = vminq_f32(Max, vResult); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult; + vResult = _mm_max_ps(Min, V); + vResult = _mm_min_ps(Max, vResult); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorSaturate(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + const XMVECTOR Zero = XMVectorZero(); + + return XMVectorClamp(V, Zero, g_XMOne.v); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Set <0 to 0 + float32x4_t vResult = vmaxq_f32(V, vdupq_n_f32(0)); + // Set>1 to 1 + return vminq_f32(vResult, vdupq_n_f32(1.0f)); +#elif defined(_XM_SSE_INTRINSICS_) + // Set <0 to 0 + XMVECTOR vResult = _mm_max_ps(V, g_XMZero); + // Set>1 to 1 + return _mm_min_ps(vResult, g_XMOne); +#endif +} + +//------------------------------------------------------------------------------ +// Bitwise logical operations +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorAndInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORU32 Result = { { { + V1.vector4_u32[0] & V2.vector4_u32[0], + V1.vector4_u32[1] & V2.vector4_u32[1], + V1.vector4_u32[2] & V2.vector4_u32[2], + V1.vector4_u32[3] & V2.vector4_u32[3] + } } }; + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2))); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_and_ps(V1, V2); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorAndCInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORU32 Result = { { { + V1.vector4_u32[0] & ~V2.vector4_u32[0], + V1.vector4_u32[1] & ~V2.vector4_u32[1], + V1.vector4_u32[2] & ~V2.vector4_u32[2], + V1.vector4_u32[3] & ~V2.vector4_u32[3] + } } }; + return Result.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2))); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V 
= _mm_andnot_si128(_mm_castps_si128(V2), _mm_castps_si128(V1)); + return _mm_castsi128_ps(V); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorOrInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORU32 Result = { { { + V1.vector4_u32[0] | V2.vector4_u32[0], + V1.vector4_u32[1] | V2.vector4_u32[1], + V1.vector4_u32[2] | V2.vector4_u32[2], + V1.vector4_u32[3] | V2.vector4_u32[3] + } } }; + return Result.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2))); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_or_si128(_mm_castps_si128(V1), _mm_castps_si128(V2)); + return _mm_castsi128_ps(V); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorNorInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORU32 Result = { { { + ~(V1.vector4_u32[0] | V2.vector4_u32[0]), + ~(V1.vector4_u32[1] | V2.vector4_u32[1]), + ~(V1.vector4_u32[2] | V2.vector4_u32[2]), + ~(V1.vector4_u32[3] | V2.vector4_u32[3]) + } } }; + return Result.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t Result = vorrq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2)); + return vreinterpretq_f32_u32(vbicq_u32(g_XMNegOneMask, Result)); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i Result; + Result = _mm_or_si128(_mm_castps_si128(V1), _mm_castps_si128(V2)); + Result = _mm_andnot_si128(Result, g_XMNegOneMask); + return _mm_castsi128_ps(Result); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorXorInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORU32 Result = { { { + V1.vector4_u32[0] ^ V2.vector4_u32[0], + V1.vector4_u32[1] ^ V2.vector4_u32[1], + V1.vector4_u32[2] ^ V2.vector4_u32[2], + V1.vector4_u32[3] ^ V2.vector4_u32[3] + } } }; + return Result.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2))); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_xor_si128(_mm_castps_si128(V1), _mm_castps_si128(V2)); + return _mm_castsi128_ps(V); +#endif +} + +//------------------------------------------------------------------------------ +// Computation operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorNegate(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORF32 Result = { { { + -V.vector4_f32[0], + -V.vector4_f32[1], + -V.vector4_f32[2], + -V.vector4_f32[3] + } } }; + return Result.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vnegq_f32(V); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR Z; + + Z = _mm_setzero_ps(); + + return _mm_sub_ps(Z, V); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorAdd +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORF32 Result = { { { + V1.vector4_f32[0] + V2.vector4_f32[0], + V1.vector4_f32[1] + V2.vector4_f32[1], + V1.vector4_f32[2] + V2.vector4_f32[2], + V1.vector4_f32[3] + V2.vector4_f32[3] + } } }; + return Result.v; + 
+#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vaddq_f32(V1, V2); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_add_ps(V1, V2); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorSum(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORF32 Result; + Result.f[0] = + Result.f[1] = + Result.f[2] = + Result.f[3] = V.vector4_f32[0] + V.vector4_f32[1] + V.vector4_f32[2] + V.vector4_f32[3]; + return Result.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) +#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__ + float32x4_t vTemp = vpaddq_f32(V, V); + return vpaddq_f32(vTemp, vTemp); +#else + float32x2_t v1 = vget_low_f32(V); + float32x2_t v2 = vget_high_f32(V); + v1 = vadd_f32(v1, v2); + v1 = vpadd_f32(v1, v1); + return vcombine_f32(v1, v1); +#endif +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vTemp = _mm_hadd_ps(V, V); + return _mm_hadd_ps(vTemp, vTemp); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 3, 0, 1)); + XMVECTOR vTemp2 = _mm_add_ps(V, vTemp); + vTemp = XM_PERMUTE_PS(vTemp2, _MM_SHUFFLE(1, 0, 3, 2)); + return _mm_add_ps(vTemp, vTemp2); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorAddAngles +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + const XMVECTOR Zero = XMVectorZero(); + + // Add the given angles together. If the range of V1 is such + // that -Pi <= V1 < Pi and the range of V2 is such that + // -2Pi <= V2 <= 2Pi, then the range of the resulting angle + // will be -Pi <= Result < Pi. + XMVECTOR Result = XMVectorAdd(V1, V2); + + XMVECTOR Mask = XMVectorLess(Result, g_XMNegativePi.v); + XMVECTOR Offset = XMVectorSelect(Zero, g_XMTwoPi.v, Mask); + + Mask = XMVectorGreaterOrEqual(Result, g_XMPi.v); + Offset = XMVectorSelect(Offset, g_XMNegativeTwoPi.v, Mask); + + Result = XMVectorAdd(Result, Offset); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Adjust the angles + float32x4_t vResult = vaddq_f32(V1, V2); + // Less than Pi? + uint32x4_t vOffset = vcltq_f32(vResult, g_XMNegativePi); + vOffset = vandq_u32(vOffset, g_XMTwoPi); + // Add 2Pi to all entries less than -Pi + vResult = vaddq_f32(vResult, vreinterpretq_f32_u32(vOffset)); + // Greater than or equal to Pi? + vOffset = vcgeq_f32(vResult, g_XMPi); + vOffset = vandq_u32(vOffset, g_XMTwoPi); + // Sub 2Pi to all entries greater than Pi + vResult = vsubq_f32(vResult, vreinterpretq_f32_u32(vOffset)); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + // Adjust the angles + XMVECTOR vResult = _mm_add_ps(V1, V2); + // Less than Pi? + XMVECTOR vOffset = _mm_cmplt_ps(vResult, g_XMNegativePi); + vOffset = _mm_and_ps(vOffset, g_XMTwoPi); + // Add 2Pi to all entries less than -Pi + vResult = _mm_add_ps(vResult, vOffset); + // Greater than or equal to Pi? 
+ vOffset = _mm_cmpge_ps(vResult, g_XMPi); + vOffset = _mm_and_ps(vOffset, g_XMTwoPi); + // Sub 2Pi to all entries greater than Pi + vResult = _mm_sub_ps(vResult, vOffset); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorSubtract +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORF32 Result = { { { + V1.vector4_f32[0] - V2.vector4_f32[0], + V1.vector4_f32[1] - V2.vector4_f32[1], + V1.vector4_f32[2] - V2.vector4_f32[2], + V1.vector4_f32[3] - V2.vector4_f32[3] + } } }; + return Result.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vsubq_f32(V1, V2); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_sub_ps(V1, V2); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorSubtractAngles +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + const XMVECTOR Zero = XMVectorZero(); + + // Subtract the given angles. If the range of V1 is such + // that -Pi <= V1 < Pi and the range of V2 is such that + // -2Pi <= V2 <= 2Pi, then the range of the resulting angle + // will be -Pi <= Result < Pi. + XMVECTOR Result = XMVectorSubtract(V1, V2); + + XMVECTOR Mask = XMVectorLess(Result, g_XMNegativePi.v); + XMVECTOR Offset = XMVectorSelect(Zero, g_XMTwoPi.v, Mask); + + Mask = XMVectorGreaterOrEqual(Result, g_XMPi.v); + Offset = XMVectorSelect(Offset, g_XMNegativeTwoPi.v, Mask); + + Result = XMVectorAdd(Result, Offset); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Adjust the angles + XMVECTOR vResult = vsubq_f32(V1, V2); + // Less than Pi? + uint32x4_t vOffset = vcltq_f32(vResult, g_XMNegativePi); + vOffset = vandq_u32(vOffset, g_XMTwoPi); + // Add 2Pi to all entries less than -Pi + vResult = vaddq_f32(vResult, vreinterpretq_f32_u32(vOffset)); + // Greater than or equal to Pi? + vOffset = vcgeq_f32(vResult, g_XMPi); + vOffset = vandq_u32(vOffset, g_XMTwoPi); + // Sub 2Pi to all entries greater than Pi + vResult = vsubq_f32(vResult, vreinterpretq_f32_u32(vOffset)); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + // Adjust the angles + XMVECTOR vResult = _mm_sub_ps(V1, V2); + // Less than Pi? + XMVECTOR vOffset = _mm_cmplt_ps(vResult, g_XMNegativePi); + vOffset = _mm_and_ps(vOffset, g_XMTwoPi); + // Add 2Pi to all entries less than -Pi + vResult = _mm_add_ps(vResult, vOffset); + // Greater than or equal to Pi? 
+ vOffset = _mm_cmpge_ps(vResult, g_XMPi); + vOffset = _mm_and_ps(vOffset, g_XMTwoPi); + // Sub 2Pi to all entries greater than Pi + vResult = _mm_sub_ps(vResult, vOffset); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorMultiply +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + V1.vector4_f32[0] * V2.vector4_f32[0], + V1.vector4_f32[1] * V2.vector4_f32[1], + V1.vector4_f32[2] * V2.vector4_f32[2], + V1.vector4_f32[3] * V2.vector4_f32[3] + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vmulq_f32(V1, V2); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_mul_ps(V1, V2); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorMultiplyAdd +( + FXMVECTOR V1, + FXMVECTOR V2, + FXMVECTOR V3 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + V1.vector4_f32[0] * V2.vector4_f32[0] + V3.vector4_f32[0], + V1.vector4_f32[1] * V2.vector4_f32[1] + V3.vector4_f32[1], + V1.vector4_f32[2] * V2.vector4_f32[2] + V3.vector4_f32[2], + V1.vector4_f32[3] * V2.vector4_f32[3] + V3.vector4_f32[3] + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) +#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__ + return vfmaq_f32(V3, V1, V2); +#else + return vmlaq_f32(V3, V1, V2); +#endif +#elif defined(_XM_SSE_INTRINSICS_) + return XM_FMADD_PS(V1, V2, V3); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorDivide +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + V1.vector4_f32[0] / V2.vector4_f32[0], + V1.vector4_f32[1] / V2.vector4_f32[1], + V1.vector4_f32[2] / V2.vector4_f32[2], + V1.vector4_f32[3] / V2.vector4_f32[3] + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) +#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__ + return vdivq_f32(V1, V2); +#else + // 2 iterations of Newton-Raphson refinement of reciprocal + float32x4_t Reciprocal = vrecpeq_f32(V2); + float32x4_t S = vrecpsq_f32(Reciprocal, V2); + Reciprocal = vmulq_f32(S, Reciprocal); + S = vrecpsq_f32(Reciprocal, V2); + Reciprocal = vmulq_f32(S, Reciprocal); + return vmulq_f32(V1, Reciprocal); +#endif +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_div_ps(V1, V2); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorNegativeMultiplySubtract +( + FXMVECTOR V1, + FXMVECTOR V2, + FXMVECTOR V3 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + V3.vector4_f32[0] - (V1.vector4_f32[0] * V2.vector4_f32[0]), + V3.vector4_f32[1] - (V1.vector4_f32[1] * V2.vector4_f32[1]), + V3.vector4_f32[2] - (V1.vector4_f32[2] * V2.vector4_f32[2]), + V3.vector4_f32[3] - (V1.vector4_f32[3] * V2.vector4_f32[3]) + } } }; + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) +#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__ + return vfmsq_f32(V3, V1, V2); +#else + return vmlsq_f32(V3, V1, V2); +#endif +#elif defined(_XM_SSE_INTRINSICS_) + return XM_FNMADD_PS(V1, V2, V3); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR 
XM_CALLCONV XMVectorScale +( + FXMVECTOR V, + float ScaleFactor +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + V.vector4_f32[0] * ScaleFactor, + V.vector4_f32[1] * ScaleFactor, + V.vector4_f32[2] * ScaleFactor, + V.vector4_f32[3] * ScaleFactor + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vmulq_n_f32(V, ScaleFactor); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = _mm_set_ps1(ScaleFactor); + return _mm_mul_ps(vResult, V); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorReciprocalEst(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + 1.f / V.vector4_f32[0], + 1.f / V.vector4_f32[1], + 1.f / V.vector4_f32[2], + 1.f / V.vector4_f32[3] + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vrecpeq_f32(V); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_rcp_ps(V); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorReciprocal(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + 1.f / V.vector4_f32[0], + 1.f / V.vector4_f32[1], + 1.f / V.vector4_f32[2], + 1.f / V.vector4_f32[3] + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) +#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__ + float32x4_t one = vdupq_n_f32(1.0f); + return vdivq_f32(one, V); +#else + // 2 iterations of Newton-Raphson refinement + float32x4_t Reciprocal = vrecpeq_f32(V); + float32x4_t S = vrecpsq_f32(Reciprocal, V); + Reciprocal = vmulq_f32(S, Reciprocal); + S = vrecpsq_f32(Reciprocal, V); + return vmulq_f32(S, Reciprocal); +#endif +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_div_ps(g_XMOne, V); +#endif +} + +//------------------------------------------------------------------------------ +// Return an estimated square root +inline XMVECTOR XM_CALLCONV XMVectorSqrtEst(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + sqrtf(V.vector4_f32[0]), + sqrtf(V.vector4_f32[1]), + sqrtf(V.vector4_f32[2]), + sqrtf(V.vector4_f32[3]) + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // 1 iteration of Newton-Raphson refinment of sqrt + float32x4_t S0 = vrsqrteq_f32(V); + float32x4_t P0 = vmulq_f32(V, S0); + float32x4_t R0 = vrsqrtsq_f32(P0, S0); + float32x4_t S1 = vmulq_f32(S0, R0); + + XMVECTOR VEqualsInfinity = XMVectorEqualInt(V, g_XMInfinity.v); + XMVECTOR VEqualsZero = XMVectorEqual(V, vdupq_n_f32(0)); + XMVECTOR Result = vmulq_f32(V, S1); + XMVECTOR Select = XMVectorEqualInt(VEqualsInfinity, VEqualsZero); + return XMVectorSelect(V, Result, Select); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_sqrt_ps(V); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorSqrt(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + sqrtf(V.vector4_f32[0]), + sqrtf(V.vector4_f32[1]), + sqrtf(V.vector4_f32[2]), + sqrtf(V.vector4_f32[3]) + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // 3 iterations of Newton-Raphson refinment of sqrt + float32x4_t S0 = vrsqrteq_f32(V); + float32x4_t P0 = vmulq_f32(V, S0); + float32x4_t R0 = vrsqrtsq_f32(P0, S0); + float32x4_t S1 = vmulq_f32(S0, R0); + float32x4_t P1 = vmulq_f32(V, S1); + float32x4_t R1 = vrsqrtsq_f32(P1, S1); + 
float32x4_t S2 = vmulq_f32(S1, R1); + float32x4_t P2 = vmulq_f32(V, S2); + float32x4_t R2 = vrsqrtsq_f32(P2, S2); + float32x4_t S3 = vmulq_f32(S2, R2); + + XMVECTOR VEqualsInfinity = XMVectorEqualInt(V, g_XMInfinity.v); + XMVECTOR VEqualsZero = XMVectorEqual(V, vdupq_n_f32(0)); + XMVECTOR Result = vmulq_f32(V, S3); + XMVECTOR Select = XMVectorEqualInt(VEqualsInfinity, VEqualsZero); + return XMVectorSelect(V, Result, Select); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_sqrt_ps(V); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorReciprocalSqrtEst(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + 1.f / sqrtf(V.vector4_f32[0]), + 1.f / sqrtf(V.vector4_f32[1]), + 1.f / sqrtf(V.vector4_f32[2]), + 1.f / sqrtf(V.vector4_f32[3]) + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vrsqrteq_f32(V); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_rsqrt_ps(V); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorReciprocalSqrt(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + 1.f / sqrtf(V.vector4_f32[0]), + 1.f / sqrtf(V.vector4_f32[1]), + 1.f / sqrtf(V.vector4_f32[2]), + 1.f / sqrtf(V.vector4_f32[3]) + } } }; + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // 2 iterations of Newton-Raphson refinement of reciprocal + float32x4_t S0 = vrsqrteq_f32(V); + + float32x4_t P0 = vmulq_f32(V, S0); + float32x4_t R0 = vrsqrtsq_f32(P0, S0); + + float32x4_t S1 = vmulq_f32(S0, R0); + float32x4_t P1 = vmulq_f32(V, S1); + float32x4_t R1 = vrsqrtsq_f32(P1, S1); + + return vmulq_f32(S1, R1); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = _mm_sqrt_ps(V); + vResult = _mm_div_ps(g_XMOne, vResult); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorExp2(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + exp2f(V.vector4_f32[0]), + exp2f(V.vector4_f32[1]), + exp2f(V.vector4_f32[2]), + exp2f(V.vector4_f32[3]) + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + int32x4_t itrunc = vcvtq_s32_f32(V); + float32x4_t ftrunc = vcvtq_f32_s32(itrunc); + float32x4_t y = vsubq_f32(V, ftrunc); + + float32x4_t poly = vmlaq_f32(g_XMExpEst6, g_XMExpEst7, y); + poly = vmlaq_f32(g_XMExpEst5, poly, y); + poly = vmlaq_f32(g_XMExpEst4, poly, y); + poly = vmlaq_f32(g_XMExpEst3, poly, y); + poly = vmlaq_f32(g_XMExpEst2, poly, y); + poly = vmlaq_f32(g_XMExpEst1, poly, y); + poly = vmlaq_f32(g_XMOne, poly, y); + + int32x4_t biased = vaddq_s32(itrunc, g_XMExponentBias); + biased = vshlq_n_s32(biased, 23); + float32x4_t result0 = XMVectorDivide(vreinterpretq_f32_s32(biased), poly); + + biased = vaddq_s32(itrunc, g_XM253); + biased = vshlq_n_s32(biased, 23); + float32x4_t result1 = XMVectorDivide(vreinterpretq_f32_s32(biased), poly); + result1 = vmulq_f32(g_XMMinNormal.v, result1); + + // Use selection to handle the cases + // if (V is NaN) -> QNaN; + // else if (V sign bit set) + // if (V > -150) + // if (V.exponent < -126) -> result1 + // else -> result0 + // else -> +0 + // else + // if (V < 128) -> result0 + // else -> +inf + + uint32x4_t comp = vcltq_s32(vreinterpretq_s32_f32(V), g_XMBin128); + float32x4_t result2 = vbslq_f32(comp, result0, g_XMInfinity); + + comp = vcltq_s32(itrunc, 
g_XMSubnormalExponent); + float32x4_t result3 = vbslq_f32(comp, result1, result0); + + comp = vcltq_s32(vreinterpretq_s32_f32(V), g_XMBinNeg150); + float32x4_t result4 = vbslq_f32(comp, result3, g_XMZero); + + int32x4_t sign = vandq_s32(vreinterpretq_s32_f32(V), g_XMNegativeZero); + comp = vceqq_s32(sign, g_XMNegativeZero); + float32x4_t result5 = vbslq_f32(comp, result4, result2); + + int32x4_t t0 = vandq_s32(vreinterpretq_s32_f32(V), g_XMQNaNTest); + int32x4_t t1 = vandq_s32(vreinterpretq_s32_f32(V), g_XMInfinity); + t0 = vreinterpretq_s32_u32(vceqq_s32(t0, g_XMZero)); + t1 = vreinterpretq_s32_u32(vceqq_s32(t1, g_XMInfinity)); + int32x4_t isNaN = vbicq_s32(t1, t0); + + float32x4_t vResult = vbslq_f32(vreinterpretq_u32_s32(isNaN), g_XMQNaN, result5); + return vResult; +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_exp2_ps(V); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + __m128i itrunc = _mm_cvttps_epi32(V); + __m128 ftrunc = _mm_cvtepi32_ps(itrunc); + __m128 y = _mm_sub_ps(V, ftrunc); + + __m128 poly = XM_FMADD_PS(g_XMExpEst7, y, g_XMExpEst6); + poly = XM_FMADD_PS(poly, y, g_XMExpEst5); + poly = XM_FMADD_PS(poly, y, g_XMExpEst4); + poly = XM_FMADD_PS(poly, y, g_XMExpEst3); + poly = XM_FMADD_PS(poly, y, g_XMExpEst2); + poly = XM_FMADD_PS(poly, y, g_XMExpEst1); + poly = XM_FMADD_PS(poly, y, g_XMOne); + + __m128i biased = _mm_add_epi32(itrunc, g_XMExponentBias); + biased = _mm_slli_epi32(biased, 23); + __m128 result0 = _mm_div_ps(_mm_castsi128_ps(biased), poly); + + biased = _mm_add_epi32(itrunc, g_XM253); + biased = _mm_slli_epi32(biased, 23); + __m128 result1 = _mm_div_ps(_mm_castsi128_ps(biased), poly); + result1 = _mm_mul_ps(g_XMMinNormal.v, result1); + + // Use selection to handle the cases + // if (V is NaN) -> QNaN; + // else if (V sign bit set) + // if (V > -150) + // if (V.exponent < -126) -> result1 + // else -> result0 + // else -> +0 + // else + // if (V < 128) -> result0 + // else -> +inf + + __m128i comp = _mm_cmplt_epi32(_mm_castps_si128(V), g_XMBin128); + __m128i select0 = _mm_and_si128(comp, _mm_castps_si128(result0)); + __m128i select1 = _mm_andnot_si128(comp, g_XMInfinity); + __m128i result2 = _mm_or_si128(select0, select1); + + comp = _mm_cmplt_epi32(itrunc, g_XMSubnormalExponent); + select1 = _mm_and_si128(comp, _mm_castps_si128(result1)); + select0 = _mm_andnot_si128(comp, _mm_castps_si128(result0)); + __m128i result3 = _mm_or_si128(select0, select1); + + comp = _mm_cmplt_epi32(_mm_castps_si128(V), g_XMBinNeg150); + select0 = _mm_and_si128(comp, result3); + select1 = _mm_andnot_si128(comp, g_XMZero); + __m128i result4 = _mm_or_si128(select0, select1); + + __m128i sign = _mm_and_si128(_mm_castps_si128(V), g_XMNegativeZero); + comp = _mm_cmpeq_epi32(sign, g_XMNegativeZero); + select0 = _mm_and_si128(comp, result4); + select1 = _mm_andnot_si128(comp, result2); + __m128i result5 = _mm_or_si128(select0, select1); + + __m128i t0 = _mm_and_si128(_mm_castps_si128(V), g_XMQNaNTest); + __m128i t1 = _mm_and_si128(_mm_castps_si128(V), g_XMInfinity); + t0 = _mm_cmpeq_epi32(t0, g_XMZero); + t1 = _mm_cmpeq_epi32(t1, g_XMInfinity); + __m128i isNaN = _mm_andnot_si128(t0, t1); + + select0 = _mm_and_si128(isNaN, g_XMQNaN); + select1 = _mm_andnot_si128(isNaN, result5); + __m128i vResult = _mm_or_si128(select0, select1); + + return _mm_castsi128_ps(vResult); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorExp10(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + 
XMVECTORF32 Result = { { { + powf(10.0f, V.vector4_f32[0]), + powf(10.0f, V.vector4_f32[1]), + powf(10.0f, V.vector4_f32[2]), + powf(10.0f, V.vector4_f32[3]) + } } }; + return Result.v; + +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_exp10_ps(V); + return Result; +#else + // exp10(V) = exp2(vin*log2(10)) + XMVECTOR Vten = XMVectorMultiply(g_XMLg10, V); + return XMVectorExp2(Vten); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorExpE(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORF32 Result = { { { + expf(V.vector4_f32[0]), + expf(V.vector4_f32[1]), + expf(V.vector4_f32[2]), + expf(V.vector4_f32[3]) + } } }; + return Result.v; + +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_exp_ps(V); + return Result; +#else + // expE(V) = exp2(vin*log2(e)) + XMVECTOR Ve = XMVectorMultiply(g_XMLgE, V); + return XMVectorExp2(Ve); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorExp(FXMVECTOR V) noexcept +{ + return XMVectorExp2(V); +} + +//------------------------------------------------------------------------------ + +#if defined(_XM_SSE_INTRINSICS_) + +namespace Internal +{ + inline __m128i multi_sll_epi32(__m128i value, __m128i count) noexcept + { + __m128i v = _mm_shuffle_epi32(value, _MM_SHUFFLE(0, 0, 0, 0)); + __m128i c = _mm_shuffle_epi32(count, _MM_SHUFFLE(0, 0, 0, 0)); + c = _mm_and_si128(c, g_XMMaskX); + __m128i r0 = _mm_sll_epi32(v, c); + + v = _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 1, 1, 1)); + c = _mm_shuffle_epi32(count, _MM_SHUFFLE(1, 1, 1, 1)); + c = _mm_and_si128(c, g_XMMaskX); + __m128i r1 = _mm_sll_epi32(v, c); + + v = _mm_shuffle_epi32(value, _MM_SHUFFLE(2, 2, 2, 2)); + c = _mm_shuffle_epi32(count, _MM_SHUFFLE(2, 2, 2, 2)); + c = _mm_and_si128(c, g_XMMaskX); + __m128i r2 = _mm_sll_epi32(v, c); + + v = _mm_shuffle_epi32(value, _MM_SHUFFLE(3, 3, 3, 3)); + c = _mm_shuffle_epi32(count, _MM_SHUFFLE(3, 3, 3, 3)); + c = _mm_and_si128(c, g_XMMaskX); + __m128i r3 = _mm_sll_epi32(v, c); + + // (r0,r0,r1,r1) + __m128 r01 = _mm_shuffle_ps(_mm_castsi128_ps(r0), _mm_castsi128_ps(r1), _MM_SHUFFLE(0, 0, 0, 0)); + // (r2,r2,r3,r3) + __m128 r23 = _mm_shuffle_ps(_mm_castsi128_ps(r2), _mm_castsi128_ps(r3), _MM_SHUFFLE(0, 0, 0, 0)); + // (r0,r1,r2,r3) + __m128 result = _mm_shuffle_ps(r01, r23, _MM_SHUFFLE(2, 0, 2, 0)); + return _mm_castps_si128(result); + } + + inline __m128i multi_srl_epi32(__m128i value, __m128i count) noexcept + { + __m128i v = _mm_shuffle_epi32(value, _MM_SHUFFLE(0, 0, 0, 0)); + __m128i c = _mm_shuffle_epi32(count, _MM_SHUFFLE(0, 0, 0, 0)); + c = _mm_and_si128(c, g_XMMaskX); + __m128i r0 = _mm_srl_epi32(v, c); + + v = _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 1, 1, 1)); + c = _mm_shuffle_epi32(count, _MM_SHUFFLE(1, 1, 1, 1)); + c = _mm_and_si128(c, g_XMMaskX); + __m128i r1 = _mm_srl_epi32(v, c); + + v = _mm_shuffle_epi32(value, _MM_SHUFFLE(2, 2, 2, 2)); + c = _mm_shuffle_epi32(count, _MM_SHUFFLE(2, 2, 2, 2)); + c = _mm_and_si128(c, g_XMMaskX); + __m128i r2 = _mm_srl_epi32(v, c); + + v = _mm_shuffle_epi32(value, _MM_SHUFFLE(3, 3, 3, 3)); + c = _mm_shuffle_epi32(count, _MM_SHUFFLE(3, 3, 3, 3)); + c = _mm_and_si128(c, g_XMMaskX); + __m128i r3 = _mm_srl_epi32(v, c); + + // (r0,r0,r1,r1) + __m128 r01 = _mm_shuffle_ps(_mm_castsi128_ps(r0), _mm_castsi128_ps(r1), _MM_SHUFFLE(0, 0, 0, 0)); + // (r2,r2,r3,r3) + __m128 r23 = _mm_shuffle_ps(_mm_castsi128_ps(r2), 
_mm_castsi128_ps(r3), _MM_SHUFFLE(0, 0, 0, 0)); + // (r0,r1,r2,r3) + __m128 result = _mm_shuffle_ps(r01, r23, _MM_SHUFFLE(2, 0, 2, 0)); + return _mm_castps_si128(result); + } + + inline __m128i GetLeadingBit(const __m128i value) noexcept + { + static const XMVECTORI32 g_XM0000FFFF = { { { 0x0000FFFF, 0x0000FFFF, 0x0000FFFF, 0x0000FFFF } } }; + static const XMVECTORI32 g_XM000000FF = { { { 0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF } } }; + static const XMVECTORI32 g_XM0000000F = { { { 0x0000000F, 0x0000000F, 0x0000000F, 0x0000000F } } }; + static const XMVECTORI32 g_XM00000003 = { { { 0x00000003, 0x00000003, 0x00000003, 0x00000003 } } }; + + __m128i v = value, r, c, b, s; + + c = _mm_cmpgt_epi32(v, g_XM0000FFFF); // c = (v > 0xFFFF) + b = _mm_srli_epi32(c, 31); // b = (c ? 1 : 0) + r = _mm_slli_epi32(b, 4); // r = (b << 4) + v = multi_srl_epi32(v, r); // v = (v >> r) + + c = _mm_cmpgt_epi32(v, g_XM000000FF); // c = (v > 0xFF) + b = _mm_srli_epi32(c, 31); // b = (c ? 1 : 0) + s = _mm_slli_epi32(b, 3); // s = (b << 3) + v = multi_srl_epi32(v, s); // v = (v >> s) + r = _mm_or_si128(r, s); // r = (r | s) + + c = _mm_cmpgt_epi32(v, g_XM0000000F); // c = (v > 0xF) + b = _mm_srli_epi32(c, 31); // b = (c ? 1 : 0) + s = _mm_slli_epi32(b, 2); // s = (b << 2) + v = multi_srl_epi32(v, s); // v = (v >> s) + r = _mm_or_si128(r, s); // r = (r | s) + + c = _mm_cmpgt_epi32(v, g_XM00000003); // c = (v > 0x3) + b = _mm_srli_epi32(c, 31); // b = (c ? 1 : 0) + s = _mm_slli_epi32(b, 1); // s = (b << 1) + v = multi_srl_epi32(v, s); // v = (v >> s) + r = _mm_or_si128(r, s); // r = (r | s) + + s = _mm_srli_epi32(v, 1); + r = _mm_or_si128(r, s); + return r; + } +} // namespace Internal + +#endif // _XM_SSE_INTRINSICS_ + +#if defined(_XM_ARM_NEON_INTRINSICS_) + +namespace Internal +{ + inline int32x4_t GetLeadingBit(const int32x4_t value) noexcept + { + static const XMVECTORI32 g_XM0000FFFF = { { { 0x0000FFFF, 0x0000FFFF, 0x0000FFFF, 0x0000FFFF } } }; + static const XMVECTORI32 g_XM000000FF = { { { 0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF } } }; + static const XMVECTORI32 g_XM0000000F = { { { 0x0000000F, 0x0000000F, 0x0000000F, 0x0000000F } } }; + static const XMVECTORI32 g_XM00000003 = { { { 0x00000003, 0x00000003, 0x00000003, 0x00000003 } } }; + + uint32x4_t c = vcgtq_s32(value, g_XM0000FFFF); // c = (v > 0xFFFF) + int32x4_t b = vshrq_n_s32(vreinterpretq_s32_u32(c), 31); // b = (c ? 1 : 0) + int32x4_t r = vshlq_n_s32(b, 4); // r = (b << 4) + r = vnegq_s32(r); + int32x4_t v = vshlq_s32(value, r); // v = (v >> r) + + c = vcgtq_s32(v, g_XM000000FF); // c = (v > 0xFF) + b = vshrq_n_s32(vreinterpretq_s32_u32(c), 31); // b = (c ? 1 : 0) + int32x4_t s = vshlq_n_s32(b, 3); // s = (b << 3) + s = vnegq_s32(s); + v = vshlq_s32(v, s); // v = (v >> s) + r = vorrq_s32(r, s); // r = (r | s) + + c = vcgtq_s32(v, g_XM0000000F); // c = (v > 0xF) + b = vshrq_n_s32(vreinterpretq_s32_u32(c), 31); // b = (c ? 1 : 0) + s = vshlq_n_s32(b, 2); // s = (b << 2) + s = vnegq_s32(s); + v = vshlq_s32(v, s); // v = (v >> s) + r = vorrq_s32(r, s); // r = (r | s) + + c = vcgtq_s32(v, g_XM00000003); // c = (v > 0x3) + b = vshrq_n_s32(vreinterpretq_s32_u32(c), 31); // b = (c ? 
1 : 0) + s = vshlq_n_s32(b, 1); // s = (b << 1) + s = vnegq_s32(s); + v = vshlq_s32(v, s); // v = (v >> s) + r = vorrq_s32(r, s); // r = (r | s) + + s = vshrq_n_s32(v, 1); + r = vorrq_s32(r, s); + return r; + } + +} // namespace Internal + +#endif + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorLog2(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + log2f(V.vector4_f32[0]), + log2f(V.vector4_f32[1]), + log2f(V.vector4_f32[2]), + log2f(V.vector4_f32[3]) + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + int32x4_t rawBiased = vandq_s32(vreinterpretq_s32_f32(V), g_XMInfinity); + int32x4_t trailing = vandq_s32(vreinterpretq_s32_f32(V), g_XMQNaNTest); + uint32x4_t isExponentZero = vceqq_s32(vreinterpretq_s32_f32(g_XMZero), rawBiased); + + // Compute exponent and significand for normals. + int32x4_t biased = vshrq_n_s32(rawBiased, 23); + int32x4_t exponentNor = vsubq_s32(biased, g_XMExponentBias); + int32x4_t trailingNor = trailing; + + // Compute exponent and significand for subnormals. + int32x4_t leading = Internal::GetLeadingBit(trailing); + int32x4_t shift = vsubq_s32(g_XMNumTrailing, leading); + int32x4_t exponentSub = vsubq_s32(g_XMSubnormalExponent, shift); + int32x4_t trailingSub = vshlq_s32(trailing, shift); + trailingSub = vandq_s32(trailingSub, g_XMQNaNTest); + int32x4_t e = vbslq_s32(isExponentZero, exponentSub, exponentNor); + int32x4_t t = vbslq_s32(isExponentZero, trailingSub, trailingNor); + + // Compute the approximation. + int32x4_t tmp = vorrq_s32(vreinterpretq_s32_f32(g_XMOne), t); + float32x4_t y = vsubq_f32(vreinterpretq_f32_s32(tmp), g_XMOne); + + float32x4_t log2 = vmlaq_f32(g_XMLogEst6, g_XMLogEst7, y); + log2 = vmlaq_f32(g_XMLogEst5, log2, y); + log2 = vmlaq_f32(g_XMLogEst4, log2, y); + log2 = vmlaq_f32(g_XMLogEst3, log2, y); + log2 = vmlaq_f32(g_XMLogEst2, log2, y); + log2 = vmlaq_f32(g_XMLogEst1, log2, y); + log2 = vmlaq_f32(g_XMLogEst0, log2, y); + log2 = vmlaq_f32(vcvtq_f32_s32(e), log2, y); + + // if (x is NaN) -> QNaN + // else if (V is positive) + // if (V is infinite) -> +inf + // else -> log2(V) + // else + // if (V is zero) -> -inf + // else -> -QNaN + + uint32x4_t isInfinite = vandq_u32(vreinterpretq_u32_f32(V), g_XMAbsMask); + isInfinite = vceqq_u32(isInfinite, g_XMInfinity); + + uint32x4_t isGreaterZero = vcgtq_f32(V, g_XMZero); + uint32x4_t isNotFinite = vcgtq_f32(V, g_XMInfinity); + uint32x4_t isPositive = vbicq_u32(isGreaterZero, isNotFinite); + + uint32x4_t isZero = vandq_u32(vreinterpretq_u32_f32(V), g_XMAbsMask); + isZero = vceqq_u32(isZero, g_XMZero); + + uint32x4_t t0 = vandq_u32(vreinterpretq_u32_f32(V), g_XMQNaNTest); + uint32x4_t t1 = vandq_u32(vreinterpretq_u32_f32(V), g_XMInfinity); + t0 = vceqq_u32(t0, g_XMZero); + t1 = vceqq_u32(t1, g_XMInfinity); + uint32x4_t isNaN = vbicq_u32(t1, t0); + + float32x4_t result = vbslq_f32(isInfinite, g_XMInfinity, log2); + float32x4_t tmp2 = vbslq_f32(isZero, g_XMNegInfinity, g_XMNegQNaN); + result = vbslq_f32(isPositive, result, tmp2); + result = vbslq_f32(isNaN, g_XMQNaN, result); + return result; +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_log2_ps(V); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + __m128i rawBiased = _mm_and_si128(_mm_castps_si128(V), g_XMInfinity); + __m128i trailing = _mm_and_si128(_mm_castps_si128(V), g_XMQNaNTest); + __m128i isExponentZero = _mm_cmpeq_epi32(g_XMZero, rawBiased); + + // Compute exponent and significand for normals. 
+ __m128i biased = _mm_srli_epi32(rawBiased, 23); + __m128i exponentNor = _mm_sub_epi32(biased, g_XMExponentBias); + __m128i trailingNor = trailing; + + // Compute exponent and significand for subnormals. + __m128i leading = Internal::GetLeadingBit(trailing); + __m128i shift = _mm_sub_epi32(g_XMNumTrailing, leading); + __m128i exponentSub = _mm_sub_epi32(g_XMSubnormalExponent, shift); + __m128i trailingSub = Internal::multi_sll_epi32(trailing, shift); + trailingSub = _mm_and_si128(trailingSub, g_XMQNaNTest); + + __m128i select0 = _mm_and_si128(isExponentZero, exponentSub); + __m128i select1 = _mm_andnot_si128(isExponentZero, exponentNor); + __m128i e = _mm_or_si128(select0, select1); + + select0 = _mm_and_si128(isExponentZero, trailingSub); + select1 = _mm_andnot_si128(isExponentZero, trailingNor); + __m128i t = _mm_or_si128(select0, select1); + + // Compute the approximation. + __m128i tmp = _mm_or_si128(g_XMOne, t); + __m128 y = _mm_sub_ps(_mm_castsi128_ps(tmp), g_XMOne); + + __m128 log2 = XM_FMADD_PS(g_XMLogEst7, y, g_XMLogEst6); + log2 = XM_FMADD_PS(log2, y, g_XMLogEst5); + log2 = XM_FMADD_PS(log2, y, g_XMLogEst4); + log2 = XM_FMADD_PS(log2, y, g_XMLogEst3); + log2 = XM_FMADD_PS(log2, y, g_XMLogEst2); + log2 = XM_FMADD_PS(log2, y, g_XMLogEst1); + log2 = XM_FMADD_PS(log2, y, g_XMLogEst0); + log2 = XM_FMADD_PS(log2, y, _mm_cvtepi32_ps(e)); + + // if (x is NaN) -> QNaN + // else if (V is positive) + // if (V is infinite) -> +inf + // else -> log2(V) + // else + // if (V is zero) -> -inf + // else -> -QNaN + + __m128i isInfinite = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask); + isInfinite = _mm_cmpeq_epi32(isInfinite, g_XMInfinity); + + __m128i isGreaterZero = _mm_cmpgt_epi32(_mm_castps_si128(V), g_XMZero); + __m128i isNotFinite = _mm_cmpgt_epi32(_mm_castps_si128(V), g_XMInfinity); + __m128i isPositive = _mm_andnot_si128(isNotFinite, isGreaterZero); + + __m128i isZero = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask); + isZero = _mm_cmpeq_epi32(isZero, g_XMZero); + + __m128i t0 = _mm_and_si128(_mm_castps_si128(V), g_XMQNaNTest); + __m128i t1 = _mm_and_si128(_mm_castps_si128(V), g_XMInfinity); + t0 = _mm_cmpeq_epi32(t0, g_XMZero); + t1 = _mm_cmpeq_epi32(t1, g_XMInfinity); + __m128i isNaN = _mm_andnot_si128(t0, t1); + + select0 = _mm_and_si128(isInfinite, g_XMInfinity); + select1 = _mm_andnot_si128(isInfinite, _mm_castps_si128(log2)); + __m128i result = _mm_or_si128(select0, select1); + + select0 = _mm_and_si128(isZero, g_XMNegInfinity); + select1 = _mm_andnot_si128(isZero, g_XMNegQNaN); + tmp = _mm_or_si128(select0, select1); + + select0 = _mm_and_si128(isPositive, result); + select1 = _mm_andnot_si128(isPositive, tmp); + result = _mm_or_si128(select0, select1); + + select0 = _mm_and_si128(isNaN, g_XMQNaN); + select1 = _mm_andnot_si128(isNaN, result); + result = _mm_or_si128(select0, select1); + + return _mm_castsi128_ps(result); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorLog10(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORF32 Result = { { { + log10f(V.vector4_f32[0]), + log10f(V.vector4_f32[1]), + log10f(V.vector4_f32[2]), + log10f(V.vector4_f32[3]) + } } }; + return Result.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + int32x4_t rawBiased = vandq_s32(vreinterpretq_s32_f32(V), g_XMInfinity); + int32x4_t trailing = vandq_s32(vreinterpretq_s32_f32(V), g_XMQNaNTest); + uint32x4_t isExponentZero = vceqq_s32(g_XMZero, rawBiased); + + // Compute exponent and significand for 
normals. + int32x4_t biased = vshrq_n_s32(rawBiased, 23); + int32x4_t exponentNor = vsubq_s32(biased, g_XMExponentBias); + int32x4_t trailingNor = trailing; + + // Compute exponent and significand for subnormals. + int32x4_t leading = Internal::GetLeadingBit(trailing); + int32x4_t shift = vsubq_s32(g_XMNumTrailing, leading); + int32x4_t exponentSub = vsubq_s32(g_XMSubnormalExponent, shift); + int32x4_t trailingSub = vshlq_s32(trailing, shift); + trailingSub = vandq_s32(trailingSub, g_XMQNaNTest); + int32x4_t e = vbslq_s32(isExponentZero, exponentSub, exponentNor); + int32x4_t t = vbslq_s32(isExponentZero, trailingSub, trailingNor); + + // Compute the approximation. + int32x4_t tmp = vorrq_s32(g_XMOne, t); + float32x4_t y = vsubq_f32(vreinterpretq_f32_s32(tmp), g_XMOne); + + float32x4_t log2 = vmlaq_f32(g_XMLogEst6, g_XMLogEst7, y); + log2 = vmlaq_f32(g_XMLogEst5, log2, y); + log2 = vmlaq_f32(g_XMLogEst4, log2, y); + log2 = vmlaq_f32(g_XMLogEst3, log2, y); + log2 = vmlaq_f32(g_XMLogEst2, log2, y); + log2 = vmlaq_f32(g_XMLogEst1, log2, y); + log2 = vmlaq_f32(g_XMLogEst0, log2, y); + log2 = vmlaq_f32(vcvtq_f32_s32(e), log2, y); + + log2 = vmulq_f32(g_XMInvLg10, log2); + + // if (x is NaN) -> QNaN + // else if (V is positive) + // if (V is infinite) -> +inf + // else -> log2(V) + // else + // if (V is zero) -> -inf + // else -> -QNaN + + uint32x4_t isInfinite = vandq_u32(vreinterpretq_u32_f32(V), g_XMAbsMask); + isInfinite = vceqq_u32(isInfinite, g_XMInfinity); + + uint32x4_t isGreaterZero = vcgtq_s32(vreinterpretq_s32_f32(V), g_XMZero); + uint32x4_t isNotFinite = vcgtq_s32(vreinterpretq_s32_f32(V), g_XMInfinity); + uint32x4_t isPositive = vbicq_u32(isGreaterZero, isNotFinite); + + uint32x4_t isZero = vandq_u32(vreinterpretq_u32_f32(V), g_XMAbsMask); + isZero = vceqq_u32(isZero, g_XMZero); + + uint32x4_t t0 = vandq_u32(vreinterpretq_u32_f32(V), g_XMQNaNTest); + uint32x4_t t1 = vandq_u32(vreinterpretq_u32_f32(V), g_XMInfinity); + t0 = vceqq_u32(t0, g_XMZero); + t1 = vceqq_u32(t1, g_XMInfinity); + uint32x4_t isNaN = vbicq_u32(t1, t0); + + float32x4_t result = vbslq_f32(isInfinite, g_XMInfinity, log2); + float32x4_t tmp2 = vbslq_f32(isZero, g_XMNegInfinity, g_XMNegQNaN); + result = vbslq_f32(isPositive, result, tmp2); + result = vbslq_f32(isNaN, g_XMQNaN, result); + return result; +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_log10_ps(V); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + __m128i rawBiased = _mm_and_si128(_mm_castps_si128(V), g_XMInfinity); + __m128i trailing = _mm_and_si128(_mm_castps_si128(V), g_XMQNaNTest); + __m128i isExponentZero = _mm_cmpeq_epi32(g_XMZero, rawBiased); + + // Compute exponent and significand for normals. + __m128i biased = _mm_srli_epi32(rawBiased, 23); + __m128i exponentNor = _mm_sub_epi32(biased, g_XMExponentBias); + __m128i trailingNor = trailing; + + // Compute exponent and significand for subnormals. 
+ __m128i leading = Internal::GetLeadingBit(trailing); + __m128i shift = _mm_sub_epi32(g_XMNumTrailing, leading); + __m128i exponentSub = _mm_sub_epi32(g_XMSubnormalExponent, shift); + __m128i trailingSub = Internal::multi_sll_epi32(trailing, shift); + trailingSub = _mm_and_si128(trailingSub, g_XMQNaNTest); + + __m128i select0 = _mm_and_si128(isExponentZero, exponentSub); + __m128i select1 = _mm_andnot_si128(isExponentZero, exponentNor); + __m128i e = _mm_or_si128(select0, select1); + + select0 = _mm_and_si128(isExponentZero, trailingSub); + select1 = _mm_andnot_si128(isExponentZero, trailingNor); + __m128i t = _mm_or_si128(select0, select1); + + // Compute the approximation. + __m128i tmp = _mm_or_si128(g_XMOne, t); + __m128 y = _mm_sub_ps(_mm_castsi128_ps(tmp), g_XMOne); + + __m128 log2 = XM_FMADD_PS(g_XMLogEst7, y, g_XMLogEst6); + log2 = XM_FMADD_PS(log2, y, g_XMLogEst5); + log2 = XM_FMADD_PS(log2, y, g_XMLogEst4); + log2 = XM_FMADD_PS(log2, y, g_XMLogEst3); + log2 = XM_FMADD_PS(log2, y, g_XMLogEst2); + log2 = XM_FMADD_PS(log2, y, g_XMLogEst1); + log2 = XM_FMADD_PS(log2, y, g_XMLogEst0); + log2 = XM_FMADD_PS(log2, y, _mm_cvtepi32_ps(e)); + + log2 = _mm_mul_ps(g_XMInvLg10, log2); + + // if (x is NaN) -> QNaN + // else if (V is positive) + // if (V is infinite) -> +inf + // else -> log2(V) + // else + // if (V is zero) -> -inf + // else -> -QNaN + + __m128i isInfinite = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask); + isInfinite = _mm_cmpeq_epi32(isInfinite, g_XMInfinity); + + __m128i isGreaterZero = _mm_cmpgt_epi32(_mm_castps_si128(V), g_XMZero); + __m128i isNotFinite = _mm_cmpgt_epi32(_mm_castps_si128(V), g_XMInfinity); + __m128i isPositive = _mm_andnot_si128(isNotFinite, isGreaterZero); + + __m128i isZero = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask); + isZero = _mm_cmpeq_epi32(isZero, g_XMZero); + + __m128i t0 = _mm_and_si128(_mm_castps_si128(V), g_XMQNaNTest); + __m128i t1 = _mm_and_si128(_mm_castps_si128(V), g_XMInfinity); + t0 = _mm_cmpeq_epi32(t0, g_XMZero); + t1 = _mm_cmpeq_epi32(t1, g_XMInfinity); + __m128i isNaN = _mm_andnot_si128(t0, t1); + + select0 = _mm_and_si128(isInfinite, g_XMInfinity); + select1 = _mm_andnot_si128(isInfinite, _mm_castps_si128(log2)); + __m128i result = _mm_or_si128(select0, select1); + + select0 = _mm_and_si128(isZero, g_XMNegInfinity); + select1 = _mm_andnot_si128(isZero, g_XMNegQNaN); + tmp = _mm_or_si128(select0, select1); + + select0 = _mm_and_si128(isPositive, result); + select1 = _mm_andnot_si128(isPositive, tmp); + result = _mm_or_si128(select0, select1); + + select0 = _mm_and_si128(isNaN, g_XMQNaN); + select1 = _mm_andnot_si128(isNaN, result); + result = _mm_or_si128(select0, select1); + + return _mm_castsi128_ps(result); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorLogE(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORF32 Result = { { { + logf(V.vector4_f32[0]), + logf(V.vector4_f32[1]), + logf(V.vector4_f32[2]), + logf(V.vector4_f32[3]) + } } }; + return Result.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + int32x4_t rawBiased = vandq_s32(vreinterpretq_s32_f32(V), g_XMInfinity); + int32x4_t trailing = vandq_s32(vreinterpretq_s32_f32(V), g_XMQNaNTest); + uint32x4_t isExponentZero = vceqq_s32(g_XMZero, rawBiased); + + // Compute exponent and significand for normals. 
+ int32x4_t biased = vshrq_n_s32(rawBiased, 23); + int32x4_t exponentNor = vsubq_s32(biased, g_XMExponentBias); + int32x4_t trailingNor = trailing; + + // Compute exponent and significand for subnormals. + int32x4_t leading = Internal::GetLeadingBit(trailing); + int32x4_t shift = vsubq_s32(g_XMNumTrailing, leading); + int32x4_t exponentSub = vsubq_s32(g_XMSubnormalExponent, shift); + int32x4_t trailingSub = vshlq_s32(trailing, shift); + trailingSub = vandq_s32(trailingSub, g_XMQNaNTest); + int32x4_t e = vbslq_s32(isExponentZero, exponentSub, exponentNor); + int32x4_t t = vbslq_s32(isExponentZero, trailingSub, trailingNor); + + // Compute the approximation. + int32x4_t tmp = vorrq_s32(g_XMOne, t); + float32x4_t y = vsubq_f32(vreinterpretq_f32_s32(tmp), g_XMOne); + + float32x4_t log2 = vmlaq_f32(g_XMLogEst6, g_XMLogEst7, y); + log2 = vmlaq_f32(g_XMLogEst5, log2, y); + log2 = vmlaq_f32(g_XMLogEst4, log2, y); + log2 = vmlaq_f32(g_XMLogEst3, log2, y); + log2 = vmlaq_f32(g_XMLogEst2, log2, y); + log2 = vmlaq_f32(g_XMLogEst1, log2, y); + log2 = vmlaq_f32(g_XMLogEst0, log2, y); + log2 = vmlaq_f32(vcvtq_f32_s32(e), log2, y); + + log2 = vmulq_f32(g_XMInvLgE, log2); + + // if (x is NaN) -> QNaN + // else if (V is positive) + // if (V is infinite) -> +inf + // else -> log2(V) + // else + // if (V is zero) -> -inf + // else -> -QNaN + + uint32x4_t isInfinite = vandq_u32(vreinterpretq_u32_f32(V), g_XMAbsMask); + isInfinite = vceqq_u32(isInfinite, g_XMInfinity); + + uint32x4_t isGreaterZero = vcgtq_s32(vreinterpretq_s32_f32(V), g_XMZero); + uint32x4_t isNotFinite = vcgtq_s32(vreinterpretq_s32_f32(V), g_XMInfinity); + uint32x4_t isPositive = vbicq_u32(isGreaterZero, isNotFinite); + + uint32x4_t isZero = vandq_u32(vreinterpretq_u32_f32(V), g_XMAbsMask); + isZero = vceqq_u32(isZero, g_XMZero); + + uint32x4_t t0 = vandq_u32(vreinterpretq_u32_f32(V), g_XMQNaNTest); + uint32x4_t t1 = vandq_u32(vreinterpretq_u32_f32(V), g_XMInfinity); + t0 = vceqq_u32(t0, g_XMZero); + t1 = vceqq_u32(t1, g_XMInfinity); + uint32x4_t isNaN = vbicq_u32(t1, t0); + + float32x4_t result = vbslq_f32(isInfinite, g_XMInfinity, log2); + float32x4_t tmp2 = vbslq_f32(isZero, g_XMNegInfinity, g_XMNegQNaN); + result = vbslq_f32(isPositive, result, tmp2); + result = vbslq_f32(isNaN, g_XMQNaN, result); + return result; +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_log_ps(V); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + __m128i rawBiased = _mm_and_si128(_mm_castps_si128(V), g_XMInfinity); + __m128i trailing = _mm_and_si128(_mm_castps_si128(V), g_XMQNaNTest); + __m128i isExponentZero = _mm_cmpeq_epi32(g_XMZero, rawBiased); + + // Compute exponent and significand for normals. + __m128i biased = _mm_srli_epi32(rawBiased, 23); + __m128i exponentNor = _mm_sub_epi32(biased, g_XMExponentBias); + __m128i trailingNor = trailing; + + // Compute exponent and significand for subnormals. 
+ __m128i leading = Internal::GetLeadingBit(trailing); + __m128i shift = _mm_sub_epi32(g_XMNumTrailing, leading); + __m128i exponentSub = _mm_sub_epi32(g_XMSubnormalExponent, shift); + __m128i trailingSub = Internal::multi_sll_epi32(trailing, shift); + trailingSub = _mm_and_si128(trailingSub, g_XMQNaNTest); + + __m128i select0 = _mm_and_si128(isExponentZero, exponentSub); + __m128i select1 = _mm_andnot_si128(isExponentZero, exponentNor); + __m128i e = _mm_or_si128(select0, select1); + + select0 = _mm_and_si128(isExponentZero, trailingSub); + select1 = _mm_andnot_si128(isExponentZero, trailingNor); + __m128i t = _mm_or_si128(select0, select1); + + // Compute the approximation. + __m128i tmp = _mm_or_si128(g_XMOne, t); + __m128 y = _mm_sub_ps(_mm_castsi128_ps(tmp), g_XMOne); + + __m128 log2 = XM_FMADD_PS(g_XMLogEst7, y, g_XMLogEst6); + log2 = XM_FMADD_PS(log2, y, g_XMLogEst5); + log2 = XM_FMADD_PS(log2, y, g_XMLogEst4); + log2 = XM_FMADD_PS(log2, y, g_XMLogEst3); + log2 = XM_FMADD_PS(log2, y, g_XMLogEst2); + log2 = XM_FMADD_PS(log2, y, g_XMLogEst1); + log2 = XM_FMADD_PS(log2, y, g_XMLogEst0); + log2 = XM_FMADD_PS(log2, y, _mm_cvtepi32_ps(e)); + + log2 = _mm_mul_ps(g_XMInvLgE, log2); + + // if (x is NaN) -> QNaN + // else if (V is positive) + // if (V is infinite) -> +inf + // else -> log2(V) + // else + // if (V is zero) -> -inf + // else -> -QNaN + + __m128i isInfinite = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask); + isInfinite = _mm_cmpeq_epi32(isInfinite, g_XMInfinity); + + __m128i isGreaterZero = _mm_cmpgt_epi32(_mm_castps_si128(V), g_XMZero); + __m128i isNotFinite = _mm_cmpgt_epi32(_mm_castps_si128(V), g_XMInfinity); + __m128i isPositive = _mm_andnot_si128(isNotFinite, isGreaterZero); + + __m128i isZero = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask); + isZero = _mm_cmpeq_epi32(isZero, g_XMZero); + + __m128i t0 = _mm_and_si128(_mm_castps_si128(V), g_XMQNaNTest); + __m128i t1 = _mm_and_si128(_mm_castps_si128(V), g_XMInfinity); + t0 = _mm_cmpeq_epi32(t0, g_XMZero); + t1 = _mm_cmpeq_epi32(t1, g_XMInfinity); + __m128i isNaN = _mm_andnot_si128(t0, t1); + + select0 = _mm_and_si128(isInfinite, g_XMInfinity); + select1 = _mm_andnot_si128(isInfinite, _mm_castps_si128(log2)); + __m128i result = _mm_or_si128(select0, select1); + + select0 = _mm_and_si128(isZero, g_XMNegInfinity); + select1 = _mm_andnot_si128(isZero, g_XMNegQNaN); + tmp = _mm_or_si128(select0, select1); + + select0 = _mm_and_si128(isPositive, result); + select1 = _mm_andnot_si128(isPositive, tmp); + result = _mm_or_si128(select0, select1); + + select0 = _mm_and_si128(isNaN, g_XMQNaN); + select1 = _mm_andnot_si128(isNaN, result); + result = _mm_or_si128(select0, select1); + + return _mm_castsi128_ps(result); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorLog(FXMVECTOR V) noexcept +{ + return XMVectorLog2(V); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorPow +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORF32 Result = { { { + powf(V1.vector4_f32[0], V2.vector4_f32[0]), + powf(V1.vector4_f32[1], V2.vector4_f32[1]), + powf(V1.vector4_f32[2], V2.vector4_f32[2]), + powf(V1.vector4_f32[3], V2.vector4_f32[3]) + } } }; + return Result.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTORF32 vResult = { { { + powf(vgetq_lane_f32(V1, 0), vgetq_lane_f32(V2, 0)), + powf(vgetq_lane_f32(V1, 1), vgetq_lane_f32(V2, 1)), + 
powf(vgetq_lane_f32(V1, 2), vgetq_lane_f32(V2, 2)), + powf(vgetq_lane_f32(V1, 3), vgetq_lane_f32(V2, 3)) + } } }; + return vResult.v; +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_pow_ps(V1, V2); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + XM_ALIGNED_DATA(16) float a[4]; + XM_ALIGNED_DATA(16) float b[4]; + _mm_store_ps(a, V1); + _mm_store_ps(b, V2); + XMVECTOR vResult = _mm_setr_ps( + powf(a[0], b[0]), + powf(a[1], b[1]), + powf(a[2], b[2]), + powf(a[3], b[3])); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorAbs(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { { { + fabsf(V.vector4_f32[0]), + fabsf(V.vector4_f32[1]), + fabsf(V.vector4_f32[2]), + fabsf(V.vector4_f32[3]) + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vabsq_f32(V); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = _mm_setzero_ps(); + vResult = _mm_sub_ps(vResult, V); + vResult = _mm_max_ps(vResult, V); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorMod +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ + // V1 % V2 = V1 - V2 * truncate(V1 / V2) + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Quotient = XMVectorDivide(V1, V2); + Quotient = XMVectorTruncate(Quotient); + XMVECTOR Result = XMVectorNegativeMultiplySubtract(V2, Quotient, V1); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR vResult = XMVectorDivide(V1, V2); + vResult = XMVectorTruncate(vResult); + return vmlsq_f32(V1, vResult, V2); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = _mm_div_ps(V1, V2); + vResult = XMVectorTruncate(vResult); + return XM_FNMADD_PS(vResult, V2, V1); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorModAngles(FXMVECTOR Angles) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V; + XMVECTOR Result; + + // Modulo the range of the given angles such that -XM_PI <= Angles < XM_PI + V = XMVectorMultiply(Angles, g_XMReciprocalTwoPi.v); + V = XMVectorRound(V); + Result = XMVectorNegativeMultiplySubtract(g_XMTwoPi.v, V, Angles); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Modulo the range of the given angles such that -XM_PI <= Angles < XM_PI + XMVECTOR vResult = vmulq_f32(Angles, g_XMReciprocalTwoPi); + // Use the inline function due to complexity for rounding + vResult = XMVectorRound(vResult); + return vmlsq_f32(Angles, vResult, g_XMTwoPi); +#elif defined(_XM_SSE_INTRINSICS_) + // Modulo the range of the given angles such that -XM_PI <= Angles < XM_PI + XMVECTOR vResult = _mm_mul_ps(Angles, g_XMReciprocalTwoPi); + // Use the inline function due to complexity for rounding + vResult = XMVectorRound(vResult); + return XM_FNMADD_PS(vResult, g_XMTwoPi, Angles); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorSin(FXMVECTOR V) noexcept +{ + // 11-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + sinf(V.vector4_f32[0]), + sinf(V.vector4_f32[1]), + sinf(V.vector4_f32[2]), + sinf(V.vector4_f32[3]) + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Force the value within the bounds of pi + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with sin(y) 
= sin(x). + uint32x4_t sign = vandq_u32(vreinterpretq_u32_f32(x), g_XMNegativeZero); + uint32x4_t c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + float32x4_t absx = vabsq_f32(x); + float32x4_t rflx = vsubq_f32(vreinterpretq_f32_u32(c), x); + uint32x4_t comp = vcleq_f32(absx, g_XMHalfPi); + x = vbslq_f32(comp, x, rflx); + + float32x4_t x2 = vmulq_f32(x, x); + + // Compute polynomial approximation + const XMVECTOR SC1 = g_XMSinCoefficients1; + const XMVECTOR SC0 = g_XMSinCoefficients0; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(SC0), 1); + XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_low_f32(SC1), 0); + + vConstants = vdupq_lane_f32(vget_high_f32(SC0), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(SC0), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(SC0), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + Result = vmlaq_f32(g_XMOne, Result, x2); + Result = vmulq_f32(Result, x); + return Result; +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_sin_ps(V); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + // Force the value within the bounds of pi + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with sin(y) = sin(x). + __m128 sign = _mm_and_ps(x, g_XMNegativeZero); + __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + __m128 absx = _mm_andnot_ps(sign, x); // |x| + __m128 rflx = _mm_sub_ps(c, x); + __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi); + __m128 select0 = _mm_and_ps(comp, x); + __m128 select1 = _mm_andnot_ps(comp, rflx); + x = _mm_or_ps(select0, select1); + + __m128 x2 = _mm_mul_ps(x, x); + + // Compute polynomial approximation + const XMVECTOR SC1 = g_XMSinCoefficients1; + __m128 vConstantsB = XM_PERMUTE_PS(SC1, _MM_SHUFFLE(0, 0, 0, 0)); + const XMVECTOR SC0 = g_XMSinCoefficients0; + __m128 vConstants = XM_PERMUTE_PS(SC0, _MM_SHUFFLE(3, 3, 3, 3)); + __m128 Result = XM_FMADD_PS(vConstantsB, x2, vConstants); + + vConstants = XM_PERMUTE_PS(SC0, _MM_SHUFFLE(2, 2, 2, 2)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + vConstants = XM_PERMUTE_PS(SC0, _MM_SHUFFLE(1, 1, 1, 1)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + vConstants = XM_PERMUTE_PS(SC0, _MM_SHUFFLE(0, 0, 0, 0)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + Result = XM_FMADD_PS(Result, x2, g_XMOne); + Result = _mm_mul_ps(Result, x); + return Result; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorCos(FXMVECTOR V) noexcept +{ + // 10-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + cosf(V.vector4_f32[0]), + cosf(V.vector4_f32[1]), + cosf(V.vector4_f32[2]), + cosf(V.vector4_f32[3]) + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Map V to x in [-pi,pi]. + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x). 
+ uint32x4_t sign = vandq_u32(vreinterpretq_u32_f32(x), g_XMNegativeZero); + uint32x4_t c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + float32x4_t absx = vabsq_f32(x); + float32x4_t rflx = vsubq_f32(vreinterpretq_f32_u32(c), x); + uint32x4_t comp = vcleq_f32(absx, g_XMHalfPi); + x = vbslq_f32(comp, x, rflx); + float32x4_t fsign = vbslq_f32(comp, g_XMOne, g_XMNegativeOne); + + float32x4_t x2 = vmulq_f32(x, x); + + // Compute polynomial approximation + const XMVECTOR CC1 = g_XMCosCoefficients1; + const XMVECTOR CC0 = g_XMCosCoefficients0; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(CC0), 1); + XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_low_f32(CC1), 0); + + vConstants = vdupq_lane_f32(vget_high_f32(CC0), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(CC0), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(CC0), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + Result = vmlaq_f32(g_XMOne, Result, x2); + Result = vmulq_f32(Result, fsign); + return Result; +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_cos_ps(V); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + // Map V to x in [-pi,pi]. + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x). + XMVECTOR sign = _mm_and_ps(x, g_XMNegativeZero); + __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + __m128 absx = _mm_andnot_ps(sign, x); // |x| + __m128 rflx = _mm_sub_ps(c, x); + __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi); + __m128 select0 = _mm_and_ps(comp, x); + __m128 select1 = _mm_andnot_ps(comp, rflx); + x = _mm_or_ps(select0, select1); + select0 = _mm_and_ps(comp, g_XMOne); + select1 = _mm_andnot_ps(comp, g_XMNegativeOne); + sign = _mm_or_ps(select0, select1); + + __m128 x2 = _mm_mul_ps(x, x); + + // Compute polynomial approximation + const XMVECTOR CC1 = g_XMCosCoefficients1; + __m128 vConstantsB = XM_PERMUTE_PS(CC1, _MM_SHUFFLE(0, 0, 0, 0)); + const XMVECTOR CC0 = g_XMCosCoefficients0; + __m128 vConstants = XM_PERMUTE_PS(CC0, _MM_SHUFFLE(3, 3, 3, 3)); + __m128 Result = XM_FMADD_PS(vConstantsB, x2, vConstants); + + vConstants = XM_PERMUTE_PS(CC0, _MM_SHUFFLE(2, 2, 2, 2)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + vConstants = XM_PERMUTE_PS(CC0, _MM_SHUFFLE(1, 1, 1, 1)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + vConstants = XM_PERMUTE_PS(CC0, _MM_SHUFFLE(0, 0, 0, 0)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + Result = XM_FMADD_PS(Result, x2, g_XMOne); + Result = _mm_mul_ps(Result, sign); + return Result; +#endif +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline void XM_CALLCONV XMVectorSinCos +( + XMVECTOR* pSin, + XMVECTOR* pCos, + FXMVECTOR V +) noexcept +{ + assert(pSin != nullptr); + assert(pCos != nullptr); + + // 11/10-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Sin = { { { + sinf(V.vector4_f32[0]), + sinf(V.vector4_f32[1]), + sinf(V.vector4_f32[2]), + sinf(V.vector4_f32[3]) + } } }; + + XMVECTORF32 Cos = { { { + cosf(V.vector4_f32[0]), + cosf(V.vector4_f32[1]), + cosf(V.vector4_f32[2]), + cosf(V.vector4_f32[3]) + } } }; + + *pSin = Sin.v; + *pCos = Cos.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Force the value within the bounds of pi + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x). 
+ uint32x4_t sign = vandq_u32(vreinterpretq_u32_f32(x), g_XMNegativeZero); + uint32x4_t c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + float32x4_t absx = vabsq_f32(x); + float32x4_t rflx = vsubq_f32(vreinterpretq_f32_u32(c), x); + uint32x4_t comp = vcleq_f32(absx, g_XMHalfPi); + x = vbslq_f32(comp, x, rflx); + float32x4_t fsign = vbslq_f32(comp, g_XMOne, g_XMNegativeOne); + + float32x4_t x2 = vmulq_f32(x, x); + + // Compute polynomial approximation for sine + const XMVECTOR SC1 = g_XMSinCoefficients1; + const XMVECTOR SC0 = g_XMSinCoefficients0; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(SC0), 1); + XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_low_f32(SC1), 0); + + vConstants = vdupq_lane_f32(vget_high_f32(SC0), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(SC0), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(SC0), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + Result = vmlaq_f32(g_XMOne, Result, x2); + *pSin = vmulq_f32(Result, x); + + // Compute polynomial approximation for cosine + const XMVECTOR CC1 = g_XMCosCoefficients1; + const XMVECTOR CC0 = g_XMCosCoefficients0; + vConstants = vdupq_lane_f32(vget_high_f32(CC0), 1); + Result = vmlaq_lane_f32(vConstants, x2, vget_low_f32(CC1), 0); + + vConstants = vdupq_lane_f32(vget_high_f32(CC0), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(CC0), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(CC0), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + Result = vmlaq_f32(g_XMOne, Result, x2); + *pCos = vmulq_f32(Result, fsign); +#elif defined(_XM_SVML_INTRINSICS_) + *pSin = _mm_sincos_ps(pCos, V); +#elif defined(_XM_SSE_INTRINSICS_) + // Force the value within the bounds of pi + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with sin(y) = sin(x), cos(y) = sign*cos(x). 
+ XMVECTOR sign = _mm_and_ps(x, g_XMNegativeZero); + __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + __m128 absx = _mm_andnot_ps(sign, x); // |x| + __m128 rflx = _mm_sub_ps(c, x); + __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi); + __m128 select0 = _mm_and_ps(comp, x); + __m128 select1 = _mm_andnot_ps(comp, rflx); + x = _mm_or_ps(select0, select1); + select0 = _mm_and_ps(comp, g_XMOne); + select1 = _mm_andnot_ps(comp, g_XMNegativeOne); + sign = _mm_or_ps(select0, select1); + + __m128 x2 = _mm_mul_ps(x, x); + + // Compute polynomial approximation of sine + const XMVECTOR SC1 = g_XMSinCoefficients1; + __m128 vConstantsB = XM_PERMUTE_PS(SC1, _MM_SHUFFLE(0, 0, 0, 0)); + const XMVECTOR SC0 = g_XMSinCoefficients0; + __m128 vConstants = XM_PERMUTE_PS(SC0, _MM_SHUFFLE(3, 3, 3, 3)); + __m128 Result = XM_FMADD_PS(vConstantsB, x2, vConstants); + + vConstants = XM_PERMUTE_PS(SC0, _MM_SHUFFLE(2, 2, 2, 2)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + vConstants = XM_PERMUTE_PS(SC0, _MM_SHUFFLE(1, 1, 1, 1)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + vConstants = XM_PERMUTE_PS(SC0, _MM_SHUFFLE(0, 0, 0, 0)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + Result = XM_FMADD_PS(Result, x2, g_XMOne); + Result = _mm_mul_ps(Result, x); + *pSin = Result; + + // Compute polynomial approximation of cosine + const XMVECTOR CC1 = g_XMCosCoefficients1; + vConstantsB = XM_PERMUTE_PS(CC1, _MM_SHUFFLE(0, 0, 0, 0)); + const XMVECTOR CC0 = g_XMCosCoefficients0; + vConstants = XM_PERMUTE_PS(CC0, _MM_SHUFFLE(3, 3, 3, 3)); + Result = XM_FMADD_PS(vConstantsB, x2, vConstants); + + vConstants = XM_PERMUTE_PS(CC0, _MM_SHUFFLE(2, 2, 2, 2)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + vConstants = XM_PERMUTE_PS(CC0, _MM_SHUFFLE(1, 1, 1, 1)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + vConstants = XM_PERMUTE_PS(CC0, _MM_SHUFFLE(0, 0, 0, 0)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + Result = XM_FMADD_PS(Result, x2, g_XMOne); + Result = _mm_mul_ps(Result, sign); + *pCos = Result; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorTan(FXMVECTOR V) noexcept +{ + // Cody and Waite algorithm to compute tangent. 
+
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTORF32 Result = { { {
+ tanf(V.vector4_f32[0]),
+ tanf(V.vector4_f32[1]),
+ tanf(V.vector4_f32[2]),
+ tanf(V.vector4_f32[3])
+ } } };
+ return Result.v;
+#elif defined(_XM_SVML_INTRINSICS_)
+ XMVECTOR Result = _mm_tan_ps(V);
+ return Result;
+#elif defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
+
+ static const XMVECTORF32 TanCoefficients0 = { { { 1.0f, -4.667168334e-1f, 2.566383229e-2f, -3.118153191e-4f } } };
+ static const XMVECTORF32 TanCoefficients1 = { { { 4.981943399e-7f, -1.333835001e-1f, 3.424887824e-3f, -1.786170734e-5f } } };
+ static const XMVECTORF32 TanConstants = { { { 1.570796371f, 6.077100628e-11f, 0.000244140625f, 0.63661977228f /*2 / Pi*/ } } };
+ static const XMVECTORU32 Mask = { { { 0x1, 0x1, 0x1, 0x1 } } };
+
+ XMVECTOR TwoDivPi = XMVectorSplatW(TanConstants.v);
+
+ XMVECTOR Zero = XMVectorZero();
+
+ XMVECTOR C0 = XMVectorSplatX(TanConstants.v);
+ XMVECTOR C1 = XMVectorSplatY(TanConstants.v);
+ XMVECTOR Epsilon = XMVectorSplatZ(TanConstants.v);
+
+ XMVECTOR VA = XMVectorMultiply(V, TwoDivPi);
+
+ VA = XMVectorRound(VA);
+
+ XMVECTOR VC = XMVectorNegativeMultiplySubtract(VA, C0, V);
+
+ XMVECTOR VB = XMVectorAbs(VA);
+
+ VC = XMVectorNegativeMultiplySubtract(VA, C1, VC);
+
+#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+ VB = vreinterpretq_f32_u32(vcvtq_u32_f32(VB));
+#elif defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+ reinterpret_cast<__m128i*>(&VB)[0] = _mm_cvttps_epi32(VB);
+#else
+ for (size_t i = 0; i < 4; i++)
+ {
+ VB.vector4_u32[i] = static_cast<uint32_t>(VB.vector4_f32[i]);
+ }
+#endif
+
+ XMVECTOR VC2 = XMVectorMultiply(VC, VC);
+
+ XMVECTOR T7 = XMVectorSplatW(TanCoefficients1.v);
+ XMVECTOR T6 = XMVectorSplatZ(TanCoefficients1.v);
+ XMVECTOR T4 = XMVectorSplatX(TanCoefficients1.v);
+ XMVECTOR T3 = XMVectorSplatW(TanCoefficients0.v);
+ XMVECTOR T5 = XMVectorSplatY(TanCoefficients1.v);
+ XMVECTOR T2 = XMVectorSplatZ(TanCoefficients0.v);
+ XMVECTOR T1 = XMVectorSplatY(TanCoefficients0.v);
+ XMVECTOR T0 = XMVectorSplatX(TanCoefficients0.v);
+
+ XMVECTOR VBIsEven = XMVectorAndInt(VB, Mask.v);
+ VBIsEven = XMVectorEqualInt(VBIsEven, Zero);
+
+ XMVECTOR N = XMVectorMultiplyAdd(VC2, T7, T6);
+ XMVECTOR D = XMVectorMultiplyAdd(VC2, T4, T3);
+ N = XMVectorMultiplyAdd(VC2, N, T5);
+ D = XMVectorMultiplyAdd(VC2, D, T2);
+ N = XMVectorMultiply(VC2, N);
+ D = XMVectorMultiplyAdd(VC2, D, T1);
+ N = XMVectorMultiplyAdd(VC, N, VC);
+ XMVECTOR VCNearZero = XMVectorInBounds(VC, Epsilon);
+ D = XMVectorMultiplyAdd(VC2, D, T0);
+
+ N = XMVectorSelect(N, VC, VCNearZero);
+ D = XMVectorSelect(D, g_XMOne.v, VCNearZero);
+
+ XMVECTOR R0 = XMVectorNegate(N);
+ XMVECTOR R1 = XMVectorDivide(N, D);
+ R0 = XMVectorDivide(D, R0);
+
+ XMVECTOR VIsZero = XMVectorEqual(V, Zero);
+
+ XMVECTOR Result = XMVectorSelect(R0, R1, VBIsEven);
+
+ Result = XMVectorSelect(Result, Zero, VIsZero);
+
+ return Result;
+
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorSinH(FXMVECTOR V) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+ XMVECTORF32 Result = { { {
+ sinhf(V.vector4_f32[0]),
+ sinhf(V.vector4_f32[1]),
+ sinhf(V.vector4_f32[2]),
+ sinhf(V.vector4_f32[3])
+ } } };
+ return Result.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+ static const XMVECTORF32 Scale = { { { 1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f } } }; // 1.0f / ln(2.0f)
+
+ XMVECTOR V1 = vmlaq_f32(g_XMNegativeOne.v, V, 
Scale.v); + XMVECTOR V2 = vmlsq_f32(g_XMNegativeOne.v, V, Scale.v); + XMVECTOR E1 = XMVectorExp(V1); + XMVECTOR E2 = XMVectorExp(V2); + + return vsubq_f32(E1, E2); +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_sinh_ps(V); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Scale = { { { 1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f } } }; // 1.0f / ln(2.0f) + + XMVECTOR V1 = XM_FMADD_PS(V, Scale, g_XMNegativeOne); + XMVECTOR V2 = XM_FNMADD_PS(V, Scale, g_XMNegativeOne); + XMVECTOR E1 = XMVectorExp(V1); + XMVECTOR E2 = XMVectorExp(V2); + + return _mm_sub_ps(E1, E2); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorCosH(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + coshf(V.vector4_f32[0]), + coshf(V.vector4_f32[1]), + coshf(V.vector4_f32[2]), + coshf(V.vector4_f32[3]) + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 Scale = { { { 1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f } } }; // 1.0f / ln(2.0f) + + XMVECTOR V1 = vmlaq_f32(g_XMNegativeOne.v, V, Scale.v); + XMVECTOR V2 = vmlsq_f32(g_XMNegativeOne.v, V, Scale.v); + XMVECTOR E1 = XMVectorExp(V1); + XMVECTOR E2 = XMVectorExp(V2); + return vaddq_f32(E1, E2); +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_cosh_ps(V); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Scale = { { { 1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f } } }; // 1.0f / ln(2.0f) + + XMVECTOR V1 = XM_FMADD_PS(V, Scale.v, g_XMNegativeOne.v); + XMVECTOR V2 = XM_FNMADD_PS(V, Scale.v, g_XMNegativeOne.v); + XMVECTOR E1 = XMVectorExp(V1); + XMVECTOR E2 = XMVectorExp(V2); + return _mm_add_ps(E1, E2); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorTanH(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + tanhf(V.vector4_f32[0]), + tanhf(V.vector4_f32[1]), + tanhf(V.vector4_f32[2]), + tanhf(V.vector4_f32[3]) + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 Scale = { { { 2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f } } }; // 2.0f / ln(2.0f) + + XMVECTOR E = vmulq_f32(V, Scale.v); + E = XMVectorExp(E); + E = vmlaq_f32(g_XMOneHalf.v, E, g_XMOneHalf.v); + E = XMVectorReciprocal(E); + return vsubq_f32(g_XMOne.v, E); +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_tanh_ps(V); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Scale = { { { 2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f } } }; // 2.0f / ln(2.0f) + + XMVECTOR E = _mm_mul_ps(V, Scale.v); + E = XMVectorExp(E); + E = XM_FMADD_PS(E, g_XMOneHalf.v, g_XMOneHalf.v); + E = _mm_div_ps(g_XMOne.v, E); + return _mm_sub_ps(g_XMOne.v, E); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorASin(FXMVECTOR V) noexcept +{ + // 7-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + asinf(V.vector4_f32[0]), + asinf(V.vector4_f32[1]), + asinf(V.vector4_f32[2]), + asinf(V.vector4_f32[3]) + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t nonnegative = 
vcgeq_f32(V, g_XMZero); + float32x4_t x = vabsq_f32(V); + + // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. + float32x4_t oneMValue = vsubq_f32(g_XMOne, x); + float32x4_t clampOneMValue = vmaxq_f32(g_XMZero, oneMValue); + float32x4_t root = XMVectorSqrt(clampOneMValue); + + // Compute polynomial approximation + const XMVECTOR AC1 = g_XMArcCoefficients1; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AC1), 0); + XMVECTOR t0 = vmlaq_lane_f32(vConstants, x, vget_high_f32(AC1), 1); + + vConstants = vdupq_lane_f32(vget_low_f32(AC1), 1); + t0 = vmlaq_f32(vConstants, t0, x); + + vConstants = vdupq_lane_f32(vget_low_f32(AC1), 0); + t0 = vmlaq_f32(vConstants, t0, x); + + const XMVECTOR AC0 = g_XMArcCoefficients0; + vConstants = vdupq_lane_f32(vget_high_f32(AC0), 1); + t0 = vmlaq_f32(vConstants, t0, x); + + vConstants = vdupq_lane_f32(vget_high_f32(AC0), 0); + t0 = vmlaq_f32(vConstants, t0, x); + + vConstants = vdupq_lane_f32(vget_low_f32(AC0), 1); + t0 = vmlaq_f32(vConstants, t0, x); + + vConstants = vdupq_lane_f32(vget_low_f32(AC0), 0); + t0 = vmlaq_f32(vConstants, t0, x); + t0 = vmulq_f32(t0, root); + + float32x4_t t1 = vsubq_f32(g_XMPi, t0); + t0 = vbslq_f32(nonnegative, t0, t1); + t0 = vsubq_f32(g_XMHalfPi, t0); + return t0; +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_asin_ps(V); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + __m128 nonnegative = _mm_cmpge_ps(V, g_XMZero); + __m128 mvalue = _mm_sub_ps(g_XMZero, V); + __m128 x = _mm_max_ps(V, mvalue); // |V| + + // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. + __m128 oneMValue = _mm_sub_ps(g_XMOne, x); + __m128 clampOneMValue = _mm_max_ps(g_XMZero, oneMValue); + __m128 root = _mm_sqrt_ps(clampOneMValue); // sqrt(1-|V|) + + // Compute polynomial approximation + const XMVECTOR AC1 = g_XMArcCoefficients1; + __m128 vConstantsB = XM_PERMUTE_PS(AC1, _MM_SHUFFLE(3, 3, 3, 3)); + __m128 vConstants = XM_PERMUTE_PS(AC1, _MM_SHUFFLE(2, 2, 2, 2)); + __m128 t0 = XM_FMADD_PS(vConstantsB, x, vConstants); + + vConstants = XM_PERMUTE_PS(AC1, _MM_SHUFFLE(1, 1, 1, 1)); + t0 = XM_FMADD_PS(t0, x, vConstants); + + vConstants = XM_PERMUTE_PS(AC1, _MM_SHUFFLE(0, 0, 0, 0)); + t0 = XM_FMADD_PS(t0, x, vConstants); + + const XMVECTOR AC0 = g_XMArcCoefficients0; + vConstants = XM_PERMUTE_PS(AC0, _MM_SHUFFLE(3, 3, 3, 3)); + t0 = XM_FMADD_PS(t0, x, vConstants); + + vConstants = XM_PERMUTE_PS(AC0, _MM_SHUFFLE(2, 2, 2, 2)); + t0 = XM_FMADD_PS(t0, x, vConstants); + + vConstants = XM_PERMUTE_PS(AC0, _MM_SHUFFLE(1, 1, 1, 1)); + t0 = XM_FMADD_PS(t0, x, vConstants); + + vConstants = XM_PERMUTE_PS(AC0, _MM_SHUFFLE(0, 0, 0, 0)); + t0 = XM_FMADD_PS(t0, x, vConstants); + t0 = _mm_mul_ps(t0, root); + + __m128 t1 = _mm_sub_ps(g_XMPi, t0); + t0 = _mm_and_ps(nonnegative, t0); + t1 = _mm_andnot_ps(nonnegative, t1); + t0 = _mm_or_ps(t0, t1); + t0 = _mm_sub_ps(g_XMHalfPi, t0); + return t0; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorACos(FXMVECTOR V) noexcept +{ + // 7-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + acosf(V.vector4_f32[0]), + acosf(V.vector4_f32[1]), + acosf(V.vector4_f32[2]), + acosf(V.vector4_f32[3]) + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t nonnegative = vcgeq_f32(V, g_XMZero); + float32x4_t x = vabsq_f32(V); + + // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. 
+ float32x4_t oneMValue = vsubq_f32(g_XMOne, x); + float32x4_t clampOneMValue = vmaxq_f32(g_XMZero, oneMValue); + float32x4_t root = XMVectorSqrt(clampOneMValue); + + // Compute polynomial approximation + const XMVECTOR AC1 = g_XMArcCoefficients1; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AC1), 0); + XMVECTOR t0 = vmlaq_lane_f32(vConstants, x, vget_high_f32(AC1), 1); + + vConstants = vdupq_lane_f32(vget_low_f32(AC1), 1); + t0 = vmlaq_f32(vConstants, t0, x); + + vConstants = vdupq_lane_f32(vget_low_f32(AC1), 0); + t0 = vmlaq_f32(vConstants, t0, x); + + const XMVECTOR AC0 = g_XMArcCoefficients0; + vConstants = vdupq_lane_f32(vget_high_f32(AC0), 1); + t0 = vmlaq_f32(vConstants, t0, x); + + vConstants = vdupq_lane_f32(vget_high_f32(AC0), 0); + t0 = vmlaq_f32(vConstants, t0, x); + + vConstants = vdupq_lane_f32(vget_low_f32(AC0), 1); + t0 = vmlaq_f32(vConstants, t0, x); + + vConstants = vdupq_lane_f32(vget_low_f32(AC0), 0); + t0 = vmlaq_f32(vConstants, t0, x); + t0 = vmulq_f32(t0, root); + + float32x4_t t1 = vsubq_f32(g_XMPi, t0); + t0 = vbslq_f32(nonnegative, t0, t1); + return t0; +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_acos_ps(V); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + __m128 nonnegative = _mm_cmpge_ps(V, g_XMZero); + __m128 mvalue = _mm_sub_ps(g_XMZero, V); + __m128 x = _mm_max_ps(V, mvalue); // |V| + + // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. + __m128 oneMValue = _mm_sub_ps(g_XMOne, x); + __m128 clampOneMValue = _mm_max_ps(g_XMZero, oneMValue); + __m128 root = _mm_sqrt_ps(clampOneMValue); // sqrt(1-|V|) + + // Compute polynomial approximation + const XMVECTOR AC1 = g_XMArcCoefficients1; + __m128 vConstantsB = XM_PERMUTE_PS(AC1, _MM_SHUFFLE(3, 3, 3, 3)); + __m128 vConstants = XM_PERMUTE_PS(AC1, _MM_SHUFFLE(2, 2, 2, 2)); + __m128 t0 = XM_FMADD_PS(vConstantsB, x, vConstants); + + vConstants = XM_PERMUTE_PS(AC1, _MM_SHUFFLE(1, 1, 1, 1)); + t0 = XM_FMADD_PS(t0, x, vConstants); + + vConstants = XM_PERMUTE_PS(AC1, _MM_SHUFFLE(0, 0, 0, 0)); + t0 = XM_FMADD_PS(t0, x, vConstants); + + const XMVECTOR AC0 = g_XMArcCoefficients0; + vConstants = XM_PERMUTE_PS(AC0, _MM_SHUFFLE(3, 3, 3, 3)); + t0 = XM_FMADD_PS(t0, x, vConstants); + + vConstants = XM_PERMUTE_PS(AC0, _MM_SHUFFLE(2, 2, 2, 2)); + t0 = XM_FMADD_PS(t0, x, vConstants); + + vConstants = XM_PERMUTE_PS(AC0, _MM_SHUFFLE(1, 1, 1, 1)); + t0 = XM_FMADD_PS(t0, x, vConstants); + + vConstants = XM_PERMUTE_PS(AC0, _MM_SHUFFLE(0, 0, 0, 0)); + t0 = XM_FMADD_PS(t0, x, vConstants); + t0 = _mm_mul_ps(t0, root); + + __m128 t1 = _mm_sub_ps(g_XMPi, t0); + t0 = _mm_and_ps(nonnegative, t0); + t1 = _mm_andnot_ps(nonnegative, t1); + t0 = _mm_or_ps(t0, t1); + return t0; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorATan(FXMVECTOR V) noexcept +{ + // 17-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + atanf(V.vector4_f32[0]), + atanf(V.vector4_f32[1]), + atanf(V.vector4_f32[2]), + atanf(V.vector4_f32[3]) + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t absV = vabsq_f32(V); + float32x4_t invV = XMVectorReciprocal(V); + uint32x4_t comp = vcgtq_f32(V, g_XMOne); + float32x4_t sign = vbslq_f32(comp, g_XMOne, g_XMNegativeOne); + comp = vcleq_f32(absV, g_XMOne); + sign = vbslq_f32(comp, g_XMZero, sign); + float32x4_t x = vbslq_f32(comp, V, invV); + + float32x4_t x2 = vmulq_f32(x, x); + + // Compute polynomial approximation + const XMVECTOR 
TC1 = g_XMATanCoefficients1; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(TC1), 0); + XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_high_f32(TC1), 1); + + vConstants = vdupq_lane_f32(vget_low_f32(TC1), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(TC1), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + const XMVECTOR TC0 = g_XMATanCoefficients0; + vConstants = vdupq_lane_f32(vget_high_f32(TC0), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_high_f32(TC0), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(TC0), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(TC0), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + Result = vmlaq_f32(g_XMOne, Result, x2); + Result = vmulq_f32(Result, x); + + float32x4_t result1 = vmulq_f32(sign, g_XMHalfPi); + result1 = vsubq_f32(result1, Result); + + comp = vceqq_f32(sign, g_XMZero); + Result = vbslq_f32(comp, Result, result1); + return Result; +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_atan_ps(V); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + __m128 absV = XMVectorAbs(V); + __m128 invV = _mm_div_ps(g_XMOne, V); + __m128 comp = _mm_cmpgt_ps(V, g_XMOne); + __m128 select0 = _mm_and_ps(comp, g_XMOne); + __m128 select1 = _mm_andnot_ps(comp, g_XMNegativeOne); + __m128 sign = _mm_or_ps(select0, select1); + comp = _mm_cmple_ps(absV, g_XMOne); + select0 = _mm_and_ps(comp, g_XMZero); + select1 = _mm_andnot_ps(comp, sign); + sign = _mm_or_ps(select0, select1); + select0 = _mm_and_ps(comp, V); + select1 = _mm_andnot_ps(comp, invV); + __m128 x = _mm_or_ps(select0, select1); + + __m128 x2 = _mm_mul_ps(x, x); + + // Compute polynomial approximation + const XMVECTOR TC1 = g_XMATanCoefficients1; + __m128 vConstantsB = XM_PERMUTE_PS(TC1, _MM_SHUFFLE(3, 3, 3, 3)); + __m128 vConstants = XM_PERMUTE_PS(TC1, _MM_SHUFFLE(2, 2, 2, 2)); + __m128 Result = XM_FMADD_PS(vConstantsB, x2, vConstants); + + vConstants = XM_PERMUTE_PS(TC1, _MM_SHUFFLE(1, 1, 1, 1)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + vConstants = XM_PERMUTE_PS(TC1, _MM_SHUFFLE(0, 0, 0, 0)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + const XMVECTOR TC0 = g_XMATanCoefficients0; + vConstants = XM_PERMUTE_PS(TC0, _MM_SHUFFLE(3, 3, 3, 3)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + vConstants = XM_PERMUTE_PS(TC0, _MM_SHUFFLE(2, 2, 2, 2)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + vConstants = XM_PERMUTE_PS(TC0, _MM_SHUFFLE(1, 1, 1, 1)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + vConstants = XM_PERMUTE_PS(TC0, _MM_SHUFFLE(0, 0, 0, 0)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + Result = XM_FMADD_PS(Result, x2, g_XMOne); + + Result = _mm_mul_ps(Result, x); + __m128 result1 = _mm_mul_ps(sign, g_XMHalfPi); + result1 = _mm_sub_ps(result1, Result); + + comp = _mm_cmpeq_ps(sign, g_XMZero); + select0 = _mm_and_ps(comp, Result); + select1 = _mm_andnot_ps(comp, result1); + Result = _mm_or_ps(select0, select1); + return Result; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorATan2 +( + FXMVECTOR Y, + FXMVECTOR X +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + atan2f(Y.vector4_f32[0], X.vector4_f32[0]), + atan2f(Y.vector4_f32[1], X.vector4_f32[1]), + atan2f(Y.vector4_f32[2], X.vector4_f32[2]), + atan2f(Y.vector4_f32[3], X.vector4_f32[3]) 
+ } } }; + return Result.v; +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_atan2_ps(Y, X); + return Result; +#else + + // Return the inverse tangent of Y / X in the range of -Pi to Pi with the following exceptions: + + // Y == 0 and X is Negative -> Pi with the sign of Y + // y == 0 and x is positive -> 0 with the sign of y + // Y != 0 and X == 0 -> Pi / 2 with the sign of Y + // Y != 0 and X is Negative -> atan(y/x) + (PI with the sign of Y) + // X == -Infinity and Finite Y -> Pi with the sign of Y + // X == +Infinity and Finite Y -> 0 with the sign of Y + // Y == Infinity and X is Finite -> Pi / 2 with the sign of Y + // Y == Infinity and X == -Infinity -> 3Pi / 4 with the sign of Y + // Y == Infinity and X == +Infinity -> Pi / 4 with the sign of Y + + static const XMVECTORF32 ATan2Constants = { { { XM_PI, XM_PIDIV2, XM_PIDIV4, XM_PI * 3.0f / 4.0f } } }; + + XMVECTOR Zero = XMVectorZero(); + XMVECTOR ATanResultValid = XMVectorTrueInt(); + + XMVECTOR Pi = XMVectorSplatX(ATan2Constants); + XMVECTOR PiOverTwo = XMVectorSplatY(ATan2Constants); + XMVECTOR PiOverFour = XMVectorSplatZ(ATan2Constants); + XMVECTOR ThreePiOverFour = XMVectorSplatW(ATan2Constants); + + XMVECTOR YEqualsZero = XMVectorEqual(Y, Zero); + XMVECTOR XEqualsZero = XMVectorEqual(X, Zero); + XMVECTOR XIsPositive = XMVectorAndInt(X, g_XMNegativeZero.v); + XIsPositive = XMVectorEqualInt(XIsPositive, Zero); + XMVECTOR YEqualsInfinity = XMVectorIsInfinite(Y); + XMVECTOR XEqualsInfinity = XMVectorIsInfinite(X); + + XMVECTOR YSign = XMVectorAndInt(Y, g_XMNegativeZero.v); + Pi = XMVectorOrInt(Pi, YSign); + PiOverTwo = XMVectorOrInt(PiOverTwo, YSign); + PiOverFour = XMVectorOrInt(PiOverFour, YSign); + ThreePiOverFour = XMVectorOrInt(ThreePiOverFour, YSign); + + XMVECTOR R1 = XMVectorSelect(Pi, YSign, XIsPositive); + XMVECTOR R2 = XMVectorSelect(ATanResultValid, PiOverTwo, XEqualsZero); + XMVECTOR R3 = XMVectorSelect(R2, R1, YEqualsZero); + XMVECTOR R4 = XMVectorSelect(ThreePiOverFour, PiOverFour, XIsPositive); + XMVECTOR R5 = XMVectorSelect(PiOverTwo, R4, XEqualsInfinity); + XMVECTOR Result = XMVectorSelect(R3, R5, YEqualsInfinity); + ATanResultValid = XMVectorEqualInt(Result, ATanResultValid); + + XMVECTOR V = XMVectorDivide(Y, X); + + XMVECTOR R0 = XMVectorATan(V); + + R1 = XMVectorSelect(Pi, g_XMNegativeZero, XIsPositive); + R2 = XMVectorAdd(R0, R1); + + return XMVectorSelect(Result, R2, ATanResultValid); + +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorSinEst(FXMVECTOR V) noexcept +{ + // 7-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + sinf(V.vector4_f32[0]), + sinf(V.vector4_f32[1]), + sinf(V.vector4_f32[2]), + sinf(V.vector4_f32[3]) + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Force the value within the bounds of pi + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with sin(y) = sin(x). 
+ uint32x4_t sign = vandq_u32(vreinterpretq_u32_f32(x), g_XMNegativeZero); + uint32x4_t c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + float32x4_t absx = vabsq_f32(x); + float32x4_t rflx = vsubq_f32(vreinterpretq_f32_u32(c), x); + uint32x4_t comp = vcleq_f32(absx, g_XMHalfPi); + x = vbslq_f32(comp, x, rflx); + + float32x4_t x2 = vmulq_f32(x, x); + + // Compute polynomial approximation + const XMVECTOR SEC = g_XMSinCoefficients1; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(SEC), 0); + XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_high_f32(SEC), 1); + + vConstants = vdupq_lane_f32(vget_low_f32(SEC), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + Result = vmlaq_f32(g_XMOne, Result, x2); + Result = vmulq_f32(Result, x); + return Result; +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_sin_ps(V); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + // Force the value within the bounds of pi + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with sin(y) = sin(x). + __m128 sign = _mm_and_ps(x, g_XMNegativeZero); + __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + __m128 absx = _mm_andnot_ps(sign, x); // |x| + __m128 rflx = _mm_sub_ps(c, x); + __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi); + __m128 select0 = _mm_and_ps(comp, x); + __m128 select1 = _mm_andnot_ps(comp, rflx); + x = _mm_or_ps(select0, select1); + + __m128 x2 = _mm_mul_ps(x, x); + + // Compute polynomial approximation + const XMVECTOR SEC = g_XMSinCoefficients1; + __m128 vConstantsB = XM_PERMUTE_PS(SEC, _MM_SHUFFLE(3, 3, 3, 3)); + __m128 vConstants = XM_PERMUTE_PS(SEC, _MM_SHUFFLE(2, 2, 2, 2)); + __m128 Result = XM_FMADD_PS(vConstantsB, x2, vConstants); + + vConstants = XM_PERMUTE_PS(SEC, _MM_SHUFFLE(1, 1, 1, 1)); + Result = XM_FMADD_PS(Result, x2, vConstants); + Result = XM_FMADD_PS(Result, x2, g_XMOne); + Result = _mm_mul_ps(Result, x); + return Result; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorCosEst(FXMVECTOR V) noexcept +{ + // 6-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + cosf(V.vector4_f32[0]), + cosf(V.vector4_f32[1]), + cosf(V.vector4_f32[2]), + cosf(V.vector4_f32[3]) + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Map V to x in [-pi,pi]. + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x). + uint32x4_t sign = vandq_u32(vreinterpretq_u32_f32(x), g_XMNegativeZero); + uint32x4_t c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + float32x4_t absx = vabsq_f32(x); + float32x4_t rflx = vsubq_f32(vreinterpretq_f32_u32(c), x); + uint32x4_t comp = vcleq_f32(absx, g_XMHalfPi); + x = vbslq_f32(comp, x, rflx); + float32x4_t fsign = vbslq_f32(comp, g_XMOne, g_XMNegativeOne); + + float32x4_t x2 = vmulq_f32(x, x); + + // Compute polynomial approximation + const XMVECTOR CEC = g_XMCosCoefficients1; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(CEC), 0); + XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_high_f32(CEC), 1); + + vConstants = vdupq_lane_f32(vget_low_f32(CEC), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + Result = vmlaq_f32(g_XMOne, Result, x2); + Result = vmulq_f32(Result, fsign); + return Result; +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_cos_ps(V); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + // Map V to x in [-pi,pi]. 
+ XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x). + XMVECTOR sign = _mm_and_ps(x, g_XMNegativeZero); + __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + __m128 absx = _mm_andnot_ps(sign, x); // |x| + __m128 rflx = _mm_sub_ps(c, x); + __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi); + __m128 select0 = _mm_and_ps(comp, x); + __m128 select1 = _mm_andnot_ps(comp, rflx); + x = _mm_or_ps(select0, select1); + select0 = _mm_and_ps(comp, g_XMOne); + select1 = _mm_andnot_ps(comp, g_XMNegativeOne); + sign = _mm_or_ps(select0, select1); + + __m128 x2 = _mm_mul_ps(x, x); + + // Compute polynomial approximation + const XMVECTOR CEC = g_XMCosCoefficients1; + __m128 vConstantsB = XM_PERMUTE_PS(CEC, _MM_SHUFFLE(3, 3, 3, 3)); + __m128 vConstants = XM_PERMUTE_PS(CEC, _MM_SHUFFLE(2, 2, 2, 2)); + __m128 Result = XM_FMADD_PS(vConstantsB, x2, vConstants); + + vConstants = XM_PERMUTE_PS(CEC, _MM_SHUFFLE(1, 1, 1, 1)); + Result = XM_FMADD_PS(Result, x2, vConstants); + Result = XM_FMADD_PS(Result, x2, g_XMOne); + Result = _mm_mul_ps(Result, sign); + return Result; +#endif +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline void XM_CALLCONV XMVectorSinCosEst +( + XMVECTOR* pSin, + XMVECTOR* pCos, + FXMVECTOR V +) noexcept +{ + assert(pSin != nullptr); + assert(pCos != nullptr); + + // 7/6-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Sin = { { { + sinf(V.vector4_f32[0]), + sinf(V.vector4_f32[1]), + sinf(V.vector4_f32[2]), + sinf(V.vector4_f32[3]) + } } }; + + XMVECTORF32 Cos = { { { + cosf(V.vector4_f32[0]), + cosf(V.vector4_f32[1]), + cosf(V.vector4_f32[2]), + cosf(V.vector4_f32[3]) + } } }; + + *pSin = Sin.v; + *pCos = Cos.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Force the value within the bounds of pi + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x). + uint32x4_t sign = vandq_u32(vreinterpretq_u32_f32(x), g_XMNegativeZero); + uint32x4_t c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + float32x4_t absx = vabsq_f32(x); + float32x4_t rflx = vsubq_f32(vreinterpretq_f32_u32(c), x); + uint32x4_t comp = vcleq_f32(absx, g_XMHalfPi); + x = vbslq_f32(comp, x, rflx); + float32x4_t fsign = vbslq_f32(comp, g_XMOne, g_XMNegativeOne); + + float32x4_t x2 = vmulq_f32(x, x); + + // Compute polynomial approximation for sine + const XMVECTOR SEC = g_XMSinCoefficients1; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(SEC), 0); + XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_high_f32(SEC), 1); + + vConstants = vdupq_lane_f32(vget_low_f32(SEC), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + Result = vmlaq_f32(g_XMOne, Result, x2); + *pSin = vmulq_f32(Result, x); + + // Compute polynomial approximation + const XMVECTOR CEC = g_XMCosCoefficients1; + vConstants = vdupq_lane_f32(vget_high_f32(CEC), 0); + Result = vmlaq_lane_f32(vConstants, x2, vget_high_f32(CEC), 1); + + vConstants = vdupq_lane_f32(vget_low_f32(CEC), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + Result = vmlaq_f32(g_XMOne, Result, x2); + *pCos = vmulq_f32(Result, fsign); +#elif defined(_XM_SSE_INTRINSICS_) + // Force the value within the bounds of pi + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with sin(y) = sin(x), cos(y) = sign*cos(x). 
+ XMVECTOR sign = _mm_and_ps(x, g_XMNegativeZero); + __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + __m128 absx = _mm_andnot_ps(sign, x); // |x| + __m128 rflx = _mm_sub_ps(c, x); + __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi); + __m128 select0 = _mm_and_ps(comp, x); + __m128 select1 = _mm_andnot_ps(comp, rflx); + x = _mm_or_ps(select0, select1); + select0 = _mm_and_ps(comp, g_XMOne); + select1 = _mm_andnot_ps(comp, g_XMNegativeOne); + sign = _mm_or_ps(select0, select1); + + __m128 x2 = _mm_mul_ps(x, x); + + // Compute polynomial approximation for sine + const XMVECTOR SEC = g_XMSinCoefficients1; + __m128 vConstantsB = XM_PERMUTE_PS(SEC, _MM_SHUFFLE(3, 3, 3, 3)); + __m128 vConstants = XM_PERMUTE_PS(SEC, _MM_SHUFFLE(2, 2, 2, 2)); + __m128 Result = XM_FMADD_PS(vConstantsB, x2, vConstants); + + vConstants = XM_PERMUTE_PS(SEC, _MM_SHUFFLE(1, 1, 1, 1)); + Result = XM_FMADD_PS(Result, x2, vConstants); + Result = XM_FMADD_PS(Result, x2, g_XMOne); + Result = _mm_mul_ps(Result, x); + *pSin = Result; + + // Compute polynomial approximation for cosine + const XMVECTOR CEC = g_XMCosCoefficients1; + vConstantsB = XM_PERMUTE_PS(CEC, _MM_SHUFFLE(3, 3, 3, 3)); + vConstants = XM_PERMUTE_PS(CEC, _MM_SHUFFLE(2, 2, 2, 2)); + Result = XM_FMADD_PS(vConstantsB, x2, vConstants); + + vConstants = XM_PERMUTE_PS(CEC, _MM_SHUFFLE(1, 1, 1, 1)); + Result = XM_FMADD_PS(Result, x2, vConstants); + Result = XM_FMADD_PS(Result, x2, g_XMOne); + Result = _mm_mul_ps(Result, sign); + *pCos = Result; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorTanEst(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + tanf(V.vector4_f32[0]), + tanf(V.vector4_f32[1]), + tanf(V.vector4_f32[2]), + tanf(V.vector4_f32[3]) + } } }; + return Result.v; +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_tan_ps(V); + return Result; +#else + + XMVECTOR OneOverPi = XMVectorSplatW(g_XMTanEstCoefficients.v); + + XMVECTOR V1 = XMVectorMultiply(V, OneOverPi); + V1 = XMVectorRound(V1); + + V1 = XMVectorNegativeMultiplySubtract(g_XMPi.v, V1, V); + + XMVECTOR T0 = XMVectorSplatX(g_XMTanEstCoefficients.v); + XMVECTOR T1 = XMVectorSplatY(g_XMTanEstCoefficients.v); + XMVECTOR T2 = XMVectorSplatZ(g_XMTanEstCoefficients.v); + + XMVECTOR V2T2 = XMVectorNegativeMultiplySubtract(V1, V1, T2); + XMVECTOR V2 = XMVectorMultiply(V1, V1); + XMVECTOR V1T0 = XMVectorMultiply(V1, T0); + XMVECTOR V1T1 = XMVectorMultiply(V1, T1); + + XMVECTOR D = XMVectorReciprocalEst(V2T2); + XMVECTOR N = XMVectorMultiplyAdd(V2, V1T1, V1T0); + + return XMVectorMultiply(N, D); + +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorASinEst(FXMVECTOR V) noexcept +{ + // 3-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result; + Result.f[0] = asinf(V.vector4_f32[0]); + Result.f[1] = asinf(V.vector4_f32[1]); + Result.f[2] = asinf(V.vector4_f32[2]); + Result.f[3] = asinf(V.vector4_f32[3]); + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t nonnegative = vcgeq_f32(V, g_XMZero); + float32x4_t x = vabsq_f32(V); + + // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. 
+ float32x4_t oneMValue = vsubq_f32(g_XMOne, x); + float32x4_t clampOneMValue = vmaxq_f32(g_XMZero, oneMValue); + float32x4_t root = XMVectorSqrt(clampOneMValue); + + // Compute polynomial approximation + const XMVECTOR AEC = g_XMArcEstCoefficients; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AEC), 0); + XMVECTOR t0 = vmlaq_lane_f32(vConstants, x, vget_high_f32(AEC), 1); + + vConstants = vdupq_lane_f32(vget_low_f32(AEC), 1); + t0 = vmlaq_f32(vConstants, t0, x); + + vConstants = vdupq_lane_f32(vget_low_f32(AEC), 0); + t0 = vmlaq_f32(vConstants, t0, x); + t0 = vmulq_f32(t0, root); + + float32x4_t t1 = vsubq_f32(g_XMPi, t0); + t0 = vbslq_f32(nonnegative, t0, t1); + t0 = vsubq_f32(g_XMHalfPi, t0); + return t0; +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_asin_ps(V); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + __m128 nonnegative = _mm_cmpge_ps(V, g_XMZero); + __m128 mvalue = _mm_sub_ps(g_XMZero, V); + __m128 x = _mm_max_ps(V, mvalue); // |V| + + // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. + __m128 oneMValue = _mm_sub_ps(g_XMOne, x); + __m128 clampOneMValue = _mm_max_ps(g_XMZero, oneMValue); + __m128 root = _mm_sqrt_ps(clampOneMValue); // sqrt(1-|V|) + + // Compute polynomial approximation + const XMVECTOR AEC = g_XMArcEstCoefficients; + __m128 vConstantsB = XM_PERMUTE_PS(AEC, _MM_SHUFFLE(3, 3, 3, 3)); + __m128 vConstants = XM_PERMUTE_PS(AEC, _MM_SHUFFLE(2, 2, 2, 2)); + __m128 t0 = XM_FMADD_PS(vConstantsB, x, vConstants); + + vConstants = XM_PERMUTE_PS(AEC, _MM_SHUFFLE(1, 1, 1, 1)); + t0 = XM_FMADD_PS(t0, x, vConstants); + + vConstants = XM_PERMUTE_PS(AEC, _MM_SHUFFLE(0, 0, 0, 0)); + t0 = XM_FMADD_PS(t0, x, vConstants); + t0 = _mm_mul_ps(t0, root); + + __m128 t1 = _mm_sub_ps(g_XMPi, t0); + t0 = _mm_and_ps(nonnegative, t0); + t1 = _mm_andnot_ps(nonnegative, t1); + t0 = _mm_or_ps(t0, t1); + t0 = _mm_sub_ps(g_XMHalfPi, t0); + return t0; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorACosEst(FXMVECTOR V) noexcept +{ + // 3-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + acosf(V.vector4_f32[0]), + acosf(V.vector4_f32[1]), + acosf(V.vector4_f32[2]), + acosf(V.vector4_f32[3]) + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t nonnegative = vcgeq_f32(V, g_XMZero); + float32x4_t x = vabsq_f32(V); + + // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. + float32x4_t oneMValue = vsubq_f32(g_XMOne, x); + float32x4_t clampOneMValue = vmaxq_f32(g_XMZero, oneMValue); + float32x4_t root = XMVectorSqrt(clampOneMValue); + + // Compute polynomial approximation + const XMVECTOR AEC = g_XMArcEstCoefficients; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AEC), 0); + XMVECTOR t0 = vmlaq_lane_f32(vConstants, x, vget_high_f32(AEC), 1); + + vConstants = vdupq_lane_f32(vget_low_f32(AEC), 1); + t0 = vmlaq_f32(vConstants, t0, x); + + vConstants = vdupq_lane_f32(vget_low_f32(AEC), 0); + t0 = vmlaq_f32(vConstants, t0, x); + t0 = vmulq_f32(t0, root); + + float32x4_t t1 = vsubq_f32(g_XMPi, t0); + t0 = vbslq_f32(nonnegative, t0, t1); + return t0; +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_acos_ps(V); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + __m128 nonnegative = _mm_cmpge_ps(V, g_XMZero); + __m128 mvalue = _mm_sub_ps(g_XMZero, V); + __m128 x = _mm_max_ps(V, mvalue); // |V| + + // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. 
+ __m128 oneMValue = _mm_sub_ps(g_XMOne, x); + __m128 clampOneMValue = _mm_max_ps(g_XMZero, oneMValue); + __m128 root = _mm_sqrt_ps(clampOneMValue); // sqrt(1-|V|) + + // Compute polynomial approximation + const XMVECTOR AEC = g_XMArcEstCoefficients; + __m128 vConstantsB = XM_PERMUTE_PS(AEC, _MM_SHUFFLE(3, 3, 3, 3)); + __m128 vConstants = XM_PERMUTE_PS(AEC, _MM_SHUFFLE(2, 2, 2, 2)); + __m128 t0 = XM_FMADD_PS(vConstantsB, x, vConstants); + + vConstants = XM_PERMUTE_PS(AEC, _MM_SHUFFLE(1, 1, 1, 1)); + t0 = XM_FMADD_PS(t0, x, vConstants); + + vConstants = XM_PERMUTE_PS(AEC, _MM_SHUFFLE(0, 0, 0, 0)); + t0 = XM_FMADD_PS(t0, x, vConstants); + t0 = _mm_mul_ps(t0, root); + + __m128 t1 = _mm_sub_ps(g_XMPi, t0); + t0 = _mm_and_ps(nonnegative, t0); + t1 = _mm_andnot_ps(nonnegative, t1); + t0 = _mm_or_ps(t0, t1); + return t0; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorATanEst(FXMVECTOR V) noexcept +{ + // 9-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + atanf(V.vector4_f32[0]), + atanf(V.vector4_f32[1]), + atanf(V.vector4_f32[2]), + atanf(V.vector4_f32[3]) + } } }; + return Result.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t absV = vabsq_f32(V); + float32x4_t invV = XMVectorReciprocalEst(V); + uint32x4_t comp = vcgtq_f32(V, g_XMOne); + float32x4_t sign = vbslq_f32(comp, g_XMOne, g_XMNegativeOne); + comp = vcleq_f32(absV, g_XMOne); + sign = vbslq_f32(comp, g_XMZero, sign); + float32x4_t x = vbslq_f32(comp, V, invV); + + float32x4_t x2 = vmulq_f32(x, x); + + // Compute polynomial approximation + const XMVECTOR AEC = g_XMATanEstCoefficients1; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AEC), 0); + XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_high_f32(AEC), 1); + + vConstants = vdupq_lane_f32(vget_low_f32(AEC), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(AEC), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + // ATanEstCoefficients0 is already splatted + Result = vmlaq_f32(g_XMATanEstCoefficients0, Result, x2); + Result = vmulq_f32(Result, x); + + float32x4_t result1 = vmulq_f32(sign, g_XMHalfPi); + result1 = vsubq_f32(result1, Result); + + comp = vceqq_f32(sign, g_XMZero); + Result = vbslq_f32(comp, Result, result1); + return Result; +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_atan_ps(V); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + __m128 absV = XMVectorAbs(V); + __m128 invV = _mm_div_ps(g_XMOne, V); + __m128 comp = _mm_cmpgt_ps(V, g_XMOne); + __m128 select0 = _mm_and_ps(comp, g_XMOne); + __m128 select1 = _mm_andnot_ps(comp, g_XMNegativeOne); + __m128 sign = _mm_or_ps(select0, select1); + comp = _mm_cmple_ps(absV, g_XMOne); + select0 = _mm_and_ps(comp, g_XMZero); + select1 = _mm_andnot_ps(comp, sign); + sign = _mm_or_ps(select0, select1); + select0 = _mm_and_ps(comp, V); + select1 = _mm_andnot_ps(comp, invV); + __m128 x = _mm_or_ps(select0, select1); + + __m128 x2 = _mm_mul_ps(x, x); + + // Compute polynomial approximation + const XMVECTOR AEC = g_XMATanEstCoefficients1; + __m128 vConstantsB = XM_PERMUTE_PS(AEC, _MM_SHUFFLE(3, 3, 3, 3)); + __m128 vConstants = XM_PERMUTE_PS(AEC, _MM_SHUFFLE(2, 2, 2, 2)); + __m128 Result = XM_FMADD_PS(vConstantsB, x2, vConstants); + + vConstants = XM_PERMUTE_PS(AEC, _MM_SHUFFLE(1, 1, 1, 1)); + Result = XM_FMADD_PS(Result, x2, vConstants); + + vConstants = XM_PERMUTE_PS(AEC, _MM_SHUFFLE(0, 0, 0, 0)); + Result = 
XM_FMADD_PS(Result, x2, vConstants); + // ATanEstCoefficients0 is already splatted + Result = XM_FMADD_PS(Result, x2, g_XMATanEstCoefficients0); + Result = _mm_mul_ps(Result, x); + __m128 result1 = _mm_mul_ps(sign, g_XMHalfPi); + result1 = _mm_sub_ps(result1, Result); + + comp = _mm_cmpeq_ps(sign, g_XMZero); + select0 = _mm_and_ps(comp, Result); + select1 = _mm_andnot_ps(comp, result1); + Result = _mm_or_ps(select0, select1); + return Result; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorATan2Est +( + FXMVECTOR Y, + FXMVECTOR X +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 Result = { { { + atan2f(Y.vector4_f32[0], X.vector4_f32[0]), + atan2f(Y.vector4_f32[1], X.vector4_f32[1]), + atan2f(Y.vector4_f32[2], X.vector4_f32[2]), + atan2f(Y.vector4_f32[3], X.vector4_f32[3]), + } } }; + return Result.v; +#elif defined(_XM_SVML_INTRINSICS_) + XMVECTOR Result = _mm_atan2_ps(Y, X); + return Result; +#else + + static const XMVECTORF32 ATan2Constants = { { { XM_PI, XM_PIDIV2, XM_PIDIV4, 2.3561944905f /* Pi*3/4 */ } } }; + + const XMVECTOR Zero = XMVectorZero(); + XMVECTOR ATanResultValid = XMVectorTrueInt(); + + XMVECTOR Pi = XMVectorSplatX(ATan2Constants); + XMVECTOR PiOverTwo = XMVectorSplatY(ATan2Constants); + XMVECTOR PiOverFour = XMVectorSplatZ(ATan2Constants); + XMVECTOR ThreePiOverFour = XMVectorSplatW(ATan2Constants); + + XMVECTOR YEqualsZero = XMVectorEqual(Y, Zero); + XMVECTOR XEqualsZero = XMVectorEqual(X, Zero); + XMVECTOR XIsPositive = XMVectorAndInt(X, g_XMNegativeZero.v); + XIsPositive = XMVectorEqualInt(XIsPositive, Zero); + XMVECTOR YEqualsInfinity = XMVectorIsInfinite(Y); + XMVECTOR XEqualsInfinity = XMVectorIsInfinite(X); + + XMVECTOR YSign = XMVectorAndInt(Y, g_XMNegativeZero.v); + Pi = XMVectorOrInt(Pi, YSign); + PiOverTwo = XMVectorOrInt(PiOverTwo, YSign); + PiOverFour = XMVectorOrInt(PiOverFour, YSign); + ThreePiOverFour = XMVectorOrInt(ThreePiOverFour, YSign); + + XMVECTOR R1 = XMVectorSelect(Pi, YSign, XIsPositive); + XMVECTOR R2 = XMVectorSelect(ATanResultValid, PiOverTwo, XEqualsZero); + XMVECTOR R3 = XMVectorSelect(R2, R1, YEqualsZero); + XMVECTOR R4 = XMVectorSelect(ThreePiOverFour, PiOverFour, XIsPositive); + XMVECTOR R5 = XMVectorSelect(PiOverTwo, R4, XEqualsInfinity); + XMVECTOR Result = XMVectorSelect(R3, R5, YEqualsInfinity); + ATanResultValid = XMVectorEqualInt(Result, ATanResultValid); + + XMVECTOR Reciprocal = XMVectorReciprocalEst(X); + XMVECTOR V = XMVectorMultiply(Y, Reciprocal); + XMVECTOR R0 = XMVectorATanEst(V); + + R1 = XMVectorSelect(Pi, g_XMNegativeZero, XIsPositive); + R2 = XMVectorAdd(R0, R1); + + Result = XMVectorSelect(Result, R2, ATanResultValid); + + return Result; + +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorLerp +( + FXMVECTOR V0, + FXMVECTOR V1, + float t +) noexcept +{ + // V0 + t * (V1 - V0) + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Scale = XMVectorReplicate(t); + XMVECTOR Length = XMVectorSubtract(V1, V0); + return XMVectorMultiplyAdd(Length, Scale, V0); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR L = vsubq_f32(V1, V0); + return vmlaq_n_f32(V0, L, t); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR L = _mm_sub_ps(V1, V0); + XMVECTOR S = _mm_set_ps1(t); + return XM_FMADD_PS(L, S, V0); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorLerpV +( + FXMVECTOR V0, 
+ FXMVECTOR V1, + FXMVECTOR T +) noexcept +{ + // V0 + T * (V1 - V0) + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Length = XMVectorSubtract(V1, V0); + return XMVectorMultiplyAdd(Length, T, V0); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR L = vsubq_f32(V1, V0); + return vmlaq_f32(V0, L, T); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR Length = _mm_sub_ps(V1, V0); + return XM_FMADD_PS(Length, T, V0); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorHermite +( + FXMVECTOR Position0, + FXMVECTOR Tangent0, + FXMVECTOR Position1, + GXMVECTOR Tangent1, + float t +) noexcept +{ + // Result = (2 * t^3 - 3 * t^2 + 1) * Position0 + + // (t^3 - 2 * t^2 + t) * Tangent0 + + // (-2 * t^3 + 3 * t^2) * Position1 + + // (t^3 - t^2) * Tangent1 + +#if defined(_XM_NO_INTRINSICS_) + + float t2 = t * t; + float t3 = t * t2; + + XMVECTOR P0 = XMVectorReplicate(2.0f * t3 - 3.0f * t2 + 1.0f); + XMVECTOR T0 = XMVectorReplicate(t3 - 2.0f * t2 + t); + XMVECTOR P1 = XMVectorReplicate(-2.0f * t3 + 3.0f * t2); + XMVECTOR T1 = XMVectorReplicate(t3 - t2); + + XMVECTOR Result = XMVectorMultiply(P0, Position0); + Result = XMVectorMultiplyAdd(T0, Tangent0, Result); + Result = XMVectorMultiplyAdd(P1, Position1, Result); + Result = XMVectorMultiplyAdd(T1, Tangent1, Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float t2 = t * t; + float t3 = t * t2; + + float p0 = 2.0f * t3 - 3.0f * t2 + 1.0f; + float t0 = t3 - 2.0f * t2 + t; + float p1 = -2.0f * t3 + 3.0f * t2; + float t1 = t3 - t2; + + XMVECTOR vResult = vmulq_n_f32(Position0, p0); + vResult = vmlaq_n_f32(vResult, Tangent0, t0); + vResult = vmlaq_n_f32(vResult, Position1, p1); + vResult = vmlaq_n_f32(vResult, Tangent1, t1); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + float t2 = t * t; + float t3 = t * t2; + + XMVECTOR P0 = _mm_set_ps1(2.0f * t3 - 3.0f * t2 + 1.0f); + XMVECTOR T0 = _mm_set_ps1(t3 - 2.0f * t2 + t); + XMVECTOR P1 = _mm_set_ps1(-2.0f * t3 + 3.0f * t2); + XMVECTOR T1 = _mm_set_ps1(t3 - t2); + + XMVECTOR vResult = _mm_mul_ps(P0, Position0); + vResult = XM_FMADD_PS(Tangent0, T0, vResult); + vResult = XM_FMADD_PS(Position1, P1, vResult); + vResult = XM_FMADD_PS(Tangent1, T1, vResult); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorHermiteV +( + FXMVECTOR Position0, + FXMVECTOR Tangent0, + FXMVECTOR Position1, + GXMVECTOR Tangent1, + HXMVECTOR T +) noexcept +{ + // Result = (2 * t^3 - 3 * t^2 + 1) * Position0 + + // (t^3 - 2 * t^2 + t) * Tangent0 + + // (-2 * t^3 + 3 * t^2) * Position1 + + // (t^3 - t^2) * Tangent1 + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR T2 = XMVectorMultiply(T, T); + XMVECTOR T3 = XMVectorMultiply(T, T2); + + XMVECTOR P0 = XMVectorReplicate(2.0f * T3.vector4_f32[0] - 3.0f * T2.vector4_f32[0] + 1.0f); + XMVECTOR T0 = XMVectorReplicate(T3.vector4_f32[1] - 2.0f * T2.vector4_f32[1] + T.vector4_f32[1]); + XMVECTOR P1 = XMVectorReplicate(-2.0f * T3.vector4_f32[2] + 3.0f * T2.vector4_f32[2]); + XMVECTOR T1 = XMVectorReplicate(T3.vector4_f32[3] - T2.vector4_f32[3]); + + XMVECTOR Result = XMVectorMultiply(P0, Position0); + Result = XMVectorMultiplyAdd(T0, Tangent0, Result); + Result = XMVectorMultiplyAdd(P1, Position1, Result); + Result = XMVectorMultiplyAdd(T1, Tangent1, Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 CatMulT2 = { { { -3.0f, -2.0f, 3.0f, -1.0f } 
} }; + static const XMVECTORF32 CatMulT3 = { { { 2.0f, 1.0f, -2.0f, 1.0f } } }; + + XMVECTOR T2 = vmulq_f32(T, T); + XMVECTOR T3 = vmulq_f32(T, T2); + // Mul by the constants against t^2 + T2 = vmulq_f32(T2, CatMulT2); + // Mul by the constants against t^3 + T3 = vmlaq_f32(T2, T3, CatMulT3); + // T3 now has the pre-result. + // I need to add t.y only + T2 = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T), g_XMMaskY)); + T3 = vaddq_f32(T3, T2); + // Add 1.0f to x + T3 = vaddq_f32(T3, g_XMIdentityR0); + // Now, I have the constants created + // Mul the x constant to Position0 + XMVECTOR vResult = vmulq_lane_f32(Position0, vget_low_f32(T3), 0); // T3[0] + // Mul the y constant to Tangent0 + vResult = vmlaq_lane_f32(vResult, Tangent0, vget_low_f32(T3), 1); // T3[1] + // Mul the z constant to Position1 + vResult = vmlaq_lane_f32(vResult, Position1, vget_high_f32(T3), 0); // T3[2] + // Mul the w constant to Tangent1 + vResult = vmlaq_lane_f32(vResult, Tangent1, vget_high_f32(T3), 1); // T3[3] + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 CatMulT2 = { { { -3.0f, -2.0f, 3.0f, -1.0f } } }; + static const XMVECTORF32 CatMulT3 = { { { 2.0f, 1.0f, -2.0f, 1.0f } } }; + + XMVECTOR T2 = _mm_mul_ps(T, T); + XMVECTOR T3 = _mm_mul_ps(T, T2); + // Mul by the constants against t^2 + T2 = _mm_mul_ps(T2, CatMulT2); + // Mul by the constants against t^3 + T3 = XM_FMADD_PS(T3, CatMulT3, T2); + // T3 now has the pre-result. + // I need to add t.y only + T2 = _mm_and_ps(T, g_XMMaskY); + T3 = _mm_add_ps(T3, T2); + // Add 1.0f to x + T3 = _mm_add_ps(T3, g_XMIdentityR0); + // Now, I have the constants created + // Mul the x constant to Position0 + XMVECTOR vResult = XM_PERMUTE_PS(T3, _MM_SHUFFLE(0, 0, 0, 0)); + vResult = _mm_mul_ps(vResult, Position0); + // Mul the y constant to Tangent0 + T2 = XM_PERMUTE_PS(T3, _MM_SHUFFLE(1, 1, 1, 1)); + vResult = XM_FMADD_PS(T2, Tangent0, vResult); + // Mul the z constant to Position1 + T2 = XM_PERMUTE_PS(T3, _MM_SHUFFLE(2, 2, 2, 2)); + vResult = XM_FMADD_PS(T2, Position1, vResult); + // Mul the w constant to Tangent1 + T3 = XM_PERMUTE_PS(T3, _MM_SHUFFLE(3, 3, 3, 3)); + vResult = XM_FMADD_PS(T3, Tangent1, vResult); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorCatmullRom +( + FXMVECTOR Position0, + FXMVECTOR Position1, + FXMVECTOR Position2, + GXMVECTOR Position3, + float t +) noexcept +{ + // Result = ((-t^3 + 2 * t^2 - t) * Position0 + + // (3 * t^3 - 5 * t^2 + 2) * Position1 + + // (-3 * t^3 + 4 * t^2 + t) * Position2 + + // (t^3 - t^2) * Position3) * 0.5 + +#if defined(_XM_NO_INTRINSICS_) + + float t2 = t * t; + float t3 = t * t2; + + XMVECTOR P0 = XMVectorReplicate((-t3 + 2.0f * t2 - t) * 0.5f); + XMVECTOR P1 = XMVectorReplicate((3.0f * t3 - 5.0f * t2 + 2.0f) * 0.5f); + XMVECTOR P2 = XMVectorReplicate((-3.0f * t3 + 4.0f * t2 + t) * 0.5f); + XMVECTOR P3 = XMVectorReplicate((t3 - t2) * 0.5f); + + XMVECTOR Result = XMVectorMultiply(P0, Position0); + Result = XMVectorMultiplyAdd(P1, Position1, Result); + Result = XMVectorMultiplyAdd(P2, Position2, Result); + Result = XMVectorMultiplyAdd(P3, Position3, Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float t2 = t * t; + float t3 = t * t2; + + float p0 = (-t3 + 2.0f * t2 - t) * 0.5f; + float p1 = (3.0f * t3 - 5.0f * t2 + 2.0f) * 0.5f; + float p2 = (-3.0f * t3 + 4.0f * t2 + t) * 0.5f; + float p3 = (t3 - t2) * 0.5f; + + XMVECTOR P1 = vmulq_n_f32(Position1, p1); + 
XMVECTOR P0 = vmlaq_n_f32(P1, Position0, p0); + XMVECTOR P3 = vmulq_n_f32(Position3, p3); + XMVECTOR P2 = vmlaq_n_f32(P3, Position2, p2); + P0 = vaddq_f32(P0, P2); + return P0; +#elif defined(_XM_SSE_INTRINSICS_) + float t2 = t * t; + float t3 = t * t2; + + XMVECTOR P0 = _mm_set_ps1((-t3 + 2.0f * t2 - t) * 0.5f); + XMVECTOR P1 = _mm_set_ps1((3.0f * t3 - 5.0f * t2 + 2.0f) * 0.5f); + XMVECTOR P2 = _mm_set_ps1((-3.0f * t3 + 4.0f * t2 + t) * 0.5f); + XMVECTOR P3 = _mm_set_ps1((t3 - t2) * 0.5f); + + P1 = _mm_mul_ps(Position1, P1); + P0 = XM_FMADD_PS(Position0, P0, P1); + P3 = _mm_mul_ps(Position3, P3); + P2 = XM_FMADD_PS(Position2, P2, P3); + P0 = _mm_add_ps(P0, P2); + return P0; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorCatmullRomV +( + FXMVECTOR Position0, + FXMVECTOR Position1, + FXMVECTOR Position2, + GXMVECTOR Position3, + HXMVECTOR T +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + float fx = T.vector4_f32[0]; + float fy = T.vector4_f32[1]; + float fz = T.vector4_f32[2]; + float fw = T.vector4_f32[3]; + XMVECTORF32 vResult = { { { + 0.5f * ((-fx * fx * fx + 2 * fx * fx - fx) * Position0.vector4_f32[0] + + (3 * fx * fx * fx - 5 * fx * fx + 2) * Position1.vector4_f32[0] + + (-3 * fx * fx * fx + 4 * fx * fx + fx) * Position2.vector4_f32[0] + + (fx * fx * fx - fx * fx) * Position3.vector4_f32[0]), + + 0.5f * ((-fy * fy * fy + 2 * fy * fy - fy) * Position0.vector4_f32[1] + + (3 * fy * fy * fy - 5 * fy * fy + 2) * Position1.vector4_f32[1] + + (-3 * fy * fy * fy + 4 * fy * fy + fy) * Position2.vector4_f32[1] + + (fy * fy * fy - fy * fy) * Position3.vector4_f32[1]), + + 0.5f * ((-fz * fz * fz + 2 * fz * fz - fz) * Position0.vector4_f32[2] + + (3 * fz * fz * fz - 5 * fz * fz + 2) * Position1.vector4_f32[2] + + (-3 * fz * fz * fz + 4 * fz * fz + fz) * Position2.vector4_f32[2] + + (fz * fz * fz - fz * fz) * Position3.vector4_f32[2]), + + 0.5f * ((-fw * fw * fw + 2 * fw * fw - fw) * Position0.vector4_f32[3] + + (3 * fw * fw * fw - 5 * fw * fw + 2) * Position1.vector4_f32[3] + + (-3 * fw * fw * fw + 4 * fw * fw + fw) * Position2.vector4_f32[3] + + (fw * fw * fw - fw * fw) * Position3.vector4_f32[3]) + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 Catmul2 = { { { 2.0f, 2.0f, 2.0f, 2.0f } } }; + static const XMVECTORF32 Catmul3 = { { { 3.0f, 3.0f, 3.0f, 3.0f } } }; + static const XMVECTORF32 Catmul4 = { { { 4.0f, 4.0f, 4.0f, 4.0f } } }; + static const XMVECTORF32 Catmul5 = { { { 5.0f, 5.0f, 5.0f, 5.0f } } }; + // Cache T^2 and T^3 + XMVECTOR T2 = vmulq_f32(T, T); + XMVECTOR T3 = vmulq_f32(T, T2); + // Perform the Position0 term + XMVECTOR vResult = vaddq_f32(T2, T2); + vResult = vsubq_f32(vResult, T); + vResult = vsubq_f32(vResult, T3); + vResult = vmulq_f32(vResult, Position0); + // Perform the Position1 term and add + XMVECTOR vTemp = vmulq_f32(T3, Catmul3); + vTemp = vmlsq_f32(vTemp, T2, Catmul5); + vTemp = vaddq_f32(vTemp, Catmul2); + vResult = vmlaq_f32(vResult, vTemp, Position1); + // Perform the Position2 term and add + vTemp = vmulq_f32(T2, Catmul4); + vTemp = vmlsq_f32(vTemp, T3, Catmul3); + vTemp = vaddq_f32(vTemp, T); + vResult = vmlaq_f32(vResult, vTemp, Position2); + // Position3 is the last term + T3 = vsubq_f32(T3, T2); + vResult = vmlaq_f32(vResult, T3, Position3); + // Multiply by 0.5f and exit + vResult = vmulq_f32(vResult, g_XMOneHalf); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Catmul2 = { { { 2.0f, 2.0f, 
2.0f, 2.0f } } }; + static const XMVECTORF32 Catmul3 = { { { 3.0f, 3.0f, 3.0f, 3.0f } } }; + static const XMVECTORF32 Catmul4 = { { { 4.0f, 4.0f, 4.0f, 4.0f } } }; + static const XMVECTORF32 Catmul5 = { { { 5.0f, 5.0f, 5.0f, 5.0f } } }; + // Cache T^2 and T^3 + XMVECTOR T2 = _mm_mul_ps(T, T); + XMVECTOR T3 = _mm_mul_ps(T, T2); + // Perform the Position0 term + XMVECTOR vResult = _mm_add_ps(T2, T2); + vResult = _mm_sub_ps(vResult, T); + vResult = _mm_sub_ps(vResult, T3); + vResult = _mm_mul_ps(vResult, Position0); + // Perform the Position1 term and add + XMVECTOR vTemp = _mm_mul_ps(T3, Catmul3); + vTemp = XM_FNMADD_PS(T2, Catmul5, vTemp); + vTemp = _mm_add_ps(vTemp, Catmul2); + vResult = XM_FMADD_PS(vTemp, Position1, vResult); + // Perform the Position2 term and add + vTemp = _mm_mul_ps(T2, Catmul4); + vTemp = XM_FNMADD_PS(T3, Catmul3, vTemp); + vTemp = _mm_add_ps(vTemp, T); + vResult = XM_FMADD_PS(vTemp, Position2, vResult); + // Position3 is the last term + T3 = _mm_sub_ps(T3, T2); + vResult = XM_FMADD_PS(T3, Position3, vResult); + // Multiply by 0.5f and exit + vResult = _mm_mul_ps(vResult, g_XMOneHalf); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorBaryCentric +( + FXMVECTOR Position0, + FXMVECTOR Position1, + FXMVECTOR Position2, + float f, + float g +) noexcept +{ + // Result = Position0 + f * (Position1 - Position0) + g * (Position2 - Position0) + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR P10 = XMVectorSubtract(Position1, Position0); + XMVECTOR ScaleF = XMVectorReplicate(f); + + XMVECTOR P20 = XMVectorSubtract(Position2, Position0); + XMVECTOR ScaleG = XMVectorReplicate(g); + + XMVECTOR Result = XMVectorMultiplyAdd(P10, ScaleF, Position0); + Result = XMVectorMultiplyAdd(P20, ScaleG, Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR R1 = vsubq_f32(Position1, Position0); + XMVECTOR R2 = vsubq_f32(Position2, Position0); + R1 = vmlaq_n_f32(Position0, R1, f); + return vmlaq_n_f32(R1, R2, g); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR R1 = _mm_sub_ps(Position1, Position0); + XMVECTOR R2 = _mm_sub_ps(Position2, Position0); + XMVECTOR SF = _mm_set_ps1(f); + R1 = XM_FMADD_PS(R1, SF, Position0); + XMVECTOR SG = _mm_set_ps1(g); + return XM_FMADD_PS(R2, SG, R1); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorBaryCentricV +( + FXMVECTOR Position0, + FXMVECTOR Position1, + FXMVECTOR Position2, + GXMVECTOR F, + HXMVECTOR G +) noexcept +{ + // Result = Position0 + f * (Position1 - Position0) + g * (Position2 - Position0) + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR P10 = XMVectorSubtract(Position1, Position0); + XMVECTOR P20 = XMVectorSubtract(Position2, Position0); + + XMVECTOR Result = XMVectorMultiplyAdd(P10, F, Position0); + Result = XMVectorMultiplyAdd(P20, G, Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR R1 = vsubq_f32(Position1, Position0); + XMVECTOR R2 = vsubq_f32(Position2, Position0); + R1 = vmlaq_f32(Position0, R1, F); + return vmlaq_f32(R1, R2, G); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR R1 = _mm_sub_ps(Position1, Position0); + XMVECTOR R2 = _mm_sub_ps(Position2, Position0); + R1 = XM_FMADD_PS(R1, F, Position0); + return XM_FMADD_PS(R2, G, R1); +#endif +} + +/**************************************************************************** + * + * 2D Vector + * + 
****************************************************************************/ + + //------------------------------------------------------------------------------ + // Comparison operations + //------------------------------------------------------------------------------ + + //------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2Equal +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] == V2.vector4_f32[0]) && (V1.vector4_f32[1] == V2.vector4_f32[1])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vceq_f32(vget_low_f32(V1), vget_low_f32(V2)); + return (vget_lane_u64(vreinterpret_u64_u32(vTemp), 0) == 0xFFFFFFFFFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpeq_ps(V1, V2); + // z and w are don't care + return (((_mm_movemask_ps(vTemp) & 3) == 3) != 0); +#endif +} + + +//------------------------------------------------------------------------------ + +inline uint32_t XM_CALLCONV XMVector2EqualR +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + uint32_t CR = 0; + if ((V1.vector4_f32[0] == V2.vector4_f32[0]) && + (V1.vector4_f32[1] == V2.vector4_f32[1])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_f32[0] != V2.vector4_f32[0]) && + (V1.vector4_f32[1] != V2.vector4_f32[1])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vceq_f32(vget_low_f32(V1), vget_low_f32(V2)); + uint64_t r = vget_lane_u64(vreinterpret_u64_u32(vTemp), 0); + uint32_t CR = 0; + if (r == 0xFFFFFFFFFFFFFFFFU) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!r) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpeq_ps(V1, V2); + // z and w are don't care + int iTest = _mm_movemask_ps(vTemp) & 3; + uint32_t CR = 0; + if (iTest == 3) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2EqualInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_u32[0] == V2.vector4_u32[0]) && (V1.vector4_u32[1] == V2.vector4_u32[1])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vceq_u32(vget_low_u32(vreinterpretq_u32_f32(V1)), vget_low_u32(vreinterpretq_u32_f32(V2))); + return (vget_lane_u64(vreinterpret_u64_u32(vTemp), 0) == 0xFFFFFFFFFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2)); + return (((_mm_movemask_ps(_mm_castsi128_ps(vTemp)) & 3) == 3) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline uint32_t XM_CALLCONV XMVector2EqualIntR +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + uint32_t CR = 0; + if ((V1.vector4_u32[0] == V2.vector4_u32[0]) && + (V1.vector4_u32[1] == V2.vector4_u32[1])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_u32[0] != V2.vector4_u32[0]) && + (V1.vector4_u32[1] != V2.vector4_u32[1])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vceq_u32(vget_low_u32(vreinterpretq_u32_f32(V1)), vget_low_u32(vreinterpretq_u32_f32(V2))); + uint64_t r = vget_lane_u64(vreinterpret_u64_u32(vTemp), 0); + 
uint32_t CR = 0; + if (r == 0xFFFFFFFFFFFFFFFFU) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!r) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2)); + int iTest = _mm_movemask_ps(_mm_castsi128_ps(vTemp)) & 3; + uint32_t CR = 0; + if (iTest == 3) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2NearEqual +( + FXMVECTOR V1, + FXMVECTOR V2, + FXMVECTOR Epsilon +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + float dx = fabsf(V1.vector4_f32[0] - V2.vector4_f32[0]); + float dy = fabsf(V1.vector4_f32[1] - V2.vector4_f32[1]); + return ((dx <= Epsilon.vector4_f32[0]) && + (dy <= Epsilon.vector4_f32[1])); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t vDelta = vsub_f32(vget_low_f32(V1), vget_low_f32(V2)); +#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES) + uint32x2_t vTemp = vacle_f32(vDelta, vget_low_u32(Epsilon)); +#else + uint32x2_t vTemp = vcle_f32(vabs_f32(vDelta), vget_low_f32(Epsilon)); +#endif + uint64_t r = vget_lane_u64(vreinterpret_u64_u32(vTemp), 0); + return (r == 0xFFFFFFFFFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + // Get the difference + XMVECTOR vDelta = _mm_sub_ps(V1, V2); + // Get the absolute value of the difference + XMVECTOR vTemp = _mm_setzero_ps(); + vTemp = _mm_sub_ps(vTemp, vDelta); + vTemp = _mm_max_ps(vTemp, vDelta); + vTemp = _mm_cmple_ps(vTemp, Epsilon); + // z and w are don't care + return (((_mm_movemask_ps(vTemp) & 3) == 0x3) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2NotEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] != V2.vector4_f32[0]) || (V1.vector4_f32[1] != V2.vector4_f32[1])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vceq_f32(vget_low_f32(V1), vget_low_f32(V2)); + return (vget_lane_u64(vreinterpret_u64_u32(vTemp), 0) != 0xFFFFFFFFFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpeq_ps(V1, V2); + // z and w are don't care + return (((_mm_movemask_ps(vTemp) & 3) != 3) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2NotEqualInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_u32[0] != V2.vector4_u32[0]) || (V1.vector4_u32[1] != V2.vector4_u32[1])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vceq_u32(vget_low_u32(vreinterpretq_u32_f32(V1)), vget_low_u32(vreinterpretq_u32_f32(V2))); + return (vget_lane_u64(vreinterpret_u64_u32(vTemp), 0) != 0xFFFFFFFFFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2)); + return (((_mm_movemask_ps(_mm_castsi128_ps(vTemp)) & 3) != 3) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2Greater +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] > V2.vector4_f32[0]) && (V1.vector4_f32[1] > V2.vector4_f32[1])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = 
vcgt_f32(vget_low_f32(V1), vget_low_f32(V2)); + return (vget_lane_u64(vreinterpret_u64_u32(vTemp), 0) == 0xFFFFFFFFFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpgt_ps(V1, V2); + // z and w are don't care + return (((_mm_movemask_ps(vTemp) & 3) == 3) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline uint32_t XM_CALLCONV XMVector2GreaterR +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + uint32_t CR = 0; + if ((V1.vector4_f32[0] > V2.vector4_f32[0]) && + (V1.vector4_f32[1] > V2.vector4_f32[1])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_f32[0] <= V2.vector4_f32[0]) && + (V1.vector4_f32[1] <= V2.vector4_f32[1])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vcgt_f32(vget_low_f32(V1), vget_low_f32(V2)); + uint64_t r = vget_lane_u64(vreinterpret_u64_u32(vTemp), 0); + uint32_t CR = 0; + if (r == 0xFFFFFFFFFFFFFFFFU) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!r) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpgt_ps(V1, V2); + int iTest = _mm_movemask_ps(vTemp) & 3; + uint32_t CR = 0; + if (iTest == 3) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2GreaterOrEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] >= V2.vector4_f32[0]) && (V1.vector4_f32[1] >= V2.vector4_f32[1])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vcge_f32(vget_low_f32(V1), vget_low_f32(V2)); + return (vget_lane_u64(vreinterpret_u64_u32(vTemp), 0) == 0xFFFFFFFFFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpge_ps(V1, V2); + return (((_mm_movemask_ps(vTemp) & 3) == 3) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline uint32_t XM_CALLCONV XMVector2GreaterOrEqualR +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + uint32_t CR = 0; + if ((V1.vector4_f32[0] >= V2.vector4_f32[0]) && + (V1.vector4_f32[1] >= V2.vector4_f32[1])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_f32[0] < V2.vector4_f32[0]) && + (V1.vector4_f32[1] < V2.vector4_f32[1])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vcge_f32(vget_low_f32(V1), vget_low_f32(V2)); + uint64_t r = vget_lane_u64(vreinterpret_u64_u32(vTemp), 0); + uint32_t CR = 0; + if (r == 0xFFFFFFFFFFFFFFFFU) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!r) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpge_ps(V1, V2); + int iTest = _mm_movemask_ps(vTemp) & 3; + uint32_t CR = 0; + if (iTest == 3) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2Less +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] < V2.vector4_f32[0]) && (V1.vector4_f32[1] < V2.vector4_f32[1])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vclt_f32(vget_low_f32(V1), 
vget_low_f32(V2)); + return (vget_lane_u64(vreinterpret_u64_u32(vTemp), 0) == 0xFFFFFFFFFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmplt_ps(V1, V2); + return (((_mm_movemask_ps(vTemp) & 3) == 3) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2LessOrEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] <= V2.vector4_f32[0]) && (V1.vector4_f32[1] <= V2.vector4_f32[1])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vcle_f32(vget_low_f32(V1), vget_low_f32(V2)); + return (vget_lane_u64(vreinterpret_u64_u32(vTemp), 0) == 0xFFFFFFFFFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmple_ps(V1, V2); + return (((_mm_movemask_ps(vTemp) & 3) == 3) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2InBounds +( + FXMVECTOR V, + FXMVECTOR Bounds +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) && + (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); + float32x2_t B = vget_low_f32(Bounds); + // Test if less than or equal + uint32x2_t ivTemp1 = vcle_f32(VL, B); + // Negate the bounds + float32x2_t vTemp2 = vneg_f32(B); + // Test if greater or equal (Reversed) + uint32x2_t ivTemp2 = vcle_f32(vTemp2, VL); + // Blend answers + ivTemp1 = vand_u32(ivTemp1, ivTemp2); + // x and y in bounds? + return (vget_lane_u64(vreinterpret_u64_u32(ivTemp1), 0) == 0xFFFFFFFFFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + // Test if less than or equal + XMVECTOR vTemp1 = _mm_cmple_ps(V, Bounds); + // Negate the bounds + XMVECTOR vTemp2 = _mm_mul_ps(Bounds, g_XMNegativeOne); + // Test if greater or equal (Reversed) + vTemp2 = _mm_cmple_ps(vTemp2, V); + // Blend answers + vTemp1 = _mm_and_ps(vTemp1, vTemp2); + // x and y in bounds? (z and w are don't care) + return (((_mm_movemask_ps(vTemp1) & 0x3) == 0x3) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +#if !defined(_XM_NO_INTRINSICS_) && defined(_MSC_VER) && !defined(__clang__) && !defined(__INTEL_COMPILER) +#pragma float_control(push) +#pragma float_control(precise, on) +#endif + +inline bool XM_CALLCONV XMVector2IsNaN(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (XMISNAN(V.vector4_f32[0]) || + XMISNAN(V.vector4_f32[1])); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); + // Test against itself. NaN is always not equal + uint32x2_t vTempNan = vceq_f32(VL, VL); + // If x or y are NaN, the mask is zero + return (vget_lane_u64(vreinterpret_u64_u32(vTempNan), 0) != 0xFFFFFFFFFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + // Test against itself. 
NaN is always not equal + XMVECTOR vTempNan = _mm_cmpneq_ps(V, V); + // If x or y are NaN, the mask is non-zero + return ((_mm_movemask_ps(vTempNan) & 3) != 0); +#endif +} + +#if !defined(_XM_NO_INTRINSICS_) && defined(_MSC_VER) && !defined(__clang__) && !defined(__INTEL_COMPILER) +#pragma float_control(pop) +#endif + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2IsInfinite(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + return (XMISINF(V.vector4_f32[0]) || + XMISINF(V.vector4_f32[1])); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Mask off the sign bit + uint32x2_t vTemp = vand_u32(vget_low_u32(vreinterpretq_u32_f32(V)), vget_low_u32(g_XMAbsMask)); + // Compare to infinity + vTemp = vceq_f32(vreinterpret_f32_u32(vTemp), vget_low_f32(g_XMInfinity)); + // If any are infinity, the signs are true. + return vget_lane_u64(vreinterpret_u64_u32(vTemp), 0) != 0; +#elif defined(_XM_SSE_INTRINSICS_) + // Mask off the sign bit + __m128 vTemp = _mm_and_ps(V, g_XMAbsMask); + // Compare to infinity + vTemp = _mm_cmpeq_ps(vTemp, g_XMInfinity); + // If x or z are infinity, the signs are true. + return ((_mm_movemask_ps(vTemp) & 3) != 0); +#endif +} + +//------------------------------------------------------------------------------ +// Computation operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2Dot +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORF32 Result; + Result.f[0] = + Result.f[1] = + Result.f[2] = + Result.f[3] = V1.vector4_f32[0] * V2.vector4_f32[0] + V1.vector4_f32[1] * V2.vector4_f32[1]; + return Result.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Perform the dot product on x and y + float32x2_t vTemp = vmul_f32(vget_low_f32(V1), vget_low_f32(V2)); + vTemp = vpadd_f32(vTemp, vTemp); + return vcombine_f32(vTemp, vTemp); +#elif defined(_XM_SSE4_INTRINSICS_) + return _mm_dp_ps(V1, V2, 0x3f); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vDot = _mm_mul_ps(V1, V2); + vDot = _mm_hadd_ps(vDot, vDot); + vDot = _mm_moveldup_ps(vDot); + return vDot; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x and y + XMVECTOR vLengthSq = _mm_mul_ps(V1, V2); + // vTemp has y splatted + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 1, 1, 1)); + // x+y + vLengthSq = _mm_add_ss(vLengthSq, vTemp); + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2Cross +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ + // [ V1.x*V2.y - V1.y*V2.x, V1.x*V2.y - V1.y*V2.x ] + +#if defined(_XM_NO_INTRINSICS_) + float fCross = (V1.vector4_f32[0] * V2.vector4_f32[1]) - (V1.vector4_f32[1] * V2.vector4_f32[0]); + XMVECTORF32 vResult; + vResult.f[0] = + vResult.f[1] = + vResult.f[2] = + vResult.f[3] = fCross; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 Negate = { { { 1.f, -1.f, 0, 0 } } }; + + float32x2_t vTemp = vmul_f32(vget_low_f32(V1), vrev64_f32(vget_low_f32(V2))); + vTemp = vmul_f32(vTemp, vget_low_f32(Negate)); + vTemp = vpadd_f32(vTemp, vTemp); + return vcombine_f32(vTemp, vTemp); +#elif defined(_XM_SSE_INTRINSICS_) + // Swap x and y + XMVECTOR vResult = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 1, 0, 1)); 
+ // Perform the muls + vResult = _mm_mul_ps(vResult, V1); + // Splat y + XMVECTOR vTemp = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(1, 1, 1, 1)); + // Sub the values + vResult = _mm_sub_ss(vResult, vTemp); + // Splat the cross product + vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(0, 0, 0, 0)); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2LengthSq(FXMVECTOR V) noexcept +{ + return XMVector2Dot(V, V); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2ReciprocalLengthEst(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result = XMVector2LengthSq(V); + Result = XMVectorReciprocalSqrtEst(Result); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); + // Dot2 + float32x2_t vTemp = vmul_f32(VL, VL); + vTemp = vpadd_f32(vTemp, vTemp); + // Reciprocal sqrt (estimate) + vTemp = vrsqrte_f32(vTemp); + return vcombine_f32(vTemp, vTemp); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps(V, V, 0x3f); + return _mm_rsqrt_ps(vTemp); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + XMVECTOR vTemp = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_rsqrt_ss(vTemp); + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x and y + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + // vTemp has y splatted + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 1, 1, 1)); + // x+y + vLengthSq = _mm_add_ss(vLengthSq, vTemp); + vLengthSq = _mm_rsqrt_ss(vLengthSq); + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2ReciprocalLength(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result = XMVector2LengthSq(V); + Result = XMVectorReciprocalSqrt(Result); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); + // Dot2 + float32x2_t vTemp = vmul_f32(VL, VL); + vTemp = vpadd_f32(vTemp, vTemp); + // Reciprocal sqrt + float32x2_t S0 = vrsqrte_f32(vTemp); + float32x2_t P0 = vmul_f32(vTemp, S0); + float32x2_t R0 = vrsqrts_f32(P0, S0); + float32x2_t S1 = vmul_f32(S0, R0); + float32x2_t P1 = vmul_f32(vTemp, S1); + float32x2_t R1 = vrsqrts_f32(P1, S1); + float32x2_t Result = vmul_f32(S1, R1); + return vcombine_f32(Result, Result); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps(V, V, 0x3f); + XMVECTOR vLengthSq = _mm_sqrt_ps(vTemp); + return _mm_div_ps(g_XMOne, vLengthSq); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + XMVECTOR vTemp = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_sqrt_ss(vTemp); + vLengthSq = _mm_div_ss(g_XMOne, vLengthSq); + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x and y + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + // vTemp has y splatted + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 1, 1, 1)); + // x+y + vLengthSq = _mm_add_ss(vLengthSq, vTemp); + vLengthSq = _mm_sqrt_ss(vLengthSq); + vLengthSq = _mm_div_ss(g_XMOne, vLengthSq); + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + return 
vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2LengthEst(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result = XMVector2LengthSq(V); + Result = XMVectorSqrtEst(Result); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); + // Dot2 + float32x2_t vTemp = vmul_f32(VL, VL); + vTemp = vpadd_f32(vTemp, vTemp); + const float32x2_t zero = vdup_n_f32(0); + uint32x2_t VEqualsZero = vceq_f32(vTemp, zero); + // Sqrt (estimate) + float32x2_t Result = vrsqrte_f32(vTemp); + Result = vmul_f32(vTemp, Result); + Result = vbsl_f32(VEqualsZero, zero, Result); + return vcombine_f32(Result, Result); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps(V, V, 0x3f); + return _mm_sqrt_ps(vTemp); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + XMVECTOR vTemp = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_sqrt_ss(vTemp); + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x and y + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + // vTemp has y splatted + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 1, 1, 1)); + // x+y + vLengthSq = _mm_add_ss(vLengthSq, vTemp); + vLengthSq = _mm_sqrt_ss(vLengthSq); + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2Length(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result = XMVector2LengthSq(V); + Result = XMVectorSqrt(Result); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); + // Dot2 + float32x2_t vTemp = vmul_f32(VL, VL); + vTemp = vpadd_f32(vTemp, vTemp); + const float32x2_t zero = vdup_n_f32(0); + uint32x2_t VEqualsZero = vceq_f32(vTemp, zero); + // Sqrt + float32x2_t S0 = vrsqrte_f32(vTemp); + float32x2_t P0 = vmul_f32(vTemp, S0); + float32x2_t R0 = vrsqrts_f32(P0, S0); + float32x2_t S1 = vmul_f32(S0, R0); + float32x2_t P1 = vmul_f32(vTemp, S1); + float32x2_t R1 = vrsqrts_f32(P1, S1); + float32x2_t Result = vmul_f32(S1, R1); + Result = vmul_f32(vTemp, Result); + Result = vbsl_f32(VEqualsZero, zero, Result); + return vcombine_f32(Result, Result); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps(V, V, 0x3f); + return _mm_sqrt_ps(vTemp); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + XMVECTOR vTemp = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_sqrt_ss(vTemp); + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x and y + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + // vTemp has y splatted + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 1, 1, 1)); + // x+y + vLengthSq = _mm_add_ss(vLengthSq, vTemp); + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + vLengthSq = _mm_sqrt_ps(vLengthSq); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ +// XMVector2NormalizeEst uses a reciprocal estimate and +// returns QNaN on zero and infinite vectors. 
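+// For full precision, or when the input may be zero-length, prefer
+// XMVector2Normalize below, which handles the zero and infinite cases explicitly.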
+ +inline XMVECTOR XM_CALLCONV XMVector2NormalizeEst(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result = XMVector2ReciprocalLength(V); + Result = XMVectorMultiply(V, Result); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); + // Dot2 + float32x2_t vTemp = vmul_f32(VL, VL); + vTemp = vpadd_f32(vTemp, vTemp); + // Reciprocal sqrt (estimate) + vTemp = vrsqrte_f32(vTemp); + // Normalize + float32x2_t Result = vmul_f32(VL, vTemp); + return vcombine_f32(Result, Result); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps(V, V, 0x3f); + XMVECTOR vResult = _mm_rsqrt_ps(vTemp); + return _mm_mul_ps(vResult, V); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_rsqrt_ss(vLengthSq); + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + vLengthSq = _mm_mul_ps(vLengthSq, V); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x and y + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + // vTemp has y splatted + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 1, 1, 1)); + // x+y + vLengthSq = _mm_add_ss(vLengthSq, vTemp); + vLengthSq = _mm_rsqrt_ss(vLengthSq); + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + vLengthSq = _mm_mul_ps(vLengthSq, V); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2Normalize(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR vResult = XMVector2Length(V); + float fLength = vResult.vector4_f32[0]; + + // Prevent divide by zero + if (fLength > 0) + { + fLength = 1.0f / fLength; + } + + vResult.vector4_f32[0] = V.vector4_f32[0] * fLength; + vResult.vector4_f32[1] = V.vector4_f32[1] * fLength; + vResult.vector4_f32[2] = V.vector4_f32[2] * fLength; + vResult.vector4_f32[3] = V.vector4_f32[3] * fLength; + return vResult; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); + // Dot2 + float32x2_t vTemp = vmul_f32(VL, VL); + vTemp = vpadd_f32(vTemp, vTemp); + uint32x2_t VEqualsZero = vceq_f32(vTemp, vdup_n_f32(0)); + uint32x2_t VEqualsInf = vceq_f32(vTemp, vget_low_f32(g_XMInfinity)); + // Reciprocal sqrt (2 iterations of Newton-Raphson) + float32x2_t S0 = vrsqrte_f32(vTemp); + float32x2_t P0 = vmul_f32(vTemp, S0); + float32x2_t R0 = vrsqrts_f32(P0, S0); + float32x2_t S1 = vmul_f32(S0, R0); + float32x2_t P1 = vmul_f32(vTemp, S1); + float32x2_t R1 = vrsqrts_f32(P1, S1); + vTemp = vmul_f32(S1, R1); + // Normalize + float32x2_t Result = vmul_f32(VL, vTemp); + Result = vbsl_f32(VEqualsZero, vdup_n_f32(0), Result); + Result = vbsl_f32(VEqualsInf, vget_low_f32(g_XMQNaN), Result); + return vcombine_f32(Result, Result); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vLengthSq = _mm_dp_ps(V, V, 0x3f); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Create zero with a single instruction + XMVECTOR vZeroMask = _mm_setzero_ps(); + // Test for a divide by zero (Must be FP to detect -0.0) + vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity); + // Reciprocal mul to perform the normalization + vResult = _mm_div_ps(V, vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult, vZeroMask); + 
// Select qnan or result based on infinite length + XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq, g_XMQNaN); + XMVECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq); + vResult = _mm_or_ps(vTemp1, vTemp2); + return vResult; +#elif defined(_XM_SSE3_INTRINSICS_) + // Perform the dot product on x and y only + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_moveldup_ps(vLengthSq); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Create zero with a single instruction + XMVECTOR vZeroMask = _mm_setzero_ps(); + // Test for a divide by zero (Must be FP to detect -0.0) + vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity); + // Reciprocal mul to perform the normalization + vResult = _mm_div_ps(V, vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult, vZeroMask); + // Select qnan or result based on infinite length + XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq, g_XMQNaN); + XMVECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq); + vResult = _mm_or_ps(vTemp1, vTemp2); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x and y only + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 1, 1, 1)); + vLengthSq = _mm_add_ss(vLengthSq, vTemp); + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Create zero with a single instruction + XMVECTOR vZeroMask = _mm_setzero_ps(); + // Test for a divide by zero (Must be FP to detect -0.0) + vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity); + // Reciprocal mul to perform the normalization + vResult = _mm_div_ps(V, vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult, vZeroMask); + // Select qnan or result based on infinite length + XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq, g_XMQNaN); + XMVECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq); + vResult = _mm_or_ps(vTemp1, vTemp2); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2ClampLength +( + FXMVECTOR V, + float LengthMin, + float LengthMax +) noexcept +{ + XMVECTOR ClampMax = XMVectorReplicate(LengthMax); + XMVECTOR ClampMin = XMVectorReplicate(LengthMin); + return XMVector2ClampLengthV(V, ClampMin, ClampMax); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2ClampLengthV +( + FXMVECTOR V, + FXMVECTOR LengthMin, + FXMVECTOR LengthMax +) noexcept +{ + assert((XMVectorGetY(LengthMin) == XMVectorGetX(LengthMin))); + assert((XMVectorGetY(LengthMax) == XMVectorGetX(LengthMax))); + assert(XMVector2GreaterOrEqual(LengthMin, g_XMZero)); + assert(XMVector2GreaterOrEqual(LengthMax, g_XMZero)); + assert(XMVector2GreaterOrEqual(LengthMax, LengthMin)); + + XMVECTOR LengthSq = XMVector2LengthSq(V); + + const XMVECTOR Zero = XMVectorZero(); + + XMVECTOR RcpLength = XMVectorReciprocalSqrt(LengthSq); + + XMVECTOR InfiniteLength = XMVectorEqualInt(LengthSq, g_XMInfinity.v); + XMVECTOR ZeroLength = XMVectorEqual(LengthSq, Zero); + + XMVECTOR Length = 
XMVectorMultiply(LengthSq, RcpLength); + + XMVECTOR Normal = XMVectorMultiply(V, RcpLength); + + XMVECTOR Select = XMVectorEqualInt(InfiniteLength, ZeroLength); + Length = XMVectorSelect(LengthSq, Length, Select); + Normal = XMVectorSelect(LengthSq, Normal, Select); + + XMVECTOR ControlMax = XMVectorGreater(Length, LengthMax); + XMVECTOR ControlMin = XMVectorLess(Length, LengthMin); + + XMVECTOR ClampLength = XMVectorSelect(Length, LengthMax, ControlMax); + ClampLength = XMVectorSelect(ClampLength, LengthMin, ControlMin); + + XMVECTOR Result = XMVectorMultiply(Normal, ClampLength); + + // Preserve the original vector (with no precision loss) if the length falls within the given range + XMVECTOR Control = XMVectorEqualInt(ControlMax, ControlMin); + Result = XMVectorSelect(Result, V, Control); + + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2Reflect +( + FXMVECTOR Incident, + FXMVECTOR Normal +) noexcept +{ + // Result = Incident - (2 * dot(Incident, Normal)) * Normal + + XMVECTOR Result; + Result = XMVector2Dot(Incident, Normal); + Result = XMVectorAdd(Result, Result); + Result = XMVectorNegativeMultiplySubtract(Result, Normal, Incident); + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2Refract +( + FXMVECTOR Incident, + FXMVECTOR Normal, + float RefractionIndex +) noexcept +{ + XMVECTOR Index = XMVectorReplicate(RefractionIndex); + return XMVector2RefractV(Incident, Normal, Index); +} + +//------------------------------------------------------------------------------ + +// Return the refraction of a 2D vector +inline XMVECTOR XM_CALLCONV XMVector2RefractV +( + FXMVECTOR Incident, + FXMVECTOR Normal, + FXMVECTOR RefractionIndex +) noexcept +{ + // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) + + // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal)))) + +#if defined(_XM_NO_INTRINSICS_) + + float IDotN = (Incident.vector4_f32[0] * Normal.vector4_f32[0]) + (Incident.vector4_f32[1] * Normal.vector4_f32[1]); + // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) + float RY = 1.0f - (IDotN * IDotN); + float RX = 1.0f - (RY * RefractionIndex.vector4_f32[0] * RefractionIndex.vector4_f32[0]); + RY = 1.0f - (RY * RefractionIndex.vector4_f32[1] * RefractionIndex.vector4_f32[1]); + if (RX >= 0.0f) + { + RX = (RefractionIndex.vector4_f32[0] * Incident.vector4_f32[0]) - (Normal.vector4_f32[0] * ((RefractionIndex.vector4_f32[0] * IDotN) + sqrtf(RX))); + } + else + { + RX = 0.0f; + } + if (RY >= 0.0f) + { + RY = (RefractionIndex.vector4_f32[1] * Incident.vector4_f32[1]) - (Normal.vector4_f32[1] * ((RefractionIndex.vector4_f32[1] * IDotN) + sqrtf(RY))); + } + else + { + RY = 0.0f; + } + + XMVECTOR vResult; + vResult.vector4_f32[0] = RX; + vResult.vector4_f32[1] = RY; + vResult.vector4_f32[2] = 0.0f; + vResult.vector4_f32[3] = 0.0f; + return vResult; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t IL = vget_low_f32(Incident); + float32x2_t NL = vget_low_f32(Normal); + float32x2_t RIL = vget_low_f32(RefractionIndex); + // Get the 2D Dot product of Incident-Normal + float32x2_t vTemp = vmul_f32(IL, NL); + float32x2_t IDotN = vpadd_f32(vTemp, vTemp); + // vTemp = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) + vTemp = vmls_f32(vget_low_f32(g_XMOne), IDotN, IDotN); + vTemp = 
vmul_f32(vTemp, RIL); + vTemp = vmls_f32(vget_low_f32(g_XMOne), vTemp, RIL); + // If any terms are <=0, sqrt() will fail, punt to zero + uint32x2_t vMask = vcgt_f32(vTemp, vget_low_f32(g_XMZero)); + // Sqrt(vTemp) + float32x2_t S0 = vrsqrte_f32(vTemp); + float32x2_t P0 = vmul_f32(vTemp, S0); + float32x2_t R0 = vrsqrts_f32(P0, S0); + float32x2_t S1 = vmul_f32(S0, R0); + float32x2_t P1 = vmul_f32(vTemp, S1); + float32x2_t R1 = vrsqrts_f32(P1, S1); + float32x2_t S2 = vmul_f32(S1, R1); + vTemp = vmul_f32(vTemp, S2); + // R = RefractionIndex * IDotN + sqrt(R) + vTemp = vmla_f32(vTemp, RIL, IDotN); + // Result = RefractionIndex * Incident - Normal * R + float32x2_t vResult = vmul_f32(RIL, IL); + vResult = vmls_f32(vResult, vTemp, NL); + vResult = vreinterpret_f32_u32(vand_u32(vreinterpret_u32_f32(vResult), vMask)); + return vcombine_f32(vResult, vResult); +#elif defined(_XM_SSE_INTRINSICS_) + // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) + + // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal)))) + // Get the 2D Dot product of Incident-Normal + XMVECTOR IDotN = XMVector2Dot(Incident, Normal); + // vTemp = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) + XMVECTOR vTemp = XM_FNMADD_PS(IDotN, IDotN, g_XMOne); + vTemp = _mm_mul_ps(vTemp, RefractionIndex); + vTemp = XM_FNMADD_PS(vTemp, RefractionIndex, g_XMOne); + // If any terms are <=0, sqrt() will fail, punt to zero + XMVECTOR vMask = _mm_cmpgt_ps(vTemp, g_XMZero); + // R = RefractionIndex * IDotN + sqrt(R) + vTemp = _mm_sqrt_ps(vTemp); + vTemp = XM_FMADD_PS(RefractionIndex, IDotN, vTemp); + // Result = RefractionIndex * Incident - Normal * R + XMVECTOR vResult = _mm_mul_ps(RefractionIndex, Incident); + vResult = XM_FNMADD_PS(vTemp, Normal, vResult); + vResult = _mm_and_ps(vResult, vMask); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2Orthogonal(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORF32 Result = { { { + -V.vector4_f32[1], + V.vector4_f32[0], + 0.f, + 0.f + } } }; + return Result.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 Negate = { { { -1.f, 1.f, 0, 0 } } }; + const float32x2_t zero = vdup_n_f32(0); + + float32x2_t VL = vget_low_f32(V); + float32x2_t Result = vmul_f32(vrev64_f32(VL), vget_low_f32(Negate)); + return vcombine_f32(Result, zero); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 2, 0, 1)); + vResult = _mm_mul_ps(vResult, g_XMNegateX); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2AngleBetweenNormalsEst +( + FXMVECTOR N1, + FXMVECTOR N2 +) noexcept +{ + XMVECTOR Result = XMVector2Dot(N1, N2); + Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v); + Result = XMVectorACosEst(Result); + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2AngleBetweenNormals +( + FXMVECTOR N1, + FXMVECTOR N2 +) noexcept +{ + XMVECTOR Result = XMVector2Dot(N1, N2); + Result = XMVectorClamp(Result, g_XMNegativeOne, g_XMOne); + Result = XMVectorACos(Result); + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2AngleBetweenVectors +( + FXMVECTOR V1, + 
FXMVECTOR V2 +) noexcept +{ + XMVECTOR L1 = XMVector2ReciprocalLength(V1); + XMVECTOR L2 = XMVector2ReciprocalLength(V2); + + XMVECTOR Dot = XMVector2Dot(V1, V2); + + L1 = XMVectorMultiply(L1, L2); + + XMVECTOR CosAngle = XMVectorMultiply(Dot, L1); + CosAngle = XMVectorClamp(CosAngle, g_XMNegativeOne.v, g_XMOne.v); + + return XMVectorACos(CosAngle); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2LinePointDistance +( + FXMVECTOR LinePoint1, + FXMVECTOR LinePoint2, + FXMVECTOR Point +) noexcept +{ + // Given a vector PointVector from LinePoint1 to Point and a vector + // LineVector from LinePoint1 to LinePoint2, the scaled distance + // PointProjectionScale from LinePoint1 to the perpendicular projection + // of PointVector onto the line is defined as: + // + // PointProjectionScale = dot(PointVector, LineVector) / LengthSq(LineVector) + + XMVECTOR PointVector = XMVectorSubtract(Point, LinePoint1); + XMVECTOR LineVector = XMVectorSubtract(LinePoint2, LinePoint1); + + XMVECTOR LengthSq = XMVector2LengthSq(LineVector); + + XMVECTOR PointProjectionScale = XMVector2Dot(PointVector, LineVector); + PointProjectionScale = XMVectorDivide(PointProjectionScale, LengthSq); + + XMVECTOR DistanceVector = XMVectorMultiply(LineVector, PointProjectionScale); + DistanceVector = XMVectorSubtract(PointVector, DistanceVector); + + return XMVector2Length(DistanceVector); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2IntersectLine +( + FXMVECTOR Line1Point1, + FXMVECTOR Line1Point2, + FXMVECTOR Line2Point1, + GXMVECTOR Line2Point2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + + XMVECTOR V1 = XMVectorSubtract(Line1Point2, Line1Point1); + XMVECTOR V2 = XMVectorSubtract(Line2Point2, Line2Point1); + XMVECTOR V3 = XMVectorSubtract(Line1Point1, Line2Point1); + + XMVECTOR C1 = XMVector2Cross(V1, V2); + XMVECTOR C2 = XMVector2Cross(V2, V3); + + XMVECTOR Result; + const XMVECTOR Zero = XMVectorZero(); + if (XMVector2NearEqual(C1, Zero, g_XMEpsilon.v)) + { + if (XMVector2NearEqual(C2, Zero, g_XMEpsilon.v)) + { + // Coincident + Result = g_XMInfinity.v; + } + else + { + // Parallel + Result = g_XMQNaN.v; + } + } + else + { + // Intersection point = Line1Point1 + V1 * (C2 / C1) + XMVECTOR Scale = XMVectorReciprocal(C1); + Scale = XMVectorMultiply(C2, Scale); + Result = XMVectorMultiplyAdd(V1, Scale, Line1Point1); + } + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR V1 = _mm_sub_ps(Line1Point2, Line1Point1); + XMVECTOR V2 = _mm_sub_ps(Line2Point2, Line2Point1); + XMVECTOR V3 = _mm_sub_ps(Line1Point1, Line2Point1); + // Generate the cross products + XMVECTOR C1 = XMVector2Cross(V1, V2); + XMVECTOR C2 = XMVector2Cross(V2, V3); + // If C1 is not close to epsilon, use the calculated value + XMVECTOR vResultMask = _mm_setzero_ps(); + vResultMask = _mm_sub_ps(vResultMask, C1); + vResultMask = _mm_max_ps(vResultMask, C1); + // 0xFFFFFFFF if the calculated value is to be used + vResultMask = _mm_cmpgt_ps(vResultMask, g_XMEpsilon); + // If C1 is close to epsilon, which fail type is it? INFINITY or NAN? 
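+ // (|C1| <= epsilon means the two direction vectors are parallel; C2 then
+ // distinguishes the cases: |C2| <= epsilon means the lines are coincident,
+ // so INFINITY is selected, otherwise the lines never meet and QNaN is selected.)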
+ XMVECTOR vFailMask = _mm_setzero_ps(); + vFailMask = _mm_sub_ps(vFailMask, C2); + vFailMask = _mm_max_ps(vFailMask, C2); + vFailMask = _mm_cmple_ps(vFailMask, g_XMEpsilon); + XMVECTOR vFail = _mm_and_ps(vFailMask, g_XMInfinity); + vFailMask = _mm_andnot_ps(vFailMask, g_XMQNaN); + // vFail is NAN or INF + vFail = _mm_or_ps(vFail, vFailMask); + // Intersection point = Line1Point1 + V1 * (C2 / C1) + XMVECTOR vResult = _mm_div_ps(C2, C1); + vResult = XM_FMADD_PS(vResult, V1, Line1Point1); + // Use result, or failure value + vResult = _mm_and_ps(vResult, vResultMask); + vResultMask = _mm_andnot_ps(vResultMask, vFail); + vResult = _mm_or_ps(vResult, vResultMask); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2Transform +( + FXMVECTOR V, + FXMMATRIX M +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Y = XMVectorSplatY(V); + XMVECTOR X = XMVectorSplatX(V); + + XMVECTOR Result = XMVectorMultiplyAdd(Y, M.r[1], M.r[3]); + Result = XMVectorMultiplyAdd(X, M.r[0], Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); + float32x4_t Result = vmlaq_lane_f32(M.r[3], M.r[1], VL, 1); // Y + return vmlaq_lane_f32(Result, M.r[0], VL, 0); // X +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); // Y + vResult = XM_FMADD_PS(vResult, M.r[1], M.r[3]); + XMVECTOR vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); // X + vResult = XM_FMADD_PS(vTemp, M.r[0], vResult); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline XMFLOAT4* XM_CALLCONV XMVector2TransformStream +( + XMFLOAT4* pOutputStream, + size_t OutputStride, + const XMFLOAT2* pInputStream, + size_t InputStride, + size_t VectorCount, + FXMMATRIX M +) noexcept +{ + assert(pOutputStream != nullptr); + assert(pInputStream != nullptr); + + assert(InputStride >= sizeof(XMFLOAT2)); + _Analysis_assume_(InputStride >= sizeof(XMFLOAT2)); + + assert(OutputStride >= sizeof(XMFLOAT4)); + _Analysis_assume_(OutputStride >= sizeof(XMFLOAT4)); + +#if defined(_XM_NO_INTRINSICS_) + + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row3 = M.r[3]; + + for (size_t i = 0; i < VectorCount; i++) + { + XMVECTOR V = XMLoadFloat2(reinterpret_cast(pInputVector)); + XMVECTOR Y = XMVectorSplatY(V); + XMVECTOR X = XMVectorSplatX(V); + + XMVECTOR Result = XMVectorMultiplyAdd(Y, row1, row3); + Result = XMVectorMultiplyAdd(X, row0, Result); + +#ifdef _PREFAST_ +#pragma prefast(push) +#pragma prefast(disable : 26015, "PREfast noise: Esp:1307" ) +#endif + + XMStoreFloat4(reinterpret_cast(pOutputVector), Result); + +#ifdef _PREFAST_ +#pragma prefast(pop) +#endif + + pInputVector += InputStride; + pOutputVector += OutputStride; + } + + return pOutputStream; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row3 = M.r[3]; + + size_t i = 0; + size_t four = VectorCount >> 2; + if (four > 0) + { + if ((InputStride == sizeof(XMFLOAT2)) && (OutputStride == sizeof(XMFLOAT4))) + { + for (size_t j = 0; j < four; ++j) + { + float32x4x2_t V = 
vld2q_f32(reinterpret_cast(pInputVector)); + pInputVector += sizeof(XMFLOAT2) * 4; + + float32x2_t r3 = vget_low_f32(row3); + float32x2_t r = vget_low_f32(row0); + XMVECTOR vResult0 = vmlaq_lane_f32(vdupq_lane_f32(r3, 0), V.val[0], r, 0); // Ax+M + XMVECTOR vResult1 = vmlaq_lane_f32(vdupq_lane_f32(r3, 1), V.val[0], r, 1); // Bx+N + + XM_PREFETCH(pInputVector); + + r3 = vget_high_f32(row3); + r = vget_high_f32(row0); + XMVECTOR vResult2 = vmlaq_lane_f32(vdupq_lane_f32(r3, 0), V.val[0], r, 0); // Cx+O + XMVECTOR vResult3 = vmlaq_lane_f32(vdupq_lane_f32(r3, 1), V.val[0], r, 1); // Dx+P + + XM_PREFETCH(pInputVector + XM_CACHE_LINE_SIZE); + + r = vget_low_f32(row1); + vResult0 = vmlaq_lane_f32(vResult0, V.val[1], r, 0); // Ax+Ey+M + vResult1 = vmlaq_lane_f32(vResult1, V.val[1], r, 1); // Bx+Fy+N + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 2)); + + r = vget_high_f32(row1); + vResult2 = vmlaq_lane_f32(vResult2, V.val[1], r, 0); // Cx+Gy+O + vResult3 = vmlaq_lane_f32(vResult3, V.val[1], r, 1); // Dx+Hy+P + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 3)); + + float32x4x4_t R; + R.val[0] = vResult0; + R.val[1] = vResult1; + R.val[2] = vResult2; + R.val[3] = vResult3; + + vst4q_f32(reinterpret_cast(pOutputVector), R); + pOutputVector += sizeof(XMFLOAT4) * 4; + + i += 4; + } + } + } + + for (; i < VectorCount; i++) + { + float32x2_t V = vld1_f32(reinterpret_cast(pInputVector)); + pInputVector += InputStride; + + XMVECTOR vResult = vmlaq_lane_f32(row3, row0, V, 0); // X + vResult = vmlaq_lane_f32(vResult, row1, V, 1); // Y + + vst1q_f32(reinterpret_cast(pOutputVector), vResult); + pOutputVector += OutputStride; + } + + return pOutputStream; +#elif defined(_XM_AVX2_INTRINSICS_) + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + size_t i = 0; + size_t four = VectorCount >> 2; + if (four > 0) + { + __m256 row0 = _mm256_broadcast_ps(&M.r[0]); + __m256 row1 = _mm256_broadcast_ps(&M.r[1]); + __m256 row3 = _mm256_broadcast_ps(&M.r[3]); + + if (InputStride == sizeof(XMFLOAT2)) + { + if (OutputStride == sizeof(XMFLOAT4)) + { + if (!(reinterpret_cast(pOutputStream) & 0x1F)) + { + // Packed input, aligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m256 VV = _mm256_loadu_ps(reinterpret_cast(pInputVector)); + pInputVector += sizeof(XMFLOAT2) * 4; + + __m256 Y2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3)); + __m256 X2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2)); + __m256 Y1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1)); + __m256 X1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0)); + + __m256 vTempB = _mm256_fmadd_ps(Y1, row1, row3); + __m256 vTempB2 = _mm256_fmadd_ps(Y2, row1, row3); + __m256 vTempA = _mm256_mul_ps(X1, row0); + __m256 vTempA2 = _mm256_mul_ps(X2, row0); + vTempA = _mm256_add_ps(vTempA, vTempB); + vTempA2 = _mm256_add_ps(vTempA2, vTempB2); + + X1 = _mm256_insertf128_ps(vTempA, _mm256_castps256_ps128(vTempA2), 1); + XM256_STREAM_PS(reinterpret_cast(pOutputVector), X1); + pOutputVector += sizeof(XMFLOAT4) * 2; + + X2 = _mm256_insertf128_ps(vTempA2, _mm256_extractf128_ps(vTempA, 1), 0); + XM256_STREAM_PS(reinterpret_cast(pOutputVector), X2); + pOutputVector += sizeof(XMFLOAT4) * 2; + + i += 4; + } + } + else + { + // Packed input, packed output + for (size_t j = 0; j < four; ++j) + { + __m256 VV = _mm256_loadu_ps(reinterpret_cast(pInputVector)); + pInputVector += sizeof(XMFLOAT2) * 4; + + __m256 Y2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3)); + __m256 X2 = 
_mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2)); + __m256 Y1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1)); + __m256 X1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0)); + + __m256 vTempB = _mm256_fmadd_ps(Y1, row1, row3); + __m256 vTempB2 = _mm256_fmadd_ps(Y2, row1, row3); + __m256 vTempA = _mm256_mul_ps(X1, row0); + __m256 vTempA2 = _mm256_mul_ps(X2, row0); + vTempA = _mm256_add_ps(vTempA, vTempB); + vTempA2 = _mm256_add_ps(vTempA2, vTempB2); + + X1 = _mm256_insertf128_ps(vTempA, _mm256_castps256_ps128(vTempA2), 1); + _mm256_storeu_ps(reinterpret_cast(pOutputVector), X1); + pOutputVector += sizeof(XMFLOAT4) * 2; + + X2 = _mm256_insertf128_ps(vTempA2, _mm256_extractf128_ps(vTempA, 1), 0); + _mm256_storeu_ps(reinterpret_cast(pOutputVector), X2); + pOutputVector += sizeof(XMFLOAT4) * 2; + + i += 4; + } + } + } + else + { + // Packed input, unpacked output + for (size_t j = 0; j < four; ++j) + { + __m256 VV = _mm256_loadu_ps(reinterpret_cast(pInputVector)); + pInputVector += sizeof(XMFLOAT2) * 4; + + __m256 Y2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3)); + __m256 X2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2)); + __m256 Y1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1)); + __m256 X1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0)); + + __m256 vTempB = _mm256_fmadd_ps(Y1, row1, row3); + __m256 vTempB2 = _mm256_fmadd_ps(Y2, row1, row3); + __m256 vTempA = _mm256_mul_ps(X1, row0); + __m256 vTempA2 = _mm256_mul_ps(X2, row0); + vTempA = _mm256_add_ps(vTempA, vTempB); + vTempA2 = _mm256_add_ps(vTempA2, vTempB2); + + _mm_storeu_ps(reinterpret_cast(pOutputVector), _mm256_castps256_ps128(vTempA)); + pOutputVector += OutputStride; + + _mm_storeu_ps(reinterpret_cast(pOutputVector), _mm256_castps256_ps128(vTempA2)); + pOutputVector += OutputStride; + + _mm_storeu_ps(reinterpret_cast(pOutputVector), _mm256_extractf128_ps(vTempA, 1)); + pOutputVector += OutputStride; + + _mm_storeu_ps(reinterpret_cast(pOutputVector), _mm256_extractf128_ps(vTempA2, 1)); + pOutputVector += OutputStride; + + i += 4; + } + } + } + } + + if (i < VectorCount) + { + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row3 = M.r[3]; + + for (; i < VectorCount; i++) + { + __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(pInputVector))); + pInputVector += InputStride; + + XMVECTOR Y = XM_PERMUTE_PS(xy, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(xy, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Y, row1, row3); + XMVECTOR vTemp2 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + + _mm_storeu_ps(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + } + } + + XM_SFENCE(); + + return pOutputStream; +#elif defined(_XM_SSE_INTRINSICS_) + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row3 = M.r[3]; + + size_t i = 0; + size_t two = VectorCount >> 1; + if (two > 0) + { + if (InputStride == sizeof(XMFLOAT2)) + { + if (!(reinterpret_cast(pOutputStream) & 0xF) && !(OutputStride & 0xF)) + { + // Packed input, aligned output + for (size_t j = 0; j < two; ++j) + { + XMVECTOR V = _mm_loadu_ps(reinterpret_cast(pInputVector)); + pInputVector += sizeof(XMFLOAT2) * 2; + + XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Y, row1, row3); + XMVECTOR vTemp2 = _mm_mul_ps(X, row0); + vTemp = 
_mm_add_ps(vTemp, vTemp2); + + XM_STREAM_PS(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3)); + X = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); + + vTemp = XM_FMADD_PS(Y, row1, row3); + vTemp2 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + + XM_STREAM_PS(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + i += 2; + } + } + else + { + // Packed input, unaligned output + for (size_t j = 0; j < two; ++j) + { + XMVECTOR V = _mm_loadu_ps(reinterpret_cast(pInputVector)); + pInputVector += sizeof(XMFLOAT2) * 2; + + XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Y, row1, row3); + XMVECTOR vTemp2 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + + _mm_storeu_ps(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3)); + X = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); + + vTemp = XM_FMADD_PS(Y, row1, row3); + vTemp2 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + + _mm_storeu_ps(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + i += 2; + } + } + } + } + + if (!(reinterpret_cast(pInputVector) & 0xF) && !(InputStride & 0xF)) + { + if (!(reinterpret_cast(pOutputStream) & 0xF) && !(OutputStride & 0xF)) + { + // Aligned input, aligned output + for (; i < VectorCount; i++) + { + XMVECTOR V = _mm_castsi128_ps(_mm_loadl_epi64(reinterpret_cast(pInputVector))); + pInputVector += InputStride; + + XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Y, row1, row3); + XMVECTOR vTemp2 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + + XM_STREAM_PS(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + } + } + else + { + // Aligned input, unaligned output + for (; i < VectorCount; i++) + { + XMVECTOR V = _mm_castsi128_ps(_mm_loadl_epi64(reinterpret_cast(pInputVector))); + pInputVector += InputStride; + + XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Y, row1, row3); + XMVECTOR vTemp2 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + + _mm_storeu_ps(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + } + } + } + else + { + // Unaligned input + for (; i < VectorCount; i++) + { + __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(pInputVector))); + pInputVector += InputStride; + + XMVECTOR Y = XM_PERMUTE_PS(xy, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(xy, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Y, row1, row3); + XMVECTOR vTemp2 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + + _mm_storeu_ps(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + } + } + + XM_SFENCE(); + + return pOutputStream; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2TransformCoord +( + FXMVECTOR V, + FXMMATRIX M +) noexcept +{ + XMVECTOR Y = XMVectorSplatY(V); + XMVECTOR X = XMVectorSplatX(V); + + XMVECTOR Result = XMVectorMultiplyAdd(Y, M.r[1], M.r[3]); + Result = XMVectorMultiplyAdd(X, M.r[0], Result); + + XMVECTOR W = XMVectorSplatW(Result); + return XMVectorDivide(Result, W); +} + 
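+// Illustrative usage sketch (assumes a caller-provided XMMATRIX M, e.g. a
+// world-view-projection matrix); it is not part of the function above, just an
+// example of transforming a single 2D point and reading back the x/w, y/w result:
+//
+//     XMFLOAT2 pt = { 3.0f, 4.0f };
+//     XMVECTOR v  = XMVector2TransformCoord(XMLoadFloat2(&pt), M);
+//     XMFLOAT2 out;
+//     XMStoreFloat2(&out, v);
+//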
+//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline XMFLOAT2* XM_CALLCONV XMVector2TransformCoordStream +( + XMFLOAT2* pOutputStream, + size_t OutputStride, + const XMFLOAT2* pInputStream, + size_t InputStride, + size_t VectorCount, + FXMMATRIX M +) noexcept +{ + assert(pOutputStream != nullptr); + assert(pInputStream != nullptr); + + assert(InputStride >= sizeof(XMFLOAT2)); + _Analysis_assume_(InputStride >= sizeof(XMFLOAT2)); + + assert(OutputStride >= sizeof(XMFLOAT2)); + _Analysis_assume_(OutputStride >= sizeof(XMFLOAT2)); + +#if defined(_XM_NO_INTRINSICS_) + + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row3 = M.r[3]; + + for (size_t i = 0; i < VectorCount; i++) + { + XMVECTOR V = XMLoadFloat2(reinterpret_cast(pInputVector)); + XMVECTOR Y = XMVectorSplatY(V); + XMVECTOR X = XMVectorSplatX(V); + + XMVECTOR Result = XMVectorMultiplyAdd(Y, row1, row3); + Result = XMVectorMultiplyAdd(X, row0, Result); + + XMVECTOR W = XMVectorSplatW(Result); + + Result = XMVectorDivide(Result, W); + +#ifdef _PREFAST_ +#pragma prefast(push) +#pragma prefast(disable : 26015, "PREfast noise: Esp:1307" ) +#endif + + XMStoreFloat2(reinterpret_cast(pOutputVector), Result); + +#ifdef _PREFAST_ +#pragma prefast(pop) +#endif + + pInputVector += InputStride; + pOutputVector += OutputStride; + } + + return pOutputStream; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row3 = M.r[3]; + + size_t i = 0; + size_t four = VectorCount >> 2; + if (four > 0) + { + if ((InputStride == sizeof(XMFLOAT2)) && (OutputStride == sizeof(XMFLOAT2))) + { + for (size_t j = 0; j < four; ++j) + { + float32x4x2_t V = vld2q_f32(reinterpret_cast(pInputVector)); + pInputVector += sizeof(XMFLOAT2) * 4; + + float32x2_t r3 = vget_low_f32(row3); + float32x2_t r = vget_low_f32(row0); + XMVECTOR vResult0 = vmlaq_lane_f32(vdupq_lane_f32(r3, 0), V.val[0], r, 0); // Ax+M + XMVECTOR vResult1 = vmlaq_lane_f32(vdupq_lane_f32(r3, 1), V.val[0], r, 1); // Bx+N + + XM_PREFETCH(pInputVector); + + r3 = vget_high_f32(row3); + r = vget_high_f32(row0); + XMVECTOR W = vmlaq_lane_f32(vdupq_lane_f32(r3, 1), V.val[0], r, 1); // Dx+P + + XM_PREFETCH(pInputVector + XM_CACHE_LINE_SIZE); + + r = vget_low_f32(row1); + vResult0 = vmlaq_lane_f32(vResult0, V.val[1], r, 0); // Ax+Ey+M + vResult1 = vmlaq_lane_f32(vResult1, V.val[1], r, 1); // Bx+Fy+N + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 2)); + + r = vget_high_f32(row1); + W = vmlaq_lane_f32(W, V.val[1], r, 1); // Dx+Hy+P + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 3)); + +#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__ + V.val[0] = vdivq_f32(vResult0, W); + V.val[1] = vdivq_f32(vResult1, W); +#else + // 2 iterations of Newton-Raphson refinement of reciprocal + float32x4_t Reciprocal = vrecpeq_f32(W); + float32x4_t S = vrecpsq_f32(Reciprocal, W); + Reciprocal = vmulq_f32(S, Reciprocal); + S = vrecpsq_f32(Reciprocal, W); + Reciprocal = vmulq_f32(S, Reciprocal); + + V.val[0] = vmulq_f32(vResult0, Reciprocal); + V.val[1] = vmulq_f32(vResult1, Reciprocal); +#endif + + vst2q_f32(reinterpret_cast(pOutputVector), V); + pOutputVector += sizeof(XMFLOAT2) * 4; + + i += 4; + } + 
} + } + + for (; i < VectorCount; i++) + { + float32x2_t V = vld1_f32(reinterpret_cast(pInputVector)); + pInputVector += InputStride; + + XMVECTOR vResult = vmlaq_lane_f32(row3, row0, V, 0); // X + vResult = vmlaq_lane_f32(vResult, row1, V, 1); // Y + + V = vget_high_f32(vResult); + float32x2_t W = vdup_lane_f32(V, 1); + +#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__ + V = vget_low_f32(vResult); + V = vdiv_f32(V, W); +#else + // 2 iterations of Newton-Raphson refinement of reciprocal for W + float32x2_t Reciprocal = vrecpe_f32(W); + float32x2_t S = vrecps_f32(Reciprocal, W); + Reciprocal = vmul_f32(S, Reciprocal); + S = vrecps_f32(Reciprocal, W); + Reciprocal = vmul_f32(S, Reciprocal); + + V = vget_low_f32(vResult); + V = vmul_f32(V, Reciprocal); +#endif + + vst1_f32(reinterpret_cast(pOutputVector), V); + pOutputVector += OutputStride; + } + + return pOutputStream; +#elif defined(_XM_AVX2_INTRINSICS_) + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + size_t i = 0; + size_t four = VectorCount >> 2; + if (four > 0) + { + __m256 row0 = _mm256_broadcast_ps(&M.r[0]); + __m256 row1 = _mm256_broadcast_ps(&M.r[1]); + __m256 row3 = _mm256_broadcast_ps(&M.r[3]); + + if (InputStride == sizeof(XMFLOAT2)) + { + if (OutputStride == sizeof(XMFLOAT2)) + { + if (!(reinterpret_cast(pOutputStream) & 0x1F)) + { + // Packed input, aligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m256 VV = _mm256_loadu_ps(reinterpret_cast(pInputVector)); + pInputVector += sizeof(XMFLOAT2) * 4; + + __m256 Y2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3)); + __m256 X2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2)); + __m256 Y1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1)); + __m256 X1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0)); + + __m256 vTempB = _mm256_fmadd_ps(Y1, row1, row3); + __m256 vTempB2 = _mm256_fmadd_ps(Y2, row1, row3); + __m256 vTempA = _mm256_mul_ps(X1, row0); + __m256 vTempA2 = _mm256_mul_ps(X2, row0); + vTempA = _mm256_add_ps(vTempA, vTempB); + vTempA2 = _mm256_add_ps(vTempA2, vTempB2); + + __m256 W = _mm256_shuffle_ps(vTempA, vTempA, _MM_SHUFFLE(3, 3, 3, 3)); + vTempA = _mm256_div_ps(vTempA, W); + + W = _mm256_shuffle_ps(vTempA2, vTempA2, _MM_SHUFFLE(3, 3, 3, 3)); + vTempA2 = _mm256_div_ps(vTempA2, W); + + X1 = _mm256_shuffle_ps(vTempA, vTempA2, 0x44); + XM256_STREAM_PS(reinterpret_cast(pOutputVector), X1); + pOutputVector += sizeof(XMFLOAT2) * 4; + + i += 4; + } + } + else + { + // Packed input, packed output + for (size_t j = 0; j < four; ++j) + { + __m256 VV = _mm256_loadu_ps(reinterpret_cast(pInputVector)); + pInputVector += sizeof(XMFLOAT2) * 4; + + __m256 Y2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3)); + __m256 X2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2)); + __m256 Y1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1)); + __m256 X1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0)); + + __m256 vTempB = _mm256_fmadd_ps(Y1, row1, row3); + __m256 vTempB2 = _mm256_fmadd_ps(Y2, row1, row3); + __m256 vTempA = _mm256_mul_ps(X1, row0); + __m256 vTempA2 = _mm256_mul_ps(X2, row0); + vTempA = _mm256_add_ps(vTempA, vTempB); + vTempA2 = _mm256_add_ps(vTempA2, vTempB2); + + __m256 W = _mm256_shuffle_ps(vTempA, vTempA, _MM_SHUFFLE(3, 3, 3, 3)); + vTempA = _mm256_div_ps(vTempA, W); + + W = _mm256_shuffle_ps(vTempA2, vTempA2, _MM_SHUFFLE(3, 3, 3, 3)); + vTempA2 = _mm256_div_ps(vTempA2, W); + + X1 = _mm256_shuffle_ps(vTempA, vTempA2, 0x44); + 
+                        _mm256_storeu_ps(reinterpret_cast<float*>(pOutputVector), X1);
+                        pOutputVector += sizeof(XMFLOAT2) * 4;
+
+                        i += 4;
+                    }
+                }
+            }
+            else
+            {
+                // Packed input, unpacked output
+                for (size_t j = 0; j < four; ++j)
+                {
+                    __m256 VV = _mm256_loadu_ps(reinterpret_cast<const float*>(pInputVector));
+                    pInputVector += sizeof(XMFLOAT2) * 4;
+
+                    __m256 Y2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3));
+                    __m256 X2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2));
+                    __m256 Y1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1));
+                    __m256 X1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0));
+
+                    __m256 vTempB = _mm256_fmadd_ps(Y1, row1, row3);
+                    __m256 vTempB2 = _mm256_fmadd_ps(Y2, row1, row3);
+                    __m256 vTempA = _mm256_mul_ps(X1, row0);
+                    __m256 vTempA2 = _mm256_mul_ps(X2, row0);
+                    vTempA = _mm256_add_ps(vTempA, vTempB);
+                    vTempA2 = _mm256_add_ps(vTempA2, vTempB2);
+
+                    __m256 W = _mm256_shuffle_ps(vTempA, vTempA, _MM_SHUFFLE(3, 3, 3, 3));
+                    vTempA = _mm256_div_ps(vTempA, W);
+
+                    W = _mm256_shuffle_ps(vTempA2, vTempA2, _MM_SHUFFLE(3, 3, 3, 3));
+                    vTempA2 = _mm256_div_ps(vTempA2, W);
+
+                    _mm_store_sd(reinterpret_cast<double*>(pOutputVector),
+                        _mm_castps_pd(_mm256_castps256_ps128(vTempA)));
+                    pOutputVector += OutputStride;
+
+                    _mm_store_sd(reinterpret_cast<double*>(pOutputVector),
+                        _mm_castps_pd(_mm256_castps256_ps128(vTempA2)));
+                    pOutputVector += OutputStride;
+
+                    _mm_store_sd(reinterpret_cast<double*>(pOutputVector),
+                        _mm_castps_pd(_mm256_extractf128_ps(vTempA, 1)));
+                    pOutputVector += OutputStride;
+
+                    _mm_store_sd(reinterpret_cast<double*>(pOutputVector),
+                        _mm_castps_pd(_mm256_extractf128_ps(vTempA2, 1)));
+                    pOutputVector += OutputStride;
+
+                    i += 4;
+                }
+            }
+        }
+    }
+
+    if (i < VectorCount)
+    {
+        const XMVECTOR row0 = M.r[0];
+        const XMVECTOR row1 = M.r[1];
+        const XMVECTOR row3 = M.r[3];
+
+        for (; i < VectorCount; i++)
+        {
+            __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pInputVector)));
+            pInputVector += InputStride;
+
+            XMVECTOR Y = XM_PERMUTE_PS(xy, _MM_SHUFFLE(1, 1, 1, 1));
+            XMVECTOR X = XM_PERMUTE_PS(xy, _MM_SHUFFLE(0, 0, 0, 0));
+
+            XMVECTOR vTemp = XM_FMADD_PS(Y, row1, row3);
+            XMVECTOR vTemp2 = _mm_mul_ps(X, row0);
+            vTemp = _mm_add_ps(vTemp, vTemp2);
+
+            XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
+            vTemp = _mm_div_ps(vTemp, W);
+
+            _mm_store_sd(reinterpret_cast<double*>(pOutputVector), _mm_castps_pd(vTemp));
+            pOutputVector += OutputStride;
+        }
+    }
+
+    XM_SFENCE();
+
+    return pOutputStream;
+#elif defined(_XM_SSE_INTRINSICS_)
+    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
+    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
+
+    const XMVECTOR row0 = M.r[0];
+    const XMVECTOR row1 = M.r[1];
+    const XMVECTOR row3 = M.r[3];
+
+    size_t i = 0;
+    size_t two = VectorCount >> 1;
+    if (two > 0)
+    {
+        if (InputStride == sizeof(XMFLOAT2))
+        {
+            if (OutputStride == sizeof(XMFLOAT2))
+            {
+                if (!(reinterpret_cast<uintptr_t>(pOutputStream) & 0xF))
+                {
+                    // Packed input, aligned & packed output
+                    for (size_t j = 0; j < two; ++j)
+                    {
+                        XMVECTOR V = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
+                        pInputVector += sizeof(XMFLOAT2) * 2;
+
+                        // Result 1
+                        XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
+                        XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
+
+                        XMVECTOR vTemp = XM_FMADD_PS(Y, row1, row3);
+                        XMVECTOR vTemp2 = _mm_mul_ps(X, row0);
+                        vTemp = _mm_add_ps(vTemp, vTemp2);
+
+                        XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
+
+                        XMVECTOR V1 = _mm_div_ps(vTemp, W);
+
+                        // Result 2
+                        Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
+                        X = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
+
+                        vTemp = XM_FMADD_PS(Y, row1, row3);
+                        vTemp2 = _mm_mul_ps(X, row0);
+                        vTemp = _mm_add_ps(vTemp, vTemp2);
+
+                        W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
+
+                        XMVECTOR V2 = _mm_div_ps(vTemp, W);
+
+                        vTemp = _mm_movelh_ps(V1, V2);
+
+                        XM_STREAM_PS(reinterpret_cast<float*>(pOutputVector), vTemp);
+                        pOutputVector += sizeof(XMFLOAT2) * 2;
+
+                        i += 2;
+                    }
+                }
+                else
+                {
+                    // Packed input, unaligned & packed output
+                    for (size_t j = 0; j < two; ++j)
+                    {
+                        XMVECTOR V = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
+                        pInputVector += sizeof(XMFLOAT2) * 2;
+
+                        // Result 1
+                        XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
+                        XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
+
+                        XMVECTOR vTemp = XM_FMADD_PS(Y, row1, row3);
+                        XMVECTOR vTemp2 = _mm_mul_ps(X, row0);
+                        vTemp = _mm_add_ps(vTemp, vTemp2);
+
+                        XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
+
+                        XMVECTOR V1 = _mm_div_ps(vTemp, W);
+
+                        // Result 2
+                        Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
+                        X = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
+
+                        vTemp = XM_FMADD_PS(Y, row1, row3);
+                        vTemp2 = _mm_mul_ps(X, row0);
+                        vTemp = _mm_add_ps(vTemp, vTemp2);
+
+                        W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
+
+                        XMVECTOR V2 = _mm_div_ps(vTemp, W);
+
+                        vTemp = _mm_movelh_ps(V1, V2);
+
+                        _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), vTemp);
+                        pOutputVector += sizeof(XMFLOAT2) * 2;
+
+                        i += 2;
+                    }
+                }
+            }
+            else
+            {
+                // Packed input, unpacked output
+                for (size_t j = 0; j < two; ++j)
+                {
+                    XMVECTOR V = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
+                    pInputVector += sizeof(XMFLOAT2) * 2;
+
+                    // Result 1
+                    XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
+                    XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
+
+                    XMVECTOR vTemp = XM_FMADD_PS(Y, row1, row3);
+                    XMVECTOR vTemp2 = _mm_mul_ps(X, row0);
+                    vTemp = _mm_add_ps(vTemp, vTemp2);
+
+                    XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
+
+                    vTemp = _mm_div_ps(vTemp, W);
+
+                    _mm_store_sd(reinterpret_cast<double*>(pOutputVector), _mm_castps_pd(vTemp));
+                    pOutputVector += OutputStride;
+
+                    // Result 2
+                    Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
+                    X = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
+
+                    vTemp = XM_FMADD_PS(Y, row1, row3);
+                    vTemp2 = _mm_mul_ps(X, row0);
+                    vTemp = _mm_add_ps(vTemp, vTemp2);
+
+                    W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
+
+                    vTemp = _mm_div_ps(vTemp, W);
+
+                    _mm_store_sd(reinterpret_cast<double*>(pOutputVector), _mm_castps_pd(vTemp));
+                    pOutputVector += OutputStride;
+
+                    i += 2;
+                }
+            }
+        }
+    }
+
+    if (!(reinterpret_cast<uintptr_t>(pInputVector) & 0xF) && !(InputStride & 0xF))
+    {
+        // Aligned input
+        for (; i < VectorCount; i++)
+        {
+            XMVECTOR V = _mm_castsi128_ps(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(pInputVector)));
+            pInputVector += InputStride;
+
+            XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
+            XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
+
+            XMVECTOR vTemp = XM_FMADD_PS(Y, row1, row3);
+            XMVECTOR vTemp2 = _mm_mul_ps(X, row0);
+            vTemp = _mm_add_ps(vTemp, vTemp2);
+
+            XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
+
+            vTemp = _mm_div_ps(vTemp, W);
+
+            _mm_store_sd(reinterpret_cast<double*>(pOutputVector), _mm_castps_pd(vTemp));
+            pOutputVector += OutputStride;
+        }
+    }
+    else
+    {
+        // Unaligned input
+        for (; i < VectorCount; i++)
+        {
+            __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pInputVector)));
+            pInputVector += InputStride;
+
+            XMVECTOR Y = XM_PERMUTE_PS(xy, _MM_SHUFFLE(1, 1, 1, 1));
+            XMVECTOR X = XM_PERMUTE_PS(xy, _MM_SHUFFLE(0, 0, 0, 0));
+
+            XMVECTOR vTemp = XM_FMADD_PS(Y, row1, row3);
+            XMVECTOR vTemp2 = _mm_mul_ps(X, row0);
+            vTemp = _mm_add_ps(vTemp, vTemp2);
+
+            XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
+
+            vTemp = _mm_div_ps(vTemp, W);
+
+            _mm_store_sd(reinterpret_cast<double*>(pOutputVector), _mm_castps_pd(vTemp));
+            pOutputVector += OutputStride;
+        }
+    }
+
+    XM_SFENCE();
+
+    return pOutputStream;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVector2TransformNormal
+(
+    FXMVECTOR V,
+    FXMMATRIX M
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR Y = XMVectorSplatY(V);
+    XMVECTOR X = XMVectorSplatX(V);
+
+    XMVECTOR Result = XMVectorMultiply(Y, M.r[1]);
+    Result = XMVectorMultiplyAdd(X, M.r[0], Result);
+
+    return Result;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x2_t VL = vget_low_f32(V);
+    float32x4_t Result = vmulq_lane_f32(M.r[1], VL, 1); // Y
+    return vmlaq_lane_f32(Result, M.r[0], VL, 0); // X
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); // Y
+    vResult = _mm_mul_ps(vResult, M.r[1]);
+    XMVECTOR vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); // X
+    vResult = XM_FMADD_PS(vTemp, M.r[0], vResult);
+    return vResult;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+_Use_decl_annotations_
+inline XMFLOAT2* XM_CALLCONV XMVector2TransformNormalStream
+(
+    XMFLOAT2* pOutputStream,
+    size_t OutputStride,
+    const XMFLOAT2* pInputStream,
+    size_t InputStride,
+    size_t VectorCount,
+    FXMMATRIX M
+) noexcept
+{
+    assert(pOutputStream != nullptr);
+    assert(pInputStream != nullptr);
+
+    assert(InputStride >= sizeof(XMFLOAT2));
+    _Analysis_assume_(InputStride >= sizeof(XMFLOAT2));
+
+    assert(OutputStride >= sizeof(XMFLOAT2));
+    _Analysis_assume_(OutputStride >= sizeof(XMFLOAT2));
+
+#if defined(_XM_NO_INTRINSICS_)
+
+    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
+    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
+
+    const XMVECTOR row0 = M.r[0];
+    const XMVECTOR row1 = M.r[1];
+
+    for (size_t i = 0; i < VectorCount; i++)
+    {
+        XMVECTOR V = XMLoadFloat2(reinterpret_cast<const XMFLOAT2*>(pInputVector));
+        XMVECTOR Y = XMVectorSplatY(V);
+        XMVECTOR X = XMVectorSplatX(V);
+
+        XMVECTOR Result = XMVectorMultiply(Y, row1);
+        Result = XMVectorMultiplyAdd(X, row0, Result);
+
+#ifdef _PREFAST_
+#pragma prefast(push)
+#pragma prefast(disable : 26015, "PREfast noise: Esp:1307" )
+#endif
+
+        XMStoreFloat2(reinterpret_cast<XMFLOAT2*>(pOutputVector), Result);
+
+#ifdef _PREFAST_
+#pragma prefast(pop)
+#endif
+
+        pInputVector += InputStride;
+        pOutputVector += OutputStride;
+    }
+
+    return pOutputStream;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
+    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
+
+    const XMVECTOR row0 = M.r[0];
+    const XMVECTOR row1 = M.r[1];
+
+    size_t i = 0;
+    size_t four = VectorCount >> 2;
+    if (four > 0)
+    {
+        if ((InputStride == sizeof(XMFLOAT2)) && (OutputStride == sizeof(XMFLOAT2)))
+        {
+            for (size_t j = 0; j < four; ++j)
+            {
+                float32x4x2_t V = vld2q_f32(reinterpret_cast<const float*>(pInputVector));
+                pInputVector += sizeof(XMFLOAT2) * 4;
+
+                float32x2_t r = vget_low_f32(row0);
+                XMVECTOR vResult0 = vmulq_lane_f32(V.val[0], r, 0); // Ax
+                XMVECTOR vResult1 = vmulq_lane_f32(V.val[0], r, 1); // Bx
+
+                XM_PREFETCH(pInputVector);
+                XM_PREFETCH(pInputVector + XM_CACHE_LINE_SIZE);
+
+                r = vget_low_f32(row1);
+                vResult0 = vmlaq_lane_f32(vResult0, V.val[1], r, 0); // Ax+Ey
+                vResult1 = vmlaq_lane_f32(vResult1, V.val[1], r, 1); // Bx+Fy
+
+                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 2));
+                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 3));
+
+                V.val[0] = vResult0;
+                V.val[1] = vResult1;
+
+                vst2q_f32(reinterpret_cast<float*>(pOutputVector), V);
+                pOutputVector += sizeof(XMFLOAT2) * 4;
+
+                i += 4;
+            }
+        }
+    }
+
+    for (; i < VectorCount; i++)
+    {
+        float32x2_t V = vld1_f32(reinterpret_cast<const float*>(pInputVector));
+        pInputVector += InputStride;
+
+        XMVECTOR vResult = vmulq_lane_f32(row0, V, 0); // X
+        vResult = vmlaq_lane_f32(vResult, row1, V, 1); // Y
+
+        V = vget_low_f32(vResult);
+        vst1_f32(reinterpret_cast<float*>(pOutputVector), V);
+        pOutputVector += OutputStride;
+    }
+
+    return pOutputStream;
+#elif defined(_XM_AVX2_INTRINSICS_)
+    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
+    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
+
+    size_t i = 0;
+    size_t four = VectorCount >> 2;
+    if (four > 0)
+    {
+        __m256 row0 = _mm256_broadcast_ps(&M.r[0]);
+        __m256 row1 = _mm256_broadcast_ps(&M.r[1]);
+
+        if (InputStride == sizeof(XMFLOAT2))
+        {
+            if (OutputStride == sizeof(XMFLOAT2))
+            {
+                if (!(reinterpret_cast<uintptr_t>(pOutputStream) & 0x1F))
+                {
+                    // Packed input, aligned & packed output
+                    for (size_t j = 0; j < four; ++j)
+                    {
+                        __m256 VV = _mm256_loadu_ps(reinterpret_cast<const float*>(pInputVector));
+                        pInputVector += sizeof(XMFLOAT2) * 4;
+
+                        __m256 Y2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3));
+                        __m256 X2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2));
+                        __m256 Y1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1));
+                        __m256 X1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0));
+
+                        __m256 vTempA = _mm256_mul_ps(Y1, row1);
+                        __m256 vTempB = _mm256_mul_ps(Y2, row1);
+                        vTempA = _mm256_fmadd_ps(X1, row0, vTempA);
+                        vTempB = _mm256_fmadd_ps(X2, row0, vTempB);
+
+                        X1 = _mm256_shuffle_ps(vTempA, vTempB, 0x44);
+                        XM256_STREAM_PS(reinterpret_cast<float*>(pOutputVector), X1);
+                        pOutputVector += sizeof(XMFLOAT2) * 4;
+
+                        i += 4;
+                    }
+                }
+                else
+                {
+                    // Packed input, packed output
+                    for (size_t j = 0; j < four; ++j)
+                    {
+                        __m256 VV = _mm256_loadu_ps(reinterpret_cast<const float*>(pInputVector));
+                        pInputVector += sizeof(XMFLOAT2) * 4;
+
+                        __m256 Y2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3));
+                        __m256 X2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2));
+                        __m256 Y1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1));
+                        __m256 X1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0));
+
+                        __m256 vTempA = _mm256_mul_ps(Y1, row1);
+                        __m256 vTempB = _mm256_mul_ps(Y2, row1);
+                        vTempA = _mm256_fmadd_ps(X1, row0, vTempA);
+                        vTempB = _mm256_fmadd_ps(X2, row0, vTempB);
+
+                        X1 = _mm256_shuffle_ps(vTempA, vTempB, 0x44);
+                        _mm256_storeu_ps(reinterpret_cast<float*>(pOutputVector), X1);
+                        pOutputVector += sizeof(XMFLOAT2) * 4;
+
+                        i += 4;
+                    }
+                }
+            }
+            else
+            {
+                // Packed input, unpacked output
+                for (size_t j = 0; j < four; ++j)
+                {
+                    __m256 VV = _mm256_loadu_ps(reinterpret_cast<const float*>(pInputVector));
+                    pInputVector += sizeof(XMFLOAT2) * 4;
+
+                    __m256 Y2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3));
+                    __m256 X2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2));
+                    __m256 Y1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1));
+                    __m256 X1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0));
+
+                    __m256 vTempA = _mm256_mul_ps(Y1, row1);
+                    __m256 vTempB = _mm256_mul_ps(Y2, row1);
+                    vTempA = _mm256_fmadd_ps(X1, row0, vTempA);
+                    vTempB = _mm256_fmadd_ps(X2, row0, vTempB);
+
+                    _mm_store_sd(reinterpret_cast<double*>(pOutputVector),
+                        _mm_castps_pd(_mm256_castps256_ps128(vTempA)));
+                    pOutputVector += OutputStride;
+
+                    _mm_store_sd(reinterpret_cast<double*>(pOutputVector),
+                        _mm_castps_pd(_mm256_castps256_ps128(vTempB)));
+                    pOutputVector += OutputStride;
+
+                    _mm_store_sd(reinterpret_cast<double*>(pOutputVector),
+                        _mm_castps_pd(_mm256_extractf128_ps(vTempA, 1)));
+                    pOutputVector += OutputStride;
+
+                    _mm_store_sd(reinterpret_cast<double*>(pOutputVector),
+                        _mm_castps_pd(_mm256_extractf128_ps(vTempB, 1)));
+                    pOutputVector += OutputStride;
+
+                    i += 4;
+                }
+            }
+        }
+    }
+
+    if (i < VectorCount)
+    {
+        const XMVECTOR row0 = M.r[0];
+        const XMVECTOR row1 = M.r[1];
+
+        for (; i < VectorCount; i++)
+        {
+            __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pInputVector)));
+            pInputVector += InputStride;
+
+            XMVECTOR Y = XM_PERMUTE_PS(xy, _MM_SHUFFLE(1, 1, 1, 1));
+            XMVECTOR X = XM_PERMUTE_PS(xy, _MM_SHUFFLE(0, 0, 0, 0));
+
+            XMVECTOR vTemp = _mm_mul_ps(Y, row1);
+            vTemp = XM_FMADD_PS(X, row0, vTemp);
+
+            _mm_store_sd(reinterpret_cast<double*>(pOutputVector), _mm_castps_pd(vTemp));
+            pOutputVector += OutputStride;
+        }
+    }
+
+    XM_SFENCE();
+
+    return pOutputStream;
+#elif defined(_XM_SSE_INTRINSICS_)
+    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
+    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
+
+    const XMVECTOR row0 = M.r[0];
+    const XMVECTOR row1 = M.r[1];
+
+    size_t i = 0;
+    size_t two = VectorCount >> 1;
+    if (two > 0)
+    {
+        if (InputStride == sizeof(XMFLOAT2))
+        {
+            if (OutputStride == sizeof(XMFLOAT2))
+            {
+                if (!(reinterpret_cast<uintptr_t>(pOutputStream) & 0xF))
+                {
+                    // Packed input, aligned & packed output
+                    for (size_t j = 0; j < two; ++j)
+                    {
+                        XMVECTOR V = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
+                        pInputVector += sizeof(XMFLOAT2) * 2;
+
+                        // Result 1
+                        XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
+                        XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
+
+                        XMVECTOR vTemp = _mm_mul_ps(Y, row1);
+                        XMVECTOR V1 = XM_FMADD_PS(X, row0, vTemp);
+
+                        // Result 2
+                        Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
+                        X = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
+
+                        vTemp = _mm_mul_ps(Y, row1);
+                        XMVECTOR V2 = XM_FMADD_PS(X, row0, vTemp);
+
+                        vTemp = _mm_movelh_ps(V1, V2);
+
+                        XM_STREAM_PS(reinterpret_cast<float*>(pOutputVector), vTemp);
+                        pOutputVector += sizeof(XMFLOAT2) * 2;
+
+                        i += 2;
+                    }
+                }
+                else
+                {
+                    // Packed input, unaligned & packed output
+                    for (size_t j = 0; j < two; ++j)
+                    {
+                        XMVECTOR V = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
+                        pInputVector += sizeof(XMFLOAT2) * 2;
+
+                        // Result 1
+                        XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
+                        XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
+
+                        XMVECTOR vTemp = _mm_mul_ps(Y, row1);
+                        XMVECTOR V1 = XM_FMADD_PS(X, row0, vTemp);
+
+                        // Result 2
+                        Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
+                        X = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
+
+                        vTemp = _mm_mul_ps(Y, row1);
+                        XMVECTOR V2 = XM_FMADD_PS(X, row0, vTemp);
+
+                        vTemp = _mm_movelh_ps(V1, V2);
+
+                        _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), vTemp);
+                        pOutputVector += sizeof(XMFLOAT2) * 2;
+
+                        i += 2;
+                    }
+                }
+            }
+            else
+            {
+                // Packed input, unpacked output
+                for (size_t j = 0; j < two; ++j)
+                {
+                    XMVECTOR V = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
+                    pInputVector += sizeof(XMFLOAT2) * 2;
+
+                    // Result 1
+                    XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
+                    XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
+
+                    XMVECTOR vTemp = _mm_mul_ps(Y, row1);
+                    vTemp = XM_FMADD_PS(X, row0, vTemp);
+
+                    _mm_store_sd(reinterpret_cast<double*>(pOutputVector), _mm_castps_pd(vTemp));
+                    pOutputVector += OutputStride;
+
+                    // Result 2
+                    Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
+                    X = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
+
+                    vTemp = _mm_mul_ps(Y, row1);
+                    vTemp = XM_FMADD_PS(X, row0, vTemp);
+
+                    _mm_store_sd(reinterpret_cast<double*>(pOutputVector), _mm_castps_pd(vTemp));
+                    pOutputVector += OutputStride;
+
+                    i += 2;
+                }
+            }
+        }
+    }
+
+    if (!(reinterpret_cast<uintptr_t>(pInputVector) & 0xF) && !(InputStride & 0xF))
+    {
+        // Aligned input
+        for (; i < VectorCount; i++)
+        {
+            XMVECTOR V = _mm_castsi128_ps(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(pInputVector)));
+            pInputVector += InputStride;
+
+            XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
+            XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
+
+            XMVECTOR vTemp = _mm_mul_ps(Y, row1);
+            vTemp = XM_FMADD_PS(X, row0, vTemp);
+
+            _mm_store_sd(reinterpret_cast<double*>(pOutputVector), _mm_castps_pd(vTemp));
+            pOutputVector += OutputStride;
+        }
+    }
+    else
+    {
+        // Unaligned input
+        for (; i < VectorCount; i++)
+        {
+            __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pInputVector)));
+            pInputVector += InputStride;
+
+            XMVECTOR Y = XM_PERMUTE_PS(xy, _MM_SHUFFLE(1, 1, 1, 1));
+            XMVECTOR X = XM_PERMUTE_PS(xy, _MM_SHUFFLE(0, 0, 0, 0));
+
+            XMVECTOR vTemp = _mm_mul_ps(Y, row1);
+            vTemp = XM_FMADD_PS(X, row0, vTemp);
+
+            _mm_store_sd(reinterpret_cast<double*>(pOutputVector), _mm_castps_pd(vTemp));
+            pOutputVector += OutputStride;
+        }
+    }
+
+    XM_SFENCE();
+
+    return pOutputStream;
+#endif
+}
+
+/****************************************************************************
+ *
+ * 3D Vector
+ *
+ ****************************************************************************/
+
+ //------------------------------------------------------------------------------
+ // Comparison operations
+ //------------------------------------------------------------------------------
+
+ //------------------------------------------------------------------------------
+
+inline bool XM_CALLCONV XMVector3Equal
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    return (((V1.vector4_f32[0] == V2.vector4_f32[0]) && (V1.vector4_f32[1] == V2.vector4_f32[1]) && (V1.vector4_f32[2] == V2.vector4_f32[2])) != 0);
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x4_t vResult = vceqq_f32(V1, V2);
+    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult)));
+    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1]));
+    return ((vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) & 0xFFFFFFU) == 0xFFFFFFU);
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vTemp = _mm_cmpeq_ps(V1, V2);
+    return (((_mm_movemask_ps(vTemp) & 7) == 7) != 0);
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline uint32_t XM_CALLCONV XMVector3EqualR
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+) noexcept
+{
+#if defined(_XM_NO_INTRINSICS_)
+    uint32_t CR = 0;
+    if ((V1.vector4_f32[0] == V2.vector4_f32[0]) &&
+        (V1.vector4_f32[1] == V2.vector4_f32[1]) &&
+        (V1.vector4_f32[2] == V2.vector4_f32[2]))
+    {
+        CR = XM_CRMASK_CR6TRUE;
+    }
+    else if ((V1.vector4_f32[0] != V2.vector4_f32[0]) &&
+        (V1.vector4_f32[1] != V2.vector4_f32[1]) &&
+        (V1.vector4_f32[2] != V2.vector4_f32[2]))
+    {
+        CR = XM_CRMASK_CR6FALSE;
+    }
+    return CR;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x4_t vResult = vceqq_f32(V1, V2);
+    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult)));
+    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1]));
+    uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) & 0xFFFFFFU;
+
+    uint32_t CR = 0;
+    if (r == 0xFFFFFFU)
+    {
+        CR = 
XM_CRMASK_CR6TRUE; + } + else if (!r) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpeq_ps(V1, V2); + int iTest = _mm_movemask_ps(vTemp) & 7; + uint32_t CR = 0; + if (iTest == 7) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3EqualInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_u32[0] == V2.vector4_u32[0]) && (V1.vector4_u32[1] == V2.vector4_u32[1]) && (V1.vector4_u32[2] == V2.vector4_u32[2])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vceqq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2)); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return ((vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) & 0xFFFFFFU) == 0xFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2)); + return (((_mm_movemask_ps(_mm_castsi128_ps(vTemp)) & 7) == 7) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline uint32_t XM_CALLCONV XMVector3EqualIntR +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + uint32_t CR = 0; + if ((V1.vector4_u32[0] == V2.vector4_u32[0]) && + (V1.vector4_u32[1] == V2.vector4_u32[1]) && + (V1.vector4_u32[2] == V2.vector4_u32[2])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_u32[0] != V2.vector4_u32[0]) && + (V1.vector4_u32[1] != V2.vector4_u32[1]) && + (V1.vector4_u32[2] != V2.vector4_u32[2])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vceqq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2)); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) & 0xFFFFFFU; + + uint32_t CR = 0; + if (r == 0xFFFFFFU) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!r) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2)); + int iTemp = _mm_movemask_ps(_mm_castsi128_ps(vTemp)) & 7; + uint32_t CR = 0; + if (iTemp == 7) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTemp) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3NearEqual +( + FXMVECTOR V1, + FXMVECTOR V2, + FXMVECTOR Epsilon +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + float dx, dy, dz; + + dx = fabsf(V1.vector4_f32[0] - V2.vector4_f32[0]); + dy = fabsf(V1.vector4_f32[1] - V2.vector4_f32[1]); + dz = fabsf(V1.vector4_f32[2] - V2.vector4_f32[2]); + return (((dx <= Epsilon.vector4_f32[0]) && + (dy <= Epsilon.vector4_f32[1]) && + (dz <= Epsilon.vector4_f32[2])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t vDelta = vsubq_f32(V1, V2); +#if defined(_MSC_VER) && !defined(__clang__) && 
!defined(_ARM64_DISTINCT_NEON_TYPES) + uint32x4_t vResult = vacleq_f32(vDelta, Epsilon); +#else + uint32x4_t vResult = vcleq_f32(vabsq_f32(vDelta), Epsilon); +#endif + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return ((vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) & 0xFFFFFFU) == 0xFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + // Get the difference + XMVECTOR vDelta = _mm_sub_ps(V1, V2); + // Get the absolute value of the difference + XMVECTOR vTemp = _mm_setzero_ps(); + vTemp = _mm_sub_ps(vTemp, vDelta); + vTemp = _mm_max_ps(vTemp, vDelta); + vTemp = _mm_cmple_ps(vTemp, Epsilon); + // w is don't care + return (((_mm_movemask_ps(vTemp) & 7) == 0x7) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3NotEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] != V2.vector4_f32[0]) || (V1.vector4_f32[1] != V2.vector4_f32[1]) || (V1.vector4_f32[2] != V2.vector4_f32[2])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vceqq_f32(V1, V2); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return ((vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) & 0xFFFFFFU) != 0xFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpeq_ps(V1, V2); + return (((_mm_movemask_ps(vTemp) & 7) != 7) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3NotEqualInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_u32[0] != V2.vector4_u32[0]) || (V1.vector4_u32[1] != V2.vector4_u32[1]) || (V1.vector4_u32[2] != V2.vector4_u32[2])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vceqq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2)); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return ((vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) & 0xFFFFFFU) != 0xFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2)); + return (((_mm_movemask_ps(_mm_castsi128_ps(vTemp)) & 7) != 7) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3Greater +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] > V2.vector4_f32[0]) && (V1.vector4_f32[1] > V2.vector4_f32[1]) && (V1.vector4_f32[2] > V2.vector4_f32[2])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcgtq_f32(V1, V2); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return ((vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) & 0xFFFFFFU) == 0xFFFFFFU); +#elif 
defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpgt_ps(V1, V2); + return (((_mm_movemask_ps(vTemp) & 7) == 7) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline uint32_t XM_CALLCONV XMVector3GreaterR +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + uint32_t CR = 0; + if ((V1.vector4_f32[0] > V2.vector4_f32[0]) && + (V1.vector4_f32[1] > V2.vector4_f32[1]) && + (V1.vector4_f32[2] > V2.vector4_f32[2])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_f32[0] <= V2.vector4_f32[0]) && + (V1.vector4_f32[1] <= V2.vector4_f32[1]) && + (V1.vector4_f32[2] <= V2.vector4_f32[2])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcgtq_f32(V1, V2); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) & 0xFFFFFFU; + + uint32_t CR = 0; + if (r == 0xFFFFFFU) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!r) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpgt_ps(V1, V2); + uint32_t CR = 0; + int iTest = _mm_movemask_ps(vTemp) & 7; + if (iTest == 7) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3GreaterOrEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] >= V2.vector4_f32[0]) && (V1.vector4_f32[1] >= V2.vector4_f32[1]) && (V1.vector4_f32[2] >= V2.vector4_f32[2])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcgeq_f32(V1, V2); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return ((vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) & 0xFFFFFFU) == 0xFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpge_ps(V1, V2); + return (((_mm_movemask_ps(vTemp) & 7) == 7) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline uint32_t XM_CALLCONV XMVector3GreaterOrEqualR +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + uint32_t CR = 0; + if ((V1.vector4_f32[0] >= V2.vector4_f32[0]) && + (V1.vector4_f32[1] >= V2.vector4_f32[1]) && + (V1.vector4_f32[2] >= V2.vector4_f32[2])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_f32[0] < V2.vector4_f32[0]) && + (V1.vector4_f32[1] < V2.vector4_f32[1]) && + (V1.vector4_f32[2] < V2.vector4_f32[2])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcgeq_f32(V1, V2); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) & 0xFFFFFFU; + + uint32_t CR = 0; + if (r == 0xFFFFFFU) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!r) + { + CR = 
XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpge_ps(V1, V2); + uint32_t CR = 0; + int iTest = _mm_movemask_ps(vTemp) & 7; + if (iTest == 7) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3Less +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] < V2.vector4_f32[0]) && (V1.vector4_f32[1] < V2.vector4_f32[1]) && (V1.vector4_f32[2] < V2.vector4_f32[2])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcltq_f32(V1, V2); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return ((vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) & 0xFFFFFFU) == 0xFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmplt_ps(V1, V2); + return (((_mm_movemask_ps(vTemp) & 7) == 7) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3LessOrEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] <= V2.vector4_f32[0]) && (V1.vector4_f32[1] <= V2.vector4_f32[1]) && (V1.vector4_f32[2] <= V2.vector4_f32[2])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcleq_f32(V1, V2); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return ((vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) & 0xFFFFFFU) == 0xFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmple_ps(V1, V2); + return (((_mm_movemask_ps(vTemp) & 7) == 7) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3InBounds +( + FXMVECTOR V, + FXMVECTOR Bounds +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) && + (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1]) && + (V.vector4_f32[2] <= Bounds.vector4_f32[2] && V.vector4_f32[2] >= -Bounds.vector4_f32[2])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Test if less than or equal + uint32x4_t ivTemp1 = vcleq_f32(V, Bounds); + // Negate the bounds + float32x4_t vTemp2 = vnegq_f32(Bounds); + // Test if greater or equal (Reversed) + uint32x4_t ivTemp2 = vcleq_f32(vTemp2, V); + // Blend answers + ivTemp1 = vandq_u32(ivTemp1, ivTemp2); + // in bounds? 
+ uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(ivTemp1)), vget_high_u8(vreinterpretq_u8_u32(ivTemp1))); + uint16x4x2_t vTemp3 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return ((vget_lane_u32(vreinterpret_u32_u16(vTemp3.val[1]), 1) & 0xFFFFFFU) == 0xFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + // Test if less than or equal + XMVECTOR vTemp1 = _mm_cmple_ps(V, Bounds); + // Negate the bounds + XMVECTOR vTemp2 = _mm_mul_ps(Bounds, g_XMNegativeOne); + // Test if greater or equal (Reversed) + vTemp2 = _mm_cmple_ps(vTemp2, V); + // Blend answers + vTemp1 = _mm_and_ps(vTemp1, vTemp2); + // x,y and z in bounds? (w is don't care) + return (((_mm_movemask_ps(vTemp1) & 0x7) == 0x7) != 0); +#else + return XMComparisonAllInBounds(XMVector3InBoundsR(V, Bounds)); +#endif +} + +//------------------------------------------------------------------------------ + +#if !defined(_XM_NO_INTRINSICS_) && defined(_MSC_VER) && !defined(__clang__) && !defined(__INTEL_COMPILER) +#pragma float_control(push) +#pragma float_control(precise, on) +#endif + +inline bool XM_CALLCONV XMVector3IsNaN(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + return (XMISNAN(V.vector4_f32[0]) || + XMISNAN(V.vector4_f32[1]) || + XMISNAN(V.vector4_f32[2])); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Test against itself. NaN is always not equal + uint32x4_t vTempNan = vceqq_f32(V, V); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vTempNan)), vget_high_u8(vreinterpretq_u8_u32(vTempNan))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + // If x or y or z are NaN, the mask is zero + return ((vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) & 0xFFFFFFU) != 0xFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + // Test against itself. NaN is always not equal + XMVECTOR vTempNan = _mm_cmpneq_ps(V, V); + // If x or y or z are NaN, the mask is non-zero + return ((_mm_movemask_ps(vTempNan) & 7) != 0); +#endif +} + +#if !defined(_XM_NO_INTRINSICS_) && defined(_MSC_VER) && !defined(__clang__) && !defined(__INTEL_COMPILER) +#pragma float_control(pop) +#endif + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3IsInfinite(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (XMISINF(V.vector4_f32[0]) || + XMISINF(V.vector4_f32[1]) || + XMISINF(V.vector4_f32[2])); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Mask off the sign bit + uint32x4_t vTempInf = vandq_u32(vreinterpretq_u32_f32(V), g_XMAbsMask); + // Compare to infinity + vTempInf = vceqq_f32(vreinterpretq_f32_u32(vTempInf), g_XMInfinity); + // If any are infinity, the signs are true. + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vTempInf)), vget_high_u8(vreinterpretq_u8_u32(vTempInf))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return ((vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) & 0xFFFFFFU) != 0); +#elif defined(_XM_SSE_INTRINSICS_) + // Mask off the sign bit + __m128 vTemp = _mm_and_ps(V, g_XMAbsMask); + // Compare to infinity + vTemp = _mm_cmpeq_ps(vTemp, g_XMInfinity); + // If x,y or z are infinity, the signs are true. 
+ return ((_mm_movemask_ps(vTemp) & 7) != 0); +#endif +} + +//------------------------------------------------------------------------------ +// Computation operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3Dot +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + float fValue = V1.vector4_f32[0] * V2.vector4_f32[0] + V1.vector4_f32[1] * V2.vector4_f32[1] + V1.vector4_f32[2] * V2.vector4_f32[2]; + XMVECTORF32 vResult; + vResult.f[0] = + vResult.f[1] = + vResult.f[2] = + vResult.f[3] = fValue; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t vTemp = vmulq_f32(V1, V2); + float32x2_t v1 = vget_low_f32(vTemp); + float32x2_t v2 = vget_high_f32(vTemp); + v1 = vpadd_f32(v1, v1); + v2 = vdup_lane_f32(v2, 0); + v1 = vadd_f32(v1, v2); + return vcombine_f32(v1, v1); +#elif defined(_XM_SSE4_INTRINSICS_) + return _mm_dp_ps(V1, V2, 0x7f); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vTemp = _mm_mul_ps(V1, V2); + vTemp = _mm_and_ps(vTemp, g_XMMask3); + vTemp = _mm_hadd_ps(vTemp, vTemp); + return _mm_hadd_ps(vTemp, vTemp); +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product + XMVECTOR vDot = _mm_mul_ps(V1, V2); + // x=Dot.vector4_f32[1], y=Dot.vector4_f32[2] + XMVECTOR vTemp = XM_PERMUTE_PS(vDot, _MM_SHUFFLE(2, 1, 2, 1)); + // Result.vector4_f32[0] = x+y + vDot = _mm_add_ss(vDot, vTemp); + // x=Dot.vector4_f32[2] + vTemp = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(1, 1, 1, 1)); + // Result.vector4_f32[0] = (x+y)+z + vDot = _mm_add_ss(vDot, vTemp); + // Splat x + return XM_PERMUTE_PS(vDot, _MM_SHUFFLE(0, 0, 0, 0)); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3Cross +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ + // [ V1.y*V2.z - V1.z*V2.y, V1.z*V2.x - V1.x*V2.z, V1.x*V2.y - V1.y*V2.x ] + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { { { + (V1.vector4_f32[1] * V2.vector4_f32[2]) - (V1.vector4_f32[2] * V2.vector4_f32[1]), + (V1.vector4_f32[2] * V2.vector4_f32[0]) - (V1.vector4_f32[0] * V2.vector4_f32[2]), + (V1.vector4_f32[0] * V2.vector4_f32[1]) - (V1.vector4_f32[1] * V2.vector4_f32[0]), + 0.0f + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t v1xy = vget_low_f32(V1); + float32x2_t v2xy = vget_low_f32(V2); + + float32x2_t v1yx = vrev64_f32(v1xy); + float32x2_t v2yx = vrev64_f32(v2xy); + + float32x2_t v1zz = vdup_lane_f32(vget_high_f32(V1), 0); + float32x2_t v2zz = vdup_lane_f32(vget_high_f32(V2), 0); + + XMVECTOR vResult = vmulq_f32(vcombine_f32(v1yx, v1xy), vcombine_f32(v2zz, v2yx)); + vResult = vmlsq_f32(vResult, vcombine_f32(v1zz, v1yx), vcombine_f32(v2yx, v2xy)); + vResult = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(vResult), g_XMFlipY)); + return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(vResult), g_XMMask3)); +#elif defined(_XM_SSE_INTRINSICS_) + // y1,z1,x1,w1 + XMVECTOR vTemp1 = XM_PERMUTE_PS(V1, _MM_SHUFFLE(3, 0, 2, 1)); + // z2,x2,y2,w2 + XMVECTOR vTemp2 = XM_PERMUTE_PS(V2, _MM_SHUFFLE(3, 1, 0, 2)); + // Perform the left operation + XMVECTOR vResult = _mm_mul_ps(vTemp1, vTemp2); + // z1,x1,y1,w1 + vTemp1 = XM_PERMUTE_PS(vTemp1, _MM_SHUFFLE(3, 0, 2, 1)); + // y2,z2,x2,w2 + vTemp2 = XM_PERMUTE_PS(vTemp2, _MM_SHUFFLE(3, 1, 0, 2)); + // Perform the right operation + vResult = XM_FNMADD_PS(vTemp1, vTemp2, vResult); 
+ // Set w to zero + return _mm_and_ps(vResult, g_XMMask3); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3LengthSq(FXMVECTOR V) noexcept +{ + return XMVector3Dot(V, V); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3ReciprocalLengthEst(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result = XMVector3LengthSq(V); + Result = XMVectorReciprocalSqrtEst(Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot3 + float32x4_t vTemp = vmulq_f32(V, V); + float32x2_t v1 = vget_low_f32(vTemp); + float32x2_t v2 = vget_high_f32(vTemp); + v1 = vpadd_f32(v1, v1); + v2 = vdup_lane_f32(v2, 0); + v1 = vadd_f32(v1, v2); + // Reciprocal sqrt (estimate) + v2 = vrsqrte_f32(v1); + return vcombine_f32(v2, v2); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps(V, V, 0x7f); + return _mm_rsqrt_ps(vTemp); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + vLengthSq = _mm_and_ps(vLengthSq, g_XMMask3); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_rsqrt_ps(vLengthSq); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y and z + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + // vTemp has z and y + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 2, 1, 2)); + // x+z, y + vLengthSq = _mm_add_ss(vLengthSq, vTemp); + // y,y,y,y + vTemp = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(1, 1, 1, 1)); + // x+z+y,??,??,?? + vLengthSq = _mm_add_ss(vLengthSq, vTemp); + // Splat the length squared + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + // Get the reciprocal + vLengthSq = _mm_rsqrt_ps(vLengthSq); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3ReciprocalLength(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result = XMVector3LengthSq(V); + Result = XMVectorReciprocalSqrt(Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot3 + float32x4_t vTemp = vmulq_f32(V, V); + float32x2_t v1 = vget_low_f32(vTemp); + float32x2_t v2 = vget_high_f32(vTemp); + v1 = vpadd_f32(v1, v1); + v2 = vdup_lane_f32(v2, 0); + v1 = vadd_f32(v1, v2); + // Reciprocal sqrt + float32x2_t S0 = vrsqrte_f32(v1); + float32x2_t P0 = vmul_f32(v1, S0); + float32x2_t R0 = vrsqrts_f32(P0, S0); + float32x2_t S1 = vmul_f32(S0, R0); + float32x2_t P1 = vmul_f32(v1, S1); + float32x2_t R1 = vrsqrts_f32(P1, S1); + float32x2_t Result = vmul_f32(S1, R1); + return vcombine_f32(Result, Result); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps(V, V, 0x7f); + XMVECTOR vLengthSq = _mm_sqrt_ps(vTemp); + return _mm_div_ps(g_XMOne, vLengthSq); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vDot = _mm_mul_ps(V, V); + vDot = _mm_and_ps(vDot, g_XMMask3); + vDot = _mm_hadd_ps(vDot, vDot); + vDot = _mm_hadd_ps(vDot, vDot); + vDot = _mm_sqrt_ps(vDot); + vDot = _mm_div_ps(g_XMOne, vDot); + return vDot; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product + XMVECTOR vDot = _mm_mul_ps(V, V); + // x=Dot.y, y=Dot.z + XMVECTOR vTemp = XM_PERMUTE_PS(vDot, _MM_SHUFFLE(2, 1, 2, 1)); + // Result.x = x+y + vDot = _mm_add_ss(vDot, vTemp); + // x=Dot.z + vTemp = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(1, 1, 1, 1)); + // Result.x = 
(x+y)+z + vDot = _mm_add_ss(vDot, vTemp); + // Splat x + vDot = XM_PERMUTE_PS(vDot, _MM_SHUFFLE(0, 0, 0, 0)); + // Get the reciprocal + vDot = _mm_sqrt_ps(vDot); + // Get the reciprocal + vDot = _mm_div_ps(g_XMOne, vDot); + return vDot; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3LengthEst(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result = XMVector3LengthSq(V); + Result = XMVectorSqrtEst(Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot3 + float32x4_t vTemp = vmulq_f32(V, V); + float32x2_t v1 = vget_low_f32(vTemp); + float32x2_t v2 = vget_high_f32(vTemp); + v1 = vpadd_f32(v1, v1); + v2 = vdup_lane_f32(v2, 0); + v1 = vadd_f32(v1, v2); + const float32x2_t zero = vdup_n_f32(0); + uint32x2_t VEqualsZero = vceq_f32(v1, zero); + // Sqrt (estimate) + float32x2_t Result = vrsqrte_f32(v1); + Result = vmul_f32(v1, Result); + Result = vbsl_f32(VEqualsZero, zero, Result); + return vcombine_f32(Result, Result); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps(V, V, 0x7f); + return _mm_sqrt_ps(vTemp); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + vLengthSq = _mm_and_ps(vLengthSq, g_XMMask3); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_sqrt_ps(vLengthSq); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y and z + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + // vTemp has z and y + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 2, 1, 2)); + // x+z, y + vLengthSq = _mm_add_ss(vLengthSq, vTemp); + // y,y,y,y + vTemp = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(1, 1, 1, 1)); + // x+z+y,??,??,?? 
+ vLengthSq = _mm_add_ss(vLengthSq, vTemp); + // Splat the length squared + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + // Get the length + vLengthSq = _mm_sqrt_ps(vLengthSq); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3Length(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result = XMVector3LengthSq(V); + Result = XMVectorSqrt(Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot3 + float32x4_t vTemp = vmulq_f32(V, V); + float32x2_t v1 = vget_low_f32(vTemp); + float32x2_t v2 = vget_high_f32(vTemp); + v1 = vpadd_f32(v1, v1); + v2 = vdup_lane_f32(v2, 0); + v1 = vadd_f32(v1, v2); + const float32x2_t zero = vdup_n_f32(0); + uint32x2_t VEqualsZero = vceq_f32(v1, zero); + // Sqrt + float32x2_t S0 = vrsqrte_f32(v1); + float32x2_t P0 = vmul_f32(v1, S0); + float32x2_t R0 = vrsqrts_f32(P0, S0); + float32x2_t S1 = vmul_f32(S0, R0); + float32x2_t P1 = vmul_f32(v1, S1); + float32x2_t R1 = vrsqrts_f32(P1, S1); + float32x2_t Result = vmul_f32(S1, R1); + Result = vmul_f32(v1, Result); + Result = vbsl_f32(VEqualsZero, zero, Result); + return vcombine_f32(Result, Result); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps(V, V, 0x7f); + return _mm_sqrt_ps(vTemp); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + vLengthSq = _mm_and_ps(vLengthSq, g_XMMask3); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_sqrt_ps(vLengthSq); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y and z + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + // vTemp has z and y + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 2, 1, 2)); + // x+z, y + vLengthSq = _mm_add_ss(vLengthSq, vTemp); + // y,y,y,y + vTemp = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(1, 1, 1, 1)); + // x+z+y,??,??,?? + vLengthSq = _mm_add_ss(vLengthSq, vTemp); + // Splat the length squared + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + // Get the length + vLengthSq = _mm_sqrt_ps(vLengthSq); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ +// XMVector3NormalizeEst uses a reciprocal estimate and +// returns QNaN on zero and infinite vectors. 
+ +inline XMVECTOR XM_CALLCONV XMVector3NormalizeEst(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result = XMVector3ReciprocalLength(V); + Result = XMVectorMultiply(V, Result); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot3 + float32x4_t vTemp = vmulq_f32(V, V); + float32x2_t v1 = vget_low_f32(vTemp); + float32x2_t v2 = vget_high_f32(vTemp); + v1 = vpadd_f32(v1, v1); + v2 = vdup_lane_f32(v2, 0); + v1 = vadd_f32(v1, v2); + // Reciprocal sqrt (estimate) + v2 = vrsqrte_f32(v1); + // Normalize + return vmulq_f32(V, vcombine_f32(v2, v2)); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps(V, V, 0x7f); + XMVECTOR vResult = _mm_rsqrt_ps(vTemp); + return _mm_mul_ps(vResult, V); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vDot = _mm_mul_ps(V, V); + vDot = _mm_and_ps(vDot, g_XMMask3); + vDot = _mm_hadd_ps(vDot, vDot); + vDot = _mm_hadd_ps(vDot, vDot); + vDot = _mm_rsqrt_ps(vDot); + vDot = _mm_mul_ps(vDot, V); + return vDot; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product + XMVECTOR vDot = _mm_mul_ps(V, V); + // x=Dot.y, y=Dot.z + XMVECTOR vTemp = XM_PERMUTE_PS(vDot, _MM_SHUFFLE(2, 1, 2, 1)); + // Result.x = x+y + vDot = _mm_add_ss(vDot, vTemp); + // x=Dot.z + vTemp = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(1, 1, 1, 1)); + // Result.x = (x+y)+z + vDot = _mm_add_ss(vDot, vTemp); + // Splat x + vDot = XM_PERMUTE_PS(vDot, _MM_SHUFFLE(0, 0, 0, 0)); + // Get the reciprocal + vDot = _mm_rsqrt_ps(vDot); + // Perform the normalization + vDot = _mm_mul_ps(vDot, V); + return vDot; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3Normalize(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + float fLength; + XMVECTOR vResult; + + vResult = XMVector3Length(V); + fLength = vResult.vector4_f32[0]; + + // Prevent divide by zero + if (fLength > 0) + { + fLength = 1.0f / fLength; + } + + vResult.vector4_f32[0] = V.vector4_f32[0] * fLength; + vResult.vector4_f32[1] = V.vector4_f32[1] * fLength; + vResult.vector4_f32[2] = V.vector4_f32[2] * fLength; + vResult.vector4_f32[3] = V.vector4_f32[3] * fLength; + return vResult; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot3 + float32x4_t vTemp = vmulq_f32(V, V); + float32x2_t v1 = vget_low_f32(vTemp); + float32x2_t v2 = vget_high_f32(vTemp); + v1 = vpadd_f32(v1, v1); + v2 = vdup_lane_f32(v2, 0); + v1 = vadd_f32(v1, v2); + uint32x2_t VEqualsZero = vceq_f32(v1, vdup_n_f32(0)); + uint32x2_t VEqualsInf = vceq_f32(v1, vget_low_f32(g_XMInfinity)); + // Reciprocal sqrt (2 iterations of Newton-Raphson) + float32x2_t S0 = vrsqrte_f32(v1); + float32x2_t P0 = vmul_f32(v1, S0); + float32x2_t R0 = vrsqrts_f32(P0, S0); + float32x2_t S1 = vmul_f32(S0, R0); + float32x2_t P1 = vmul_f32(v1, S1); + float32x2_t R1 = vrsqrts_f32(P1, S1); + v2 = vmul_f32(S1, R1); + // Normalize + XMVECTOR vResult = vmulq_f32(V, vcombine_f32(v2, v2)); + vResult = vbslq_f32(vcombine_u32(VEqualsZero, VEqualsZero), vdupq_n_f32(0), vResult); + return vbslq_f32(vcombine_u32(VEqualsInf, VEqualsInf), g_XMQNaN, vResult); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vLengthSq = _mm_dp_ps(V, V, 0x7f); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Create zero with a single instruction + XMVECTOR vZeroMask = _mm_setzero_ps(); + // Test for a divide by zero (Must be FP to detect -0.0) + vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult); + // Failsafe on zero (Or epsilon) length planes + // If the length is 
infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity); + // Divide to perform the normalization + vResult = _mm_div_ps(V, vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult, vZeroMask); + // Select qnan or result based on infinite length + XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq, g_XMQNaN); + XMVECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq); + vResult = _mm_or_ps(vTemp1, vTemp2); + return vResult; +#elif defined(_XM_SSE3_INTRINSICS_) + // Perform the dot product on x,y and z only + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + vLengthSq = _mm_and_ps(vLengthSq, g_XMMask3); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Create zero with a single instruction + XMVECTOR vZeroMask = _mm_setzero_ps(); + // Test for a divide by zero (Must be FP to detect -0.0) + vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity); + // Divide to perform the normalization + vResult = _mm_div_ps(V, vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult, vZeroMask); + // Select qnan or result based on infinite length + XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq, g_XMQNaN); + XMVECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq); + vResult = _mm_or_ps(vTemp1, vTemp2); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y and z only + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(2, 1, 2, 1)); + vLengthSq = _mm_add_ss(vLengthSq, vTemp); + vTemp = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(1, 1, 1, 1)); + vLengthSq = _mm_add_ss(vLengthSq, vTemp); + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Create zero with a single instruction + XMVECTOR vZeroMask = _mm_setzero_ps(); + // Test for a divide by zero (Must be FP to detect -0.0) + vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity); + // Divide to perform the normalization + vResult = _mm_div_ps(V, vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult, vZeroMask); + // Select qnan or result based on infinite length + XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq, g_XMQNaN); + XMVECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq); + vResult = _mm_or_ps(vTemp1, vTemp2); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3ClampLength +( + FXMVECTOR V, + float LengthMin, + float LengthMax +) noexcept +{ + XMVECTOR ClampMax = XMVectorReplicate(LengthMax); + XMVECTOR ClampMin = XMVectorReplicate(LengthMin); + + return XMVector3ClampLengthV(V, ClampMin, ClampMax); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3ClampLengthV +( + FXMVECTOR V, + FXMVECTOR LengthMin, + FXMVECTOR LengthMax +) noexcept +{ + assert((XMVectorGetY(LengthMin) == XMVectorGetX(LengthMin)) && (XMVectorGetZ(LengthMin) == XMVectorGetX(LengthMin))); + assert((XMVectorGetY(LengthMax) == XMVectorGetX(LengthMax)) && 
(XMVectorGetZ(LengthMax) == XMVectorGetX(LengthMax))); + assert(XMVector3GreaterOrEqual(LengthMin, XMVectorZero())); + assert(XMVector3GreaterOrEqual(LengthMax, XMVectorZero())); + assert(XMVector3GreaterOrEqual(LengthMax, LengthMin)); + + XMVECTOR LengthSq = XMVector3LengthSq(V); + + const XMVECTOR Zero = XMVectorZero(); + + XMVECTOR RcpLength = XMVectorReciprocalSqrt(LengthSq); + + XMVECTOR InfiniteLength = XMVectorEqualInt(LengthSq, g_XMInfinity.v); + XMVECTOR ZeroLength = XMVectorEqual(LengthSq, Zero); + + XMVECTOR Normal = XMVectorMultiply(V, RcpLength); + + XMVECTOR Length = XMVectorMultiply(LengthSq, RcpLength); + + XMVECTOR Select = XMVectorEqualInt(InfiniteLength, ZeroLength); + Length = XMVectorSelect(LengthSq, Length, Select); + Normal = XMVectorSelect(LengthSq, Normal, Select); + + XMVECTOR ControlMax = XMVectorGreater(Length, LengthMax); + XMVECTOR ControlMin = XMVectorLess(Length, LengthMin); + + XMVECTOR ClampLength = XMVectorSelect(Length, LengthMax, ControlMax); + ClampLength = XMVectorSelect(ClampLength, LengthMin, ControlMin); + + XMVECTOR Result = XMVectorMultiply(Normal, ClampLength); + + // Preserve the original vector (with no precision loss) if the length falls within the given range + XMVECTOR Control = XMVectorEqualInt(ControlMax, ControlMin); + Result = XMVectorSelect(Result, V, Control); + + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3Reflect +( + FXMVECTOR Incident, + FXMVECTOR Normal +) noexcept +{ + // Result = Incident - (2 * dot(Incident, Normal)) * Normal + + XMVECTOR Result = XMVector3Dot(Incident, Normal); + Result = XMVectorAdd(Result, Result); + Result = XMVectorNegativeMultiplySubtract(Result, Normal, Incident); + + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3Refract +( + FXMVECTOR Incident, + FXMVECTOR Normal, + float RefractionIndex +) noexcept +{ + XMVECTOR Index = XMVectorReplicate(RefractionIndex); + return XMVector3RefractV(Incident, Normal, Index); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3RefractV +( + FXMVECTOR Incident, + FXMVECTOR Normal, + FXMVECTOR RefractionIndex +) noexcept +{ + // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) + + // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal)))) + +#if defined(_XM_NO_INTRINSICS_) + + const XMVECTOR Zero = XMVectorZero(); + + XMVECTOR IDotN = XMVector3Dot(Incident, Normal); + + // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) + XMVECTOR R = XMVectorNegativeMultiplySubtract(IDotN, IDotN, g_XMOne.v); + R = XMVectorMultiply(R, RefractionIndex); + R = XMVectorNegativeMultiplySubtract(R, RefractionIndex, g_XMOne.v); + + if (XMVector4LessOrEqual(R, Zero)) + { + // Total internal reflection + return Zero; + } + else + { + // R = RefractionIndex * IDotN + sqrt(R) + R = XMVectorSqrt(R); + R = XMVectorMultiplyAdd(RefractionIndex, IDotN, R); + + // Result = RefractionIndex * Incident - Normal * R + XMVECTOR Result = XMVectorMultiply(RefractionIndex, Incident); + Result = XMVectorNegativeMultiplySubtract(Normal, R, Result); + + return Result; + } + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR IDotN = XMVector3Dot(Incident, Normal); + + // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) + 
float32x4_t R = vmlsq_f32(g_XMOne, IDotN, IDotN); + R = vmulq_f32(R, RefractionIndex); + R = vmlsq_f32(g_XMOne, R, RefractionIndex); + + uint32x4_t isrzero = vcleq_f32(R, g_XMZero); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(isrzero)), vget_high_u8(vreinterpretq_u8_u32(isrzero))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + + float32x4_t vResult; + if (vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) == 0xFFFFFFFFU) + { + // Total internal reflection + vResult = g_XMZero; + } + else + { + // Sqrt(R) + float32x4_t S0 = vrsqrteq_f32(R); + float32x4_t P0 = vmulq_f32(R, S0); + float32x4_t R0 = vrsqrtsq_f32(P0, S0); + float32x4_t S1 = vmulq_f32(S0, R0); + float32x4_t P1 = vmulq_f32(R, S1); + float32x4_t R1 = vrsqrtsq_f32(P1, S1); + float32x4_t S2 = vmulq_f32(S1, R1); + R = vmulq_f32(R, S2); + // R = RefractionIndex * IDotN + sqrt(R) + R = vmlaq_f32(R, RefractionIndex, IDotN); + // Result = RefractionIndex * Incident - Normal * R + vResult = vmulq_f32(RefractionIndex, Incident); + vResult = vmlsq_f32(vResult, R, Normal); + } + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) + + // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal)))) + XMVECTOR IDotN = XMVector3Dot(Incident, Normal); + // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) + XMVECTOR R = XM_FNMADD_PS(IDotN, IDotN, g_XMOne); + XMVECTOR R2 = _mm_mul_ps(RefractionIndex, RefractionIndex); + R = XM_FNMADD_PS(R, R2, g_XMOne); + + XMVECTOR vResult = _mm_cmple_ps(R, g_XMZero); + if (_mm_movemask_ps(vResult) == 0x0f) + { + // Total internal reflection + vResult = g_XMZero; + } + else + { + // R = RefractionIndex * IDotN + sqrt(R) + R = _mm_sqrt_ps(R); + R = XM_FMADD_PS(RefractionIndex, IDotN, R); + // Result = RefractionIndex * Incident - Normal * R + vResult = _mm_mul_ps(RefractionIndex, Incident); + vResult = XM_FNMADD_PS(R, Normal, vResult); + } + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3Orthogonal(FXMVECTOR V) noexcept +{ + XMVECTOR Zero = XMVectorZero(); + XMVECTOR Z = XMVectorSplatZ(V); + XMVECTOR YZYY = XMVectorSwizzle(V); + + XMVECTOR NegativeV = XMVectorSubtract(Zero, V); + + XMVECTOR ZIsNegative = XMVectorLess(Z, Zero); + XMVECTOR YZYYIsNegative = XMVectorLess(YZYY, Zero); + + XMVECTOR S = XMVectorAdd(YZYY, Z); + XMVECTOR D = XMVectorSubtract(YZYY, Z); + + XMVECTOR Select = XMVectorEqualInt(ZIsNegative, YZYYIsNegative); + + XMVECTOR R0 = XMVectorPermute(NegativeV, S); + XMVECTOR R1 = XMVectorPermute(V, D); + + return XMVectorSelect(R1, R0, Select); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3AngleBetweenNormalsEst +( + FXMVECTOR N1, + FXMVECTOR N2 +) noexcept +{ + XMVECTOR Result = XMVector3Dot(N1, N2); + Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v); + Result = XMVectorACosEst(Result); + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3AngleBetweenNormals +( + FXMVECTOR N1, + FXMVECTOR N2 +) noexcept +{ + XMVECTOR Result = XMVector3Dot(N1, N2); + Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v); + Result = XMVectorACos(Result); + return Result; +} + 
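// Illustrative usage (editorial sketch, not part of the upstream header): the
// angle helpers above return the angle in radians splatted across all lanes, so
// a caller typically extracts a single component. Variable names here are
// hypothetical.
//
//   XMVECTOR n0 = XMVector3Normalize(XMVectorSet(1.0f, 0.0f, 0.0f, 0.0f));
//   XMVECTOR n1 = XMVector3Normalize(XMVectorSet(0.0f, 1.0f, 0.0f, 0.0f));
//   float radians = XMVectorGetX(XMVector3AngleBetweenNormals(n0, n1));
//   // radians is approximately XM_PIDIV2 for perpendicular unit vectors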
+//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3AngleBetweenVectors +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ + XMVECTOR L1 = XMVector3ReciprocalLength(V1); + XMVECTOR L2 = XMVector3ReciprocalLength(V2); + + XMVECTOR Dot = XMVector3Dot(V1, V2); + + L1 = XMVectorMultiply(L1, L2); + + XMVECTOR CosAngle = XMVectorMultiply(Dot, L1); + CosAngle = XMVectorClamp(CosAngle, g_XMNegativeOne.v, g_XMOne.v); + + return XMVectorACos(CosAngle); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3LinePointDistance +( + FXMVECTOR LinePoint1, + FXMVECTOR LinePoint2, + FXMVECTOR Point +) noexcept +{ + // Given a vector PointVector from LinePoint1 to Point and a vector + // LineVector from LinePoint1 to LinePoint2, the scaled distance + // PointProjectionScale from LinePoint1 to the perpendicular projection + // of PointVector onto the line is defined as: + // + // PointProjectionScale = dot(PointVector, LineVector) / LengthSq(LineVector) + + XMVECTOR PointVector = XMVectorSubtract(Point, LinePoint1); + XMVECTOR LineVector = XMVectorSubtract(LinePoint2, LinePoint1); + + XMVECTOR LengthSq = XMVector3LengthSq(LineVector); + + XMVECTOR PointProjectionScale = XMVector3Dot(PointVector, LineVector); + PointProjectionScale = XMVectorDivide(PointProjectionScale, LengthSq); + + XMVECTOR DistanceVector = XMVectorMultiply(LineVector, PointProjectionScale); + DistanceVector = XMVectorSubtract(PointVector, DistanceVector); + + return XMVector3Length(DistanceVector); +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline void XM_CALLCONV XMVector3ComponentsFromNormal +( + XMVECTOR* pParallel, + XMVECTOR* pPerpendicular, + FXMVECTOR V, + FXMVECTOR Normal +) noexcept +{ + assert(pParallel != nullptr); + assert(pPerpendicular != nullptr); + + XMVECTOR Scale = XMVector3Dot(V, Normal); + + XMVECTOR Parallel = XMVectorMultiply(Normal, Scale); + + *pParallel = Parallel; + *pPerpendicular = XMVectorSubtract(V, Parallel); +} + +//------------------------------------------------------------------------------ +// Transform a vector using a rotation expressed as a unit quaternion + +inline XMVECTOR XM_CALLCONV XMVector3Rotate +( + FXMVECTOR V, + FXMVECTOR RotationQuaternion +) noexcept +{ + XMVECTOR A = XMVectorSelect(g_XMSelect1110.v, V, g_XMSelect1110.v); + XMVECTOR Q = XMQuaternionConjugate(RotationQuaternion); + XMVECTOR Result = XMQuaternionMultiply(Q, A); + return XMQuaternionMultiply(Result, RotationQuaternion); +} + +//------------------------------------------------------------------------------ +// Transform a vector using the inverse of a rotation expressed as a unit quaternion + +inline XMVECTOR XM_CALLCONV XMVector3InverseRotate +( + FXMVECTOR V, + FXMVECTOR RotationQuaternion +) noexcept +{ + XMVECTOR A = XMVectorSelect(g_XMSelect1110.v, V, g_XMSelect1110.v); + XMVECTOR Result = XMQuaternionMultiply(RotationQuaternion, A); + XMVECTOR Q = XMQuaternionConjugate(RotationQuaternion); + return XMQuaternionMultiply(Result, Q); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3Transform +( + FXMVECTOR V, + FXMMATRIX M +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Z = XMVectorSplatZ(V); + XMVECTOR Y = XMVectorSplatY(V); + XMVECTOR X = XMVectorSplatX(V); + + XMVECTOR Result = XMVectorMultiplyAdd(Z, M.r[2], M.r[3]); + 
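// The accumulation below treats the input as (x, y, z, 1): M.r[3] is folded in
// with the Z term, and the vector's own w component never contributes, which is
// the documented contract of XMVector3Transform (transform a point with w
// assumed to be 1).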
Result = XMVectorMultiplyAdd(Y, M.r[1], Result); + Result = XMVectorMultiplyAdd(X, M.r[0], Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); + XMVECTOR vResult = vmlaq_lane_f32(M.r[3], M.r[0], VL, 0); // X + vResult = vmlaq_lane_f32(vResult, M.r[1], VL, 1); // Y + return vmlaq_lane_f32(vResult, M.r[2], vget_high_f32(V), 0); // Z +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); // Z + vResult = XM_FMADD_PS(vResult, M.r[2], M.r[3]); + XMVECTOR vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); // Y + vResult = XM_FMADD_PS(vTemp, M.r[1], vResult); + vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); // X + vResult = XM_FMADD_PS(vTemp, M.r[0], vResult); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +#ifdef _PREFAST_ +#pragma prefast(push) +#pragma prefast(disable : 26015 26019, "PREfast noise: Esp:1307" ) +#endif + +_Use_decl_annotations_ +inline XMFLOAT4* XM_CALLCONV XMVector3TransformStream +( + XMFLOAT4* pOutputStream, + size_t OutputStride, + const XMFLOAT3* pInputStream, + size_t InputStride, + size_t VectorCount, + FXMMATRIX M +) noexcept +{ + assert(pOutputStream != nullptr); + assert(pInputStream != nullptr); + + assert(InputStride >= sizeof(XMFLOAT3)); + _Analysis_assume_(InputStride >= sizeof(XMFLOAT3)); + + assert(OutputStride >= sizeof(XMFLOAT4)); + _Analysis_assume_(OutputStride >= sizeof(XMFLOAT4)); + +#if defined(_XM_NO_INTRINSICS_) + + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row2 = M.r[2]; + const XMVECTOR row3 = M.r[3]; + + for (size_t i = 0; i < VectorCount; i++) + { + XMVECTOR V = XMLoadFloat3(reinterpret_cast(pInputVector)); + XMVECTOR Z = XMVectorSplatZ(V); + XMVECTOR Y = XMVectorSplatY(V); + XMVECTOR X = XMVectorSplatX(V); + + XMVECTOR Result = XMVectorMultiplyAdd(Z, row2, row3); + Result = XMVectorMultiplyAdd(Y, row1, Result); + Result = XMVectorMultiplyAdd(X, row0, Result); + + XMStoreFloat4(reinterpret_cast(pOutputVector), Result); + + pInputVector += InputStride; + pOutputVector += OutputStride; + } + + return pOutputStream; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row2 = M.r[2]; + const XMVECTOR row3 = M.r[3]; + + size_t i = 0; + size_t four = VectorCount >> 2; + if (four > 0) + { + if ((InputStride == sizeof(XMFLOAT3)) && (OutputStride == sizeof(XMFLOAT4))) + { + for (size_t j = 0; j < four; ++j) + { + float32x4x3_t V = vld3q_f32(reinterpret_cast(pInputVector)); + pInputVector += sizeof(XMFLOAT3) * 4; + + float32x2_t r3 = vget_low_f32(row3); + float32x2_t r = vget_low_f32(row0); + XMVECTOR vResult0 = vmlaq_lane_f32(vdupq_lane_f32(r3, 0), V.val[0], r, 0); // Ax+M + XMVECTOR vResult1 = vmlaq_lane_f32(vdupq_lane_f32(r3, 1), V.val[0], r, 1); // Bx+N + + XM_PREFETCH(pInputVector); + + r3 = vget_high_f32(row3); + r = vget_high_f32(row0); + XMVECTOR vResult2 = vmlaq_lane_f32(vdupq_lane_f32(r3, 0), V.val[0], r, 0); // Cx+O + XMVECTOR vResult3 = vmlaq_lane_f32(vdupq_lane_f32(r3, 1), V.val[0], r, 1); // Dx+P + + XM_PREFETCH(pInputVector + XM_CACHE_LINE_SIZE); + + r = vget_low_f32(row1); + vResult0 = vmlaq_lane_f32(vResult0, V.val[1], r, 0); // Ax+Ey+M + vResult1 
= vmlaq_lane_f32(vResult1, V.val[1], r, 1); // Bx+Fy+N + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 2)); + + r = vget_high_f32(row1); + vResult2 = vmlaq_lane_f32(vResult2, V.val[1], r, 0); // Cx+Gy+O + vResult3 = vmlaq_lane_f32(vResult3, V.val[1], r, 1); // Dx+Hy+P + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 3)); + + r = vget_low_f32(row2); + vResult0 = vmlaq_lane_f32(vResult0, V.val[2], r, 0); // Ax+Ey+Iz+M + vResult1 = vmlaq_lane_f32(vResult1, V.val[2], r, 1); // Bx+Fy+Jz+N + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 4)); + + r = vget_high_f32(row2); + vResult2 = vmlaq_lane_f32(vResult2, V.val[2], r, 0); // Cx+Gy+Kz+O + vResult3 = vmlaq_lane_f32(vResult3, V.val[2], r, 1); // Dx+Hy+Lz+P + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 5)); + + float32x4x4_t R; + R.val[0] = vResult0; + R.val[1] = vResult1; + R.val[2] = vResult2; + R.val[3] = vResult3; + + vst4q_f32(reinterpret_cast(pOutputVector), R); + pOutputVector += sizeof(XMFLOAT4) * 4; + + i += 4; + } + } + } + + for (; i < VectorCount; i++) + { + float32x2_t VL = vld1_f32(reinterpret_cast(pInputVector)); + float32x2_t zero = vdup_n_f32(0); + float32x2_t VH = vld1_lane_f32(reinterpret_cast(pInputVector) + 2, zero, 0); + pInputVector += InputStride; + + XMVECTOR vResult = vmlaq_lane_f32(row3, row0, VL, 0); // X + vResult = vmlaq_lane_f32(vResult, row1, VL, 1); // Y + vResult = vmlaq_lane_f32(vResult, row2, VH, 0); // Z + + vst1q_f32(reinterpret_cast(pOutputVector), vResult); + pOutputVector += OutputStride; + } + + return pOutputStream; +#elif defined(_XM_SSE_INTRINSICS_) + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row2 = M.r[2]; + const XMVECTOR row3 = M.r[3]; + + size_t i = 0; + size_t four = VectorCount >> 2; + if (four > 0) + { + if (InputStride == sizeof(XMFLOAT3)) + { + if (!(reinterpret_cast(pOutputStream) & 0xF) && !(OutputStride & 0xF)) + { + // Packed input, aligned output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps(reinterpret_cast(pInputVector)); + __m128 L2 = _mm_loadu_ps(reinterpret_cast(pInputVector + 16)); + __m128 L3 = _mm_loadu_ps(reinterpret_cast(pInputVector + 32)); + pInputVector += sizeof(XMFLOAT3) * 4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1, L2, L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Z, row2, row3); + XMVECTOR vTemp2 = _mm_mul_ps(Y, row1); + XMVECTOR vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + XM_STREAM_PS(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 2 + Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, row2, row3); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + XM_STREAM_PS(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 3 + Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, row2, row3); + vTemp2 = _mm_mul_ps(Y, 
row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + XM_STREAM_PS(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 4 + Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, row2, row3); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + XM_STREAM_PS(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + i += 4; + } + } + else + { + // Packed input, unaligned output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps(reinterpret_cast(pInputVector)); + __m128 L2 = _mm_loadu_ps(reinterpret_cast(pInputVector + 16)); + __m128 L3 = _mm_loadu_ps(reinterpret_cast(pInputVector + 32)); + pInputVector += sizeof(XMFLOAT3) * 4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1, L2, L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Z, row2, row3); + XMVECTOR vTemp2 = _mm_mul_ps(Y, row1); + XMVECTOR vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + _mm_storeu_ps(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 2 + Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, row2, row3); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + _mm_storeu_ps(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 3 + Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, row2, row3); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + _mm_storeu_ps(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 4 + Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, row2, row3); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + _mm_storeu_ps(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + i += 4; + } + } + } + } + + if (!(reinterpret_cast(pOutputStream) & 0xF) && !(OutputStride & 0xF)) + { + // Aligned output + for (; i < VectorCount; ++i) + { + XMVECTOR V = XMLoadFloat3(reinterpret_cast(pInputVector)); + pInputVector += InputStride; + + XMVECTOR Z = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Z, row2, row3); + XMVECTOR vTemp2 = _mm_mul_ps(Y, row1); + XMVECTOR vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + XM_STREAM_PS(reinterpret_cast(pOutputVector), vTemp); + pOutputVector 
+= OutputStride; + } + } + else + { + // Unaligned output + for (; i < VectorCount; ++i) + { + XMVECTOR V = XMLoadFloat3(reinterpret_cast(pInputVector)); + pInputVector += InputStride; + + XMVECTOR Z = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Z, row2, row3); + XMVECTOR vTemp2 = _mm_mul_ps(Y, row1); + XMVECTOR vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + _mm_storeu_ps(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + } + } + + XM_SFENCE(); + + return pOutputStream; +#endif +} + +#ifdef _PREFAST_ +#pragma prefast(pop) +#endif + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3TransformCoord +( + FXMVECTOR V, + FXMMATRIX M +) noexcept +{ + XMVECTOR Z = XMVectorSplatZ(V); + XMVECTOR Y = XMVectorSplatY(V); + XMVECTOR X = XMVectorSplatX(V); + + XMVECTOR Result = XMVectorMultiplyAdd(Z, M.r[2], M.r[3]); + Result = XMVectorMultiplyAdd(Y, M.r[1], Result); + Result = XMVectorMultiplyAdd(X, M.r[0], Result); + + XMVECTOR W = XMVectorSplatW(Result); + return XMVectorDivide(Result, W); +} + +//------------------------------------------------------------------------------ + +#ifdef _PREFAST_ +#pragma prefast(push) +#pragma prefast(disable : 26015 26019, "PREfast noise: Esp:1307" ) +#endif + +_Use_decl_annotations_ +inline XMFLOAT3* XM_CALLCONV XMVector3TransformCoordStream +( + XMFLOAT3* pOutputStream, + size_t OutputStride, + const XMFLOAT3* pInputStream, + size_t InputStride, + size_t VectorCount, + FXMMATRIX M +) noexcept +{ + assert(pOutputStream != nullptr); + assert(pInputStream != nullptr); + + assert(InputStride >= sizeof(XMFLOAT3)); + _Analysis_assume_(InputStride >= sizeof(XMFLOAT3)); + + assert(OutputStride >= sizeof(XMFLOAT3)); + _Analysis_assume_(OutputStride >= sizeof(XMFLOAT3)); + +#if defined(_XM_NO_INTRINSICS_) + + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row2 = M.r[2]; + const XMVECTOR row3 = M.r[3]; + + for (size_t i = 0; i < VectorCount; i++) + { + XMVECTOR V = XMLoadFloat3(reinterpret_cast(pInputVector)); + XMVECTOR Z = XMVectorSplatZ(V); + XMVECTOR Y = XMVectorSplatY(V); + XMVECTOR X = XMVectorSplatX(V); + + XMVECTOR Result = XMVectorMultiplyAdd(Z, row2, row3); + Result = XMVectorMultiplyAdd(Y, row1, Result); + Result = XMVectorMultiplyAdd(X, row0, Result); + + XMVECTOR W = XMVectorSplatW(Result); + + Result = XMVectorDivide(Result, W); + + XMStoreFloat3(reinterpret_cast(pOutputVector), Result); + + pInputVector += InputStride; + pOutputVector += OutputStride; + } + + return pOutputStream; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row2 = M.r[2]; + const XMVECTOR row3 = M.r[3]; + + size_t i = 0; + size_t four = VectorCount >> 2; + if (four > 0) + { + if ((InputStride == sizeof(XMFLOAT3)) && (OutputStride == sizeof(XMFLOAT3))) + { + for (size_t j = 0; j < four; ++j) + { + float32x4x3_t V = vld3q_f32(reinterpret_cast(pInputVector)); + pInputVector += sizeof(XMFLOAT3) * 4; + + float32x2_t r3 = vget_low_f32(row3); + float32x2_t r = 
vget_low_f32(row0); + XMVECTOR vResult0 = vmlaq_lane_f32(vdupq_lane_f32(r3, 0), V.val[0], r, 0); // Ax+M + XMVECTOR vResult1 = vmlaq_lane_f32(vdupq_lane_f32(r3, 1), V.val[0], r, 1); // Bx+N + + XM_PREFETCH(pInputVector); + + r3 = vget_high_f32(row3); + r = vget_high_f32(row0); + XMVECTOR vResult2 = vmlaq_lane_f32(vdupq_lane_f32(r3, 0), V.val[0], r, 0); // Cx+O + XMVECTOR W = vmlaq_lane_f32(vdupq_lane_f32(r3, 1), V.val[0], r, 1); // Dx+P + + XM_PREFETCH(pInputVector + XM_CACHE_LINE_SIZE); + + r = vget_low_f32(row1); + vResult0 = vmlaq_lane_f32(vResult0, V.val[1], r, 0); // Ax+Ey+M + vResult1 = vmlaq_lane_f32(vResult1, V.val[1], r, 1); // Bx+Fy+N + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 2)); + + r = vget_high_f32(row1); + vResult2 = vmlaq_lane_f32(vResult2, V.val[1], r, 0); // Cx+Gy+O + W = vmlaq_lane_f32(W, V.val[1], r, 1); // Dx+Hy+P + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 3)); + + r = vget_low_f32(row2); + vResult0 = vmlaq_lane_f32(vResult0, V.val[2], r, 0); // Ax+Ey+Iz+M + vResult1 = vmlaq_lane_f32(vResult1, V.val[2], r, 1); // Bx+Fy+Jz+N + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 4)); + + r = vget_high_f32(row2); + vResult2 = vmlaq_lane_f32(vResult2, V.val[2], r, 0); // Cx+Gy+Kz+O + W = vmlaq_lane_f32(W, V.val[2], r, 1); // Dx+Hy+Lz+P + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 5)); + +#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__ + V.val[0] = vdivq_f32(vResult0, W); + V.val[1] = vdivq_f32(vResult1, W); + V.val[2] = vdivq_f32(vResult2, W); +#else + // 2 iterations of Newton-Raphson refinement of reciprocal + float32x4_t Reciprocal = vrecpeq_f32(W); + float32x4_t S = vrecpsq_f32(Reciprocal, W); + Reciprocal = vmulq_f32(S, Reciprocal); + S = vrecpsq_f32(Reciprocal, W); + Reciprocal = vmulq_f32(S, Reciprocal); + + V.val[0] = vmulq_f32(vResult0, Reciprocal); + V.val[1] = vmulq_f32(vResult1, Reciprocal); + V.val[2] = vmulq_f32(vResult2, Reciprocal); +#endif + + vst3q_f32(reinterpret_cast(pOutputVector), V); + pOutputVector += sizeof(XMFLOAT3) * 4; + + i += 4; + } + } + } + + for (; i < VectorCount; i++) + { + float32x2_t VL = vld1_f32(reinterpret_cast(pInputVector)); + float32x2_t zero = vdup_n_f32(0); + float32x2_t VH = vld1_lane_f32(reinterpret_cast(pInputVector) + 2, zero, 0); + pInputVector += InputStride; + + XMVECTOR vResult = vmlaq_lane_f32(row3, row0, VL, 0); // X + vResult = vmlaq_lane_f32(vResult, row1, VL, 1); // Y + vResult = vmlaq_lane_f32(vResult, row2, VH, 0); // Z + + VH = vget_high_f32(vResult); + XMVECTOR W = vdupq_lane_f32(VH, 1); + +#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__ + vResult = vdivq_f32(vResult, W); +#else + // 2 iterations of Newton-Raphson refinement of reciprocal for W + float32x4_t Reciprocal = vrecpeq_f32(W); + float32x4_t S = vrecpsq_f32(Reciprocal, W); + Reciprocal = vmulq_f32(S, Reciprocal); + S = vrecpsq_f32(Reciprocal, W); + Reciprocal = vmulq_f32(S, Reciprocal); + + vResult = vmulq_f32(vResult, Reciprocal); +#endif + + VL = vget_low_f32(vResult); + vst1_f32(reinterpret_cast(pOutputVector), VL); + vst1q_lane_f32(reinterpret_cast(pOutputVector) + 2, vResult, 2); + pOutputVector += OutputStride; + } + + return pOutputStream; +#elif defined(_XM_SSE_INTRINSICS_) + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row2 = M.r[2]; + const XMVECTOR row3 = M.r[3]; + + size_t i = 0; + 
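// Strategy note for the SSE path below: when the input stream is tightly packed
// (InputStride == sizeof(XMFLOAT3)), vectors are processed four at a time via
// XM3UNPACK3INTO4, with specialized branches for packed and aligned outputs; any
// remainder, and any non-packed stride, falls through to the scalar per-vector
// loop at the end, which finishes with the homogeneous divide by w.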
size_t four = VectorCount >> 2; + if (four > 0) + { + if (InputStride == sizeof(XMFLOAT3)) + { + if (OutputStride == sizeof(XMFLOAT3)) + { + if (!(reinterpret_cast(pOutputStream) & 0xF)) + { + // Packed input, aligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps(reinterpret_cast(pInputVector)); + __m128 L2 = _mm_loadu_ps(reinterpret_cast(pInputVector + 16)); + __m128 L3 = _mm_loadu_ps(reinterpret_cast(pInputVector + 32)); + pInputVector += sizeof(XMFLOAT3) * 4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1, L2, L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Z, row2, row3); + XMVECTOR vTemp2 = _mm_mul_ps(Y, row1); + XMVECTOR vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + + V1 = _mm_div_ps(vTemp, W); + + // Result 2 + Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, row2, row3); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + + V2 = _mm_div_ps(vTemp, W); + + // Result 3 + Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, row2, row3); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + + V3 = _mm_div_ps(vTemp, W); + + // Result 4 + Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, row2, row3); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + + V4 = _mm_div_ps(vTemp, W); + + // Pack and store the vectors + XM3PACK4INTO3(vTemp); + XM_STREAM_PS(reinterpret_cast(pOutputVector), V1); + XM_STREAM_PS(reinterpret_cast(pOutputVector + 16), vTemp); + XM_STREAM_PS(reinterpret_cast(pOutputVector + 32), V3); + pOutputVector += sizeof(XMFLOAT3) * 4; + i += 4; + } + } + else + { + // Packed input, unaligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps(reinterpret_cast(pInputVector)); + __m128 L2 = _mm_loadu_ps(reinterpret_cast(pInputVector + 16)); + __m128 L3 = _mm_loadu_ps(reinterpret_cast(pInputVector + 32)); + pInputVector += sizeof(XMFLOAT3) * 4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1, L2, L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Z, row2, row3); + XMVECTOR vTemp2 = _mm_mul_ps(Y, row1); + XMVECTOR vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + + V1 = 
_mm_div_ps(vTemp, W); + + // Result 2 + Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, row2, row3); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + + V2 = _mm_div_ps(vTemp, W); + + // Result 3 + Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, row2, row3); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + + V3 = _mm_div_ps(vTemp, W); + + // Result 4 + Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, row2, row3); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + + V4 = _mm_div_ps(vTemp, W); + + // Pack and store the vectors + XM3PACK4INTO3(vTemp); + _mm_storeu_ps(reinterpret_cast(pOutputVector), V1); + _mm_storeu_ps(reinterpret_cast(pOutputVector + 16), vTemp); + _mm_storeu_ps(reinterpret_cast(pOutputVector + 32), V3); + pOutputVector += sizeof(XMFLOAT3) * 4; + i += 4; + } + } + } + else + { + // Packed input, unpacked output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps(reinterpret_cast(pInputVector)); + __m128 L2 = _mm_loadu_ps(reinterpret_cast(pInputVector + 16)); + __m128 L3 = _mm_loadu_ps(reinterpret_cast(pInputVector + 32)); + pInputVector += sizeof(XMFLOAT3) * 4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1, L2, L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Z, row2, row3); + XMVECTOR vTemp2 = _mm_mul_ps(Y, row1); + XMVECTOR vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + + vTemp = _mm_div_ps(vTemp, W); + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 2 + Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, row2, row3); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + + vTemp = _mm_div_ps(vTemp, W); + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 3 + Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, row2, row3); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + + vTemp = _mm_div_ps(vTemp, W); + 
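// Dividing by the splatted w lane performs the homogeneous (perspective)
// divide, mirroring what XMVector3TransformCoord does for a single vector
// before the result is stored as an XMFLOAT3.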
XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 4 + Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, row2, row3); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + + vTemp = _mm_div_ps(vTemp, W); + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + i += 4; + } + } + } + } + + for (; i < VectorCount; i++) + { + XMVECTOR V = XMLoadFloat3(reinterpret_cast(pInputVector)); + pInputVector += InputStride; + + XMVECTOR Z = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Z, row2, row3); + XMVECTOR vTemp2 = _mm_mul_ps(Y, row1); + XMVECTOR vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + + vTemp = _mm_div_ps(vTemp, W); + + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + } + + XM_SFENCE(); + + return pOutputStream; +#endif +} + +#ifdef _PREFAST_ +#pragma prefast(pop) +#endif + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3TransformNormal +( + FXMVECTOR V, + FXMMATRIX M +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Z = XMVectorSplatZ(V); + XMVECTOR Y = XMVectorSplatY(V); + XMVECTOR X = XMVectorSplatX(V); + + XMVECTOR Result = XMVectorMultiply(Z, M.r[2]); + Result = XMVectorMultiplyAdd(Y, M.r[1], Result); + Result = XMVectorMultiplyAdd(X, M.r[0], Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); + XMVECTOR vResult = vmulq_lane_f32(M.r[0], VL, 0); // X + vResult = vmlaq_lane_f32(vResult, M.r[1], VL, 1); // Y + return vmlaq_lane_f32(vResult, M.r[2], vget_high_f32(V), 0); // Z +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); // Z + vResult = _mm_mul_ps(vResult, M.r[2]); + XMVECTOR vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); // Y + vResult = XM_FMADD_PS(vTemp, M.r[1], vResult); + vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); // X + vResult = XM_FMADD_PS(vTemp, M.r[0], vResult); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +#ifdef _PREFAST_ +#pragma prefast(push) +#pragma prefast(disable : 26015 26019, "PREfast noise: Esp:1307" ) +#endif + +_Use_decl_annotations_ +inline XMFLOAT3* XM_CALLCONV XMVector3TransformNormalStream +( + XMFLOAT3* pOutputStream, + size_t OutputStride, + const XMFLOAT3* pInputStream, + size_t InputStride, + size_t VectorCount, + FXMMATRIX M +) noexcept +{ + assert(pOutputStream != nullptr); + assert(pInputStream != nullptr); + + assert(InputStride >= sizeof(XMFLOAT3)); + _Analysis_assume_(InputStride >= sizeof(XMFLOAT3)); + + assert(OutputStride >= sizeof(XMFLOAT3)); + _Analysis_assume_(OutputStride >= sizeof(XMFLOAT3)); + +#if defined(_XM_NO_INTRINSICS_) + + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR 
row2 = M.r[2]; + + for (size_t i = 0; i < VectorCount; i++) + { + XMVECTOR V = XMLoadFloat3(reinterpret_cast(pInputVector)); + XMVECTOR Z = XMVectorSplatZ(V); + XMVECTOR Y = XMVectorSplatY(V); + XMVECTOR X = XMVectorSplatX(V); + + XMVECTOR Result = XMVectorMultiply(Z, row2); + Result = XMVectorMultiplyAdd(Y, row1, Result); + Result = XMVectorMultiplyAdd(X, row0, Result); + + XMStoreFloat3(reinterpret_cast(pOutputVector), Result); + + pInputVector += InputStride; + pOutputVector += OutputStride; + } + + return pOutputStream; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row2 = M.r[2]; + + size_t i = 0; + size_t four = VectorCount >> 2; + if (four > 0) + { + if ((InputStride == sizeof(XMFLOAT3)) && (OutputStride == sizeof(XMFLOAT3))) + { + for (size_t j = 0; j < four; ++j) + { + float32x4x3_t V = vld3q_f32(reinterpret_cast(pInputVector)); + pInputVector += sizeof(XMFLOAT3) * 4; + + float32x2_t r = vget_low_f32(row0); + XMVECTOR vResult0 = vmulq_lane_f32(V.val[0], r, 0); // Ax + XMVECTOR vResult1 = vmulq_lane_f32(V.val[0], r, 1); // Bx + + XM_PREFETCH(pInputVector); + + r = vget_high_f32(row0); + XMVECTOR vResult2 = vmulq_lane_f32(V.val[0], r, 0); // Cx + + XM_PREFETCH(pInputVector + XM_CACHE_LINE_SIZE); + + r = vget_low_f32(row1); + vResult0 = vmlaq_lane_f32(vResult0, V.val[1], r, 0); // Ax+Ey + vResult1 = vmlaq_lane_f32(vResult1, V.val[1], r, 1); // Bx+Fy + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 2)); + + r = vget_high_f32(row1); + vResult2 = vmlaq_lane_f32(vResult2, V.val[1], r, 0); // Cx+Gy + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 3)); + + r = vget_low_f32(row2); + vResult0 = vmlaq_lane_f32(vResult0, V.val[2], r, 0); // Ax+Ey+Iz + vResult1 = vmlaq_lane_f32(vResult1, V.val[2], r, 1); // Bx+Fy+Jz + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 4)); + + r = vget_high_f32(row2); + vResult2 = vmlaq_lane_f32(vResult2, V.val[2], r, 0); // Cx+Gy+Kz + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 5)); + + V.val[0] = vResult0; + V.val[1] = vResult1; + V.val[2] = vResult2; + + vst3q_f32(reinterpret_cast(pOutputVector), V); + pOutputVector += sizeof(XMFLOAT3) * 4; + + i += 4; + } + } + } + + for (; i < VectorCount; i++) + { + float32x2_t VL = vld1_f32(reinterpret_cast(pInputVector)); + float32x2_t zero = vdup_n_f32(0); + float32x2_t VH = vld1_lane_f32(reinterpret_cast(pInputVector) + 2, zero, 0); + pInputVector += InputStride; + + XMVECTOR vResult = vmulq_lane_f32(row0, VL, 0); // X + vResult = vmlaq_lane_f32(vResult, row1, VL, 1); // Y + vResult = vmlaq_lane_f32(vResult, row2, VH, 0); // Z + + VL = vget_low_f32(vResult); + vst1_f32(reinterpret_cast(pOutputVector), VL); + vst1q_lane_f32(reinterpret_cast(pOutputVector) + 2, vResult, 2); + pOutputVector += OutputStride; + } + + return pOutputStream; +#elif defined(_XM_SSE_INTRINSICS_) + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row2 = M.r[2]; + + size_t i = 0; + size_t four = VectorCount >> 2; + if (four > 0) + { + if (InputStride == sizeof(XMFLOAT3)) + { + if (OutputStride == sizeof(XMFLOAT3)) + { + if (!(reinterpret_cast(pOutputStream) & 0xF)) + { + // Packed input, aligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = 
_mm_loadu_ps(reinterpret_cast(pInputVector)); + __m128 L2 = _mm_loadu_ps(reinterpret_cast(pInputVector + 16)); + __m128 L3 = _mm_loadu_ps(reinterpret_cast(pInputVector + 32)); + pInputVector += sizeof(XMFLOAT3) * 4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1, L2, L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = _mm_mul_ps(Z, row2); + XMVECTOR vTemp2 = _mm_mul_ps(Y, row1); + XMVECTOR vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + V1 = _mm_add_ps(vTemp, vTemp3); + + // Result 2 + Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = _mm_mul_ps(Z, row2); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + V2 = _mm_add_ps(vTemp, vTemp3); + + // Result 3 + Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = _mm_mul_ps(Z, row2); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + V3 = _mm_add_ps(vTemp, vTemp3); + + // Result 4 + Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = _mm_mul_ps(Z, row2); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + V4 = _mm_add_ps(vTemp, vTemp3); + + // Pack and store the vectors + XM3PACK4INTO3(vTemp); + XM_STREAM_PS(reinterpret_cast(pOutputVector), V1); + XM_STREAM_PS(reinterpret_cast(pOutputVector + 16), vTemp); + XM_STREAM_PS(reinterpret_cast(pOutputVector + 32), V3); + pOutputVector += sizeof(XMFLOAT3) * 4; + i += 4; + } + } + else + { + // Packed input, unaligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps(reinterpret_cast(pInputVector)); + __m128 L2 = _mm_loadu_ps(reinterpret_cast(pInputVector + 16)); + __m128 L3 = _mm_loadu_ps(reinterpret_cast(pInputVector + 32)); + pInputVector += sizeof(XMFLOAT3) * 4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1, L2, L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = _mm_mul_ps(Z, row2); + XMVECTOR vTemp2 = _mm_mul_ps(Y, row1); + XMVECTOR vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + V1 = _mm_add_ps(vTemp, vTemp3); + + // Result 2 + Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = _mm_mul_ps(Z, row2); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + V2 = _mm_add_ps(vTemp, vTemp3); + + // Result 3 + Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = _mm_mul_ps(Z, row2); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + V3 = _mm_add_ps(vTemp, vTemp3); + + // Result 4 + Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 
1, 1)); + X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = _mm_mul_ps(Z, row2); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + V4 = _mm_add_ps(vTemp, vTemp3); + + // Pack and store the vectors + XM3PACK4INTO3(vTemp); + _mm_storeu_ps(reinterpret_cast(pOutputVector), V1); + _mm_storeu_ps(reinterpret_cast(pOutputVector + 16), vTemp); + _mm_storeu_ps(reinterpret_cast(pOutputVector + 32), V3); + pOutputVector += sizeof(XMFLOAT3) * 4; + i += 4; + } + } + } + else + { + // Packed input, unpacked output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps(reinterpret_cast(pInputVector)); + __m128 L2 = _mm_loadu_ps(reinterpret_cast(pInputVector + 16)); + __m128 L3 = _mm_loadu_ps(reinterpret_cast(pInputVector + 32)); + pInputVector += sizeof(XMFLOAT3) * 4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1, L2, L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = _mm_mul_ps(Z, row2); + XMVECTOR vTemp2 = _mm_mul_ps(Y, row1); + XMVECTOR vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 2 + Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = _mm_mul_ps(Z, row2); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 3 + Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = _mm_mul_ps(Z, row2); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 4 + Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = _mm_mul_ps(Z, row2); + vTemp2 = _mm_mul_ps(Y, row1); + vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + i += 4; + } + } + } + } + + for (; i < VectorCount; i++) + { + XMVECTOR V = XMLoadFloat3(reinterpret_cast(pInputVector)); + pInputVector += InputStride; + + XMVECTOR Z = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = _mm_mul_ps(Z, row2); + XMVECTOR vTemp2 = _mm_mul_ps(Y, row1); + XMVECTOR vTemp3 = _mm_mul_ps(X, row0); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + } + + XM_SFENCE(); + + return pOutputStream; +#endif +} + +#ifdef _PREFAST_ +#pragma prefast(pop) +#endif + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3Project 
+( + FXMVECTOR V, + float ViewportX, + float ViewportY, + float ViewportWidth, + float ViewportHeight, + float ViewportMinZ, + float ViewportMaxZ, + FXMMATRIX Projection, + CXMMATRIX View, + CXMMATRIX World +) noexcept +{ + const float HalfViewportWidth = ViewportWidth * 0.5f; + const float HalfViewportHeight = ViewportHeight * 0.5f; + + XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight, ViewportMaxZ - ViewportMinZ, 0.0f); + XMVECTOR Offset = XMVectorSet(ViewportX + HalfViewportWidth, ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f); + + XMMATRIX Transform = XMMatrixMultiply(World, View); + Transform = XMMatrixMultiply(Transform, Projection); + + XMVECTOR Result = XMVector3TransformCoord(V, Transform); + + Result = XMVectorMultiplyAdd(Result, Scale, Offset); + + return Result; +} + +//------------------------------------------------------------------------------ + +#ifdef _PREFAST_ +#pragma prefast(push) +#pragma prefast(disable : 26015 26019, "PREfast noise: Esp:1307" ) +#endif + +_Use_decl_annotations_ +inline XMFLOAT3* XM_CALLCONV XMVector3ProjectStream +( + XMFLOAT3* pOutputStream, + size_t OutputStride, + const XMFLOAT3* pInputStream, + size_t InputStride, + size_t VectorCount, + float ViewportX, + float ViewportY, + float ViewportWidth, + float ViewportHeight, + float ViewportMinZ, + float ViewportMaxZ, + FXMMATRIX Projection, + CXMMATRIX View, + CXMMATRIX World +) noexcept +{ + assert(pOutputStream != nullptr); + assert(pInputStream != nullptr); + + assert(InputStride >= sizeof(XMFLOAT3)); + _Analysis_assume_(InputStride >= sizeof(XMFLOAT3)); + + assert(OutputStride >= sizeof(XMFLOAT3)); + _Analysis_assume_(OutputStride >= sizeof(XMFLOAT3)); + +#if defined(_XM_NO_INTRINSICS_) + + const float HalfViewportWidth = ViewportWidth * 0.5f; + const float HalfViewportHeight = ViewportHeight * 0.5f; + + XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight, ViewportMaxZ - ViewportMinZ, 1.0f); + XMVECTOR Offset = XMVectorSet(ViewportX + HalfViewportWidth, ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f); + + XMMATRIX Transform = XMMatrixMultiply(World, View); + Transform = XMMatrixMultiply(Transform, Projection); + + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + for (size_t i = 0; i < VectorCount; i++) + { + XMVECTOR V = XMLoadFloat3(reinterpret_cast(pInputVector)); + + XMVECTOR Result = XMVector3TransformCoord(V, Transform); + Result = XMVectorMultiplyAdd(Result, Scale, Offset); + + XMStoreFloat3(reinterpret_cast(pOutputVector), Result); + + pInputVector += InputStride; + pOutputVector += OutputStride; + } + + return pOutputStream; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + const float HalfViewportWidth = ViewportWidth * 0.5f; + const float HalfViewportHeight = ViewportHeight * 0.5f; + + XMMATRIX Transform = XMMatrixMultiply(World, View); + Transform = XMMatrixMultiply(Transform, Projection); + + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + size_t i = 0; + size_t four = VectorCount >> 2; + if (four > 0) + { + if ((InputStride == sizeof(XMFLOAT3)) && (OutputStride == sizeof(XMFLOAT3))) + { + XMVECTOR ScaleX = vdupq_n_f32(HalfViewportWidth); + XMVECTOR ScaleY = vdupq_n_f32(-HalfViewportHeight); + XMVECTOR ScaleZ = vdupq_n_f32(ViewportMaxZ - ViewportMinZ); + + XMVECTOR OffsetX = vdupq_n_f32(ViewportX + HalfViewportWidth); + XMVECTOR OffsetY = vdupq_n_f32(ViewportY + HalfViewportHeight); + XMVECTOR OffsetZ = 
vdupq_n_f32(ViewportMinZ); + + for (size_t j = 0; j < four; ++j) + { + float32x4x3_t V = vld3q_f32(reinterpret_cast(pInputVector)); + pInputVector += sizeof(XMFLOAT3) * 4; + + float32x2_t r3 = vget_low_f32(Transform.r[3]); + float32x2_t r = vget_low_f32(Transform.r[0]); + XMVECTOR vResult0 = vmlaq_lane_f32(vdupq_lane_f32(r3, 0), V.val[0], r, 0); // Ax+M + XMVECTOR vResult1 = vmlaq_lane_f32(vdupq_lane_f32(r3, 1), V.val[0], r, 1); // Bx+N + + XM_PREFETCH(pInputVector); + + r3 = vget_high_f32(Transform.r[3]); + r = vget_high_f32(Transform.r[0]); + XMVECTOR vResult2 = vmlaq_lane_f32(vdupq_lane_f32(r3, 0), V.val[0], r, 0); // Cx+O + XMVECTOR W = vmlaq_lane_f32(vdupq_lane_f32(r3, 1), V.val[0], r, 1); // Dx+P + + XM_PREFETCH(pInputVector + XM_CACHE_LINE_SIZE); + + r = vget_low_f32(Transform.r[1]); + vResult0 = vmlaq_lane_f32(vResult0, V.val[1], r, 0); // Ax+Ey+M + vResult1 = vmlaq_lane_f32(vResult1, V.val[1], r, 1); // Bx+Fy+N + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 2)); + + r = vget_high_f32(Transform.r[1]); + vResult2 = vmlaq_lane_f32(vResult2, V.val[1], r, 0); // Cx+Gy+O + W = vmlaq_lane_f32(W, V.val[1], r, 1); // Dx+Hy+P + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 3)); + + r = vget_low_f32(Transform.r[2]); + vResult0 = vmlaq_lane_f32(vResult0, V.val[2], r, 0); // Ax+Ey+Iz+M + vResult1 = vmlaq_lane_f32(vResult1, V.val[2], r, 1); // Bx+Fy+Jz+N + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 4)); + + r = vget_high_f32(Transform.r[2]); + vResult2 = vmlaq_lane_f32(vResult2, V.val[2], r, 0); // Cx+Gy+Kz+O + W = vmlaq_lane_f32(W, V.val[2], r, 1); // Dx+Hy+Lz+P + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 5)); + +#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__ + vResult0 = vdivq_f32(vResult0, W); + vResult1 = vdivq_f32(vResult1, W); + vResult2 = vdivq_f32(vResult2, W); +#else + // 2 iterations of Newton-Raphson refinement of reciprocal + float32x4_t Reciprocal = vrecpeq_f32(W); + float32x4_t S = vrecpsq_f32(Reciprocal, W); + Reciprocal = vmulq_f32(S, Reciprocal); + S = vrecpsq_f32(Reciprocal, W); + Reciprocal = vmulq_f32(S, Reciprocal); + + vResult0 = vmulq_f32(vResult0, Reciprocal); + vResult1 = vmulq_f32(vResult1, Reciprocal); + vResult2 = vmulq_f32(vResult2, Reciprocal); +#endif + + V.val[0] = vmlaq_f32(OffsetX, vResult0, ScaleX); + V.val[1] = vmlaq_f32(OffsetY, vResult1, ScaleY); + V.val[2] = vmlaq_f32(OffsetZ, vResult2, ScaleZ); + + vst3q_f32(reinterpret_cast(pOutputVector), V); + pOutputVector += sizeof(XMFLOAT3) * 4; + + i += 4; + } + } + } + + if (i < VectorCount) + { + XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight, ViewportMaxZ - ViewportMinZ, 1.0f); + XMVECTOR Offset = XMVectorSet(ViewportX + HalfViewportWidth, ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f); + + for (; i < VectorCount; i++) + { + float32x2_t VL = vld1_f32(reinterpret_cast(pInputVector)); + float32x2_t zero = vdup_n_f32(0); + float32x2_t VH = vld1_lane_f32(reinterpret_cast(pInputVector) + 2, zero, 0); + pInputVector += InputStride; + + XMVECTOR vResult = vmlaq_lane_f32(Transform.r[3], Transform.r[0], VL, 0); // X + vResult = vmlaq_lane_f32(vResult, Transform.r[1], VL, 1); // Y + vResult = vmlaq_lane_f32(vResult, Transform.r[2], VH, 0); // Z + + VH = vget_high_f32(vResult); + XMVECTOR W = vdupq_lane_f32(VH, 1); + +#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__ + vResult = vdivq_f32(vResult, W); +#else + // 2 iterations of Newton-Raphson refinement of reciprocal for W + 
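// vrecpeq_f32 only yields a rough estimate of 1/w; each vrecpsq_f32/vmulq_f32
// pair below applies one Newton-Raphson step, x' = x * (2 - w * x), so two
// steps recover close to full single-precision accuracy before the multiply
// that replaces the divide.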
float32x4_t Reciprocal = vrecpeq_f32(W); + float32x4_t S = vrecpsq_f32(Reciprocal, W); + Reciprocal = vmulq_f32(S, Reciprocal); + S = vrecpsq_f32(Reciprocal, W); + Reciprocal = vmulq_f32(S, Reciprocal); + + vResult = vmulq_f32(vResult, Reciprocal); +#endif + + vResult = vmlaq_f32(Offset, vResult, Scale); + + VL = vget_low_f32(vResult); + vst1_f32(reinterpret_cast(pOutputVector), VL); + vst1q_lane_f32(reinterpret_cast(pOutputVector) + 2, vResult, 2); + pOutputVector += OutputStride; + } + } + + return pOutputStream; +#elif defined(_XM_SSE_INTRINSICS_) + const float HalfViewportWidth = ViewportWidth * 0.5f; + const float HalfViewportHeight = ViewportHeight * 0.5f; + + XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight, ViewportMaxZ - ViewportMinZ, 1.0f); + XMVECTOR Offset = XMVectorSet(ViewportX + HalfViewportWidth, ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f); + + XMMATRIX Transform = XMMatrixMultiply(World, View); + Transform = XMMatrixMultiply(Transform, Projection); + + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + size_t i = 0; + size_t four = VectorCount >> 2; + if (four > 0) + { + if (InputStride == sizeof(XMFLOAT3)) + { + if (OutputStride == sizeof(XMFLOAT3)) + { + if (!(reinterpret_cast(pOutputStream) & 0xF)) + { + // Packed input, aligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps(reinterpret_cast(pInputVector)); + __m128 L2 = _mm_loadu_ps(reinterpret_cast(pInputVector + 16)); + __m128 L3 = _mm_loadu_ps(reinterpret_cast(pInputVector + 32)); + pInputVector += sizeof(XMFLOAT3) * 4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1, L2, L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + XMVECTOR vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + XMVECTOR vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + vTemp = _mm_div_ps(vTemp, W); + V1 = XM_FMADD_PS(vTemp, Scale, Offset); + + // Result 2 + Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + vTemp = _mm_div_ps(vTemp, W); + V2 = XM_FMADD_PS(vTemp, Scale, Offset); + + // Result 3 + Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + vTemp = _mm_div_ps(vTemp, W); + V3 = XM_FMADD_PS(vTemp, Scale, Offset); + + // Result 4 + Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); 
+ vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + vTemp = _mm_div_ps(vTemp, W); + V4 = XM_FMADD_PS(vTemp, Scale, Offset); + + // Pack and store the vectors + XM3PACK4INTO3(vTemp); + XM_STREAM_PS(reinterpret_cast(pOutputVector), V1); + XM_STREAM_PS(reinterpret_cast(pOutputVector + 16), vTemp); + XM_STREAM_PS(reinterpret_cast(pOutputVector + 32), V3); + pOutputVector += sizeof(XMFLOAT3) * 4; + i += 4; + } + } + else + { + // Packed input, unaligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps(reinterpret_cast(pInputVector)); + __m128 L2 = _mm_loadu_ps(reinterpret_cast(pInputVector + 16)); + __m128 L3 = _mm_loadu_ps(reinterpret_cast(pInputVector + 32)); + pInputVector += sizeof(XMFLOAT3) * 4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1, L2, L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + XMVECTOR vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + XMVECTOR vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + vTemp = _mm_div_ps(vTemp, W); + V1 = XM_FMADD_PS(vTemp, Scale, Offset); + + // Result 2 + Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + vTemp = _mm_div_ps(vTemp, W); + V2 = XM_FMADD_PS(vTemp, Scale, Offset); + + // Result 3 + Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + vTemp = _mm_div_ps(vTemp, W); + V3 = XM_FMADD_PS(vTemp, Scale, Offset); + + // Result 4 + Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + vTemp = _mm_div_ps(vTemp, W); + V4 = XM_FMADD_PS(vTemp, Scale, Offset); + + // Pack and store the vectors + XM3PACK4INTO3(vTemp); + _mm_storeu_ps(reinterpret_cast(pOutputVector), V1); + _mm_storeu_ps(reinterpret_cast(pOutputVector + 16), vTemp); + _mm_storeu_ps(reinterpret_cast(pOutputVector + 32), V3); + pOutputVector += sizeof(XMFLOAT3) * 4; + i += 4; + } + } + } + else + { + // Packed input, unpacked output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps(reinterpret_cast(pInputVector)); + __m128 L2 
= _mm_loadu_ps(reinterpret_cast(pInputVector + 16)); + __m128 L3 = _mm_loadu_ps(reinterpret_cast(pInputVector + 32)); + pInputVector += sizeof(XMFLOAT3) * 4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1, L2, L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + XMVECTOR vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + XMVECTOR vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + vTemp = _mm_div_ps(vTemp, W); + vTemp = XM_FMADD_PS(vTemp, Scale, Offset); + + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 2 + Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + vTemp = _mm_div_ps(vTemp, W); + vTemp = XM_FMADD_PS(vTemp, Scale, Offset); + + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 3 + Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + vTemp = _mm_div_ps(vTemp, W); + vTemp = XM_FMADD_PS(vTemp, Scale, Offset); + + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 4 + Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + vTemp = _mm_div_ps(vTemp, W); + vTemp = XM_FMADD_PS(vTemp, Scale, Offset); + + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + i += 4; + } + } + } + } + + for (; i < VectorCount; i++) + { + XMVECTOR V = XMLoadFloat3(reinterpret_cast(pInputVector)); + pInputVector += InputStride; + + XMVECTOR Z = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + XMVECTOR vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + XMVECTOR vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + vTemp = _mm_div_ps(vTemp, W); + vTemp = XM_FMADD_PS(vTemp, Scale, Offset); + + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + } + 
+    XM_SFENCE();
+
+    return pOutputStream;
+#endif
+}
+
+#ifdef _PREFAST_
+#pragma prefast(pop)
+#endif
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVector3Unproject
+(
+    FXMVECTOR V,
+    float ViewportX,
+    float ViewportY,
+    float ViewportWidth,
+    float ViewportHeight,
+    float ViewportMinZ,
+    float ViewportMaxZ,
+    FXMMATRIX Projection,
+    CXMMATRIX View,
+    CXMMATRIX World
+) noexcept
+{
+    static const XMVECTORF32 D = { { { -1.0f, 1.0f, 0.0f, 0.0f } } };
+
+    XMVECTOR Scale = XMVectorSet(ViewportWidth * 0.5f, -ViewportHeight * 0.5f, ViewportMaxZ - ViewportMinZ, 1.0f);
+    Scale = XMVectorReciprocal(Scale);
+
+    XMVECTOR Offset = XMVectorSet(-ViewportX, -ViewportY, -ViewportMinZ, 0.0f);
+    Offset = XMVectorMultiplyAdd(Scale, Offset, D.v);
+
+    XMMATRIX Transform = XMMatrixMultiply(World, View);
+    Transform = XMMatrixMultiply(Transform, Projection);
+    Transform = XMMatrixInverse(nullptr, Transform);
+
+    XMVECTOR Result = XMVectorMultiplyAdd(V, Scale, Offset);
+
+    return XMVector3TransformCoord(Result, Transform);
+}
+
+//------------------------------------------------------------------------------
+
+#ifdef _PREFAST_
+#pragma prefast(push)
+#pragma prefast(disable : 26015 26019, "PREfast noise: Esp:1307" )
+#endif
+
+_Use_decl_annotations_
+inline XMFLOAT3* XM_CALLCONV XMVector3UnprojectStream
+(
+    XMFLOAT3* pOutputStream,
+    size_t OutputStride,
+    const XMFLOAT3* pInputStream,
+    size_t InputStride,
+    size_t VectorCount,
+    float ViewportX,
+    float ViewportY,
+    float ViewportWidth,
+    float ViewportHeight,
+    float ViewportMinZ,
+    float ViewportMaxZ,
+    FXMMATRIX Projection,
+    CXMMATRIX View,
+    CXMMATRIX World
+) noexcept
+{
+    assert(pOutputStream != nullptr);
+    assert(pInputStream != nullptr);
+
+    assert(InputStride >= sizeof(XMFLOAT3));
+    _Analysis_assume_(InputStride >= sizeof(XMFLOAT3));
+
+    assert(OutputStride >= sizeof(XMFLOAT3));
+    _Analysis_assume_(OutputStride >= sizeof(XMFLOAT3));
+
+#if defined(_XM_NO_INTRINSICS_)
+
+    static const XMVECTORF32 D = { { { -1.0f, 1.0f, 0.0f, 0.0f } } };
+
+    XMVECTOR Scale = XMVectorSet(ViewportWidth * 0.5f, -ViewportHeight * 0.5f, ViewportMaxZ - ViewportMinZ, 1.0f);
+    Scale = XMVectorReciprocal(Scale);
+
+    XMVECTOR Offset = XMVectorSet(-ViewportX, -ViewportY, -ViewportMinZ, 0.0f);
+    Offset = XMVectorMultiplyAdd(Scale, Offset, D.v);
+
+    XMMATRIX Transform = XMMatrixMultiply(World, View);
+    Transform = XMMatrixMultiply(Transform, Projection);
+    Transform = XMMatrixInverse(nullptr, Transform);
+
+    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
+    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
+
+    for (size_t i = 0; i < VectorCount; i++)
+    {
+        XMVECTOR V = XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(pInputVector));
+
+        XMVECTOR Result = XMVectorMultiplyAdd(V, Scale, Offset);
+
+        Result = XMVector3TransformCoord(Result, Transform);
+
+        XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), Result);
+
+        pInputVector += InputStride;
+        pOutputVector += OutputStride;
+    }
+
+    return pOutputStream;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    XMMATRIX Transform = XMMatrixMultiply(World, View);
+    Transform = XMMatrixMultiply(Transform, Projection);
+    Transform = XMMatrixInverse(nullptr, Transform);
+
+    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
+    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
+
+    float sx = 1.f / (ViewportWidth * 0.5f);
+    float sy = 1.f / (-ViewportHeight * 0.5f);
+    float sz = 1.f / (ViewportMaxZ - ViewportMinZ);
+
+    float ox = (-ViewportX * sx) - 1.f;
+    float oy = (-ViewportY * sy) +
1.f; + float oz = (-ViewportMinZ * sz); + + size_t i = 0; + size_t four = VectorCount >> 2; + if (four > 0) + { + if ((InputStride == sizeof(XMFLOAT3)) && (OutputStride == sizeof(XMFLOAT3))) + { + for (size_t j = 0; j < four; ++j) + { + float32x4x3_t V = vld3q_f32(reinterpret_cast(pInputVector)); + pInputVector += sizeof(XMFLOAT3) * 4; + + XMVECTOR ScaleX = vdupq_n_f32(sx); + XMVECTOR OffsetX = vdupq_n_f32(ox); + XMVECTOR VX = vmlaq_f32(OffsetX, ScaleX, V.val[0]); + + float32x2_t r3 = vget_low_f32(Transform.r[3]); + float32x2_t r = vget_low_f32(Transform.r[0]); + XMVECTOR vResult0 = vmlaq_lane_f32(vdupq_lane_f32(r3, 0), VX, r, 0); // Ax+M + XMVECTOR vResult1 = vmlaq_lane_f32(vdupq_lane_f32(r3, 1), VX, r, 1); // Bx+N + + XM_PREFETCH(pInputVector); + + r3 = vget_high_f32(Transform.r[3]); + r = vget_high_f32(Transform.r[0]); + XMVECTOR vResult2 = vmlaq_lane_f32(vdupq_lane_f32(r3, 0), VX, r, 0); // Cx+O + XMVECTOR W = vmlaq_lane_f32(vdupq_lane_f32(r3, 1), VX, r, 1); // Dx+P + + XM_PREFETCH(pInputVector + XM_CACHE_LINE_SIZE); + + XMVECTOR ScaleY = vdupq_n_f32(sy); + XMVECTOR OffsetY = vdupq_n_f32(oy); + XMVECTOR VY = vmlaq_f32(OffsetY, ScaleY, V.val[1]); + + r = vget_low_f32(Transform.r[1]); + vResult0 = vmlaq_lane_f32(vResult0, VY, r, 0); // Ax+Ey+M + vResult1 = vmlaq_lane_f32(vResult1, VY, r, 1); // Bx+Fy+N + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 2)); + + r = vget_high_f32(Transform.r[1]); + vResult2 = vmlaq_lane_f32(vResult2, VY, r, 0); // Cx+Gy+O + W = vmlaq_lane_f32(W, VY, r, 1); // Dx+Hy+P + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 3)); + + XMVECTOR ScaleZ = vdupq_n_f32(sz); + XMVECTOR OffsetZ = vdupq_n_f32(oz); + XMVECTOR VZ = vmlaq_f32(OffsetZ, ScaleZ, V.val[2]); + + r = vget_low_f32(Transform.r[2]); + vResult0 = vmlaq_lane_f32(vResult0, VZ, r, 0); // Ax+Ey+Iz+M + vResult1 = vmlaq_lane_f32(vResult1, VZ, r, 1); // Bx+Fy+Jz+N + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 4)); + + r = vget_high_f32(Transform.r[2]); + vResult2 = vmlaq_lane_f32(vResult2, VZ, r, 0); // Cx+Gy+Kz+O + W = vmlaq_lane_f32(W, VZ, r, 1); // Dx+Hy+Lz+P + + XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 5)); + +#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__ + V.val[0] = vdivq_f32(vResult0, W); + V.val[1] = vdivq_f32(vResult1, W); + V.val[2] = vdivq_f32(vResult2, W); +#else + // 2 iterations of Newton-Raphson refinement of reciprocal + float32x4_t Reciprocal = vrecpeq_f32(W); + float32x4_t S = vrecpsq_f32(Reciprocal, W); + Reciprocal = vmulq_f32(S, Reciprocal); + S = vrecpsq_f32(Reciprocal, W); + Reciprocal = vmulq_f32(S, Reciprocal); + + V.val[0] = vmulq_f32(vResult0, Reciprocal); + V.val[1] = vmulq_f32(vResult1, Reciprocal); + V.val[2] = vmulq_f32(vResult2, Reciprocal); +#endif + + vst3q_f32(reinterpret_cast(pOutputVector), V); + pOutputVector += sizeof(XMFLOAT3) * 4; + + i += 4; + } + } + } + + if (i < VectorCount) + { + float32x2_t ScaleL = vcreate_f32( + static_cast(*reinterpret_cast(&sx)) + | (static_cast(*reinterpret_cast(&sy)) << 32)); + float32x2_t ScaleH = vcreate_f32(static_cast(*reinterpret_cast(&sz))); + + float32x2_t OffsetL = vcreate_f32( + static_cast(*reinterpret_cast(&ox)) + | (static_cast(*reinterpret_cast(&oy)) << 32)); + float32x2_t OffsetH = vcreate_f32(static_cast(*reinterpret_cast(&oz))); + + for (; i < VectorCount; i++) + { + float32x2_t VL = vld1_f32(reinterpret_cast(pInputVector)); + float32x2_t zero = vdup_n_f32(0); + float32x2_t VH = vld1_lane_f32(reinterpret_cast(pInputVector) + 2, zero, 0); + pInputVector += 
InputStride; + + VL = vmla_f32(OffsetL, VL, ScaleL); + VH = vmla_f32(OffsetH, VH, ScaleH); + + XMVECTOR vResult = vmlaq_lane_f32(Transform.r[3], Transform.r[0], VL, 0); // X + vResult = vmlaq_lane_f32(vResult, Transform.r[1], VL, 1); // Y + vResult = vmlaq_lane_f32(vResult, Transform.r[2], VH, 0); // Z + + VH = vget_high_f32(vResult); + XMVECTOR W = vdupq_lane_f32(VH, 1); + +#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__ + vResult = vdivq_f32(vResult, W); +#else + // 2 iterations of Newton-Raphson refinement of reciprocal for W + float32x4_t Reciprocal = vrecpeq_f32(W); + float32x4_t S = vrecpsq_f32(Reciprocal, W); + Reciprocal = vmulq_f32(S, Reciprocal); + S = vrecpsq_f32(Reciprocal, W); + Reciprocal = vmulq_f32(S, Reciprocal); + + vResult = vmulq_f32(vResult, Reciprocal); +#endif + + VL = vget_low_f32(vResult); + vst1_f32(reinterpret_cast(pOutputVector), VL); + vst1q_lane_f32(reinterpret_cast(pOutputVector) + 2, vResult, 2); + pOutputVector += OutputStride; + } + } + + return pOutputStream; +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 D = { { { -1.0f, 1.0f, 0.0f, 0.0f } } }; + + XMVECTOR Scale = XMVectorSet(ViewportWidth * 0.5f, -ViewportHeight * 0.5f, ViewportMaxZ - ViewportMinZ, 1.0f); + Scale = XMVectorReciprocal(Scale); + + XMVECTOR Offset = XMVectorSet(-ViewportX, -ViewportY, -ViewportMinZ, 0.0f); + Offset = _mm_mul_ps(Scale, Offset); + Offset = _mm_add_ps(Offset, D); + + XMMATRIX Transform = XMMatrixMultiply(World, View); + Transform = XMMatrixMultiply(Transform, Projection); + Transform = XMMatrixInverse(nullptr, Transform); + + auto pInputVector = reinterpret_cast(pInputStream); + auto pOutputVector = reinterpret_cast(pOutputStream); + + size_t i = 0; + size_t four = VectorCount >> 2; + if (four > 0) + { + if (InputStride == sizeof(XMFLOAT3)) + { + if (OutputStride == sizeof(XMFLOAT3)) + { + if (!(reinterpret_cast(pOutputStream) & 0xF)) + { + // Packed input, aligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps(reinterpret_cast(pInputVector)); + __m128 L2 = _mm_loadu_ps(reinterpret_cast(pInputVector + 16)); + __m128 L3 = _mm_loadu_ps(reinterpret_cast(pInputVector + 32)); + pInputVector += sizeof(XMFLOAT3) * 4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1, L2, L3); + + // Result 1 + V1 = XM_FMADD_PS(V1, Scale, Offset); + + XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + XMVECTOR vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + XMVECTOR vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + V1 = _mm_div_ps(vTemp, W); + + // Result 2 + V2 = XM_FMADD_PS(V2, Scale, Offset); + + Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + V2 = _mm_div_ps(vTemp, W); + + // Result 3 + V3 = XM_FMADD_PS(V3, Scale, Offset); + + Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V3, 
_MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + V3 = _mm_div_ps(vTemp, W); + + // Result 4 + V4 = XM_FMADD_PS(V4, Scale, Offset); + + Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + V4 = _mm_div_ps(vTemp, W); + + // Pack and store the vectors + XM3PACK4INTO3(vTemp); + XM_STREAM_PS(reinterpret_cast(pOutputVector), V1); + XM_STREAM_PS(reinterpret_cast(pOutputVector + 16), vTemp); + XM_STREAM_PS(reinterpret_cast(pOutputVector + 32), V3); + pOutputVector += sizeof(XMFLOAT3) * 4; + i += 4; + } + } + else + { + // Packed input, unaligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps(reinterpret_cast(pInputVector)); + __m128 L2 = _mm_loadu_ps(reinterpret_cast(pInputVector + 16)); + __m128 L3 = _mm_loadu_ps(reinterpret_cast(pInputVector + 32)); + pInputVector += sizeof(XMFLOAT3) * 4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1, L2, L3); + + // Result 1 + V1 = XM_FMADD_PS(V1, Scale, Offset); + + XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + XMVECTOR vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + XMVECTOR vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + V1 = _mm_div_ps(vTemp, W); + + // Result 2 + V2 = XM_FMADD_PS(V2, Scale, Offset); + + Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + V2 = _mm_div_ps(vTemp, W); + + // Result 3 + V3 = XM_FMADD_PS(V3, Scale, Offset); + + Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + V3 = _mm_div_ps(vTemp, W); + + // Result 4 + V4 = XM_FMADD_PS(V4, Scale, Offset); + + Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, 
vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + V4 = _mm_div_ps(vTemp, W); + + // Pack and store the vectors + XM3PACK4INTO3(vTemp); + _mm_storeu_ps(reinterpret_cast(pOutputVector), V1); + _mm_storeu_ps(reinterpret_cast(pOutputVector + 16), vTemp); + _mm_storeu_ps(reinterpret_cast(pOutputVector + 32), V3); + pOutputVector += sizeof(XMFLOAT3) * 4; + i += 4; + } + } + } + else + { + // Packed input, unpacked output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps(reinterpret_cast(pInputVector)); + __m128 L2 = _mm_loadu_ps(reinterpret_cast(pInputVector + 16)); + __m128 L3 = _mm_loadu_ps(reinterpret_cast(pInputVector + 32)); + pInputVector += sizeof(XMFLOAT3) * 4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1, L2, L3); + + // Result 1 + V1 = XM_FMADD_PS(V1, Scale, Offset); + + XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + XMVECTOR vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + XMVECTOR vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + vTemp = _mm_div_ps(vTemp, W); + + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 2 + V2 = XM_FMADD_PS(V2, Scale, Offset); + + Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + vTemp = _mm_div_ps(vTemp, W); + + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 3 + V3 = XM_FMADD_PS(V3, Scale, Offset); + + Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + vTemp = _mm_div_ps(vTemp, W); + + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 4 + V4 = XM_FMADD_PS(V4, Scale, Offset); + + Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2)); + Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1)); + X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0)); + + vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + vTemp = _mm_div_ps(vTemp, W); + + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + i += 4; + } + } + } + } + + for (; i < VectorCount; i++) + { + XMVECTOR V = XMLoadFloat3(reinterpret_cast(pInputVector)); + pInputVector += InputStride; + + V = _mm_mul_ps(V, Scale); + V = _mm_add_ps(V, Offset); + + XMVECTOR Z = XM_PERMUTE_PS(V, 
_MM_SHUFFLE(2, 2, 2, 2)); + XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); + XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); + + XMVECTOR vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]); + XMVECTOR vTemp2 = _mm_mul_ps(Y, Transform.r[1]); + XMVECTOR vTemp3 = _mm_mul_ps(X, Transform.r[0]); + vTemp = _mm_add_ps(vTemp, vTemp2); + vTemp = _mm_add_ps(vTemp, vTemp3); + + XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3)); + vTemp = _mm_div_ps(vTemp, W); + + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + } + + XM_SFENCE(); + + return pOutputStream; +#endif +} + +#ifdef _PREFAST_ +#pragma prefast(pop) +#endif + +/**************************************************************************** + * + * 4D Vector + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ + // Comparison operations + //------------------------------------------------------------------------------ + + //------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector4Equal +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] == V2.vector4_f32[0]) && (V1.vector4_f32[1] == V2.vector4_f32[1]) && (V1.vector4_f32[2] == V2.vector4_f32[2]) && (V1.vector4_f32[3] == V2.vector4_f32[3])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vceqq_f32(V1, V2); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return (vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) == 0xFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpeq_ps(V1, V2); + return ((_mm_movemask_ps(vTemp) == 0x0f) != 0); +#else + return XMComparisonAllTrue(XMVector4EqualR(V1, V2)); +#endif +} + +//------------------------------------------------------------------------------ + +inline uint32_t XM_CALLCONV XMVector4EqualR +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + uint32_t CR = 0; + + if ((V1.vector4_f32[0] == V2.vector4_f32[0]) && + (V1.vector4_f32[1] == V2.vector4_f32[1]) && + (V1.vector4_f32[2] == V2.vector4_f32[2]) && + (V1.vector4_f32[3] == V2.vector4_f32[3])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_f32[0] != V2.vector4_f32[0]) && + (V1.vector4_f32[1] != V2.vector4_f32[1]) && + (V1.vector4_f32[2] != V2.vector4_f32[2]) && + (V1.vector4_f32[3] != V2.vector4_f32[3])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vceqq_f32(V1, V2); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1); + + uint32_t CR = 0; + if (r == 0xFFFFFFFFU) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!r) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpeq_ps(V1, V2); + int iTest = _mm_movemask_ps(vTemp); + uint32_t CR = 0; + if (iTest == 0xf) // All equal? + { + CR = XM_CRMASK_CR6TRUE; + } + else if (iTest == 0) // All not equal? 
+ { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector4EqualInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_u32[0] == V2.vector4_u32[0]) && (V1.vector4_u32[1] == V2.vector4_u32[1]) && (V1.vector4_u32[2] == V2.vector4_u32[2]) && (V1.vector4_u32[3] == V2.vector4_u32[3])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vceqq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2)); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return (vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) == 0xFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2)); + return ((_mm_movemask_ps(_mm_castsi128_ps(vTemp)) == 0xf) != 0); +#else + return XMComparisonAllTrue(XMVector4EqualIntR(V1, V2)); +#endif +} + +//------------------------------------------------------------------------------ + +inline uint32_t XM_CALLCONV XMVector4EqualIntR +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + uint32_t CR = 0; + if (V1.vector4_u32[0] == V2.vector4_u32[0] && + V1.vector4_u32[1] == V2.vector4_u32[1] && + V1.vector4_u32[2] == V2.vector4_u32[2] && + V1.vector4_u32[3] == V2.vector4_u32[3]) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (V1.vector4_u32[0] != V2.vector4_u32[0] && + V1.vector4_u32[1] != V2.vector4_u32[1] && + V1.vector4_u32[2] != V2.vector4_u32[2] && + V1.vector4_u32[3] != V2.vector4_u32[3]) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vceqq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2)); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1); + + uint32_t CR = 0; + if (r == 0xFFFFFFFFU) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!r) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2)); + int iTest = _mm_movemask_ps(_mm_castsi128_ps(vTemp)); + uint32_t CR = 0; + if (iTest == 0xf) // All equal? + { + CR = XM_CRMASK_CR6TRUE; + } + else if (iTest == 0) // All not equal? 
+ { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +inline bool XM_CALLCONV XMVector4NearEqual +( + FXMVECTOR V1, + FXMVECTOR V2, + FXMVECTOR Epsilon +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + float dx, dy, dz, dw; + + dx = fabsf(V1.vector4_f32[0] - V2.vector4_f32[0]); + dy = fabsf(V1.vector4_f32[1] - V2.vector4_f32[1]); + dz = fabsf(V1.vector4_f32[2] - V2.vector4_f32[2]); + dw = fabsf(V1.vector4_f32[3] - V2.vector4_f32[3]); + return (((dx <= Epsilon.vector4_f32[0]) && + (dy <= Epsilon.vector4_f32[1]) && + (dz <= Epsilon.vector4_f32[2]) && + (dw <= Epsilon.vector4_f32[3])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t vDelta = vsubq_f32(V1, V2); +#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES) + uint32x4_t vResult = vacleq_f32(vDelta, Epsilon); +#else + uint32x4_t vResult = vcleq_f32(vabsq_f32(vDelta), Epsilon); +#endif + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return (vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) == 0xFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + // Get the difference + XMVECTOR vDelta = _mm_sub_ps(V1, V2); + // Get the absolute value of the difference + XMVECTOR vTemp = _mm_setzero_ps(); + vTemp = _mm_sub_ps(vTemp, vDelta); + vTemp = _mm_max_ps(vTemp, vDelta); + vTemp = _mm_cmple_ps(vTemp, Epsilon); + return ((_mm_movemask_ps(vTemp) == 0xf) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector4NotEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] != V2.vector4_f32[0]) || (V1.vector4_f32[1] != V2.vector4_f32[1]) || (V1.vector4_f32[2] != V2.vector4_f32[2]) || (V1.vector4_f32[3] != V2.vector4_f32[3])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vceqq_f32(V1, V2); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return (vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) != 0xFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpneq_ps(V1, V2); + return ((_mm_movemask_ps(vTemp)) != 0); +#else + return XMComparisonAnyFalse(XMVector4EqualR(V1, V2)); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector4NotEqualInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_u32[0] != V2.vector4_u32[0]) || (V1.vector4_u32[1] != V2.vector4_u32[1]) || (V1.vector4_u32[2] != V2.vector4_u32[2]) || (V1.vector4_u32[3] != V2.vector4_u32[3])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vceqq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2)); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return (vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) != 0xFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2)); + return 
((_mm_movemask_ps(_mm_castsi128_ps(vTemp)) != 0xF) != 0); +#else + return XMComparisonAnyFalse(XMVector4EqualIntR(V1, V2)); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector4Greater +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] > V2.vector4_f32[0]) && (V1.vector4_f32[1] > V2.vector4_f32[1]) && (V1.vector4_f32[2] > V2.vector4_f32[2]) && (V1.vector4_f32[3] > V2.vector4_f32[3])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcgtq_f32(V1, V2); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return (vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) == 0xFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpgt_ps(V1, V2); + return ((_mm_movemask_ps(vTemp) == 0x0f) != 0); +#else + return XMComparisonAllTrue(XMVector4GreaterR(V1, V2)); +#endif +} + +//------------------------------------------------------------------------------ + +inline uint32_t XM_CALLCONV XMVector4GreaterR +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + uint32_t CR = 0; + if (V1.vector4_f32[0] > V2.vector4_f32[0] && + V1.vector4_f32[1] > V2.vector4_f32[1] && + V1.vector4_f32[2] > V2.vector4_f32[2] && + V1.vector4_f32[3] > V2.vector4_f32[3]) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (V1.vector4_f32[0] <= V2.vector4_f32[0] && + V1.vector4_f32[1] <= V2.vector4_f32[1] && + V1.vector4_f32[2] <= V2.vector4_f32[2] && + V1.vector4_f32[3] <= V2.vector4_f32[3]) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcgtq_f32(V1, V2); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1); + + uint32_t CR = 0; + if (r == 0xFFFFFFFFU) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!r) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + uint32_t CR = 0; + XMVECTOR vTemp = _mm_cmpgt_ps(V1, V2); + int iTest = _mm_movemask_ps(vTemp); + if (iTest == 0xf) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector4GreaterOrEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] >= V2.vector4_f32[0]) && (V1.vector4_f32[1] >= V2.vector4_f32[1]) && (V1.vector4_f32[2] >= V2.vector4_f32[2]) && (V1.vector4_f32[3] >= V2.vector4_f32[3])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcgeq_f32(V1, V2); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return (vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) == 0xFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpge_ps(V1, V2); + return ((_mm_movemask_ps(vTemp) == 0x0f) != 0); +#else + return 
XMComparisonAllTrue(XMVector4GreaterOrEqualR(V1, V2)); +#endif +} + +//------------------------------------------------------------------------------ + +inline uint32_t XM_CALLCONV XMVector4GreaterOrEqualR +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + uint32_t CR = 0; + if ((V1.vector4_f32[0] >= V2.vector4_f32[0]) && + (V1.vector4_f32[1] >= V2.vector4_f32[1]) && + (V1.vector4_f32[2] >= V2.vector4_f32[2]) && + (V1.vector4_f32[3] >= V2.vector4_f32[3])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_f32[0] < V2.vector4_f32[0]) && + (V1.vector4_f32[1] < V2.vector4_f32[1]) && + (V1.vector4_f32[2] < V2.vector4_f32[2]) && + (V1.vector4_f32[3] < V2.vector4_f32[3])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcgeq_f32(V1, V2); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1); + + uint32_t CR = 0; + if (r == 0xFFFFFFFFU) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!r) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + uint32_t CR = 0; + XMVECTOR vTemp = _mm_cmpge_ps(V1, V2); + int iTest = _mm_movemask_ps(vTemp); + if (iTest == 0x0f) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector4Less +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] < V2.vector4_f32[0]) && (V1.vector4_f32[1] < V2.vector4_f32[1]) && (V1.vector4_f32[2] < V2.vector4_f32[2]) && (V1.vector4_f32[3] < V2.vector4_f32[3])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcltq_f32(V1, V2); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return (vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) == 0xFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmplt_ps(V1, V2); + return ((_mm_movemask_ps(vTemp) == 0x0f) != 0); +#else + return XMComparisonAllTrue(XMVector4GreaterR(V2, V1)); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector4LessOrEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] <= V2.vector4_f32[0]) && (V1.vector4_f32[1] <= V2.vector4_f32[1]) && (V1.vector4_f32[2] <= V2.vector4_f32[2]) && (V1.vector4_f32[3] <= V2.vector4_f32[3])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcleq_f32(V1, V2); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return (vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) == 0xFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmple_ps(V1, V2); + return ((_mm_movemask_ps(vTemp) == 0x0f) != 0); +#else + return XMComparisonAllTrue(XMVector4GreaterOrEqualR(V2, V1)); +#endif +} + 
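Not part of the upstream DirectXMath header: below is a minimal sketch of how the XMVector4 comparison, dot, and normalize helpers defined in this hunk could be called from engine code once the dependency is vendored. The function name ExampleVector4Checks and the tolerance values are illustrative assumptions introduced here, not anything defined by this patch.

// Illustrative usage sketch only; hypothetical helper, not part of DirectXMath or this patch.
#include <DirectXMath.h>
#include <cmath>

inline bool ExampleVector4Checks()
{
    using namespace DirectX;

    // Two nearly identical 4D vectors compared with a per-component epsilon.
    XMVECTOR a   = XMVectorSet(1.0f, 2.0f, 3.0f, 4.0f);
    XMVECTOR b   = XMVectorSet(1.0f, 2.0f, 3.0f, 4.0f + 1e-6f);
    XMVECTOR eps = XMVectorReplicate(1e-4f);
    bool nearlyEqual = XMVector4NearEqual(a, b, eps);

    // A normalized vector dotted with itself should have squared length ~1.
    XMVECTOR n  = XMVector4Normalize(a);
    float lenSq = XMVectorGetX(XMVector4Dot(n, n));

    return nearlyEqual && std::fabs(lenSq - 1.0f) < 1e-4f;
}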
+//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector4InBounds +( + FXMVECTOR V, + FXMVECTOR Bounds +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) && + (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1]) && + (V.vector4_f32[2] <= Bounds.vector4_f32[2] && V.vector4_f32[2] >= -Bounds.vector4_f32[2]) && + (V.vector4_f32[3] <= Bounds.vector4_f32[3] && V.vector4_f32[3] >= -Bounds.vector4_f32[3])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Test if less than or equal + uint32x4_t ivTemp1 = vcleq_f32(V, Bounds); + // Negate the bounds + float32x4_t vTemp2 = vnegq_f32(Bounds); + // Test if greater or equal (Reversed) + uint32x4_t ivTemp2 = vcleq_f32(vTemp2, V); + // Blend answers + ivTemp1 = vandq_u32(ivTemp1, ivTemp2); + // in bounds? + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(ivTemp1)), vget_high_u8(vreinterpretq_u8_u32(ivTemp1))); + uint16x4x2_t vTemp3 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return (vget_lane_u32(vreinterpret_u32_u16(vTemp3.val[1]), 1) == 0xFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + // Test if less than or equal + XMVECTOR vTemp1 = _mm_cmple_ps(V, Bounds); + // Negate the bounds + XMVECTOR vTemp2 = _mm_mul_ps(Bounds, g_XMNegativeOne); + // Test if greater or equal (Reversed) + vTemp2 = _mm_cmple_ps(vTemp2, V); + // Blend answers + vTemp1 = _mm_and_ps(vTemp1, vTemp2); + // All in bounds? + return ((_mm_movemask_ps(vTemp1) == 0x0f) != 0); +#else + return XMComparisonAllInBounds(XMVector4InBoundsR(V, Bounds)); +#endif +} + +//------------------------------------------------------------------------------ + +#if !defined(_XM_NO_INTRINSICS_) && defined(_MSC_VER) && !defined(__clang__) && !defined(__INTEL_COMPILER) +#pragma float_control(push) +#pragma float_control(precise, on) +#endif + +inline bool XM_CALLCONV XMVector4IsNaN(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + return (XMISNAN(V.vector4_f32[0]) || + XMISNAN(V.vector4_f32[1]) || + XMISNAN(V.vector4_f32[2]) || + XMISNAN(V.vector4_f32[3])); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Test against itself. NaN is always not equal + uint32x4_t vTempNan = vceqq_f32(V, V); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vTempNan)), vget_high_u8(vreinterpretq_u8_u32(vTempNan))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + // If any are NaN, the mask is zero + return (vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) != 0xFFFFFFFFU); +#elif defined(_XM_SSE_INTRINSICS_) + // Test against itself. 
NaN is always not equal + XMVECTOR vTempNan = _mm_cmpneq_ps(V, V); + // If any are NaN, the mask is non-zero + return (_mm_movemask_ps(vTempNan) != 0); +#endif +} + +#if !defined(_XM_NO_INTRINSICS_) && defined(_MSC_VER) && !defined(__clang__) && !defined(__INTEL_COMPILER) +#pragma float_control(pop) +#endif + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector4IsInfinite(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + return (XMISINF(V.vector4_f32[0]) || + XMISINF(V.vector4_f32[1]) || + XMISINF(V.vector4_f32[2]) || + XMISINF(V.vector4_f32[3])); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Mask off the sign bit + uint32x4_t vTempInf = vandq_u32(vreinterpretq_u32_f32(V), g_XMAbsMask); + // Compare to infinity + vTempInf = vceqq_f32(vreinterpretq_f32_u32(vTempInf), g_XMInfinity); + // If any are infinity, the signs are true. + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vTempInf)), vget_high_u8(vreinterpretq_u8_u32(vTempInf))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + return (vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) != 0); +#elif defined(_XM_SSE_INTRINSICS_) + // Mask off the sign bit + XMVECTOR vTemp = _mm_and_ps(V, g_XMAbsMask); + // Compare to infinity + vTemp = _mm_cmpeq_ps(vTemp, g_XMInfinity); + // If any are infinity, the signs are true. + return (_mm_movemask_ps(vTemp) != 0); +#endif +} + +//------------------------------------------------------------------------------ +// Computation operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4Dot +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORF32 Result; + Result.f[0] = + Result.f[1] = + Result.f[2] = + Result.f[3] = V1.vector4_f32[0] * V2.vector4_f32[0] + V1.vector4_f32[1] * V2.vector4_f32[1] + V1.vector4_f32[2] * V2.vector4_f32[2] + V1.vector4_f32[3] * V2.vector4_f32[3]; + return Result.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t vTemp = vmulq_f32(V1, V2); + float32x2_t v1 = vget_low_f32(vTemp); + float32x2_t v2 = vget_high_f32(vTemp); + v1 = vadd_f32(v1, v2); + v1 = vpadd_f32(v1, v1); + return vcombine_f32(v1, v1); +#elif defined(_XM_SSE4_INTRINSICS_) + return _mm_dp_ps(V1, V2, 0xff); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vTemp = _mm_mul_ps(V1, V2); + vTemp = _mm_hadd_ps(vTemp, vTemp); + return _mm_hadd_ps(vTemp, vTemp); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp2 = V2; + XMVECTOR vTemp = _mm_mul_ps(V1, vTemp2); + vTemp2 = _mm_shuffle_ps(vTemp2, vTemp, _MM_SHUFFLE(1, 0, 0, 0)); // Copy X to the Z position and Y to the W position + vTemp2 = _mm_add_ps(vTemp2, vTemp); // Add Z = X+Z; W = Y+W; + vTemp = _mm_shuffle_ps(vTemp, vTemp2, _MM_SHUFFLE(0, 3, 0, 0)); // Copy W to the Z position + vTemp = _mm_add_ps(vTemp, vTemp2); // Add Z and W together + return XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(2, 2, 2, 2)); // Splat Z and return +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4Cross +( + FXMVECTOR V1, + FXMVECTOR V2, + FXMVECTOR V3 +) noexcept +{ + // [ ((v2.z*v3.w-v2.w*v3.z)*v1.y)-((v2.y*v3.w-v2.w*v3.y)*v1.z)+((v2.y*v3.z-v2.z*v3.y)*v1.w), + // ((v2.w*v3.z-v2.z*v3.w)*v1.x)-((v2.w*v3.x-v2.x*v3.w)*v1.z)+((v2.z*v3.x-v2.x*v3.z)*v1.w), + // 
((v2.y*v3.w-v2.w*v3.y)*v1.x)-((v2.x*v3.w-v2.w*v3.x)*v1.y)+((v2.x*v3.y-v2.y*v3.x)*v1.w), + // ((v2.z*v3.y-v2.y*v3.z)*v1.x)-((v2.z*v3.x-v2.x*v3.z)*v1.y)+((v2.y*v3.x-v2.x*v3.y)*v1.z) ] + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORF32 Result = { { { + (((V2.vector4_f32[2] * V3.vector4_f32[3]) - (V2.vector4_f32[3] * V3.vector4_f32[2])) * V1.vector4_f32[1]) - (((V2.vector4_f32[1] * V3.vector4_f32[3]) - (V2.vector4_f32[3] * V3.vector4_f32[1])) * V1.vector4_f32[2]) + (((V2.vector4_f32[1] * V3.vector4_f32[2]) - (V2.vector4_f32[2] * V3.vector4_f32[1])) * V1.vector4_f32[3]), + (((V2.vector4_f32[3] * V3.vector4_f32[2]) - (V2.vector4_f32[2] * V3.vector4_f32[3])) * V1.vector4_f32[0]) - (((V2.vector4_f32[3] * V3.vector4_f32[0]) - (V2.vector4_f32[0] * V3.vector4_f32[3])) * V1.vector4_f32[2]) + (((V2.vector4_f32[2] * V3.vector4_f32[0]) - (V2.vector4_f32[0] * V3.vector4_f32[2])) * V1.vector4_f32[3]), + (((V2.vector4_f32[1] * V3.vector4_f32[3]) - (V2.vector4_f32[3] * V3.vector4_f32[1])) * V1.vector4_f32[0]) - (((V2.vector4_f32[0] * V3.vector4_f32[3]) - (V2.vector4_f32[3] * V3.vector4_f32[0])) * V1.vector4_f32[1]) + (((V2.vector4_f32[0] * V3.vector4_f32[1]) - (V2.vector4_f32[1] * V3.vector4_f32[0])) * V1.vector4_f32[3]), + (((V2.vector4_f32[2] * V3.vector4_f32[1]) - (V2.vector4_f32[1] * V3.vector4_f32[2])) * V1.vector4_f32[0]) - (((V2.vector4_f32[2] * V3.vector4_f32[0]) - (V2.vector4_f32[0] * V3.vector4_f32[2])) * V1.vector4_f32[1]) + (((V2.vector4_f32[1] * V3.vector4_f32[0]) - (V2.vector4_f32[0] * V3.vector4_f32[1])) * V1.vector4_f32[2]), + } } }; + return Result.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + const uint32x2_t select = vget_low_u32(g_XMMaskX); + + // Term1: V2zwyz * V3wzwy + const float32x2_t v2xy = vget_low_f32(V2); + const float32x2_t v2zw = vget_high_f32(V2); + const float32x2_t v2yx = vrev64_f32(v2xy); + const float32x2_t v2wz = vrev64_f32(v2zw); + const float32x2_t v2yz = vbsl_f32(select, v2yx, v2wz); + + const float32x2_t v3zw = vget_high_f32(V3); + const float32x2_t v3wz = vrev64_f32(v3zw); + const float32x2_t v3xy = vget_low_f32(V3); + const float32x2_t v3wy = vbsl_f32(select, v3wz, v3xy); + + float32x4_t vTemp1 = vcombine_f32(v2zw, v2yz); + float32x4_t vTemp2 = vcombine_f32(v3wz, v3wy); + XMVECTOR vResult = vmulq_f32(vTemp1, vTemp2); + + // - V2wzwy * V3zwyz + const float32x2_t v2wy = vbsl_f32(select, v2wz, v2xy); + + const float32x2_t v3yx = vrev64_f32(v3xy); + const float32x2_t v3yz = vbsl_f32(select, v3yx, v3wz); + + vTemp1 = vcombine_f32(v2wz, v2wy); + vTemp2 = vcombine_f32(v3zw, v3yz); + vResult = vmlsq_f32(vResult, vTemp1, vTemp2); + + // term1 * V1yxxx + const float32x2_t v1xy = vget_low_f32(V1); + const float32x2_t v1yx = vrev64_f32(v1xy); + + vTemp1 = vcombine_f32(v1yx, vdup_lane_f32(v1yx, 1)); + vResult = vmulq_f32(vResult, vTemp1); + + // Term2: V2ywxz * V3wxwx + const float32x2_t v2yw = vrev64_f32(v2wy); + const float32x2_t v2xz = vbsl_f32(select, v2xy, v2wz); + + const float32x2_t v3wx = vbsl_f32(select, v3wz, v3yx); + + vTemp1 = vcombine_f32(v2yw, v2xz); + vTemp2 = vcombine_f32(v3wx, v3wx); + float32x4_t vTerm = vmulq_f32(vTemp1, vTemp2); + + // - V2wxwx * V3ywxz + const float32x2_t v2wx = vbsl_f32(select, v2wz, v2yx); + + const float32x2_t v3yw = vrev64_f32(v3wy); + const float32x2_t v3xz = vbsl_f32(select, v3xy, v3wz); + + vTemp1 = vcombine_f32(v2wx, v2wx); + vTemp2 = vcombine_f32(v3yw, v3xz); + vTerm = vmlsq_f32(vTerm, vTemp1, vTemp2); + + // vResult - term2 * V1zzyy + const float32x2_t v1zw = vget_high_f32(V1); + + vTemp1 = vcombine_f32(vdup_lane_f32(v1zw, 0), 
vdup_lane_f32(v1yx, 0)); + vResult = vmlsq_f32(vResult, vTerm, vTemp1); + + // Term3: V2yzxy * V3zxyx + const float32x2_t v3zx = vrev64_f32(v3xz); + + vTemp1 = vcombine_f32(v2yz, v2xy); + vTemp2 = vcombine_f32(v3zx, v3yx); + vTerm = vmulq_f32(vTemp1, vTemp2); + + // - V2zxyx * V3yzxy + const float32x2_t v2zx = vrev64_f32(v2xz); + + vTemp1 = vcombine_f32(v2zx, v2yx); + vTemp2 = vcombine_f32(v3yz, v3xy); + vTerm = vmlsq_f32(vTerm, vTemp1, vTemp2); + + // vResult + term3 * V1wwwz + const float32x2_t v1wz = vrev64_f32(v1zw); + + vTemp1 = vcombine_f32(vdup_lane_f32(v1wz, 0), v1wz); + return vmlaq_f32(vResult, vTerm, vTemp1); +#elif defined(_XM_SSE_INTRINSICS_) + // V2zwyz * V3wzwy + XMVECTOR vResult = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 1, 3, 2)); + XMVECTOR vTemp3 = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 3, 2, 3)); + vResult = _mm_mul_ps(vResult, vTemp3); + // - V2wzwy * V3zwyz + XMVECTOR vTemp2 = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 3, 2, 3)); + vTemp3 = XM_PERMUTE_PS(vTemp3, _MM_SHUFFLE(1, 3, 0, 1)); + vResult = XM_FNMADD_PS(vTemp2, vTemp3, vResult); + // term1 * V1yxxx + XMVECTOR vTemp1 = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 1)); + vResult = _mm_mul_ps(vResult, vTemp1); + + // V2ywxz * V3wxwx + vTemp2 = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 0, 3, 1)); + vTemp3 = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 3, 0, 3)); + vTemp3 = _mm_mul_ps(vTemp3, vTemp2); + // - V2wxwx * V3ywxz + vTemp2 = XM_PERMUTE_PS(vTemp2, _MM_SHUFFLE(2, 1, 2, 1)); + vTemp1 = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 0, 3, 1)); + vTemp3 = XM_FNMADD_PS(vTemp2, vTemp1, vTemp3); + // vResult - temp * V1zzyy + vTemp1 = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 2, 2)); + vResult = XM_FNMADD_PS(vTemp1, vTemp3, vResult); + + // V2yzxy * V3zxyx + vTemp2 = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 0, 2, 1)); + vTemp3 = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 1, 0, 2)); + vTemp3 = _mm_mul_ps(vTemp3, vTemp2); + // - V2zxyx * V3yzxy + vTemp2 = XM_PERMUTE_PS(vTemp2, _MM_SHUFFLE(2, 0, 2, 1)); + vTemp1 = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 0, 2, 1)); + vTemp3 = XM_FNMADD_PS(vTemp1, vTemp2, vTemp3); + // vResult + term * V1wwwz + vTemp1 = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 3, 3, 3)); + vResult = XM_FMADD_PS(vTemp3, vTemp1, vResult); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4LengthSq(FXMVECTOR V) noexcept +{ + return XMVector4Dot(V, V); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4ReciprocalLengthEst(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result = XMVector4LengthSq(V); + Result = XMVectorReciprocalSqrtEst(Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot4 + float32x4_t vTemp = vmulq_f32(V, V); + float32x2_t v1 = vget_low_f32(vTemp); + float32x2_t v2 = vget_high_f32(vTemp); + v1 = vadd_f32(v1, v2); + v1 = vpadd_f32(v1, v1); + // Reciprocal sqrt (estimate) + v2 = vrsqrte_f32(v1); + return vcombine_f32(v2, v2); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps(V, V, 0xff); + return _mm_rsqrt_ps(vTemp); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_rsqrt_ps(vLengthSq); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y,z and w + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + // vTemp has z and w + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, 
_MM_SHUFFLE(3, 2, 3, 2)); + // x+z, y+w + vLengthSq = _mm_add_ps(vLengthSq, vTemp); + // x+z,x+z,x+z,y+w + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 0, 0, 0)); + // ??,??,y+w,y+w + vTemp = _mm_shuffle_ps(vTemp, vLengthSq, _MM_SHUFFLE(3, 3, 0, 0)); + // ??,??,x+z+y+w,?? + vLengthSq = _mm_add_ps(vLengthSq, vTemp); + // Splat the length + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(2, 2, 2, 2)); + // Get the reciprocal + vLengthSq = _mm_rsqrt_ps(vLengthSq); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4ReciprocalLength(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result = XMVector4LengthSq(V); + Result = XMVectorReciprocalSqrt(Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot4 + float32x4_t vTemp = vmulq_f32(V, V); + float32x2_t v1 = vget_low_f32(vTemp); + float32x2_t v2 = vget_high_f32(vTemp); + v1 = vadd_f32(v1, v2); + v1 = vpadd_f32(v1, v1); + // Reciprocal sqrt + float32x2_t S0 = vrsqrte_f32(v1); + float32x2_t P0 = vmul_f32(v1, S0); + float32x2_t R0 = vrsqrts_f32(P0, S0); + float32x2_t S1 = vmul_f32(S0, R0); + float32x2_t P1 = vmul_f32(v1, S1); + float32x2_t R1 = vrsqrts_f32(P1, S1); + float32x2_t Result = vmul_f32(S1, R1); + return vcombine_f32(Result, Result); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps(V, V, 0xff); + XMVECTOR vLengthSq = _mm_sqrt_ps(vTemp); + return _mm_div_ps(g_XMOne, vLengthSq); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_sqrt_ps(vLengthSq); + vLengthSq = _mm_div_ps(g_XMOne, vLengthSq); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y,z and w + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + // vTemp has z and w + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(3, 2, 3, 2)); + // x+z, y+w + vLengthSq = _mm_add_ps(vLengthSq, vTemp); + // x+z,x+z,x+z,y+w + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 0, 0, 0)); + // ??,??,y+w,y+w + vTemp = _mm_shuffle_ps(vTemp, vLengthSq, _MM_SHUFFLE(3, 3, 0, 0)); + // ??,??,x+z+y+w,?? + vLengthSq = _mm_add_ps(vLengthSq, vTemp); + // Splat the length + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(2, 2, 2, 2)); + // Get the reciprocal + vLengthSq = _mm_sqrt_ps(vLengthSq); + // Accurate! 
+ vLengthSq = _mm_div_ps(g_XMOne, vLengthSq); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4LengthEst(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result = XMVector4LengthSq(V); + Result = XMVectorSqrtEst(Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot4 + float32x4_t vTemp = vmulq_f32(V, V); + float32x2_t v1 = vget_low_f32(vTemp); + float32x2_t v2 = vget_high_f32(vTemp); + v1 = vadd_f32(v1, v2); + v1 = vpadd_f32(v1, v1); + const float32x2_t zero = vdup_n_f32(0); + uint32x2_t VEqualsZero = vceq_f32(v1, zero); + // Sqrt (estimate) + float32x2_t Result = vrsqrte_f32(v1); + Result = vmul_f32(v1, Result); + Result = vbsl_f32(VEqualsZero, zero, Result); + return vcombine_f32(Result, Result); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps(V, V, 0xff); + return _mm_sqrt_ps(vTemp); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_sqrt_ps(vLengthSq); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y,z and w + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + // vTemp has z and w + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(3, 2, 3, 2)); + // x+z, y+w + vLengthSq = _mm_add_ps(vLengthSq, vTemp); + // x+z,x+z,x+z,y+w + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 0, 0, 0)); + // ??,??,y+w,y+w + vTemp = _mm_shuffle_ps(vTemp, vLengthSq, _MM_SHUFFLE(3, 3, 0, 0)); + // ??,??,x+z+y+w,?? + vLengthSq = _mm_add_ps(vLengthSq, vTemp); + // Splat the length + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(2, 2, 2, 2)); + // Get the length + vLengthSq = _mm_sqrt_ps(vLengthSq); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4Length(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result = XMVector4LengthSq(V); + Result = XMVectorSqrt(Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot4 + float32x4_t vTemp = vmulq_f32(V, V); + float32x2_t v1 = vget_low_f32(vTemp); + float32x2_t v2 = vget_high_f32(vTemp); + v1 = vadd_f32(v1, v2); + v1 = vpadd_f32(v1, v1); + const float32x2_t zero = vdup_n_f32(0); + uint32x2_t VEqualsZero = vceq_f32(v1, zero); + // Sqrt + float32x2_t S0 = vrsqrte_f32(v1); + float32x2_t P0 = vmul_f32(v1, S0); + float32x2_t R0 = vrsqrts_f32(P0, S0); + float32x2_t S1 = vmul_f32(S0, R0); + float32x2_t P1 = vmul_f32(v1, S1); + float32x2_t R1 = vrsqrts_f32(P1, S1); + float32x2_t Result = vmul_f32(S1, R1); + Result = vmul_f32(v1, Result); + Result = vbsl_f32(VEqualsZero, zero, Result); + return vcombine_f32(Result, Result); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps(V, V, 0xff); + return _mm_sqrt_ps(vTemp); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_sqrt_ps(vLengthSq); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y,z and w + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + // vTemp has z and w + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(3, 2, 3, 2)); + // x+z, y+w + vLengthSq = _mm_add_ps(vLengthSq, vTemp); + // x+z,x+z,x+z,y+w + 
vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 0, 0, 0)); + // ??,??,y+w,y+w + vTemp = _mm_shuffle_ps(vTemp, vLengthSq, _MM_SHUFFLE(3, 3, 0, 0)); + // ??,??,x+z+y+w,?? + vLengthSq = _mm_add_ps(vLengthSq, vTemp); + // Splat the length + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(2, 2, 2, 2)); + // Get the length + vLengthSq = _mm_sqrt_ps(vLengthSq); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ +// XMVector4NormalizeEst uses a reciprocal estimate and +// returns QNaN on zero and infinite vectors. + +inline XMVECTOR XM_CALLCONV XMVector4NormalizeEst(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result = XMVector4ReciprocalLength(V); + Result = XMVectorMultiply(V, Result); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot4 + float32x4_t vTemp = vmulq_f32(V, V); + float32x2_t v1 = vget_low_f32(vTemp); + float32x2_t v2 = vget_high_f32(vTemp); + v1 = vadd_f32(v1, v2); + v1 = vpadd_f32(v1, v1); + // Reciprocal sqrt (estimate) + v2 = vrsqrte_f32(v1); + // Normalize + return vmulq_f32(V, vcombine_f32(v2, v2)); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps(V, V, 0xff); + XMVECTOR vResult = _mm_rsqrt_ps(vTemp); + return _mm_mul_ps(vResult, V); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vDot = _mm_mul_ps(V, V); + vDot = _mm_hadd_ps(vDot, vDot); + vDot = _mm_hadd_ps(vDot, vDot); + vDot = _mm_rsqrt_ps(vDot); + vDot = _mm_mul_ps(vDot, V); + return vDot; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y,z and w + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + // vTemp has z and w + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(3, 2, 3, 2)); + // x+z, y+w + vLengthSq = _mm_add_ps(vLengthSq, vTemp); + // x+z,x+z,x+z,y+w + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 0, 0, 0)); + // ??,??,y+w,y+w + vTemp = _mm_shuffle_ps(vTemp, vLengthSq, _MM_SHUFFLE(3, 3, 0, 0)); + // ??,??,x+z+y+w,?? 
+ vLengthSq = _mm_add_ps(vLengthSq, vTemp); + // Splat the length + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(2, 2, 2, 2)); + // Get the reciprocal + XMVECTOR vResult = _mm_rsqrt_ps(vLengthSq); + // Reciprocal mul to perform the normalization + vResult = _mm_mul_ps(vResult, V); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4Normalize(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + float fLength; + XMVECTOR vResult; + + vResult = XMVector4Length(V); + fLength = vResult.vector4_f32[0]; + + // Prevent divide by zero + if (fLength > 0) + { + fLength = 1.0f / fLength; + } + + vResult.vector4_f32[0] = V.vector4_f32[0] * fLength; + vResult.vector4_f32[1] = V.vector4_f32[1] * fLength; + vResult.vector4_f32[2] = V.vector4_f32[2] * fLength; + vResult.vector4_f32[3] = V.vector4_f32[3] * fLength; + return vResult; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot4 + float32x4_t vTemp = vmulq_f32(V, V); + float32x2_t v1 = vget_low_f32(vTemp); + float32x2_t v2 = vget_high_f32(vTemp); + v1 = vadd_f32(v1, v2); + v1 = vpadd_f32(v1, v1); + uint32x2_t VEqualsZero = vceq_f32(v1, vdup_n_f32(0)); + uint32x2_t VEqualsInf = vceq_f32(v1, vget_low_f32(g_XMInfinity)); + // Reciprocal sqrt (2 iterations of Newton-Raphson) + float32x2_t S0 = vrsqrte_f32(v1); + float32x2_t P0 = vmul_f32(v1, S0); + float32x2_t R0 = vrsqrts_f32(P0, S0); + float32x2_t S1 = vmul_f32(S0, R0); + float32x2_t P1 = vmul_f32(v1, S1); + float32x2_t R1 = vrsqrts_f32(P1, S1); + v2 = vmul_f32(S1, R1); + // Normalize + XMVECTOR vResult = vmulq_f32(V, vcombine_f32(v2, v2)); + vResult = vbslq_f32(vcombine_u32(VEqualsZero, VEqualsZero), vdupq_n_f32(0), vResult); + return vbslq_f32(vcombine_u32(VEqualsInf, VEqualsInf), g_XMQNaN, vResult); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vLengthSq = _mm_dp_ps(V, V, 0xff); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Create zero with a single instruction + XMVECTOR vZeroMask = _mm_setzero_ps(); + // Test for a divide by zero (Must be FP to detect -0.0) + vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity); + // Divide to perform the normalization + vResult = _mm_div_ps(V, vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult, vZeroMask); + // Select qnan or result based on infinite length + XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq, g_XMQNaN); + XMVECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq); + vResult = _mm_or_ps(vTemp1, vTemp2); + return vResult; +#elif defined(_XM_SSE3_INTRINSICS_) + // Perform the dot product on x,y,z and w + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Create zero with a single instruction + XMVECTOR vZeroMask = _mm_setzero_ps(); + // Test for a divide by zero (Must be FP to detect -0.0) + vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity); + // Divide to perform the normalization + vResult = _mm_div_ps(V, vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult, vZeroMask); + // Select qnan or result 
based on infinite length + XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq, g_XMQNaN); + XMVECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq); + vResult = _mm_or_ps(vTemp1, vTemp2); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y,z and w + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + // vTemp has z and w + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(3, 2, 3, 2)); + // x+z, y+w + vLengthSq = _mm_add_ps(vLengthSq, vTemp); + // x+z,x+z,x+z,y+w + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 0, 0, 0)); + // ??,??,y+w,y+w + vTemp = _mm_shuffle_ps(vTemp, vLengthSq, _MM_SHUFFLE(3, 3, 0, 0)); + // ??,??,x+z+y+w,?? + vLengthSq = _mm_add_ps(vLengthSq, vTemp); + // Splat the length + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(2, 2, 2, 2)); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Create zero with a single instruction + XMVECTOR vZeroMask = _mm_setzero_ps(); + // Test for a divide by zero (Must be FP to detect -0.0) + vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity); + // Divide to perform the normalization + vResult = _mm_div_ps(V, vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult, vZeroMask); + // Select qnan or result based on infinite length + XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq, g_XMQNaN); + XMVECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq); + vResult = _mm_or_ps(vTemp1, vTemp2); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4ClampLength +( + FXMVECTOR V, + float LengthMin, + float LengthMax +) noexcept +{ + XMVECTOR ClampMax = XMVectorReplicate(LengthMax); + XMVECTOR ClampMin = XMVectorReplicate(LengthMin); + + return XMVector4ClampLengthV(V, ClampMin, ClampMax); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4ClampLengthV +( + FXMVECTOR V, + FXMVECTOR LengthMin, + FXMVECTOR LengthMax +) noexcept +{ + assert((XMVectorGetY(LengthMin) == XMVectorGetX(LengthMin)) && (XMVectorGetZ(LengthMin) == XMVectorGetX(LengthMin)) && (XMVectorGetW(LengthMin) == XMVectorGetX(LengthMin))); + assert((XMVectorGetY(LengthMax) == XMVectorGetX(LengthMax)) && (XMVectorGetZ(LengthMax) == XMVectorGetX(LengthMax)) && (XMVectorGetW(LengthMax) == XMVectorGetX(LengthMax))); + assert(XMVector4GreaterOrEqual(LengthMin, XMVectorZero())); + assert(XMVector4GreaterOrEqual(LengthMax, XMVectorZero())); + assert(XMVector4GreaterOrEqual(LengthMax, LengthMin)); + + XMVECTOR LengthSq = XMVector4LengthSq(V); + + const XMVECTOR Zero = XMVectorZero(); + + XMVECTOR RcpLength = XMVectorReciprocalSqrt(LengthSq); + + XMVECTOR InfiniteLength = XMVectorEqualInt(LengthSq, g_XMInfinity.v); + XMVECTOR ZeroLength = XMVectorEqual(LengthSq, Zero); + + XMVECTOR Normal = XMVectorMultiply(V, RcpLength); + + XMVECTOR Length = XMVectorMultiply(LengthSq, RcpLength); + + XMVECTOR Select = XMVectorEqualInt(InfiniteLength, ZeroLength); + Length = XMVectorSelect(LengthSq, Length, Select); + Normal = XMVectorSelect(LengthSq, Normal, Select); + + XMVECTOR ControlMax = XMVectorGreater(Length, LengthMax); + XMVECTOR ControlMin = XMVectorLess(Length, LengthMin); + + XMVECTOR ClampLength = XMVectorSelect(Length, LengthMax, ControlMax); + ClampLength = XMVectorSelect(ClampLength, LengthMin, ControlMin); + + 
XMVECTOR Result = XMVectorMultiply(Normal, ClampLength); + + // Preserve the original vector (with no precision loss) if the length falls within the given range + XMVECTOR Control = XMVectorEqualInt(ControlMax, ControlMin); + Result = XMVectorSelect(Result, V, Control); + + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4Reflect +( + FXMVECTOR Incident, + FXMVECTOR Normal +) noexcept +{ + // Result = Incident - (2 * dot(Incident, Normal)) * Normal + + XMVECTOR Result = XMVector4Dot(Incident, Normal); + Result = XMVectorAdd(Result, Result); + Result = XMVectorNegativeMultiplySubtract(Result, Normal, Incident); + + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4Refract +( + FXMVECTOR Incident, + FXMVECTOR Normal, + float RefractionIndex +) noexcept +{ + XMVECTOR Index = XMVectorReplicate(RefractionIndex); + return XMVector4RefractV(Incident, Normal, Index); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4RefractV +( + FXMVECTOR Incident, + FXMVECTOR Normal, + FXMVECTOR RefractionIndex +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR IDotN; + XMVECTOR R; + const XMVECTOR Zero = XMVectorZero(); + + // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) + + // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal)))) + + IDotN = XMVector4Dot(Incident, Normal); + + // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) + R = XMVectorNegativeMultiplySubtract(IDotN, IDotN, g_XMOne.v); + R = XMVectorMultiply(R, RefractionIndex); + R = XMVectorNegativeMultiplySubtract(R, RefractionIndex, g_XMOne.v); + + if (XMVector4LessOrEqual(R, Zero)) + { + // Total internal reflection + return Zero; + } + else + { + XMVECTOR Result; + + // R = RefractionIndex * IDotN + sqrt(R) + R = XMVectorSqrt(R); + R = XMVectorMultiplyAdd(RefractionIndex, IDotN, R); + + // Result = RefractionIndex * Incident - Normal * R + Result = XMVectorMultiply(RefractionIndex, Incident); + Result = XMVectorNegativeMultiplySubtract(Normal, R, Result); + + return Result; + } + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR IDotN = XMVector4Dot(Incident, Normal); + + // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) + float32x4_t R = vmlsq_f32(g_XMOne, IDotN, IDotN); + R = vmulq_f32(R, RefractionIndex); + R = vmlsq_f32(g_XMOne, R, RefractionIndex); + + uint32x4_t isrzero = vcleq_f32(R, g_XMZero); + uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(isrzero)), vget_high_u8(vreinterpretq_u8_u32(isrzero))); + uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1])); + + float32x4_t vResult; + if (vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) == 0xFFFFFFFFU) + { + // Total internal reflection + vResult = g_XMZero; + } + else + { + // Sqrt(R) + float32x4_t S0 = vrsqrteq_f32(R); + float32x4_t P0 = vmulq_f32(R, S0); + float32x4_t R0 = vrsqrtsq_f32(P0, S0); + float32x4_t S1 = vmulq_f32(S0, R0); + float32x4_t P1 = vmulq_f32(R, S1); + float32x4_t R1 = vrsqrtsq_f32(P1, S1); + float32x4_t S2 = vmulq_f32(S1, R1); + R = vmulq_f32(R, S2); + // R = RefractionIndex * IDotN + sqrt(R) + R = vmlaq_f32(R, RefractionIndex, IDotN); + // Result = RefractionIndex * Incident - Normal * R + vResult = 
vmulq_f32(RefractionIndex, Incident); + vResult = vmlsq_f32(vResult, R, Normal); + } + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR IDotN = XMVector4Dot(Incident, Normal); + + // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) + XMVECTOR R = XM_FNMADD_PS(IDotN, IDotN, g_XMOne); + XMVECTOR R2 = _mm_mul_ps(RefractionIndex, RefractionIndex); + R = XM_FNMADD_PS(R, R2, g_XMOne); + + XMVECTOR vResult = _mm_cmple_ps(R, g_XMZero); + if (_mm_movemask_ps(vResult) == 0x0f) + { + // Total internal reflection + vResult = g_XMZero; + } + else + { + // R = RefractionIndex * IDotN + sqrt(R) + R = _mm_sqrt_ps(R); + R = XM_FMADD_PS(RefractionIndex, IDotN, R); + // Result = RefractionIndex * Incident - Normal * R + vResult = _mm_mul_ps(RefractionIndex, Incident); + vResult = XM_FNMADD_PS(R, Normal, vResult); + } + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4Orthogonal(FXMVECTOR V) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORF32 Result = { { { + V.vector4_f32[2], + V.vector4_f32[3], + -V.vector4_f32[0], + -V.vector4_f32[1] + } } }; + return Result.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 Negate = { { { 1.f, 1.f, -1.f, -1.f } } }; + + float32x4_t Result = vcombine_f32(vget_high_f32(V), vget_low_f32(V)); + return vmulq_f32(Result, Negate); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 FlipZW = { { { 1.0f, 1.0f, -1.0f, -1.0f } } }; + XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 0, 3, 2)); + vResult = _mm_mul_ps(vResult, FlipZW); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4AngleBetweenNormalsEst +( + FXMVECTOR N1, + FXMVECTOR N2 +) noexcept +{ + XMVECTOR Result = XMVector4Dot(N1, N2); + Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v); + Result = XMVectorACosEst(Result); + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4AngleBetweenNormals +( + FXMVECTOR N1, + FXMVECTOR N2 +) noexcept +{ + XMVECTOR Result = XMVector4Dot(N1, N2); + Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v); + Result = XMVectorACos(Result); + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4AngleBetweenVectors +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ + XMVECTOR L1 = XMVector4ReciprocalLength(V1); + XMVECTOR L2 = XMVector4ReciprocalLength(V2); + + XMVECTOR Dot = XMVector4Dot(V1, V2); + + L1 = XMVectorMultiply(L1, L2); + + XMVECTOR CosAngle = XMVectorMultiply(Dot, L1); + CosAngle = XMVectorClamp(CosAngle, g_XMNegativeOne.v, g_XMOne.v); + + return XMVectorACos(CosAngle); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4Transform +( + FXMVECTOR V, + FXMMATRIX M +) noexcept +{ +#if defined(_XM_NO_INTRINSICS_) + + float fX = (M.m[0][0] * V.vector4_f32[0]) + (M.m[1][0] * V.vector4_f32[1]) + (M.m[2][0] * V.vector4_f32[2]) + (M.m[3][0] * V.vector4_f32[3]); + float fY = (M.m[0][1] * V.vector4_f32[0]) + (M.m[1][1] * V.vector4_f32[1]) + (M.m[2][1] * V.vector4_f32[2]) + (M.m[3][1] * V.vector4_f32[3]); + float fZ = (M.m[0][2] * V.vector4_f32[0]) + (M.m[1][2] * V.vector4_f32[1]) + (M.m[2][2] * V.vector4_f32[2]) + (M.m[3][2] * V.vector4_f32[3]); + 
    float fW = (M.m[0][3] * V.vector4_f32[0]) + (M.m[1][3] * V.vector4_f32[1]) + (M.m[2][3] * V.vector4_f32[2]) + (M.m[3][3] * V.vector4_f32[3]);
+    XMVECTORF32 vResult = { { { fX, fY, fZ, fW } } };
+    return vResult.v;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x2_t VL = vget_low_f32(V);
+    XMVECTOR vResult = vmulq_lane_f32(M.r[0], VL, 0); // X
+    vResult = vmlaq_lane_f32(vResult, M.r[1], VL, 1); // Y
+    float32x2_t VH = vget_high_f32(V);
+    vResult = vmlaq_lane_f32(vResult, M.r[2], VH, 0); // Z
+    return vmlaq_lane_f32(vResult, M.r[3], VH, 1); // W
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3)); // W
+    vResult = _mm_mul_ps(vResult, M.r[3]);
+    XMVECTOR vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); // Z
+    vResult = XM_FMADD_PS(vTemp, M.r[2], vResult);
+    vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); // Y
+    vResult = XM_FMADD_PS(vTemp, M.r[1], vResult);
+    vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); // X
+    vResult = XM_FMADD_PS(vTemp, M.r[0], vResult);
+    return vResult;
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMFLOAT4* XM_CALLCONV XMVector4TransformStream
+(
+    XMFLOAT4* pOutputStream,
+    size_t OutputStride,
+    const XMFLOAT4* pInputStream,
+    size_t InputStride,
+    size_t VectorCount,
+    FXMMATRIX M
+) noexcept
+{
+    assert(pOutputStream != nullptr);
+    assert(pInputStream != nullptr);
+
+    assert(InputStride >= sizeof(XMFLOAT4));
+    _Analysis_assume_(InputStride >= sizeof(XMFLOAT4));
+
+    assert(OutputStride >= sizeof(XMFLOAT4));
+    _Analysis_assume_(OutputStride >= sizeof(XMFLOAT4));
+
+#if defined(_XM_NO_INTRINSICS_)
+
+    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
+    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
+
+    const XMVECTOR row0 = M.r[0];
+    const XMVECTOR row1 = M.r[1];
+    const XMVECTOR row2 = M.r[2];
+    const XMVECTOR row3 = M.r[3];
+
+    for (size_t i = 0; i < VectorCount; i++)
+    {
+        XMVECTOR V = XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pInputVector));
+        XMVECTOR W = XMVectorSplatW(V);
+        XMVECTOR Z = XMVectorSplatZ(V);
+        XMVECTOR Y = XMVectorSplatY(V);
+        XMVECTOR X = XMVectorSplatX(V);
+
+        XMVECTOR Result = XMVectorMultiply(W, row3);
+        Result = XMVectorMultiplyAdd(Z, row2, Result);
+        Result = XMVectorMultiplyAdd(Y, row1, Result);
+        Result = XMVectorMultiplyAdd(X, row0, Result);
+
+#ifdef _PREFAST_
+#pragma prefast(push)
+#pragma prefast(disable : 26015, "PREfast noise: Esp:1307" )
+#endif
+
+        XMStoreFloat4(reinterpret_cast<XMFLOAT4*>(pOutputVector), Result);
+
+#ifdef _PREFAST_
+#pragma prefast(pop)
+#endif
+
+        pInputVector += InputStride;
+        pOutputVector += OutputStride;
+    }
+
+    return pOutputStream;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
+    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
+
+    const XMVECTOR row0 = M.r[0];
+    const XMVECTOR row1 = M.r[1];
+    const XMVECTOR row2 = M.r[2];
+    const XMVECTOR row3 = M.r[3];
+
+    size_t i = 0;
+    size_t four = VectorCount >> 2;
+    if (four > 0)
+    {
+        if ((InputStride == sizeof(XMFLOAT4)) && (OutputStride == sizeof(XMFLOAT4)))
+        {
+            for (size_t j = 0; j < four; ++j)
+            {
+                float32x4x4_t V = vld4q_f32(reinterpret_cast<const float*>(pInputVector));
+                pInputVector += sizeof(XMFLOAT4) * 4;
+
+                float32x2_t r = vget_low_f32(row0);
+                XMVECTOR vResult0 = vmulq_lane_f32(V.val[0], r, 0); // Ax
+                XMVECTOR vResult1 = vmulq_lane_f32(V.val[0], r, 1); // Bx
+
+                XM_PREFETCH(pInputVector);
+
+                r = vget_high_f32(row0);
+                XMVECTOR vResult2 = vmulq_lane_f32(V.val[0], r, 0); // Cx
+                XMVECTOR vResult3 = vmulq_lane_f32(V.val[0], r, 1); // Dx
+
+                XM_PREFETCH(pInputVector + XM_CACHE_LINE_SIZE);
+
+                r = vget_low_f32(row1);
+                vResult0 = vmlaq_lane_f32(vResult0, V.val[1], r, 0); // Ax+Ey
+                vResult1 = vmlaq_lane_f32(vResult1, V.val[1], r, 1); // Bx+Fy
+
+                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 2));
+
+                r = vget_high_f32(row1);
+                vResult2 = vmlaq_lane_f32(vResult2, V.val[1], r, 0); // Cx+Gy
+                vResult3 = vmlaq_lane_f32(vResult3, V.val[1], r, 1); // Dx+Hy
+
+                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 3));
+
+                r = vget_low_f32(row2);
+                vResult0 = vmlaq_lane_f32(vResult0, V.val[2], r, 0); // Ax+Ey+Iz
+                vResult1 = vmlaq_lane_f32(vResult1, V.val[2], r, 1); // Bx+Fy+Jz
+
+                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 4));
+
+                r = vget_high_f32(row2);
+                vResult2 = vmlaq_lane_f32(vResult2, V.val[2], r, 0); // Cx+Gy+Kz
+                vResult3 = vmlaq_lane_f32(vResult3, V.val[2], r, 1); // Dx+Hy+Lz
+
+                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 5));
+
+                r = vget_low_f32(row3);
+                vResult0 = vmlaq_lane_f32(vResult0, V.val[3], r, 0); // Ax+Ey+Iz+Mw
+                vResult1 = vmlaq_lane_f32(vResult1, V.val[3], r, 1); // Bx+Fy+Jz+Nw
+
+                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 6));
+
+                r = vget_high_f32(row3);
+                vResult2 = vmlaq_lane_f32(vResult2, V.val[3], r, 0); // Cx+Gy+Kz+Ow
+                vResult3 = vmlaq_lane_f32(vResult3, V.val[3], r, 1); // Dx+Hy+Lz+Pw
+
+                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 7));
+
+                V.val[0] = vResult0;
+                V.val[1] = vResult1;
+                V.val[2] = vResult2;
+                V.val[3] = vResult3;
+
+                vst4q_f32(reinterpret_cast<float*>(pOutputVector), V);
+                pOutputVector += sizeof(XMFLOAT4) * 4;
+
+                i += 4;
+            }
+        }
+    }
+
+    for (; i < VectorCount; i++)
+    {
+        XMVECTOR V = vld1q_f32(reinterpret_cast<const float*>(pInputVector));
+        pInputVector += InputStride;
+
+        float32x2_t VL = vget_low_f32(V);
+        XMVECTOR vResult = vmulq_lane_f32(row0, VL, 0); // X
+        vResult = vmlaq_lane_f32(vResult, row1, VL, 1); // Y
+        float32x2_t VH = vget_high_f32(V);
+        vResult = vmlaq_lane_f32(vResult, row2, VH, 0); // Z
+        vResult = vmlaq_lane_f32(vResult, row3, VH, 1); // W
+
+        vst1q_f32(reinterpret_cast<float*>(pOutputVector), vResult);
+        pOutputVector += OutputStride;
+    }
+
+    return pOutputStream;
+#elif defined(_XM_AVX2_INTRINSICS_)
+    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
+    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
+
+    size_t i = 0;
+    size_t two = VectorCount >> 1;
+    if (two > 0)
+    {
+        __m256 row0 = _mm256_broadcast_ps(&M.r[0]);
+        __m256 row1 = _mm256_broadcast_ps(&M.r[1]);
+        __m256 row2 = _mm256_broadcast_ps(&M.r[2]);
+        __m256 row3 = _mm256_broadcast_ps(&M.r[3]);
+
+        if (InputStride == sizeof(XMFLOAT4))
+        {
+            if (OutputStride == sizeof(XMFLOAT4))
+            {
+                if (!(reinterpret_cast<uintptr_t>(pOutputStream) & 0x1F))
+                {
+                    // Packed input, aligned & packed output
+                    for (size_t j = 0; j < two; ++j)
+                    {
+                        __m256 VV = _mm256_loadu_ps(reinterpret_cast<const float*>(pInputVector));
+                        pInputVector += sizeof(XMFLOAT4) * 2;
+
+                        __m256 vTempX = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0));
+                        __m256 vTempY = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1));
+                        __m256 vTempZ = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2));
+                        __m256 vTempW = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3));
+
+                        vTempX = _mm256_mul_ps(vTempX, row0);
+                        vTempY = _mm256_mul_ps(vTempY, row1);
+                        vTempZ = _mm256_fmadd_ps(vTempZ, row2, vTempX);
+                        vTempW = _mm256_fmadd_ps(vTempW, row3, vTempY);
+                        vTempX = _mm256_add_ps(vTempZ, vTempW);
+
+                        XM256_STREAM_PS(reinterpret_cast<float*>(pOutputVector), vTempX);
+                        pOutputVector += sizeof(XMFLOAT4) * 2;
+
+                        i += 2;
+                    }
+                }
+                else
+                {
+                    // Packed input, packed output
+                    for (size_t j = 0; j < two; ++j)
+                    {
+                        __m256 VV = _mm256_loadu_ps(reinterpret_cast<const float*>(pInputVector));
+                        pInputVector += sizeof(XMFLOAT4) * 2;
+
+                        __m256 vTempX = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0));
+                        __m256 vTempY = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1));
+                        __m256 vTempZ = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2));
+                        __m256 vTempW = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3));
+
+                        vTempX = _mm256_mul_ps(vTempX, row0);
+                        vTempY = _mm256_mul_ps(vTempY, row1);
+                        vTempZ = _mm256_fmadd_ps(vTempZ, row2, vTempX);
+                        vTempW = _mm256_fmadd_ps(vTempW, row3, vTempY);
+                        vTempX = _mm256_add_ps(vTempZ, vTempW);
+
+                        _mm256_storeu_ps(reinterpret_cast<float*>(pOutputVector), vTempX);
+                        pOutputVector += sizeof(XMFLOAT4) * 2;
+
+                        i += 2;
+                    }
+                }
+            }
+            else
+            {
+                // Packed input, unpacked output
+                for (size_t j = 0; j < two; ++j)
+                {
+                    __m256 VV = _mm256_loadu_ps(reinterpret_cast<const float*>(pInputVector));
+                    pInputVector += sizeof(XMFLOAT4) * 2;
+
+                    __m256 vTempX = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0));
+                    __m256 vTempY = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1));
+                    __m256 vTempZ = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2));
+                    __m256 vTempW = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3));
+
+                    vTempX = _mm256_mul_ps(vTempX, row0);
+                    vTempY = _mm256_mul_ps(vTempY, row1);
+                    vTempZ = _mm256_fmadd_ps(vTempZ, row2, vTempX);
+                    vTempW = _mm256_fmadd_ps(vTempW, row3, vTempY);
+                    vTempX = _mm256_add_ps(vTempZ, vTempW);
+
+                    _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), _mm256_castps256_ps128(vTempX));
+                    pOutputVector += OutputStride;
+
+                    _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), _mm256_extractf128_ps(vTempX, 1));
+                    pOutputVector += OutputStride;
+                    i += 2;
+                }
+            }
+        }
+    }
+
+    if (i < VectorCount)
+    {
+        const XMVECTOR row0 = M.r[0];
+        const XMVECTOR row1 = M.r[1];
+        const XMVECTOR row2 = M.r[2];
+        const XMVECTOR row3 = M.r[3];
+
+        for (; i < VectorCount; i++)
+        {
+            __m128 V = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
+            pInputVector += InputStride;
+
+            XMVECTOR vTempX = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
+            XMVECTOR vTempY = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
+            XMVECTOR vTempZ = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
+            XMVECTOR vTempW = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
+
+            vTempX = _mm_mul_ps(vTempX, row0);
+            vTempY = _mm_mul_ps(vTempY, row1);
+            vTempZ = XM_FMADD_PS(vTempZ, row2, vTempX);
+            vTempW = XM_FMADD_PS(vTempW, row3, vTempY);
+            vTempX = _mm_add_ps(vTempZ, vTempW);
+
+            _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), vTempX);
+            pOutputVector += OutputStride;
+        }
+    }
+
+    XM_SFENCE();
+
+    return pOutputStream;
+#elif defined(_XM_SSE_INTRINSICS_)
+    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
+    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
+
+    const XMVECTOR row0 = M.r[0];
+    const XMVECTOR row1 = M.r[1];
+    const XMVECTOR row2 = M.r[2];
+    const XMVECTOR row3 = M.r[3];
+
+    if (!(reinterpret_cast<uintptr_t>(pOutputStream) & 0xF) && !(OutputStride & 0xF))
+    {
+        if (!(reinterpret_cast<uintptr_t>(pInputStream) & 0xF) && !(InputStride & 0xF))
+        {
+            // Aligned input, aligned output
+            for (size_t i = 0; i < VectorCount; i++)
+            {
+                __m128 V = _mm_load_ps(reinterpret_cast<const float*>(pInputVector));
+                pInputVector += InputStride;
+
+                XMVECTOR vTempX = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
+                XMVECTOR vTempY = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
+                XMVECTOR vTempZ = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
+                XMVECTOR vTempW = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
+
+                vTempX = _mm_mul_ps(vTempX, row0);
+                vTempY = _mm_mul_ps(vTempY, row1);
+                vTempZ = XM_FMADD_PS(vTempZ, row2, vTempX);
+                vTempW = XM_FMADD_PS(vTempW, row3, vTempY);
+                vTempX = _mm_add_ps(vTempZ, vTempW);
+
+                XM_STREAM_PS(reinterpret_cast<float*>(pOutputVector), vTempX);
+                pOutputVector += OutputStride;
+            }
+        }
+        else
+        {
+            // Unaligned input, aligned output
+            for (size_t i = 0; i < VectorCount; i++)
+            {
+                __m128 V = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
+                pInputVector += InputStride;
+
+                XMVECTOR vTempX = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
+                XMVECTOR vTempY = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
+                XMVECTOR vTempZ = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
+                XMVECTOR vTempW = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
+
+                vTempX = _mm_mul_ps(vTempX, row0);
+                vTempY = _mm_mul_ps(vTempY, row1);
+                vTempZ = XM_FMADD_PS(vTempZ, row2, vTempX);
+                vTempW = XM_FMADD_PS(vTempW, row3, vTempY);
+                vTempX = _mm_add_ps(vTempZ, vTempW);
+
+                XM_STREAM_PS(reinterpret_cast<float*>(pOutputVector), vTempX);
+                pOutputVector += OutputStride;
+            }
+        }
+    }
+    else
+    {
+        if (!(reinterpret_cast<uintptr_t>(pInputStream) & 0xF) && !(InputStride & 0xF))
+        {
+            // Aligned input, unaligned output
+            for (size_t i = 0; i < VectorCount; i++)
+            {
+                __m128 V = _mm_load_ps(reinterpret_cast<const float*>(pInputVector));
+                pInputVector += InputStride;
+
+                XMVECTOR vTempX = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
+                XMVECTOR vTempY = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
+                XMVECTOR vTempZ = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
+                XMVECTOR vTempW = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
+
+                vTempX = _mm_mul_ps(vTempX, row0);
+                vTempY = _mm_mul_ps(vTempY, row1);
+                vTempZ = XM_FMADD_PS(vTempZ, row2, vTempX);
+                vTempW = XM_FMADD_PS(vTempW, row3, vTempY);
+                vTempX = _mm_add_ps(vTempZ, vTempW);
+
+                _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), vTempX);
+                pOutputVector += OutputStride;
+            }
+        }
+        else
+        {
+            // Unaligned input, unaligned output
+            for (size_t i = 0; i < VectorCount; i++)
+            {
+                __m128 V = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
+                pInputVector += InputStride;
+
+                XMVECTOR vTempX = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
+                XMVECTOR vTempY = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
+                XMVECTOR vTempZ = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
+                XMVECTOR vTempW = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
+
+                vTempX = _mm_mul_ps(vTempX, row0);
+                vTempY = _mm_mul_ps(vTempY, row1);
+                vTempZ = XM_FMADD_PS(vTempZ, row2, vTempX);
+                vTempW = XM_FMADD_PS(vTempW, row3, vTempY);
+                vTempX = _mm_add_ps(vTempZ, vTempW);
+
+                _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), vTempX);
+                pOutputVector += OutputStride;
+            }
+        }
+    }
+
+    XM_SFENCE();
+
+    return pOutputStream;
+#endif
+}
+
+/****************************************************************************
+ *
+ * XMVECTOR operators
+ *
+ ****************************************************************************/
+
+#ifndef _XM_NO_XMVECTOR_OVERLOADS_
+
+    //------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV operator+ (FXMVECTOR V) noexcept
+{
+    return V;
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV operator- (FXMVECTOR V) noexcept
+{
+    return XMVectorNegate(V);
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR& XM_CALLCONV operator+=
+(
+    XMVECTOR& V1,
+    FXMVECTOR V2
+) noexcept
+{
+    V1 = XMVectorAdd(V1, V2);
+    return V1;
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR& XM_CALLCONV
operator-= +( + XMVECTOR& V1, + FXMVECTOR V2 +) noexcept +{ + V1 = XMVectorSubtract(V1, V2); + return V1; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR& XM_CALLCONV operator*= +( + XMVECTOR& V1, + FXMVECTOR V2 +) noexcept +{ + V1 = XMVectorMultiply(V1, V2); + return V1; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR& XM_CALLCONV operator/= +( + XMVECTOR& V1, + FXMVECTOR V2 +) noexcept +{ + V1 = XMVectorDivide(V1, V2); + return V1; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR& operator*= +( + XMVECTOR& V, + const float S +) noexcept +{ + V = XMVectorScale(V, S); + return V; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR& operator/= +( + XMVECTOR& V, + const float S +) noexcept +{ + XMVECTOR vS = XMVectorReplicate(S); + V = XMVectorDivide(V, vS); + return V; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV operator+ +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ + return XMVectorAdd(V1, V2); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV operator- +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ + return XMVectorSubtract(V1, V2); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV operator* +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ + return XMVectorMultiply(V1, V2); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV operator/ +( + FXMVECTOR V1, + FXMVECTOR V2 +) noexcept +{ + return XMVectorDivide(V1, V2); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV operator* +( + FXMVECTOR V, + const float S +) noexcept +{ + return XMVectorScale(V, S); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV operator/ +( + FXMVECTOR V, + const float S +) noexcept +{ + XMVECTOR vS = XMVectorReplicate(S); + return XMVectorDivide(V, vS); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV operator* +( + float S, + FXMVECTOR V +) noexcept +{ + return XMVectorScale(V, S); +} + +#endif /* !_XM_NO_XMVECTOR_OVERLOADS_ */ + +#if defined(_XM_NO_INTRINSICS_) +#undef XMISNAN +#undef XMISINF +#endif + +#if defined(_XM_SSE_INTRINSICS_) +#undef XM3UNPACK3INTO4 +#undef XM3PACK4INTO3 +#endif + diff --git a/src/thirdparty/DirectXMath-dec2022/Inc/DirectXPackedVector.h b/src/thirdparty/DirectXMath-dec2022/Inc/DirectXPackedVector.h new file mode 100644 index 000000000..1484b476a --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/Inc/DirectXPackedVector.h @@ -0,0 +1,1224 @@ +//------------------------------------------------------------------------------------- +// DirectXPackedVector.h -- SIMD C++ Math library +// +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
+// +// http://go.microsoft.com/fwlink/?LinkID=615560 +//------------------------------------------------------------------------------------- + +#pragma once + +#include "DirectXMath.h" + +namespace DirectX +{ + + namespace PackedVector + { +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable:4201 4365 4324 4996) + // C4201: nonstandard extension used + // C4365: Off by default noise + // C4324: alignment padding warnings + // C4996: deprecation warnings +#endif + +#ifdef __clang__ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wgnu-anonymous-struct" +#pragma clang diagnostic ignored "-Wnested-anon-types" +#endif + + //------------------------------------------------------------------------------ + // ARGB Color; 8-8-8-8 bit unsigned normalized integer components packed into + // a 32 bit integer. The normalized color is packed into 32 bits using 8 bit + // unsigned, normalized integers for the alpha, red, green, and blue components. + // The alpha component is stored in the most significant bits and the blue + // component in the least significant bits (A8R8G8B8): + // [32] aaaaaaaa rrrrrrrr gggggggg bbbbbbbb [0] + struct XMCOLOR + { + union + { + struct + { + uint8_t b; // Blue: 0/255 to 255/255 + uint8_t g; // Green: 0/255 to 255/255 + uint8_t r; // Red: 0/255 to 255/255 + uint8_t a; // Alpha: 0/255 to 255/255 + }; + uint32_t c; + }; + + XMCOLOR() = default; + + XMCOLOR(const XMCOLOR&) = default; + XMCOLOR& operator=(const XMCOLOR&) = default; + + XMCOLOR(XMCOLOR&&) = default; + XMCOLOR& operator=(XMCOLOR&&) = default; + + constexpr XMCOLOR(uint32_t Color) noexcept : c(Color) {} + XMCOLOR(float _r, float _g, float _b, float _a) noexcept; + explicit XMCOLOR(_In_reads_(4) const float* pArray) noexcept; + + operator uint32_t () const noexcept { return c; } + + XMCOLOR& operator= (const uint32_t Color) noexcept { c = Color; return *this; } + }; + + //------------------------------------------------------------------------------ + // 16 bit floating point number consisting of a sign bit, a 5 bit biased + // exponent, and a 10 bit mantissa + using HALF = uint16_t; + + //------------------------------------------------------------------------------ + // 2D Vector; 16 bit floating point components + struct XMHALF2 + { + union + { + struct + { + HALF x; + HALF y; + }; + uint32_t v; + }; + + XMHALF2() = default; + + XMHALF2(const XMHALF2&) = default; + XMHALF2& operator=(const XMHALF2&) = default; + + XMHALF2(XMHALF2&&) = default; + XMHALF2& operator=(XMHALF2&&) = default; + + explicit constexpr XMHALF2(uint32_t Packed) noexcept : v(Packed) {} + constexpr XMHALF2(HALF _x, HALF _y) noexcept : x(_x), y(_y) {} + explicit XMHALF2(_In_reads_(2) const HALF* pArray) noexcept : x(pArray[0]), y(pArray[1]) {} + XMHALF2(float _x, float _y) noexcept; + explicit XMHALF2(_In_reads_(2) const float* pArray) noexcept; + + XMHALF2& operator= (uint32_t Packed) noexcept { v = Packed; return *this; } + }; + + //------------------------------------------------------------------------------ + // 2D Vector; 16 bit signed normalized integer components + struct XMSHORTN2 + { + union + { + struct + { + int16_t x; + int16_t y; + }; + uint32_t v; + }; + + XMSHORTN2() = default; + + XMSHORTN2(const XMSHORTN2&) = default; + XMSHORTN2& operator=(const XMSHORTN2&) = default; + + XMSHORTN2(XMSHORTN2&&) = default; + XMSHORTN2& operator=(XMSHORTN2&&) = default; + + explicit constexpr XMSHORTN2(uint32_t Packed) noexcept : v(Packed) {} + constexpr XMSHORTN2(int16_t _x, int16_t _y) noexcept : x(_x), 
y(_y) {} + explicit XMSHORTN2(_In_reads_(2) const int16_t* pArray) noexcept : x(pArray[0]), y(pArray[1]) {} + XMSHORTN2(float _x, float _y) noexcept; + explicit XMSHORTN2(_In_reads_(2) const float* pArray) noexcept; + + XMSHORTN2& operator= (uint32_t Packed) noexcept { v = Packed; return *this; } + }; + + // 2D Vector; 16 bit signed integer components + struct XMSHORT2 + { + union + { + struct + { + int16_t x; + int16_t y; + }; + uint32_t v; + }; + + XMSHORT2() = default; + + XMSHORT2(const XMSHORT2&) = default; + XMSHORT2& operator=(const XMSHORT2&) = default; + + XMSHORT2(XMSHORT2&&) = default; + XMSHORT2& operator=(XMSHORT2&&) = default; + + explicit constexpr XMSHORT2(uint32_t Packed) noexcept : v(Packed) {} + constexpr XMSHORT2(int16_t _x, int16_t _y) noexcept : x(_x), y(_y) {} + explicit XMSHORT2(_In_reads_(2) const int16_t* pArray) noexcept : x(pArray[0]), y(pArray[1]) {} + XMSHORT2(float _x, float _y) noexcept; + explicit XMSHORT2(_In_reads_(2) const float* pArray) noexcept; + + XMSHORT2& operator= (uint32_t Packed) noexcept { v = Packed; return *this; } + }; + + // 2D Vector; 16 bit unsigned normalized integer components + struct XMUSHORTN2 + { + union + { + struct + { + uint16_t x; + uint16_t y; + }; + uint32_t v; + }; + + XMUSHORTN2() = default; + + XMUSHORTN2(const XMUSHORTN2&) = default; + XMUSHORTN2& operator=(const XMUSHORTN2&) = default; + + XMUSHORTN2(XMUSHORTN2&&) = default; + XMUSHORTN2& operator=(XMUSHORTN2&&) = default; + + explicit constexpr XMUSHORTN2(uint32_t Packed) noexcept : v(Packed) {} + constexpr XMUSHORTN2(uint16_t _x, uint16_t _y) noexcept : x(_x), y(_y) {} + explicit XMUSHORTN2(_In_reads_(2) const uint16_t* pArray) noexcept : x(pArray[0]), y(pArray[1]) {} + XMUSHORTN2(float _x, float _y) noexcept; + explicit XMUSHORTN2(_In_reads_(2) const float* pArray) noexcept; + + XMUSHORTN2& operator= (uint32_t Packed) noexcept { v = Packed; return *this; } + }; + + // 2D Vector; 16 bit unsigned integer components + struct XMUSHORT2 + { + union + { + struct + { + uint16_t x; + uint16_t y; + }; + uint32_t v; + }; + + XMUSHORT2() = default; + + XMUSHORT2(const XMUSHORT2&) = default; + XMUSHORT2& operator=(const XMUSHORT2&) = default; + + XMUSHORT2(XMUSHORT2&&) = default; + XMUSHORT2& operator=(XMUSHORT2&&) = default; + + explicit constexpr XMUSHORT2(uint32_t Packed) noexcept : v(Packed) {} + constexpr XMUSHORT2(uint16_t _x, uint16_t _y) noexcept : x(_x), y(_y) {} + explicit XMUSHORT2(_In_reads_(2) const uint16_t* pArray) noexcept : x(pArray[0]), y(pArray[1]) {} + XMUSHORT2(float _x, float _y) noexcept; + explicit XMUSHORT2(_In_reads_(2) const float* pArray) noexcept; + + XMUSHORT2& operator= (uint32_t Packed) noexcept { v = Packed; return *this; } + }; + + //------------------------------------------------------------------------------ + // 2D Vector; 8 bit signed normalized integer components + struct XMBYTEN2 + { + union + { + struct + { + int8_t x; + int8_t y; + }; + uint16_t v; + }; + + XMBYTEN2() = default; + + XMBYTEN2(const XMBYTEN2&) = default; + XMBYTEN2& operator=(const XMBYTEN2&) = default; + + XMBYTEN2(XMBYTEN2&&) = default; + XMBYTEN2& operator=(XMBYTEN2&&) = default; + + explicit constexpr XMBYTEN2(uint16_t Packed) noexcept : v(Packed) {} + constexpr XMBYTEN2(int8_t _x, int8_t _y) noexcept : x(_x), y(_y) {} + explicit XMBYTEN2(_In_reads_(2) const int8_t* pArray) noexcept : x(pArray[0]), y(pArray[1]) {} + XMBYTEN2(float _x, float _y) noexcept; + explicit XMBYTEN2(_In_reads_(2) const float* pArray) noexcept; + + XMBYTEN2& operator= (uint16_t Packed) noexcept { 
v = Packed; return *this; } + }; + + // 2D Vector; 8 bit signed integer components + struct XMBYTE2 + { + union + { + struct + { + int8_t x; + int8_t y; + }; + uint16_t v; + }; + + XMBYTE2() = default; + + XMBYTE2(const XMBYTE2&) = default; + XMBYTE2& operator=(const XMBYTE2&) = default; + + XMBYTE2(XMBYTE2&&) = default; + XMBYTE2& operator=(XMBYTE2&&) = default; + + explicit constexpr XMBYTE2(uint16_t Packed) noexcept : v(Packed) {} + constexpr XMBYTE2(int8_t _x, int8_t _y) noexcept : x(_x), y(_y) {} + explicit XMBYTE2(_In_reads_(2) const int8_t* pArray) noexcept : x(pArray[0]), y(pArray[1]) {} + XMBYTE2(float _x, float _y) noexcept; + explicit XMBYTE2(_In_reads_(2) const float* pArray) noexcept; + + XMBYTE2& operator= (uint16_t Packed) noexcept { v = Packed; return *this; } + }; + + // 2D Vector; 8 bit unsigned normalized integer components + struct XMUBYTEN2 + { + union + { + struct + { + uint8_t x; + uint8_t y; + }; + uint16_t v; + }; + + XMUBYTEN2() = default; + + XMUBYTEN2(const XMUBYTEN2&) = default; + XMUBYTEN2& operator=(const XMUBYTEN2&) = default; + + XMUBYTEN2(XMUBYTEN2&&) = default; + XMUBYTEN2& operator=(XMUBYTEN2&&) = default; + + explicit constexpr XMUBYTEN2(uint16_t Packed) noexcept : v(Packed) {} + constexpr XMUBYTEN2(uint8_t _x, uint8_t _y) noexcept : x(_x), y(_y) {} + explicit XMUBYTEN2(_In_reads_(2) const uint8_t* pArray) noexcept : x(pArray[0]), y(pArray[1]) {} + XMUBYTEN2(float _x, float _y) noexcept; + explicit XMUBYTEN2(_In_reads_(2) const float* pArray) noexcept; + + XMUBYTEN2& operator= (uint16_t Packed) noexcept { v = Packed; return *this; } + }; + + // 2D Vector; 8 bit unsigned integer components + struct XMUBYTE2 + { + union + { + struct + { + uint8_t x; + uint8_t y; + }; + uint16_t v; + }; + + XMUBYTE2() = default; + + XMUBYTE2(const XMUBYTE2&) = default; + XMUBYTE2& operator=(const XMUBYTE2&) = default; + + XMUBYTE2(XMUBYTE2&&) = default; + XMUBYTE2& operator=(XMUBYTE2&&) = default; + + explicit constexpr XMUBYTE2(uint16_t Packed) noexcept : v(Packed) {} + constexpr XMUBYTE2(uint8_t _x, uint8_t _y) noexcept : x(_x), y(_y) {} + explicit XMUBYTE2(_In_reads_(2) const uint8_t* pArray) noexcept : x(pArray[0]), y(pArray[1]) {} + XMUBYTE2(float _x, float _y) noexcept; + explicit XMUBYTE2(_In_reads_(2) const float* pArray) noexcept; + + XMUBYTE2& operator= (uint16_t Packed) noexcept { v = Packed; return *this; } + }; + + //------------------------------------------------------------------------------ + // 3D vector: 5/6/5 unsigned integer components + struct XMU565 + { + union + { + struct + { + uint16_t x : 5; // 0 to 31 + uint16_t y : 6; // 0 to 63 + uint16_t z : 5; // 0 to 31 + }; + uint16_t v; + }; + + XMU565() = default; + + XMU565(const XMU565&) = default; + XMU565& operator=(const XMU565&) = default; + + XMU565(XMU565&&) = default; + XMU565& operator=(XMU565&&) = default; + + explicit constexpr XMU565(uint16_t Packed) noexcept : v(Packed) {} + constexpr XMU565(uint8_t _x, uint8_t _y, uint8_t _z) noexcept : x(_x), y(_y), z(_z) {} + explicit XMU565(_In_reads_(3) const uint8_t* pArray) noexcept : x(pArray[0]), y(pArray[1]), z(pArray[2]) {} + XMU565(float _x, float _y, float _z) noexcept; + explicit XMU565(_In_reads_(3) const float* pArray) noexcept; + + operator uint16_t () const noexcept { return v; } + + XMU565& operator= (uint16_t Packed) noexcept { v = Packed; return *this; } + }; + + //------------------------------------------------------------------------------ + // 3D vector: 11/11/10 floating-point components + // The 3D vector is packed into 32 bits as 
follows: a 5-bit biased exponent + // and 6-bit mantissa for x component, a 5-bit biased exponent and + // 6-bit mantissa for y component, a 5-bit biased exponent and a 5-bit + // mantissa for z. The z component is stored in the most significant bits + // and the x component in the least significant bits. No sign bits so + // all partial-precision numbers are positive. + // (Z10Y11X11): [32] ZZZZZzzz zzzYYYYY yyyyyyXX XXXxxxxx [0] + struct XMFLOAT3PK + { + union + { + struct + { + uint32_t xm : 6; // x-mantissa + uint32_t xe : 5; // x-exponent + uint32_t ym : 6; // y-mantissa + uint32_t ye : 5; // y-exponent + uint32_t zm : 5; // z-mantissa + uint32_t ze : 5; // z-exponent + }; + uint32_t v; + }; + + XMFLOAT3PK() = default; + + XMFLOAT3PK(const XMFLOAT3PK&) = default; + XMFLOAT3PK& operator=(const XMFLOAT3PK&) = default; + + XMFLOAT3PK(XMFLOAT3PK&&) = default; + XMFLOAT3PK& operator=(XMFLOAT3PK&&) = default; + + explicit constexpr XMFLOAT3PK(uint32_t Packed) noexcept : v(Packed) {} + XMFLOAT3PK(float _x, float _y, float _z) noexcept; + explicit XMFLOAT3PK(_In_reads_(3) const float* pArray) noexcept; + + operator uint32_t () const noexcept { return v; } + + XMFLOAT3PK& operator= (uint32_t Packed) noexcept { v = Packed; return *this; } + }; + + //------------------------------------------------------------------------------ + // 3D vector: 9/9/9 floating-point components with shared 5-bit exponent + // The 3D vector is packed into 32 bits as follows: a 5-bit biased exponent + // with 9-bit mantissa for the x, y, and z component. The shared exponent + // is stored in the most significant bits and the x component mantissa is in + // the least significant bits. No sign bits so all partial-precision numbers + // are positive. + // (E5Z9Y9X9): [32] EEEEEzzz zzzzzzyy yyyyyyyx xxxxxxxx [0] + struct XMFLOAT3SE + { + union + { + struct + { + uint32_t xm : 9; // x-mantissa + uint32_t ym : 9; // y-mantissa + uint32_t zm : 9; // z-mantissa + uint32_t e : 5; // shared exponent + }; + uint32_t v; + }; + + XMFLOAT3SE() = default; + + XMFLOAT3SE(const XMFLOAT3SE&) = default; + XMFLOAT3SE& operator=(const XMFLOAT3SE&) = default; + + XMFLOAT3SE(XMFLOAT3SE&&) = default; + XMFLOAT3SE& operator=(XMFLOAT3SE&&) = default; + + explicit constexpr XMFLOAT3SE(uint32_t Packed) noexcept : v(Packed) {} + XMFLOAT3SE(float _x, float _y, float _z) noexcept; + explicit XMFLOAT3SE(_In_reads_(3) const float* pArray) noexcept; + + operator uint32_t () const noexcept { return v; } + + XMFLOAT3SE& operator= (uint32_t Packed) noexcept { v = Packed; return *this; } + }; + + //------------------------------------------------------------------------------ + // 4D Vector; 16 bit floating point components + struct XMHALF4 + { + union + { + struct + { + HALF x; + HALF y; + HALF z; + HALF w; + }; + uint64_t v; + }; + + XMHALF4() = default; + + XMHALF4(const XMHALF4&) = default; + XMHALF4& operator=(const XMHALF4&) = default; + + XMHALF4(XMHALF4&&) = default; + XMHALF4& operator=(XMHALF4&&) = default; + + explicit constexpr XMHALF4(uint64_t Packed) noexcept : v(Packed) {} + constexpr XMHALF4(HALF _x, HALF _y, HALF _z, HALF _w) noexcept : x(_x), y(_y), z(_z), w(_w) {} + explicit XMHALF4(_In_reads_(4) const HALF* pArray) noexcept : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + XMHALF4(float _x, float _y, float _z, float _w) noexcept; + explicit XMHALF4(_In_reads_(4) const float* pArray) noexcept; + + XMHALF4& operator= (uint64_t Packed) noexcept { v = Packed; return *this; } + }; + + 
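+        // Illustrative note (an editorial example, not additional library API):
+        // HALF above is a standard IEEE-754 binary16 value with 1 sign bit,
+        // 5 exponent bits (bias 15) and 10 mantissa bits. For example, 1.0f
+        // packs to 0x3C00, 0.5f to 0x3800 and -2.0f to 0xC000. The
+        // XMConvertFloatToHalf / XMConvertHalfToFloat helpers declared later
+        // in this header perform the scalar conversions, e.g.
+        //     HALF h = XMConvertFloatToHalf(1.0f);   // 0x3C00
+        //     float f = XMConvertHalfToFloat(h);     // 1.0f
+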
//------------------------------------------------------------------------------ + // 4D Vector; 16 bit signed normalized integer components + struct XMSHORTN4 + { + union + { + struct + { + int16_t x; + int16_t y; + int16_t z; + int16_t w; + }; + uint64_t v; + }; + + XMSHORTN4() = default; + + XMSHORTN4(const XMSHORTN4&) = default; + XMSHORTN4& operator=(const XMSHORTN4&) = default; + + XMSHORTN4(XMSHORTN4&&) = default; + XMSHORTN4& operator=(XMSHORTN4&&) = default; + + explicit constexpr XMSHORTN4(uint64_t Packed) noexcept : v(Packed) {} + constexpr XMSHORTN4(int16_t _x, int16_t _y, int16_t _z, int16_t _w) noexcept : x(_x), y(_y), z(_z), w(_w) {} + explicit XMSHORTN4(_In_reads_(4) const int16_t* pArray) noexcept : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + XMSHORTN4(float _x, float _y, float _z, float _w) noexcept; + explicit XMSHORTN4(_In_reads_(4) const float* pArray) noexcept; + + XMSHORTN4& operator= (uint64_t Packed) noexcept { v = Packed; return *this; } + }; + + // 4D Vector; 16 bit signed integer components + struct XMSHORT4 + { + union + { + struct + { + int16_t x; + int16_t y; + int16_t z; + int16_t w; + }; + uint64_t v; + }; + + XMSHORT4() = default; + + XMSHORT4(const XMSHORT4&) = default; + XMSHORT4& operator=(const XMSHORT4&) = default; + + XMSHORT4(XMSHORT4&&) = default; + XMSHORT4& operator=(XMSHORT4&&) = default; + + explicit constexpr XMSHORT4(uint64_t Packed) noexcept : v(Packed) {} + constexpr XMSHORT4(int16_t _x, int16_t _y, int16_t _z, int16_t _w) noexcept : x(_x), y(_y), z(_z), w(_w) {} + explicit XMSHORT4(_In_reads_(4) const int16_t* pArray) noexcept : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + XMSHORT4(float _x, float _y, float _z, float _w) noexcept; + explicit XMSHORT4(_In_reads_(4) const float* pArray) noexcept; + + XMSHORT4& operator= (uint64_t Packed) noexcept { v = Packed; return *this; } + }; + + // 4D Vector; 16 bit unsigned normalized integer components + struct XMUSHORTN4 + { + union + { + struct + { + uint16_t x; + uint16_t y; + uint16_t z; + uint16_t w; + }; + uint64_t v; + }; + + XMUSHORTN4() = default; + + XMUSHORTN4(const XMUSHORTN4&) = default; + XMUSHORTN4& operator=(const XMUSHORTN4&) = default; + + XMUSHORTN4(XMUSHORTN4&&) = default; + XMUSHORTN4& operator=(XMUSHORTN4&&) = default; + + explicit constexpr XMUSHORTN4(uint64_t Packed) noexcept : v(Packed) {} + constexpr XMUSHORTN4(uint16_t _x, uint16_t _y, uint16_t _z, uint16_t _w) noexcept : x(_x), y(_y), z(_z), w(_w) {} + explicit XMUSHORTN4(_In_reads_(4) const uint16_t* pArray) noexcept : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + XMUSHORTN4(float _x, float _y, float _z, float _w) noexcept; + explicit XMUSHORTN4(_In_reads_(4) const float* pArray) noexcept; + + XMUSHORTN4& operator= (uint64_t Packed) noexcept { v = Packed; return *this; } + }; + + // 4D Vector; 16 bit unsigned integer components + struct XMUSHORT4 + { + union + { + struct + { + uint16_t x; + uint16_t y; + uint16_t z; + uint16_t w; + }; + uint64_t v; + }; + + XMUSHORT4() = default; + + XMUSHORT4(const XMUSHORT4&) = default; + XMUSHORT4& operator=(const XMUSHORT4&) = default; + + XMUSHORT4(XMUSHORT4&&) = default; + XMUSHORT4& operator=(XMUSHORT4&&) = default; + + explicit constexpr XMUSHORT4(uint64_t Packed) noexcept : v(Packed) {} + constexpr XMUSHORT4(uint16_t _x, uint16_t _y, uint16_t _z, uint16_t _w) noexcept : x(_x), y(_y), z(_z), w(_w) {} + explicit XMUSHORT4(_In_reads_(4) const uint16_t* pArray) noexcept : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + 
XMUSHORT4(float _x, float _y, float _z, float _w) noexcept; + explicit XMUSHORT4(_In_reads_(4) const float* pArray) noexcept; + + XMUSHORT4& operator= (uint32_t Packed) noexcept { v = Packed; return *this; } + }; + + //------------------------------------------------------------------------------ + // 4D Vector; 10-10-10-2 bit normalized components packed into a 32 bit integer + // The normalized 4D Vector is packed into 32 bits as follows: a 2 bit unsigned, + // normalized integer for the w component and 10 bit signed, normalized + // integers for the z, y, and x components. The w component is stored in the + // most significant bits and the x component in the least significant bits + // (W2Z10Y10X10): [32] wwzzzzzz zzzzyyyy yyyyyyxx xxxxxxxx [0] + struct XMXDECN4 + { + union + { + struct + { + int32_t x : 10; // -511/511 to 511/511 + int32_t y : 10; // -511/511 to 511/511 + int32_t z : 10; // -511/511 to 511/511 + uint32_t w : 2; // 0/3 to 3/3 + }; + uint32_t v; + }; + + XMXDECN4() = default; + + XMXDECN4(const XMXDECN4&) = default; + XMXDECN4& operator=(const XMXDECN4&) = default; + + XMXDECN4(XMXDECN4&&) = default; + XMXDECN4& operator=(XMXDECN4&&) = default; + + explicit constexpr XMXDECN4(uint32_t Packed) : v(Packed) {} + XMXDECN4(float _x, float _y, float _z, float _w) noexcept; + explicit XMXDECN4(_In_reads_(4) const float* pArray) noexcept; + + operator uint32_t () const noexcept { return v; } + + XMXDECN4& operator= (uint32_t Packed) noexcept { v = Packed; return *this; } + }; + + // 4D Vector; 10-10-10-2 bit components packed into a 32 bit integer + // The normalized 4D Vector is packed into 32 bits as follows: a 2 bit unsigned + // integer for the w component and 10 bit signed integers for the + // z, y, and x components. The w component is stored in the + // most significant bits and the x component in the least significant bits + // (W2Z10Y10X10): [32] wwzzzzzz zzzzyyyy yyyyyyxx xxxxxxxx [0] + struct XM_DEPRECATED XMXDEC4 + { + union + { + struct + { + int32_t x : 10; // -511 to 511 + int32_t y : 10; // -511 to 511 + int32_t z : 10; // -511 to 511 + uint32_t w : 2; // 0 to 3 + }; + uint32_t v; + }; + + XMXDEC4() = default; + + XMXDEC4(const XMXDEC4&) = default; + XMXDEC4& operator=(const XMXDEC4&) = default; + + XMXDEC4(XMXDEC4&&) = default; + XMXDEC4& operator=(XMXDEC4&&) = default; + + explicit constexpr XMXDEC4(uint32_t Packed) noexcept : v(Packed) {} + XMXDEC4(float _x, float _y, float _z, float _w) noexcept; + explicit XMXDEC4(_In_reads_(4) const float* pArray) noexcept; + + operator uint32_t () const noexcept { return v; } + + XMXDEC4& operator= (uint32_t Packed) noexcept { v = Packed; return *this; } + }; + + // 4D Vector; 10-10-10-2 bit normalized components packed into a 32 bit integer + // The normalized 4D Vector is packed into 32 bits as follows: a 2 bit signed, + // normalized integer for the w component and 10 bit signed, normalized + // integers for the z, y, and x components. 
The w component is stored in the + // most significant bits and the x component in the least significant bits + // (W2Z10Y10X10): [32] wwzzzzzz zzzzyyyy yyyyyyxx xxxxxxxx [0] + struct XM_DEPRECATED XMDECN4 + { + union + { + struct + { + int32_t x : 10; // -511/511 to 511/511 + int32_t y : 10; // -511/511 to 511/511 + int32_t z : 10; // -511/511 to 511/511 + int32_t w : 2; // -1/1 to 1/1 + }; + uint32_t v; + }; + + XMDECN4() = default; + + XMDECN4(const XMDECN4&) = default; + XMDECN4& operator=(const XMDECN4&) = default; + + XMDECN4(XMDECN4&&) = default; + XMDECN4& operator=(XMDECN4&&) = default; + + explicit constexpr XMDECN4(uint32_t Packed) noexcept : v(Packed) {} + XMDECN4(float _x, float _y, float _z, float _w) noexcept; + explicit XMDECN4(_In_reads_(4) const float* pArray) noexcept; + + operator uint32_t () const noexcept { return v; } + + XMDECN4& operator= (uint32_t Packed) noexcept { v = Packed; return *this; } + }; + + // 4D Vector; 10-10-10-2 bit components packed into a 32 bit integer + // The 4D Vector is packed into 32 bits as follows: a 2 bit signed, + // integer for the w component and 10 bit signed integers for the + // z, y, and x components. The w component is stored in the + // most significant bits and the x component in the least significant bits + // (W2Z10Y10X10): [32] wwzzzzzz zzzzyyyy yyyyyyxx xxxxxxxx [0] + struct XM_DEPRECATED XMDEC4 + { + union + { + struct + { + int32_t x : 10; // -511 to 511 + int32_t y : 10; // -511 to 511 + int32_t z : 10; // -511 to 511 + int32_t w : 2; // -1 to 1 + }; + uint32_t v; + }; + + XMDEC4() = default; + + XMDEC4(const XMDEC4&) = default; + XMDEC4& operator=(const XMDEC4&) = default; + + XMDEC4(XMDEC4&&) = default; + XMDEC4& operator=(XMDEC4&&) = default; + + explicit constexpr XMDEC4(uint32_t Packed) noexcept : v(Packed) {} + XMDEC4(float _x, float _y, float _z, float _w) noexcept; + explicit XMDEC4(_In_reads_(4) const float* pArray) noexcept; + + operator uint32_t () const noexcept { return v; } + + XMDEC4& operator= (uint32_t Packed) noexcept { v = Packed; return *this; } + }; + + // 4D Vector; 10-10-10-2 bit normalized components packed into a 32 bit integer + // The normalized 4D Vector is packed into 32 bits as follows: a 2 bit unsigned, + // normalized integer for the w component and 10 bit unsigned, normalized + // integers for the z, y, and x components. The w component is stored in the + // most significant bits and the x component in the least significant bits + // (W2Z10Y10X10): [32] wwzzzzzz zzzzyyyy yyyyyyxx xxxxxxxx [0] + struct XMUDECN4 + { + union + { + struct + { + uint32_t x : 10; // 0/1023 to 1023/1023 + uint32_t y : 10; // 0/1023 to 1023/1023 + uint32_t z : 10; // 0/1023 to 1023/1023 + uint32_t w : 2; // 0/3 to 3/3 + }; + uint32_t v; + }; + + XMUDECN4() = default; + + XMUDECN4(const XMUDECN4&) = default; + XMUDECN4& operator=(const XMUDECN4&) = default; + + XMUDECN4(XMUDECN4&&) = default; + XMUDECN4& operator=(XMUDECN4&&) = default; + + explicit constexpr XMUDECN4(uint32_t Packed) noexcept : v(Packed) {} + XMUDECN4(float _x, float _y, float _z, float _w) noexcept; + explicit XMUDECN4(_In_reads_(4) const float* pArray) noexcept; + + operator uint32_t () const noexcept { return v; } + + XMUDECN4& operator= (uint32_t Packed) noexcept { v = Packed; return *this; } + }; + + // 4D Vector; 10-10-10-2 bit components packed into a 32 bit integer + // The 4D Vector is packed into 32 bits as follows: a 2 bit unsigned, + // integer for the w component and 10 bit unsigned integers + // for the z, y, and x components. 
The w component is stored in the + // most significant bits and the x component in the least significant bits + // (W2Z10Y10X10): [32] wwzzzzzz zzzzyyyy yyyyyyxx xxxxxxxx [0] + struct XMUDEC4 + { + union + { + struct + { + uint32_t x : 10; // 0 to 1023 + uint32_t y : 10; // 0 to 1023 + uint32_t z : 10; // 0 to 1023 + uint32_t w : 2; // 0 to 3 + }; + uint32_t v; + }; + + XMUDEC4() = default; + + XMUDEC4(const XMUDEC4&) = default; + XMUDEC4& operator=(const XMUDEC4&) = default; + + XMUDEC4(XMUDEC4&&) = default; + XMUDEC4& operator=(XMUDEC4&&) = default; + + explicit constexpr XMUDEC4(uint32_t Packed) noexcept : v(Packed) {} + XMUDEC4(float _x, float _y, float _z, float _w) noexcept; + explicit XMUDEC4(_In_reads_(4) const float* pArray) noexcept; + + operator uint32_t () const noexcept { return v; } + + XMUDEC4& operator= (uint32_t Packed) noexcept { v = Packed; return *this; } + }; + + //------------------------------------------------------------------------------ + // 4D Vector; 8 bit signed normalized integer components + struct XMBYTEN4 + { + union + { + struct + { + int8_t x; + int8_t y; + int8_t z; + int8_t w; + }; + uint32_t v; + }; + + XMBYTEN4() = default; + + XMBYTEN4(const XMBYTEN4&) = default; + XMBYTEN4& operator=(const XMBYTEN4&) = default; + + XMBYTEN4(XMBYTEN4&&) = default; + XMBYTEN4& operator=(XMBYTEN4&&) = default; + + constexpr XMBYTEN4(int8_t _x, int8_t _y, int8_t _z, int8_t _w) noexcept : x(_x), y(_y), z(_z), w(_w) {} + explicit constexpr XMBYTEN4(uint32_t Packed) noexcept : v(Packed) {} + explicit XMBYTEN4(_In_reads_(4) const int8_t* pArray) noexcept : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + XMBYTEN4(float _x, float _y, float _z, float _w) noexcept; + explicit XMBYTEN4(_In_reads_(4) const float* pArray) noexcept; + + XMBYTEN4& operator= (uint32_t Packed) noexcept { v = Packed; return *this; } + }; + + // 4D Vector; 8 bit signed integer components + struct XMBYTE4 + { + union + { + struct + { + int8_t x; + int8_t y; + int8_t z; + int8_t w; + }; + uint32_t v; + }; + + XMBYTE4() = default; + + XMBYTE4(const XMBYTE4&) = default; + XMBYTE4& operator=(const XMBYTE4&) = default; + + XMBYTE4(XMBYTE4&&) = default; + XMBYTE4& operator=(XMBYTE4&&) = default; + + constexpr XMBYTE4(int8_t _x, int8_t _y, int8_t _z, int8_t _w) noexcept : x(_x), y(_y), z(_z), w(_w) {} + explicit constexpr XMBYTE4(uint32_t Packed) noexcept : v(Packed) {} + explicit XMBYTE4(_In_reads_(4) const int8_t* pArray) noexcept : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + XMBYTE4(float _x, float _y, float _z, float _w) noexcept; + explicit XMBYTE4(_In_reads_(4) const float* pArray) noexcept; + + XMBYTE4& operator= (uint32_t Packed) noexcept { v = Packed; return *this; } + }; + + // 4D Vector; 8 bit unsigned normalized integer components + struct XMUBYTEN4 + { + union + { + struct + { + uint8_t x; + uint8_t y; + uint8_t z; + uint8_t w; + }; + uint32_t v; + }; + + XMUBYTEN4() = default; + + XMUBYTEN4(const XMUBYTEN4&) = default; + XMUBYTEN4& operator=(const XMUBYTEN4&) = default; + + XMUBYTEN4(XMUBYTEN4&&) = default; + XMUBYTEN4& operator=(XMUBYTEN4&&) = default; + + constexpr XMUBYTEN4(uint8_t _x, uint8_t _y, uint8_t _z, uint8_t _w) noexcept : x(_x), y(_y), z(_z), w(_w) {} + explicit constexpr XMUBYTEN4(uint32_t Packed) noexcept : v(Packed) {} + explicit XMUBYTEN4(_In_reads_(4) const uint8_t* pArray) noexcept : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + XMUBYTEN4(float _x, float _y, float _z, float _w) noexcept; + explicit XMUBYTEN4(_In_reads_(4) const float* 
pArray) noexcept; + + XMUBYTEN4& operator= (uint32_t Packed) noexcept { v = Packed; return *this; } + }; + + // 4D Vector; 8 bit unsigned integer components + struct XMUBYTE4 + { + union + { + struct + { + uint8_t x; + uint8_t y; + uint8_t z; + uint8_t w; + }; + uint32_t v; + }; + + XMUBYTE4() = default; + + XMUBYTE4(const XMUBYTE4&) = default; + XMUBYTE4& operator=(const XMUBYTE4&) = default; + + XMUBYTE4(XMUBYTE4&&) = default; + XMUBYTE4& operator=(XMUBYTE4&&) = default; + + constexpr XMUBYTE4(uint8_t _x, uint8_t _y, uint8_t _z, uint8_t _w) noexcept : x(_x), y(_y), z(_z), w(_w) {} + explicit constexpr XMUBYTE4(uint32_t Packed) noexcept : v(Packed) {} + explicit XMUBYTE4(_In_reads_(4) const uint8_t* pArray) noexcept : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + XMUBYTE4(float _x, float _y, float _z, float _w) noexcept; + explicit XMUBYTE4(_In_reads_(4) const float* pArray) noexcept; + + XMUBYTE4& operator= (uint32_t Packed) noexcept { v = Packed; return *this; } + }; + + //------------------------------------------------------------------------------ + // 4D vector; 4 bit unsigned integer components + struct XMUNIBBLE4 + { + union + { + struct + { + uint16_t x : 4; // 0 to 15 + uint16_t y : 4; // 0 to 15 + uint16_t z : 4; // 0 to 15 + uint16_t w : 4; // 0 to 15 + }; + uint16_t v; + }; + + XMUNIBBLE4() = default; + + XMUNIBBLE4(const XMUNIBBLE4&) = default; + XMUNIBBLE4& operator=(const XMUNIBBLE4&) = default; + + XMUNIBBLE4(XMUNIBBLE4&&) = default; + XMUNIBBLE4& operator=(XMUNIBBLE4&&) = default; + + explicit constexpr XMUNIBBLE4(uint16_t Packed) noexcept : v(Packed) {} + constexpr XMUNIBBLE4(uint8_t _x, uint8_t _y, uint8_t _z, uint8_t _w) noexcept : x(_x), y(_y), z(_z), w(_w) {} + explicit XMUNIBBLE4(_In_reads_(4) const uint8_t* pArray) noexcept : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + XMUNIBBLE4(float _x, float _y, float _z, float _w) noexcept; + explicit XMUNIBBLE4(_In_reads_(4) const float* pArray) noexcept; + + operator uint16_t () const noexcept { return v; } + + XMUNIBBLE4& operator= (uint16_t Packed) noexcept { v = Packed; return *this; } + }; + + //------------------------------------------------------------------------------ + // 4D vector: 5/5/5/1 unsigned integer components + struct XMU555 + { + union + { + struct + { + uint16_t x : 5; // 0 to 31 + uint16_t y : 5; // 0 to 31 + uint16_t z : 5; // 0 to 31 + uint16_t w : 1; // 0 or 1 + }; + uint16_t v; + }; + + XMU555() = default; + + XMU555(const XMU555&) = default; + XMU555& operator=(const XMU555&) = default; + + XMU555(XMU555&&) = default; + XMU555& operator=(XMU555&&) = default; + + explicit constexpr XMU555(uint16_t Packed) noexcept : v(Packed) {} + constexpr XMU555(uint8_t _x, uint8_t _y, uint8_t _z, bool _w) noexcept : x(_x), y(_y), z(_z), w(_w ? 0x1 : 0) {} + XMU555(_In_reads_(3) const uint8_t* pArray, _In_ bool _w) noexcept : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(_w ? 
0x1 : 0) {} + XMU555(float _x, float _y, float _z, bool _w) noexcept; + XMU555(_In_reads_(3) const float* pArray, _In_ bool _w) noexcept; + + operator uint16_t () const noexcept { return v; } + + XMU555& operator= (uint16_t Packed) noexcept { v = Packed; return *this; } + }; + +#ifdef __clang__ +#pragma clang diagnostic pop +#endif +#ifdef _MSC_VER +#pragma warning(pop) +#endif + + /**************************************************************************** + * + * Data conversion operations + * + ****************************************************************************/ + + float XMConvertHalfToFloat(HALF Value) noexcept; + float* XMConvertHalfToFloatStream(_Out_writes_bytes_(sizeof(float) + OutputStride * (HalfCount - 1)) float* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(HALF) + InputStride * (HalfCount - 1)) const HALF* pInputStream, + _In_ size_t InputStride, _In_ size_t HalfCount) noexcept; + HALF XMConvertFloatToHalf(float Value) noexcept; + HALF* XMConvertFloatToHalfStream(_Out_writes_bytes_(sizeof(HALF) + OutputStride * (FloatCount - 1)) HALF* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(float) + InputStride * (FloatCount - 1)) const float* pInputStream, + _In_ size_t InputStride, _In_ size_t FloatCount) noexcept; + + /**************************************************************************** + * + * Load operations + * + ****************************************************************************/ + + XMVECTOR XM_CALLCONV XMLoadColor(_In_ const XMCOLOR* pSource) noexcept; + + XMVECTOR XM_CALLCONV XMLoadHalf2(_In_ const XMHALF2* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadShortN2(_In_ const XMSHORTN2* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadShort2(_In_ const XMSHORT2* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadUShortN2(_In_ const XMUSHORTN2* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadUShort2(_In_ const XMUSHORT2* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadByteN2(_In_ const XMBYTEN2* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadByte2(_In_ const XMBYTE2* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadUByteN2(_In_ const XMUBYTEN2* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadUByte2(_In_ const XMUBYTE2* pSource) noexcept; + + XMVECTOR XM_CALLCONV XMLoadU565(_In_ const XMU565* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadFloat3PK(_In_ const XMFLOAT3PK* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadFloat3SE(_In_ const XMFLOAT3SE* pSource) noexcept; + + XMVECTOR XM_CALLCONV XMLoadHalf4(_In_ const XMHALF4* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadShortN4(_In_ const XMSHORTN4* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadShort4(_In_ const XMSHORT4* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadUShortN4(_In_ const XMUSHORTN4* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadUShort4(_In_ const XMUSHORT4* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadXDecN4(_In_ const XMXDECN4* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadUDecN4(_In_ const XMUDECN4* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadUDecN4_XR(_In_ const XMUDECN4* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadUDec4(_In_ const XMUDEC4* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadByteN4(_In_ const XMBYTEN4* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadByte4(_In_ const XMBYTE4* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadUByteN4(_In_ const XMUBYTEN4* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadUByte4(_In_ const XMUBYTE4* pSource) noexcept; + XMVECTOR XM_CALLCONV XMLoadUNibble4(_In_ const XMUNIBBLE4* pSource) 
noexcept; + XMVECTOR XM_CALLCONV XMLoadU555(_In_ const XMU555* pSource) noexcept; + +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4996) + // C4996: ignore deprecation warning +#endif + +#ifdef __GNUC__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#endif + + XMVECTOR XM_DEPRECATED XM_CALLCONV XMLoadDecN4(_In_ const XMDECN4* pSource) noexcept; + XMVECTOR XM_DEPRECATED XM_CALLCONV XMLoadDec4(_In_ const XMDEC4* pSource) noexcept; + XMVECTOR XM_DEPRECATED XM_CALLCONV XMLoadXDec4(_In_ const XMXDEC4* pSource) noexcept; + +#ifdef __GNUC__ +#pragma GCC diagnostic pop +#endif +#ifdef _MSC_VER +#pragma warning(pop) +#endif + + /**************************************************************************** + * + * Store operations + * + ****************************************************************************/ + + void XM_CALLCONV XMStoreColor(_Out_ XMCOLOR* pDestination, _In_ FXMVECTOR V) noexcept; + + void XM_CALLCONV XMStoreHalf2(_Out_ XMHALF2* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreShortN2(_Out_ XMSHORTN2* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreShort2(_Out_ XMSHORT2* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreUShortN2(_Out_ XMUSHORTN2* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreUShort2(_Out_ XMUSHORT2* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreByteN2(_Out_ XMBYTEN2* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreByte2(_Out_ XMBYTE2* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreUByteN2(_Out_ XMUBYTEN2* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreUByte2(_Out_ XMUBYTE2* pDestination, _In_ FXMVECTOR V) noexcept; + + void XM_CALLCONV XMStoreU565(_Out_ XMU565* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreFloat3PK(_Out_ XMFLOAT3PK* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreFloat3SE(_Out_ XMFLOAT3SE* pDestination, _In_ FXMVECTOR V) noexcept; + + void XM_CALLCONV XMStoreHalf4(_Out_ XMHALF4* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreShortN4(_Out_ XMSHORTN4* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreShort4(_Out_ XMSHORT4* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreUShortN4(_Out_ XMUSHORTN4* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreUShort4(_Out_ XMUSHORT4* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreXDecN4(_Out_ XMXDECN4* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreUDecN4(_Out_ XMUDECN4* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreUDecN4_XR(_Out_ XMUDECN4* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreUDec4(_Out_ XMUDEC4* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreByteN4(_Out_ XMBYTEN4* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreByte4(_Out_ XMBYTE4* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreUByteN4(_Out_ XMUBYTEN4* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreUByte4(_Out_ XMUBYTE4* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreUNibble4(_Out_ XMUNIBBLE4* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_CALLCONV XMStoreU555(_Out_ XMU555* pDestination, _In_ FXMVECTOR V) noexcept; + +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4996) + // C4996: ignore deprecation 
warning +#endif + +#ifdef __GNUC__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#endif + + void XM_DEPRECATED XM_CALLCONV XMStoreDecN4(_Out_ XMDECN4* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_DEPRECATED XM_CALLCONV XMStoreDec4(_Out_ XMDEC4* pDestination, _In_ FXMVECTOR V) noexcept; + void XM_DEPRECATED XM_CALLCONV XMStoreXDec4(_Out_ XMXDEC4* pDestination, _In_ FXMVECTOR V) noexcept; + +#ifdef __GNUC__ +#pragma GCC diagnostic pop +#endif +#ifdef _MSC_VER +#pragma warning(pop) +#endif + + /**************************************************************************** + * + * Implementation + * + ****************************************************************************/ +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable:4068 4214 4204 4365 4616 6001 6101) + // C4068/4616: ignore unknown pragmas + // C4214/4204: nonstandard extension used + // C4365: Off by default noise + // C6001/6101: False positives +#endif + +#ifdef _PREFAST_ +#pragma prefast(push) +#pragma prefast(disable : 25000, "FXMVECTOR is 16 bytes") +#pragma prefast(disable : 26495, "Union initialization confuses /analyze") +#endif + +#include "DirectXPackedVector.inl" + +#ifdef _PREFAST_ +#pragma prefast(pop) +#endif +#ifdef _MSC_VER +#pragma warning(pop) +#endif + } // namespace PackedVector + +} // namespace DirectX + diff --git a/src/thirdparty/DirectXMath-dec2022/Inc/DirectXPackedVector.inl b/src/thirdparty/DirectXMath-dec2022/Inc/DirectXPackedVector.inl new file mode 100644 index 000000000..5f7e5d775 --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/Inc/DirectXPackedVector.inl @@ -0,0 +1,4459 @@ +//------------------------------------------------------------------------------------- +// DirectXPackedVector.inl -- SIMD C++ Math library +// +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
+// +// http://go.microsoft.com/fwlink/?LinkID=615560 +//------------------------------------------------------------------------------------- + +#pragma once + +/**************************************************************************** + * + * Data conversion + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ + +inline float XMConvertHalfToFloat(HALF Value) noexcept +{ +#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + __m128i V1 = _mm_cvtsi32_si128(static_cast(Value)); + __m128 V2 = _mm_cvtph_ps(V1); + return _mm_cvtss_f32(V2); +#elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__) && !defined(_XM_NO_INTRINSICS_) && (!defined(__GNUC__) || (__ARM_FP & 2)) + uint16x4_t vHalf = vdup_n_u16(Value); + float32x4_t vFloat = vcvt_f32_f16(vreinterpret_f16_u16(vHalf)); + return vgetq_lane_f32(vFloat, 0); +#else + auto Mantissa = static_cast(Value & 0x03FF); + + uint32_t Exponent = (Value & 0x7C00); + if (Exponent == 0x7C00) // INF/NAN + { + Exponent = 0x8f; + } + else if (Exponent != 0) // The value is normalized + { + Exponent = static_cast((static_cast(Value) >> 10) & 0x1F); + } + else if (Mantissa != 0) // The value is denormalized + { + // Normalize the value in the resulting float + Exponent = 1; + + do + { + Exponent--; + Mantissa <<= 1; + } while ((Mantissa & 0x0400) == 0); + + Mantissa &= 0x03FF; + } + else // The value is zero + { + Exponent = static_cast(-112); + } + + uint32_t Result = + ((static_cast(Value) & 0x8000) << 16) // Sign + | ((Exponent + 112) << 23) // Exponent + | (Mantissa << 13); // Mantissa + + return reinterpret_cast(&Result)[0]; +#endif // !_XM_F16C_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +#ifdef _PREFAST_ +#pragma prefast(push) +#pragma prefast(disable : 26015 26019, "PREfast noise: Esp:1307" ) +#endif + +_Use_decl_annotations_ +inline float* XMConvertHalfToFloatStream +( + float* pOutputStream, + size_t OutputStride, + const HALF* pInputStream, + size_t InputStride, + size_t HalfCount +) noexcept +{ + assert(pOutputStream); + assert(pInputStream); + + assert(InputStride >= sizeof(HALF)); + _Analysis_assume_(InputStride >= sizeof(HALF)); + + assert(OutputStride >= sizeof(float)); + _Analysis_assume_(OutputStride >= sizeof(float)); + +#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + auto pHalf = reinterpret_cast(pInputStream); + auto pFloat = reinterpret_cast(pOutputStream); + + size_t i = 0; + size_t four = HalfCount >> 2; + if (four > 0) + { + if (InputStride == sizeof(HALF)) + { + if (OutputStride == sizeof(float)) + { + if ((reinterpret_cast(pFloat) & 0xF) == 0) + { + // Packed input, aligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m128i HV = _mm_loadl_epi64(reinterpret_cast(pHalf)); + pHalf += InputStride * 4; + + __m128 FV = _mm_cvtph_ps(HV); + + XM_STREAM_PS(reinterpret_cast(pFloat), FV); + pFloat += OutputStride * 4; + i += 4; + } + } + else + { + // Packed input, packed output + for (size_t j = 0; j < four; ++j) + { + __m128i HV = _mm_loadl_epi64(reinterpret_cast(pHalf)); + pHalf += InputStride * 4; + + __m128 FV = _mm_cvtph_ps(HV); + + _mm_storeu_ps(reinterpret_cast(pFloat), FV); + pFloat += OutputStride * 4; + i += 4; + } + } + } + else + { + // Packed input, scattered output + for (size_t j = 0; j < four; ++j) + { + __m128i HV = 
_mm_loadl_epi64(reinterpret_cast(pHalf)); + pHalf += InputStride * 4; + + __m128 FV = _mm_cvtph_ps(HV); + + _mm_store_ss(reinterpret_cast(pFloat), FV); + pFloat += OutputStride; + *reinterpret_cast(pFloat) = _mm_extract_ps(FV, 1); + pFloat += OutputStride; + *reinterpret_cast(pFloat) = _mm_extract_ps(FV, 2); + pFloat += OutputStride; + *reinterpret_cast(pFloat) = _mm_extract_ps(FV, 3); + pFloat += OutputStride; + i += 4; + } + } + } + else if (OutputStride == sizeof(float)) + { + if ((reinterpret_cast(pFloat) & 0xF) == 0) + { + // Scattered input, aligned & packed output + for (size_t j = 0; j < four; ++j) + { + uint16_t H1 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H2 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H3 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H4 = *reinterpret_cast(pHalf); + pHalf += InputStride; + + __m128i HV = _mm_setzero_si128(); + HV = _mm_insert_epi16(HV, H1, 0); + HV = _mm_insert_epi16(HV, H2, 1); + HV = _mm_insert_epi16(HV, H3, 2); + HV = _mm_insert_epi16(HV, H4, 3); + __m128 FV = _mm_cvtph_ps(HV); + + XM_STREAM_PS(reinterpret_cast(pFloat), FV); + pFloat += OutputStride * 4; + i += 4; + } + } + else + { + // Scattered input, packed output + for (size_t j = 0; j < four; ++j) + { + uint16_t H1 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H2 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H3 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H4 = *reinterpret_cast(pHalf); + pHalf += InputStride; + + __m128i HV = _mm_setzero_si128(); + HV = _mm_insert_epi16(HV, H1, 0); + HV = _mm_insert_epi16(HV, H2, 1); + HV = _mm_insert_epi16(HV, H3, 2); + HV = _mm_insert_epi16(HV, H4, 3); + __m128 FV = _mm_cvtph_ps(HV); + + _mm_storeu_ps(reinterpret_cast(pFloat), FV); + pFloat += OutputStride * 4; + i += 4; + } + } + } + else + { + // Scattered input, scattered output + for (size_t j = 0; j < four; ++j) + { + uint16_t H1 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H2 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H3 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H4 = *reinterpret_cast(pHalf); + pHalf += InputStride; + + __m128i HV = _mm_setzero_si128(); + HV = _mm_insert_epi16(HV, H1, 0); + HV = _mm_insert_epi16(HV, H2, 1); + HV = _mm_insert_epi16(HV, H3, 2); + HV = _mm_insert_epi16(HV, H4, 3); + __m128 FV = _mm_cvtph_ps(HV); + + _mm_store_ss(reinterpret_cast(pFloat), FV); + pFloat += OutputStride; + *reinterpret_cast(pFloat) = _mm_extract_ps(FV, 1); + pFloat += OutputStride; + *reinterpret_cast(pFloat) = _mm_extract_ps(FV, 2); + pFloat += OutputStride; + *reinterpret_cast(pFloat) = _mm_extract_ps(FV, 3); + pFloat += OutputStride; + i += 4; + } + } + } + + for (; i < HalfCount; ++i) + { + *reinterpret_cast(pFloat) = XMConvertHalfToFloat(reinterpret_cast(pHalf)[0]); + pHalf += InputStride; + pFloat += OutputStride; + } + + XM_SFENCE(); + + return pOutputStream; +#elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) ||__aarch64__) && !defined(_XM_NO_INTRINSICS_) && (!defined(__GNUC__) || (__ARM_FP & 2)) + auto pHalf = reinterpret_cast(pInputStream); + auto pFloat = reinterpret_cast(pOutputStream); + + size_t i = 0; + size_t four = HalfCount >> 2; + if (four > 0) + { + if (InputStride == sizeof(HALF)) + { + if (OutputStride == sizeof(float)) + { + // Packed input, packed output + for (size_t j = 0; j < four; ++j) + { + uint16x4_t vHalf = vld1_u16(reinterpret_cast(pHalf)); + 
pHalf += InputStride * 4; + + float32x4_t vFloat = vcvt_f32_f16(vreinterpret_f16_u16(vHalf)); + + vst1q_f32(reinterpret_cast(pFloat), vFloat); + pFloat += OutputStride * 4; + i += 4; + } + } + else + { + // Packed input, scattered output + for (size_t j = 0; j < four; ++j) + { + uint16x4_t vHalf = vld1_u16(reinterpret_cast(pHalf)); + pHalf += InputStride * 4; + + float32x4_t vFloat = vcvt_f32_f16(vreinterpret_f16_u16(vHalf)); + + vst1q_lane_f32(reinterpret_cast(pFloat), vFloat, 0); + pFloat += OutputStride; + vst1q_lane_f32(reinterpret_cast(pFloat), vFloat, 1); + pFloat += OutputStride; + vst1q_lane_f32(reinterpret_cast(pFloat), vFloat, 2); + pFloat += OutputStride; + vst1q_lane_f32(reinterpret_cast(pFloat), vFloat, 3); + pFloat += OutputStride; + i += 4; + } + } + } + else if (OutputStride == sizeof(float)) + { + // Scattered input, packed output + for (size_t j = 0; j < four; ++j) + { + uint16_t H1 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H2 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H3 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H4 = *reinterpret_cast(pHalf); + pHalf += InputStride; + + uint64_t iHalf = uint64_t(H1) | (uint64_t(H2) << 16) | (uint64_t(H3) << 32) | (uint64_t(H4) << 48); + uint16x4_t vHalf = vcreate_u16(iHalf); + + float32x4_t vFloat = vcvt_f32_f16(vreinterpret_f16_u16(vHalf)); + + vst1q_f32(reinterpret_cast(pFloat), vFloat); + pFloat += OutputStride * 4; + i += 4; + } + } + else + { + // Scattered input, scattered output + for (size_t j = 0; j < four; ++j) + { + uint16_t H1 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H2 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H3 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H4 = *reinterpret_cast(pHalf); + pHalf += InputStride; + + uint64_t iHalf = uint64_t(H1) | (uint64_t(H2) << 16) | (uint64_t(H3) << 32) | (uint64_t(H4) << 48); + uint16x4_t vHalf = vcreate_u16(iHalf); + + float32x4_t vFloat = vcvt_f32_f16(vreinterpret_f16_u16(vHalf)); + + vst1q_lane_f32(reinterpret_cast(pFloat), vFloat, 0); + pFloat += OutputStride; + vst1q_lane_f32(reinterpret_cast(pFloat), vFloat, 1); + pFloat += OutputStride; + vst1q_lane_f32(reinterpret_cast(pFloat), vFloat, 2); + pFloat += OutputStride; + vst1q_lane_f32(reinterpret_cast(pFloat), vFloat, 3); + pFloat += OutputStride; + i += 4; + } + } + } + + for (; i < HalfCount; ++i) + { + *reinterpret_cast(pFloat) = XMConvertHalfToFloat(reinterpret_cast(pHalf)[0]); + pHalf += InputStride; + pFloat += OutputStride; + } + + return pOutputStream; +#else + auto pHalf = reinterpret_cast(pInputStream); + auto pFloat = reinterpret_cast(pOutputStream); + + for (size_t i = 0; i < HalfCount; i++) + { + *reinterpret_cast(pFloat) = XMConvertHalfToFloat(reinterpret_cast(pHalf)[0]); + pHalf += InputStride; + pFloat += OutputStride; + } + + return pOutputStream; +#endif // !_XM_F16C_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline HALF XMConvertFloatToHalf(float Value) noexcept +{ +#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + __m128 V1 = _mm_set_ss(Value); + __m128i V2 = _mm_cvtps_ph(V1, _MM_FROUND_TO_NEAREST_INT); + return static_cast(_mm_extract_epi16(V2, 0)); +#elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__) && !defined(_XM_NO_INTRINSICS_) && (!defined(__GNUC__) || (__ARM_FP & 2)) + float32x4_t vFloat = vdupq_n_f32(Value); + float16x4_t vHalf 
= vcvt_f16_f32(vFloat); + return vget_lane_u16(vreinterpret_u16_f16(vHalf), 0); +#else + uint32_t Result; + + auto IValue = reinterpret_cast(&Value)[0]; + uint32_t Sign = (IValue & 0x80000000U) >> 16U; + IValue = IValue & 0x7FFFFFFFU; // Hack off the sign + if (IValue >= 0x47800000 /*e+16*/) + { + // The number is too large to be represented as a half. Return infinity or NaN + Result = 0x7C00U | ((IValue > 0x7F800000) ? (0x200 | ((IValue >> 13U) & 0x3FFU)) : 0U); + } + else if (IValue <= 0x33000000U /*e-25*/) + { + Result = 0; + } + else if (IValue < 0x38800000U /*e-14*/) + { + // The number is too small to be represented as a normalized half. + // Convert it to a denormalized value. + uint32_t Shift = 125U - (IValue >> 23U); + IValue = 0x800000U | (IValue & 0x7FFFFFU); + Result = IValue >> (Shift + 1); + uint32_t s = (IValue & ((1U << Shift) - 1)) != 0; + Result += (Result | s) & ((IValue >> Shift) & 1U); + } + else + { + // Rebias the exponent to represent the value as a normalized half. + IValue += 0xC8000000U; + Result = ((IValue + 0x0FFFU + ((IValue >> 13U) & 1U)) >> 13U) & 0x7FFFU; + } + return static_cast(Result | Sign); +#endif // !_XM_F16C_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline HALF* XMConvertFloatToHalfStream +( + HALF* pOutputStream, + size_t OutputStride, + const float* pInputStream, + size_t InputStride, + size_t FloatCount +) noexcept +{ + assert(pOutputStream); + assert(pInputStream); + + assert(InputStride >= sizeof(float)); + _Analysis_assume_(InputStride >= sizeof(float)); + + assert(OutputStride >= sizeof(HALF)); + _Analysis_assume_(OutputStride >= sizeof(HALF)); + +#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + auto pFloat = reinterpret_cast(pInputStream); + auto pHalf = reinterpret_cast(pOutputStream); + + size_t i = 0; + size_t four = FloatCount >> 2; + if (four > 0) + { + if (InputStride == sizeof(float)) + { + if (OutputStride == sizeof(HALF)) + { + if ((reinterpret_cast(pFloat) & 0xF) == 0) + { + // Aligned and packed input, packed output + for (size_t j = 0; j < four; ++j) + { + __m128 FV = _mm_load_ps(reinterpret_cast(pFloat)); + pFloat += InputStride * 4; + + __m128i HV = _mm_cvtps_ph(FV, _MM_FROUND_TO_NEAREST_INT); + + _mm_storel_epi64(reinterpret_cast<__m128i*>(pHalf), HV); + pHalf += OutputStride * 4; + i += 4; + } + } + else + { + // Packed input, packed output + for (size_t j = 0; j < four; ++j) + { + __m128 FV = _mm_loadu_ps(reinterpret_cast(pFloat)); + pFloat += InputStride * 4; + + __m128i HV = _mm_cvtps_ph(FV, _MM_FROUND_TO_NEAREST_INT); + + _mm_storel_epi64(reinterpret_cast<__m128i*>(pHalf), HV); + pHalf += OutputStride * 4; + i += 4; + } + } + } + else + { + if ((reinterpret_cast(pFloat) & 0xF) == 0) + { + // Aligned & packed input, scattered output + for (size_t j = 0; j < four; ++j) + { + __m128 FV = _mm_load_ps(reinterpret_cast(pFloat)); + pFloat += InputStride * 4; + + __m128i HV = _mm_cvtps_ph(FV, _MM_FROUND_TO_NEAREST_INT); + + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 0)); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 1)); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 2)); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 3)); + pHalf += OutputStride; + i += 4; + } + } + else + { + // Packed input, scattered output + for (size_t j = 0; j < four; ++j) + { + __m128 FV = 
_mm_loadu_ps(reinterpret_cast(pFloat)); + pFloat += InputStride * 4; + + __m128i HV = _mm_cvtps_ph(FV, _MM_FROUND_TO_NEAREST_INT); + + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 0)); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 1)); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 2)); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 3)); + pHalf += OutputStride; + i += 4; + } + } + } + } + else if (OutputStride == sizeof(HALF)) + { + // Scattered input, packed output + for (size_t j = 0; j < four; ++j) + { + __m128 FV1 = _mm_load_ss(reinterpret_cast(pFloat)); + pFloat += InputStride; + + __m128 FV2 = _mm_broadcast_ss(reinterpret_cast(pFloat)); + pFloat += InputStride; + + __m128 FV3 = _mm_broadcast_ss(reinterpret_cast(pFloat)); + pFloat += InputStride; + + __m128 FV4 = _mm_broadcast_ss(reinterpret_cast(pFloat)); + pFloat += InputStride; + + __m128 FV = _mm_blend_ps(FV1, FV2, 0x2); + __m128 FT = _mm_blend_ps(FV3, FV4, 0x8); + FV = _mm_blend_ps(FV, FT, 0xC); + + __m128i HV = _mm_cvtps_ph(FV, _MM_FROUND_TO_NEAREST_INT); + + _mm_storel_epi64(reinterpret_cast<__m128i*>(pHalf), HV); + pHalf += OutputStride * 4; + i += 4; + } + } + else + { + // Scattered input, scattered output + for (size_t j = 0; j < four; ++j) + { + __m128 FV1 = _mm_load_ss(reinterpret_cast(pFloat)); + pFloat += InputStride; + + __m128 FV2 = _mm_broadcast_ss(reinterpret_cast(pFloat)); + pFloat += InputStride; + + __m128 FV3 = _mm_broadcast_ss(reinterpret_cast(pFloat)); + pFloat += InputStride; + + __m128 FV4 = _mm_broadcast_ss(reinterpret_cast(pFloat)); + pFloat += InputStride; + + __m128 FV = _mm_blend_ps(FV1, FV2, 0x2); + __m128 FT = _mm_blend_ps(FV3, FV4, 0x8); + FV = _mm_blend_ps(FV, FT, 0xC); + + __m128i HV = _mm_cvtps_ph(FV, _MM_FROUND_TO_NEAREST_INT); + + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 0)); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 1)); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 2)); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 3)); + pHalf += OutputStride; + i += 4; + } + } + } + + for (; i < FloatCount; ++i) + { + *reinterpret_cast(pHalf) = XMConvertFloatToHalf(reinterpret_cast(pFloat)[0]); + pFloat += InputStride; + pHalf += OutputStride; + } + + return pOutputStream; +#elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__) && !defined(_XM_NO_INTRINSICS_) && (!defined(__GNUC__) || (__ARM_FP & 2)) + auto pFloat = reinterpret_cast(pInputStream); + auto pHalf = reinterpret_cast(pOutputStream); + + size_t i = 0; + size_t four = FloatCount >> 2; + if (four > 0) + { + if (InputStride == sizeof(float)) + { + if (OutputStride == sizeof(HALF)) + { + // Packed input, packed output + for (size_t j = 0; j < four; ++j) + { + float32x4_t vFloat = vld1q_f32(reinterpret_cast(pFloat)); + pFloat += InputStride * 4; + + uint16x4_t vHalf = vreinterpret_u16_f16(vcvt_f16_f32(vFloat)); + + vst1_u16(reinterpret_cast(pHalf), vHalf); + pHalf += OutputStride * 4; + i += 4; + } + } + else + { + // Packed input, scattered output + for (size_t j = 0; j < four; ++j) + { + float32x4_t vFloat = vld1q_f32(reinterpret_cast(pFloat)); + pFloat += InputStride * 4; + + uint16x4_t vHalf = vreinterpret_u16_f16(vcvt_f16_f32(vFloat)); + + vst1_lane_u16(reinterpret_cast(pHalf), vHalf, 
0); + pHalf += OutputStride; + vst1_lane_u16(reinterpret_cast(pHalf), vHalf, 1); + pHalf += OutputStride; + vst1_lane_u16(reinterpret_cast(pHalf), vHalf, 2); + pHalf += OutputStride; + vst1_lane_u16(reinterpret_cast(pHalf), vHalf, 3); + pHalf += OutputStride; + i += 4; + } + } + } + else if (OutputStride == sizeof(HALF)) + { + // Scattered input, packed output + for (size_t j = 0; j < four; ++j) + { + float32x4_t vFloat = vdupq_n_f32(0); + vFloat = vld1q_lane_f32(reinterpret_cast(pFloat), vFloat, 0); + pFloat += InputStride; + + vFloat = vld1q_lane_f32(reinterpret_cast(pFloat), vFloat, 1); + pFloat += InputStride; + + vFloat = vld1q_lane_f32(reinterpret_cast(pFloat), vFloat, 2); + pFloat += InputStride; + + vFloat = vld1q_lane_f32(reinterpret_cast(pFloat), vFloat, 3); + pFloat += InputStride; + + uint16x4_t vHalf = vreinterpret_u16_f16(vcvt_f16_f32(vFloat)); + + vst1_u16(reinterpret_cast(pHalf), vHalf); + pHalf += OutputStride * 4; + i += 4; + } + } + else + { + // Scattered input, scattered output + for (size_t j = 0; j < four; ++j) + { + float32x4_t vFloat = vdupq_n_f32(0); + vFloat = vld1q_lane_f32(reinterpret_cast(pFloat), vFloat, 0); + pFloat += InputStride; + + vFloat = vld1q_lane_f32(reinterpret_cast(pFloat), vFloat, 1); + pFloat += InputStride; + + vFloat = vld1q_lane_f32(reinterpret_cast(pFloat), vFloat, 2); + pFloat += InputStride; + + vFloat = vld1q_lane_f32(reinterpret_cast(pFloat), vFloat, 3); + pFloat += InputStride; + + uint16x4_t vHalf = vreinterpret_u16_f16(vcvt_f16_f32(vFloat)); + + vst1_lane_u16(reinterpret_cast(pHalf), vHalf, 0); + pHalf += OutputStride; + vst1_lane_u16(reinterpret_cast(pHalf), vHalf, 1); + pHalf += OutputStride; + vst1_lane_u16(reinterpret_cast(pHalf), vHalf, 2); + pHalf += OutputStride; + vst1_lane_u16(reinterpret_cast(pHalf), vHalf, 3); + pHalf += OutputStride; + i += 4; + } + } + } + + for (; i < FloatCount; ++i) + { + *reinterpret_cast(pHalf) = XMConvertFloatToHalf(reinterpret_cast(pFloat)[0]); + pFloat += InputStride; + pHalf += OutputStride; + } + + return pOutputStream; +#else + auto pFloat = reinterpret_cast(pInputStream); + auto pHalf = reinterpret_cast(pOutputStream); + + for (size_t i = 0; i < FloatCount; i++) + { + *reinterpret_cast(pHalf) = XMConvertFloatToHalf(reinterpret_cast(pFloat)[0]); + pFloat += InputStride; + pHalf += OutputStride; + } + return pOutputStream; +#endif // !_XM_F16C_INTRINSICS_ +} + +#ifdef _PREFAST_ +#pragma prefast(pop) +#endif + +/**************************************************************************** + * + * Vector and matrix load operations + * + ****************************************************************************/ + +#ifdef _PREFAST_ +#pragma prefast(push) +#pragma prefast(disable:28931, "PREfast noise: Esp:1266") +#endif + +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadColor(const XMCOLOR* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + // int32_t -> Float conversions are done in one instruction. + // uint32_t -> Float calls a runtime function. 
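+    // (Illustrative note: x86 has no single unsigned-32-bit-to-float instruction
+    // before AVX-512, so uint32_t -> float costs extra fix-up code or, on older
+    // 32-bit toolchains, a helper call, while int32_t -> float is one instruction.
+    // Each masked channel here fits in 8 bits, so reading the packed color
+    // through int32_t is lossless and keeps the cheap conversion.)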
Keep in int32_t + auto iColor = static_cast(pSource->c); + XMVECTORF32 vColor = { { { + static_cast((iColor >> 16) & 0xFF)* (1.0f / 255.0f), + static_cast((iColor >> 8) & 0xFF)* (1.0f / 255.0f), + static_cast(iColor & 0xFF)* (1.0f / 255.0f), + static_cast((iColor >> 24) & 0xFF)* (1.0f / 255.0f) + } } }; + return vColor.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32_t bgra = pSource->c; + uint32_t rgba = (bgra & 0xFF00FF00) | ((bgra >> 16) & 0xFF) | ((bgra << 16) & 0xFF0000); + uint32x2_t vInt8 = vdup_n_u32(rgba); + uint16x8_t vInt16 = vmovl_u8(vreinterpret_u8_u32(vInt8)); + uint32x4_t vInt = vmovl_u16(vget_low_u16(vInt16)); + float32x4_t R = vcvtq_f32_u32(vInt); + return vmulq_n_f32(R, 1.0f / 255.0f); +#elif defined(_XM_SSE_INTRINSICS_) + // Splat the color in all four entries + __m128i vInt = _mm_set1_epi32(static_cast(pSource->c)); + // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000 + vInt = _mm_and_si128(vInt, g_XMMaskA8R8G8B8); + // a is unsigned! Flip the bit to convert the order to signed + vInt = _mm_xor_si128(vInt, g_XMFlipA8R8G8B8); + // Convert to floating point numbers + XMVECTOR vTemp = _mm_cvtepi32_ps(vInt); + // RGB + 0, A + 0x80000000.f to undo the signed order. + vTemp = _mm_add_ps(vTemp, g_XMFixAA8R8G8B8); + // Convert 0-255 to 0.0f-1.0f + return _mm_mul_ps(vTemp, g_XMNormalizeA8R8G8B8); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadHalf2(const XMHALF2* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + __m128 V = _mm_load_ss(reinterpret_cast(pSource)); + return _mm_cvtph_ps(_mm_castps_si128(V)); +#else + XMVECTORF32 vResult = { { { + XMConvertHalfToFloat(pSource->x), + XMConvertHalfToFloat(pSource->y), + 0.0f, + 0.0f + } } }; + return vResult.v; +#endif // !_XM_F16C_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadShortN2(const XMSHORTN2* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { { { + (pSource->x == -32768) ? -1.f : (static_cast(pSource->x)* (1.0f / 32767.0f)), + (pSource->y == -32768) ? -1.f : (static_cast(pSource->y)* (1.0f / 32767.0f)), + 0.0f, + 0.0f + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vInt16 = vld1_dup_u32(reinterpret_cast(pSource)); + int32x4_t vInt = vmovl_s16(vreinterpret_s16_u32(vInt16)); + vInt = vandq_s32(vInt, g_XMMaskXY); + float32x4_t R = vcvtq_f32_s32(vInt); + R = vmulq_n_f32(R, 1.0f / 32767.0f); + return vmaxq_f32(R, vdupq_n_f32(-1.f)); +#elif defined(_XM_SSE_INTRINSICS_) + // Splat the two shorts in all four entries (WORD alignment okay, + // DWORD alignment preferred) + __m128 vTemp = _mm_load_ps1(reinterpret_cast(&pSource->x)); + // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0 + vTemp = _mm_and_ps(vTemp, g_XMMaskX16Y16); + // x needs to be sign extended + vTemp = _mm_xor_ps(vTemp, g_XMFlipX16Y16); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // x - 0x8000 to undo the signed order. 
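+    // (Illustrative note on the pattern used by these SSE2 loaders: the packed
+    // signed field is XOR'd with its sign bit so it becomes a biased unsigned
+    // value, converted with the cheap signed cvtepi32 path, then the bias is
+    // removed by the add below and the result scaled to [-1, 1].  A rough scalar
+    // sketch of the x lane, names hypothetical:
+    //
+    //     float LoadShortNLaneX(uint16_t raw)
+    //     {
+    //         uint32_t biased = uint32_t(raw) ^ 0x8000u;    // excess-32768
+    //         float f = float(int32_t(biased)) - 32768.0f;  // undo the bias
+    //         return f * (1.0f / 32767.0f);                 // -32768 is clamped to -1 later
+    //     }
+    //
+    // e.g. raw = 0x7FFF -> 0xFFFF -> 65535.0f - 32768.0f = 32767.0f -> 1.0f.)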
+ vTemp = _mm_add_ps(vTemp, g_XMFixX16Y16); + // Convert -1.0f - 1.0f + vTemp = _mm_mul_ps(vTemp, g_XMNormalizeX16Y16); + // Clamp result (for case of -32768) + return _mm_max_ps(vTemp, g_XMNegativeOne); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadShort2(const XMSHORT2* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { { { + static_cast(pSource->x), + static_cast(pSource->y), + 0.f, + 0.f + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vInt16 = vld1_dup_u32(reinterpret_cast(pSource)); + int32x4_t vInt = vmovl_s16(vreinterpret_s16_u32(vInt16)); + vInt = vandq_s32(vInt, g_XMMaskXY); + return vcvtq_f32_s32(vInt); +#elif defined(_XM_SSE_INTRINSICS_) + // Splat the two shorts in all four entries (WORD alignment okay, + // DWORD alignment preferred) + __m128 vTemp = _mm_load_ps1(reinterpret_cast(&pSource->x)); + // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0 + vTemp = _mm_and_ps(vTemp, g_XMMaskX16Y16); + // x needs to be sign extended + vTemp = _mm_xor_ps(vTemp, g_XMFlipX16Y16); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // x - 0x8000 to undo the signed order. + vTemp = _mm_add_ps(vTemp, g_XMFixX16Y16); + // Y is 65536 too large + return _mm_mul_ps(vTemp, g_XMFixupY16); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadUShortN2(const XMUSHORTN2* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { { { + static_cast(pSource->x) / 65535.0f, + static_cast(pSource->y) / 65535.0f, + 0.f, + 0.f + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vInt16 = vld1_dup_u32(reinterpret_cast(pSource)); + uint32x4_t vInt = vmovl_u16(vreinterpret_u16_u32(vInt16)); + vInt = vandq_u32(vInt, g_XMMaskXY); + float32x4_t R = vcvtq_f32_u32(vInt); + R = vmulq_n_f32(R, 1.0f / 65535.0f); + return vmaxq_f32(R, vdupq_n_f32(-1.f)); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 FixupY16 = { { { 1.0f / 65535.0f, 1.0f / (65535.0f * 65536.0f), 0.0f, 0.0f } } }; + static const XMVECTORF32 FixaddY16 = { { { 0, 32768.0f * 65536.0f, 0, 0 } } }; + // Splat the two shorts in all four entries (WORD alignment okay, + // DWORD alignment preferred) + __m128 vTemp = _mm_load_ps1(reinterpret_cast(&pSource->x)); + // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0 + vTemp = _mm_and_ps(vTemp, g_XMMaskX16Y16); + // y needs to be sign flipped + vTemp = _mm_xor_ps(vTemp, g_XMFlipY); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // y + 0x8000 to undo the signed order. 
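+    // (Illustrative note: y sits in bits 16-31 of the dword, so after the masked
+    // load the lane really holds y * 65536.  The XOR above flips bit 31 so the
+    // value survives the signed cvtepi32 conversion; the add below restores that
+    // 0x80000000 (= 32768 * 65536), and FixupY16 folds the 1/65536 positional
+    // factor into the 1/65535 normalization.  For example y = 32768: dword
+    // 0x80000000 -> flipped to 0 -> 0.0f -> + 2147483648.0f ->
+    // * 1/(65535*65536) -> 32768/65535, roughly 0.5.)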
+ vTemp = _mm_add_ps(vTemp, FixaddY16); + // Y is 65536 times too large + vTemp = _mm_mul_ps(vTemp, FixupY16); + return vTemp; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadUShort2(const XMUSHORT2* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { { { + static_cast(pSource->x), + static_cast(pSource->y), + 0.f, + 0.f + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vInt16 = vld1_dup_u32(reinterpret_cast(pSource)); + uint32x4_t vInt = vmovl_u16(vreinterpret_u16_u32(vInt16)); + vInt = vandq_u32(vInt, g_XMMaskXY); + return vcvtq_f32_u32(vInt); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 FixaddY16 = { { { 0, 32768.0f, 0, 0 } } }; + // Splat the two shorts in all four entries (WORD alignment okay, + // DWORD alignment preferred) + __m128 vTemp = _mm_load_ps1(reinterpret_cast(&pSource->x)); + // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0 + vTemp = _mm_and_ps(vTemp, g_XMMaskX16Y16); + // y needs to be sign flipped + vTemp = _mm_xor_ps(vTemp, g_XMFlipY); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // Y is 65536 times too large + vTemp = _mm_mul_ps(vTemp, g_XMFixupY16); + // y + 0x8000 to undo the signed order. + vTemp = _mm_add_ps(vTemp, FixaddY16); + return vTemp; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadByteN2(const XMBYTEN2* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { { { + (pSource->x == -128) ? -1.f : (static_cast(pSource->x)* (1.0f / 127.0f)), + (pSource->y == -128) ? -1.f : (static_cast(pSource->y)* (1.0f / 127.0f)), + 0.0f, + 0.0f + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint16x4_t vInt8 = vld1_dup_u16(reinterpret_cast(pSource)); + int16x8_t vInt16 = vmovl_s8(vreinterpret_s8_u16(vInt8)); + int32x4_t vInt = vmovl_s16(vget_low_s16(vInt16)); + vInt = vandq_s32(vInt, g_XMMaskXY); + float32x4_t R = vcvtq_f32_s32(vInt); + R = vmulq_n_f32(R, 1.0f / 127.0f); + return vmaxq_f32(R, vdupq_n_f32(-1.f)); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Scale = { { { 1.0f / 127.0f, 1.0f / (127.0f * 256.0f), 0, 0 } } }; + static const XMVECTORU32 Mask = { { { 0xFF, 0xFF00, 0, 0 } } }; + // Splat the color in all four entries (x,z,y,w) + __m128i vInt = XM_LOADU_SI16(&pSource->v); + XMVECTOR vTemp = XM_PERMUTE_PS(_mm_castsi128_ps(vInt), _MM_SHUFFLE(0, 0, 0, 0)); + // Mask + vTemp = _mm_and_ps(vTemp, Mask); + // x,y and z are unsigned! 
Flip the bits to convert the order to signed + vTemp = _mm_xor_ps(vTemp, g_XMXorByte4); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // x, y and z - 0x80 to complete the conversion + vTemp = _mm_add_ps(vTemp, g_XMAddByte4); + // Fix y, z and w because they are too large + vTemp = _mm_mul_ps(vTemp, Scale); + // Clamp result (for case of -128) + return _mm_max_ps(vTemp, g_XMNegativeOne); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadByte2(const XMBYTE2* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { { { + static_cast(pSource->x), + static_cast(pSource->y), + 0.0f, + 0.0f + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint16x4_t vInt8 = vld1_dup_u16(reinterpret_cast(pSource)); + int16x8_t vInt16 = vmovl_s8(vreinterpret_s8_u16(vInt8)); + int32x4_t vInt = vmovl_s16(vget_low_s16(vInt16)); + vInt = vandq_s32(vInt, g_XMMaskXY); + return vcvtq_f32_s32(vInt); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Scale = { { { 1.0f, 1.0f / 256.0f, 1.0f / 65536.0f, 1.0f / (65536.0f * 256.0f) } } }; + static const XMVECTORU32 Mask = { { { 0xFF, 0xFF00, 0, 0 } } }; + // Splat the color in all four entries (x,z,y,w) + __m128i vInt = XM_LOADU_SI16(&pSource->v); + XMVECTOR vTemp = XM_PERMUTE_PS(_mm_castsi128_ps(vInt), _MM_SHUFFLE(0, 0, 0, 0)); + // Mask + vTemp = _mm_and_ps(vTemp, Mask); + // x,y and z are unsigned! Flip the bits to convert the order to signed + vTemp = _mm_xor_ps(vTemp, g_XMXorByte4); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // x, y and z - 0x80 to complete the conversion + vTemp = _mm_add_ps(vTemp, g_XMAddByte4); + // Fix y, z and w because they are too large + return _mm_mul_ps(vTemp, Scale); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadUByteN2(const XMUBYTEN2* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { { { + static_cast(pSource->x)* (1.0f / 255.0f), + static_cast(pSource->y)* (1.0f / 255.0f), + 0.0f, + 0.0f + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint16x4_t vInt8 = vld1_dup_u16(reinterpret_cast(pSource)); + uint16x8_t vInt16 = vmovl_u8(vreinterpret_u8_u16(vInt8)); + uint32x4_t vInt = vmovl_u16(vget_low_u16(vInt16)); + vInt = vandq_u32(vInt, g_XMMaskXY); + float32x4_t R = vcvtq_f32_u32(vInt); + return vmulq_n_f32(R, 1.0f / 255.0f); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Scale = { { { 1.0f / 255.0f, 1.0f / (255.0f * 256.0f), 0, 0 } } }; + static const XMVECTORU32 Mask = { { { 0xFF, 0xFF00, 0, 0 } } }; + // Splat the color in all four entries (x,z,y,w) + __m128i vInt = XM_LOADU_SI16(&pSource->v); + XMVECTOR vTemp = XM_PERMUTE_PS(_mm_castsi128_ps(vInt), _MM_SHUFFLE(0, 0, 0, 0)); + // Mask + vTemp = _mm_and_ps(vTemp, Mask); + // w is signed! 
Flip the bits to convert the order to unsigned + vTemp = _mm_xor_ps(vTemp, g_XMFlipW); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // w + 0x80 to complete the conversion + vTemp = _mm_add_ps(vTemp, g_XMAddUDec4); + // Fix y, z and w because they are too large + return _mm_mul_ps(vTemp, Scale); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadUByte2(const XMUBYTE2* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { { { + static_cast(pSource->x), + static_cast(pSource->y), + 0.0f, + 0.0f + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint16x4_t vInt8 = vld1_dup_u16(reinterpret_cast(pSource)); + uint16x8_t vInt16 = vmovl_u8(vreinterpret_u8_u16(vInt8)); + uint32x4_t vInt = vmovl_u16(vget_low_u16(vInt16)); + vInt = vandq_u32(vInt, g_XMMaskXY); + return vcvtq_f32_u32(vInt); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Scale = { { { 1.0f, 1.0f / 256.0f, 0, 0 } } }; + static const XMVECTORU32 Mask = { { { 0xFF, 0xFF00, 0, 0 } } }; + // Splat the color in all four entries (x,z,y,w) + __m128i vInt = XM_LOADU_SI16(&pSource->v); + XMVECTOR vTemp = XM_PERMUTE_PS(_mm_castsi128_ps(vInt), _MM_SHUFFLE(0, 0, 0, 0)); + // Mask + vTemp = _mm_and_ps(vTemp, Mask); + // w is signed! Flip the bits to convert the order to unsigned + vTemp = _mm_xor_ps(vTemp, g_XMFlipW); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // w + 0x80 to complete the conversion + vTemp = _mm_add_ps(vTemp, g_XMAddUDec4); + // Fix y, z and w because they are too large + return _mm_mul_ps(vTemp, Scale); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadU565(const XMU565* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { { { + float(pSource->v & 0x1F), + float((pSource->v >> 5) & 0x3F), + float((pSource->v >> 11) & 0x1F), + 0.f, + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORI32 U565And = { { { 0x1F, 0x3F << 5, 0x1F << 11, 0 } } }; + static const XMVECTORF32 U565Mul = { { { 1.0f, 1.0f / 32.0f, 1.0f / 2048.f, 0 } } }; + uint16x4_t vInt16 = vld1_dup_u16(reinterpret_cast(pSource)); + uint32x4_t vInt = vmovl_u16(vInt16); + vInt = vandq_u32(vInt, U565And); + float32x4_t R = vcvtq_f32_u32(vInt); + return vmulq_f32(R, U565Mul); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORI32 U565And = { { { 0x1F, 0x3F << 5, 0x1F << 11, 0 } } }; + static const XMVECTORF32 U565Mul = { { { 1.0f, 1.0f / 32.0f, 1.0f / 2048.f, 0 } } }; + // Get the 16 bit value and splat it + __m128i vInt = XM_LOADU_SI16(&pSource->v); + XMVECTOR vResult = XM_PERMUTE_PS(_mm_castsi128_ps(vInt), _MM_SHUFFLE(0, 0, 0, 0)); + // Mask off x, y and z + vResult = _mm_and_ps(vResult, U565And); + // Convert to float + vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult)); + // Normalize x, y, and z + vResult = _mm_mul_ps(vResult, U565Mul); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadFloat3PK(const XMFLOAT3PK* pSource) noexcept +{ + assert(pSource); + + XM_ALIGNED_DATA(16) uint32_t Result[4]; + uint32_t Mantissa; + uint32_t Exponent; + + // X Channel (6-bit mantissa) + Mantissa = 
pSource->xm; + + if (pSource->xe == 0x1f) // INF or NAN + { + Result[0] = static_cast(0x7f800000 | (static_cast(pSource->xm) << 17)); + } + else + { + if (pSource->xe != 0) // The value is normalized + { + Exponent = pSource->xe; + } + else if (Mantissa != 0) // The value is denormalized + { + // Normalize the value in the resulting float + Exponent = 1; + + do + { + Exponent--; + Mantissa <<= 1; + } while ((Mantissa & 0x40) == 0); + + Mantissa &= 0x3F; + } + else // The value is zero + { + Exponent = static_cast(-112); + } + + Result[0] = ((Exponent + 112) << 23) | (Mantissa << 17); + } + + // Y Channel (6-bit mantissa) + Mantissa = pSource->ym; + + if (pSource->ye == 0x1f) // INF or NAN + { + Result[1] = static_cast(0x7f800000 | (static_cast(pSource->ym) << 17)); + } + else + { + if (pSource->ye != 0) // The value is normalized + { + Exponent = pSource->ye; + } + else if (Mantissa != 0) // The value is denormalized + { + // Normalize the value in the resulting float + Exponent = 1; + + do + { + Exponent--; + Mantissa <<= 1; + } while ((Mantissa & 0x40) == 0); + + Mantissa &= 0x3F; + } + else // The value is zero + { + Exponent = static_cast(-112); + } + + Result[1] = ((Exponent + 112) << 23) | (Mantissa << 17); + } + + // Z Channel (5-bit mantissa) + Mantissa = pSource->zm; + + if (pSource->ze == 0x1f) // INF or NAN + { + Result[2] = static_cast(0x7f800000 | (static_cast(pSource->zm) << 17)); + } + else + { + if (pSource->ze != 0) // The value is normalized + { + Exponent = pSource->ze; + } + else if (Mantissa != 0) // The value is denormalized + { + // Normalize the value in the resulting float + Exponent = 1; + + do + { + Exponent--; + Mantissa <<= 1; + } while ((Mantissa & 0x20) == 0); + + Mantissa &= 0x1F; + } + else // The value is zero + { + Exponent = static_cast(-112); + } + + Result[2] = ((Exponent + 112) << 23) | (Mantissa << 18); + } + + return XMLoadFloat3A(reinterpret_cast(&Result)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadFloat3SE(const XMFLOAT3SE* pSource) noexcept +{ + assert(pSource); + + union { float f; int32_t i; } fi; + fi.i = 0x33800000 + (pSource->e << 23); + float Scale = fi.f; + + XMVECTORF32 v = { { { + Scale * float(pSource->xm), + Scale * float(pSource->ym), + Scale * float(pSource->zm), + 1.0f } } }; + return v; +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadHalf4(const XMHALF4* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + __m128i V = _mm_loadl_epi64(reinterpret_cast(pSource)); + return _mm_cvtph_ps(V); +#else + XMVECTORF32 vResult = { { { + XMConvertHalfToFloat(pSource->x), + XMConvertHalfToFloat(pSource->y), + XMConvertHalfToFloat(pSource->z), + XMConvertHalfToFloat(pSource->w) + } } }; + return vResult.v; +#endif // !_XM_F16C_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadShortN4(const XMSHORTN4* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { { { + (pSource->x == -32768) ? -1.f : (static_cast(pSource->x)* (1.0f / 32767.0f)), + (pSource->y == -32768) ? -1.f : (static_cast(pSource->y)* (1.0f / 32767.0f)), + (pSource->z == -32768) ? -1.f : (static_cast(pSource->z)* (1.0f / 32767.0f)), + (pSource->w == -32768) ? 
-1.f : (static_cast(pSource->w)* (1.0f / 32767.0f)) + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + int16x4_t vInt = vld1_s16(reinterpret_cast(pSource)); + int32x4_t V = vmovl_s16(vInt); + float32x4_t vResult = vcvtq_f32_s32(V); + vResult = vmulq_n_f32(vResult, 1.0f / 32767.0f); + return vmaxq_f32(vResult, vdupq_n_f32(-1.f)); +#elif defined(_XM_SSE_INTRINSICS_) + // Splat the color in all four entries (x,z,y,w) + __m128d vIntd = _mm_load1_pd(reinterpret_cast(&pSource->x)); + // Shift x&0ffff,z&0xffff,y&0xffff0000,w&0xffff0000 + __m128 vTemp = _mm_and_ps(_mm_castpd_ps(vIntd), g_XMMaskX16Y16Z16W16); + // x and z are unsigned! Flip the bits to convert the order to signed + vTemp = _mm_xor_ps(vTemp, g_XMFlipX16Y16Z16W16); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // x and z - 0x8000 to complete the conversion + vTemp = _mm_add_ps(vTemp, g_XMFixX16Y16Z16W16); + // Convert to -1.0f - 1.0f + vTemp = _mm_mul_ps(vTemp, g_XMNormalizeX16Y16Z16W16); + // Very important! The entries are x,z,y,w, flip it to x,y,z,w + vTemp = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 1, 2, 0)); + // Clamp result (for case of -32768) + return _mm_max_ps(vTemp, g_XMNegativeOne); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadShort4(const XMSHORT4* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { { { + static_cast(pSource->x), + static_cast(pSource->y), + static_cast(pSource->z), + static_cast(pSource->w) + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + int16x4_t vInt = vld1_s16(reinterpret_cast(pSource)); + int32x4_t V = vmovl_s16(vInt); + return vcvtq_f32_s32(V); +#elif defined(_XM_SSE_INTRINSICS_) + // Splat the color in all four entries (x,z,y,w) + __m128d vIntd = _mm_load1_pd(reinterpret_cast(&pSource->x)); + // Shift x&0ffff,z&0xffff,y&0xffff0000,w&0xffff0000 + __m128 vTemp = _mm_and_ps(_mm_castpd_ps(vIntd), g_XMMaskX16Y16Z16W16); + // x and z are unsigned! Flip the bits to convert the order to signed + vTemp = _mm_xor_ps(vTemp, g_XMFlipX16Y16Z16W16); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // x and z - 0x8000 to complete the conversion + vTemp = _mm_add_ps(vTemp, g_XMFixX16Y16Z16W16); + // Fix y and w because they are 65536 too large + vTemp = _mm_mul_ps(vTemp, g_XMFixupY16W16); + // Very important! 
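+    // (Illustrative note: _mm_load1_pd above splats the whole 8-byte struct into
+    // both halves of the register, so the 32-bit lanes are (y<<16|x, w<<16|z,
+    // y<<16|x, w<<16|z).  The mask then leaves x in lane 0, z in lane 1,
+    // y*65536 in lane 2 and w*65536 in lane 3, which is why y and w get the
+    // extra 1/65536 scale and why the shuffle below swaps lanes 1 and 2 back
+    // into x,y,z,w order.)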
The entries are x,z,y,w, flip it to x,y,z,w + return XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 1, 2, 0)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadUShortN4(const XMUSHORTN4* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { { { + static_cast(pSource->x) / 65535.0f, + static_cast(pSource->y) / 65535.0f, + static_cast(pSource->z) / 65535.0f, + static_cast(pSource->w) / 65535.0f + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint16x4_t vInt = vld1_u16(reinterpret_cast(pSource)); + uint32x4_t V = vmovl_u16(vInt); + float32x4_t vResult = vcvtq_f32_u32(V); + return vmulq_n_f32(vResult, 1.0f / 65535.0f); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 FixupY16W16 = { { { 1.0f / 65535.0f, 1.0f / 65535.0f, 1.0f / (65535.0f * 65536.0f), 1.0f / (65535.0f * 65536.0f) } } }; + static const XMVECTORF32 FixaddY16W16 = { { { 0, 0, 32768.0f * 65536.0f, 32768.0f * 65536.0f } } }; + // Splat the color in all four entries (x,z,y,w) + __m128d vIntd = _mm_load1_pd(reinterpret_cast(&pSource->x)); + // Shift x&0ffff,z&0xffff,y&0xffff0000,w&0xffff0000 + __m128 vTemp = _mm_and_ps(_mm_castpd_ps(vIntd), g_XMMaskX16Y16Z16W16); + // y and w are signed! Flip the bits to convert the order to unsigned + vTemp = _mm_xor_ps(vTemp, g_XMFlipZW); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // y and w + 0x8000 to complete the conversion + vTemp = _mm_add_ps(vTemp, FixaddY16W16); + // Fix y and w because they are 65536 too large + vTemp = _mm_mul_ps(vTemp, FixupY16W16); + // Very important! The entries are x,z,y,w, flip it to x,y,z,w + return XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 1, 2, 0)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadUShort4(const XMUSHORT4* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { { { + static_cast(pSource->x), + static_cast(pSource->y), + static_cast(pSource->z), + static_cast(pSource->w) + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint16x4_t vInt = vld1_u16(reinterpret_cast(pSource)); + uint32x4_t V = vmovl_u16(vInt); + return vcvtq_f32_u32(V); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 FixaddY16W16 = { { { 0, 0, 32768.0f, 32768.0f } } }; + // Splat the color in all four entries (x,z,y,w) + __m128d vIntd = _mm_load1_pd(reinterpret_cast(&pSource->x)); + // Shift x&0ffff,z&0xffff,y&0xffff0000,w&0xffff0000 + __m128 vTemp = _mm_and_ps(_mm_castpd_ps(vIntd), g_XMMaskX16Y16Z16W16); + // y and w are signed! Flip the bits to convert the order to unsigned + vTemp = _mm_xor_ps(vTemp, g_XMFlipZW); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // Fix y and w because they are 65536 too large + vTemp = _mm_mul_ps(vTemp, g_XMFixupY16W16); + // y and w + 0x8000 to complete the conversion + vTemp = _mm_add_ps(vTemp, FixaddY16W16); + // Very important! 
The entries are x,z,y,w, flip it to x,y,z,w + return XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 1, 2, 0)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadXDecN4(const XMXDECN4* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + static const uint32_t SignExtend[] = { 0x00000000, 0xFFFFFC00 }; + + uint32_t ElementX = pSource->v & 0x3FF; + uint32_t ElementY = (pSource->v >> 10) & 0x3FF; + uint32_t ElementZ = (pSource->v >> 20) & 0x3FF; + + XMVECTORF32 vResult = { { { + (ElementX == 0x200) ? -1.f : (static_cast(static_cast(ElementX | SignExtend[ElementX >> 9])) / 511.0f), + (ElementY == 0x200) ? -1.f : (static_cast(static_cast(ElementY | SignExtend[ElementY >> 9])) / 511.0f), + (ElementZ == 0x200) ? -1.f : (static_cast(static_cast(ElementZ | SignExtend[ElementZ >> 9])) / 511.0f), + static_cast(pSource->v >> 30) / 3.0f + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vInt = vld1q_dup_u32(reinterpret_cast(pSource)); + vInt = vandq_u32(vInt, g_XMMaskA2B10G10R10); + vInt = veorq_u32(vInt, g_XMFlipA2B10G10R10); + float32x4_t R = vcvtq_f32_s32(vreinterpretq_s32_u32(vInt)); + R = vaddq_f32(R, g_XMFixAA2B10G10R10); + R = vmulq_f32(R, g_XMNormalizeA2B10G10R10); + return vmaxq_f32(R, vdupq_n_f32(-1.0f)); +#elif defined(_XM_SSE_INTRINSICS_) + // Splat the color in all four entries + __m128 vTemp = _mm_load_ps1(reinterpret_cast(&pSource->v)); + // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000 + vTemp = _mm_and_ps(vTemp, g_XMMaskA2B10G10R10); + // a is unsigned! Flip the bit to convert the order to signed + vTemp = _mm_xor_ps(vTemp, g_XMFlipA2B10G10R10); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // RGB + 0, A + 0x80000000.f to undo the signed order. 
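+    // (Note: each lane still holds its field at the original bit offset within the
+    // dword, so the fix-up and normalize constants below appear to be pre-scaled per
+    // lane by 2^10 / 2^20 / 2^30 to divide that positional factor back out.)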
+ vTemp = _mm_add_ps(vTemp, g_XMFixAA2B10G10R10); + // Convert 0-255 to 0.0f-1.0f + vTemp = _mm_mul_ps(vTemp, g_XMNormalizeA2B10G10R10); + // Clamp result (for case of -512) + return _mm_max_ps(vTemp, g_XMNegativeOne); +#endif +} + +//------------------------------------------------------------------------------ +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4996) +// C4996: ignore deprecation warning +#endif + +#ifdef __GNUC__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#endif + +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadXDec4(const XMXDEC4* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + static const uint32_t SignExtend[] = { 0x00000000, 0xFFFFFC00 }; + + uint32_t ElementX = pSource->v & 0x3FF; + uint32_t ElementY = (pSource->v >> 10) & 0x3FF; + uint32_t ElementZ = (pSource->v >> 20) & 0x3FF; + + XMVECTORF32 vResult = { { { + static_cast(static_cast(ElementX | SignExtend[ElementX >> 9])), + static_cast(static_cast(ElementY | SignExtend[ElementY >> 9])), + static_cast(static_cast(ElementZ | SignExtend[ElementZ >> 9])), + static_cast(pSource->v >> 30) + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORU32 XDec4Xor = { { { 0x200, 0x200 << 10, 0x200 << 20, 0x80000000 } } }; + static const XMVECTORF32 XDec4Add = { { { -512.0f, -512.0f * 1024.0f, -512.0f * 1024.0f * 1024.0f, 32768 * 65536.0f } } }; + uint32x4_t vInt = vld1q_dup_u32(reinterpret_cast(pSource)); + vInt = vandq_u32(vInt, g_XMMaskDec4); + vInt = veorq_u32(vInt, XDec4Xor); + float32x4_t R = vcvtq_f32_s32(vreinterpretq_s32_u32(vInt)); + R = vaddq_f32(R, XDec4Add); + return vmulq_f32(R, g_XMMulDec4); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORU32 XDec4Xor = { { { 0x200, 0x200 << 10, 0x200 << 20, 0x80000000 } } }; + static const XMVECTORF32 XDec4Add = { { { -512.0f, -512.0f * 1024.0f, -512.0f * 1024.0f * 1024.0f, 32768 * 65536.0f } } }; + // Splat the color in all four entries + XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast(&pSource->v)); + // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000 + vTemp = _mm_and_ps(vTemp, g_XMMaskDec4); + // a is unsigned! Flip the bit to convert the order to signed + vTemp = _mm_xor_ps(vTemp, XDec4Xor); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // RGB + 0, A + 0x80000000.f to undo the signed order. 
+ vTemp = _mm_add_ps(vTemp, XDec4Add); + // Convert 0-255 to 0.0f-1.0f + vTemp = _mm_mul_ps(vTemp, g_XMMulDec4); + return vTemp; +#endif +} + +#ifdef __GNUC__ +#pragma GCC diagnostic pop +#endif +#ifdef _MSC_VER +#pragma warning(pop) +#endif + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadUDecN4(const XMUDECN4* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + + uint32_t ElementX = pSource->v & 0x3FF; + uint32_t ElementY = (pSource->v >> 10) & 0x3FF; + uint32_t ElementZ = (pSource->v >> 20) & 0x3FF; + + XMVECTORF32 vResult = { { { + static_cast(ElementX) / 1023.0f, + static_cast(ElementY) / 1023.0f, + static_cast(ElementZ) / 1023.0f, + static_cast(pSource->v >> 30) / 3.0f + } } }; + return vResult.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 UDecN4Mul = { { { 1.0f / 1023.0f, 1.0f / (1023.0f * 1024.0f), 1.0f / (1023.0f * 1024.0f * 1024.0f), 1.0f / (3.0f * 1024.0f * 1024.0f * 1024.0f) } } }; + uint32x4_t vInt = vld1q_dup_u32(reinterpret_cast(pSource)); + vInt = vandq_u32(vInt, g_XMMaskDec4); + float32x4_t R = vcvtq_f32_u32(vInt); + return vmulq_f32(R, UDecN4Mul); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 UDecN4Mul = { { { 1.0f / 1023.0f, 1.0f / (1023.0f * 1024.0f), 1.0f / (1023.0f * 1024.0f * 1024.0f), 1.0f / (3.0f * 1024.0f * 1024.0f * 1024.0f) } } }; + // Splat the color in all four entries + XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast(&pSource->v)); + // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000 + vTemp = _mm_and_ps(vTemp, g_XMMaskDec4); + // a is unsigned! Flip the bit to convert the order to signed + vTemp = _mm_xor_ps(vTemp, g_XMFlipW); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // RGB + 0, A + 0x80000000.f to undo the signed order. 
+ vTemp = _mm_add_ps(vTemp, g_XMAddUDec4); + // Convert 0-255 to 0.0f-1.0f + vTemp = _mm_mul_ps(vTemp, UDecN4Mul); + return vTemp; +#endif +} + + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadUDecN4_XR(const XMUDECN4* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + + int32_t ElementX = pSource->v & 0x3FF; + int32_t ElementY = (pSource->v >> 10) & 0x3FF; + int32_t ElementZ = (pSource->v >> 20) & 0x3FF; + + XMVECTORF32 vResult = { { { + static_cast(ElementX - 0x180) / 510.0f, + static_cast(ElementY - 0x180) / 510.0f, + static_cast(ElementZ - 0x180) / 510.0f, + static_cast(pSource->v >> 30) / 3.0f + } } }; + + return vResult.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 XRMul = { { { 1.0f / 510.0f, 1.0f / (510.0f * 1024.0f), 1.0f / (510.0f * 1024.0f * 1024.0f), 1.0f / (3.0f * 1024.0f * 1024.0f * 1024.0f) } } }; + static const XMVECTORI32 XRBias = { { { 0x180, 0x180 * 1024, 0x180 * 1024 * 1024, 0 } } }; + uint32x4_t vInt = vld1q_dup_u32(reinterpret_cast(pSource)); + vInt = vandq_u32(vInt, g_XMMaskDec4); + int32x4_t vTemp = vsubq_s32(vreinterpretq_s32_u32(vInt), XRBias); + vTemp = veorq_s32(vTemp, g_XMFlipW); + float32x4_t R = vcvtq_f32_s32(vTemp); + R = vaddq_f32(R, g_XMAddUDec4); + return vmulq_f32(R, XRMul); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 XRMul = { { { 1.0f / 510.0f, 1.0f / (510.0f * 1024.0f), 1.0f / (510.0f * 1024.0f * 1024.0f), 1.0f / (3.0f * 1024.0f * 1024.0f * 1024.0f) } } }; + static const XMVECTORI32 XRBias = { { { 0x180, 0x180 * 1024, 0x180 * 1024 * 1024, 0 } } }; + // Splat the color in all four entries + XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast(&pSource->v)); + // Mask channels + vTemp = _mm_and_ps(vTemp, g_XMMaskDec4); + // Subtract bias + vTemp = _mm_castsi128_ps(_mm_sub_epi32(_mm_castps_si128(vTemp), XRBias)); + // a is unsigned! Flip the bit to convert the order to signed + vTemp = _mm_xor_ps(vTemp, g_XMFlipW); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // RGB + 0, A + 0x80000000.f to undo the signed order. + vTemp = _mm_add_ps(vTemp, g_XMAddUDec4); + // Convert to 0.0f-1.0f + return _mm_mul_ps(vTemp, XRMul); +#endif +} + + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadUDec4(const XMUDEC4* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + uint32_t ElementX = pSource->v & 0x3FF; + uint32_t ElementY = (pSource->v >> 10) & 0x3FF; + uint32_t ElementZ = (pSource->v >> 20) & 0x3FF; + + XMVECTORF32 vResult = { { { + static_cast(ElementX), + static_cast(ElementY), + static_cast(ElementZ), + static_cast(pSource->v >> 30) + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vInt = vld1q_dup_u32(reinterpret_cast(pSource)); + vInt = vandq_u32(vInt, g_XMMaskDec4); + float32x4_t R = vcvtq_f32_u32(vInt); + return vmulq_f32(R, g_XMMulDec4); +#elif defined(_XM_SSE_INTRINSICS_) + // Splat the color in all four entries + XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast(&pSource->v)); + // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000 + vTemp = _mm_and_ps(vTemp, g_XMMaskDec4); + // a is unsigned! 
Flip the bit to convert the order to signed + vTemp = _mm_xor_ps(vTemp, g_XMFlipW); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // RGB + 0, A + 0x80000000.f to undo the signed order. + vTemp = _mm_add_ps(vTemp, g_XMAddUDec4); + // Convert 0-255 to 0.0f-1.0f + vTemp = _mm_mul_ps(vTemp, g_XMMulDec4); + return vTemp; +#endif +} + +//------------------------------------------------------------------------------ +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4996) +// C4996: ignore deprecation warning +#endif + +#ifdef __GNUC__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#endif + +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadDecN4(const XMDECN4* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + static const uint32_t SignExtend[] = { 0x00000000, 0xFFFFFC00 }; + static const uint32_t SignExtendW[] = { 0x00000000, 0xFFFFFFFC }; + + uint32_t ElementX = pSource->v & 0x3FF; + uint32_t ElementY = (pSource->v >> 10) & 0x3FF; + uint32_t ElementZ = (pSource->v >> 20) & 0x3FF; + uint32_t ElementW = pSource->v >> 30; + + XMVECTORF32 vResult = { { { + (ElementX == 0x200) ? -1.f : (static_cast(static_cast(ElementX | SignExtend[ElementX >> 9])) / 511.0f), + (ElementY == 0x200) ? -1.f : (static_cast(static_cast(ElementY | SignExtend[ElementY >> 9])) / 511.0f), + (ElementZ == 0x200) ? -1.f : (static_cast(static_cast(ElementZ | SignExtend[ElementZ >> 9])) / 511.0f), + (ElementW == 0x2) ? -1.f : static_cast(static_cast(ElementW | SignExtendW[(ElementW >> 1) & 1])) + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 DecN4Mul = { { { 1.0f / 511.0f, 1.0f / (511.0f * 1024.0f), 1.0f / (511.0f * 1024.0f * 1024.0f), 1.0f / (1024.0f * 1024.0f * 1024.0f) } } }; + uint32x4_t vInt = vld1q_dup_u32(reinterpret_cast(pSource)); + vInt = vandq_u32(vInt, g_XMMaskDec4); + vInt = veorq_u32(vInt, g_XMXorDec4); + float32x4_t R = vcvtq_f32_s32(vreinterpretq_s32_u32(vInt)); + R = vaddq_f32(R, g_XMAddDec4); + R = vmulq_f32(R, DecN4Mul); + return vmaxq_f32(R, vdupq_n_f32(-1.0f)); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 DecN4Mul = { { { 1.0f / 511.0f, 1.0f / (511.0f * 1024.0f), 1.0f / (511.0f * 1024.0f * 1024.0f), 1.0f / (1024.0f * 1024.0f * 1024.0f) } } }; + // Splat the color in all four entries + XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast(&pSource->v)); + // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000 + vTemp = _mm_and_ps(vTemp, g_XMMaskDec4); + // a is unsigned! Flip the bit to convert the order to signed + vTemp = _mm_xor_ps(vTemp, g_XMXorDec4); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // RGB + 0, A + 0x80000000.f to undo the signed order. 
+ vTemp = _mm_add_ps(vTemp, g_XMAddDec4); + // Convert 0-255 to 0.0f-1.0f + vTemp = _mm_mul_ps(vTemp, DecN4Mul); + // Clamp result (for case of -512/-1) + return _mm_max_ps(vTemp, g_XMNegativeOne); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadDec4(const XMDEC4* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + static const uint32_t SignExtend[] = { 0x00000000, 0xFFFFFC00 }; + static const uint32_t SignExtendW[] = { 0x00000000, 0xFFFFFFFC }; + + uint32_t ElementX = pSource->v & 0x3FF; + uint32_t ElementY = (pSource->v >> 10) & 0x3FF; + uint32_t ElementZ = (pSource->v >> 20) & 0x3FF; + uint32_t ElementW = pSource->v >> 30; + + XMVECTORF32 vResult = { { { + static_cast(static_cast(ElementX | SignExtend[ElementX >> 9])), + static_cast(static_cast(ElementY | SignExtend[ElementY >> 9])), + static_cast(static_cast(ElementZ | SignExtend[ElementZ >> 9])), + static_cast(static_cast(ElementW | SignExtendW[ElementW >> 1])) + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vInt = vld1q_dup_u32(reinterpret_cast(pSource)); + vInt = vandq_u32(vInt, g_XMMaskDec4); + vInt = veorq_u32(vInt, g_XMXorDec4); + float32x4_t R = vcvtq_f32_s32(vreinterpretq_s32_u32(vInt)); + R = vaddq_f32(R, g_XMAddDec4); + return vmulq_f32(R, g_XMMulDec4); +#elif defined(_XM_SSE_INTRINSICS_) + // Splat the color in all four entries + XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast(&pSource->v)); + // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000 + vTemp = _mm_and_ps(vTemp, g_XMMaskDec4); + // a is unsigned! Flip the bit to convert the order to signed + vTemp = _mm_xor_ps(vTemp, g_XMXorDec4); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // RGB + 0, A + 0x80000000.f to undo the signed order. + vTemp = _mm_add_ps(vTemp, g_XMAddDec4); + // Convert 0-255 to 0.0f-1.0f + vTemp = _mm_mul_ps(vTemp, g_XMMulDec4); + return vTemp; +#endif +} + +#ifdef __GNUC__ +#pragma GCC diagnostic pop +#endif +#ifdef _MSC_VER +#pragma warning(pop) +#endif + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadUByteN4(const XMUBYTEN4* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { { { + static_cast(pSource->x) / 255.0f, + static_cast(pSource->y) / 255.0f, + static_cast(pSource->z) / 255.0f, + static_cast(pSource->w) / 255.0f + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vInt8 = vld1_dup_u32(reinterpret_cast(pSource)); + uint16x8_t vInt16 = vmovl_u8(vreinterpret_u8_u32(vInt8)); + uint32x4_t vInt = vmovl_u16(vget_low_u16(vInt16)); + float32x4_t R = vcvtq_f32_u32(vInt); + return vmulq_n_f32(R, 1.0f / 255.0f); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 LoadUByteN4Mul = { { { 1.0f / 255.0f, 1.0f / (255.0f * 256.0f), 1.0f / (255.0f * 65536.0f), 1.0f / (255.0f * 65536.0f * 256.0f) } } }; + // Splat the color in all four entries (x,z,y,w) + XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast(&pSource->x)); + // Mask x&0ff,y&0xff00,z&0xff0000,w&0xff000000 + vTemp = _mm_and_ps(vTemp, g_XMMaskByte4); + // w is signed! 
Flip the bits to convert the order to unsigned + vTemp = _mm_xor_ps(vTemp, g_XMFlipW); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // w + 0x80 to complete the conversion + vTemp = _mm_add_ps(vTemp, g_XMAddUDec4); + // Fix y, z and w because they are too large + vTemp = _mm_mul_ps(vTemp, LoadUByteN4Mul); + return vTemp; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadUByte4(const XMUBYTE4* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { { { + static_cast(pSource->x), + static_cast(pSource->y), + static_cast(pSource->z), + static_cast(pSource->w) + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vInt8 = vld1_dup_u32(reinterpret_cast(pSource)); + uint16x8_t vInt16 = vmovl_u8(vreinterpret_u8_u32(vInt8)); + uint32x4_t vInt = vmovl_u16(vget_low_u16(vInt16)); + return vcvtq_f32_u32(vInt); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 LoadUByte4Mul = { { { 1.0f, 1.0f / 256.0f, 1.0f / 65536.0f, 1.0f / (65536.0f * 256.0f) } } }; + // Splat the color in all four entries (x,z,y,w) + XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast(&pSource->x)); + // Mask x&0ff,y&0xff00,z&0xff0000,w&0xff000000 + vTemp = _mm_and_ps(vTemp, g_XMMaskByte4); + // w is signed! Flip the bits to convert the order to unsigned + vTemp = _mm_xor_ps(vTemp, g_XMFlipW); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // w + 0x80 to complete the conversion + vTemp = _mm_add_ps(vTemp, g_XMAddUDec4); + // Fix y, z and w because they are too large + vTemp = _mm_mul_ps(vTemp, LoadUByte4Mul); + return vTemp; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadByteN4(const XMBYTEN4* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { { { + (pSource->x == -128) ? -1.f : (static_cast(pSource->x) / 127.0f), + (pSource->y == -128) ? -1.f : (static_cast(pSource->y) / 127.0f), + (pSource->z == -128) ? -1.f : (static_cast(pSource->z) / 127.0f), + (pSource->w == -128) ? -1.f : (static_cast(pSource->w) / 127.0f) + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vInt8 = vld1_dup_u32(reinterpret_cast(pSource)); + int16x8_t vInt16 = vmovl_s8(vreinterpret_s8_u32(vInt8)); + int32x4_t vInt = vmovl_s16(vget_low_s16(vInt16)); + float32x4_t R = vcvtq_f32_s32(vInt); + R = vmulq_n_f32(R, 1.0f / 127.0f); + return vmaxq_f32(R, vdupq_n_f32(-1.f)); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 LoadByteN4Mul = { { { 1.0f / 127.0f, 1.0f / (127.0f * 256.0f), 1.0f / (127.0f * 65536.0f), 1.0f / (127.0f * 65536.0f * 256.0f) } } }; + // Splat the color in all four entries (x,z,y,w) + XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast(&pSource->x)); + // Mask x&0ff,y&0xff00,z&0xff0000,w&0xff000000 + vTemp = _mm_and_ps(vTemp, g_XMMaskByte4); + // x,y and z are unsigned! 
Flip the bits to convert the order to signed + vTemp = _mm_xor_ps(vTemp, g_XMXorByte4); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // x, y and z - 0x80 to complete the conversion + vTemp = _mm_add_ps(vTemp, g_XMAddByte4); + // Fix y, z and w because they are too large + vTemp = _mm_mul_ps(vTemp, LoadByteN4Mul); + // Clamp result (for case of -128) + return _mm_max_ps(vTemp, g_XMNegativeOne); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadByte4(const XMBYTE4* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { { { + static_cast(pSource->x), + static_cast(pSource->y), + static_cast(pSource->z), + static_cast(pSource->w) + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vInt8 = vld1_dup_u32(reinterpret_cast(pSource)); + int16x8_t vInt16 = vmovl_s8(vreinterpret_s8_u32(vInt8)); + int32x4_t vInt = vmovl_s16(vget_low_s16(vInt16)); + return vcvtq_f32_s32(vInt); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 LoadByte4Mul = { { { 1.0f, 1.0f / 256.0f, 1.0f / 65536.0f, 1.0f / (65536.0f * 256.0f) } } }; + // Splat the color in all four entries (x,z,y,w) + XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast(&pSource->x)); + // Mask x&0ff,y&0xff00,z&0xff0000,w&0xff000000 + vTemp = _mm_and_ps(vTemp, g_XMMaskByte4); + // x,y and z are unsigned! Flip the bits to convert the order to signed + vTemp = _mm_xor_ps(vTemp, g_XMXorByte4); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // x, y and z - 0x80 to complete the conversion + vTemp = _mm_add_ps(vTemp, g_XMAddByte4); + // Fix y, z and w because they are too large + vTemp = _mm_mul_ps(vTemp, LoadByte4Mul); + return vTemp; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadUNibble4(const XMUNIBBLE4* pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { { { + float(pSource->v & 0xF), + float((pSource->v >> 4) & 0xF), + float((pSource->v >> 8) & 0xF), + float((pSource->v >> 12) & 0xF) + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORI32 UNibble4And = { { { 0xF, 0xF0, 0xF00, 0xF000 } } }; + static const XMVECTORF32 UNibble4Mul = { { { 1.0f, 1.0f / 16.f, 1.0f / 256.f, 1.0f / 4096.f } } }; + uint16x4_t vInt16 = vld1_dup_u16(reinterpret_cast(pSource)); + uint32x4_t vInt = vmovl_u16(vInt16); + vInt = vandq_u32(vInt, UNibble4And); + float32x4_t R = vcvtq_f32_u32(vInt); + return vmulq_f32(R, UNibble4Mul); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORI32 UNibble4And = { { { 0xF, 0xF0, 0xF00, 0xF000 } } }; + static const XMVECTORF32 UNibble4Mul = { { { 1.0f, 1.0f / 16.f, 1.0f / 256.f, 1.0f / 4096.f } } }; + // Get the 16 bit value and splat it + __m128i vInt = XM_LOADU_SI16(&pSource->v); + XMVECTOR vResult = XM_PERMUTE_PS(_mm_castsi128_ps(vInt), _MM_SHUFFLE(0,0,0,0)); + // Mask off x, y and z + vResult = _mm_and_ps(vResult, UNibble4And); + // Convert to float + vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult)); + // Normalize x, y, and z + vResult = _mm_mul_ps(vResult, UNibble4Mul); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadU555(const XMU555* 
pSource) noexcept +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { { { + float(pSource->v & 0x1F), + float((pSource->v >> 5) & 0x1F), + float((pSource->v >> 10) & 0x1F), + float((pSource->v >> 15) & 0x1) + } } }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORI32 U555And = { { { 0x1F, 0x1F << 5, 0x1F << 10, 0x8000 } } }; + static const XMVECTORF32 U555Mul = { { { 1.0f, 1.0f / 32.f, 1.0f / 1024.f, 1.0f / 32768.f } } }; + uint16x4_t vInt16 = vld1_dup_u16(reinterpret_cast(pSource)); + uint32x4_t vInt = vmovl_u16(vInt16); + vInt = vandq_u32(vInt, U555And); + float32x4_t R = vcvtq_f32_u32(vInt); + return vmulq_f32(R, U555Mul); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORI32 U555And = { { { 0x1F, 0x1F << 5, 0x1F << 10, 0x8000 } } }; + static const XMVECTORF32 U555Mul = { { { 1.0f, 1.0f / 32.f, 1.0f / 1024.f, 1.0f / 32768.f } } }; + // Get the 16bit value and splat it + __m128i vInt = XM_LOADU_SI16(&pSource->v); + XMVECTOR vResult = XM_PERMUTE_PS(_mm_castsi128_ps(vInt), _MM_SHUFFLE(0, 0, 0, 0)); + // Mask off x, y and z + vResult = _mm_and_ps(vResult, U555And); + // Convert to float + vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult)); + // Normalize x, y, and z + vResult = _mm_mul_ps(vResult, U555Mul); + return vResult; +#endif +} + +#ifdef _PREFAST_ +#pragma prefast(pop) +#endif + +/**************************************************************************** + * + * Vector and matrix store operations + * + ****************************************************************************/ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreColor +( + XMCOLOR* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N = XMVectorSaturate(V); + N = XMVectorMultiply(N, g_UByteMax); + N = XMVectorRound(N); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N); + + pDestination->c = (static_cast(tmp.w) << 24) | + (static_cast(tmp.x) << 16) | + (static_cast(tmp.y) << 8) | + static_cast(tmp.z); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t R = vmaxq_f32(V, vdupq_n_f32(0)); + R = vminq_f32(R, vdupq_n_f32(1.0f)); + R = vmulq_n_f32(R, 255.0f); + R = XMVectorRound(R); + uint32x4_t vInt32 = vcvtq_u32_f32(R); + uint16x4_t vInt16 = vqmovn_u32(vInt32); + uint8x8_t vInt8 = vqmovn_u16(vcombine_u16(vInt16, vInt16)); + uint32_t rgba = vget_lane_u32(vreinterpret_u32_u8(vInt8), 0); + pDestination->c = (rgba & 0xFF00FF00) | ((rgba >> 16) & 0xFF) | ((rgba << 16) & 0xFF0000); +#elif defined(_XM_SSE_INTRINSICS_) + // Set <0 to 0 + XMVECTOR vResult = _mm_max_ps(V, g_XMZero); + // Set>1 to 1 + vResult = _mm_min_ps(vResult, g_XMOne); + // Convert to 0-255 + vResult = _mm_mul_ps(vResult, g_UByteMax); + // Shuffle RGBA to ARGB + vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(3, 0, 1, 2)); + // Convert to int + __m128i vInt = _mm_cvtps_epi32(vResult); + // Mash to shorts + vInt = _mm_packs_epi32(vInt, vInt); + // Mash to bytes + vInt = _mm_packus_epi16(vInt, vInt); + // Store the color + _mm_store_ss(reinterpret_cast(&pDestination->c), _mm_castsi128_ps(vInt)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreHalf2 +( + XMHALF2* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + __m128i V1 = _mm_cvtps_ph(V, _MM_FROUND_TO_NEAREST_INT); + _mm_store_ss(reinterpret_cast(pDestination), _mm_castsi128_ps(V1)); 
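+    // (_mm_cvtps_ph packs four half-precision values into the low 64 bits; the
+    // 32-bit scalar store above keeps only the x and y halves needed for XMHALF2.)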
+#else + pDestination->x = XMConvertFloatToHalf(XMVectorGetX(V)); + pDestination->y = XMConvertFloatToHalf(XMVectorGetY(V)); +#endif // !_XM_F16C_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreShortN2 +( + XMSHORTN2* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v); + N = XMVectorMultiply(N, g_ShortMax); + N = XMVectorRound(N); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N); + + pDestination->x = static_cast(tmp.x); + pDestination->y = static_cast(tmp.y); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t R = vmaxq_f32(V, vdupq_n_f32(-1.f)); + R = vminq_f32(R, vdupq_n_f32(1.0f)); + R = vmulq_n_f32(R, 32767.0f); + int32x4_t vInt32 = vcvtq_s32_f32(R); + int16x4_t vInt16 = vqmovn_s32(vInt32); + vst1_lane_u32(&pDestination->v, vreinterpret_u32_s16(vInt16), 0); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = _mm_max_ps(V, g_XMNegativeOne); + vResult = _mm_min_ps(vResult, g_XMOne); + vResult = _mm_mul_ps(vResult, g_ShortMax); + __m128i vResulti = _mm_cvtps_epi32(vResult); + vResulti = _mm_packs_epi32(vResulti, vResulti); + _mm_store_ss(reinterpret_cast(&pDestination->x), _mm_castsi128_ps(vResulti)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreShort2 +( + XMSHORT2* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N = XMVectorClamp(V, g_ShortMin, g_ShortMax); + N = XMVectorRound(N); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N); + + pDestination->x = static_cast(tmp.x); + pDestination->y = static_cast(tmp.y); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t R = vmaxq_f32(V, vdupq_n_f32(-32767.f)); + R = vminq_f32(R, vdupq_n_f32(32767.0f)); + int32x4_t vInt32 = vcvtq_s32_f32(R); + int16x4_t vInt16 = vqmovn_s32(vInt32); + vst1_lane_u32(&pDestination->v, vreinterpret_u32_s16(vInt16), 0); +#elif defined(_XM_SSE_INTRINSICS_) + // Bounds check + XMVECTOR vResult = _mm_max_ps(V, g_ShortMin); + vResult = _mm_min_ps(vResult, g_ShortMax); + // Convert to int with rounding + __m128i vInt = _mm_cvtps_epi32(vResult); + // Pack the ints into shorts + vInt = _mm_packs_epi32(vInt, vInt); + _mm_store_ss(reinterpret_cast(&pDestination->x), _mm_castsi128_ps(vInt)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreUShortN2 +( + XMUSHORTN2* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N = XMVectorSaturate(V); + N = XMVectorMultiplyAdd(N, g_UShortMax, g_XMOneHalf.v); + N = XMVectorTruncate(N); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N); + + pDestination->x = static_cast(tmp.x); + pDestination->y = static_cast(tmp.y); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t R = vmaxq_f32(V, vdupq_n_f32(0.f)); + R = vminq_f32(R, vdupq_n_f32(1.0f)); + R = vmulq_n_f32(R, 65535.0f); + R = vaddq_f32(R, g_XMOneHalf); + uint32x4_t vInt32 = vcvtq_u32_f32(R); + uint16x4_t vInt16 = vqmovn_u32(vInt32); + vst1_lane_u32(&pDestination->v, vreinterpret_u32_u16(vInt16), 0); +#elif defined(_XM_SSE_INTRINSICS_) + // Bounds check + XMVECTOR vResult = _mm_max_ps(V, g_XMZero); + vResult = _mm_min_ps(vResult, g_XMOne); + vResult = _mm_mul_ps(vResult, g_UShortMax); + 
vResult = _mm_add_ps(vResult, g_XMOneHalf); + // Convert to int + __m128i vInt = _mm_cvttps_epi32(vResult); + // Since the SSE pack instruction clamps using signed rules, + // manually extract the values to store them to memory + pDestination->x = static_cast(_mm_extract_epi16(vInt, 0)); + pDestination->y = static_cast(_mm_extract_epi16(vInt, 2)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreUShort2 +( + XMUSHORT2* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N = XMVectorClamp(V, XMVectorZero(), g_UShortMax); + N = XMVectorRound(N); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N); + + pDestination->x = static_cast(tmp.x); + pDestination->y = static_cast(tmp.y); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t R = vmaxq_f32(V, vdupq_n_f32(0.f)); + R = vminq_f32(R, vdupq_n_f32(65535.0f)); + uint32x4_t vInt32 = vcvtq_u32_f32(R); + uint16x4_t vInt16 = vqmovn_u32(vInt32); + vst1_lane_u32(&pDestination->v, vreinterpret_u32_u16(vInt16), 0); +#elif defined(_XM_SSE_INTRINSICS_) + // Bounds check + XMVECTOR vResult = _mm_max_ps(V, g_XMZero); + vResult = _mm_min_ps(vResult, g_UShortMax); + // Convert to int with rounding + __m128i vInt = _mm_cvtps_epi32(vResult); + // Since the SSE pack instruction clamps using signed rules, + // manually extract the values to store them to memory + pDestination->x = static_cast(_mm_extract_epi16(vInt, 0)); + pDestination->y = static_cast(_mm_extract_epi16(vInt, 2)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreByteN2 +( + XMBYTEN2* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v); + N = XMVectorMultiply(N, g_ByteMax); + N = XMVectorRound(N); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N); + + pDestination->x = static_cast(tmp.x); + pDestination->y = static_cast(tmp.y); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t R = vmaxq_f32(V, vdupq_n_f32(-1.f)); + R = vminq_f32(R, vdupq_n_f32(1.0f)); + R = vmulq_n_f32(R, 127.0f); + int32x4_t vInt32 = vcvtq_s32_f32(R); + int16x4_t vInt16 = vqmovn_s32(vInt32); + int8x8_t vInt8 = vqmovn_s16(vcombine_s16(vInt16, vInt16)); + vst1_lane_u16(reinterpret_cast(pDestination), vreinterpret_u16_s8(vInt8), 0); +#elif defined(_XM_SSE_INTRINSICS_) + // Clamp to bounds + XMVECTOR vResult = _mm_max_ps(V, g_XMNegativeOne); + vResult = _mm_min_ps(vResult, g_XMOne); + // Scale by multiplication + vResult = _mm_mul_ps(vResult, g_ByteMax); + // Convert to int by rounding + __m128i vInt = _mm_cvtps_epi32(vResult); + // No SSE operations will write to 16-bit values, so we have to extract them manually + auto x = static_cast(_mm_extract_epi16(vInt, 0)); + auto y = static_cast(_mm_extract_epi16(vInt, 2)); + pDestination->v = static_cast(((static_cast(y) & 0xFF) << 8) | (static_cast(x) & 0xFF)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreByte2 +( + XMBYTE2* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N = XMVectorClamp(V, g_ByteMin, g_ByteMax); + N = XMVectorRound(N); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N); + + pDestination->x = static_cast(tmp.x); + pDestination->y = 
static_cast(tmp.y); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t R = vmaxq_f32(V, vdupq_n_f32(-127.f)); + R = vminq_f32(R, vdupq_n_f32(127.0f)); + int32x4_t vInt32 = vcvtq_s32_f32(R); + int16x4_t vInt16 = vqmovn_s32(vInt32); + int8x8_t vInt8 = vqmovn_s16(vcombine_s16(vInt16, vInt16)); + vst1_lane_u16(reinterpret_cast(pDestination), vreinterpret_u16_s8(vInt8), 0); +#elif defined(_XM_SSE_INTRINSICS_) + // Clamp to bounds + XMVECTOR vResult = _mm_max_ps(V, g_ByteMin); + vResult = _mm_min_ps(vResult, g_ByteMax); + // Convert to int by rounding + __m128i vInt = _mm_cvtps_epi32(vResult); + // No SSE operations will write to 16-bit values, so we have to extract them manually + auto x = static_cast(_mm_extract_epi16(vInt, 0)); + auto y = static_cast(_mm_extract_epi16(vInt, 2)); + pDestination->v = static_cast(((static_cast(y) & 0xFF) << 8) | (static_cast(x) & 0xFF)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreUByteN2 +( + XMUBYTEN2* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N = XMVectorSaturate(V); + N = XMVectorMultiplyAdd(N, g_UByteMax, g_XMOneHalf.v); + N = XMVectorTruncate(N); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N); + + pDestination->x = static_cast(tmp.x); + pDestination->y = static_cast(tmp.y); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t R = vmaxq_f32(V, vdupq_n_f32(0.f)); + R = vminq_f32(R, vdupq_n_f32(1.0f)); + R = vmulq_n_f32(R, 255.0f); + R = vaddq_f32(R, g_XMOneHalf); + uint32x4_t vInt32 = vcvtq_u32_f32(R); + uint16x4_t vInt16 = vqmovn_u32(vInt32); + uint8x8_t vInt8 = vqmovn_u16(vcombine_u16(vInt16, vInt16)); + vst1_lane_u16(reinterpret_cast(pDestination), vreinterpret_u16_u8(vInt8), 0); +#elif defined(_XM_SSE_INTRINSICS_) + // Clamp to bounds + XMVECTOR vResult = _mm_max_ps(V, g_XMZero); + vResult = _mm_min_ps(vResult, g_XMOne); + // Scale by multiplication + vResult = _mm_mul_ps(vResult, g_UByteMax); + vResult = _mm_add_ps(vResult, g_XMOneHalf); + // Convert to int + __m128i vInt = _mm_cvttps_epi32(vResult); + // No SSE operations will write to 16-bit values, so we have to extract them manually + auto x = static_cast(_mm_extract_epi16(vInt, 0)); + auto y = static_cast(_mm_extract_epi16(vInt, 2)); + pDestination->v = static_cast(((static_cast(y) & 0xFF) << 8) | (static_cast(x) & 0xFF)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreUByte2 +( + XMUBYTE2* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N = XMVectorClamp(V, XMVectorZero(), g_UByteMax); + N = XMVectorRound(N); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N); + + pDestination->x = static_cast(tmp.x); + pDestination->y = static_cast(tmp.y); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t R = vmaxq_f32(V, vdupq_n_f32(0.f)); + R = vminq_f32(R, vdupq_n_f32(255.0f)); + uint32x4_t vInt32 = vcvtq_u32_f32(R); + uint16x4_t vInt16 = vqmovn_u32(vInt32); + uint8x8_t vInt8 = vqmovn_u16(vcombine_u16(vInt16, vInt16)); + vst1_lane_u16(reinterpret_cast(pDestination), vreinterpret_u16_u8(vInt8), 0); +#elif defined(_XM_SSE_INTRINSICS_) + // Clamp to bounds + XMVECTOR vResult = _mm_max_ps(V, g_XMZero); + vResult = _mm_min_ps(vResult, g_UByteMax); + // Convert to int by rounding + __m128i vInt = _mm_cvtps_epi32(vResult); + // No SSE operations will write to 16-bit 
values, so we have to extract them manually + auto x = static_cast(_mm_extract_epi16(vInt, 0)); + auto y = static_cast(_mm_extract_epi16(vInt, 2)); + pDestination->v = static_cast(((static_cast(y) & 0xFF) << 8) | (static_cast(x) & 0xFF)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreU565 +( + XMU565* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); + static const XMVECTORF32 Max = { { { 31.0f, 63.0f, 31.0f, 0.0f } } }; + +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR N = XMVectorClamp(V, XMVectorZero(), Max.v); + N = XMVectorRound(N); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N); + + pDestination->v = static_cast( + ((static_cast(tmp.z) & 0x1F) << 11) + | ((static_cast(tmp.y) & 0x3F) << 5) + | ((static_cast(tmp.x) & 0x1F))); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 Scale = { { { 1.0f, 32.f, 32.f * 64.f, 0.f } } }; + static const XMVECTORU32 Mask = { { { 0x1F, 0x3F << 5, 0x1F << 11, 0 } } }; + float32x4_t vResult = vmaxq_f32(V, vdupq_n_f32(0)); + vResult = vminq_f32(vResult, Max); + vResult = vmulq_f32(vResult, Scale); + uint32x4_t vResulti = vcvtq_u32_f32(vResult); + vResulti = vandq_u32(vResulti, Mask); + // Do a horizontal or of 4 entries + uint32x2_t vTemp = vget_low_u32(vResulti); + uint32x2_t vhi = vget_high_u32(vResulti); + vTemp = vorr_u32(vTemp, vhi); + vTemp = vpadd_u32(vTemp, vTemp); + vst1_lane_u16(&pDestination->v, vreinterpret_u16_u32(vTemp), 0); +#elif defined(_XM_SSE_INTRINSICS_) + // Bounds check + XMVECTOR vResult = _mm_max_ps(V, g_XMZero); + vResult = _mm_min_ps(vResult, Max); + // Convert to int with rounding + __m128i vInt = _mm_cvtps_epi32(vResult); + // No SSE operations will write to 16-bit values, so we have to extract them manually + auto x = static_cast(_mm_extract_epi16(vInt, 0)); + auto y = static_cast(_mm_extract_epi16(vInt, 2)); + auto z = static_cast(_mm_extract_epi16(vInt, 4)); + pDestination->v = static_cast( + ((static_cast(z) & 0x1F) << 11) + | ((static_cast(y) & 0x3F) << 5) + | ((static_cast(x) & 0x1F))); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat3PK +( + XMFLOAT3PK* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); + + XM_ALIGNED_DATA(16) uint32_t IValue[4]; + XMStoreFloat3A(reinterpret_cast(&IValue), V); + + uint32_t Result[3]; + + // X & Y Channels (5-bit exponent, 6-bit mantissa) + for (uint32_t j = 0; j < 2; ++j) + { + uint32_t Sign = IValue[j] & 0x80000000; + uint32_t I = IValue[j] & 0x7FFFFFFF; + + if ((I & 0x7F800000) == 0x7F800000) + { + // INF or NAN + Result[j] = 0x7C0U; + if ((I & 0x7FFFFF) != 0) + { + Result[j] = 0x7FFU; + } + else if (Sign) + { + // -INF is clamped to 0 since 3PK is positive only + Result[j] = 0; + } + } + else if (Sign || I < 0x35800000) + { + // 3PK is positive only, so clamp to zero + Result[j] = 0; + } + else if (I > 0x477E0000U) + { + // The number is too large to be represented as a float11, set to max + Result[j] = 0x7BFU; + } + else + { + if (I < 0x38800000U) + { + // The number is too small to be represented as a normalized float11 + // Convert it to a denormalized value. 
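+                // (float11 has a 5-bit exponent biased by 15, so its smallest normalized
+                // value is 2^-14, i.e. a float32 biased exponent of 113; smaller inputs
+                // keep the implicit leading 1 and are shifted right into the denormal range.)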
+ uint32_t Shift = 113U - (I >> 23U); + I = (0x800000U | (I & 0x7FFFFFU)) >> Shift; + } + else + { + // Rebias the exponent to represent the value as a normalized float11 + I += 0xC8000000U; + } + + Result[j] = ((I + 0xFFFFU + ((I >> 17U) & 1U)) >> 17U) & 0x7ffU; + } + } + + // Z Channel (5-bit exponent, 5-bit mantissa) + uint32_t Sign = IValue[2] & 0x80000000; + uint32_t I = IValue[2] & 0x7FFFFFFF; + + if ((I & 0x7F800000) == 0x7F800000) + { + // INF or NAN + Result[2] = 0x3E0U; + if (I & 0x7FFFFF) + { + Result[2] = 0x3FFU; + } + else if (Sign || I < 0x36000000) + { + // -INF is clamped to 0 since 3PK is positive only + Result[2] = 0; + } + } + else if (Sign) + { + // 3PK is positive only, so clamp to zero + Result[2] = 0; + } + else if (I > 0x477C0000U) + { + // The number is too large to be represented as a float10, set to max + Result[2] = 0x3DFU; + } + else + { + if (I < 0x38800000U) + { + // The number is too small to be represented as a normalized float10 + // Convert it to a denormalized value. + uint32_t Shift = 113U - (I >> 23U); + I = (0x800000U | (I & 0x7FFFFFU)) >> Shift; + } + else + { + // Rebias the exponent to represent the value as a normalized float10 + I += 0xC8000000U; + } + + Result[2] = ((I + 0x1FFFFU + ((I >> 18U) & 1U)) >> 18U) & 0x3ffU; + } + + // Pack Result into memory + pDestination->v = (Result[0] & 0x7ff) + | ((Result[1] & 0x7ff) << 11) + | ((Result[2] & 0x3ff) << 22); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat3SE +( + XMFLOAT3SE* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); + + XMFLOAT3A tmp; + XMStoreFloat3A(&tmp, V); + + static constexpr float maxf9 = float(0x1FF << 7); + static constexpr float minf9 = float(1.f / (1 << 16)); + + float x = (tmp.x >= 0.f) ? ((tmp.x > maxf9) ? maxf9 : tmp.x) : 0.f; + float y = (tmp.y >= 0.f) ? ((tmp.y > maxf9) ? maxf9 : tmp.y) : 0.f; + float z = (tmp.z >= 0.f) ? ((tmp.z > maxf9) ? maxf9 : tmp.z) : 0.f; + + const float max_xy = (x > y) ? x : y; + const float max_xyz = (max_xy > z) ? max_xy : z; + + const float maxColor = (max_xyz > minf9) ? 
max_xyz : minf9; + + union { float f; int32_t i; } fi; + fi.f = maxColor; + fi.i += 0x00004000; // round up leaving 9 bits in fraction (including assumed 1) + + auto exp = static_cast(fi.i) >> 23; + pDestination->e = exp - 0x6f; + + fi.i = static_cast(0x83000000 - (exp << 23)); + float ScaleR = fi.f; + + pDestination->xm = static_cast(Internal::round_to_nearest(x * ScaleR)); + pDestination->ym = static_cast(Internal::round_to_nearest(y * ScaleR)); + pDestination->zm = static_cast(Internal::round_to_nearest(z * ScaleR)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreHalf4 +( + XMHALF4* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + __m128i V1 = _mm_cvtps_ph(V, _MM_FROUND_TO_NEAREST_INT); + _mm_storel_epi64(reinterpret_cast<__m128i*>(pDestination), V1); +#else + XMFLOAT4A t; + XMStoreFloat4A(&t, V); + + pDestination->x = XMConvertFloatToHalf(t.x); + pDestination->y = XMConvertFloatToHalf(t.y); + pDestination->z = XMConvertFloatToHalf(t.z); + pDestination->w = XMConvertFloatToHalf(t.w); +#endif // !_XM_F16C_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreShortN4 +( + XMSHORTN4* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v); + N = XMVectorMultiply(N, g_ShortMax); + N = XMVectorRound(N); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N); + + pDestination->x = static_cast(tmp.x); + pDestination->y = static_cast(tmp.y); + pDestination->z = static_cast(tmp.z); + pDestination->w = static_cast(tmp.w); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t vResult = vmaxq_f32(V, vdupq_n_f32(-1.f)); + vResult = vminq_f32(vResult, vdupq_n_f32(1.0f)); + vResult = vmulq_n_f32(vResult, 32767.0f); + int16x4_t vInt = vmovn_s32(vcvtq_s32_f32(vResult)); + vst1_s16(reinterpret_cast(pDestination), vInt); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = _mm_max_ps(V, g_XMNegativeOne); + vResult = _mm_min_ps(vResult, g_XMOne); + vResult = _mm_mul_ps(vResult, g_ShortMax); + __m128i vResulti = _mm_cvtps_epi32(vResult); + vResulti = _mm_packs_epi32(vResulti, vResulti); + _mm_store_sd(reinterpret_cast(&pDestination->x), _mm_castsi128_pd(vResulti)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreShort4 +( + XMSHORT4* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N = XMVectorClamp(V, g_ShortMin, g_ShortMax); + N = XMVectorRound(N); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N); + + pDestination->x = static_cast(tmp.x); + pDestination->y = static_cast(tmp.y); + pDestination->z = static_cast(tmp.z); + pDestination->w = static_cast(tmp.w); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t vResult = vmaxq_f32(V, g_ShortMin); + vResult = vminq_f32(vResult, g_ShortMax); + int16x4_t vInt = vmovn_s32(vcvtq_s32_f32(vResult)); + vst1_s16(reinterpret_cast(pDestination), vInt); +#elif defined(_XM_SSE_INTRINSICS_) + // Bounds check + XMVECTOR vResult = _mm_max_ps(V, g_ShortMin); + vResult = _mm_min_ps(vResult, g_ShortMax); + // Convert to int with rounding + __m128i vInt = _mm_cvtps_epi32(vResult); + // Pack the ints into shorts + vInt = 
_mm_packs_epi32(vInt, vInt); + _mm_store_sd(reinterpret_cast(&pDestination->x), _mm_castsi128_pd(vInt)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreUShortN4 +( + XMUSHORTN4* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N = XMVectorSaturate(V); + N = XMVectorMultiplyAdd(N, g_UShortMax, g_XMOneHalf.v); + N = XMVectorTruncate(N); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N); + + pDestination->x = static_cast(tmp.x); + pDestination->y = static_cast(tmp.y); + pDestination->z = static_cast(tmp.z); + pDestination->w = static_cast(tmp.w); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t vResult = vmaxq_f32(V, vdupq_n_f32(0)); + vResult = vminq_f32(vResult, vdupq_n_f32(1.0f)); + vResult = vmulq_n_f32(vResult, 65535.0f); + vResult = vaddq_f32(vResult, g_XMOneHalf); + uint16x4_t vInt = vmovn_u32(vcvtq_u32_f32(vResult)); + vst1_u16(reinterpret_cast(pDestination), vInt); +#elif defined(_XM_SSE_INTRINSICS_) + // Bounds check + XMVECTOR vResult = _mm_max_ps(V, g_XMZero); + vResult = _mm_min_ps(vResult, g_XMOne); + vResult = _mm_mul_ps(vResult, g_UShortMax); + vResult = _mm_add_ps(vResult, g_XMOneHalf); + // Convert to int + __m128i vInt = _mm_cvttps_epi32(vResult); + // Since the SSE pack instruction clamps using signed rules, + // manually extract the values to store them to memory + pDestination->x = static_cast(_mm_extract_epi16(vInt, 0)); + pDestination->y = static_cast(_mm_extract_epi16(vInt, 2)); + pDestination->z = static_cast(_mm_extract_epi16(vInt, 4)); + pDestination->w = static_cast(_mm_extract_epi16(vInt, 6)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreUShort4 +( + XMUSHORT4* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N = XMVectorClamp(V, XMVectorZero(), g_UShortMax); + N = XMVectorRound(N); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N); + + pDestination->x = static_cast(tmp.x); + pDestination->y = static_cast(tmp.y); + pDestination->z = static_cast(tmp.z); + pDestination->w = static_cast(tmp.w); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t vResult = vmaxq_f32(V, vdupq_n_f32(0)); + vResult = vminq_f32(vResult, g_UShortMax); + uint16x4_t vInt = vmovn_u32(vcvtq_u32_f32(vResult)); + vst1_u16(reinterpret_cast(pDestination), vInt); +#elif defined(_XM_SSE_INTRINSICS_) + // Bounds check + XMVECTOR vResult = _mm_max_ps(V, g_XMZero); + vResult = _mm_min_ps(vResult, g_UShortMax); + // Convert to int with rounding + __m128i vInt = _mm_cvtps_epi32(vResult); + // Since the SSE pack instruction clamps using signed rules, + // manually extract the values to store them to memory + pDestination->x = static_cast(_mm_extract_epi16(vInt, 0)); + pDestination->y = static_cast(_mm_extract_epi16(vInt, 2)); + pDestination->z = static_cast(_mm_extract_epi16(vInt, 4)); + pDestination->w = static_cast(_mm_extract_epi16(vInt, 6)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreXDecN4 +( + XMXDECN4* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); + static const XMVECTORF32 Min = { { { -1.0f, -1.0f, -1.0f, 0.0f } } }; + +#if defined(_XM_NO_INTRINSICS_) + + static const XMVECTORF32 Scale = { { { 511.0f, 511.0f, 511.0f, 3.0f } } }; + + 
XMVECTOR N = XMVectorClamp(V, Min.v, g_XMOne.v); + N = XMVectorMultiply(N, Scale.v); + N = XMVectorRound(N); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N); + + pDestination->v = static_cast( + (static_cast(tmp.w) << 30) + | ((static_cast(tmp.z) & 0x3FF) << 20) + | ((static_cast(tmp.y) & 0x3FF) << 10) + | (static_cast(tmp.x) & 0x3FF)); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 Scale = { { { 511.0f, 511.0f * 1024.0f, 511.0f * 1048576.0f, 3.0f * 536870912.0f } } }; + static const XMVECTORI32 ScaleMask = { { { 0x3FF, 0x3FF << 10, 0x3FF << 20, 0x3 << 29 } } }; + float32x4_t vResult = vmaxq_f32(V, Min); + vResult = vminq_f32(vResult, vdupq_n_f32(1.0f)); + vResult = vmulq_f32(vResult, Scale); + int32x4_t vResulti = vcvtq_s32_f32(vResult); + vResulti = vandq_s32(vResulti, ScaleMask); + int32x4_t vResultw = vandq_s32(vResulti, g_XMMaskW); + vResulti = vaddq_s32(vResulti, vResultw); + // Do a horizontal or of all 4 entries + uint32x2_t vTemp = vget_low_u32(vreinterpretq_u32_s32(vResulti)); + uint32x2_t vhi = vget_high_u32(vreinterpretq_u32_s32(vResulti)); + vTemp = vorr_u32(vTemp, vhi); + vTemp = vpadd_u32(vTemp, vTemp); + vst1_lane_u32(&pDestination->v, vTemp, 0); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Scale = { { { 511.0f, 511.0f * 1024.0f, 511.0f * 1048576.0f, 3.0f * 536870912.0f } } }; + static const XMVECTORI32 ScaleMask = { { { 0x3FF, 0x3FF << 10, 0x3FF << 20, 0x3 << 29 } } }; + XMVECTOR vResult = _mm_max_ps(V, Min); + vResult = _mm_min_ps(vResult, g_XMOne); + // Scale by multiplication + vResult = _mm_mul_ps(vResult, Scale); + // Convert to int (W is unsigned) + __m128i vResulti = _mm_cvtps_epi32(vResult); + // Mask off any fraction + vResulti = _mm_and_si128(vResulti, ScaleMask); + // To fix W, add itself to shift it up to <<30 instead of <<29 + __m128i vResultw = _mm_and_si128(vResulti, g_XMMaskW); + vResulti = _mm_add_epi32(vResulti, vResultw); + // Do a horizontal or of all 4 entries + vResult = XM_PERMUTE_PS(_mm_castsi128_ps(vResulti), _MM_SHUFFLE(0, 3, 2, 1)); + vResulti = _mm_or_si128(vResulti, _mm_castps_si128(vResult)); + vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(0, 3, 2, 1)); + vResulti = _mm_or_si128(vResulti, _mm_castps_si128(vResult)); + vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(0, 3, 2, 1)); + vResulti = _mm_or_si128(vResulti, _mm_castps_si128(vResult)); + _mm_store_ss(reinterpret_cast(&pDestination->v), _mm_castsi128_ps(vResulti)); +#endif +} + +//------------------------------------------------------------------------------ +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4996) +// C4996: ignore deprecation warning +#endif + +#ifdef __GNUC__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#endif + +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreXDec4 +( + XMXDEC4* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); + static const XMVECTORF32 MinXDec4 = { { { -511.0f, -511.0f, -511.0f, 0.0f } } }; + static const XMVECTORF32 MaxXDec4 = { { { 511.0f, 511.0f, 511.0f, 3.0f } } }; + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N = XMVectorClamp(V, MinXDec4, MaxXDec4); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N); + + pDestination->v = static_cast( + (static_cast(tmp.w) << 30) + | ((static_cast(tmp.z) & 0x3FF) << 20) + | ((static_cast(tmp.y) & 0x3FF) << 10) + | ((static_cast(tmp.x) & 0x3FF))); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 ScaleXDec4 = { { { 1.0f, 1024.0f / 2.0f, 1024.0f * 1024.0f, 1024.0f * 1024.0f * 
1024.0f / 2.0f } } }; + static const XMVECTORI32 MaskXDec4 = { { { 0x3FF, 0x3FF << (10 - 1), 0x3FF << 20, 0x3 << (30 - 1) } } }; + float32x4_t vResult = vmaxq_f32(V, MinXDec4); + vResult = vminq_f32(vResult, MaxXDec4); + vResult = vmulq_f32(vResult, ScaleXDec4); + int32x4_t vResulti = vcvtq_s32_f32(vResult); + vResulti = vandq_s32(vResulti, MaskXDec4); + // Do a horizontal or of 4 entries + uint32x2_t vTemp = vget_low_u32(vreinterpretq_u32_s32(vResulti)); + uint32x2_t vTemp2 = vget_high_u32(vreinterpretq_u32_s32(vResulti)); + vTemp = vorr_u32(vTemp, vTemp2); + // Perform a single bit left shift on y|w + vTemp2 = vdup_lane_u32(vTemp, 1); + vTemp2 = vadd_u32(vTemp2, vTemp2); + vTemp = vorr_u32(vTemp, vTemp2); + vst1_lane_u32(&pDestination->v, vTemp, 0); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 ScaleXDec4 = { { { 1.0f, 1024.0f / 2.0f, 1024.0f * 1024.0f, 1024.0f * 1024.0f * 1024.0f / 2.0f } } }; + static const XMVECTORI32 MaskXDec4 = { { { 0x3FF, 0x3FF << (10 - 1), 0x3FF << 20, 0x3 << (30 - 1) } } }; + // Clamp to bounds + XMVECTOR vResult = _mm_max_ps(V, MinXDec4); + vResult = _mm_min_ps(vResult, MaxXDec4); + // Scale by multiplication + vResult = _mm_mul_ps(vResult, ScaleXDec4); + // Convert to int + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Mask off any fraction + vResulti = _mm_and_si128(vResulti, MaskXDec4); + // Do a horizontal or of 4 entries + __m128i vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(3, 2, 3, 2)); + // x = x|z, y = y|w + vResulti = _mm_or_si128(vResulti, vResulti2); + // Move Z to the x position + vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(1, 1, 1, 1)); + // Perform a single bit left shift on y|w + vResulti2 = _mm_add_epi32(vResulti2, vResulti2); + // i = x|y|z|w + vResulti = _mm_or_si128(vResulti, vResulti2); + _mm_store_ss(reinterpret_cast(&pDestination->v), _mm_castsi128_ps(vResulti)); +#endif +} + +#ifdef __GNUC__ +#pragma GCC diagnostic pop +#endif +#ifdef _MSC_VER +#pragma warning(pop) +#endif + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreUDecN4 +( + XMUDECN4* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + static const XMVECTORF32 Scale = { { { 1023.0f, 1023.0f, 1023.0f, 3.0f } } }; + + XMVECTOR N = XMVectorSaturate(V); + N = XMVectorMultiply(N, Scale.v); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N); + + pDestination->v = static_cast( + (static_cast(tmp.w) << 30) + | ((static_cast(tmp.z) & 0x3FF) << 20) + | ((static_cast(tmp.y) & 0x3FF) << 10) + | ((static_cast(tmp.x) & 0x3FF))); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 ScaleUDecN4 = { { { 1023.0f, 1023.0f * 1024.0f * 0.5f, 1023.0f * 1024.0f * 1024.0f, 3.0f * 1024.0f * 1024.0f * 1024.0f * 0.5f } } }; + static const XMVECTORI32 MaskUDecN4 = { { { 0x3FF, 0x3FF << (10 - 1), 0x3FF << 20, 0x3 << (30 - 1) } } }; + float32x4_t vResult = vmaxq_f32(V, vdupq_n_f32(0.f)); + vResult = vminq_f32(vResult, vdupq_n_f32(1.f)); + vResult = vmulq_f32(vResult, ScaleUDecN4); + uint32x4_t vResulti = vcvtq_u32_f32(vResult); + vResulti = vandq_u32(vResulti, MaskUDecN4); + // Do a horizontal or of 4 entries + uint32x2_t vTemp = vget_low_u32(vResulti); + uint32x2_t vTemp2 = vget_high_u32(vResulti); + vTemp = vorr_u32(vTemp, vTemp2); + // Perform a single bit left shift on y|w + vTemp2 = vdup_lane_u32(vTemp, 1); + vTemp2 = vadd_u32(vTemp2, vTemp2); + vTemp = vorr_u32(vTemp, vTemp2); + vst1_lane_u32(&pDestination->v, 
vTemp, 0); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 ScaleUDecN4 = { { { 1023.0f, 1023.0f * 1024.0f * 0.5f, 1023.0f * 1024.0f * 1024.0f, 3.0f * 1024.0f * 1024.0f * 1024.0f * 0.5f } } }; + static const XMVECTORI32 MaskUDecN4 = { { { 0x3FF, 0x3FF << (10 - 1), 0x3FF << 20, 0x3 << (30 - 1) } } }; + // Clamp to bounds + XMVECTOR vResult = _mm_max_ps(V, g_XMZero); + vResult = _mm_min_ps(vResult, g_XMOne); + // Scale by multiplication + vResult = _mm_mul_ps(vResult, ScaleUDecN4); + // Convert to int + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Mask off any fraction + vResulti = _mm_and_si128(vResulti, MaskUDecN4); + // Do a horizontal or of 4 entries + __m128i vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(3, 2, 3, 2)); + // x = x|z, y = y|w + vResulti = _mm_or_si128(vResulti, vResulti2); + // Move Z to the x position + vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(1, 1, 1, 1)); + // Perform a left shift by one bit on y|w + vResulti2 = _mm_add_epi32(vResulti2, vResulti2); + // i = x|y|z|w + vResulti = _mm_or_si128(vResulti, vResulti2); + _mm_store_ss(reinterpret_cast(&pDestination->v), _mm_castsi128_ps(vResulti)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreUDecN4_XR +( + XMUDECN4* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); + static const XMVECTORF32 Scale = { { { 510.0f, 510.0f, 510.0f, 3.0f } } }; + static const XMVECTORF32 Bias = { { { 384.0f, 384.0f, 384.0f, 0.0f } } }; + static const XMVECTORF32 C = { { { 1023.f, 1023.f, 1023.f, 3.f } } }; + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N = XMVectorMultiplyAdd(V, Scale, Bias); + N = XMVectorClamp(N, g_XMZero, C); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N); + + pDestination->v = static_cast( + (static_cast(tmp.w) << 30) + | ((static_cast(tmp.z) & 0x3FF) << 20) + | ((static_cast(tmp.y) & 0x3FF) << 10) + | ((static_cast(tmp.x) & 0x3FF))); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 Shift = { { { 1.0f, 1024.0f * 0.5f, 1024.0f * 1024.0f, 1024.0f * 1024.0f * 1024.0f * 0.5f } } }; + static const XMVECTORU32 MaskUDecN4 = { { { 0x3FF, 0x3FF << (10 - 1), 0x3FF << 20, 0x3 << (30 - 1) } } }; + float32x4_t vResult = vmlaq_f32(Bias, V, Scale); + vResult = vmaxq_f32(vResult, vdupq_n_f32(0.f)); + vResult = vminq_f32(vResult, C); + vResult = vmulq_f32(vResult, Shift); + uint32x4_t vResulti = vcvtq_u32_f32(vResult); + vResulti = vandq_u32(vResulti, MaskUDecN4); + // Do a horizontal or of 4 entries + uint32x2_t vTemp = vget_low_u32(vResulti); + uint32x2_t vTemp2 = vget_high_u32(vResulti); + vTemp = vorr_u32(vTemp, vTemp2); + // Perform a single bit left shift on y|w + vTemp2 = vdup_lane_u32(vTemp, 1); + vTemp2 = vadd_u32(vTemp2, vTemp2); + vTemp = vorr_u32(vTemp, vTemp2); + vst1_lane_u32(&pDestination->v, vTemp, 0); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Shift = { { { 1.0f, 1024.0f * 0.5f, 1024.0f * 1024.0f, 1024.0f * 1024.0f * 1024.0f * 0.5f } } }; + static const XMVECTORU32 MaskUDecN4 = { { { 0x3FF, 0x3FF << (10 - 1), 0x3FF << 20, 0x3 << (30 - 1) } } }; + // Scale & bias + XMVECTOR vResult = XM_FMADD_PS(V, Scale, Bias); + // Clamp to bounds + vResult = _mm_max_ps(vResult, g_XMZero); + vResult = _mm_min_ps(vResult, C); + // Scale by shift values + vResult = _mm_mul_ps(vResult, Shift); + // Convert to int + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Mask off any fraction + vResulti = _mm_and_si128(vResulti, MaskUDecN4); + // Do a 
horizontal or of 4 entries + __m128i vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(3, 2, 3, 2)); + // x = x|z, y = y|w + vResulti = _mm_or_si128(vResulti, vResulti2); + // Move Z to the x position + vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(1, 1, 1, 1)); + // Perform a left shift by one bit on y|w + vResulti2 = _mm_add_epi32(vResulti2, vResulti2); + // i = x|y|z|w + vResulti = _mm_or_si128(vResulti, vResulti2); + _mm_store_ss(reinterpret_cast(&pDestination->v), _mm_castsi128_ps(vResulti)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreUDec4 +( + XMUDEC4* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); + static const XMVECTORF32 MaxUDec4 = { { { 1023.0f, 1023.0f, 1023.0f, 3.0f } } }; + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N = XMVectorClamp(V, XMVectorZero(), MaxUDec4); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N); + + pDestination->v = static_cast( + (static_cast(tmp.w) << 30) + | ((static_cast(tmp.z) & 0x3FF) << 20) + | ((static_cast(tmp.y) & 0x3FF) << 10) + | ((static_cast(tmp.x) & 0x3FF))); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 ScaleUDec4 = { { { 1.0f, 1024.0f / 2.0f, 1024.0f * 1024.0f, 1024.0f * 1024.0f * 1024.0f / 2.0f } } }; + static const XMVECTORI32 MaskUDec4 = { { { 0x3FF, 0x3FF << (10 - 1), 0x3FF << 20, 0x3 << (30 - 1) } } }; + float32x4_t vResult = vmaxq_f32(V, vdupq_n_f32(0.f)); + vResult = vminq_f32(vResult, MaxUDec4); + vResult = vmulq_f32(vResult, ScaleUDec4); + uint32x4_t vResulti = vcvtq_u32_f32(vResult); + vResulti = vandq_u32(vResulti, MaskUDec4); + // Do a horizontal or of 4 entries + uint32x2_t vTemp = vget_low_u32(vResulti); + uint32x2_t vTemp2 = vget_high_u32(vResulti); + vTemp = vorr_u32(vTemp, vTemp2); + // Perform a single bit left shift on y|w + vTemp2 = vdup_lane_u32(vTemp, 1); + vTemp2 = vadd_u32(vTemp2, vTemp2); + vTemp = vorr_u32(vTemp, vTemp2); + vst1_lane_u32(&pDestination->v, vTemp, 0); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 ScaleUDec4 = { { { 1.0f, 1024.0f / 2.0f, 1024.0f * 1024.0f, 1024.0f * 1024.0f * 1024.0f / 2.0f } } }; + static const XMVECTORI32 MaskUDec4 = { { { 0x3FF, 0x3FF << (10 - 1), 0x3FF << 20, 0x3 << (30 - 1) } } }; + // Clamp to bounds + XMVECTOR vResult = _mm_max_ps(V, g_XMZero); + vResult = _mm_min_ps(vResult, MaxUDec4); + // Scale by multiplication + vResult = _mm_mul_ps(vResult, ScaleUDec4); + // Convert to int + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Mask off any fraction + vResulti = _mm_and_si128(vResulti, MaskUDec4); + // Do a horizontal or of 4 entries + __m128i vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(3, 2, 3, 2)); + // x = x|z, y = y|w + vResulti = _mm_or_si128(vResulti, vResulti2); + // Move Z to the x position + vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(1, 1, 1, 1)); + // Perform a left shift by one bit on y|w + vResulti2 = _mm_add_epi32(vResulti2, vResulti2); + // i = x|y|z|w + vResulti = _mm_or_si128(vResulti, vResulti2); + _mm_store_ss(reinterpret_cast(&pDestination->v), _mm_castsi128_ps(vResulti)); +#endif +} + +//------------------------------------------------------------------------------ +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4996) +// C4996: ignore deprecation warning +#endif + +#ifdef __GNUC__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#endif + +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreDecN4 +( + 
XMDECN4* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + static const XMVECTORF32 Scale = { { { 511.0f, 511.0f, 511.0f, 1.0f } } }; + + XMVECTOR N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v); + N = XMVectorMultiply(N, Scale.v); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N); + + pDestination->v = static_cast( + (static_cast(tmp.w) << 30) + | ((static_cast(tmp.z) & 0x3FF) << 20) + | ((static_cast(tmp.y) & 0x3FF) << 10) + | ((static_cast(tmp.x) & 0x3FF))); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 ScaleDecN4 = { { { 511.0f, 511.0f * 1024.0f, 511.0f * 1024.0f * 1024.0f, 1.0f * 1024.0f * 1024.0f * 1024.0f } } }; + float32x4_t vResult = vmaxq_f32(V, vdupq_n_f32(-1.f)); + vResult = vminq_f32(vResult, vdupq_n_f32(1.f)); + vResult = vmulq_f32(vResult, ScaleDecN4); + int32x4_t vResulti = vcvtq_s32_f32(vResult); + vResulti = vandq_s32(vResulti, g_XMMaskDec4); + // Do a horizontal or of 4 entries + uint32x2_t vTemp = vget_low_u32(vreinterpretq_u32_s32(vResulti)); + uint32x2_t vhi = vget_high_u32(vreinterpretq_u32_s32(vResulti)); + vTemp = vorr_u32(vTemp, vhi); + vTemp = vpadd_u32(vTemp, vTemp); + vst1_lane_u32(&pDestination->v, vTemp, 0); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 ScaleDecN4 = { { { 511.0f, 511.0f * 1024.0f, 511.0f * 1024.0f * 1024.0f, 1.0f * 1024.0f * 1024.0f * 1024.0f } } }; + // Clamp to bounds + XMVECTOR vResult = _mm_max_ps(V, g_XMNegativeOne); + vResult = _mm_min_ps(vResult, g_XMOne); + // Scale by multiplication + vResult = _mm_mul_ps(vResult, ScaleDecN4); + // Convert to int + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Mask off any fraction + vResulti = _mm_and_si128(vResulti, g_XMMaskDec4); + // Do a horizontal or of 4 entries + __m128i vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(3, 2, 3, 2)); + // x = x|z, y = y|w + vResulti = _mm_or_si128(vResulti, vResulti2); + // Move Z to the x position + vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(1, 1, 1, 1)); + // i = x|y|z|w + vResulti = _mm_or_si128(vResulti, vResulti2); + _mm_store_ss(reinterpret_cast(&pDestination->v), _mm_castsi128_ps(vResulti)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreDec4 +( + XMDEC4* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); + static const XMVECTORF32 MinDec4 = { { { -511.0f, -511.0f, -511.0f, -1.0f } } }; + static const XMVECTORF32 MaxDec4 = { { { 511.0f, 511.0f, 511.0f, 1.0f } } }; + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N = XMVectorClamp(V, MinDec4, MaxDec4); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N); + + pDestination->v = static_cast( + (static_cast(tmp.w) << 30) + | ((static_cast(tmp.z) & 0x3FF) << 20) + | ((static_cast(tmp.y) & 0x3FF) << 10) + | ((static_cast(tmp.x) & 0x3FF))); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 ScaleDec4 = { { { 1.0f, 1024.0f, 1024.0f * 1024.0f, 1024.0f * 1024.0f * 1024.0f } } }; + float32x4_t vResult = vmaxq_f32(V, MinDec4); + vResult = vminq_f32(vResult, MaxDec4); + vResult = vmulq_f32(vResult, ScaleDec4); + int32x4_t vResulti = vcvtq_s32_f32(vResult); + vResulti = vandq_s32(vResulti, g_XMMaskDec4); + // Do a horizontal or of all 4 entries + uint32x2_t vTemp = vget_low_u32(vreinterpretq_u32_s32(vResulti)); + uint32x2_t vhi = vget_high_u32(vreinterpretq_u32_s32(vResulti)); + vTemp = vorr_u32(vTemp, vhi); + vTemp = vpadd_u32(vTemp, vTemp); + vst1_lane_u32(&pDestination->v, 
vTemp, 0); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 ScaleDec4 = { { { 1.0f, 1024.0f, 1024.0f * 1024.0f, 1024.0f * 1024.0f * 1024.0f } } }; + // Clamp to bounds + XMVECTOR vResult = _mm_max_ps(V, MinDec4); + vResult = _mm_min_ps(vResult, MaxDec4); + // Scale by multiplication + vResult = _mm_mul_ps(vResult, ScaleDec4); + // Convert to int + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Mask off any fraction + vResulti = _mm_and_si128(vResulti, g_XMMaskDec4); + // Do a horizontal or of 4 entries + __m128i vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(3, 2, 3, 2)); + // x = x|z, y = y|w + vResulti = _mm_or_si128(vResulti, vResulti2); + // Move Z to the x position + vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(1, 1, 1, 1)); + // i = x|y|z|w + vResulti = _mm_or_si128(vResulti, vResulti2); + _mm_store_ss(reinterpret_cast(&pDestination->v), _mm_castsi128_ps(vResulti)); +#endif +} + +#ifdef __GNUC__ +#pragma GCC diagnostic pop +#endif +#ifdef _MSC_VER +#pragma warning(pop) +#endif + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreUByteN4 +( + XMUBYTEN4* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N = XMVectorSaturate(V); + N = XMVectorMultiply(N, g_UByteMax); + N = XMVectorTruncate(N); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N); + + pDestination->x = static_cast(tmp.x); + pDestination->y = static_cast(tmp.y); + pDestination->z = static_cast(tmp.z); + pDestination->w = static_cast(tmp.w); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t R = vmaxq_f32(V, vdupq_n_f32(0)); + R = vminq_f32(R, vdupq_n_f32(1.0f)); + R = vmulq_n_f32(R, 255.0f); + uint32x4_t vInt32 = vcvtq_u32_f32(R); + uint16x4_t vInt16 = vqmovn_u32(vInt32); + uint8x8_t vInt8 = vqmovn_u16(vcombine_u16(vInt16, vInt16)); + vst1_lane_u32(&pDestination->v, vreinterpret_u32_u8(vInt8), 0); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 ScaleUByteN4 = { { { 255.0f, 255.0f * 256.0f * 0.5f, 255.0f * 256.0f * 256.0f, 255.0f * 256.0f * 256.0f * 256.0f * 0.5f } } }; + static const XMVECTORI32 MaskUByteN4 = { { { 0xFF, 0xFF << (8 - 1), 0xFF << 16, 0xFF << (24 - 1) } } }; + // Clamp to bounds + XMVECTOR vResult = _mm_max_ps(V, g_XMZero); + vResult = _mm_min_ps(vResult, g_XMOne); + // Scale by multiplication + vResult = _mm_mul_ps(vResult, ScaleUByteN4); + // Convert to int + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Mask off any fraction + vResulti = _mm_and_si128(vResulti, MaskUByteN4); + // Do a horizontal or of 4 entries + __m128i vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(3, 2, 3, 2)); + // x = x|z, y = y|w + vResulti = _mm_or_si128(vResulti, vResulti2); + // Move Z to the x position + vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(1, 1, 1, 1)); + // Perform a single bit left shift to fix y|w + vResulti2 = _mm_add_epi32(vResulti2, vResulti2); + // i = x|y|z|w + vResulti = _mm_or_si128(vResulti, vResulti2); + _mm_store_ss(reinterpret_cast(&pDestination->v), _mm_castsi128_ps(vResulti)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreUByte4 +( + XMUBYTE4* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N = XMVectorClamp(V, XMVectorZero(), g_UByteMax); + N = XMVectorRound(N); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N); + + pDestination->x = 
static_cast(tmp.x); + pDestination->y = static_cast(tmp.y); + pDestination->z = static_cast(tmp.z); + pDestination->w = static_cast(tmp.w); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t R = vmaxq_f32(V, vdupq_n_f32(0)); + R = vminq_f32(R, vdupq_n_f32(255.0f)); + uint32x4_t vInt32 = vcvtq_u32_f32(R); + uint16x4_t vInt16 = vqmovn_u32(vInt32); + uint8x8_t vInt8 = vqmovn_u16(vcombine_u16(vInt16, vInt16)); + vst1_lane_u32(&pDestination->v, vreinterpret_u32_u8(vInt8), 0); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 ScaleUByte4 = { { { 1.0f, 256.0f * 0.5f, 256.0f * 256.0f, 256.0f * 256.0f * 256.0f * 0.5f } } }; + static const XMVECTORI32 MaskUByte4 = { { { 0xFF, 0xFF << (8 - 1), 0xFF << 16, 0xFF << (24 - 1) } } }; + // Clamp to bounds + XMVECTOR vResult = _mm_max_ps(V, g_XMZero); + vResult = _mm_min_ps(vResult, g_UByteMax); + // Scale by multiplication + vResult = _mm_mul_ps(vResult, ScaleUByte4); + // Convert to int by rounding + __m128i vResulti = _mm_cvtps_epi32(vResult); + // Mask off any fraction + vResulti = _mm_and_si128(vResulti, MaskUByte4); + // Do a horizontal or of 4 entries + __m128i vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(3, 2, 3, 2)); + // x = x|z, y = y|w + vResulti = _mm_or_si128(vResulti, vResulti2); + // Move Z to the x position + vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(1, 1, 1, 1)); + // Perform a single bit left shift to fix y|w + vResulti2 = _mm_add_epi32(vResulti2, vResulti2); + // i = x|y|z|w + vResulti = _mm_or_si128(vResulti, vResulti2); + _mm_store_ss(reinterpret_cast(&pDestination->v), _mm_castsi128_ps(vResulti)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreByteN4 +( + XMBYTEN4* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v); + N = XMVectorMultiply(N, g_ByteMax); + N = XMVectorTruncate(N); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N); + + pDestination->x = static_cast(tmp.x); + pDestination->y = static_cast(tmp.y); + pDestination->z = static_cast(tmp.z); + pDestination->w = static_cast(tmp.w); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t R = vmaxq_f32(V, vdupq_n_f32(-1.f)); + R = vminq_f32(R, vdupq_n_f32(1.0f)); + R = vmulq_n_f32(R, 127.0f); + int32x4_t vInt32 = vcvtq_s32_f32(R); + int16x4_t vInt16 = vqmovn_s32(vInt32); + int8x8_t vInt8 = vqmovn_s16(vcombine_s16(vInt16, vInt16)); + vst1_lane_u32(&pDestination->v, vreinterpret_u32_s8(vInt8), 0); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 ScaleByteN4 = { { { 127.0f, 127.0f * 256.0f, 127.0f * 256.0f * 256.0f, 127.0f * 256.0f * 256.0f * 256.0f } } }; + static const XMVECTORI32 MaskByteN4 = { { { 0xFF, 0xFF << 8, 0xFF << 16, static_cast(0xFF000000) } } }; + // Clamp to bounds + XMVECTOR vResult = _mm_max_ps(V, g_XMNegativeOne); + vResult = _mm_min_ps(vResult, g_XMOne); + // Scale by multiplication + vResult = _mm_mul_ps(vResult, ScaleByteN4); + // Convert to int + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Mask off any fraction + vResulti = _mm_and_si128(vResulti, MaskByteN4); + // Do a horizontal or of 4 entries + __m128i vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(3, 2, 3, 2)); + // x = x|z, y = y|w + vResulti = _mm_or_si128(vResulti, vResulti2); + // Move Z to the x position + vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(1, 1, 1, 1)); + // i = x|y|z|w + vResulti = _mm_or_si128(vResulti, 
vResulti2); + _mm_store_ss(reinterpret_cast(&pDestination->v), _mm_castsi128_ps(vResulti)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreByte4 +( + XMBYTE4* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N = XMVectorClamp(V, g_ByteMin, g_ByteMax); + N = XMVectorRound(N); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N); + + pDestination->x = static_cast(tmp.x); + pDestination->y = static_cast(tmp.y); + pDestination->z = static_cast(tmp.z); + pDestination->w = static_cast(tmp.w); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t R = vmaxq_f32(V, vdupq_n_f32(-127.f)); + R = vminq_f32(R, vdupq_n_f32(127.f)); + int32x4_t vInt32 = vcvtq_s32_f32(R); + int16x4_t vInt16 = vqmovn_s32(vInt32); + int8x8_t vInt8 = vqmovn_s16(vcombine_s16(vInt16, vInt16)); + vst1_lane_u32(&pDestination->v, vreinterpret_u32_s8(vInt8), 0); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 ScaleByte4 = { { { 1.0f, 256.0f, 256.0f * 256.0f, 256.0f * 256.0f * 256.0f } } }; + static const XMVECTORI32 MaskByte4 = { { { 0xFF, 0xFF << 8, 0xFF << 16, static_cast(0xFF000000) } } }; + // Clamp to bounds + XMVECTOR vResult = _mm_max_ps(V, g_ByteMin); + vResult = _mm_min_ps(vResult, g_ByteMax); + // Scale by multiplication + vResult = _mm_mul_ps(vResult, ScaleByte4); + // Convert to int by rounding + __m128i vResulti = _mm_cvtps_epi32(vResult); + // Mask off any fraction + vResulti = _mm_and_si128(vResulti, MaskByte4); + // Do a horizontal or of 4 entries + __m128i vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(3, 2, 3, 2)); + // x = x|z, y = y|w + vResulti = _mm_or_si128(vResulti, vResulti2); + // Move Z to the x position + vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(1, 1, 1, 1)); + // i = x|y|z|w + vResulti = _mm_or_si128(vResulti, vResulti2); + _mm_store_ss(reinterpret_cast(&pDestination->v), _mm_castsi128_ps(vResulti)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreUNibble4 +( + XMUNIBBLE4* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); + static const XMVECTORF32 Max = { { { 15.0f, 15.0f, 15.0f, 15.0f } } }; +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR N = XMVectorClamp(V, XMVectorZero(), Max.v); + N = XMVectorRound(N); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N); + + pDestination->v = static_cast( + ((static_cast(tmp.w) & 0xF) << 12) + | ((static_cast(tmp.z) & 0xF) << 8) + | ((static_cast(tmp.y) & 0xF) << 4) + | (static_cast(tmp.x) & 0xF)); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 Scale = { { { 1.0f, 16.f, 16.f * 16.f, 16.f * 16.f * 16.f } } }; + static const XMVECTORU32 Mask = { { { 0xF, 0xF << 4, 0xF << 8, 0xF << 12 } } }; + float32x4_t vResult = vmaxq_f32(V, vdupq_n_f32(0)); + vResult = vminq_f32(vResult, Max); + vResult = vmulq_f32(vResult, Scale); + uint32x4_t vResulti = vcvtq_u32_f32(vResult); + vResulti = vandq_u32(vResulti, Mask); + // Do a horizontal or of 4 entries + uint32x2_t vTemp = vget_low_u32(vResulti); + uint32x2_t vhi = vget_high_u32(vResulti); + vTemp = vorr_u32(vTemp, vhi); + vTemp = vpadd_u32(vTemp, vTemp); + vst1_lane_u16(&pDestination->v, vreinterpret_u16_u32(vTemp), 0); +#elif defined(_XM_SSE_INTRINSICS_) + // Bounds check + XMVECTOR vResult = _mm_max_ps(V, g_XMZero); + vResult = _mm_min_ps(vResult, Max); + // Convert to int with rounding + __m128i vInt = 
_mm_cvtps_epi32(vResult); + // No SSE operations will write to 16-bit values, so we have to extract them manually + auto x = static_cast(_mm_extract_epi16(vInt, 0)); + auto y = static_cast(_mm_extract_epi16(vInt, 2)); + auto z = static_cast(_mm_extract_epi16(vInt, 4)); + auto w = static_cast(_mm_extract_epi16(vInt, 6)); + pDestination->v = static_cast( + ((static_cast(w) & 0xF) << 12) + | ((static_cast(z) & 0xF) << 8) + | ((static_cast(y) & 0xF) << 4) + | ((static_cast(x) & 0xF))); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreU555 +( + XMU555* pDestination, + FXMVECTOR V +) noexcept +{ + assert(pDestination); + static const XMVECTORF32 Max = { { { 31.0f, 31.0f, 31.0f, 1.0f } } }; + +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR N = XMVectorClamp(V, XMVectorZero(), Max.v); + N = XMVectorRound(N); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N); + + pDestination->v = static_cast( + ((tmp.w > 0.f) ? 0x8000 : 0) + | ((static_cast(tmp.z) & 0x1F) << 10) + | ((static_cast(tmp.y) & 0x1F) << 5) + | (static_cast(tmp.x) & 0x1F)); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 Scale = { { { 1.0f, 32.f / 2.f, 32.f * 32.f, 32.f * 32.f * 32.f / 2.f } } }; + static const XMVECTORU32 Mask = { { { 0x1F, 0x1F << (5 - 1), 0x1F << 10, 0x1 << (15 - 1) } } }; + float32x4_t vResult = vmaxq_f32(V, vdupq_n_f32(0)); + vResult = vminq_f32(vResult, Max); + vResult = vmulq_f32(vResult, Scale); + uint32x4_t vResulti = vcvtq_u32_f32(vResult); + vResulti = vandq_u32(vResulti, Mask); + // Do a horizontal or of 4 entries + uint32x2_t vTemp = vget_low_u32(vResulti); + uint32x2_t vTemp2 = vget_high_u32(vResulti); + vTemp = vorr_u32(vTemp, vTemp2); + // Perform a single bit left shift on y|w + vTemp2 = vdup_lane_u32(vTemp, 1); + vTemp2 = vadd_u32(vTemp2, vTemp2); + vTemp = vorr_u32(vTemp, vTemp2); + vst1_lane_u16(&pDestination->v, vreinterpret_u16_u32(vTemp), 0); +#elif defined(_XM_SSE_INTRINSICS_) + // Bounds check + XMVECTOR vResult = _mm_max_ps(V, g_XMZero); + vResult = _mm_min_ps(vResult, Max); + // Convert to int with rounding + __m128i vInt = _mm_cvtps_epi32(vResult); + // No SSE operations will write to 16-bit values, so we have to extract them manually + auto x = static_cast(_mm_extract_epi16(vInt, 0)); + auto y = static_cast(_mm_extract_epi16(vInt, 2)); + auto z = static_cast(_mm_extract_epi16(vInt, 4)); + auto w = static_cast(_mm_extract_epi16(vInt, 6)); + pDestination->v = static_cast( + (static_cast(w) ? 
0x8000 : 0) + | ((static_cast(z) & 0x1F) << 10) + | ((static_cast(y) & 0x1F) << 5) + | ((static_cast(x) & 0x1F))); +#endif +} + + +/**************************************************************************** + * + * XMCOLOR operators + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ + +inline XMCOLOR::XMCOLOR +( + float _r, + float _g, + float _b, + float _a +) noexcept +{ + XMStoreColor(this, XMVectorSet(_r, _g, _b, _a)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMCOLOR::XMCOLOR(const float* pArray) noexcept +{ + XMStoreColor(this, XMLoadFloat4(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMHALF2 operators + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ + +inline XMHALF2::XMHALF2 +( + float _x, + float _y +) noexcept +{ + x = XMConvertFloatToHalf(_x); + y = XMConvertFloatToHalf(_y); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMHALF2::XMHALF2(const float* pArray) noexcept +{ + assert(pArray != nullptr); + x = XMConvertFloatToHalf(pArray[0]); + y = XMConvertFloatToHalf(pArray[1]); +} + +/**************************************************************************** + * + * XMSHORTN2 operators + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ + +inline XMSHORTN2::XMSHORTN2 +( + float _x, + float _y +) noexcept +{ + XMStoreShortN2(this, XMVectorSet(_x, _y, 0.0f, 0.0f)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMSHORTN2::XMSHORTN2(const float* pArray) noexcept +{ + XMStoreShortN2(this, XMLoadFloat2(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMSHORT2 operators + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ + +inline XMSHORT2::XMSHORT2 +( + float _x, + float _y +) noexcept +{ + XMStoreShort2(this, XMVectorSet(_x, _y, 0.0f, 0.0f)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMSHORT2::XMSHORT2(const float* pArray) noexcept +{ + XMStoreShort2(this, XMLoadFloat2(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMUSHORTN2 operators + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ + +inline XMUSHORTN2::XMUSHORTN2 +( + float _x, + float _y +) noexcept +{ + XMStoreUShortN2(this, XMVectorSet(_x, _y, 0.0f, 0.0f)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMUSHORTN2::XMUSHORTN2(const float* pArray) noexcept +{ + XMStoreUShortN2(this, XMLoadFloat2(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMUSHORT2 operators + * + 
****************************************************************************/ + + //------------------------------------------------------------------------------ + +inline XMUSHORT2::XMUSHORT2 +( + float _x, + float _y +) noexcept +{ + XMStoreUShort2(this, XMVectorSet(_x, _y, 0.0f, 0.0f)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMUSHORT2::XMUSHORT2(const float* pArray) noexcept +{ + XMStoreUShort2(this, XMLoadFloat2(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMBYTEN2 operators + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ + +inline XMBYTEN2::XMBYTEN2 +( + float _x, + float _y +) noexcept +{ + XMStoreByteN2(this, XMVectorSet(_x, _y, 0.0f, 0.0f)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMBYTEN2::XMBYTEN2(const float* pArray) noexcept +{ + XMStoreByteN2(this, XMLoadFloat2(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMBYTE2 operators + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ + +inline XMBYTE2::XMBYTE2 +( + float _x, + float _y +) noexcept +{ + XMStoreByte2(this, XMVectorSet(_x, _y, 0.0f, 0.0f)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMBYTE2::XMBYTE2(const float* pArray) noexcept +{ + XMStoreByte2(this, XMLoadFloat2(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMUBYTEN2 operators + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ + +inline XMUBYTEN2::XMUBYTEN2 +( + float _x, + float _y +) noexcept +{ + XMStoreUByteN2(this, XMVectorSet(_x, _y, 0.0f, 0.0f)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMUBYTEN2::XMUBYTEN2(const float* pArray) noexcept +{ + XMStoreUByteN2(this, XMLoadFloat2(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMUBYTE2 operators + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ + +inline XMUBYTE2::XMUBYTE2 +( + float _x, + float _y +) noexcept +{ + XMStoreUByte2(this, XMVectorSet(_x, _y, 0.0f, 0.0f)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMUBYTE2::XMUBYTE2(const float* pArray) noexcept +{ + XMStoreUByte2(this, XMLoadFloat2(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMU565 operators + * + ****************************************************************************/ + +inline XMU565::XMU565 +( + float _x, + float _y, + float _z +) noexcept +{ + XMStoreU565(this, XMVectorSet(_x, _y, _z, 0.0f)); +} + +_Use_decl_annotations_ +inline XMU565::XMU565(const float* pArray) noexcept +{ + XMStoreU565(this, 
XMLoadFloat3(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMFLOAT3PK operators + * + ****************************************************************************/ + +inline XMFLOAT3PK::XMFLOAT3PK +( + float _x, + float _y, + float _z +) noexcept +{ + XMStoreFloat3PK(this, XMVectorSet(_x, _y, _z, 0.0f)); +} + +_Use_decl_annotations_ +inline XMFLOAT3PK::XMFLOAT3PK(const float* pArray) noexcept +{ + XMStoreFloat3PK(this, XMLoadFloat3(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMFLOAT3SE operators + * + ****************************************************************************/ + +inline XMFLOAT3SE::XMFLOAT3SE +( + float _x, + float _y, + float _z +) noexcept +{ + XMStoreFloat3SE(this, XMVectorSet(_x, _y, _z, 0.0f)); +} + +_Use_decl_annotations_ +inline XMFLOAT3SE::XMFLOAT3SE(const float* pArray) noexcept +{ + XMStoreFloat3SE(this, XMLoadFloat3(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMHALF4 operators + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ + +inline XMHALF4::XMHALF4 +( + float _x, + float _y, + float _z, + float _w +) noexcept +{ + x = XMConvertFloatToHalf(_x); + y = XMConvertFloatToHalf(_y); + z = XMConvertFloatToHalf(_z); + w = XMConvertFloatToHalf(_w); +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline XMHALF4::XMHALF4(const float* pArray) noexcept +{ + XMConvertFloatToHalfStream(&x, sizeof(HALF), pArray, sizeof(float), 4); +} + +/**************************************************************************** + * + * XMSHORTN4 operators + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ + +inline XMSHORTN4::XMSHORTN4 +( + float _x, + float _y, + float _z, + float _w +) noexcept +{ + XMStoreShortN4(this, XMVectorSet(_x, _y, _z, _w)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMSHORTN4::XMSHORTN4(const float* pArray) noexcept +{ + XMStoreShortN4(this, XMLoadFloat4(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMSHORT4 operators + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ + +inline XMSHORT4::XMSHORT4 +( + float _x, + float _y, + float _z, + float _w +) noexcept +{ + XMStoreShort4(this, XMVectorSet(_x, _y, _z, _w)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMSHORT4::XMSHORT4(const float* pArray) noexcept +{ + XMStoreShort4(this, XMLoadFloat4(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMUSHORTN4 operators + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ + +inline XMUSHORTN4::XMUSHORTN4 +( + float _x, + float _y, + float _z, + float _w +) noexcept +{ + XMStoreUShortN4(this, XMVectorSet(_x, _y, _z, _w)); +} + 
+//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMUSHORTN4::XMUSHORTN4(const float* pArray) noexcept +{ + XMStoreUShortN4(this, XMLoadFloat4(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMUSHORT4 operators + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ + +inline XMUSHORT4::XMUSHORT4 +( + float _x, + float _y, + float _z, + float _w +) noexcept +{ + XMStoreUShort4(this, XMVectorSet(_x, _y, _z, _w)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMUSHORT4::XMUSHORT4(const float* pArray) noexcept +{ + XMStoreUShort4(this, XMLoadFloat4(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMXDECN4 operators + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ + +inline XMXDECN4::XMXDECN4 +( + float _x, + float _y, + float _z, + float _w +) noexcept +{ + XMStoreXDecN4(this, XMVectorSet(_x, _y, _z, _w)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMXDECN4::XMXDECN4(const float* pArray) noexcept +{ + XMStoreXDecN4(this, XMLoadFloat4(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMXDEC4 operators + * + ****************************************************************************/ +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4996) + // C4996: ignore deprecation warning +#endif + +#ifdef __GNUC__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#endif + + //------------------------------------------------------------------------------ + +inline XMXDEC4::XMXDEC4 +( + float _x, + float _y, + float _z, + float _w +) noexcept +{ + XMStoreXDec4(this, XMVectorSet(_x, _y, _z, _w)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMXDEC4::XMXDEC4(const float* pArray) noexcept +{ + XMStoreXDec4(this, XMLoadFloat4(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMDECN4 operators + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ + +inline XMDECN4::XMDECN4 +( + float _x, + float _y, + float _z, + float _w +) noexcept +{ + XMStoreDecN4(this, XMVectorSet(_x, _y, _z, _w)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMDECN4::XMDECN4(const float* pArray) noexcept +{ + XMStoreDecN4(this, XMLoadFloat4(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMDEC4 operators + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ + +inline XMDEC4::XMDEC4 +( + float _x, + float _y, + float _z, + float _w +) noexcept +{ + XMStoreDec4(this, XMVectorSet(_x, _y, _z, _w)); +} + 
+//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMDEC4::XMDEC4(const float* pArray) noexcept +{ + XMStoreDec4(this, XMLoadFloat4(reinterpret_cast(pArray))); +} + +#ifdef __GNUC__ +#pragma GCC diagnostic pop +#endif +#ifdef _MSC_VER +#pragma warning(pop) +#endif + +/**************************************************************************** + * + * XMUDECN4 operators + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ + +inline XMUDECN4::XMUDECN4 +( + float _x, + float _y, + float _z, + float _w +) noexcept +{ + XMStoreUDecN4(this, XMVectorSet(_x, _y, _z, _w)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMUDECN4::XMUDECN4(const float* pArray) noexcept +{ + XMStoreUDecN4(this, XMLoadFloat4(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMUDEC4 operators + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ + +inline XMUDEC4::XMUDEC4 +( + float _x, + float _y, + float _z, + float _w +) noexcept +{ + XMStoreUDec4(this, XMVectorSet(_x, _y, _z, _w)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMUDEC4::XMUDEC4(const float* pArray) noexcept +{ + XMStoreUDec4(this, XMLoadFloat4(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMBYTEN4 operators + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ + +inline XMBYTEN4::XMBYTEN4 +( + float _x, + float _y, + float _z, + float _w +) noexcept +{ + XMStoreByteN4(this, XMVectorSet(_x, _y, _z, _w)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMBYTEN4::XMBYTEN4(const float* pArray) noexcept +{ + XMStoreByteN4(this, XMLoadFloat4(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMBYTE4 operators + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ + +inline XMBYTE4::XMBYTE4 +( + float _x, + float _y, + float _z, + float _w +) noexcept +{ + XMStoreByte4(this, XMVectorSet(_x, _y, _z, _w)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMBYTE4::XMBYTE4(const float* pArray) noexcept +{ + XMStoreByte4(this, XMLoadFloat4(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMUBYTEN4 operators + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ + +inline XMUBYTEN4::XMUBYTEN4 +( + float _x, + float _y, + float _z, + float _w +) noexcept +{ + XMStoreUByteN4(this, XMVectorSet(_x, _y, _z, _w)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMUBYTEN4::XMUBYTEN4(const float* pArray) noexcept +{ 
+ XMStoreUByteN4(this, XMLoadFloat4(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMUBYTE4 operators + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ + +inline XMUBYTE4::XMUBYTE4 +( + float _x, + float _y, + float _z, + float _w +) noexcept +{ + XMStoreUByte4(this, XMVectorSet(_x, _y, _z, _w)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMUBYTE4::XMUBYTE4(const float* pArray) noexcept +{ + XMStoreUByte4(this, XMLoadFloat4(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMUNIBBLE4 operators + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ + +inline XMUNIBBLE4::XMUNIBBLE4 +( + float _x, + float _y, + float _z, + float _w +) noexcept +{ + XMStoreUNibble4(this, XMVectorSet(_x, _y, _z, _w)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMUNIBBLE4::XMUNIBBLE4(const float* pArray) noexcept +{ + XMStoreUNibble4(this, XMLoadFloat4(reinterpret_cast(pArray))); +} + +/**************************************************************************** + * + * XMU555 operators + * + ****************************************************************************/ + + //------------------------------------------------------------------------------ + +inline XMU555::XMU555 +( + float _x, + float _y, + float _z, + bool _w +) noexcept +{ + XMStoreU555(this, XMVectorSet(_x, _y, _z, ((_w) ? 1.0f : 0.0f))); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMU555::XMU555 +( + const float* pArray, + bool _w +) noexcept +{ + XMVECTOR V = XMLoadFloat3(reinterpret_cast(pArray)); + XMStoreU555(this, XMVectorSetW(V, ((_w) ? 1.0f : 0.0f))); +} + diff --git a/src/thirdparty/DirectXMath-dec2022/LICENSE b/src/thirdparty/DirectXMath-dec2022/LICENSE new file mode 100644 index 000000000..74ee33848 --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/LICENSE @@ -0,0 +1,21 @@ + The MIT License (MIT) + +Copyright (c) 2011-2022 Microsoft Corp + +Permission is hereby granted, free of charge, to any person obtaining a copy of this +software and associated documentation files (the "Software"), to deal in the Software +without restriction, including without limitation the rights to use, copy, modify, +merge, publish, distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice shall be included in all copies +or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, +INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF +CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE +OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
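For orientation, the packed-vector store routines and float constructors added above are normally driven through the matching XMStore*/XMLoad* pairs. The following is a minimal, hypothetical sketch (not part of the vendored files or of this patch; the `<DirectXPackedVector.h>` include path and the expected packed value in the comment are my assumptions, derived from the scale-and-mask logic shown above) exercising a few of the types whose constructors appear in this diff:

```cpp
#include <DirectXPackedVector.h>   // assumed include path for the vendored headers
#include <cstdio>

int main()
{
    using namespace DirectX;
    using namespace DirectX::PackedVector;

    // 8:8:8:8 and 10:10:10:2 packed formats, filled via the float constructors,
    // which route through XMStoreColor / XMStoreUByteN4 / XMStoreUDecN4 above.
    XMCOLOR   color(0.25f, 0.5f, 0.75f, 1.0f);   // 32-bit ARGB color
    XMUBYTEN4 bytes(0.0f, 0.5f, 1.0f, 1.0f);     // unsigned-normalized bytes
    XMUDECN4  dec(0.0f, 0.5f, 1.0f, 1.0f);       // should pack to 0xFFF7FC00
                                                 // (w=3, z=1023, y=511, x=0)

    // Loads expand the packed words back to an XMVECTOR for full-width SIMD math.
    XMFLOAT4 expanded;
    XMStoreFloat4(&expanded, XMLoadUDecN4(&dec));

    std::printf("color=0x%08X bytes=0x%08X dec=0x%08X -> %f %f %f %f\n",
                color.c, bytes.v, dec.v,
                expanded.x, expanded.y, expanded.z, expanded.w);
    return 0;
}
```

As the vendored README notes further below, these types mirror compressed GPU vertex and texture formats, so the usual pattern is to do math on full-width XMVECTORs and only store into the packed representation at the boundary with GPU data.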
+ diff --git a/src/thirdparty/DirectXMath-dec2022/MatrixStack/DirectXMatrixStack.h b/src/thirdparty/DirectXMath-dec2022/MatrixStack/DirectXMatrixStack.h new file mode 100644 index 000000000..46fe263ca --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/MatrixStack/DirectXMatrixStack.h @@ -0,0 +1,241 @@ +//------------------------------------------------------------------------------------- +// DirectXMatrixStack.h -- DirectXMath C++ Matrix Stack +// +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. +// +// http://go.microsoft.com/fwlink/?LinkID=615560 +//------------------------------------------------------------------------------------- + +#pragma once + +#include +#include +#include +#include +#include +#include + +#ifdef _WIN32 +#include +#endif + +#include + + +namespace DirectX +{ + class MatrixStack + { + public: + MatrixStack(size_t startSize = 16) noexcept(false) : + m_stackSize(0), + m_current(0), + m_stack(nullptr) + { + assert(startSize > 0); + Allocate(startSize); + LoadIdentity(); + } + + MatrixStack(MatrixStack&&) = default; + MatrixStack& operator= (MatrixStack&&) = default; + + MatrixStack(MatrixStack const&) = delete; + MatrixStack& operator= (MatrixStack const&) = delete; + + const XMMATRIX XM_CALLCONV Top() const noexcept { return m_stack[m_current]; } + const XMMATRIX* GetTop() const noexcept { return &m_stack[m_current]; } + + size_t Size() const noexcept { return (m_current + 1); } + + void Pop() + { + if (m_current > 0) + { + --m_current; + } + } + + void Push() + { + ++m_current; + + if (m_current >= m_stackSize) + { + Allocate(m_stackSize * 2); + } + + // Replicate the original top of the matrix stack. + m_stack[m_current] = m_stack[m_current - 1]; + } + + // Loads identity into the top of the matrix stack. + void LoadIdentity() noexcept + { + m_stack[m_current] = XMMatrixIdentity(); + } + + // Load a matrix into the top of the matrix stack. + void XM_CALLCONV LoadMatrix(FXMMATRIX matrix) noexcept + { + m_stack[m_current] = matrix; + } + + // Multiply a matrix by the top of the stack, store result in top. + void XM_CALLCONV MultiplyMatrix(FXMMATRIX matrix) noexcept + { + m_stack[m_current] = XMMatrixMultiply(m_stack[m_current], matrix); + } + + // Pre-multiplies a matrix by the top of the stack, store result in top. + void XM_CALLCONV MultiplyMatrixLocal(FXMMATRIX matrix) noexcept + { + m_stack[m_current] = XMMatrixMultiply(matrix, m_stack[m_current]); + } + + // Add a rotation about X to stack top. + void XM_CALLCONV RotateX(float angle) noexcept + { + XMMATRIX mat = XMMatrixRotationX(angle); + m_stack[m_current] = XMMatrixMultiply(m_stack[m_current], mat); + } + + void XM_CALLCONV RotateXLocal(float angle) noexcept + { + XMMATRIX mat = XMMatrixRotationX(angle); + m_stack[m_current] = XMMatrixMultiply(mat, m_stack[m_current]); + } + + // Add a rotation about Y to stack top. + void XM_CALLCONV RotateY(float angle) noexcept + { + XMMATRIX mat = XMMatrixRotationY(angle); + m_stack[m_current] = XMMatrixMultiply(m_stack[m_current], mat); + } + + void XM_CALLCONV RotateYLocal(float angle) noexcept + { + XMMATRIX mat = XMMatrixRotationY(angle); + m_stack[m_current] = XMMatrixMultiply(mat, m_stack[m_current]); + } + + // Add a rotation about Z to stack top. 
+ void XM_CALLCONV RotateZ(float angle) noexcept + { + XMMATRIX mat = XMMatrixRotationZ(angle); + m_stack[m_current] = XMMatrixMultiply(m_stack[m_current], mat); + } + + void XM_CALLCONV RotateZLocal(float angle) noexcept + { + XMMATRIX mat = XMMatrixRotationZ(angle); + m_stack[m_current] = XMMatrixMultiply(mat, m_stack[m_current]); + } + + // Add a rotation around an axis to stack top. + void XM_CALLCONV RotateAxis(FXMVECTOR axis, float angle) noexcept + { + XMMATRIX mat = XMMatrixRotationAxis(axis, angle); + m_stack[m_current] = XMMatrixMultiply(m_stack[m_current], mat); + } + + void XM_CALLCONV RotateAxisLocal(FXMVECTOR axis, float angle) noexcept + { + XMMATRIX mat = XMMatrixRotationAxis(axis, angle); + m_stack[m_current] = XMMatrixMultiply(mat, m_stack[m_current]); + } + + // Add a rotation by roll/pitch/yaw to the stack top. + void RotateRollPitchYaw(float pitch, float yaw, float roll) noexcept + { + XMMATRIX mat = XMMatrixRotationRollPitchYaw(pitch, yaw, roll); + m_stack[m_current] = XMMatrixMultiply(m_stack[m_current], mat); + } + + void RotateRollPitchYawLocal(float pitch, float yaw, float roll) noexcept + { + XMMATRIX mat = XMMatrixRotationRollPitchYaw(pitch, yaw, roll); + m_stack[m_current] = XMMatrixMultiply(mat, m_stack[m_current]); + } + + // Add a rotation by a quaternion stack top. + void XM_CALLCONV RotateByQuaternion(FXMVECTOR quat) noexcept + { + XMMATRIX mat = XMMatrixRotationQuaternion(quat); + m_stack[m_current] = XMMatrixMultiply(m_stack[m_current], mat); + } + + void XM_CALLCONV RotateByQuaternionLocal(FXMVECTOR quat) noexcept + { + XMMATRIX mat = XMMatrixRotationQuaternion(quat); + m_stack[m_current] = XMMatrixMultiply(mat, m_stack[m_current]); + } + + // Add a scale to the stack top. + void Scale(float x, float y, float z) noexcept + { + XMMATRIX mat = XMMatrixScaling(x, y, z); + m_stack[m_current] = XMMatrixMultiply(m_stack[m_current], mat); + } + + void ScaleLocal(float x, float y, float z) noexcept + { + XMMATRIX mat = XMMatrixScaling(x, y, z); + m_stack[m_current] = XMMatrixMultiply(mat, m_stack[m_current]); + } + + // Add a translation to the stack top. + void Translate(float x, float y, float z) noexcept + { + XMMATRIX mat = XMMatrixTranslation(x, y, z); + m_stack[m_current] = XMMatrixMultiply(m_stack[m_current], mat); + } + + void TranslateLocal(float x, float y, float z) noexcept + { + XMMATRIX mat = XMMatrixTranslation(x, y, z); + m_stack[m_current] = XMMatrixMultiply(mat, m_stack[m_current]); + } + + private: + + struct matrix_deleter + { + void operator()(void* p) noexcept + { +#ifdef _WIN32 + _aligned_free(p); +#else + free(p); +#endif + } + }; + + void Allocate(size_t newSize) + { +#ifdef _WIN32 + void* ptr = _aligned_malloc(newSize * sizeof(XMMATRIX), 16); +#else + // This C++17 Standard Library function is currently NOT + // implemented for the Microsoft Standard C++ Library. 
+ void* ptr = aligned_alloc(16, newSize * sizeof(XMMATRIX)); +#endif + if (!ptr) + throw std::bad_alloc(); + + if (m_stack) + { + assert(newSize >= m_stackSize); + memcpy(ptr, m_stack.get(), sizeof(XMMATRIX) * m_stackSize); + } + + m_stack.reset(reinterpret_cast(ptr)); + m_stackSize = newSize; + } + + size_t m_stackSize; + size_t m_current; + std::unique_ptr m_stack; + }; +} // namespace DirectX diff --git a/src/thirdparty/DirectXMath-dec2022/README.md b/src/thirdparty/DirectXMath-dec2022/README.md new file mode 100644 index 000000000..448640c56 --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/README.md @@ -0,0 +1,115 @@ +![DirectX Logo](https://raw.githubusercontent.com/wiki/Microsoft/DirectXMath/X_jpg.jpg) + +# DirectXMath + +https://github.com/Microsoft/DirectXMath + +Copyright (c) Microsoft Corporation. + +**December 2022** + +This package contains the DirectXMath library, an all inline SIMD C++ linear algebra library for use in games and graphics apps. + +This code is designed to build with Visual Studio 2019 (16.11), Visual Studio 2022, or clang/LLVM for Windows. It is recommended that you make use of the latest updates. + +These components are designed to work without requiring any content from the legacy DirectX SDK. For details, see [Where is the DirectX SDK?](https://aka.ms/dxsdk). + +## Directory Layout + +* ``Inc\`` + + + DirectXMath Files (in the DirectX C++ namespace) + + * DirectXMath.h - Core library + * DirectXPackedVector.h - Load/Store functions and types for working with various compressed GPU formats + * DirectXColors.h - .NET-style Color defines in sRGB and linear color space + * DirectXCollision.h - Bounding volume collision library + +* ``Extentions\`` + + + Advanced instruction set variants for guarded codepaths + + * DirectXMathSSE3.h - SSE3 + * DirectXMathBE.h - Supplemental SSE3 (SSSE3) + * DirectXMathSSE4.h - SSE4.1 + * DirectXMathAVX.h - Advanced Vector Extensions (AVX) + * DirectXMathAVX2.h - Advanced Vector Extensions 2 (AVX2) + * DirectXMathF16C.h - Half-precision conversions (F16C) + * DirectXMathFMA3.h - Fused multiply-accumulate (FMA3) + * DirectXMathFMA4.h - Fused multiply-accumulate (FMA4) + +* ``SHMath\`` + + + Spherical Harmonics math functions + + * DirectXSH.h - Header for SHMath functions + * DirectXSH.cpp, DirectXSHD3D11.cpp, DirectXSHD3D12.cpp - Implementation + +* ``XDSP\`` + + + XDSP.h - Digital Signal Processing helper functions + +* ``build\`` + + + Contains YAML files for the build pipelines along with some miscellaneous build files and scripts. + +## Documentation + +Documentation is available on the [Microsoft Docs](https://docs.microsoft.com/en-us/windows/desktop/dxmath/directxmath-portal). Additional information can be found on the [project wiki](https://github.com/microsoft/DirectXMath/wiki). + +## Compiler support + +Officially the library is supported with Microsoft Visual C++ 2019 or later, clang/LLVM v12 or later, and GCC 9 or later. It should also compile with the Intel C++ and MinGW compilers. + +When building with clang/LLVM or other GNU C compilers, the ``_XM_NO_XMVECTOR_OVERLOADS_`` control define is set because these compilers do not support creating operator overloads for the ``XMVECTOR`` type. You can choose to enable this preprocessor define explicitly to do the same thing with Visual C++ for improved portability. + +To build for non-Windows platforms, you need to provide a ``sal.h`` header in your include path. 
You can obtain an open source version from [GitHub](https://github.com/dotnet/corert/blob/master/src/Native/inc/unix/sal.h). + +With GCC, the SAL annotation preprocessor symbols can conflict with the GNU implementation of the Standard C++ Library. The workaround is to include the system headers before including DirectXMath: + +``` +#include +#include + +#include +``` + +## Notices + +All content and source code for this package are subject to the terms of the [MIT License](https://github.com/microsoft/DirectXMath/blob/main/LICENSE). + +For the latest version of DirectXMath, bug reports, etc. please visit the project site on [GitHub](https://github.com/microsoft/DirectXMath). + +## Support + +For questions, consider using [Stack Overflow](https://stackoverflow.com/questions/tagged/directxmath) with the *directxmath* tag, or the [DirectX Discord Server](https://discord.gg/directx) in the *dx12-developers* or *dx9-dx11-developers* channel. + +For bug reports and feature requests, please use GitHub [issues](https://github.com/microsoft/DirectXMath/issues) for this project. + +## Contributing + +This project welcomes contributions and suggestions. Most contributions require you to agree to a Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com. + +When you submit a pull request, a CLA bot will automatically determine whether you need to provide a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions provided by the bot. You will only need to do this once across all repos using our CLA. + +This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. + +## Trademarks + +This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft trademarks or logos is subject to and must follow [Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general). Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship. Any use of third-party trademarks or logos are subject to those third-party's policies. + +## Credits + +The xboxmath library was originated by Matt Bronder with contributions from Sakphong Chanbai and David Hefner for the Xbox 360. + +The xnamath library for the DirectX SDK and Xbox XDK was the work of Chuck Walbourn and Becky Heineman based on xboxmath, with contributions from Jeremy Gup, Dan Haffner, Matt Lee, Casey Meekhof, Rich Sauer, Jason Strayer, and Xiaoyue Zheng. + +The DirectXMath library for the Windows SDK and Xbox One XDK is the work of Chuck Walbourn based on xnamath, with contributions from Darren Anderson, Matt Lee, Aaron Rodriguez Hernandez, Yuichi Ito, Reza Nourai, Rich Sauer, and Jason Strayer. + +Thanks to Dave Eberly for his contributions particularly in improving the transcendental functions. + +Thanks to Bruce Dawson for his help with the rounding functions. + +Thanks to Andrew Farrier for the fixes to ``XMVerifyCPUSupport`` to properly support clang. 
+ +Thanks to Scott Matloff for his help in getting the library updated to use Intel SVML for VS 2019. diff --git a/src/thirdparty/DirectXMath-dec2022/SECURITY.md b/src/thirdparty/DirectXMath-dec2022/SECURITY.md new file mode 100644 index 000000000..f7b89984f --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/SECURITY.md @@ -0,0 +1,41 @@ + + +## Security + +Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). + +If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below. + +## Reporting Security Issues + +**Please do not report security vulnerabilities through public GitHub issues.** + +Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report). + +If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc). + +You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). + +Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: + + * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) + * Full paths of source file(s) related to the manifestation of the issue + * The location of the affected source code (tag/branch/commit or direct URL) + * Any special configuration required to reproduce the issue + * Step-by-step instructions to reproduce the issue + * Proof-of-concept or exploit code (if possible) + * Impact of the issue, including how an attacker might exploit the issue + +This information will help us triage your report more quickly. + +If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs. + +## Preferred Languages + +We prefer all communications to be in English. + +## Policy + +Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd). + + \ No newline at end of file diff --git a/src/thirdparty/DirectXMath-dec2022/SHMath/DirectXSH.cpp b/src/thirdparty/DirectXMath-dec2022/SHMath/DirectXSH.cpp new file mode 100644 index 000000000..a2c504bce --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/SHMath/DirectXSH.cpp @@ -0,0 +1,4908 @@ +//----------------------------------------------------------------------------------- +// DirectXSH.cpp -- C++ Spherical Harmonics Math Library +// +// Copyright (c) Microsoft Corporation. 
+// Licensed under the MIT License. +// +// http://go.microsoft.com/fwlink/p/?LinkId=262885 +//------------------------------------------------------------------------------------- + +#ifdef _MSC_VER +#pragma warning( disable : 4619 4456 5264) +// C4619 #pragma warning warnings +// C4456 declaration hides previous local declaration +// C5264 'const' variable is not used +#endif + +#ifdef __clang__ +#pragma clang diagnostic ignored "-Wold-style-cast" +#pragma clang diagnostic ignored "-Wshadow" +#pragma clang diagnostic ignored "-Wunused-const-variable" +#pragma clang diagnostic ignored "-Wunused-function" +#endif + +#include "DirectXSH.h" +#include + +using namespace DirectX; + +namespace +{ +#ifdef _PREFAST_ +#pragma prefast(disable:246, "generated code by maple (nested const variable definitions)") +#endif + + const float fExtraNormFac[XM_SH_MAXORDER] = { 2.0f*sqrtf(XM_PI), 2.0f / 3.0f*sqrtf(3.0f*XM_PI), 2.0f / 5.0f*sqrtf(5.0f*XM_PI), 2.0f / 7.0f*sqrtf(7.0f*XM_PI), 2.0f / 3.0f*sqrtf(XM_PI), 2.0f / 11.0f*sqrtf(11.0f*XM_PI) }; + + // computes the integral of a constant function over a solid angular + // extent. No error checking - only used internaly. This function + // only returns the Yl0 coefficients, since the rest are zero for + // circularly symmetric functions. + const float ComputeCapInt_t1 = sqrtf(0.3141593E1f); + const float ComputeCapInt_t5 = sqrtf(3.0f); + const float ComputeCapInt_t11 = sqrtf(5.0f); + const float ComputeCapInt_t18 = sqrtf(7.0f); + const float ComputeCapInt_t32 = sqrtf(11.0f); + + inline void ComputeCapInt(const size_t order, float angle, float *pR) + { + const float t2 = cosf(angle); + const float t3 = ComputeCapInt_t1*t2; + const float t7 = sinf(angle); + const float t8 = t7*t7; + + + pR[0] = -t3 + ComputeCapInt_t1; + pR[1] = ComputeCapInt_t5*ComputeCapInt_t1*t8 / 2.0f; + + if (order > 2) + { + const float t13 = t2*t2; + + pR[2] = -ComputeCapInt_t11*ComputeCapInt_t1*t2*(t13 - 1.0f) / 2.0f; + if (order > 3) + { + const float t19 = ComputeCapInt_t18*ComputeCapInt_t1; + const float t20 = t13*t13; + + pR[3] = -5.0f / 8.0f*t19*t20 + 3.0f / 4.0f*t19*t13 - t19 / 8.0f; + if (order > 4) + { + + + pR[4] = -3.0f / 8.0f*t3*(7.0f*t20 - 10.0f*t13 + 3.0f); + if (order > 5) + { + const float t33 = ComputeCapInt_t32*ComputeCapInt_t1; + pR[5] = -21.0f / 16.0f*t33*t20*t13 + 35.0f / 16.0f*t33*t20 - 15.0f / 16.0f*t33*t13 + t33 / 16.0f; + } + } + } + } + } + + // input pF only consists of Yl0 values, normalizes coefficients for directional + // lights. + inline float CosWtInt(const size_t order) + { + const float fCW0 = 0.25f; + const float fCW1 = 0.5f; + const float fCW2 = 5.0f / 16.0f; + //const float fCW3 = 0.0f; + const float fCW4 = -3.0f / 32.0f; + //const float fCW5 = 0.0f; + + // order has to be at least linear... + + float fRet = fCW0 + fCW1; + + if (order > 2) fRet += fCW2; + if (order > 4) fRet += fCW4; + + // odd degrees >= 3 evaluate to zero integrated against cosine... + + return fRet; + } + + const float SHEvalHemisphereLight_fSqrtPi = sqrtf(XM_PI); + const float SHEvalHemisphereLight_fSqrtPi3 = sqrtf(XM_PI / 3.0f); + + using REAL = float; +#define CONSTANT(x) (x ## f) + + // routine generated programmatically for evaluating SH basis for degree 1 + // inputs (x,y,z) are a point on the sphere (i.e., must be unit length) + // output is vector b with SH basis evaluated at (x,y,z). 
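+ // Note: each sh_eval_basis_N routine below fills b[0..(N+1)^2 - 1], storing the
+ // band-l, order-m coefficient at index l*(l+1) + m. The leading constants are the
+ // usual real spherical-harmonic normalizations, e.g. 0.2820947918 = 1/(2*sqrt(pi))
+ // for Y_0^0 and 0.4886025119 = sqrt(3/(4*pi)) for the z term of band 1.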
+ // + inline void sh_eval_basis_1(REAL x, REAL y, REAL z, REAL b[4]) + { + /* m=0 */ + + // l=0 + const REAL p_0_0 = CONSTANT(0.282094791773878140); + b[0] = p_0_0; // l=0,m=0 + // l=1 + const REAL p_1_0 = CONSTANT(0.488602511902919920)*z; + b[2] = p_1_0; // l=1,m=0 + + + /* m=1 */ + + const REAL s1 = y; + const REAL c1 = x; + + // l=1 + const REAL p_1_1 = CONSTANT(-0.488602511902919920); + b[1] = p_1_1*s1; // l=1,m=-1 + b[3] = p_1_1*c1; // l=1,m=+1 + } + + // routine generated programmatically for evaluating SH basis for degree 2 + // inputs (x,y,z) are a point on the sphere (i.e., must be unit length) + // output is vector b with SH basis evaluated at (x,y,z). + // + inline void sh_eval_basis_2(REAL x, REAL y, REAL z, REAL b[9]) + { + const REAL z2 = z*z; + + + /* m=0 */ + + // l=0 + const REAL p_0_0 = CONSTANT(0.282094791773878140); + b[0] = p_0_0; // l=0,m=0 + // l=1 + const REAL p_1_0 = CONSTANT(0.488602511902919920)*z; + b[2] = p_1_0; // l=1,m=0 + // l=2 + const REAL p_2_0 = CONSTANT(0.946174695757560080)*z2 + CONSTANT(-0.315391565252520050); + b[6] = p_2_0; // l=2,m=0 + + + /* m=1 */ + + const REAL s1 = y; + const REAL c1 = x; + + // l=1 + const REAL p_1_1 = CONSTANT(-0.488602511902919920); + b[1] = p_1_1*s1; // l=1,m=-1 + b[3] = p_1_1*c1; // l=1,m=+1 + // l=2 + const REAL p_2_1 = CONSTANT(-1.092548430592079200)*z; + b[5] = p_2_1*s1; // l=2,m=-1 + b[7] = p_2_1*c1; // l=2,m=+1 + + + /* m=2 */ + + const REAL s2 = x*s1 + y*c1; + const REAL c2 = x*c1 - y*s1; + + // l=2 + const REAL p_2_2 = CONSTANT(0.546274215296039590); + b[4] = p_2_2*s2; // l=2,m=-2 + b[8] = p_2_2*c2; // l=2,m=+2 + } + + // routine generated programmatically for evaluating SH basis for degree 3 + // inputs (x,y,z) are a point on the sphere (i.e., must be unit length) + // output is vector b with SH basis evaluated at (x,y,z). 
+ // + void sh_eval_basis_3(REAL x, REAL y, REAL z, REAL b[16]) + { + const REAL z2 = z*z; + + + /* m=0 */ + + // l=0 + const REAL p_0_0 = CONSTANT(0.282094791773878140); + b[0] = p_0_0; // l=0,m=0 + // l=1 + const REAL p_1_0 = CONSTANT(0.488602511902919920)*z; + b[2] = p_1_0; // l=1,m=0 + // l=2 + const REAL p_2_0 = CONSTANT(0.946174695757560080)*z2 + CONSTANT(-0.315391565252520050); + b[6] = p_2_0; // l=2,m=0 + // l=3 + const REAL p_3_0 = z*(CONSTANT(1.865881662950577000)*z2 + CONSTANT(-1.119528997770346200)); + b[12] = p_3_0; // l=3,m=0 + + + /* m=1 */ + + const REAL s1 = y; + const REAL c1 = x; + + // l=1 + const REAL p_1_1 = CONSTANT(-0.488602511902919920); + b[1] = p_1_1*s1; // l=1,m=-1 + b[3] = p_1_1*c1; // l=1,m=+1 + // l=2 + const REAL p_2_1 = CONSTANT(-1.092548430592079200)*z; + b[5] = p_2_1*s1; // l=2,m=-1 + b[7] = p_2_1*c1; // l=2,m=+1 + // l=3 + const REAL p_3_1 = CONSTANT(-2.285228997322328800)*z2 + CONSTANT(0.457045799464465770); + b[11] = p_3_1*s1; // l=3,m=-1 + b[13] = p_3_1*c1; // l=3,m=+1 + + + /* m=2 */ + + const REAL s2 = x*s1 + y*c1; + const REAL c2 = x*c1 - y*s1; + + // l=2 + const REAL p_2_2 = CONSTANT(0.546274215296039590); + b[4] = p_2_2*s2; // l=2,m=-2 + b[8] = p_2_2*c2; // l=2,m=+2 + // l=3 + const REAL p_3_2 = CONSTANT(1.445305721320277100)*z; + b[10] = p_3_2*s2; // l=3,m=-2 + b[14] = p_3_2*c2; // l=3,m=+2 + + + /* m=3 */ + + const REAL s3 = x*s2 + y*c2; + const REAL c3 = x*c2 - y*s2; + + // l=3 + const REAL p_3_3 = CONSTANT(-0.590043589926643520); + b[9] = p_3_3*s3; // l=3,m=-3 + b[15] = p_3_3*c3; // l=3,m=+3 + } + + // routine generated programmatically for evaluating SH basis for degree 4 + // inputs (x,y,z) are a point on the sphere (i.e., must be unit length) + // output is vector b with SH basis evaluated at (x,y,z). 
+ // + void sh_eval_basis_4(REAL x, REAL y, REAL z, REAL b[25]) + { + const REAL z2 = z*z; + + + /* m=0 */ + + // l=0 + const REAL p_0_0 = CONSTANT(0.282094791773878140); + b[0] = p_0_0; // l=0,m=0 + // l=1 + const REAL p_1_0 = CONSTANT(0.488602511902919920)*z; + b[2] = p_1_0; // l=1,m=0 + // l=2 + const REAL p_2_0 = CONSTANT(0.946174695757560080)*z2 + CONSTANT(-0.315391565252520050); + b[6] = p_2_0; // l=2,m=0 + // l=3 + const REAL p_3_0 = z*(CONSTANT(1.865881662950577000)*z2 + CONSTANT(-1.119528997770346200)); + b[12] = p_3_0; // l=3,m=0 + // l=4 + const REAL p_4_0 = CONSTANT(1.984313483298443000)*z*p_3_0 + CONSTANT(-1.006230589874905300)*p_2_0; + b[20] = p_4_0; // l=4,m=0 + + + /* m=1 */ + + const REAL s1 = y; + const REAL c1 = x; + + // l=1 + const REAL p_1_1 = CONSTANT(-0.488602511902919920); + b[1] = p_1_1*s1; // l=1,m=-1 + b[3] = p_1_1*c1; // l=1,m=+1 + // l=2 + const REAL p_2_1 = CONSTANT(-1.092548430592079200)*z; + b[5] = p_2_1*s1; // l=2,m=-1 + b[7] = p_2_1*c1; // l=2,m=+1 + // l=3 + const REAL p_3_1 = CONSTANT(-2.285228997322328800)*z2 + CONSTANT(0.457045799464465770); + b[11] = p_3_1*s1; // l=3,m=-1 + b[13] = p_3_1*c1; // l=3,m=+1 + // l=4 + const REAL p_4_1 = z*(CONSTANT(-4.683325804901024000)*z2 + CONSTANT(2.007139630671867200)); + b[19] = p_4_1*s1; // l=4,m=-1 + b[21] = p_4_1*c1; // l=4,m=+1 + + + /* m=2 */ + + const REAL s2 = x*s1 + y*c1; + const REAL c2 = x*c1 - y*s1; + + // l=2 + const REAL p_2_2 = CONSTANT(0.546274215296039590); + b[4] = p_2_2*s2; // l=2,m=-2 + b[8] = p_2_2*c2; // l=2,m=+2 + // l=3 + const REAL p_3_2 = CONSTANT(1.445305721320277100)*z; + b[10] = p_3_2*s2; // l=3,m=-2 + b[14] = p_3_2*c2; // l=3,m=+2 + // l=4 + const REAL p_4_2 = CONSTANT(3.311611435151459800)*z2 + CONSTANT(-0.473087347878779980); + b[18] = p_4_2*s2; // l=4,m=-2 + b[22] = p_4_2*c2; // l=4,m=+2 + + + /* m=3 */ + + const REAL s3 = x*s2 + y*c2; + const REAL c3 = x*c2 - y*s2; + + // l=3 + const REAL p_3_3 = CONSTANT(-0.590043589926643520); + b[9] = p_3_3*s3; // l=3,m=-3 + b[15] = p_3_3*c3; // l=3,m=+3 + // l=4 + const REAL p_4_3 = CONSTANT(-1.770130769779930200)*z; + b[17] = p_4_3*s3; // l=4,m=-3 + b[23] = p_4_3*c3; // l=4,m=+3 + + + /* m=4 */ + + const REAL s4 = x*s3 + y*c3; + const REAL c4 = x*c3 - y*s3; + + // l=4 + const REAL p_4_4 = CONSTANT(0.625835735449176030); + b[16] = p_4_4*s4; // l=4,m=-4 + b[24] = p_4_4*c4; // l=4,m=+4 + } + + // routine generated programmatically for evaluating SH basis for degree 5 + // inputs (x,y,z) are a point on the sphere (i.e., must be unit length) + // output is vector b with SH basis evaluated at (x,y,z). 
+ // + void sh_eval_basis_5(REAL x, REAL y, REAL z, REAL b[36]) + { + const REAL z2 = z*z; + + + /* m=0 */ + + // l=0 + const REAL p_0_0 = CONSTANT(0.282094791773878140); + b[0] = p_0_0; // l=0,m=0 + // l=1 + const REAL p_1_0 = CONSTANT(0.488602511902919920)*z; + b[2] = p_1_0; // l=1,m=0 + // l=2 + const REAL p_2_0 = CONSTANT(0.946174695757560080)*z2 + CONSTANT(-0.315391565252520050); + b[6] = p_2_0; // l=2,m=0 + // l=3 + const REAL p_3_0 = z*(CONSTANT(1.865881662950577000)*z2 + CONSTANT(-1.119528997770346200)); + b[12] = p_3_0; // l=3,m=0 + // l=4 + const REAL p_4_0 = CONSTANT(1.984313483298443000)*z*p_3_0 + CONSTANT(-1.006230589874905300)*p_2_0; + b[20] = p_4_0; // l=4,m=0 + // l=5 + const REAL p_5_0 = CONSTANT(1.989974874213239700)*z*p_4_0 + CONSTANT(-1.002853072844814000)*p_3_0; + b[30] = p_5_0; // l=5,m=0 + + + /* m=1 */ + + const REAL s1 = y; + const REAL c1 = x; + + // l=1 + const REAL p_1_1 = CONSTANT(-0.488602511902919920); + b[1] = p_1_1*s1; // l=1,m=-1 + b[3] = p_1_1*c1; // l=1,m=+1 + // l=2 + const REAL p_2_1 = CONSTANT(-1.092548430592079200)*z; + b[5] = p_2_1*s1; // l=2,m=-1 + b[7] = p_2_1*c1; // l=2,m=+1 + // l=3 + const REAL p_3_1 = CONSTANT(-2.285228997322328800)*z2 + CONSTANT(0.457045799464465770); + b[11] = p_3_1*s1; // l=3,m=-1 + b[13] = p_3_1*c1; // l=3,m=+1 + // l=4 + const REAL p_4_1 = z*(CONSTANT(-4.683325804901024000)*z2 + CONSTANT(2.007139630671867200)); + b[19] = p_4_1*s1; // l=4,m=-1 + b[21] = p_4_1*c1; // l=4,m=+1 + // l=5 + const REAL p_5_1 = CONSTANT(2.031009601158990200)*z*p_4_1 + CONSTANT(-0.991031208965114650)*p_3_1; + b[29] = p_5_1*s1; // l=5,m=-1 + b[31] = p_5_1*c1; // l=5,m=+1 + + + /* m=2 */ + + const REAL s2 = x*s1 + y*c1; + const REAL c2 = x*c1 - y*s1; + + // l=2 + const REAL p_2_2 = CONSTANT(0.546274215296039590); + b[4] = p_2_2*s2; // l=2,m=-2 + b[8] = p_2_2*c2; // l=2,m=+2 + // l=3 + const REAL p_3_2 = CONSTANT(1.445305721320277100)*z; + b[10] = p_3_2*s2; // l=3,m=-2 + b[14] = p_3_2*c2; // l=3,m=+2 + // l=4 + const REAL p_4_2 = CONSTANT(3.311611435151459800)*z2 + CONSTANT(-0.473087347878779980); + b[18] = p_4_2*s2; // l=4,m=-2 + b[22] = p_4_2*c2; // l=4,m=+2 + // l=5 + const REAL p_5_2 = z*(CONSTANT(7.190305177459987500)*z2 + CONSTANT(-2.396768392486662100)); + b[28] = p_5_2*s2; // l=5,m=-2 + b[32] = p_5_2*c2; // l=5,m=+2 + + + /* m=3 */ + + const REAL s3 = x*s2 + y*c2; + const REAL c3 = x*c2 - y*s2; + + // l=3 + const REAL p_3_3 = CONSTANT(-0.590043589926643520); + b[9] = p_3_3*s3; // l=3,m=-3 + b[15] = p_3_3*c3; // l=3,m=+3 + // l=4 + const REAL p_4_3 = CONSTANT(-1.770130769779930200)*z; + b[17] = p_4_3*s3; // l=4,m=-3 + b[23] = p_4_3*c3; // l=4,m=+3 + // l=5 + const REAL p_5_3 = CONSTANT(-4.403144694917253700)*z2 + CONSTANT(0.489238299435250430); + b[27] = p_5_3*s3; // l=5,m=-3 + b[33] = p_5_3*c3; // l=5,m=+3 + + + /* m=4 */ + + const REAL s4 = x*s3 + y*c3; + const REAL c4 = x*c3 - y*s3; + + // l=4 + const REAL p_4_4 = CONSTANT(0.625835735449176030); + b[16] = p_4_4*s4; // l=4,m=-4 + b[24] = p_4_4*c4; // l=4,m=+4 + // l=5 + const REAL p_5_4 = CONSTANT(2.075662314881041100)*z; + b[26] = p_5_4*s4; // l=5,m=-4 + b[34] = p_5_4*c4; // l=5,m=+4 + + + /* m=5 */ + + const REAL s5 = x*s4 + y*c4; + const REAL c5 = x*c4 - y*s4; + + // l=5 + const REAL p_5_5 = CONSTANT(-0.656382056840170150); + b[25] = p_5_5*s5; // l=5,m=-5 + b[35] = p_5_5*c5; // l=5,m=+5 + } + + const REAL M_PIjs = (REAL)(4.0*atan(1.0)); + const REAL maxang = (REAL)(M_PIjs / 2); + const int NSH0 = 1; + const int NSH1 = 4; + const int NSH2 = 9; + const int NSH3 = 16; + const int NSH4 = 25; 
+ const int NSH5 = 36; + const int NSH6 = 49; + const int NSH7 = 64; + const int NSH8 = 81; + const int NSH9 = 100; + const int NL0 = 1; + const int NL1 = 3; + const int NL2 = 5; + const int NL3 = 7; + const int NL4 = 9; + const int NL5 = 11; + const int NL6 = 13; + const int NL7 = 15; + const int NL8 = 17; + const int NL9 = 19; + + inline void rot(REAL ct, REAL st, REAL x, REAL y, REAL &xout, REAL &yout) + { + xout = x*ct - y*st; + yout = y*ct + x*st; + } + + inline void rot_inv(REAL ct, REAL st, REAL x, REAL y, REAL &xout, REAL &yout) + { + xout = x*ct + y*st; + yout = y*ct - x*st; + } + + inline void rot_1(REAL ct, REAL st, REAL ctm[1], REAL stm[1]) + { + ctm[0] = ct; + stm[0] = st; + } + + inline void rot_2(REAL ct, REAL st, REAL ctm[2], REAL stm[2]) + { + REAL ct2 = CONSTANT(2.0)*ct; + ctm[0] = ct; + stm[0] = st; + ctm[1] = ct2*ct - CONSTANT(1.0); + stm[1] = ct2*st; + } + + inline void rot_3(REAL ct, REAL st, REAL ctm[3], REAL stm[3]) + { + REAL ct2 = CONSTANT(2.0)*ct; + ctm[0] = ct; + stm[0] = st; + ctm[1] = ct2*ct - CONSTANT(1.0); + stm[1] = ct2*st; + ctm[2] = ct2*ctm[1] - ct; + stm[2] = ct2*stm[1] - st; + } + + inline void rot_4(REAL ct, REAL st, REAL ctm[4], REAL stm[4]) + { + REAL ct2 = CONSTANT(2.0)*ct; + ctm[0] = ct; + stm[0] = st; + ctm[1] = ct2*ct - CONSTANT(1.0); + stm[1] = ct2*st; + ctm[2] = ct2*ctm[1] - ct; + stm[2] = ct2*stm[1] - st; + ctm[3] = ct2*ctm[2] - ctm[1]; + stm[3] = ct2*stm[2] - stm[1]; + } + + inline void rot_5(REAL ct, REAL st, REAL ctm[5], REAL stm[5]) + { + REAL ct2 = CONSTANT(2.0)*ct; + ctm[0] = ct; + stm[0] = st; + ctm[1] = ct2*ct - CONSTANT(1.0); + stm[1] = ct2*st; + ctm[2] = ct2*ctm[1] - ct; + stm[2] = ct2*stm[1] - st; + ctm[3] = ct2*ctm[2] - ctm[1]; + stm[3] = ct2*stm[2] - stm[1]; + ctm[4] = ct2*ctm[3] - ctm[2]; + stm[4] = ct2*stm[3] - stm[2]; + } + + inline void sh_rotz_1(REAL ctm[1], REAL stm[1], REAL y[NL1], REAL yr[NL1]) + { + yr[1] = y[1]; + rot_inv(ctm[0], stm[0], y[0], y[2], yr[0], yr[2]); + } + + inline void sh_rotz_2(REAL ctm[2], REAL stm[2], REAL y[NL2], REAL yr[NL2]) + { + yr[2] = y[2]; + rot_inv(ctm[0], stm[0], y[1], y[3], yr[1], yr[3]); + rot_inv(ctm[1], stm[1], y[0], y[4], yr[0], yr[4]); + } + + inline void sh_rotz_3(REAL ctm[3], REAL stm[3], REAL y[NL3], REAL yr[NL3]) + { + yr[3] = y[3]; + rot_inv(ctm[0], stm[0], y[2], y[4], yr[2], yr[4]); + rot_inv(ctm[1], stm[1], y[1], y[5], yr[1], yr[5]); + rot_inv(ctm[2], stm[2], y[0], y[6], yr[0], yr[6]); + } + + inline void sh_rotz_4(REAL ctm[4], REAL stm[4], REAL y[NL4], REAL yr[NL4]) + { + yr[4] = y[4]; + rot_inv(ctm[0], stm[0], y[3], y[5], yr[3], yr[5]); + rot_inv(ctm[1], stm[1], y[2], y[6], yr[2], yr[6]); + rot_inv(ctm[2], stm[2], y[1], y[7], yr[1], yr[7]); + rot_inv(ctm[3], stm[3], y[0], y[8], yr[0], yr[8]); + } + + inline void sh_rotz_5(REAL ctm[5], REAL stm[5], REAL y[NL5], REAL yr[NL5]) + { + yr[5] = y[5]; + rot_inv(ctm[0], stm[0], y[4], y[6], yr[4], yr[6]); + rot_inv(ctm[1], stm[1], y[3], y[7], yr[3], yr[7]); + rot_inv(ctm[2], stm[2], y[2], y[8], yr[2], yr[8]); + rot_inv(ctm[3], stm[3], y[1], y[9], yr[1], yr[9]); + rot_inv(ctm[4], stm[4], y[0], y[10], yr[0], yr[10]); + } + + // rotation code generated programmatically by rotatex (2000x4000 samples, eps=1e-008) + + const REAL fx_1_001 = (REAL)(sqrt(1.0) / 1.0); // 1 + const REAL fx_1_002 = (REAL)(-sqrt(1.0) / 1.0); // -1.00000030843 + + inline void sh_rotx90_1(REAL y[], REAL yr[]) + { + yr[0] = fx_1_001*y[1]; + yr[1] = fx_1_002*y[0]; + yr[2] = fx_1_001*y[2]; + }; + + inline void sh_rotx90_inv_1(REAL y[], REAL yr[]) + { + yr[0] = 
fx_1_002*y[1]; + yr[1] = fx_1_001*y[0]; + yr[2] = fx_1_001*y[2]; + } + + const REAL fx_2_001 = (REAL)(sqrt(4.0) / 2.0); // 1 + const REAL fx_2_002 = (REAL)(-sqrt(4.0) / 2.0); // -1 + const REAL fx_2_003 = (REAL)(-sqrt(1.0) / 2.0); // -0.500000257021 + const REAL fx_2_004 = (REAL)(-sqrt(3.0) / 2.0); // -0.866025848959 + const REAL fx_2_005 = (REAL)(sqrt(1.0) / 2.0); // 0.5 + + inline void sh_rotx90_2(REAL y[], REAL yr[]) + { + yr[0] = fx_2_001*y[3]; + yr[1] = fx_2_002*y[1]; + yr[2] = fx_2_003*y[2] + fx_2_004*y[4]; + yr[3] = fx_2_002*y[0]; + yr[4] = fx_2_004*y[2] + fx_2_005*y[4]; + }; + + inline void sh_rotx90_inv_2(REAL y[], REAL yr[]) + { + yr[0] = fx_2_002*y[3]; + yr[1] = fx_2_002*y[1]; + yr[2] = fx_2_003*y[2] + fx_2_004*y[4]; + yr[3] = fx_2_001*y[0]; + yr[4] = fx_2_004*y[2] + fx_2_005*y[4]; + } + + const REAL fx_3_001 = (REAL)(-sqrt(10.0) / 4.0); // -0.790569415042 + const REAL fx_3_002 = (REAL)(sqrt(6.0) / 4.0); // 0.612372435696 + const REAL fx_3_003 = (REAL)(-sqrt(16.0) / 4.0); // -1 + const REAL fx_3_004 = (REAL)(-sqrt(6.0) / 4.0); // -0.612372435695 + const REAL fx_3_005 = (REAL)(-sqrt(1.0) / 4.0); // -0.25 + const REAL fx_3_006 = (REAL)(-sqrt(15.0) / 4.0); // -0.968245836551 + const REAL fx_3_007 = (REAL)(sqrt(1.0) / 4.0); // 0.25 + const REAL fx_3_008 = (REAL)(sqrt(10.0) / 4.0); // 0.790569983984 + + inline void sh_rotx90_3(REAL y[], REAL yr[]) + { + yr[0] = fx_3_001*y[3] + fx_3_002*y[5]; + yr[1] = fx_3_003*y[1]; + yr[2] = fx_3_004*y[3] + fx_3_001*y[5]; + yr[3] = fx_3_008*y[0] + fx_3_002*y[2]; + yr[4] = fx_3_005*y[4] + fx_3_006*y[6]; + yr[5] = fx_3_004*y[0] - fx_3_001*y[2]; + yr[6] = fx_3_006*y[4] + fx_3_007*y[6]; + }; + + inline void sh_rotx90_inv_3(REAL y[], REAL yr[]) + { + yr[0] = fx_3_008*y[3] + fx_3_004*y[5]; + yr[1] = fx_3_003*y[1]; + yr[2] = fx_3_002*y[3] - fx_3_001*y[5]; + yr[3] = fx_3_001*y[0] + fx_3_004*y[2]; + yr[4] = fx_3_005*y[4] + fx_3_006*y[6]; + yr[5] = fx_3_002*y[0] + fx_3_001*y[2]; + yr[6] = fx_3_006*y[4] + fx_3_007*y[6]; + } + + const REAL fx_4_001 = (REAL)(-sqrt(56.0) / 8.0); // -0.935414346694 + const REAL fx_4_002 = (REAL)(sqrt(8.0) / 8.0); // 0.353553390593 + const REAL fx_4_003 = (REAL)(-sqrt(36.0) / 8.0); // -0.75 + const REAL fx_4_004 = (REAL)(sqrt(28.0) / 8.0); // 0.661437827766 + const REAL fx_4_005 = (REAL)(-sqrt(8.0) / 8.0); // -0.353553390593 + const REAL fx_4_006 = (REAL)(sqrt(36.0) / 8.0); // 0.749999999999 + const REAL fx_4_007 = (REAL)(sqrt(9.0) / 8.0); // 0.37500034698 + const REAL fx_4_008 = (REAL)(sqrt(20.0) / 8.0); // 0.559017511622 + const REAL fx_4_009 = (REAL)(sqrt(35.0) / 8.0); // 0.739510657141 + const REAL fx_4_010 = (REAL)(sqrt(16.0) / 8.0); // 0.5 + const REAL fx_4_011 = (REAL)(-sqrt(28.0) / 8.0); // -0.661437827766 + const REAL fx_4_012 = (REAL)(sqrt(1.0) / 8.0); // 0.125 + const REAL fx_4_013 = (REAL)(sqrt(56.0) / 8.0); // 0.935414346692 + + inline void sh_rotx90_4(REAL y[], REAL yr[]) + { + yr[0] = fx_4_001*y[5] + fx_4_002*y[7]; + yr[1] = fx_4_003*y[1] + fx_4_004*y[3]; + yr[2] = fx_4_005*y[5] + fx_4_001*y[7]; + yr[3] = fx_4_004*y[1] + fx_4_006*y[3]; + yr[4] = fx_4_007*y[4] + fx_4_008*y[6] + fx_4_009*y[8]; + yr[5] = fx_4_013*y[0] + fx_4_002*y[2]; + yr[6] = fx_4_008*y[4] + fx_4_010*y[6] + fx_4_011*y[8]; + yr[7] = fx_4_005*y[0] - fx_4_001*y[2]; + yr[8] = fx_4_009*y[4] + fx_4_011*y[6] + fx_4_012*y[8]; + }; + + inline void sh_rotx90_inv_4(REAL y[], REAL yr[]) + { + yr[0] = fx_4_013*y[5] + fx_4_005*y[7]; + yr[1] = fx_4_003*y[1] + fx_4_004*y[3]; + yr[2] = fx_4_002*y[5] - fx_4_001*y[7]; + yr[3] = fx_4_004*y[1] + fx_4_006*y[3]; + yr[4] = 
fx_4_007*y[4] + fx_4_008*y[6] + fx_4_009*y[8]; + yr[5] = fx_4_001*y[0] + fx_4_005*y[2]; + yr[6] = fx_4_008*y[4] + fx_4_010*y[6] + fx_4_011*y[8]; + yr[7] = fx_4_002*y[0] + fx_4_001*y[2]; + yr[8] = fx_4_009*y[4] + fx_4_011*y[6] + fx_4_012*y[8]; + } + + const REAL fx_5_001 = (REAL)(sqrt(126.0) / 16.0); // 0.70156076002 + const REAL fx_5_002 = (REAL)(-sqrt(120.0) / 16.0); // -0.684653196882 + const REAL fx_5_003 = (REAL)(sqrt(10.0) / 16.0); // 0.197642353761 + const REAL fx_5_004 = (REAL)(-sqrt(64.0) / 16.0); // -0.5 + const REAL fx_5_005 = (REAL)(sqrt(192.0) / 16.0); // 0.866025403784 + const REAL fx_5_006 = (REAL)(sqrt(70.0) / 16.0); // 0.522912516584 + const REAL fx_5_007 = (REAL)(sqrt(24.0) / 16.0); // 0.306186217848 + const REAL fx_5_008 = (REAL)(-sqrt(162.0) / 16.0); // -0.795495128835 + const REAL fx_5_009 = (REAL)(sqrt(64.0) / 16.0); // 0.5 + const REAL fx_5_010 = (REAL)(sqrt(60.0) / 16.0); // 0.484122918274 + const REAL fx_5_011 = (REAL)(sqrt(112.0) / 16.0); // 0.661437827763 + const REAL fx_5_012 = (REAL)(sqrt(84.0) / 16.0); // 0.572821961867 + const REAL fx_5_013 = (REAL)(sqrt(4.0) / 16.0); // 0.125 + const REAL fx_5_014 = (REAL)(sqrt(42.0) / 16.0); // 0.405046293649 + const REAL fx_5_015 = (REAL)(sqrt(210.0) / 16.0); // 0.905711046633 + const REAL fx_5_016 = (REAL)(sqrt(169.0) / 16.0); // 0.8125 + const REAL fx_5_017 = (REAL)(-sqrt(45.0) / 16.0); // -0.419262745781 + const REAL fx_5_018 = (REAL)(sqrt(1.0) / 16.0); // 0.0625 + const REAL fx_5_019 = (REAL)(-sqrt(126.0) / 16.0); // -0.701561553415 + const REAL fx_5_020 = (REAL)(sqrt(120.0) / 16.0); // 0.684653196881 + const REAL fx_5_021 = (REAL)(-sqrt(10.0) / 16.0); // -0.197642353761 + const REAL fx_5_022 = (REAL)(-sqrt(70.0) / 16.0); // -0.522913107945 + const REAL fx_5_023 = (REAL)(-sqrt(60.0) / 16.0); // -0.48412346577 + + inline void sh_rotx90_5(REAL y[], REAL yr[]) + { + yr[0] = fx_5_001*y[5] + fx_5_002*y[7] + fx_5_003*y[9]; + yr[1] = fx_5_004*y[1] + fx_5_005*y[3]; + yr[2] = fx_5_006*y[5] + fx_5_007*y[7] + fx_5_008*y[9]; + yr[3] = fx_5_005*y[1] + fx_5_009*y[3]; + yr[4] = fx_5_010*y[5] + fx_5_011*y[7] + fx_5_012*y[9]; + yr[5] = fx_5_019*y[0] + fx_5_022*y[2] + fx_5_023*y[4]; + yr[6] = fx_5_013*y[6] + fx_5_014*y[8] + fx_5_015*y[10]; + yr[7] = fx_5_020*y[0] - fx_5_007*y[2] - fx_5_011*y[4]; + yr[8] = fx_5_014*y[6] + fx_5_016*y[8] + fx_5_017*y[10]; + yr[9] = fx_5_021*y[0] - fx_5_008*y[2] - fx_5_012*y[4]; + yr[10] = fx_5_015*y[6] + fx_5_017*y[8] + fx_5_018*y[10]; + }; + + inline void sh_rotx90_inv_5(REAL y[], REAL yr[]) + { + yr[0] = fx_5_019*y[5] + fx_5_020*y[7] + fx_5_021*y[9]; + yr[1] = fx_5_004*y[1] + fx_5_005*y[3]; + yr[2] = fx_5_022*y[5] - fx_5_007*y[7] - fx_5_008*y[9]; + yr[3] = fx_5_005*y[1] + fx_5_009*y[3]; + yr[4] = fx_5_023*y[5] - fx_5_011*y[7] - fx_5_012*y[9]; + yr[5] = fx_5_001*y[0] + fx_5_006*y[2] + fx_5_010*y[4]; + yr[6] = fx_5_013*y[6] + fx_5_014*y[8] + fx_5_015*y[10]; + yr[7] = fx_5_002*y[0] + fx_5_007*y[2] + fx_5_011*y[4]; + yr[8] = fx_5_014*y[6] + fx_5_016*y[8] + fx_5_017*y[10]; + yr[9] = fx_5_003*y[0] + fx_5_008*y[2] + fx_5_012*y[4]; + yr[10] = fx_5_015*y[6] + fx_5_017*y[8] + fx_5_018*y[10]; + } + + inline void sh_rot_1(REAL m[3 * 3], REAL y[NL1], REAL yr[NL1]) + { + REAL yr0 = m[4] * y[0] - m[5] * y[1] + m[3] * y[2]; + REAL yr1 = m[8] * y[1] - m[7] * y[0] - m[6] * y[2]; + REAL yr2 = m[1] * y[0] - m[2] * y[1] + m[0] * y[2]; + + yr[0] = yr0; + yr[1] = yr1; + yr[2] = yr2; + } + + inline void sh_roty_1(REAL ctm[1], REAL stm[1], REAL y[NL1], REAL yr[NL1]) + { + yr[0] = y[0]; + rot_inv(ctm[0], stm[0], y[1], y[2], yr[1], 
yr[2]); + } + + inline void sh_roty_2(REAL ctm[2], REAL stm[2], REAL y[NL2], REAL yr[NL2]) + { + REAL ytmp[NL2]; + sh_rotx90_2(y, yr); + sh_rotz_2(ctm, stm, yr, ytmp); + sh_rotx90_inv_2(ytmp, yr); + } + + inline void sh_roty_3(REAL ctm[3], REAL stm[3], REAL y[NL3], REAL yr[NL3]) + { + REAL ytmp[NL3]; + sh_rotx90_3(y, yr); + sh_rotz_3(ctm, stm, yr, ytmp); + sh_rotx90_inv_3(ytmp, yr); + } + + inline void sh_roty_4(REAL ctm[4], REAL stm[4], REAL y[NL4], REAL yr[NL4]) + { + REAL ytmp[NL4]; + sh_rotx90_4(y, yr); + sh_rotz_4(ctm, stm, yr, ytmp); + sh_rotx90_inv_4(ytmp, yr); + } + + inline void sh_roty_5(REAL ctm[5], REAL stm[5], REAL y[NL5], REAL yr[NL5]) + { + REAL ytmp[NL5]; + sh_rotx90_5(y, yr); + sh_rotz_5(ctm, stm, yr, ytmp); + sh_rotx90_inv_5(ytmp, yr); + } + +#define ROT_TOL CONSTANT(1e-4) + + /* + Finds cosine,sine pairs for zyz rotation (i.e. rotation R_z2 R_y R_z1 v). + The rotation is one which maps mx to (1,0,0) and mz to (0,0,1). + */ + inline void zyz(REAL m[3 * 3], REAL &zc1, REAL &zs1, REAL &yc, REAL &ys, REAL &zc2, REAL &zs2) + { + REAL cz = m[8]; + + // rotate so that (cx,cy,0) aligns to (1,0,0) + REAL cxylen = (REAL)sqrtf(1.0f - cz*cz); + if (cxylen >= ROT_TOL) + { + // if above is a NaN, will do the correct thing + yc = cz; + ys = cxylen; + REAL len67inv = 1.0f / sqrtf(m[6] * m[6] + m[7] * m[7]); + zc1 = -m[6] * len67inv; + zs1 = m[7] * len67inv; + REAL len25inv = 1.0f / sqrtf(m[2] * m[2] + m[5] * m[5]); + zc2 = m[2] * len25inv; + zs2 = m[5] * len25inv; + } + else { // m[6],m[7],m[8] already aligned to (0,0,1) + zc1 = 1.0; zs1 = 0.0; // identity + yc = cz; ys = 0.0; // identity + zc2 = m[0] * cz; zs2 = -m[1]; // align x axis (mx[0],mx[1],0) to (1,0,0) + } + } + + inline void sh_rotzyz_2(REAL zc1m[2], REAL zs1m[2], REAL ycm[2], REAL ysm[2], REAL zc2m[2], REAL zs2m[2], REAL y[NL2], REAL yr[NL2]) + { + REAL ytmp[NL2]; + sh_rotz_2(zc1m, zs1m, y, yr); + sh_roty_2(ycm, ysm, yr, ytmp); + sh_rotz_2(zc2m, zs2m, ytmp, yr); + } + + inline void sh_rotzyz_3(REAL zc1m[3], REAL zs1m[3], REAL ycm[3], REAL ysm[3], REAL zc2m[3], REAL zs2m[3], REAL y[NL3], REAL yr[NL3]) + { + REAL ytmp[NL3]; + sh_rotz_3(zc1m, zs1m, y, yr); + sh_roty_3(ycm, ysm, yr, ytmp); + sh_rotz_3(zc2m, zs2m, ytmp, yr); + } + + inline void sh_rotzyz_4(REAL zc1m[4], REAL zs1m[4], REAL ycm[4], REAL ysm[4], REAL zc2m[4], REAL zs2m[4], REAL y[NL4], REAL yr[NL4]) + { + REAL ytmp[NL4]; + sh_rotz_4(zc1m, zs1m, y, yr); + sh_roty_4(ycm, ysm, yr, ytmp); + sh_rotz_4(zc2m, zs2m, ytmp, yr); + } + + inline void sh_rotzyz_5(REAL zc1m[5], REAL zs1m[5], REAL ycm[5], REAL ysm[5], REAL zc2m[5], REAL zs2m[5], REAL y[NL5], REAL yr[NL5]) + { + REAL ytmp[NL5]; + sh_rotz_5(zc1m, zs1m, y, yr); + sh_roty_5(ycm, ysm, yr, ytmp); + sh_rotz_5(zc2m, zs2m, ytmp, yr); + } + + inline void sh3_rot(REAL m[3 * 3], REAL zc1, REAL zs1, REAL yc, REAL ys, REAL zc2, REAL zs2, REAL y[NSH3], REAL yr[NSH3]) + { + REAL zc1m[3], zs1m[3]; + rot_3(zc1, zs1, zc1m, zs1m); + REAL ycm[3], ysm[3]; + rot_3(yc, ys, ycm, ysm); + REAL zc2m[3], zs2m[3]; + rot_3(zc2, zs2, zc2m, zs2m); + + yr[0] = y[0]; + sh_rot_1(m, y + NSH0, yr + NSH0); + sh_rotzyz_2(zc1m, zs1m, ycm, ysm, zc2m, zs2m, y + NSH1, yr + NSH1); + sh_rotzyz_3(zc1m, zs1m, ycm, ysm, zc2m, zs2m, y + NSH2, yr + NSH2); + } + + inline void sh4_rot(REAL m[3 * 3], REAL zc1, REAL zs1, REAL yc, REAL ys, REAL zc2, REAL zs2, REAL y[NSH4], REAL yr[NSH4]) + { + REAL zc1m[4], zs1m[4]; + rot_4(zc1, zs1, zc1m, zs1m); + REAL ycm[4], ysm[4]; + rot_4(yc, ys, ycm, ysm); + REAL zc2m[4], zs2m[4]; + rot_4(zc2, zs2, zc2m, zs2m); + + yr[0] = y[0]; 
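+ // the l=0 term is rotation invariant; the l=1 band is rotated directly by m, and
+ // bands l=2..4 are rotated via the precomputed z-y-z factors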
+ sh_rot_1(m, y + NSH0, yr + NSH0); + sh_rotzyz_2(zc1m, zs1m, ycm, ysm, zc2m, zs2m, y + NSH1, yr + NSH1); + sh_rotzyz_3(zc1m, zs1m, ycm, ysm, zc2m, zs2m, y + NSH2, yr + NSH2); + sh_rotzyz_4(zc1m, zs1m, ycm, ysm, zc2m, zs2m, y + NSH3, yr + NSH3); + } + + inline void sh5_rot(REAL m[3 * 3], REAL zc1, REAL zs1, REAL yc, REAL ys, REAL zc2, REAL zs2, REAL y[NSH5], REAL yr[NSH5]) + { + REAL zc1m[5], zs1m[5]; + rot_5(zc1, zs1, zc1m, zs1m); + REAL ycm[5], ysm[5]; + rot_5(yc, ys, ycm, ysm); + REAL zc2m[5], zs2m[5]; + rot_5(zc2, zs2, zc2m, zs2m); + + yr[0] = y[0]; + sh_rot_1(m, y + NSH0, yr + NSH0); + sh_rotzyz_2(zc1m, zs1m, ycm, ysm, zc2m, zs2m, y + NSH1, yr + NSH1); + sh_rotzyz_3(zc1m, zs1m, ycm, ysm, zc2m, zs2m, y + NSH2, yr + NSH2); + sh_rotzyz_4(zc1m, zs1m, ycm, ysm, zc2m, zs2m, y + NSH3, yr + NSH3); + sh_rotzyz_5(zc1m, zs1m, ycm, ysm, zc2m, zs2m, y + NSH4, yr + NSH4); + } + + inline void sh1_rot(REAL m[3 * 3], REAL y[NSH1], REAL yr[NSH1]) + { + yr[0] = y[0]; + sh_rot_1(m, y + NSH0, yr + NSH0); + } + + inline void sh3_rot(REAL m[3 * 3], REAL y[NSH3], REAL yr[NSH3]) + { + REAL zc1, zs1, yc, ys, zc2, zs2; + zyz(m, zc1, zs1, yc, ys, zc2, zs2); + sh3_rot(m, zc1, zs1, yc, ys, zc2, zs2, y, yr); + } + + inline void sh4_rot(REAL m[3 * 3], REAL y[NSH4], REAL yr[NSH4]) + { + REAL zc1, zs1, yc, ys, zc2, zs2; + zyz(m, zc1, zs1, yc, ys, zc2, zs2); + sh4_rot(m, zc1, zs1, yc, ys, zc2, zs2, y, yr); + } + + inline void sh5_rot(REAL m[3 * 3], REAL y[NSH5], REAL yr[NSH5]) + { + REAL zc1, zs1, yc, ys, zc2, zs2; + zyz(m, zc1, zs1, yc, ys, zc2, zs2); + sh5_rot(m, zc1, zs1, yc, ys, zc2, zs2, y, yr); + } + + // simple matrix vector multiply for a square matrix (only used by ZRotation) + inline void SimpMatMul(size_t dim, const float *matrix, const float *input, float *result) + { + for (size_t iR = 0; iR < dim; ++iR) + { + result[iR + 0] = matrix[iR*dim + 0] * input[0]; + for (size_t iC = 1; iC < dim; ++iC) + { + result[iR] += matrix[iR*dim + iC] * input[iC]; + } + } + } + +}; // anonymous namespace + + +//------------------------------------------------------------------------------------- +// Evaluates the Spherical Harmonic basis functions +// +// http://msdn.microsoft.com/en-us/library/windows/desktop/bb205448.aspx +//------------------------------------------------------------------------------------- +_Use_decl_annotations_ +float* XM_CALLCONV DirectX::XMSHEvalDirection( + float *result, + size_t order, + FXMVECTOR dir) noexcept +{ + if (!result) + return nullptr; + + XMFLOAT4A dv; + XMStoreFloat4A(&dv, dir); + + const float fX = dv.x; + const float fY = dv.y; + const float fZ = dv.z; + + switch (order) + { + case 2: + sh_eval_basis_1(fX, fY, fZ, result); + break; + + case 3: + sh_eval_basis_2(fX, fY, fZ, result); + break; + + case 4: + sh_eval_basis_3(fX, fY, fZ, result); + break; + + case 5: + sh_eval_basis_4(fX, fY, fZ, result); + break; + + case 6: + sh_eval_basis_5(fX, fY, fZ, result); + break; + + default: + assert(order < XM_SH_MINORDER || order > XM_SH_MAXORDER); + return nullptr; + } + + return result; +} + + +//------------------------------------------------------------------------------------- +// Rotates SH vector by a rotation matrix +// +// http://msdn.microsoft.com/en-us/library/windows/desktop/bb204992.aspx +//------------------------------------------------------------------------------------- +_Use_decl_annotations_ +float* XM_CALLCONV DirectX::XMSHRotate( + float *result, + size_t order, + FXMMATRIX rotMatrix, + const float *input) noexcept +{ + if (!result || !input) + return nullptr; + + if 
(result == input) + return nullptr; + + XMFLOAT3X3 mat; + XMStoreFloat3x3(&mat, rotMatrix); + + float mRot[3 * 3]; + const float r00 = mRot[0 * 3 + 0] = mat._11; + const float r10 = mRot[1 * 3 + 0] = mat._12; + const float r20 = mRot[2 * 3 + 0] = mat._13; + + const float r01 = mRot[0 * 3 + 1] = mat._21; + const float r11 = mRot[1 * 3 + 1] = mat._22; + const float r21 = mRot[2 * 3 + 1] = mat._23; + + const float r02 = mRot[0 * 3 + 2] = mat._31; + const float r12 = mRot[1 * 3 + 2] = mat._32; + const float r22 = mRot[2 * 3 + 2] = mat._33; + + result[0] = input[0]; // rotate the constant term + + switch (order) + { + case 2: + { + // do linear by hand... + + result[1] = r11*input[1] - r12*input[2] + r10*input[3]; + result[2] = -r21*input[1] + r22*input[2] - r20*input[3]; + result[3] = r01*input[1] - r02*input[2] + r00*input[3]; + } + break; + + case 3: + { + float R[25]; + // do linear by hand... + + result[1] = r11*input[1] - r12*input[2] + r10*input[3]; + result[2] = -r21*input[1] + r22*input[2] - r20*input[3]; + result[3] = r01*input[1] - r02*input[2] + r00*input[3]; + + // direct code for quadratics is faster than ZYZ reccurence relations + + const float t41 = r01 * r00; + const float t43 = r11 * r10; + const float t48 = r11 * r12; + const float t50 = r01 * r02; + const float t55 = r02 * r02; + const float t57 = r22 * r22; + const float t58 = r12 * r12; + const float t61 = r00 * r02; + const float t63 = r10 * r12; + const float t68 = r10 * r10; + const float t70 = r01 * r01; + const float t72 = r11 * r11; + const float t74 = r00 * r00; + const float t76 = r21 * r21; + const float t78 = r20 * r20; + + const float v173 = 0.1732050808e1f; + const float v577 = 0.5773502693e0f; + const float v115 = 0.1154700539e1f; + const float v288 = 0.2886751347e0f; + const float v866 = 0.8660254040e0f; + + R[0] = r11 * r00 + r01 * r10; + R[1] = -r01 * r12 - r11 * r02; + R[2] = v173 * r02 * r12; + R[3] = -r10 * r02 - r00 * r12; + R[4] = r00 * r10 - r01 * r11; + R[5] = -r11 * r20 - r21 * r10; + R[6] = r11 * r22 + r21 * r12; + R[7] = -v173 * r22 * r12; + R[8] = r20 * r12 + r10 * r22; + R[9] = -r10 * r20 + r11 * r21; + R[10] = -v577* (t41 + t43) + v115 * r21 * r20; + R[11] = v577* (t48 + t50) - v115 * r21 * r22; + R[12] = -0.5000000000e0f * (t55 + t58) + t57; + R[13] = v577 * (t61 + t63) - v115 * r20 * r22; + R[14] = v288 * (t70 - t68 + t72 - t74) - v577 * (t76 - t78); + R[15] = -r01 * r20 - r21 * r00; + R[16] = r01 * r22 + r21 * r02; + R[17] = -v173 * r22 * r02; + R[18] = r00 * r22 + r20 * r02; + R[19] = -r00 * r20 + r01 * r21; + R[20] = t41 - t43; + R[21] = -t50 + t48; + R[22] = v866 * (t55 - t58); + R[23] = t63 - t61; + R[24] = 0.5000000000e0f *(t74 - t68 - t70 + t72); + + // blow the matrix multiply out by hand, looping is ineficient on a P4... 
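+ // R is the dense 5x5 rotation for the quadratic (l=2) band, stored row-major,
+ // so the loop computes result[4..8] = R * input[4..8]: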
+ for (unsigned int iR = 0; iR < 5; iR++) + { + const unsigned int uBase = iR * 5; + result[4 + iR] = R[uBase + 0] * input[4] + R[uBase + 1] * input[5] + R[uBase + 2] * input[6] + R[uBase + 3] * input[7] + R[uBase + 4] * input[8]; + } + } + break; + + case 4: + sh3_rot(mRot, const_cast(input), result); + break; + + case 5: + sh4_rot(mRot, const_cast(input), result); + break; + + case 6: + sh5_rot(mRot, const_cast(input), result); + break; + + default: + assert(order < XM_SH_MINORDER || order > XM_SH_MAXORDER); + return nullptr; + } + + return result; +} + + +//------------------------------------------------------------------------------------- +// Rotates the SH vector in the Z axis by an angle +// +// http://msdn.microsoft.com/en-us/library/windows/desktop/bb205461.aspx +//------------------------------------------------------------------------------------- +_Use_decl_annotations_ +float* DirectX::XMSHRotateZ( + float *result, + size_t order, + float angle, + const float *input) noexcept +{ + if (!result || !input) + return nullptr; + + if (result == input) + return nullptr; + + if (order < XM_SH_MINORDER || order > XM_SH_MAXORDER) + return nullptr; + + float R[(2 * (XM_SH_MAXORDER - 1) + 1)*(2 * (XM_SH_MAXORDER - 1) + 1)]; // used to store rotation matrices... + + // these are actually very sparse matrices, most of the entries are zero's... + + const float ca = cosf(angle); + const float sa = sinf(angle); + + const float t1 = ca; + const float t2 = sa; + R[0] = t1; + R[1] = 0.0f; + R[2] = t2; + R[3] = 0.0f; + R[4] = 1.0f; + R[5] = 0.0f; + R[6] = -t2; + R[7] = 0.0f; + R[8] = t1; + + result[0] = input[0]; + SimpMatMul(3, R, input + 1, result + 1); + + if (order > 2) + { + for (int j = 0; j < 5 * 5; j++) R[j] = 0.0f; + const float t1 = sa; + const float t2 = t1*t1; + const float t3 = ca; + const float t4 = t3*t3; + const float t5 = -t2 + t4; + const float t7 = 2.0f*t3*t1; + R[0] = t5; + R[4] = t7; + R[6] = t3; + R[8] = t1; + R[12] = 1.0f; + R[16] = -t1; + R[18] = t3; + R[20] = -t7; + R[24] = t5; + + SimpMatMul(5, R, input + 4, result + 4); // un-roll matrix/vector multiply + if (order > 3) + { + for (int j = 0; j < 7 * 7; j++) R[j] = 0.0f; + const float t1 = ca; + const float t2 = t1*t1; + const float t4 = sa; + const float t5 = t4*t4; + const float t8 = t2*t1 - 3.0f*t1*t5; + const float t12 = 3.0f*t4*t2 - t5*t4; + const float t13 = -t5 + t2; + const float t15 = 2.0f*t1*t4; + R[0] = t8; + R[6] = t12; + R[8] = t13; + R[12] = t15; + R[16] = t1; + R[18] = t4; + R[24] = 1.0f; + R[30] = -t4; + R[32] = t1; + R[36] = -t15; + R[40] = t13; + R[42] = -t12; + R[48] = t8; + SimpMatMul(7, R, input + 9, result + 9); + if (order > 4) + { + for (int j = 0; j <= 9 * 9; j++) R[j] = 0.0f; + const float t1 = ca; + const float t2 = t1*t1; + const float t3 = t2*t2; + const float t4 = sa; + const float t5 = t4*t4; + const float t6 = t5*t5; + const float t9 = t3 + t6 - 6.0f*t5*t2; + const float t10 = t5*t4; + const float t12 = t2*t1; + const float t14 = -t10*t1 + t4*t12; + const float t17 = t12 - 3.0f*t1*t5; + const float t20 = 3.0f*t4*t2 - t10; + const float t21 = -t5 + t2; + const float t23 = 2.0f*t1*t4; + R[0] = t9; + R[8] = 4.0f*t14; + R[10] = t17; + R[16] = t20; + R[20] = t21; + R[24] = t23; + R[30] = t1; + R[32] = t4; + R[40] = 1.0f; + R[48] = -t4; + R[50] = t1; + R[56] = -t23; + R[60] = t21; + R[64] = -t20; + R[70] = t17; + R[72] = -4.0f*t14; + R[80] = t9; + + SimpMatMul(9, R, input + 16, result + 16); + if (order > 5) + { + for (int j = 0; j < 11 * 11; j++) R[j] = 0.0f; + const float t1 = ca; + const 
float t2 = sa; + const float t3 = t2*t2; + const float t4 = t3*t3; + const float t7 = t1*t1; + const float t8 = t7*t1; + const float t11 = t7*t7; + const float t13 = 5.0f*t1*t4 - 10.0f*t3*t8 + t11*t1; + const float t14 = t3*t2; + const float t20 = -10.0f*t14*t7 + 5.0f*t2*t11 + t4*t2; + const float t23 = t11 + t4 - 6.0f*t3*t7; + const float t26 = -t14*t1 + t2*t8; + const float t29 = t8 - 3.0f*t1*t3; + const float t32 = 3.0f*t2*t7 - t14; + const float t33 = -t3 + t7; + const float t35 = 2.0f*t1*t2; + R[0] = t13; + R[10] = t20; + R[12] = t23; + R[20] = 4.0f*t26; + R[24] = t29; + R[30] = t32; + R[36] = t33; + R[40] = t35; + R[48] = t1; + R[50] = t2; + R[60] = 1.0f; + R[70] = -t2; + R[72] = t1; + R[80] = -t35; + R[84] = t33; + R[90] = -t32; + R[96] = t29; + R[100] = -4.0f*t26; + R[108] = t23; + R[110] = -t20; + R[120] = t13; + SimpMatMul(11, R, input + 25, result + 25); + } + } + } + } + + return result; +} + + +//------------------------------------------------------------------------------------- +// Adds two SH vectors, result[i] = inputA[i] + inputB[i]; +// +// http://msdn.microsoft.com/en-us/library/windows/desktop/bb205438.aspx +//------------------------------------------------------------------------------------- +_Use_decl_annotations_ +float* DirectX::XMSHAdd( + float *result, + size_t order, + const float *inputA, + const float *inputB) noexcept +{ + if (!result || !inputA || !inputB) + return nullptr; + + const size_t numcoeff = order*order; + + for (size_t i = 0; i < numcoeff; ++i) + { + result[i] = inputA[i] + inputB[i]; + } + + return result; +} + + +//------------------------------------------------------------------------------------- +// Scales a SH vector, result[i] = input[i] * scale; +// +// http://msdn.microsoft.com/en-us/library/windows/desktop/bb204994.aspx +//------------------------------------------------------------------------------------- +_Use_decl_annotations_ +float* DirectX::XMSHScale( + float *result, + size_t order, + const float *input, + float scale) noexcept +{ + if (!result || !input) + return nullptr; + + const size_t numcoeff = order*order; + + for (size_t i = 0; i < numcoeff; ++i) + { + result[i] = scale * input[i]; + } + + return result; +} + + +//------------------------------------------------------------------------------------- +// Computes the dot product of two SH vectors +// +// http://msdn.microsoft.com/en-us/library/windows/desktop/bb205446.aspx +//------------------------------------------------------------------------------------- +_Use_decl_annotations_ +float DirectX::XMSHDot( + size_t order, + const float *inputA, + const float *inputB) noexcept +{ + if (!inputA || !inputB) + return 0.f; + + float result = inputA[0] * inputB[0]; + + const size_t numcoeff = order*order; + + for (size_t i = 1; i < numcoeff; ++i) + { + result += inputA[i] * inputB[i]; + } + + return result; +} + + +//------------------------------------------------------------------------------------- +// Computes the product of two functions represented using SH (f and g), where: +// result[i] = int(y_i(s) * f(s) * g(s)), where y_i(s) is the ith SH basis +// function, f(s) and g(s) are SH functions (sum_i(y_i(s)*c_i)). The order O +// determines the lengths of the arrays, where there should always be O^2 +// coefficients. In general the product of two SH functions of order O generates +// and SH function of order 2*O - 1, but we truncate the result. This means +// that the product commutes (f*g == g*f) but doesn't associate +// (f*(g*h) != (f*g)*h. 
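+// Usage sketch (buffer names here are illustrative, not part of the library):
+// for two order-3 inputs f[9] and g[9] (order O uses O*O floats),
+//
+//     float fg[9];
+//     XMSHMultiply(fg, 3, f, g);   // equivalent to XMSHMultiply3(fg, f, g)
+//
+// computes the truncated product. The exact product would be order 2*3 - 1 = 5
+// (25 coefficients), and dropping those higher bands is why f*g == g*f holds
+// while (f*g)*h generally differs from f*(g*h).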
+//------------------------------------------------------------------------------------- +_Use_decl_annotations_ +float* DirectX::XMSHMultiply( + float *result, + size_t order, + const float *inputF, + const float *inputG) noexcept +{ + switch (order) + { + case 2: + return XMSHMultiply2(result, inputF, inputG); + + case 3: + return XMSHMultiply3(result, inputF, inputG); + + case 4: + return XMSHMultiply4(result, inputF, inputG); + + case 5: + return XMSHMultiply5(result, inputF, inputG); + + case 6: + return XMSHMultiply6(result, inputF, inputG); + + default: + assert(order < XM_SH_MINORDER || order > XM_SH_MAXORDER); + return nullptr; + } +} + + +//------------------------------------------------------------------------------------- +// http://msdn.microsoft.com/en-us/library/windows/desktop/bb205454.aspx +//------------------------------------------------------------------------------------- +_Use_decl_annotations_ +float* DirectX::XMSHMultiply2( + float *y, + const float *f, + const float *g) noexcept +{ + if (!y || !f || !g) + return nullptr; + + REAL tf, tg, t; + // [0,0]: 0, + y[0] = CONSTANT(0.282094792935999980)*f[0] * g[0]; + + // [1,1]: 0, + tf = CONSTANT(0.282094791773000010)*f[0]; + tg = CONSTANT(0.282094791773000010)*g[0]; + y[1] = tf*g[1] + tg*f[1]; + t = f[1] * g[1]; + y[0] += CONSTANT(0.282094791773000010)*t; + + // [2,2]: 0, + tf = CONSTANT(0.282094795249000000)*f[0]; + tg = CONSTANT(0.282094795249000000)*g[0]; + y[2] = tf*g[2] + tg*f[2]; + t = f[2] * g[2]; + y[0] += CONSTANT(0.282094795249000000)*t; + + // [3,3]: 0, + tf = CONSTANT(0.282094791773000010)*f[0]; + tg = CONSTANT(0.282094791773000010)*g[0]; + y[3] = tf*g[3] + tg*f[3]; + t = f[3] * g[3]; + y[0] += CONSTANT(0.282094791773000010)*t; + + // multiply count=20 + + return y; +} + + +//------------------------------------------------------------------------------------- +// http://msdn.microsoft.com/en-us/library/windows/desktop/bb232906.aspx +//------------------------------------------------------------------------------------- +_Use_decl_annotations_ +float* DirectX::XMSHMultiply3( + float *y, + const float *f, + const float *g) noexcept +{ + if (!y || !f || !g) + return nullptr; + + REAL tf, tg, t; + // [0,0]: 0, + y[0] = CONSTANT(0.282094792935999980)*f[0] * g[0]; + + // [1,1]: 0,6,8, + tf = CONSTANT(0.282094791773000010)*f[0] + CONSTANT(-0.126156626101000010)*f[6] + CONSTANT(-0.218509686119999990)*f[8]; + tg = CONSTANT(0.282094791773000010)*g[0] + CONSTANT(-0.126156626101000010)*g[6] + CONSTANT(-0.218509686119999990)*g[8]; + y[1] = tf*g[1] + tg*f[1]; + t = f[1] * g[1]; + y[0] += CONSTANT(0.282094791773000010)*t; + y[6] = CONSTANT(-0.126156626101000010)*t; + y[8] = CONSTANT(-0.218509686119999990)*t; + + // [1,2]: 5, + tf = CONSTANT(0.218509686118000010)*f[5]; + tg = CONSTANT(0.218509686118000010)*g[5]; + y[1] += tf*g[2] + tg*f[2]; + y[2] = tf*g[1] + tg*f[1]; + t = f[1] * g[2] + f[2] * g[1]; + y[5] = CONSTANT(0.218509686118000010)*t; + + // [1,3]: 4, + tf = CONSTANT(0.218509686114999990)*f[4]; + tg = CONSTANT(0.218509686114999990)*g[4]; + y[1] += tf*g[3] + tg*f[3]; + y[3] = tf*g[1] + tg*f[1]; + t = f[1] * g[3] + f[3] * g[1]; + y[4] = CONSTANT(0.218509686114999990)*t; + + // [2,2]: 0,6, + tf = CONSTANT(0.282094795249000000)*f[0] + CONSTANT(0.252313259986999990)*f[6]; + tg = CONSTANT(0.282094795249000000)*g[0] + CONSTANT(0.252313259986999990)*g[6]; + y[2] += tf*g[2] + tg*f[2]; + t = f[2] * g[2]; + y[0] += CONSTANT(0.282094795249000000)*t; + y[6] += CONSTANT(0.252313259986999990)*t; + + // [2,3]: 7, + tf = 
CONSTANT(0.218509686118000010)*f[7]; + tg = CONSTANT(0.218509686118000010)*g[7]; + y[2] += tf*g[3] + tg*f[3]; + y[3] += tf*g[2] + tg*f[2]; + t = f[2] * g[3] + f[3] * g[2]; + y[7] = CONSTANT(0.218509686118000010)*t; + + // [3,3]: 0,6,8, + tf = CONSTANT(0.282094791773000010)*f[0] + CONSTANT(-0.126156626101000010)*f[6] + CONSTANT(0.218509686119999990)*f[8]; + tg = CONSTANT(0.282094791773000010)*g[0] + CONSTANT(-0.126156626101000010)*g[6] + CONSTANT(0.218509686119999990)*g[8]; + y[3] += tf*g[3] + tg*f[3]; + t = f[3] * g[3]; + y[0] += CONSTANT(0.282094791773000010)*t; + y[6] += CONSTANT(-0.126156626101000010)*t; + y[8] += CONSTANT(0.218509686119999990)*t; + + // [4,4]: 0,6, + tf = CONSTANT(0.282094791770000020)*f[0] + CONSTANT(-0.180223751576000010)*f[6]; + tg = CONSTANT(0.282094791770000020)*g[0] + CONSTANT(-0.180223751576000010)*g[6]; + y[4] += tf*g[4] + tg*f[4]; + t = f[4] * g[4]; + y[0] += CONSTANT(0.282094791770000020)*t; + y[6] += CONSTANT(-0.180223751576000010)*t; + + // [4,5]: 7, + tf = CONSTANT(0.156078347226000000)*f[7]; + tg = CONSTANT(0.156078347226000000)*g[7]; + y[4] += tf*g[5] + tg*f[5]; + y[5] += tf*g[4] + tg*f[4]; + t = f[4] * g[5] + f[5] * g[4]; + y[7] += CONSTANT(0.156078347226000000)*t; + + // [5,5]: 0,6,8, + tf = CONSTANT(0.282094791773999990)*f[0] + CONSTANT(0.090111875786499998)*f[6] + CONSTANT(-0.156078347227999990)*f[8]; + tg = CONSTANT(0.282094791773999990)*g[0] + CONSTANT(0.090111875786499998)*g[6] + CONSTANT(-0.156078347227999990)*g[8]; + y[5] += tf*g[5] + tg*f[5]; + t = f[5] * g[5]; + y[0] += CONSTANT(0.282094791773999990)*t; + y[6] += CONSTANT(0.090111875786499998)*t; + y[8] += CONSTANT(-0.156078347227999990)*t; + + // [6,6]: 0,6, + tf = CONSTANT(0.282094797560000000)*f[0]; + tg = CONSTANT(0.282094797560000000)*g[0]; + y[6] += tf*g[6] + tg*f[6]; + t = f[6] * g[6]; + y[0] += CONSTANT(0.282094797560000000)*t; + y[6] += CONSTANT(0.180223764527000010)*t; + + // [7,7]: 0,6,8, + tf = CONSTANT(0.282094791773999990)*f[0] + CONSTANT(0.090111875786499998)*f[6] + CONSTANT(0.156078347227999990)*f[8]; + tg = CONSTANT(0.282094791773999990)*g[0] + CONSTANT(0.090111875786499998)*g[6] + CONSTANT(0.156078347227999990)*g[8]; + y[7] += tf*g[7] + tg*f[7]; + t = f[7] * g[7]; + y[0] += CONSTANT(0.282094791773999990)*t; + y[6] += CONSTANT(0.090111875786499998)*t; + y[8] += CONSTANT(0.156078347227999990)*t; + + // [8,8]: 0,6, + tf = CONSTANT(0.282094791770000020)*f[0] + CONSTANT(-0.180223751576000010)*f[6]; + tg = CONSTANT(0.282094791770000020)*g[0] + CONSTANT(-0.180223751576000010)*g[6]; + y[8] += tf*g[8] + tg*f[8]; + t = f[8] * g[8]; + y[0] += CONSTANT(0.282094791770000020)*t; + y[6] += CONSTANT(-0.180223751576000010)*t; + + // multiply count=120 + + return y; +} + + +//------------------------------------------------------------------------------------- +// http://msdn.microsoft.com/en-us/library/windows/desktop/bb232907.aspx +//------------------------------------------------------------------------------------- +_Use_decl_annotations_ +float* DirectX::XMSHMultiply4( + float *y, + const float *f, + const float *g) noexcept +{ + if (!y || !f || !g) + return nullptr; + + REAL tf, tg, t; + // [0,0]: 0, + y[0] = CONSTANT(0.282094792935999980)*f[0] * g[0]; + + // [1,1]: 0,6,8, + tf = CONSTANT(0.282094791773000010)*f[0] + CONSTANT(-0.126156626101000010)*f[6] + CONSTANT(-0.218509686119999990)*f[8]; + tg = CONSTANT(0.282094791773000010)*g[0] + CONSTANT(-0.126156626101000010)*g[6] + CONSTANT(-0.218509686119999990)*g[8]; + y[1] = tf*g[1] + tg*f[1]; + t = f[1] * g[1]; + y[0] += 
CONSTANT(0.282094791773000010)*t; + y[6] = CONSTANT(-0.126156626101000010)*t; + y[8] = CONSTANT(-0.218509686119999990)*t; + + // [1,4]: 3,13,15, + tf = CONSTANT(0.218509686114999990)*f[3] + CONSTANT(-0.058399170082300000)*f[13] + CONSTANT(-0.226179013157999990)*f[15]; + tg = CONSTANT(0.218509686114999990)*g[3] + CONSTANT(-0.058399170082300000)*g[13] + CONSTANT(-0.226179013157999990)*g[15]; + y[1] += tf*g[4] + tg*f[4]; + y[4] = tf*g[1] + tg*f[1]; + t = f[1] * g[4] + f[4] * g[1]; + y[3] = CONSTANT(0.218509686114999990)*t; + y[13] = CONSTANT(-0.058399170082300000)*t; + y[15] = CONSTANT(-0.226179013157999990)*t; + + // [1,5]: 2,12,14, + tf = CONSTANT(0.218509686118000010)*f[2] + CONSTANT(-0.143048168103000000)*f[12] + CONSTANT(-0.184674390923000000)*f[14]; + tg = CONSTANT(0.218509686118000010)*g[2] + CONSTANT(-0.143048168103000000)*g[12] + CONSTANT(-0.184674390923000000)*g[14]; + y[1] += tf*g[5] + tg*f[5]; + y[5] = tf*g[1] + tg*f[1]; + t = f[1] * g[5] + f[5] * g[1]; + y[2] = CONSTANT(0.218509686118000010)*t; + y[12] = CONSTANT(-0.143048168103000000)*t; + y[14] = CONSTANT(-0.184674390923000000)*t; + + // [1,6]: 11, + tf = CONSTANT(0.202300659402999990)*f[11]; + tg = CONSTANT(0.202300659402999990)*g[11]; + y[1] += tf*g[6] + tg*f[6]; + y[6] += tf*g[1] + tg*f[1]; + t = f[1] * g[6] + f[6] * g[1]; + y[11] = CONSTANT(0.202300659402999990)*t; + + // [1,8]: 9,11, + tf = CONSTANT(0.226179013155000000)*f[9] + CONSTANT(0.058399170081799998)*f[11]; + tg = CONSTANT(0.226179013155000000)*g[9] + CONSTANT(0.058399170081799998)*g[11]; + y[1] += tf*g[8] + tg*f[8]; + y[8] += tf*g[1] + tg*f[1]; + t = f[1] * g[8] + f[8] * g[1]; + y[9] = CONSTANT(0.226179013155000000)*t; + y[11] += CONSTANT(0.058399170081799998)*t; + + // [2,2]: 0,6, + tf = CONSTANT(0.282094795249000000)*f[0] + CONSTANT(0.252313259986999990)*f[6]; + tg = CONSTANT(0.282094795249000000)*g[0] + CONSTANT(0.252313259986999990)*g[6]; + y[2] += tf*g[2] + tg*f[2]; + t = f[2] * g[2]; + y[0] += CONSTANT(0.282094795249000000)*t; + y[6] += CONSTANT(0.252313259986999990)*t; + + // [2,6]: 12, + tf = CONSTANT(0.247766706973999990)*f[12]; + tg = CONSTANT(0.247766706973999990)*g[12]; + y[2] += tf*g[6] + tg*f[6]; + y[6] += tf*g[2] + tg*f[2]; + t = f[2] * g[6] + f[6] * g[2]; + y[12] += CONSTANT(0.247766706973999990)*t; + + // [3,3]: 0,6,8, + tf = CONSTANT(0.282094791773000010)*f[0] + CONSTANT(-0.126156626101000010)*f[6] + CONSTANT(0.218509686119999990)*f[8]; + tg = CONSTANT(0.282094791773000010)*g[0] + CONSTANT(-0.126156626101000010)*g[6] + CONSTANT(0.218509686119999990)*g[8]; + y[3] += tf*g[3] + tg*f[3]; + t = f[3] * g[3]; + y[0] += CONSTANT(0.282094791773000010)*t; + y[6] += CONSTANT(-0.126156626101000010)*t; + y[8] += CONSTANT(0.218509686119999990)*t; + + // [3,6]: 13, + tf = CONSTANT(0.202300659402999990)*f[13]; + tg = CONSTANT(0.202300659402999990)*g[13]; + y[3] += tf*g[6] + tg*f[6]; + y[6] += tf*g[3] + tg*f[3]; + t = f[3] * g[6] + f[6] * g[3]; + y[13] += CONSTANT(0.202300659402999990)*t; + + // [3,7]: 2,12,14, + tf = CONSTANT(0.218509686118000010)*f[2] + CONSTANT(-0.143048168103000000)*f[12] + CONSTANT(0.184674390923000000)*f[14]; + tg = CONSTANT(0.218509686118000010)*g[2] + CONSTANT(-0.143048168103000000)*g[12] + CONSTANT(0.184674390923000000)*g[14]; + y[3] += tf*g[7] + tg*f[7]; + y[7] = tf*g[3] + tg*f[3]; + t = f[3] * g[7] + f[7] * g[3]; + y[2] += CONSTANT(0.218509686118000010)*t; + y[12] += CONSTANT(-0.143048168103000000)*t; + y[14] += CONSTANT(0.184674390923000000)*t; + + // [3,8]: 13,15, + tf = CONSTANT(-0.058399170081799998)*f[13] + 
CONSTANT(0.226179013155000000)*f[15]; + tg = CONSTANT(-0.058399170081799998)*g[13] + CONSTANT(0.226179013155000000)*g[15]; + y[3] += tf*g[8] + tg*f[8]; + y[8] += tf*g[3] + tg*f[3]; + t = f[3] * g[8] + f[8] * g[3]; + y[13] += CONSTANT(-0.058399170081799998)*t; + y[15] += CONSTANT(0.226179013155000000)*t; + + // [4,4]: 0,6, + tf = CONSTANT(0.282094791770000020)*f[0] + CONSTANT(-0.180223751576000010)*f[6]; + tg = CONSTANT(0.282094791770000020)*g[0] + CONSTANT(-0.180223751576000010)*g[6]; + y[4] += tf*g[4] + tg*f[4]; + t = f[4] * g[4]; + y[0] += CONSTANT(0.282094791770000020)*t; + y[6] += CONSTANT(-0.180223751576000010)*t; + + // [4,5]: 7, + tf = CONSTANT(0.156078347226000000)*f[7]; + tg = CONSTANT(0.156078347226000000)*g[7]; + y[4] += tf*g[5] + tg*f[5]; + y[5] += tf*g[4] + tg*f[4]; + t = f[4] * g[5] + f[5] * g[4]; + y[7] += CONSTANT(0.156078347226000000)*t; + + // [4,9]: 3,13, + tf = CONSTANT(0.226179013157999990)*f[3] + CONSTANT(-0.094031597258400004)*f[13]; + tg = CONSTANT(0.226179013157999990)*g[3] + CONSTANT(-0.094031597258400004)*g[13]; + y[4] += tf*g[9] + tg*f[9]; + y[9] += tf*g[4] + tg*f[4]; + t = f[4] * g[9] + f[9] * g[4]; + y[3] += CONSTANT(0.226179013157999990)*t; + y[13] += CONSTANT(-0.094031597258400004)*t; + + // [4,10]: 2,12, + tf = CONSTANT(0.184674390919999990)*f[2] + CONSTANT(-0.188063194517999990)*f[12]; + tg = CONSTANT(0.184674390919999990)*g[2] + CONSTANT(-0.188063194517999990)*g[12]; + y[4] += tf*g[10] + tg*f[10]; + y[10] = tf*g[4] + tg*f[4]; + t = f[4] * g[10] + f[10] * g[4]; + y[2] += CONSTANT(0.184674390919999990)*t; + y[12] += CONSTANT(-0.188063194517999990)*t; + + // [4,11]: 3,13,15, + tf = CONSTANT(-0.058399170082300000)*f[3] + CONSTANT(0.145673124078000010)*f[13] + CONSTANT(0.094031597258400004)*f[15]; + tg = CONSTANT(-0.058399170082300000)*g[3] + CONSTANT(0.145673124078000010)*g[13] + CONSTANT(0.094031597258400004)*g[15]; + y[4] += tf*g[11] + tg*f[11]; + y[11] += tf*g[4] + tg*f[4]; + t = f[4] * g[11] + f[11] * g[4]; + y[3] += CONSTANT(-0.058399170082300000)*t; + y[13] += CONSTANT(0.145673124078000010)*t; + y[15] += CONSTANT(0.094031597258400004)*t; + + // [5,5]: 0,6,8, + tf = CONSTANT(0.282094791773999990)*f[0] + CONSTANT(0.090111875786499998)*f[6] + CONSTANT(-0.156078347227999990)*f[8]; + tg = CONSTANT(0.282094791773999990)*g[0] + CONSTANT(0.090111875786499998)*g[6] + CONSTANT(-0.156078347227999990)*g[8]; + y[5] += tf*g[5] + tg*f[5]; + t = f[5] * g[5]; + y[0] += CONSTANT(0.282094791773999990)*t; + y[6] += CONSTANT(0.090111875786499998)*t; + y[8] += CONSTANT(-0.156078347227999990)*t; + + // [5,9]: 14, + tf = CONSTANT(0.148677009677999990)*f[14]; + tg = CONSTANT(0.148677009677999990)*g[14]; + y[5] += tf*g[9] + tg*f[9]; + y[9] += tf*g[5] + tg*f[5]; + t = f[5] * g[9] + f[9] * g[5]; + y[14] += CONSTANT(0.148677009677999990)*t; + + // [5,10]: 3,13,15, + tf = CONSTANT(0.184674390919999990)*f[3] + CONSTANT(0.115164716490000000)*f[13] + CONSTANT(-0.148677009678999990)*f[15]; + tg = CONSTANT(0.184674390919999990)*g[3] + CONSTANT(0.115164716490000000)*g[13] + CONSTANT(-0.148677009678999990)*g[15]; + y[5] += tf*g[10] + tg*f[10]; + y[10] += tf*g[5] + tg*f[5]; + t = f[5] * g[10] + f[10] * g[5]; + y[3] += CONSTANT(0.184674390919999990)*t; + y[13] += CONSTANT(0.115164716490000000)*t; + y[15] += CONSTANT(-0.148677009678999990)*t; + + // [5,11]: 2,12,14, + tf = CONSTANT(0.233596680327000010)*f[2] + CONSTANT(0.059470803871800003)*f[12] + CONSTANT(-0.115164716491000000)*f[14]; + tg = CONSTANT(0.233596680327000010)*g[2] + CONSTANT(0.059470803871800003)*g[12] + 
CONSTANT(-0.115164716491000000)*g[14]; + y[5] += tf*g[11] + tg*f[11]; + y[11] += tf*g[5] + tg*f[5]; + t = f[5] * g[11] + f[11] * g[5]; + y[2] += CONSTANT(0.233596680327000010)*t; + y[12] += CONSTANT(0.059470803871800003)*t; + y[14] += CONSTANT(-0.115164716491000000)*t; + + // [6,6]: 0,6, + tf = CONSTANT(0.282094797560000000)*f[0]; + tg = CONSTANT(0.282094797560000000)*g[0]; + y[6] += tf*g[6] + tg*f[6]; + t = f[6] * g[6]; + y[0] += CONSTANT(0.282094797560000000)*t; + y[6] += CONSTANT(0.180223764527000010)*t; + + // [7,7]: 6,0,8, + tf = CONSTANT(0.090111875786499998)*f[6] + CONSTANT(0.282094791773999990)*f[0] + CONSTANT(0.156078347227999990)*f[8]; + tg = CONSTANT(0.090111875786499998)*g[6] + CONSTANT(0.282094791773999990)*g[0] + CONSTANT(0.156078347227999990)*g[8]; + y[7] += tf*g[7] + tg*f[7]; + t = f[7] * g[7]; + y[6] += CONSTANT(0.090111875786499998)*t; + y[0] += CONSTANT(0.282094791773999990)*t; + y[8] += CONSTANT(0.156078347227999990)*t; + + // [7,10]: 9,1,11, + tf = CONSTANT(0.148677009678999990)*f[9] + CONSTANT(0.184674390919999990)*f[1] + CONSTANT(0.115164716490000000)*f[11]; + tg = CONSTANT(0.148677009678999990)*g[9] + CONSTANT(0.184674390919999990)*g[1] + CONSTANT(0.115164716490000000)*g[11]; + y[7] += tf*g[10] + tg*f[10]; + y[10] += tf*g[7] + tg*f[7]; + t = f[7] * g[10] + f[10] * g[7]; + y[9] += CONSTANT(0.148677009678999990)*t; + y[1] += CONSTANT(0.184674390919999990)*t; + y[11] += CONSTANT(0.115164716490000000)*t; + + // [7,13]: 12,2,14, + tf = CONSTANT(0.059470803871800003)*f[12] + CONSTANT(0.233596680327000010)*f[2] + CONSTANT(0.115164716491000000)*f[14]; + tg = CONSTANT(0.059470803871800003)*g[12] + CONSTANT(0.233596680327000010)*g[2] + CONSTANT(0.115164716491000000)*g[14]; + y[7] += tf*g[13] + tg*f[13]; + y[13] += tf*g[7] + tg*f[7]; + t = f[7] * g[13] + f[13] * g[7]; + y[12] += CONSTANT(0.059470803871800003)*t; + y[2] += CONSTANT(0.233596680327000010)*t; + y[14] += CONSTANT(0.115164716491000000)*t; + + // [7,14]: 15, + tf = CONSTANT(0.148677009677999990)*f[15]; + tg = CONSTANT(0.148677009677999990)*g[15]; + y[7] += tf*g[14] + tg*f[14]; + y[14] += tf*g[7] + tg*f[7]; + t = f[7] * g[14] + f[14] * g[7]; + y[15] += CONSTANT(0.148677009677999990)*t; + + // [8,8]: 0,6, + tf = CONSTANT(0.282094791770000020)*f[0] + CONSTANT(-0.180223751576000010)*f[6]; + tg = CONSTANT(0.282094791770000020)*g[0] + CONSTANT(-0.180223751576000010)*g[6]; + y[8] += tf*g[8] + tg*f[8]; + t = f[8] * g[8]; + y[0] += CONSTANT(0.282094791770000020)*t; + y[6] += CONSTANT(-0.180223751576000010)*t; + + // [8,9]: 11, + tf = CONSTANT(-0.094031597259499999)*f[11]; + tg = CONSTANT(-0.094031597259499999)*g[11]; + y[8] += tf*g[9] + tg*f[9]; + y[9] += tf*g[8] + tg*f[8]; + t = f[8] * g[9] + f[9] * g[8]; + y[11] += CONSTANT(-0.094031597259499999)*t; + + // [8,13]: 15, + tf = CONSTANT(-0.094031597259499999)*f[15]; + tg = CONSTANT(-0.094031597259499999)*g[15]; + y[8] += tf*g[13] + tg*f[13]; + y[13] += tf*g[8] + tg*f[8]; + t = f[8] * g[13] + f[13] * g[8]; + y[15] += CONSTANT(-0.094031597259499999)*t; + + // [8,14]: 2,12, + tf = CONSTANT(0.184674390919999990)*f[2] + CONSTANT(-0.188063194517999990)*f[12]; + tg = CONSTANT(0.184674390919999990)*g[2] + CONSTANT(-0.188063194517999990)*g[12]; + y[8] += tf*g[14] + tg*f[14]; + y[14] += tf*g[8] + tg*f[8]; + t = f[8] * g[14] + f[14] * g[8]; + y[2] += CONSTANT(0.184674390919999990)*t; + y[12] += CONSTANT(-0.188063194517999990)*t; + + // [9,9]: 6,0, + tf = CONSTANT(-0.210261043508000010)*f[6] + CONSTANT(0.282094791766999970)*f[0]; + tg = CONSTANT(-0.210261043508000010)*g[6] + 
CONSTANT(0.282094791766999970)*g[0]; + y[9] += tf*g[9] + tg*f[9]; + t = f[9] * g[9]; + y[6] += CONSTANT(-0.210261043508000010)*t; + y[0] += CONSTANT(0.282094791766999970)*t; + + // [10,10]: 0, + tf = CONSTANT(0.282094791771999980)*f[0]; + tg = CONSTANT(0.282094791771999980)*g[0]; + y[10] += tf*g[10] + tg*f[10]; + t = f[10] * g[10]; + y[0] += CONSTANT(0.282094791771999980)*t; + + // [11,11]: 0,6,8, + tf = CONSTANT(0.282094791773999990)*f[0] + CONSTANT(0.126156626101000010)*f[6] + CONSTANT(-0.145673124078999990)*f[8]; + tg = CONSTANT(0.282094791773999990)*g[0] + CONSTANT(0.126156626101000010)*g[6] + CONSTANT(-0.145673124078999990)*g[8]; + y[11] += tf*g[11] + tg*f[11]; + t = f[11] * g[11]; + y[0] += CONSTANT(0.282094791773999990)*t; + y[6] += CONSTANT(0.126156626101000010)*t; + y[8] += CONSTANT(-0.145673124078999990)*t; + + // [12,12]: 0,6, + tf = CONSTANT(0.282094799871999980)*f[0] + CONSTANT(0.168208852954000010)*f[6]; + tg = CONSTANT(0.282094799871999980)*g[0] + CONSTANT(0.168208852954000010)*g[6]; + y[12] += tf*g[12] + tg*f[12]; + t = f[12] * g[12]; + y[0] += CONSTANT(0.282094799871999980)*t; + y[6] += CONSTANT(0.168208852954000010)*t; + + // [13,13]: 0,8,6, + tf = CONSTANT(0.282094791773999990)*f[0] + CONSTANT(0.145673124078999990)*f[8] + CONSTANT(0.126156626101000010)*f[6]; + tg = CONSTANT(0.282094791773999990)*g[0] + CONSTANT(0.145673124078999990)*g[8] + CONSTANT(0.126156626101000010)*g[6]; + y[13] += tf*g[13] + tg*f[13]; + t = f[13] * g[13]; + y[0] += CONSTANT(0.282094791773999990)*t; + y[8] += CONSTANT(0.145673124078999990)*t; + y[6] += CONSTANT(0.126156626101000010)*t; + + // [14,14]: 0, + tf = CONSTANT(0.282094791771999980)*f[0]; + tg = CONSTANT(0.282094791771999980)*g[0]; + y[14] += tf*g[14] + tg*f[14]; + t = f[14] * g[14]; + y[0] += CONSTANT(0.282094791771999980)*t; + + // [15,15]: 0,6, + tf = CONSTANT(0.282094791766999970)*f[0] + CONSTANT(-0.210261043508000010)*f[6]; + tg = CONSTANT(0.282094791766999970)*g[0] + CONSTANT(-0.210261043508000010)*g[6]; + y[15] += tf*g[15] + tg*f[15]; + t = f[15] * g[15]; + y[0] += CONSTANT(0.282094791766999970)*t; + y[6] += CONSTANT(-0.210261043508000010)*t; + + // multiply count=399 + + return y; +} + + +//------------------------------------------------------------------------------------- +// http://msdn.microsoft.com/en-us/library/windows/desktop/bb232908.aspx +//------------------------------------------------------------------------------------- +_Use_decl_annotations_ +float* DirectX::XMSHMultiply5( + float *y, + const float *f, + const float *g) noexcept +{ + if (!y || !f || !g) + return nullptr; + + REAL tf, tg, t; + // [0,0]: 0, + y[0] = CONSTANT(0.282094792935999980)*f[0] * g[0]; + + // [1,1]: 0,6,8, + tf = CONSTANT(0.282094791773000010)*f[0] + CONSTANT(-0.126156626101000010)*f[6] + CONSTANT(-0.218509686119999990)*f[8]; + tg = CONSTANT(0.282094791773000010)*g[0] + CONSTANT(-0.126156626101000010)*g[6] + CONSTANT(-0.218509686119999990)*g[8]; + y[1] = tf*g[1] + tg*f[1]; + t = f[1] * g[1]; + y[0] += CONSTANT(0.282094791773000010)*t; + y[6] = CONSTANT(-0.126156626101000010)*t; + y[8] = CONSTANT(-0.218509686119999990)*t; + + // [1,4]: 3,13,15, + tf = CONSTANT(0.218509686114999990)*f[3] + CONSTANT(-0.058399170082300000)*f[13] + CONSTANT(-0.226179013157999990)*f[15]; + tg = CONSTANT(0.218509686114999990)*g[3] + CONSTANT(-0.058399170082300000)*g[13] + CONSTANT(-0.226179013157999990)*g[15]; + y[1] += tf*g[4] + tg*f[4]; + y[4] = tf*g[1] + tg*f[1]; + t = f[1] * g[4] + f[4] * g[1]; + y[3] = CONSTANT(0.218509686114999990)*t; + y[13] = 
CONSTANT(-0.058399170082300000)*t; + y[15] = CONSTANT(-0.226179013157999990)*t; + + // [1,5]: 2,12,14, + tf = CONSTANT(0.218509686118000010)*f[2] + CONSTANT(-0.143048168103000000)*f[12] + CONSTANT(-0.184674390923000000)*f[14]; + tg = CONSTANT(0.218509686118000010)*g[2] + CONSTANT(-0.143048168103000000)*g[12] + CONSTANT(-0.184674390923000000)*g[14]; + y[1] += tf*g[5] + tg*f[5]; + y[5] = tf*g[1] + tg*f[1]; + t = f[1] * g[5] + f[5] * g[1]; + y[2] = CONSTANT(0.218509686118000010)*t; + y[12] = CONSTANT(-0.143048168103000000)*t; + y[14] = CONSTANT(-0.184674390923000000)*t; + + // [1,9]: 8,22,24, + tf = CONSTANT(0.226179013155000000)*f[8] + CONSTANT(-0.043528171378199997)*f[22] + CONSTANT(-0.230329432978999990)*f[24]; + tg = CONSTANT(0.226179013155000000)*g[8] + CONSTANT(-0.043528171378199997)*g[22] + CONSTANT(-0.230329432978999990)*g[24]; + y[1] += tf*g[9] + tg*f[9]; + y[9] = tf*g[1] + tg*f[1]; + t = f[1] * g[9] + f[9] * g[1]; + y[8] += CONSTANT(0.226179013155000000)*t; + y[22] = CONSTANT(-0.043528171378199997)*t; + y[24] = CONSTANT(-0.230329432978999990)*t; + + // [1,10]: 7,21,23, + tf = CONSTANT(0.184674390919999990)*f[7] + CONSTANT(-0.075393004386799994)*f[21] + CONSTANT(-0.199471140200000010)*f[23]; + tg = CONSTANT(0.184674390919999990)*g[7] + CONSTANT(-0.075393004386799994)*g[21] + CONSTANT(-0.199471140200000010)*g[23]; + y[1] += tf*g[10] + tg*f[10]; + y[10] = tf*g[1] + tg*f[1]; + t = f[1] * g[10] + f[10] * g[1]; + y[7] = CONSTANT(0.184674390919999990)*t; + y[21] = CONSTANT(-0.075393004386799994)*t; + y[23] = CONSTANT(-0.199471140200000010)*t; + + // [1,11]: 6,8,20,22, + tf = CONSTANT(0.202300659402999990)*f[6] + CONSTANT(0.058399170081799998)*f[8] + CONSTANT(-0.150786008773000000)*f[20] + CONSTANT(-0.168583882836999990)*f[22]; + tg = CONSTANT(0.202300659402999990)*g[6] + CONSTANT(0.058399170081799998)*g[8] + CONSTANT(-0.150786008773000000)*g[20] + CONSTANT(-0.168583882836999990)*g[22]; + y[1] += tf*g[11] + tg*f[11]; + y[11] = tf*g[1] + tg*f[1]; + t = f[1] * g[11] + f[11] * g[1]; + y[6] += CONSTANT(0.202300659402999990)*t; + y[8] += CONSTANT(0.058399170081799998)*t; + y[20] = CONSTANT(-0.150786008773000000)*t; + y[22] += CONSTANT(-0.168583882836999990)*t; + + // [1,12]: 19, + tf = CONSTANT(0.194663900273000010)*f[19]; + tg = CONSTANT(0.194663900273000010)*g[19]; + y[1] += tf*g[12] + tg*f[12]; + y[12] += tf*g[1] + tg*f[1]; + t = f[1] * g[12] + f[12] * g[1]; + y[19] = CONSTANT(0.194663900273000010)*t; + + // [1,13]: 18, + tf = CONSTANT(0.168583882834000000)*f[18]; + tg = CONSTANT(0.168583882834000000)*g[18]; + y[1] += tf*g[13] + tg*f[13]; + y[13] += tf*g[1] + tg*f[1]; + t = f[1] * g[13] + f[13] * g[1]; + y[18] = CONSTANT(0.168583882834000000)*t; + + // [1,14]: 17,19, + tf = CONSTANT(0.199471140196999990)*f[17] + CONSTANT(0.075393004386399995)*f[19]; + tg = CONSTANT(0.199471140196999990)*g[17] + CONSTANT(0.075393004386399995)*g[19]; + y[1] += tf*g[14] + tg*f[14]; + y[14] += tf*g[1] + tg*f[1]; + t = f[1] * g[14] + f[14] * g[1]; + y[17] = CONSTANT(0.199471140196999990)*t; + y[19] += CONSTANT(0.075393004386399995)*t; + + // [1,15]: 16,18, + tf = CONSTANT(0.230329432973999990)*f[16] + CONSTANT(0.043528171377799997)*f[18]; + tg = CONSTANT(0.230329432973999990)*g[16] + CONSTANT(0.043528171377799997)*g[18]; + y[1] += tf*g[15] + tg*f[15]; + y[15] += tf*g[1] + tg*f[1]; + t = f[1] * g[15] + f[15] * g[1]; + y[16] = CONSTANT(0.230329432973999990)*t; + y[18] += CONSTANT(0.043528171377799997)*t; + + // [2,2]: 0,6, + tf = CONSTANT(0.282094795249000000)*f[0] + CONSTANT(0.252313259986999990)*f[6]; + tg = 
CONSTANT(0.282094795249000000)*g[0] + CONSTANT(0.252313259986999990)*g[6]; + y[2] += tf*g[2] + tg*f[2]; + t = f[2] * g[2]; + y[0] += CONSTANT(0.282094795249000000)*t; + y[6] += CONSTANT(0.252313259986999990)*t; + + // [2,10]: 4,18, + tf = CONSTANT(0.184674390919999990)*f[4] + CONSTANT(0.213243618621000000)*f[18]; + tg = CONSTANT(0.184674390919999990)*g[4] + CONSTANT(0.213243618621000000)*g[18]; + y[2] += tf*g[10] + tg*f[10]; + y[10] += tf*g[2] + tg*f[2]; + t = f[2] * g[10] + f[10] * g[2]; + y[4] += CONSTANT(0.184674390919999990)*t; + y[18] += CONSTANT(0.213243618621000000)*t; + + // [2,12]: 6,20, + tf = CONSTANT(0.247766706973999990)*f[6] + CONSTANT(0.246232537174000010)*f[20]; + tg = CONSTANT(0.247766706973999990)*g[6] + CONSTANT(0.246232537174000010)*g[20]; + y[2] += tf*g[12] + tg*f[12]; + y[12] += tf*g[2] + tg*f[2]; + t = f[2] * g[12] + f[12] * g[2]; + y[6] += CONSTANT(0.247766706973999990)*t; + y[20] += CONSTANT(0.246232537174000010)*t; + + // [2,14]: 8,22, + tf = CONSTANT(0.184674390919999990)*f[8] + CONSTANT(0.213243618621000000)*f[22]; + tg = CONSTANT(0.184674390919999990)*g[8] + CONSTANT(0.213243618621000000)*g[22]; + y[2] += tf*g[14] + tg*f[14]; + y[14] += tf*g[2] + tg*f[2]; + t = f[2] * g[14] + f[14] * g[2]; + y[8] += CONSTANT(0.184674390919999990)*t; + y[22] += CONSTANT(0.213243618621000000)*t; + + // [3,3]: 0,6,8, + tf = CONSTANT(0.282094791773000010)*f[0] + CONSTANT(-0.126156626101000010)*f[6] + CONSTANT(0.218509686119999990)*f[8]; + tg = CONSTANT(0.282094791773000010)*g[0] + CONSTANT(-0.126156626101000010)*g[6] + CONSTANT(0.218509686119999990)*g[8]; + y[3] += tf*g[3] + tg*f[3]; + t = f[3] * g[3]; + y[0] += CONSTANT(0.282094791773000010)*t; + y[6] += CONSTANT(-0.126156626101000010)*t; + y[8] += CONSTANT(0.218509686119999990)*t; + + // [3,7]: 2,12,14, + tf = CONSTANT(0.218509686118000010)*f[2] + CONSTANT(-0.143048168103000000)*f[12] + CONSTANT(0.184674390923000000)*f[14]; + tg = CONSTANT(0.218509686118000010)*g[2] + CONSTANT(-0.143048168103000000)*g[12] + CONSTANT(0.184674390923000000)*g[14]; + y[3] += tf*g[7] + tg*f[7]; + y[7] += tf*g[3] + tg*f[3]; + t = f[3] * g[7] + f[7] * g[3]; + y[2] += CONSTANT(0.218509686118000010)*t; + y[12] += CONSTANT(-0.143048168103000000)*t; + y[14] += CONSTANT(0.184674390923000000)*t; + + // [3,9]: 4,16,18, + tf = CONSTANT(0.226179013157999990)*f[4] + CONSTANT(0.230329432973999990)*f[16] + CONSTANT(-0.043528171377799997)*f[18]; + tg = CONSTANT(0.226179013157999990)*g[4] + CONSTANT(0.230329432973999990)*g[16] + CONSTANT(-0.043528171377799997)*g[18]; + y[3] += tf*g[9] + tg*f[9]; + y[9] += tf*g[3] + tg*f[3]; + t = f[3] * g[9] + f[9] * g[3]; + y[4] += CONSTANT(0.226179013157999990)*t; + y[16] += CONSTANT(0.230329432973999990)*t; + y[18] += CONSTANT(-0.043528171377799997)*t; + + // [3,10]: 5,17,19, + tf = CONSTANT(0.184674390919999990)*f[5] + CONSTANT(0.199471140200000010)*f[17] + CONSTANT(-0.075393004386799994)*f[19]; + tg = CONSTANT(0.184674390919999990)*g[5] + CONSTANT(0.199471140200000010)*g[17] + CONSTANT(-0.075393004386799994)*g[19]; + y[3] += tf*g[10] + tg*f[10]; + y[10] += tf*g[3] + tg*f[3]; + t = f[3] * g[10] + f[10] * g[3]; + y[5] += CONSTANT(0.184674390919999990)*t; + y[17] += CONSTANT(0.199471140200000010)*t; + y[19] += CONSTANT(-0.075393004386799994)*t; + + // [3,12]: 21, + tf = CONSTANT(0.194663900273000010)*f[21]; + tg = CONSTANT(0.194663900273000010)*g[21]; + y[3] += tf*g[12] + tg*f[12]; + y[12] += tf*g[3] + tg*f[3]; + t = f[3] * g[12] + f[12] * g[3]; + y[21] += CONSTANT(0.194663900273000010)*t; + + // [3,13]: 8,6,20,22, + tf = 
CONSTANT(-0.058399170081799998)*f[8] + CONSTANT(0.202300659402999990)*f[6] + CONSTANT(-0.150786008773000000)*f[20] + CONSTANT(0.168583882836999990)*f[22]; + tg = CONSTANT(-0.058399170081799998)*g[8] + CONSTANT(0.202300659402999990)*g[6] + CONSTANT(-0.150786008773000000)*g[20] + CONSTANT(0.168583882836999990)*g[22]; + y[3] += tf*g[13] + tg*f[13]; + y[13] += tf*g[3] + tg*f[3]; + t = f[3] * g[13] + f[13] * g[3]; + y[8] += CONSTANT(-0.058399170081799998)*t; + y[6] += CONSTANT(0.202300659402999990)*t; + y[20] += CONSTANT(-0.150786008773000000)*t; + y[22] += CONSTANT(0.168583882836999990)*t; + + // [3,14]: 21,23, + tf = CONSTANT(-0.075393004386399995)*f[21] + CONSTANT(0.199471140196999990)*f[23]; + tg = CONSTANT(-0.075393004386399995)*g[21] + CONSTANT(0.199471140196999990)*g[23]; + y[3] += tf*g[14] + tg*f[14]; + y[14] += tf*g[3] + tg*f[3]; + t = f[3] * g[14] + f[14] * g[3]; + y[21] += CONSTANT(-0.075393004386399995)*t; + y[23] += CONSTANT(0.199471140196999990)*t; + + // [3,15]: 8,22,24, + tf = CONSTANT(0.226179013155000000)*f[8] + CONSTANT(-0.043528171378199997)*f[22] + CONSTANT(0.230329432978999990)*f[24]; + tg = CONSTANT(0.226179013155000000)*g[8] + CONSTANT(-0.043528171378199997)*g[22] + CONSTANT(0.230329432978999990)*g[24]; + y[3] += tf*g[15] + tg*f[15]; + y[15] += tf*g[3] + tg*f[3]; + t = f[3] * g[15] + f[15] * g[3]; + y[8] += CONSTANT(0.226179013155000000)*t; + y[22] += CONSTANT(-0.043528171378199997)*t; + y[24] += CONSTANT(0.230329432978999990)*t; + + // [4,4]: 0,6,20,24, + tf = CONSTANT(0.282094791770000020)*f[0] + CONSTANT(-0.180223751576000010)*f[6] + CONSTANT(0.040299255967500003)*f[20] + CONSTANT(-0.238413613505999990)*f[24]; + tg = CONSTANT(0.282094791770000020)*g[0] + CONSTANT(-0.180223751576000010)*g[6] + CONSTANT(0.040299255967500003)*g[20] + CONSTANT(-0.238413613505999990)*g[24]; + y[4] += tf*g[4] + tg*f[4]; + t = f[4] * g[4]; + y[0] += CONSTANT(0.282094791770000020)*t; + y[6] += CONSTANT(-0.180223751576000010)*t; + y[20] += CONSTANT(0.040299255967500003)*t; + y[24] += CONSTANT(-0.238413613505999990)*t; + + // [4,5]: 7,21,23, + tf = CONSTANT(0.156078347226000000)*f[7] + CONSTANT(-0.063718718434399996)*f[21] + CONSTANT(-0.168583882835000000)*f[23]; + tg = CONSTANT(0.156078347226000000)*g[7] + CONSTANT(-0.063718718434399996)*g[21] + CONSTANT(-0.168583882835000000)*g[23]; + y[4] += tf*g[5] + tg*f[5]; + y[5] += tf*g[4] + tg*f[4]; + t = f[4] * g[5] + f[5] * g[4]; + y[7] += CONSTANT(0.156078347226000000)*t; + y[21] += CONSTANT(-0.063718718434399996)*t; + y[23] += CONSTANT(-0.168583882835000000)*t; + + // [4,11]: 3,13,15, + tf = CONSTANT(-0.058399170082300000)*f[3] + CONSTANT(0.145673124078000010)*f[13] + CONSTANT(0.094031597258400004)*f[15]; + tg = CONSTANT(-0.058399170082300000)*g[3] + CONSTANT(0.145673124078000010)*g[13] + CONSTANT(0.094031597258400004)*g[15]; + y[4] += tf*g[11] + tg*f[11]; + y[11] += tf*g[4] + tg*f[4]; + t = f[4] * g[11] + f[11] * g[4]; + y[3] += CONSTANT(-0.058399170082300000)*t; + y[13] += CONSTANT(0.145673124078000010)*t; + y[15] += CONSTANT(0.094031597258400004)*t; + + // [4,16]: 8,22, + tf = CONSTANT(0.238413613494000000)*f[8] + CONSTANT(-0.075080816693699995)*f[22]; + tg = CONSTANT(0.238413613494000000)*g[8] + CONSTANT(-0.075080816693699995)*g[22]; + y[4] += tf*g[16] + tg*f[16]; + y[16] += tf*g[4] + tg*f[4]; + t = f[4] * g[16] + f[16] * g[4]; + y[8] += CONSTANT(0.238413613494000000)*t; + y[22] += CONSTANT(-0.075080816693699995)*t; + + // [4,18]: 6,20,24, + tf = CONSTANT(0.156078347226000000)*f[6] + CONSTANT(-0.190364615029000010)*f[20] + 
CONSTANT(0.075080816691500005)*f[24]; + tg = CONSTANT(0.156078347226000000)*g[6] + CONSTANT(-0.190364615029000010)*g[20] + CONSTANT(0.075080816691500005)*g[24]; + y[4] += tf*g[18] + tg*f[18]; + y[18] += tf*g[4] + tg*f[4]; + t = f[4] * g[18] + f[18] * g[4]; + y[6] += CONSTANT(0.156078347226000000)*t; + y[20] += CONSTANT(-0.190364615029000010)*t; + y[24] += CONSTANT(0.075080816691500005)*t; + + // [4,19]: 7,21,23, + tf = CONSTANT(-0.063718718434399996)*f[7] + CONSTANT(0.141889406569999990)*f[21] + CONSTANT(0.112621225039000000)*f[23]; + tg = CONSTANT(-0.063718718434399996)*g[7] + CONSTANT(0.141889406569999990)*g[21] + CONSTANT(0.112621225039000000)*g[23]; + y[4] += tf*g[19] + tg*f[19]; + y[19] += tf*g[4] + tg*f[4]; + t = f[4] * g[19] + f[19] * g[4]; + y[7] += CONSTANT(-0.063718718434399996)*t; + y[21] += CONSTANT(0.141889406569999990)*t; + y[23] += CONSTANT(0.112621225039000000)*t; + + // [5,5]: 0,6,8,20,22, + tf = CONSTANT(0.282094791773999990)*f[0] + CONSTANT(0.090111875786499998)*f[6] + CONSTANT(-0.156078347227999990)*f[8] + CONSTANT(-0.161197023870999990)*f[20] + CONSTANT(-0.180223751574000000)*f[22]; + tg = CONSTANT(0.282094791773999990)*g[0] + CONSTANT(0.090111875786499998)*g[6] + CONSTANT(-0.156078347227999990)*g[8] + CONSTANT(-0.161197023870999990)*g[20] + CONSTANT(-0.180223751574000000)*g[22]; + y[5] += tf*g[5] + tg*f[5]; + t = f[5] * g[5]; + y[0] += CONSTANT(0.282094791773999990)*t; + y[6] += CONSTANT(0.090111875786499998)*t; + y[8] += CONSTANT(-0.156078347227999990)*t; + y[20] += CONSTANT(-0.161197023870999990)*t; + y[22] += CONSTANT(-0.180223751574000000)*t; + + // [5,11]: 2,12,14, + tf = CONSTANT(0.233596680327000010)*f[2] + CONSTANT(0.059470803871800003)*f[12] + CONSTANT(-0.115164716491000000)*f[14]; + tg = CONSTANT(0.233596680327000010)*g[2] + CONSTANT(0.059470803871800003)*g[12] + CONSTANT(-0.115164716491000000)*g[14]; + y[5] += tf*g[11] + tg*f[11]; + y[11] += tf*g[5] + tg*f[5]; + t = f[5] * g[11] + f[11] * g[5]; + y[2] += CONSTANT(0.233596680327000010)*t; + y[12] += CONSTANT(0.059470803871800003)*t; + y[14] += CONSTANT(-0.115164716491000000)*t; + + // [5,17]: 8,22,24, + tf = CONSTANT(0.168583882832999990)*f[8] + CONSTANT(0.132725386548000010)*f[22] + CONSTANT(-0.140463346189000000)*f[24]; + tg = CONSTANT(0.168583882832999990)*g[8] + CONSTANT(0.132725386548000010)*g[22] + CONSTANT(-0.140463346189000000)*g[24]; + y[5] += tf*g[17] + tg*f[17]; + y[17] += tf*g[5] + tg*f[5]; + t = f[5] * g[17] + f[17] * g[5]; + y[8] += CONSTANT(0.168583882832999990)*t; + y[22] += CONSTANT(0.132725386548000010)*t; + y[24] += CONSTANT(-0.140463346189000000)*t; + + // [5,18]: 7,21,23, + tf = CONSTANT(0.180223751571000010)*f[7] + CONSTANT(0.090297865407399994)*f[21] + CONSTANT(-0.132725386549000010)*f[23]; + tg = CONSTANT(0.180223751571000010)*g[7] + CONSTANT(0.090297865407399994)*g[21] + CONSTANT(-0.132725386549000010)*g[23]; + y[5] += tf*g[18] + tg*f[18]; + y[18] += tf*g[5] + tg*f[5]; + t = f[5] * g[18] + f[18] * g[5]; + y[7] += CONSTANT(0.180223751571000010)*t; + y[21] += CONSTANT(0.090297865407399994)*t; + y[23] += CONSTANT(-0.132725386549000010)*t; + + // [5,19]: 6,8,20,22, + tf = CONSTANT(0.220728115440999990)*f[6] + CONSTANT(0.063718718433900007)*f[8] + CONSTANT(0.044869370061299998)*f[20] + CONSTANT(-0.090297865408399999)*f[22]; + tg = CONSTANT(0.220728115440999990)*g[6] + CONSTANT(0.063718718433900007)*g[8] + CONSTANT(0.044869370061299998)*g[20] + CONSTANT(-0.090297865408399999)*g[22]; + y[5] += tf*g[19] + tg*f[19]; + y[19] += tf*g[5] + tg*f[5]; + t = f[5] * g[19] + f[19] * g[5]; + y[6] += 
CONSTANT(0.220728115440999990)*t; + y[8] += CONSTANT(0.063718718433900007)*t; + y[20] += CONSTANT(0.044869370061299998)*t; + y[22] += CONSTANT(-0.090297865408399999)*t; + + // [6,6]: 0,6,20, + tf = CONSTANT(0.282094797560000000)*f[0] + CONSTANT(0.241795553185999990)*f[20]; + tg = CONSTANT(0.282094797560000000)*g[0] + CONSTANT(0.241795553185999990)*g[20]; + y[6] += tf*g[6] + tg*f[6]; + t = f[6] * g[6]; + y[0] += CONSTANT(0.282094797560000000)*t; + y[6] += CONSTANT(0.180223764527000010)*t; + y[20] += CONSTANT(0.241795553185999990)*t; + + // [7,7]: 6,0,8,20,22, + tf = CONSTANT(0.090111875786499998)*f[6] + CONSTANT(0.282094791773999990)*f[0] + CONSTANT(0.156078347227999990)*f[8] + CONSTANT(-0.161197023870999990)*f[20] + CONSTANT(0.180223751574000000)*f[22]; + tg = CONSTANT(0.090111875786499998)*g[6] + CONSTANT(0.282094791773999990)*g[0] + CONSTANT(0.156078347227999990)*g[8] + CONSTANT(-0.161197023870999990)*g[20] + CONSTANT(0.180223751574000000)*g[22]; + y[7] += tf*g[7] + tg*f[7]; + t = f[7] * g[7]; + y[6] += CONSTANT(0.090111875786499998)*t; + y[0] += CONSTANT(0.282094791773999990)*t; + y[8] += CONSTANT(0.156078347227999990)*t; + y[20] += CONSTANT(-0.161197023870999990)*t; + y[22] += CONSTANT(0.180223751574000000)*t; + + // [7,13]: 12,2,14, + tf = CONSTANT(0.059470803871800003)*f[12] + CONSTANT(0.233596680327000010)*f[2] + CONSTANT(0.115164716491000000)*f[14]; + tg = CONSTANT(0.059470803871800003)*g[12] + CONSTANT(0.233596680327000010)*g[2] + CONSTANT(0.115164716491000000)*g[14]; + y[7] += tf*g[13] + tg*f[13]; + y[13] += tf*g[7] + tg*f[7]; + t = f[7] * g[13] + f[13] * g[7]; + y[12] += CONSTANT(0.059470803871800003)*t; + y[2] += CONSTANT(0.233596680327000010)*t; + y[14] += CONSTANT(0.115164716491000000)*t; + + // [7,17]: 16,4,18, + tf = CONSTANT(0.140463346187999990)*f[16] + CONSTANT(0.168583882835000000)*f[4] + CONSTANT(0.132725386549000010)*f[18]; + tg = CONSTANT(0.140463346187999990)*g[16] + CONSTANT(0.168583882835000000)*g[4] + CONSTANT(0.132725386549000010)*g[18]; + y[7] += tf*g[17] + tg*f[17]; + y[17] += tf*g[7] + tg*f[7]; + t = f[7] * g[17] + f[17] * g[7]; + y[16] += CONSTANT(0.140463346187999990)*t; + y[4] += CONSTANT(0.168583882835000000)*t; + y[18] += CONSTANT(0.132725386549000010)*t; + + // [7,21]: 8,20,6,22, + tf = CONSTANT(-0.063718718433900007)*f[8] + CONSTANT(0.044869370061299998)*f[20] + CONSTANT(0.220728115440999990)*f[6] + CONSTANT(0.090297865408399999)*f[22]; + tg = CONSTANT(-0.063718718433900007)*g[8] + CONSTANT(0.044869370061299998)*g[20] + CONSTANT(0.220728115440999990)*g[6] + CONSTANT(0.090297865408399999)*g[22]; + y[7] += tf*g[21] + tg*f[21]; + y[21] += tf*g[7] + tg*f[7]; + t = f[7] * g[21] + f[21] * g[7]; + y[8] += CONSTANT(-0.063718718433900007)*t; + y[20] += CONSTANT(0.044869370061299998)*t; + y[6] += CONSTANT(0.220728115440999990)*t; + y[22] += CONSTANT(0.090297865408399999)*t; + + // [7,23]: 8,22,24, + tf = CONSTANT(0.168583882832999990)*f[8] + CONSTANT(0.132725386548000010)*f[22] + CONSTANT(0.140463346189000000)*f[24]; + tg = CONSTANT(0.168583882832999990)*g[8] + CONSTANT(0.132725386548000010)*g[22] + CONSTANT(0.140463346189000000)*g[24]; + y[7] += tf*g[23] + tg*f[23]; + y[23] += tf*g[7] + tg*f[7]; + t = f[7] * g[23] + f[23] * g[7]; + y[8] += CONSTANT(0.168583882832999990)*t; + y[22] += CONSTANT(0.132725386548000010)*t; + y[24] += CONSTANT(0.140463346189000000)*t; + + // [8,8]: 0,6,20,24, + tf = CONSTANT(0.282094791770000020)*f[0] + CONSTANT(-0.180223751576000010)*f[6] + CONSTANT(0.040299255967500003)*f[20] + CONSTANT(0.238413613505999990)*f[24]; + tg = 
CONSTANT(0.282094791770000020)*g[0] + CONSTANT(-0.180223751576000010)*g[6] + CONSTANT(0.040299255967500003)*g[20] + CONSTANT(0.238413613505999990)*g[24]; + y[8] += tf*g[8] + tg*f[8]; + t = f[8] * g[8]; + y[0] += CONSTANT(0.282094791770000020)*t; + y[6] += CONSTANT(-0.180223751576000010)*t; + y[20] += CONSTANT(0.040299255967500003)*t; + y[24] += CONSTANT(0.238413613505999990)*t; + + // [8,22]: 6,20,24, + tf = CONSTANT(0.156078347226000000)*f[6] + CONSTANT(-0.190364615029000010)*f[20] + CONSTANT(-0.075080816691500005)*f[24]; + tg = CONSTANT(0.156078347226000000)*g[6] + CONSTANT(-0.190364615029000010)*g[20] + CONSTANT(-0.075080816691500005)*g[24]; + y[8] += tf*g[22] + tg*f[22]; + y[22] += tf*g[8] + tg*f[8]; + t = f[8] * g[22] + f[22] * g[8]; + y[6] += CONSTANT(0.156078347226000000)*t; + y[20] += CONSTANT(-0.190364615029000010)*t; + y[24] += CONSTANT(-0.075080816691500005)*t; + + // [9,9]: 6,0,20, + tf = CONSTANT(-0.210261043508000010)*f[6] + CONSTANT(0.282094791766999970)*f[0] + CONSTANT(0.076934943209800002)*f[20]; + tg = CONSTANT(-0.210261043508000010)*g[6] + CONSTANT(0.282094791766999970)*g[0] + CONSTANT(0.076934943209800002)*g[20]; + y[9] += tf*g[9] + tg*f[9]; + t = f[9] * g[9]; + y[6] += CONSTANT(-0.210261043508000010)*t; + y[0] += CONSTANT(0.282094791766999970)*t; + y[20] += CONSTANT(0.076934943209800002)*t; + + // [9,10]: 7,21, + tf = CONSTANT(0.148677009678999990)*f[7] + CONSTANT(-0.099322584599600000)*f[21]; + tg = CONSTANT(0.148677009678999990)*g[7] + CONSTANT(-0.099322584599600000)*g[21]; + y[9] += tf*g[10] + tg*f[10]; + y[10] += tf*g[9] + tg*f[9]; + t = f[9] * g[10] + f[10] * g[9]; + y[7] += CONSTANT(0.148677009678999990)*t; + y[21] += CONSTANT(-0.099322584599600000)*t; + + // [9,11]: 8,22,24, + tf = CONSTANT(-0.094031597259499999)*f[8] + CONSTANT(0.133255230518000010)*f[22] + CONSTANT(0.117520066950999990)*f[24]; + tg = CONSTANT(-0.094031597259499999)*g[8] + CONSTANT(0.133255230518000010)*g[22] + CONSTANT(0.117520066950999990)*g[24]; + y[9] += tf*g[11] + tg*f[11]; + y[11] += tf*g[9] + tg*f[9]; + t = f[9] * g[11] + f[11] * g[9]; + y[8] += CONSTANT(-0.094031597259499999)*t; + y[22] += CONSTANT(0.133255230518000010)*t; + y[24] += CONSTANT(0.117520066950999990)*t; + + // [9,13]: 4,16,18, + tf = CONSTANT(-0.094031597258400004)*f[4] + CONSTANT(-0.117520066953000000)*f[16] + CONSTANT(0.133255230519000010)*f[18]; + tg = CONSTANT(-0.094031597258400004)*g[4] + CONSTANT(-0.117520066953000000)*g[16] + CONSTANT(0.133255230519000010)*g[18]; + y[9] += tf*g[13] + tg*f[13]; + y[13] += tf*g[9] + tg*f[9]; + t = f[9] * g[13] + f[13] * g[9]; + y[4] += CONSTANT(-0.094031597258400004)*t; + y[16] += CONSTANT(-0.117520066953000000)*t; + y[18] += CONSTANT(0.133255230519000010)*t; + + // [9,14]: 5,19, + tf = CONSTANT(0.148677009677999990)*f[5] + CONSTANT(-0.099322584600699995)*f[19]; + tg = CONSTANT(0.148677009677999990)*g[5] + CONSTANT(-0.099322584600699995)*g[19]; + y[9] += tf*g[14] + tg*f[14]; + y[14] += tf*g[9] + tg*f[9]; + t = f[9] * g[14] + f[14] * g[9]; + y[5] += CONSTANT(0.148677009677999990)*t; + y[19] += CONSTANT(-0.099322584600699995)*t; + + // [9,17]: 2,12, + tf = CONSTANT(0.162867503964999990)*f[2] + CONSTANT(-0.203550726872999990)*f[12]; + tg = CONSTANT(0.162867503964999990)*g[2] + CONSTANT(-0.203550726872999990)*g[12]; + y[9] += tf*g[17] + tg*f[17]; + y[17] += tf*g[9] + tg*f[9]; + t = f[9] * g[17] + f[17] * g[9]; + y[2] += CONSTANT(0.162867503964999990)*t; + y[12] += CONSTANT(-0.203550726872999990)*t; + + // [10,10]: 0,20,24, + tf = CONSTANT(0.282094791771999980)*f[0] + 
CONSTANT(-0.179514867494000000)*f[20] + CONSTANT(-0.151717754049000010)*f[24]; + tg = CONSTANT(0.282094791771999980)*g[0] + CONSTANT(-0.179514867494000000)*g[20] + CONSTANT(-0.151717754049000010)*g[24]; + y[10] += tf*g[10] + tg*f[10]; + t = f[10] * g[10]; + y[0] += CONSTANT(0.282094791771999980)*t; + y[20] += CONSTANT(-0.179514867494000000)*t; + y[24] += CONSTANT(-0.151717754049000010)*t; + + // [10,11]: 7,21,23, + tf = CONSTANT(0.115164716490000000)*f[7] + CONSTANT(0.102579924281000000)*f[21] + CONSTANT(-0.067850242288900006)*f[23]; + tg = CONSTANT(0.115164716490000000)*g[7] + CONSTANT(0.102579924281000000)*g[21] + CONSTANT(-0.067850242288900006)*g[23]; + y[10] += tf*g[11] + tg*f[11]; + y[11] += tf*g[10] + tg*f[10]; + t = f[10] * g[11] + f[11] * g[10]; + y[7] += CONSTANT(0.115164716490000000)*t; + y[21] += CONSTANT(0.102579924281000000)*t; + y[23] += CONSTANT(-0.067850242288900006)*t; + + // [10,12]: 4,18, + tf = CONSTANT(-0.188063194517999990)*f[4] + CONSTANT(-0.044418410173299998)*f[18]; + tg = CONSTANT(-0.188063194517999990)*g[4] + CONSTANT(-0.044418410173299998)*g[18]; + y[10] += tf*g[12] + tg*f[12]; + y[12] += tf*g[10] + tg*f[10]; + t = f[10] * g[12] + f[12] * g[10]; + y[4] += CONSTANT(-0.188063194517999990)*t; + y[18] += CONSTANT(-0.044418410173299998)*t; + + // [10,13]: 5,17,19, + tf = CONSTANT(0.115164716490000000)*f[5] + CONSTANT(0.067850242288900006)*f[17] + CONSTANT(0.102579924281000000)*f[19]; + tg = CONSTANT(0.115164716490000000)*g[5] + CONSTANT(0.067850242288900006)*g[17] + CONSTANT(0.102579924281000000)*g[19]; + y[10] += tf*g[13] + tg*f[13]; + y[13] += tf*g[10] + tg*f[10]; + t = f[10] * g[13] + f[13] * g[10]; + y[5] += CONSTANT(0.115164716490000000)*t; + y[17] += CONSTANT(0.067850242288900006)*t; + y[19] += CONSTANT(0.102579924281000000)*t; + + // [10,14]: 16, + tf = CONSTANT(0.151717754044999990)*f[16]; + tg = CONSTANT(0.151717754044999990)*g[16]; + y[10] += tf*g[14] + tg*f[14]; + y[14] += tf*g[10] + tg*f[10]; + t = f[10] * g[14] + f[14] * g[10]; + y[16] += CONSTANT(0.151717754044999990)*t; + + // [10,15]: 5,19, + tf = CONSTANT(-0.148677009678999990)*f[5] + CONSTANT(0.099322584599600000)*f[19]; + tg = CONSTANT(-0.148677009678999990)*g[5] + CONSTANT(0.099322584599600000)*g[19]; + y[10] += tf*g[15] + tg*f[15]; + y[15] += tf*g[10] + tg*f[10]; + t = f[10] * g[15] + f[15] * g[10]; + y[5] += CONSTANT(-0.148677009678999990)*t; + y[19] += CONSTANT(0.099322584599600000)*t; + + // [11,11]: 0,6,8,20,22, + tf = CONSTANT(0.282094791773999990)*f[0] + CONSTANT(0.126156626101000010)*f[6] + CONSTANT(-0.145673124078999990)*f[8] + CONSTANT(0.025644981070299999)*f[20] + CONSTANT(-0.114687841910000000)*f[22]; + tg = CONSTANT(0.282094791773999990)*g[0] + CONSTANT(0.126156626101000010)*g[6] + CONSTANT(-0.145673124078999990)*g[8] + CONSTANT(0.025644981070299999)*g[20] + CONSTANT(-0.114687841910000000)*g[22]; + y[11] += tf*g[11] + tg*f[11]; + t = f[11] * g[11]; + y[0] += CONSTANT(0.282094791773999990)*t; + y[6] += CONSTANT(0.126156626101000010)*t; + y[8] += CONSTANT(-0.145673124078999990)*t; + y[20] += CONSTANT(0.025644981070299999)*t; + y[22] += CONSTANT(-0.114687841910000000)*t; + + // [11,14]: 17, + tf = CONSTANT(0.067850242288500007)*f[17]; + tg = CONSTANT(0.067850242288500007)*g[17]; + y[11] += tf*g[14] + tg*f[14]; + y[14] += tf*g[11] + tg*f[11]; + t = f[11] * g[14] + f[14] * g[11]; + y[17] += CONSTANT(0.067850242288500007)*t; + + // [11,15]: 16, + tf = CONSTANT(-0.117520066953000000)*f[16]; + tg = CONSTANT(-0.117520066953000000)*g[16]; + y[11] += tf*g[15] + tg*f[15]; + y[15] += tf*g[11] + 
tg*f[11]; + t = f[11] * g[15] + f[15] * g[11]; + y[16] += CONSTANT(-0.117520066953000000)*t; + + // [11,18]: 3,13,15, + tf = CONSTANT(0.168583882834000000)*f[3] + CONSTANT(0.114687841909000000)*f[13] + CONSTANT(-0.133255230519000010)*f[15]; + tg = CONSTANT(0.168583882834000000)*g[3] + CONSTANT(0.114687841909000000)*g[13] + CONSTANT(-0.133255230519000010)*g[15]; + y[11] += tf*g[18] + tg*f[18]; + y[18] += tf*g[11] + tg*f[11]; + t = f[11] * g[18] + f[18] * g[11]; + y[3] += CONSTANT(0.168583882834000000)*t; + y[13] += CONSTANT(0.114687841909000000)*t; + y[15] += CONSTANT(-0.133255230519000010)*t; + + // [11,19]: 2,14,12, + tf = CONSTANT(0.238413613504000000)*f[2] + CONSTANT(-0.102579924282000000)*f[14] + CONSTANT(0.099322584599300004)*f[12]; + tg = CONSTANT(0.238413613504000000)*g[2] + CONSTANT(-0.102579924282000000)*g[14] + CONSTANT(0.099322584599300004)*g[12]; + y[11] += tf*g[19] + tg*f[19]; + y[19] += tf*g[11] + tg*f[11]; + t = f[11] * g[19] + f[19] * g[11]; + y[2] += CONSTANT(0.238413613504000000)*t; + y[14] += CONSTANT(-0.102579924282000000)*t; + y[12] += CONSTANT(0.099322584599300004)*t; + + // [12,12]: 0,6,20, + tf = CONSTANT(0.282094799871999980)*f[0] + CONSTANT(0.168208852954000010)*f[6] + CONSTANT(0.153869910786000010)*f[20]; + tg = CONSTANT(0.282094799871999980)*g[0] + CONSTANT(0.168208852954000010)*g[6] + CONSTANT(0.153869910786000010)*g[20]; + y[12] += tf*g[12] + tg*f[12]; + t = f[12] * g[12]; + y[0] += CONSTANT(0.282094799871999980)*t; + y[6] += CONSTANT(0.168208852954000010)*t; + y[20] += CONSTANT(0.153869910786000010)*t; + + // [12,14]: 8,22, + tf = CONSTANT(-0.188063194517999990)*f[8] + CONSTANT(-0.044418410173299998)*f[22]; + tg = CONSTANT(-0.188063194517999990)*g[8] + CONSTANT(-0.044418410173299998)*g[22]; + y[12] += tf*g[14] + tg*f[14]; + y[14] += tf*g[12] + tg*f[12]; + t = f[12] * g[14] + f[14] * g[12]; + y[8] += CONSTANT(-0.188063194517999990)*t; + y[22] += CONSTANT(-0.044418410173299998)*t; + + // [13,13]: 0,8,6,20,22, + tf = CONSTANT(0.282094791773999990)*f[0] + CONSTANT(0.145673124078999990)*f[8] + CONSTANT(0.126156626101000010)*f[6] + CONSTANT(0.025644981070299999)*f[20] + CONSTANT(0.114687841910000000)*f[22]; + tg = CONSTANT(0.282094791773999990)*g[0] + CONSTANT(0.145673124078999990)*g[8] + CONSTANT(0.126156626101000010)*g[6] + CONSTANT(0.025644981070299999)*g[20] + CONSTANT(0.114687841910000000)*g[22]; + y[13] += tf*g[13] + tg*f[13]; + t = f[13] * g[13]; + y[0] += CONSTANT(0.282094791773999990)*t; + y[8] += CONSTANT(0.145673124078999990)*t; + y[6] += CONSTANT(0.126156626101000010)*t; + y[20] += CONSTANT(0.025644981070299999)*t; + y[22] += CONSTANT(0.114687841910000000)*t; + + // [13,14]: 23, + tf = CONSTANT(0.067850242288500007)*f[23]; + tg = CONSTANT(0.067850242288500007)*g[23]; + y[13] += tf*g[14] + tg*f[14]; + y[14] += tf*g[13] + tg*f[13]; + t = f[13] * g[14] + f[14] * g[13]; + y[23] += CONSTANT(0.067850242288500007)*t; + + // [13,15]: 8,22,24, + tf = CONSTANT(-0.094031597259499999)*f[8] + CONSTANT(0.133255230518000010)*f[22] + CONSTANT(-0.117520066950999990)*f[24]; + tg = CONSTANT(-0.094031597259499999)*g[8] + CONSTANT(0.133255230518000010)*g[22] + CONSTANT(-0.117520066950999990)*g[24]; + y[13] += tf*g[15] + tg*f[15]; + y[15] += tf*g[13] + tg*f[13]; + t = f[13] * g[15] + f[15] * g[13]; + y[8] += CONSTANT(-0.094031597259499999)*t; + y[22] += CONSTANT(0.133255230518000010)*t; + y[24] += CONSTANT(-0.117520066950999990)*t; + + // [13,21]: 2,12,14, + tf = CONSTANT(0.238413613504000000)*f[2] + CONSTANT(0.099322584599300004)*f[12] + 
CONSTANT(0.102579924282000000)*f[14]; + tg = CONSTANT(0.238413613504000000)*g[2] + CONSTANT(0.099322584599300004)*g[12] + CONSTANT(0.102579924282000000)*g[14]; + y[13] += tf*g[21] + tg*f[21]; + y[21] += tf*g[13] + tg*f[13]; + t = f[13] * g[21] + f[21] * g[13]; + y[2] += CONSTANT(0.238413613504000000)*t; + y[12] += CONSTANT(0.099322584599300004)*t; + y[14] += CONSTANT(0.102579924282000000)*t; + + // [14,14]: 0,20,24, + tf = CONSTANT(0.282094791771999980)*f[0] + CONSTANT(-0.179514867494000000)*f[20] + CONSTANT(0.151717754049000010)*f[24]; + tg = CONSTANT(0.282094791771999980)*g[0] + CONSTANT(-0.179514867494000000)*g[20] + CONSTANT(0.151717754049000010)*g[24]; + y[14] += tf*g[14] + tg*f[14]; + t = f[14] * g[14]; + y[0] += CONSTANT(0.282094791771999980)*t; + y[20] += CONSTANT(-0.179514867494000000)*t; + y[24] += CONSTANT(0.151717754049000010)*t; + + // [14,15]: 7,21, + tf = CONSTANT(0.148677009677999990)*f[7] + CONSTANT(-0.099322584600699995)*f[21]; + tg = CONSTANT(0.148677009677999990)*g[7] + CONSTANT(-0.099322584600699995)*g[21]; + y[14] += tf*g[15] + tg*f[15]; + y[15] += tf*g[14] + tg*f[14]; + t = f[14] * g[15] + f[15] * g[14]; + y[7] += CONSTANT(0.148677009677999990)*t; + y[21] += CONSTANT(-0.099322584600699995)*t; + + // [15,15]: 0,6,20, + tf = CONSTANT(0.282094791766999970)*f[0] + CONSTANT(-0.210261043508000010)*f[6] + CONSTANT(0.076934943209800002)*f[20]; + tg = CONSTANT(0.282094791766999970)*g[0] + CONSTANT(-0.210261043508000010)*g[6] + CONSTANT(0.076934943209800002)*g[20]; + y[15] += tf*g[15] + tg*f[15]; + t = f[15] * g[15]; + y[0] += CONSTANT(0.282094791766999970)*t; + y[6] += CONSTANT(-0.210261043508000010)*t; + y[20] += CONSTANT(0.076934943209800002)*t; + + // [15,23]: 12,2, + tf = CONSTANT(-0.203550726872999990)*f[12] + CONSTANT(0.162867503964999990)*f[2]; + tg = CONSTANT(-0.203550726872999990)*g[12] + CONSTANT(0.162867503964999990)*g[2]; + y[15] += tf*g[23] + tg*f[23]; + y[23] += tf*g[15] + tg*f[15]; + t = f[15] * g[23] + f[23] * g[15]; + y[12] += CONSTANT(-0.203550726872999990)*t; + y[2] += CONSTANT(0.162867503964999990)*t; + + // [16,16]: 0,6,20, + tf = CONSTANT(0.282094791763999990)*f[0] + CONSTANT(-0.229375683829000000)*f[6] + CONSTANT(0.106525305981000000)*f[20]; + tg = CONSTANT(0.282094791763999990)*g[0] + CONSTANT(-0.229375683829000000)*g[6] + CONSTANT(0.106525305981000000)*g[20]; + y[16] += tf*g[16] + tg*f[16]; + t = f[16] * g[16]; + y[0] += CONSTANT(0.282094791763999990)*t; + y[6] += CONSTANT(-0.229375683829000000)*t; + y[20] += CONSTANT(0.106525305981000000)*t; + + // [16,18]: 8,22, + tf = CONSTANT(-0.075080816693699995)*f[8] + CONSTANT(0.135045473380000000)*f[22]; + tg = CONSTANT(-0.075080816693699995)*g[8] + CONSTANT(0.135045473380000000)*g[22]; + y[16] += tf*g[18] + tg*f[18]; + y[18] += tf*g[16] + tg*f[16]; + t = f[16] * g[18] + f[18] * g[16]; + y[8] += CONSTANT(-0.075080816693699995)*t; + y[22] += CONSTANT(0.135045473380000000)*t; + + // [16,23]: 19,5, + tf = CONSTANT(-0.119098912754999990)*f[19] + CONSTANT(0.140463346187999990)*f[5]; + tg = CONSTANT(-0.119098912754999990)*g[19] + CONSTANT(0.140463346187999990)*g[5]; + y[16] += tf*g[23] + tg*f[23]; + y[23] += tf*g[16] + tg*f[16]; + t = f[16] * g[23] + f[23] * g[16]; + y[19] += CONSTANT(-0.119098912754999990)*t; + y[5] += CONSTANT(0.140463346187999990)*t; + + // [17,17]: 0,6,20, + tf = CONSTANT(0.282094791768999990)*f[0] + CONSTANT(-0.057343920955899998)*f[6] + CONSTANT(-0.159787958979000000)*f[20]; + tg = CONSTANT(0.282094791768999990)*g[0] + CONSTANT(-0.057343920955899998)*g[6] + 
CONSTANT(-0.159787958979000000)*g[20]; + y[17] += tf*g[17] + tg*f[17]; + t = f[17] * g[17]; + y[0] += CONSTANT(0.282094791768999990)*t; + y[6] += CONSTANT(-0.057343920955899998)*t; + y[20] += CONSTANT(-0.159787958979000000)*t; + + // [17,19]: 8,22,24, + tf = CONSTANT(-0.112621225039000000)*f[8] + CONSTANT(0.045015157794100001)*f[22] + CONSTANT(0.119098912753000000)*f[24]; + tg = CONSTANT(-0.112621225039000000)*g[8] + CONSTANT(0.045015157794100001)*g[22] + CONSTANT(0.119098912753000000)*g[24]; + y[17] += tf*g[19] + tg*f[19]; + y[19] += tf*g[17] + tg*f[17]; + t = f[17] * g[19] + f[19] * g[17]; + y[8] += CONSTANT(-0.112621225039000000)*t; + y[22] += CONSTANT(0.045015157794100001)*t; + y[24] += CONSTANT(0.119098912753000000)*t; + + // [17,21]: 16,4,18, + tf = CONSTANT(-0.119098912754999990)*f[16] + CONSTANT(-0.112621225039000000)*f[4] + CONSTANT(0.045015157794399997)*f[18]; + tg = CONSTANT(-0.119098912754999990)*g[16] + CONSTANT(-0.112621225039000000)*g[4] + CONSTANT(0.045015157794399997)*g[18]; + y[17] += tf*g[21] + tg*f[21]; + y[21] += tf*g[17] + tg*f[17]; + t = f[17] * g[21] + f[21] * g[17]; + y[16] += CONSTANT(-0.119098912754999990)*t; + y[4] += CONSTANT(-0.112621225039000000)*t; + y[18] += CONSTANT(0.045015157794399997)*t; + + // [18,18]: 6,0,20,24, + tf = CONSTANT(0.065535909662600006)*f[6] + CONSTANT(0.282094791771999980)*f[0] + CONSTANT(-0.083698454702400005)*f[20] + CONSTANT(-0.135045473384000000)*f[24]; + tg = CONSTANT(0.065535909662600006)*g[6] + CONSTANT(0.282094791771999980)*g[0] + CONSTANT(-0.083698454702400005)*g[20] + CONSTANT(-0.135045473384000000)*g[24]; + y[18] += tf*g[18] + tg*f[18]; + t = f[18] * g[18]; + y[6] += CONSTANT(0.065535909662600006)*t; + y[0] += CONSTANT(0.282094791771999980)*t; + y[20] += CONSTANT(-0.083698454702400005)*t; + y[24] += CONSTANT(-0.135045473384000000)*t; + + // [18,19]: 7,21,23, + tf = CONSTANT(0.090297865407399994)*f[7] + CONSTANT(0.102084782359000000)*f[21] + CONSTANT(-0.045015157794399997)*f[23]; + tg = CONSTANT(0.090297865407399994)*g[7] + CONSTANT(0.102084782359000000)*g[21] + CONSTANT(-0.045015157794399997)*g[23]; + y[18] += tf*g[19] + tg*f[19]; + y[19] += tf*g[18] + tg*f[18]; + t = f[18] * g[19] + f[19] * g[18]; + y[7] += CONSTANT(0.090297865407399994)*t; + y[21] += CONSTANT(0.102084782359000000)*t; + y[23] += CONSTANT(-0.045015157794399997)*t; + + // [19,19]: 6,8,0,20,22, + tf = CONSTANT(0.139263808033999990)*f[6] + CONSTANT(-0.141889406570999990)*f[8] + CONSTANT(0.282094791773999990)*f[0] + CONSTANT(0.068480553847200004)*f[20] + CONSTANT(-0.102084782360000000)*f[22]; + tg = CONSTANT(0.139263808033999990)*g[6] + CONSTANT(-0.141889406570999990)*g[8] + CONSTANT(0.282094791773999990)*g[0] + CONSTANT(0.068480553847200004)*g[20] + CONSTANT(-0.102084782360000000)*g[22]; + y[19] += tf*g[19] + tg*f[19]; + t = f[19] * g[19]; + y[6] += CONSTANT(0.139263808033999990)*t; + y[8] += CONSTANT(-0.141889406570999990)*t; + y[0] += CONSTANT(0.282094791773999990)*t; + y[20] += CONSTANT(0.068480553847200004)*t; + y[22] += CONSTANT(-0.102084782360000000)*t; + + // [20,20]: 6,0,20, + tf = CONSTANT(0.163839797503000010)*f[6] + CONSTANT(0.282094802232000010)*f[0]; + tg = CONSTANT(0.163839797503000010)*g[6] + CONSTANT(0.282094802232000010)*g[0]; + y[20] += tf*g[20] + tg*f[20]; + t = f[20] * g[20]; + y[6] += CONSTANT(0.163839797503000010)*t; + y[0] += CONSTANT(0.282094802232000010)*t; + y[20] += CONSTANT(0.136961139005999990)*t; + + // [21,21]: 6,20,0,8,22, + tf = CONSTANT(0.139263808033999990)*f[6] + CONSTANT(0.068480553847200004)*f[20] + 
CONSTANT(0.282094791773999990)*f[0] + CONSTANT(0.141889406570999990)*f[8] + CONSTANT(0.102084782360000000)*f[22]; + tg = CONSTANT(0.139263808033999990)*g[6] + CONSTANT(0.068480553847200004)*g[20] + CONSTANT(0.282094791773999990)*g[0] + CONSTANT(0.141889406570999990)*g[8] + CONSTANT(0.102084782360000000)*g[22]; + y[21] += tf*g[21] + tg*f[21]; + t = f[21] * g[21]; + y[6] += CONSTANT(0.139263808033999990)*t; + y[20] += CONSTANT(0.068480553847200004)*t; + y[0] += CONSTANT(0.282094791773999990)*t; + y[8] += CONSTANT(0.141889406570999990)*t; + y[22] += CONSTANT(0.102084782360000000)*t; + + // [21,23]: 8,22,24, + tf = CONSTANT(-0.112621225039000000)*f[8] + CONSTANT(0.045015157794100001)*f[22] + CONSTANT(-0.119098912753000000)*f[24]; + tg = CONSTANT(-0.112621225039000000)*g[8] + CONSTANT(0.045015157794100001)*g[22] + CONSTANT(-0.119098912753000000)*g[24]; + y[21] += tf*g[23] + tg*f[23]; + y[23] += tf*g[21] + tg*f[21]; + t = f[21] * g[23] + f[23] * g[21]; + y[8] += CONSTANT(-0.112621225039000000)*t; + y[22] += CONSTANT(0.045015157794100001)*t; + y[24] += CONSTANT(-0.119098912753000000)*t; + + // [22,22]: 6,20,0,24, + tf = CONSTANT(0.065535909662600006)*f[6] + CONSTANT(-0.083698454702400005)*f[20] + CONSTANT(0.282094791771999980)*f[0] + CONSTANT(0.135045473384000000)*f[24]; + tg = CONSTANT(0.065535909662600006)*g[6] + CONSTANT(-0.083698454702400005)*g[20] + CONSTANT(0.282094791771999980)*g[0] + CONSTANT(0.135045473384000000)*g[24]; + y[22] += tf*g[22] + tg*f[22]; + t = f[22] * g[22]; + y[6] += CONSTANT(0.065535909662600006)*t; + y[20] += CONSTANT(-0.083698454702400005)*t; + y[0] += CONSTANT(0.282094791771999980)*t; + y[24] += CONSTANT(0.135045473384000000)*t; + + // [23,23]: 6,20,0, + tf = CONSTANT(-0.057343920955899998)*f[6] + CONSTANT(-0.159787958979000000)*f[20] + CONSTANT(0.282094791768999990)*f[0]; + tg = CONSTANT(-0.057343920955899998)*g[6] + CONSTANT(-0.159787958979000000)*g[20] + CONSTANT(0.282094791768999990)*g[0]; + y[23] += tf*g[23] + tg*f[23]; + t = f[23] * g[23]; + y[6] += CONSTANT(-0.057343920955899998)*t; + y[20] += CONSTANT(-0.159787958979000000)*t; + y[0] += CONSTANT(0.282094791768999990)*t; + + // [24,24]: 6,0,20, + tf = CONSTANT(-0.229375683829000000)*f[6] + CONSTANT(0.282094791763999990)*f[0] + CONSTANT(0.106525305981000000)*f[20]; + tg = CONSTANT(-0.229375683829000000)*g[6] + CONSTANT(0.282094791763999990)*g[0] + CONSTANT(0.106525305981000000)*g[20]; + y[24] += tf*g[24] + tg*f[24]; + t = f[24] * g[24]; + y[6] += CONSTANT(-0.229375683829000000)*t; + y[0] += CONSTANT(0.282094791763999990)*t; + y[20] += CONSTANT(0.106525305981000000)*t; + + // multiply count=1135 + + return y; +} + + +//------------------------------------------------------------------------------------- +// http://msdn.microsoft.com/en-us/library/windows/desktop/bb232909.aspx +//------------------------------------------------------------------------------------- +_Use_decl_annotations_ +float* DirectX::XMSHMultiply6( + float *y, + const float *f, + const float *g) noexcept +{ + if (!y || !f || !g) + return nullptr; + + REAL tf, tg, t; + // [0,0]: 0, + y[0] = CONSTANT(0.282094792935999980)*f[0] * g[0]; + + // [1,1]: 0,6,8, + tf = CONSTANT(0.282094791773000010)*f[0] + CONSTANT(-0.126156626101000010)*f[6] + CONSTANT(-0.218509686119999990)*f[8]; + tg = CONSTANT(0.282094791773000010)*g[0] + CONSTANT(-0.126156626101000010)*g[6] + CONSTANT(-0.218509686119999990)*g[8]; + y[1] = tf*g[1] + tg*f[1]; + t = f[1] * g[1]; + y[0] += CONSTANT(0.282094791773000010)*t; + y[6] = CONSTANT(-0.126156626101000010)*t; + y[8] = 
CONSTANT(-0.218509686119999990)*t; + + // [1,4]: 3,13,15, + tf = CONSTANT(0.218509686114999990)*f[3] + CONSTANT(-0.058399170082300000)*f[13] + CONSTANT(-0.226179013157999990)*f[15]; + tg = CONSTANT(0.218509686114999990)*g[3] + CONSTANT(-0.058399170082300000)*g[13] + CONSTANT(-0.226179013157999990)*g[15]; + y[1] += tf*g[4] + tg*f[4]; + y[4] = tf*g[1] + tg*f[1]; + t = f[1] * g[4] + f[4] * g[1]; + y[3] = CONSTANT(0.218509686114999990)*t; + y[13] = CONSTANT(-0.058399170082300000)*t; + y[15] = CONSTANT(-0.226179013157999990)*t; + + // [1,5]: 2,12, + tf = CONSTANT(0.218509686118000010)*f[2] + CONSTANT(-0.143048168103000000)*f[12]; + tg = CONSTANT(0.218509686118000010)*g[2] + CONSTANT(-0.143048168103000000)*g[12]; + y[1] += tf*g[5] + tg*f[5]; + y[5] = tf*g[1] + tg*f[1]; + t = f[1] * g[5] + f[5] * g[1]; + y[2] = CONSTANT(0.218509686118000010)*t; + y[12] = CONSTANT(-0.143048168103000000)*t; + + // [1,11]: 6,8,20,22, + tf = CONSTANT(0.202300659402999990)*f[6] + CONSTANT(0.058399170081799998)*f[8] + CONSTANT(-0.150786008773000000)*f[20] + CONSTANT(-0.168583882836999990)*f[22]; + tg = CONSTANT(0.202300659402999990)*g[6] + CONSTANT(0.058399170081799998)*g[8] + CONSTANT(-0.150786008773000000)*g[20] + CONSTANT(-0.168583882836999990)*g[22]; + y[1] += tf*g[11] + tg*f[11]; + y[11] = tf*g[1] + tg*f[1]; + t = f[1] * g[11] + f[11] * g[1]; + y[6] += CONSTANT(0.202300659402999990)*t; + y[8] += CONSTANT(0.058399170081799998)*t; + y[20] = CONSTANT(-0.150786008773000000)*t; + y[22] = CONSTANT(-0.168583882836999990)*t; + + // [1,16]: 15,33,35, + tf = CONSTANT(0.230329432973999990)*f[15] + CONSTANT(-0.034723468517399998)*f[33] + CONSTANT(-0.232932108051999990)*f[35]; + tg = CONSTANT(0.230329432973999990)*g[15] + CONSTANT(-0.034723468517399998)*g[33] + CONSTANT(-0.232932108051999990)*g[35]; + y[1] += tf*g[16] + tg*f[16]; + y[16] = tf*g[1] + tg*f[1]; + t = f[1] * g[16] + f[16] * g[1]; + y[15] += CONSTANT(0.230329432973999990)*t; + y[33] = CONSTANT(-0.034723468517399998)*t; + y[35] = CONSTANT(-0.232932108051999990)*t; + + // [1,18]: 15,13,31,33, + tf = CONSTANT(0.043528171377799997)*f[15] + CONSTANT(0.168583882834000000)*f[13] + CONSTANT(-0.085054779966799998)*f[31] + CONSTANT(-0.183739324705999990)*f[33]; + tg = CONSTANT(0.043528171377799997)*g[15] + CONSTANT(0.168583882834000000)*g[13] + CONSTANT(-0.085054779966799998)*g[31] + CONSTANT(-0.183739324705999990)*g[33]; + y[1] += tf*g[18] + tg*f[18]; + y[18] = tf*g[1] + tg*f[1]; + t = f[1] * g[18] + f[18] * g[1]; + y[15] += CONSTANT(0.043528171377799997)*t; + y[13] += CONSTANT(0.168583882834000000)*t; + y[31] = CONSTANT(-0.085054779966799998)*t; + y[33] += CONSTANT(-0.183739324705999990)*t; + + // [1,19]: 14,12,30,32, + tf = CONSTANT(0.075393004386399995)*f[14] + CONSTANT(0.194663900273000010)*f[12] + CONSTANT(-0.155288072037000010)*f[30] + CONSTANT(-0.159122922869999990)*f[32]; + tg = CONSTANT(0.075393004386399995)*g[14] + CONSTANT(0.194663900273000010)*g[12] + CONSTANT(-0.155288072037000010)*g[30] + CONSTANT(-0.159122922869999990)*g[32]; + y[1] += tf*g[19] + tg*f[19]; + y[19] = tf*g[1] + tg*f[1]; + t = f[1] * g[19] + f[19] * g[1]; + y[14] = CONSTANT(0.075393004386399995)*t; + y[12] += CONSTANT(0.194663900273000010)*t; + y[30] = CONSTANT(-0.155288072037000010)*t; + y[32] = CONSTANT(-0.159122922869999990)*t; + + // [1,24]: 9,25,27, + tf = CONSTANT(-0.230329432978999990)*f[9] + CONSTANT(0.232932108049000000)*f[25] + CONSTANT(0.034723468517100002)*f[27]; + tg = CONSTANT(-0.230329432978999990)*g[9] + CONSTANT(0.232932108049000000)*g[25] + CONSTANT(0.034723468517100002)*g[27]; 
+ y[1] += tf*g[24] + tg*f[24]; + y[24] = tf*g[1] + tg*f[1]; + t = f[1] * g[24] + f[24] * g[1]; + y[9] = CONSTANT(-0.230329432978999990)*t; + y[25] = CONSTANT(0.232932108049000000)*t; + y[27] = CONSTANT(0.034723468517100002)*t; + + // [1,29]: 22,20, + tf = CONSTANT(0.085054779965999999)*f[22] + CONSTANT(0.190188269815000010)*f[20]; + tg = CONSTANT(0.085054779965999999)*g[22] + CONSTANT(0.190188269815000010)*g[20]; + y[1] += tf*g[29] + tg*f[29]; + y[29] = tf*g[1] + tg*f[1]; + t = f[1] * g[29] + f[29] * g[1]; + y[22] += CONSTANT(0.085054779965999999)*t; + y[20] += CONSTANT(0.190188269815000010)*t; + + // [2,2]: 0,6, + tf = CONSTANT(0.282094795249000000)*f[0] + CONSTANT(0.252313259986999990)*f[6]; + tg = CONSTANT(0.282094795249000000)*g[0] + CONSTANT(0.252313259986999990)*g[6]; + y[2] += tf*g[2] + tg*f[2]; + t = f[2] * g[2]; + y[0] += CONSTANT(0.282094795249000000)*t; + y[6] += CONSTANT(0.252313259986999990)*t; + + // [2,12]: 6,20, + tf = CONSTANT(0.247766706973999990)*f[6] + CONSTANT(0.246232537174000010)*f[20]; + tg = CONSTANT(0.247766706973999990)*g[6] + CONSTANT(0.246232537174000010)*g[20]; + y[2] += tf*g[12] + tg*f[12]; + y[12] += tf*g[2] + tg*f[2]; + t = f[2] * g[12] + f[12] * g[2]; + y[6] += CONSTANT(0.247766706973999990)*t; + y[20] += CONSTANT(0.246232537174000010)*t; + + // [2,20]: 30, + tf = CONSTANT(0.245532020560000010)*f[30]; + tg = CONSTANT(0.245532020560000010)*g[30]; + y[2] += tf*g[20] + tg*f[20]; + y[20] += tf*g[2] + tg*f[2]; + t = f[2] * g[20] + f[20] * g[2]; + y[30] += CONSTANT(0.245532020560000010)*t; + + // [3,3]: 0,6,8, + tf = CONSTANT(0.282094791773000010)*f[0] + CONSTANT(-0.126156626101000010)*f[6] + CONSTANT(0.218509686119999990)*f[8]; + tg = CONSTANT(0.282094791773000010)*g[0] + CONSTANT(-0.126156626101000010)*g[6] + CONSTANT(0.218509686119999990)*g[8]; + y[3] += tf*g[3] + tg*f[3]; + t = f[3] * g[3]; + y[0] += CONSTANT(0.282094791773000010)*t; + y[6] += CONSTANT(-0.126156626101000010)*t; + y[8] += CONSTANT(0.218509686119999990)*t; + + // [3,7]: 2,12, + tf = CONSTANT(0.218509686118000010)*f[2] + CONSTANT(-0.143048168103000000)*f[12]; + tg = CONSTANT(0.218509686118000010)*g[2] + CONSTANT(-0.143048168103000000)*g[12]; + y[3] += tf*g[7] + tg*f[7]; + y[7] = tf*g[3] + tg*f[3]; + t = f[3] * g[7] + f[7] * g[3]; + y[2] += CONSTANT(0.218509686118000010)*t; + y[12] += CONSTANT(-0.143048168103000000)*t; + + // [3,13]: 8,6,20,22, + tf = CONSTANT(-0.058399170081799998)*f[8] + CONSTANT(0.202300659402999990)*f[6] + CONSTANT(-0.150786008773000000)*f[20] + CONSTANT(0.168583882836999990)*f[22]; + tg = CONSTANT(-0.058399170081799998)*g[8] + CONSTANT(0.202300659402999990)*g[6] + CONSTANT(-0.150786008773000000)*g[20] + CONSTANT(0.168583882836999990)*g[22]; + y[3] += tf*g[13] + tg*f[13]; + y[13] += tf*g[3] + tg*f[3]; + t = f[3] * g[13] + f[13] * g[3]; + y[8] += CONSTANT(-0.058399170081799998)*t; + y[6] += CONSTANT(0.202300659402999990)*t; + y[20] += CONSTANT(-0.150786008773000000)*t; + y[22] += CONSTANT(0.168583882836999990)*t; + + // [3,16]: 9,25,27, + tf = CONSTANT(0.230329432973999990)*f[9] + CONSTANT(0.232932108051999990)*f[25] + CONSTANT(-0.034723468517399998)*f[27]; + tg = CONSTANT(0.230329432973999990)*g[9] + CONSTANT(0.232932108051999990)*g[25] + CONSTANT(-0.034723468517399998)*g[27]; + y[3] += tf*g[16] + tg*f[16]; + y[16] += tf*g[3] + tg*f[3]; + t = f[3] * g[16] + f[16] * g[3]; + y[9] += CONSTANT(0.230329432973999990)*t; + y[25] += CONSTANT(0.232932108051999990)*t; + y[27] += CONSTANT(-0.034723468517399998)*t; + + // [3,21]: 12,14,30,32, + tf = CONSTANT(0.194663900273000010)*f[12] 
+ CONSTANT(-0.075393004386399995)*f[14] + CONSTANT(-0.155288072037000010)*f[30] + CONSTANT(0.159122922869999990)*f[32]; + tg = CONSTANT(0.194663900273000010)*g[12] + CONSTANT(-0.075393004386399995)*g[14] + CONSTANT(-0.155288072037000010)*g[30] + CONSTANT(0.159122922869999990)*g[32]; + y[3] += tf*g[21] + tg*f[21]; + y[21] = tf*g[3] + tg*f[3]; + t = f[3] * g[21] + f[21] * g[3]; + y[12] += CONSTANT(0.194663900273000010)*t; + y[14] += CONSTANT(-0.075393004386399995)*t; + y[30] += CONSTANT(-0.155288072037000010)*t; + y[32] += CONSTANT(0.159122922869999990)*t; + + // [3,24]: 15,33,35, + tf = CONSTANT(0.230329432978999990)*f[15] + CONSTANT(-0.034723468517100002)*f[33] + CONSTANT(0.232932108049000000)*f[35]; + tg = CONSTANT(0.230329432978999990)*g[15] + CONSTANT(-0.034723468517100002)*g[33] + CONSTANT(0.232932108049000000)*g[35]; + y[3] += tf*g[24] + tg*f[24]; + y[24] += tf*g[3] + tg*f[3]; + t = f[3] * g[24] + f[24] * g[3]; + y[15] += CONSTANT(0.230329432978999990)*t; + y[33] += CONSTANT(-0.034723468517100002)*t; + y[35] += CONSTANT(0.232932108049000000)*t; + + // [3,31]: 20,22, + tf = CONSTANT(0.190188269815000010)*f[20] + CONSTANT(-0.085054779965999999)*f[22]; + tg = CONSTANT(0.190188269815000010)*g[20] + CONSTANT(-0.085054779965999999)*g[22]; + y[3] += tf*g[31] + tg*f[31]; + y[31] += tf*g[3] + tg*f[3]; + t = f[3] * g[31] + f[31] * g[3]; + y[20] += CONSTANT(0.190188269815000010)*t; + y[22] += CONSTANT(-0.085054779965999999)*t; + + // [4,4]: 0,6,20,24, + tf = CONSTANT(0.282094791770000020)*f[0] + CONSTANT(-0.180223751576000010)*f[6] + CONSTANT(0.040299255967500003)*f[20] + CONSTANT(-0.238413613505999990)*f[24]; + tg = CONSTANT(0.282094791770000020)*g[0] + CONSTANT(-0.180223751576000010)*g[6] + CONSTANT(0.040299255967500003)*g[20] + CONSTANT(-0.238413613505999990)*g[24]; + y[4] += tf*g[4] + tg*f[4]; + t = f[4] * g[4]; + y[0] += CONSTANT(0.282094791770000020)*t; + y[6] += CONSTANT(-0.180223751576000010)*t; + y[20] += CONSTANT(0.040299255967500003)*t; + y[24] += CONSTANT(-0.238413613505999990)*t; + + // [4,5]: 7,21,23, + tf = CONSTANT(0.156078347226000000)*f[7] + CONSTANT(-0.063718718434399996)*f[21] + CONSTANT(-0.168583882835000000)*f[23]; + tg = CONSTANT(0.156078347226000000)*g[7] + CONSTANT(-0.063718718434399996)*g[21] + CONSTANT(-0.168583882835000000)*g[23]; + y[4] += tf*g[5] + tg*f[5]; + y[5] += tf*g[4] + tg*f[4]; + t = f[4] * g[5] + f[5] * g[4]; + y[7] += CONSTANT(0.156078347226000000)*t; + y[21] += CONSTANT(-0.063718718434399996)*t; + y[23] = CONSTANT(-0.168583882835000000)*t; + + // [4,9]: 3,13,31,35, + tf = CONSTANT(0.226179013157999990)*f[3] + CONSTANT(-0.094031597258400004)*f[13] + CONSTANT(0.016943317729299998)*f[31] + CONSTANT(-0.245532000542000000)*f[35]; + tg = CONSTANT(0.226179013157999990)*g[3] + CONSTANT(-0.094031597258400004)*g[13] + CONSTANT(0.016943317729299998)*g[31] + CONSTANT(-0.245532000542000000)*g[35]; + y[4] += tf*g[9] + tg*f[9]; + y[9] += tf*g[4] + tg*f[4]; + t = f[4] * g[9] + f[9] * g[4]; + y[3] += CONSTANT(0.226179013157999990)*t; + y[13] += CONSTANT(-0.094031597258400004)*t; + y[31] += CONSTANT(0.016943317729299998)*t; + y[35] += CONSTANT(-0.245532000542000000)*t; + + // [4,10]: 2,12,30,34, + tf = CONSTANT(0.184674390919999990)*f[2] + CONSTANT(-0.188063194517999990)*f[12] + CONSTANT(0.053579475144400000)*f[30] + CONSTANT(-0.190188269816000010)*f[34]; + tg = CONSTANT(0.184674390919999990)*g[2] + CONSTANT(-0.188063194517999990)*g[12] + CONSTANT(0.053579475144400000)*g[30] + CONSTANT(-0.190188269816000010)*g[34]; + y[4] += tf*g[10] + tg*f[10]; + y[10] = tf*g[4] + 
tg*f[4]; + t = f[4] * g[10] + f[10] * g[4]; + y[2] += CONSTANT(0.184674390919999990)*t; + y[12] += CONSTANT(-0.188063194517999990)*t; + y[30] += CONSTANT(0.053579475144400000)*t; + y[34] = CONSTANT(-0.190188269816000010)*t; + + // [4,11]: 3,13,15,31,33, + tf = CONSTANT(-0.058399170082300000)*f[3] + CONSTANT(0.145673124078000010)*f[13] + CONSTANT(0.094031597258400004)*f[15] + CONSTANT(-0.065621187395699998)*f[31] + CONSTANT(-0.141757966610000010)*f[33]; + tg = CONSTANT(-0.058399170082300000)*g[3] + CONSTANT(0.145673124078000010)*g[13] + CONSTANT(0.094031597258400004)*g[15] + CONSTANT(-0.065621187395699998)*g[31] + CONSTANT(-0.141757966610000010)*g[33]; + y[4] += tf*g[11] + tg*f[11]; + y[11] += tf*g[4] + tg*f[4]; + t = f[4] * g[11] + f[11] * g[4]; + y[3] += CONSTANT(-0.058399170082300000)*t; + y[13] += CONSTANT(0.145673124078000010)*t; + y[15] += CONSTANT(0.094031597258400004)*t; + y[31] += CONSTANT(-0.065621187395699998)*t; + y[33] += CONSTANT(-0.141757966610000010)*t; + + // [4,16]: 8,22, + tf = CONSTANT(0.238413613494000000)*f[8] + CONSTANT(-0.075080816693699995)*f[22]; + tg = CONSTANT(0.238413613494000000)*g[8] + CONSTANT(-0.075080816693699995)*g[22]; + y[4] += tf*g[16] + tg*f[16]; + y[16] += tf*g[4] + tg*f[4]; + t = f[4] * g[16] + f[16] * g[4]; + y[8] += CONSTANT(0.238413613494000000)*t; + y[22] += CONSTANT(-0.075080816693699995)*t; + + // [4,18]: 6,20,24, + tf = CONSTANT(0.156078347226000000)*f[6] + CONSTANT(-0.190364615029000010)*f[20] + CONSTANT(0.075080816691500005)*f[24]; + tg = CONSTANT(0.156078347226000000)*g[6] + CONSTANT(-0.190364615029000010)*g[20] + CONSTANT(0.075080816691500005)*g[24]; + y[4] += tf*g[18] + tg*f[18]; + y[18] += tf*g[4] + tg*f[4]; + t = f[4] * g[18] + f[18] * g[4]; + y[6] += CONSTANT(0.156078347226000000)*t; + y[20] += CONSTANT(-0.190364615029000010)*t; + y[24] += CONSTANT(0.075080816691500005)*t; + + // [4,19]: 7,21,23, + tf = CONSTANT(-0.063718718434399996)*f[7] + CONSTANT(0.141889406569999990)*f[21] + CONSTANT(0.112621225039000000)*f[23]; + tg = CONSTANT(-0.063718718434399996)*g[7] + CONSTANT(0.141889406569999990)*g[21] + CONSTANT(0.112621225039000000)*g[23]; + y[4] += tf*g[19] + tg*f[19]; + y[19] += tf*g[4] + tg*f[4]; + t = f[4] * g[19] + f[19] * g[4]; + y[7] += CONSTANT(-0.063718718434399996)*t; + y[21] += CONSTANT(0.141889406569999990)*t; + y[23] += CONSTANT(0.112621225039000000)*t; + + // [4,25]: 15,33, + tf = CONSTANT(0.245532000542000000)*f[15] + CONSTANT(-0.062641347680800000)*f[33]; + tg = CONSTANT(0.245532000542000000)*g[15] + CONSTANT(-0.062641347680800000)*g[33]; + y[4] += tf*g[25] + tg*f[25]; + y[25] += tf*g[4] + tg*f[4]; + t = f[4] * g[25] + f[25] * g[4]; + y[15] += CONSTANT(0.245532000542000000)*t; + y[33] += CONSTANT(-0.062641347680800000)*t; + + // [4,26]: 14,32, + tf = CONSTANT(0.190188269806999990)*f[14] + CONSTANT(-0.097043558542400002)*f[32]; + tg = CONSTANT(0.190188269806999990)*g[14] + CONSTANT(-0.097043558542400002)*g[32]; + y[4] += tf*g[26] + tg*f[26]; + y[26] = tf*g[4] + tg*f[4]; + t = f[4] * g[26] + f[26] * g[4]; + y[14] += CONSTANT(0.190188269806999990)*t; + y[32] += CONSTANT(-0.097043558542400002)*t; + + // [4,27]: 13,31,35, + tf = CONSTANT(0.141757966610000010)*f[13] + CONSTANT(-0.121034582549000000)*f[31] + CONSTANT(0.062641347680800000)*f[35]; + tg = CONSTANT(0.141757966610000010)*g[13] + CONSTANT(-0.121034582549000000)*g[31] + CONSTANT(0.062641347680800000)*g[35]; + y[4] += tf*g[27] + tg*f[27]; + y[27] += tf*g[4] + tg*f[4]; + t = f[4] * g[27] + f[27] * g[4]; + y[13] += CONSTANT(0.141757966610000010)*t; + y[31] += 
CONSTANT(-0.121034582549000000)*t; + y[35] += CONSTANT(0.062641347680800000)*t; + + // [4,28]: 12,30,34, + tf = CONSTANT(0.141757966609000000)*f[12] + CONSTANT(-0.191372478254000000)*f[30] + CONSTANT(0.097043558538899996)*f[34]; + tg = CONSTANT(0.141757966609000000)*g[12] + CONSTANT(-0.191372478254000000)*g[30] + CONSTANT(0.097043558538899996)*g[34]; + y[4] += tf*g[28] + tg*f[28]; + y[28] = tf*g[4] + tg*f[4]; + t = f[4] * g[28] + f[28] * g[4]; + y[12] += CONSTANT(0.141757966609000000)*t; + y[30] += CONSTANT(-0.191372478254000000)*t; + y[34] += CONSTANT(0.097043558538899996)*t; + + // [4,29]: 13,15,31,33, + tf = CONSTANT(-0.065621187395699998)*f[13] + CONSTANT(-0.016943317729299998)*f[15] + CONSTANT(0.140070311613999990)*f[31] + CONSTANT(0.121034582549000000)*f[33]; + tg = CONSTANT(-0.065621187395699998)*g[13] + CONSTANT(-0.016943317729299998)*g[15] + CONSTANT(0.140070311613999990)*g[31] + CONSTANT(0.121034582549000000)*g[33]; + y[4] += tf*g[29] + tg*f[29]; + y[29] += tf*g[4] + tg*f[4]; + t = f[4] * g[29] + f[29] * g[4]; + y[13] += CONSTANT(-0.065621187395699998)*t; + y[15] += CONSTANT(-0.016943317729299998)*t; + y[31] += CONSTANT(0.140070311613999990)*t; + y[33] += CONSTANT(0.121034582549000000)*t; + + // [5,5]: 0,6,8,20,22, + tf = CONSTANT(0.282094791773999990)*f[0] + CONSTANT(0.090111875786499998)*f[6] + CONSTANT(-0.156078347227999990)*f[8] + CONSTANT(-0.161197023870999990)*f[20] + CONSTANT(-0.180223751574000000)*f[22]; + tg = CONSTANT(0.282094791773999990)*g[0] + CONSTANT(0.090111875786499998)*g[6] + CONSTANT(-0.156078347227999990)*g[8] + CONSTANT(-0.161197023870999990)*g[20] + CONSTANT(-0.180223751574000000)*g[22]; + y[5] += tf*g[5] + tg*f[5]; + t = f[5] * g[5]; + y[0] += CONSTANT(0.282094791773999990)*t; + y[6] += CONSTANT(0.090111875786499998)*t; + y[8] += CONSTANT(-0.156078347227999990)*t; + y[20] += CONSTANT(-0.161197023870999990)*t; + y[22] += CONSTANT(-0.180223751574000000)*t; + + // [5,10]: 3,13,15,31,33, + tf = CONSTANT(0.184674390919999990)*f[3] + CONSTANT(0.115164716490000000)*f[13] + CONSTANT(-0.148677009678999990)*f[15] + CONSTANT(-0.083004965974099995)*f[31] + CONSTANT(-0.179311220383999990)*f[33]; + tg = CONSTANT(0.184674390919999990)*g[3] + CONSTANT(0.115164716490000000)*g[13] + CONSTANT(-0.148677009678999990)*g[15] + CONSTANT(-0.083004965974099995)*g[31] + CONSTANT(-0.179311220383999990)*g[33]; + y[5] += tf*g[10] + tg*f[10]; + y[10] += tf*g[5] + tg*f[5]; + t = f[5] * g[10] + f[10] * g[5]; + y[3] += CONSTANT(0.184674390919999990)*t; + y[13] += CONSTANT(0.115164716490000000)*t; + y[15] += CONSTANT(-0.148677009678999990)*t; + y[31] += CONSTANT(-0.083004965974099995)*t; + y[33] += CONSTANT(-0.179311220383999990)*t; + + // [5,11]: 2,12,14,30,32, + tf = CONSTANT(0.233596680327000010)*f[2] + CONSTANT(0.059470803871800003)*f[12] + CONSTANT(-0.115164716491000000)*f[14] + CONSTANT(-0.169433177294000010)*f[30] + CONSTANT(-0.173617342585000000)*f[32]; + tg = CONSTANT(0.233596680327000010)*g[2] + CONSTANT(0.059470803871800003)*g[12] + CONSTANT(-0.115164716491000000)*g[14] + CONSTANT(-0.169433177294000010)*g[30] + CONSTANT(-0.173617342585000000)*g[32]; + y[5] += tf*g[11] + tg*f[11]; + y[11] += tf*g[5] + tg*f[5]; + t = f[5] * g[11] + f[11] * g[5]; + y[2] += CONSTANT(0.233596680327000010)*t; + y[12] += CONSTANT(0.059470803871800003)*t; + y[14] += CONSTANT(-0.115164716491000000)*t; + y[30] += CONSTANT(-0.169433177294000010)*t; + y[32] += CONSTANT(-0.173617342585000000)*t; + + // [5,14]: 9,1,27,29, + tf = CONSTANT(0.148677009677999990)*f[9] + CONSTANT(-0.184674390923000000)*f[1] + 
CONSTANT(0.179311220382000010)*f[27] + CONSTANT(0.083004965973399999)*f[29]; + tg = CONSTANT(0.148677009677999990)*g[9] + CONSTANT(-0.184674390923000000)*g[1] + CONSTANT(0.179311220382000010)*g[27] + CONSTANT(0.083004965973399999)*g[29]; + y[5] += tf*g[14] + tg*f[14]; + y[14] += tf*g[5] + tg*f[5]; + t = f[5] * g[14] + f[14] * g[5]; + y[9] += CONSTANT(0.148677009677999990)*t; + y[1] += CONSTANT(-0.184674390923000000)*t; + y[27] += CONSTANT(0.179311220382000010)*t; + y[29] += CONSTANT(0.083004965973399999)*t; + + // [5,17]: 8,22,24, + tf = CONSTANT(0.168583882832999990)*f[8] + CONSTANT(0.132725386548000010)*f[22] + CONSTANT(-0.140463346189000000)*f[24]; + tg = CONSTANT(0.168583882832999990)*g[8] + CONSTANT(0.132725386548000010)*g[22] + CONSTANT(-0.140463346189000000)*g[24]; + y[5] += tf*g[17] + tg*f[17]; + y[17] = tf*g[5] + tg*f[5]; + t = f[5] * g[17] + f[17] * g[5]; + y[8] += CONSTANT(0.168583882832999990)*t; + y[22] += CONSTANT(0.132725386548000010)*t; + y[24] += CONSTANT(-0.140463346189000000)*t; + + // [5,18]: 7,21,23, + tf = CONSTANT(0.180223751571000010)*f[7] + CONSTANT(0.090297865407399994)*f[21] + CONSTANT(-0.132725386549000010)*f[23]; + tg = CONSTANT(0.180223751571000010)*g[7] + CONSTANT(0.090297865407399994)*g[21] + CONSTANT(-0.132725386549000010)*g[23]; + y[5] += tf*g[18] + tg*f[18]; + y[18] += tf*g[5] + tg*f[5]; + t = f[5] * g[18] + f[18] * g[5]; + y[7] += CONSTANT(0.180223751571000010)*t; + y[21] += CONSTANT(0.090297865407399994)*t; + y[23] += CONSTANT(-0.132725386549000010)*t; + + // [5,19]: 6,8,20,22, + tf = CONSTANT(0.220728115440999990)*f[6] + CONSTANT(0.063718718433900007)*f[8] + CONSTANT(0.044869370061299998)*f[20] + CONSTANT(-0.090297865408399999)*f[22]; + tg = CONSTANT(0.220728115440999990)*g[6] + CONSTANT(0.063718718433900007)*g[8] + CONSTANT(0.044869370061299998)*g[20] + CONSTANT(-0.090297865408399999)*g[22]; + y[5] += tf*g[19] + tg*f[19]; + y[19] += tf*g[5] + tg*f[5]; + t = f[5] * g[19] + f[19] * g[5]; + y[6] += CONSTANT(0.220728115440999990)*t; + y[8] += CONSTANT(0.063718718433900007)*t; + y[20] += CONSTANT(0.044869370061299998)*t; + y[22] += CONSTANT(-0.090297865408399999)*t; + + // [5,26]: 15,33,35, + tf = CONSTANT(0.155288072035000000)*f[15] + CONSTANT(0.138662534056999990)*f[33] + CONSTANT(-0.132882365179999990)*f[35]; + tg = CONSTANT(0.155288072035000000)*g[15] + CONSTANT(0.138662534056999990)*g[33] + CONSTANT(-0.132882365179999990)*g[35]; + y[5] += tf*g[26] + tg*f[26]; + y[26] += tf*g[5] + tg*f[5]; + t = f[5] * g[26] + f[26] * g[5]; + y[15] += CONSTANT(0.155288072035000000)*t; + y[33] += CONSTANT(0.138662534056999990)*t; + y[35] += CONSTANT(-0.132882365179999990)*t; + + // [5,28]: 15,13,31,33, + tf = CONSTANT(0.044827805096399997)*f[15] + CONSTANT(0.173617342584000000)*f[13] + CONSTANT(0.074118242118699995)*f[31] + CONSTANT(-0.114366930522000000)*f[33]; + tg = CONSTANT(0.044827805096399997)*g[15] + CONSTANT(0.173617342584000000)*g[13] + CONSTANT(0.074118242118699995)*g[31] + CONSTANT(-0.114366930522000000)*g[33]; + y[5] += tf*g[28] + tg*f[28]; + y[28] += tf*g[5] + tg*f[5]; + t = f[5] * g[28] + f[28] * g[5]; + y[15] += CONSTANT(0.044827805096399997)*t; + y[13] += CONSTANT(0.173617342584000000)*t; + y[31] += CONSTANT(0.074118242118699995)*t; + y[33] += CONSTANT(-0.114366930522000000)*t; + + // [5,29]: 12,30,32, + tf = CONSTANT(0.214317900578999990)*f[12] + CONSTANT(0.036165998945399999)*f[30] + CONSTANT(-0.074118242119099995)*f[32]; + tg = CONSTANT(0.214317900578999990)*g[12] + CONSTANT(0.036165998945399999)*g[30] + CONSTANT(-0.074118242119099995)*g[32]; + y[5] 
+= tf*g[29] + tg*f[29]; + y[29] += tf*g[5] + tg*f[5]; + t = f[5] * g[29] + f[29] * g[5]; + y[12] += CONSTANT(0.214317900578999990)*t; + y[30] += CONSTANT(0.036165998945399999)*t; + y[32] += CONSTANT(-0.074118242119099995)*t; + + // [5,32]: 9,27, + tf = CONSTANT(-0.044827805096799997)*f[9] + CONSTANT(0.114366930522000000)*f[27]; + tg = CONSTANT(-0.044827805096799997)*g[9] + CONSTANT(0.114366930522000000)*g[27]; + y[5] += tf*g[32] + tg*f[32]; + y[32] += tf*g[5] + tg*f[5]; + t = f[5] * g[32] + f[32] * g[5]; + y[9] += CONSTANT(-0.044827805096799997)*t; + y[27] += CONSTANT(0.114366930522000000)*t; + + // [5,34]: 9,27,25, + tf = CONSTANT(-0.155288072036000010)*f[9] + CONSTANT(-0.138662534059000000)*f[27] + CONSTANT(0.132882365179000010)*f[25]; + tg = CONSTANT(-0.155288072036000010)*g[9] + CONSTANT(-0.138662534059000000)*g[27] + CONSTANT(0.132882365179000010)*g[25]; + y[5] += tf*g[34] + tg*f[34]; + y[34] += tf*g[5] + tg*f[5]; + t = f[5] * g[34] + f[34] * g[5]; + y[9] += CONSTANT(-0.155288072036000010)*t; + y[27] += CONSTANT(-0.138662534059000000)*t; + y[25] += CONSTANT(0.132882365179000010)*t; + + // [6,6]: 0,6,20, + tf = CONSTANT(0.282094797560000000)*f[0] + CONSTANT(0.241795553185999990)*f[20]; + tg = CONSTANT(0.282094797560000000)*g[0] + CONSTANT(0.241795553185999990)*g[20]; + y[6] += tf*g[6] + tg*f[6]; + t = f[6] * g[6]; + y[0] += CONSTANT(0.282094797560000000)*t; + y[6] += CONSTANT(0.180223764527000010)*t; + y[20] += CONSTANT(0.241795553185999990)*t; + + // [7,7]: 6,0,8,20,22, + tf = CONSTANT(0.090111875786499998)*f[6] + CONSTANT(0.282094791773999990)*f[0] + CONSTANT(0.156078347227999990)*f[8] + CONSTANT(-0.161197023870999990)*f[20] + CONSTANT(0.180223751574000000)*f[22]; + tg = CONSTANT(0.090111875786499998)*g[6] + CONSTANT(0.282094791773999990)*g[0] + CONSTANT(0.156078347227999990)*g[8] + CONSTANT(-0.161197023870999990)*g[20] + CONSTANT(0.180223751574000000)*g[22]; + y[7] += tf*g[7] + tg*f[7]; + t = f[7] * g[7]; + y[6] += CONSTANT(0.090111875786499998)*t; + y[0] += CONSTANT(0.282094791773999990)*t; + y[8] += CONSTANT(0.156078347227999990)*t; + y[20] += CONSTANT(-0.161197023870999990)*t; + y[22] += CONSTANT(0.180223751574000000)*t; + + // [7,10]: 9,1,11,27,29, + tf = CONSTANT(0.148677009678999990)*f[9] + CONSTANT(0.184674390919999990)*f[1] + CONSTANT(0.115164716490000000)*f[11] + CONSTANT(0.179311220383999990)*f[27] + CONSTANT(-0.083004965974099995)*f[29]; + tg = CONSTANT(0.148677009678999990)*g[9] + CONSTANT(0.184674390919999990)*g[1] + CONSTANT(0.115164716490000000)*g[11] + CONSTANT(0.179311220383999990)*g[27] + CONSTANT(-0.083004965974099995)*g[29]; + y[7] += tf*g[10] + tg*f[10]; + y[10] += tf*g[7] + tg*f[7]; + t = f[7] * g[10] + f[10] * g[7]; + y[9] += CONSTANT(0.148677009678999990)*t; + y[1] += CONSTANT(0.184674390919999990)*t; + y[11] += CONSTANT(0.115164716490000000)*t; + y[27] += CONSTANT(0.179311220383999990)*t; + y[29] += CONSTANT(-0.083004965974099995)*t; + + // [7,13]: 12,2,14,30,32, + tf = CONSTANT(0.059470803871800003)*f[12] + CONSTANT(0.233596680327000010)*f[2] + CONSTANT(0.115164716491000000)*f[14] + CONSTANT(-0.169433177294000010)*f[30] + CONSTANT(0.173617342585000000)*f[32]; + tg = CONSTANT(0.059470803871800003)*g[12] + CONSTANT(0.233596680327000010)*g[2] + CONSTANT(0.115164716491000000)*g[14] + CONSTANT(-0.169433177294000010)*g[30] + CONSTANT(0.173617342585000000)*g[32]; + y[7] += tf*g[13] + tg*f[13]; + y[13] += tf*g[7] + tg*f[7]; + t = f[7] * g[13] + f[13] * g[7]; + y[12] += CONSTANT(0.059470803871800003)*t; + y[2] += CONSTANT(0.233596680327000010)*t; + y[14] += 
CONSTANT(0.115164716491000000)*t; + y[30] += CONSTANT(-0.169433177294000010)*t; + y[32] += CONSTANT(0.173617342585000000)*t; + + // [7,14]: 3,15,31,33, + tf = CONSTANT(0.184674390923000000)*f[3] + CONSTANT(0.148677009677999990)*f[15] + CONSTANT(-0.083004965973399999)*f[31] + CONSTANT(0.179311220382000010)*f[33]; + tg = CONSTANT(0.184674390923000000)*g[3] + CONSTANT(0.148677009677999990)*g[15] + CONSTANT(-0.083004965973399999)*g[31] + CONSTANT(0.179311220382000010)*g[33]; + y[7] += tf*g[14] + tg*f[14]; + y[14] += tf*g[7] + tg*f[7]; + t = f[7] * g[14] + f[14] * g[7]; + y[3] += CONSTANT(0.184674390923000000)*t; + y[15] += CONSTANT(0.148677009677999990)*t; + y[31] += CONSTANT(-0.083004965973399999)*t; + y[33] += CONSTANT(0.179311220382000010)*t; + + // [7,17]: 16,4,18, + tf = CONSTANT(0.140463346187999990)*f[16] + CONSTANT(0.168583882835000000)*f[4] + CONSTANT(0.132725386549000010)*f[18]; + tg = CONSTANT(0.140463346187999990)*g[16] + CONSTANT(0.168583882835000000)*g[4] + CONSTANT(0.132725386549000010)*g[18]; + y[7] += tf*g[17] + tg*f[17]; + y[17] += tf*g[7] + tg*f[7]; + t = f[7] * g[17] + f[17] * g[7]; + y[16] += CONSTANT(0.140463346187999990)*t; + y[4] += CONSTANT(0.168583882835000000)*t; + y[18] += CONSTANT(0.132725386549000010)*t; + + // [7,21]: 8,20,6,22, + tf = CONSTANT(-0.063718718433900007)*f[8] + CONSTANT(0.044869370061299998)*f[20] + CONSTANT(0.220728115440999990)*f[6] + CONSTANT(0.090297865408399999)*f[22]; + tg = CONSTANT(-0.063718718433900007)*g[8] + CONSTANT(0.044869370061299998)*g[20] + CONSTANT(0.220728115440999990)*g[6] + CONSTANT(0.090297865408399999)*g[22]; + y[7] += tf*g[21] + tg*f[21]; + y[21] += tf*g[7] + tg*f[7]; + t = f[7] * g[21] + f[21] * g[7]; + y[8] += CONSTANT(-0.063718718433900007)*t; + y[20] += CONSTANT(0.044869370061299998)*t; + y[6] += CONSTANT(0.220728115440999990)*t; + y[22] += CONSTANT(0.090297865408399999)*t; + + // [7,23]: 8,22,24, + tf = CONSTANT(0.168583882832999990)*f[8] + CONSTANT(0.132725386548000010)*f[22] + CONSTANT(0.140463346189000000)*f[24]; + tg = CONSTANT(0.168583882832999990)*g[8] + CONSTANT(0.132725386548000010)*g[22] + CONSTANT(0.140463346189000000)*g[24]; + y[7] += tf*g[23] + tg*f[23]; + y[23] += tf*g[7] + tg*f[7]; + t = f[7] * g[23] + f[23] * g[7]; + y[8] += CONSTANT(0.168583882832999990)*t; + y[22] += CONSTANT(0.132725386548000010)*t; + y[24] += CONSTANT(0.140463346189000000)*t; + + // [7,26]: 9,25,27, + tf = CONSTANT(0.155288072035000000)*f[9] + CONSTANT(0.132882365179999990)*f[25] + CONSTANT(0.138662534056999990)*f[27]; + tg = CONSTANT(0.155288072035000000)*g[9] + CONSTANT(0.132882365179999990)*g[25] + CONSTANT(0.138662534056999990)*g[27]; + y[7] += tf*g[26] + tg*f[26]; + y[26] += tf*g[7] + tg*f[7]; + t = f[7] * g[26] + f[26] * g[7]; + y[9] += CONSTANT(0.155288072035000000)*t; + y[25] += CONSTANT(0.132882365179999990)*t; + y[27] += CONSTANT(0.138662534056999990)*t; + + // [7,28]: 27,11,9,29, + tf = CONSTANT(0.114366930522000000)*f[27] + CONSTANT(0.173617342584000000)*f[11] + CONSTANT(-0.044827805096399997)*f[9] + CONSTANT(0.074118242118699995)*f[29]; + tg = CONSTANT(0.114366930522000000)*g[27] + CONSTANT(0.173617342584000000)*g[11] + CONSTANT(-0.044827805096399997)*g[9] + CONSTANT(0.074118242118699995)*g[29]; + y[7] += tf*g[28] + tg*f[28]; + y[28] += tf*g[7] + tg*f[7]; + t = f[7] * g[28] + f[28] * g[7]; + y[27] += CONSTANT(0.114366930522000000)*t; + y[11] += CONSTANT(0.173617342584000000)*t; + y[9] += CONSTANT(-0.044827805096399997)*t; + y[29] += CONSTANT(0.074118242118699995)*t; + + // [7,31]: 30,12,32, + tf = 
CONSTANT(0.036165998945399999)*f[30] + CONSTANT(0.214317900578999990)*f[12] + CONSTANT(0.074118242119099995)*f[32]; + tg = CONSTANT(0.036165998945399999)*g[30] + CONSTANT(0.214317900578999990)*g[12] + CONSTANT(0.074118242119099995)*g[32]; + y[7] += tf*g[31] + tg*f[31]; + y[31] += tf*g[7] + tg*f[7]; + t = f[7] * g[31] + f[31] * g[7]; + y[30] += CONSTANT(0.036165998945399999)*t; + y[12] += CONSTANT(0.214317900578999990)*t; + y[32] += CONSTANT(0.074118242119099995)*t; + + // [7,32]: 15,33, + tf = CONSTANT(-0.044827805096799997)*f[15] + CONSTANT(0.114366930522000000)*f[33]; + tg = CONSTANT(-0.044827805096799997)*g[15] + CONSTANT(0.114366930522000000)*g[33]; + y[7] += tf*g[32] + tg*f[32]; + y[32] += tf*g[7] + tg*f[7]; + t = f[7] * g[32] + f[32] * g[7]; + y[15] += CONSTANT(-0.044827805096799997)*t; + y[33] += CONSTANT(0.114366930522000000)*t; + + // [7,34]: 15,33,35, + tf = CONSTANT(0.155288072036000010)*f[15] + CONSTANT(0.138662534059000000)*f[33] + CONSTANT(0.132882365179000010)*f[35]; + tg = CONSTANT(0.155288072036000010)*g[15] + CONSTANT(0.138662534059000000)*g[33] + CONSTANT(0.132882365179000010)*g[35]; + y[7] += tf*g[34] + tg*f[34]; + y[34] += tf*g[7] + tg*f[7]; + t = f[7] * g[34] + f[34] * g[7]; + y[15] += CONSTANT(0.155288072036000010)*t; + y[33] += CONSTANT(0.138662534059000000)*t; + y[35] += CONSTANT(0.132882365179000010)*t; + + // [8,8]: 0,6,20,24, + tf = CONSTANT(0.282094791770000020)*f[0] + CONSTANT(-0.180223751576000010)*f[6] + CONSTANT(0.040299255967500003)*f[20] + CONSTANT(0.238413613505999990)*f[24]; + tg = CONSTANT(0.282094791770000020)*g[0] + CONSTANT(-0.180223751576000010)*g[6] + CONSTANT(0.040299255967500003)*g[20] + CONSTANT(0.238413613505999990)*g[24]; + y[8] += tf*g[8] + tg*f[8]; + t = f[8] * g[8]; + y[0] += CONSTANT(0.282094791770000020)*t; + y[6] += CONSTANT(-0.180223751576000010)*t; + y[20] += CONSTANT(0.040299255967500003)*t; + y[24] += CONSTANT(0.238413613505999990)*t; + + // [8,9]: 1,11,25,29, + tf = CONSTANT(0.226179013155000000)*f[1] + CONSTANT(-0.094031597259499999)*f[11] + CONSTANT(0.245532000541000000)*f[25] + CONSTANT(0.016943317729199998)*f[29]; + tg = CONSTANT(0.226179013155000000)*g[1] + CONSTANT(-0.094031597259499999)*g[11] + CONSTANT(0.245532000541000000)*g[25] + CONSTANT(0.016943317729199998)*g[29]; + y[8] += tf*g[9] + tg*f[9]; + y[9] += tf*g[8] + tg*f[8]; + t = f[8] * g[9] + f[9] * g[8]; + y[1] += CONSTANT(0.226179013155000000)*t; + y[11] += CONSTANT(-0.094031597259499999)*t; + y[25] += CONSTANT(0.245532000541000000)*t; + y[29] += CONSTANT(0.016943317729199998)*t; + + // [8,14]: 2,12,30,34, + tf = CONSTANT(0.184674390919999990)*f[2] + CONSTANT(-0.188063194517999990)*f[12] + CONSTANT(0.053579475144400000)*f[30] + CONSTANT(0.190188269816000010)*f[34]; + tg = CONSTANT(0.184674390919999990)*g[2] + CONSTANT(-0.188063194517999990)*g[12] + CONSTANT(0.053579475144400000)*g[30] + CONSTANT(0.190188269816000010)*g[34]; + y[8] += tf*g[14] + tg*f[14]; + y[14] += tf*g[8] + tg*f[8]; + t = f[8] * g[14] + f[14] * g[8]; + y[2] += CONSTANT(0.184674390919999990)*t; + y[12] += CONSTANT(-0.188063194517999990)*t; + y[30] += CONSTANT(0.053579475144400000)*t; + y[34] += CONSTANT(0.190188269816000010)*t; + + // [8,15]: 13,3,31,35, + tf = CONSTANT(-0.094031597259499999)*f[13] + CONSTANT(0.226179013155000000)*f[3] + CONSTANT(0.016943317729199998)*f[31] + CONSTANT(0.245532000541000000)*f[35]; + tg = CONSTANT(-0.094031597259499999)*g[13] + CONSTANT(0.226179013155000000)*g[3] + CONSTANT(0.016943317729199998)*g[31] + CONSTANT(0.245532000541000000)*g[35]; + y[8] += tf*g[15] + tg*f[15]; 
+ y[15] += tf*g[8] + tg*f[8]; + t = f[8] * g[15] + f[15] * g[8]; + y[13] += CONSTANT(-0.094031597259499999)*t; + y[3] += CONSTANT(0.226179013155000000)*t; + y[31] += CONSTANT(0.016943317729199998)*t; + y[35] += CONSTANT(0.245532000541000000)*t; + + // [8,22]: 6,20,24, + tf = CONSTANT(0.156078347226000000)*f[6] + CONSTANT(-0.190364615029000010)*f[20] + CONSTANT(-0.075080816691500005)*f[24]; + tg = CONSTANT(0.156078347226000000)*g[6] + CONSTANT(-0.190364615029000010)*g[20] + CONSTANT(-0.075080816691500005)*g[24]; + y[8] += tf*g[22] + tg*f[22]; + y[22] += tf*g[8] + tg*f[8]; + t = f[8] * g[22] + f[22] * g[8]; + y[6] += CONSTANT(0.156078347226000000)*t; + y[20] += CONSTANT(-0.190364615029000010)*t; + y[24] += CONSTANT(-0.075080816691500005)*t; + + // [8,26]: 10,28, + tf = CONSTANT(0.190188269806999990)*f[10] + CONSTANT(-0.097043558542400002)*f[28]; + tg = CONSTANT(0.190188269806999990)*g[10] + CONSTANT(-0.097043558542400002)*g[28]; + y[8] += tf*g[26] + tg*f[26]; + y[26] += tf*g[8] + tg*f[8]; + t = f[8] * g[26] + f[26] * g[8]; + y[10] += CONSTANT(0.190188269806999990)*t; + y[28] += CONSTANT(-0.097043558542400002)*t; + + // [8,27]: 25,11,29, + tf = CONSTANT(-0.062641347680800000)*f[25] + CONSTANT(0.141757966609000000)*f[11] + CONSTANT(-0.121034582550000010)*f[29]; + tg = CONSTANT(-0.062641347680800000)*g[25] + CONSTANT(0.141757966609000000)*g[11] + CONSTANT(-0.121034582550000010)*g[29]; + y[8] += tf*g[27] + tg*f[27]; + y[27] += tf*g[8] + tg*f[8]; + t = f[8] * g[27] + f[27] * g[8]; + y[25] += CONSTANT(-0.062641347680800000)*t; + y[11] += CONSTANT(0.141757966609000000)*t; + y[29] += CONSTANT(-0.121034582550000010)*t; + + // [8,32]: 30,12,34, + tf = CONSTANT(-0.191372478254000000)*f[30] + CONSTANT(0.141757966609000000)*f[12] + CONSTANT(-0.097043558538899996)*f[34]; + tg = CONSTANT(-0.191372478254000000)*g[30] + CONSTANT(0.141757966609000000)*g[12] + CONSTANT(-0.097043558538899996)*g[34]; + y[8] += tf*g[32] + tg*f[32]; + y[32] += tf*g[8] + tg*f[8]; + t = f[8] * g[32] + f[32] * g[8]; + y[30] += CONSTANT(-0.191372478254000000)*t; + y[12] += CONSTANT(0.141757966609000000)*t; + y[34] += CONSTANT(-0.097043558538899996)*t; + + // [8,33]: 13,31,35, + tf = CONSTANT(0.141757966609000000)*f[13] + CONSTANT(-0.121034582550000010)*f[31] + CONSTANT(-0.062641347680800000)*f[35]; + tg = CONSTANT(0.141757966609000000)*g[13] + CONSTANT(-0.121034582550000010)*g[31] + CONSTANT(-0.062641347680800000)*g[35]; + y[8] += tf*g[33] + tg*f[33]; + y[33] += tf*g[8] + tg*f[8]; + t = f[8] * g[33] + f[33] * g[8]; + y[13] += CONSTANT(0.141757966609000000)*t; + y[31] += CONSTANT(-0.121034582550000010)*t; + y[35] += CONSTANT(-0.062641347680800000)*t; + + // [9,9]: 6,0,20, + tf = CONSTANT(-0.210261043508000010)*f[6] + CONSTANT(0.282094791766999970)*f[0] + CONSTANT(0.076934943209800002)*f[20]; + tg = CONSTANT(-0.210261043508000010)*g[6] + CONSTANT(0.282094791766999970)*g[0] + CONSTANT(0.076934943209800002)*g[20]; + y[9] += tf*g[9] + tg*f[9]; + t = f[9] * g[9]; + y[6] += CONSTANT(-0.210261043508000010)*t; + y[0] += CONSTANT(0.282094791766999970)*t; + y[20] += CONSTANT(0.076934943209800002)*t; + + // [9,17]: 2,12,30, + tf = CONSTANT(0.162867503964999990)*f[2] + CONSTANT(-0.203550726872999990)*f[12] + CONSTANT(0.098140130728100003)*f[30]; + tg = CONSTANT(0.162867503964999990)*g[2] + CONSTANT(-0.203550726872999990)*g[12] + CONSTANT(0.098140130728100003)*g[30]; + y[9] += tf*g[17] + tg*f[17]; + y[17] += tf*g[9] + tg*f[9]; + t = f[9] * g[17] + f[17] * g[9]; + y[2] += CONSTANT(0.162867503964999990)*t; + y[12] += 
CONSTANT(-0.203550726872999990)*t; + y[30] += CONSTANT(0.098140130728100003)*t; + + // [9,18]: 3,13,31,35, + tf = CONSTANT(-0.043528171377799997)*f[3] + CONSTANT(0.133255230519000010)*f[13] + CONSTANT(-0.101584686310000010)*f[31] + CONSTANT(0.098140130731999994)*f[35]; + tg = CONSTANT(-0.043528171377799997)*g[3] + CONSTANT(0.133255230519000010)*g[13] + CONSTANT(-0.101584686310000010)*g[31] + CONSTANT(0.098140130731999994)*g[35]; + y[9] += tf*g[18] + tg*f[18]; + y[18] += tf*g[9] + tg*f[9]; + t = f[9] * g[18] + f[18] * g[9]; + y[3] += CONSTANT(-0.043528171377799997)*t; + y[13] += CONSTANT(0.133255230519000010)*t; + y[31] += CONSTANT(-0.101584686310000010)*t; + y[35] += CONSTANT(0.098140130731999994)*t; + + // [9,19]: 14,32,34, + tf = CONSTANT(-0.099322584600699995)*f[14] + CONSTANT(0.126698363970000010)*f[32] + CONSTANT(0.131668802180999990)*f[34]; + tg = CONSTANT(-0.099322584600699995)*g[14] + CONSTANT(0.126698363970000010)*g[32] + CONSTANT(0.131668802180999990)*g[34]; + y[9] += tf*g[19] + tg*f[19]; + y[19] += tf*g[9] + tg*f[9]; + t = f[9] * g[19] + f[19] * g[9]; + y[14] += CONSTANT(-0.099322584600699995)*t; + y[32] += CONSTANT(0.126698363970000010)*t; + y[34] += CONSTANT(0.131668802180999990)*t; + + // [9,22]: 1,11,25,29, + tf = CONSTANT(-0.043528171378199997)*f[1] + CONSTANT(0.133255230518000010)*f[11] + CONSTANT(-0.098140130732499997)*f[25] + CONSTANT(-0.101584686311000000)*f[29]; + tg = CONSTANT(-0.043528171378199997)*g[1] + CONSTANT(0.133255230518000010)*g[11] + CONSTANT(-0.098140130732499997)*g[25] + CONSTANT(-0.101584686311000000)*g[29]; + y[9] += tf*g[22] + tg*f[22]; + y[22] += tf*g[9] + tg*f[9]; + t = f[9] * g[22] + f[22] * g[9]; + y[1] += CONSTANT(-0.043528171378199997)*t; + y[11] += CONSTANT(0.133255230518000010)*t; + y[25] += CONSTANT(-0.098140130732499997)*t; + y[29] += CONSTANT(-0.101584686311000000)*t; + + // [9,27]: 6,20, + tf = CONSTANT(0.126792179874999990)*f[6] + CONSTANT(-0.196280261464999990)*f[20]; + tg = CONSTANT(0.126792179874999990)*g[6] + CONSTANT(-0.196280261464999990)*g[20]; + y[9] += tf*g[27] + tg*f[27]; + y[27] += tf*g[9] + tg*f[9]; + t = f[9] * g[27] + f[27] * g[9]; + y[6] += CONSTANT(0.126792179874999990)*t; + y[20] += CONSTANT(-0.196280261464999990)*t; + + // [10,10]: 0,20,24, + tf = CONSTANT(0.282094791771999980)*f[0] + CONSTANT(-0.179514867494000000)*f[20] + CONSTANT(-0.151717754049000010)*f[24]; + tg = CONSTANT(0.282094791771999980)*g[0] + CONSTANT(-0.179514867494000000)*g[20] + CONSTANT(-0.151717754049000010)*g[24]; + y[10] += tf*g[10] + tg*f[10]; + t = f[10] * g[10]; + y[0] += CONSTANT(0.282094791771999980)*t; + y[20] += CONSTANT(-0.179514867494000000)*t; + y[24] += CONSTANT(-0.151717754049000010)*t; + + // [10,16]: 14,32, + tf = CONSTANT(0.151717754044999990)*f[14] + CONSTANT(-0.077413979111300005)*f[32]; + tg = CONSTANT(0.151717754044999990)*g[14] + CONSTANT(-0.077413979111300005)*g[32]; + y[10] += tf*g[16] + tg*f[16]; + y[16] += tf*g[10] + tg*f[10]; + t = f[10] * g[16] + f[16] * g[10]; + y[14] += CONSTANT(0.151717754044999990)*t; + y[32] += CONSTANT(-0.077413979111300005)*t; + + // [10,17]: 13,3,31,35, + tf = CONSTANT(0.067850242288900006)*f[13] + CONSTANT(0.199471140200000010)*f[3] + CONSTANT(-0.113793659091000000)*f[31] + CONSTANT(-0.149911525925999990)*f[35]; + tg = CONSTANT(0.067850242288900006)*g[13] + CONSTANT(0.199471140200000010)*g[3] + CONSTANT(-0.113793659091000000)*g[31] + CONSTANT(-0.149911525925999990)*g[35]; + y[10] += tf*g[17] + tg*f[17]; + y[17] += tf*g[10] + tg*f[10]; + t = f[10] * g[17] + f[17] * g[10]; + y[13] += 
CONSTANT(0.067850242288900006)*t; + y[3] += CONSTANT(0.199471140200000010)*t; + y[31] += CONSTANT(-0.113793659091000000)*t; + y[35] += CONSTANT(-0.149911525925999990)*t; + + // [10,18]: 12,2,30,34, + tf = CONSTANT(-0.044418410173299998)*f[12] + CONSTANT(0.213243618621000000)*f[2] + CONSTANT(-0.171327458205000000)*f[30] + CONSTANT(-0.101358691177000000)*f[34]; + tg = CONSTANT(-0.044418410173299998)*g[12] + CONSTANT(0.213243618621000000)*g[2] + CONSTANT(-0.171327458205000000)*g[30] + CONSTANT(-0.101358691177000000)*g[34]; + y[10] += tf*g[18] + tg*f[18]; + y[18] += tf*g[10] + tg*f[10]; + t = f[10] * g[18] + f[18] * g[10]; + y[12] += CONSTANT(-0.044418410173299998)*t; + y[2] += CONSTANT(0.213243618621000000)*t; + y[30] += CONSTANT(-0.171327458205000000)*t; + y[34] += CONSTANT(-0.101358691177000000)*t; + + // [10,19]: 3,15,13,31,33, + tf = CONSTANT(-0.075393004386799994)*f[3] + CONSTANT(0.099322584599600000)*f[15] + CONSTANT(0.102579924281000000)*f[13] + CONSTANT(0.097749909976500002)*f[31] + CONSTANT(-0.025339672794100002)*f[33]; + tg = CONSTANT(-0.075393004386799994)*g[3] + CONSTANT(0.099322584599600000)*g[15] + CONSTANT(0.102579924281000000)*g[13] + CONSTANT(0.097749909976500002)*g[31] + CONSTANT(-0.025339672794100002)*g[33]; + y[10] += tf*g[19] + tg*f[19]; + y[19] += tf*g[10] + tg*f[10]; + t = f[10] * g[19] + f[19] * g[10]; + y[3] += CONSTANT(-0.075393004386799994)*t; + y[15] += CONSTANT(0.099322584599600000)*t; + y[13] += CONSTANT(0.102579924281000000)*t; + y[31] += CONSTANT(0.097749909976500002)*t; + y[33] += CONSTANT(-0.025339672794100002)*t; + + // [10,21]: 11,1,9,27,29, + tf = CONSTANT(0.102579924281000000)*f[11] + CONSTANT(-0.075393004386799994)*f[1] + CONSTANT(-0.099322584599600000)*f[9] + CONSTANT(0.025339672794100002)*f[27] + CONSTANT(0.097749909976500002)*f[29]; + tg = CONSTANT(0.102579924281000000)*g[11] + CONSTANT(-0.075393004386799994)*g[1] + CONSTANT(-0.099322584599600000)*g[9] + CONSTANT(0.025339672794100002)*g[27] + CONSTANT(0.097749909976500002)*g[29]; + y[10] += tf*g[21] + tg*f[21]; + y[21] += tf*g[10] + tg*f[10]; + t = f[10] * g[21] + f[21] * g[10]; + y[11] += CONSTANT(0.102579924281000000)*t; + y[1] += CONSTANT(-0.075393004386799994)*t; + y[9] += CONSTANT(-0.099322584599600000)*t; + y[27] += CONSTANT(0.025339672794100002)*t; + y[29] += CONSTANT(0.097749909976500002)*t; + + // [10,23]: 11,1,25,29, + tf = CONSTANT(-0.067850242288900006)*f[11] + CONSTANT(-0.199471140200000010)*f[1] + CONSTANT(0.149911525925999990)*f[25] + CONSTANT(0.113793659091000000)*f[29]; + tg = CONSTANT(-0.067850242288900006)*g[11] + CONSTANT(-0.199471140200000010)*g[1] + CONSTANT(0.149911525925999990)*g[25] + CONSTANT(0.113793659091000000)*g[29]; + y[10] += tf*g[23] + tg*f[23]; + y[23] += tf*g[10] + tg*f[10]; + t = f[10] * g[23] + f[23] * g[10]; + y[11] += CONSTANT(-0.067850242288900006)*t; + y[1] += CONSTANT(-0.199471140200000010)*t; + y[25] += CONSTANT(0.149911525925999990)*t; + y[29] += CONSTANT(0.113793659091000000)*t; + + // [10,28]: 6,20,24, + tf = CONSTANT(0.190188269814000000)*f[6] + CONSTANT(-0.065426753820500005)*f[20] + CONSTANT(0.077413979109600004)*f[24]; + tg = CONSTANT(0.190188269814000000)*g[6] + CONSTANT(-0.065426753820500005)*g[20] + CONSTANT(0.077413979109600004)*g[24]; + y[10] += tf*g[28] + tg*f[28]; + y[28] += tf*g[10] + tg*f[10]; + t = f[10] * g[28] + f[28] * g[10]; + y[6] += CONSTANT(0.190188269814000000)*t; + y[20] += CONSTANT(-0.065426753820500005)*t; + y[24] += CONSTANT(0.077413979109600004)*t; + + // [11,11]: 0,6,8,20,22, + tf = CONSTANT(0.282094791773999990)*f[0] + 
CONSTANT(0.126156626101000010)*f[6] + CONSTANT(-0.145673124078999990)*f[8] + CONSTANT(0.025644981070299999)*f[20] + CONSTANT(-0.114687841910000000)*f[22]; + tg = CONSTANT(0.282094791773999990)*g[0] + CONSTANT(0.126156626101000010)*g[6] + CONSTANT(-0.145673124078999990)*g[8] + CONSTANT(0.025644981070299999)*g[20] + CONSTANT(-0.114687841910000000)*g[22]; + y[11] += tf*g[11] + tg*f[11]; + t = f[11] * g[11]; + y[0] += CONSTANT(0.282094791773999990)*t; + y[6] += CONSTANT(0.126156626101000010)*t; + y[8] += CONSTANT(-0.145673124078999990)*t; + y[20] += CONSTANT(0.025644981070299999)*t; + y[22] += CONSTANT(-0.114687841910000000)*t; + + // [11,16]: 15,33,35, + tf = CONSTANT(-0.117520066953000000)*f[15] + CONSTANT(0.119929220739999990)*f[33] + CONSTANT(0.134084945035999990)*f[35]; + tg = CONSTANT(-0.117520066953000000)*g[15] + CONSTANT(0.119929220739999990)*g[33] + CONSTANT(0.134084945035999990)*g[35]; + y[11] += tf*g[16] + tg*f[16]; + y[16] += tf*g[11] + tg*f[11]; + t = f[11] * g[16] + f[16] * g[11]; + y[15] += CONSTANT(-0.117520066953000000)*t; + y[33] += CONSTANT(0.119929220739999990)*t; + y[35] += CONSTANT(0.134084945035999990)*t; + + // [11,18]: 3,13,15,31,33, + tf = CONSTANT(0.168583882834000000)*f[3] + CONSTANT(0.114687841909000000)*f[13] + CONSTANT(-0.133255230519000010)*f[15] + CONSTANT(0.075189952564900006)*f[31] + CONSTANT(-0.101990215611000000)*f[33]; + tg = CONSTANT(0.168583882834000000)*g[3] + CONSTANT(0.114687841909000000)*g[13] + CONSTANT(-0.133255230519000010)*g[15] + CONSTANT(0.075189952564900006)*g[31] + CONSTANT(-0.101990215611000000)*g[33]; + y[11] += tf*g[18] + tg*f[18]; + y[18] += tf*g[11] + tg*f[11]; + t = f[11] * g[18] + f[18] * g[11]; + y[3] += CONSTANT(0.168583882834000000)*t; + y[13] += CONSTANT(0.114687841909000000)*t; + y[15] += CONSTANT(-0.133255230519000010)*t; + y[31] += CONSTANT(0.075189952564900006)*t; + y[33] += CONSTANT(-0.101990215611000000)*t; + + // [11,19]: 2,14,12,30,32, + tf = CONSTANT(0.238413613504000000)*f[2] + CONSTANT(-0.102579924282000000)*f[14] + CONSTANT(0.099322584599300004)*f[12] + CONSTANT(0.009577496073830001)*f[30] + CONSTANT(-0.104682806112000000)*f[32]; + tg = CONSTANT(0.238413613504000000)*g[2] + CONSTANT(-0.102579924282000000)*g[14] + CONSTANT(0.099322584599300004)*g[12] + CONSTANT(0.009577496073830001)*g[30] + CONSTANT(-0.104682806112000000)*g[32]; + y[11] += tf*g[19] + tg*f[19]; + y[19] += tf*g[11] + tg*f[11]; + t = f[11] * g[19] + f[19] * g[11]; + y[2] += CONSTANT(0.238413613504000000)*t; + y[14] += CONSTANT(-0.102579924282000000)*t; + y[12] += CONSTANT(0.099322584599300004)*t; + y[30] += CONSTANT(0.009577496073830001)*t; + y[32] += CONSTANT(-0.104682806112000000)*t; + + // [11,24]: 9,25,27, + tf = CONSTANT(0.117520066950999990)*f[9] + CONSTANT(-0.134084945037000000)*f[25] + CONSTANT(-0.119929220742000010)*f[27]; + tg = CONSTANT(0.117520066950999990)*g[9] + CONSTANT(-0.134084945037000000)*g[25] + CONSTANT(-0.119929220742000010)*g[27]; + y[11] += tf*g[24] + tg*f[24]; + y[24] += tf*g[11] + tg*f[11]; + t = f[11] * g[24] + f[24] * g[11]; + y[9] += CONSTANT(0.117520066950999990)*t; + y[25] += CONSTANT(-0.134084945037000000)*t; + y[27] += CONSTANT(-0.119929220742000010)*t; + + // [11,29]: 6,20,22,8, + tf = CONSTANT(0.227318461243000010)*f[6] + CONSTANT(0.086019920779800002)*f[20] + CONSTANT(-0.075189952565200002)*f[22] + CONSTANT(0.065621187395299999)*f[8]; + tg = CONSTANT(0.227318461243000010)*g[6] + CONSTANT(0.086019920779800002)*g[20] + CONSTANT(-0.075189952565200002)*g[22] + CONSTANT(0.065621187395299999)*g[8]; + y[11] += tf*g[29] + 
tg*f[29]; + y[29] += tf*g[11] + tg*f[11]; + t = f[11] * g[29] + f[29] * g[11]; + y[6] += CONSTANT(0.227318461243000010)*t; + y[20] += CONSTANT(0.086019920779800002)*t; + y[22] += CONSTANT(-0.075189952565200002)*t; + y[8] += CONSTANT(0.065621187395299999)*t; + + // [12,12]: 0,6,20, + tf = CONSTANT(0.282094799871999980)*f[0] + CONSTANT(0.168208852954000010)*f[6] + CONSTANT(0.153869910786000010)*f[20]; + tg = CONSTANT(0.282094799871999980)*g[0] + CONSTANT(0.168208852954000010)*g[6] + CONSTANT(0.153869910786000010)*g[20]; + y[12] += tf*g[12] + tg*f[12]; + t = f[12] * g[12]; + y[0] += CONSTANT(0.282094799871999980)*t; + y[6] += CONSTANT(0.168208852954000010)*t; + y[20] += CONSTANT(0.153869910786000010)*t; + + // [12,30]: 20,6, + tf = CONSTANT(0.148373961712999990)*f[20] + CONSTANT(0.239614719999000000)*f[6]; + tg = CONSTANT(0.148373961712999990)*g[20] + CONSTANT(0.239614719999000000)*g[6]; + y[12] += tf*g[30] + tg*f[30]; + y[30] += tf*g[12] + tg*f[12]; + t = f[12] * g[30] + f[30] * g[12]; + y[20] += CONSTANT(0.148373961712999990)*t; + y[6] += CONSTANT(0.239614719999000000)*t; + + // [13,13]: 0,8,6,20,22, + tf = CONSTANT(0.282094791773999990)*f[0] + CONSTANT(0.145673124078999990)*f[8] + CONSTANT(0.126156626101000010)*f[6] + CONSTANT(0.025644981070299999)*f[20] + CONSTANT(0.114687841910000000)*f[22]; + tg = CONSTANT(0.282094791773999990)*g[0] + CONSTANT(0.145673124078999990)*g[8] + CONSTANT(0.126156626101000010)*g[6] + CONSTANT(0.025644981070299999)*g[20] + CONSTANT(0.114687841910000000)*g[22]; + y[13] += tf*g[13] + tg*f[13]; + t = f[13] * g[13]; + y[0] += CONSTANT(0.282094791773999990)*t; + y[8] += CONSTANT(0.145673124078999990)*t; + y[6] += CONSTANT(0.126156626101000010)*t; + y[20] += CONSTANT(0.025644981070299999)*t; + y[22] += CONSTANT(0.114687841910000000)*t; + + // [13,16]: 9,25,27, + tf = CONSTANT(-0.117520066953000000)*f[9] + CONSTANT(-0.134084945035999990)*f[25] + CONSTANT(0.119929220739999990)*f[27]; + tg = CONSTANT(-0.117520066953000000)*g[9] + CONSTANT(-0.134084945035999990)*g[25] + CONSTANT(0.119929220739999990)*g[27]; + y[13] += tf*g[16] + tg*f[16]; + y[16] += tf*g[13] + tg*f[13]; + t = f[13] * g[16] + f[16] * g[13]; + y[9] += CONSTANT(-0.117520066953000000)*t; + y[25] += CONSTANT(-0.134084945035999990)*t; + y[27] += CONSTANT(0.119929220739999990)*t; + + // [13,21]: 2,12,14,30,32, + tf = CONSTANT(0.238413613504000000)*f[2] + CONSTANT(0.099322584599300004)*f[12] + CONSTANT(0.102579924282000000)*f[14] + CONSTANT(0.009577496073830001)*f[30] + CONSTANT(0.104682806112000000)*f[32]; + tg = CONSTANT(0.238413613504000000)*g[2] + CONSTANT(0.099322584599300004)*g[12] + CONSTANT(0.102579924282000000)*g[14] + CONSTANT(0.009577496073830001)*g[30] + CONSTANT(0.104682806112000000)*g[32]; + y[13] += tf*g[21] + tg*f[21]; + y[21] += tf*g[13] + tg*f[13]; + t = f[13] * g[21] + f[21] * g[13]; + y[2] += CONSTANT(0.238413613504000000)*t; + y[12] += CONSTANT(0.099322584599300004)*t; + y[14] += CONSTANT(0.102579924282000000)*t; + y[30] += CONSTANT(0.009577496073830001)*t; + y[32] += CONSTANT(0.104682806112000000)*t; + + // [13,24]: 15,33,35, + tf = CONSTANT(-0.117520066950999990)*f[15] + CONSTANT(0.119929220742000010)*f[33] + CONSTANT(-0.134084945037000000)*f[35]; + tg = CONSTANT(-0.117520066950999990)*g[15] + CONSTANT(0.119929220742000010)*g[33] + CONSTANT(-0.134084945037000000)*g[35]; + y[13] += tf*g[24] + tg*f[24]; + y[24] += tf*g[13] + tg*f[13]; + t = f[13] * g[24] + f[24] * g[13]; + y[15] += CONSTANT(-0.117520066950999990)*t; + y[33] += CONSTANT(0.119929220742000010)*t; + y[35] += 
CONSTANT(-0.134084945037000000)*t; + + // [13,31]: 6,22,20,8, + tf = CONSTANT(0.227318461243000010)*f[6] + CONSTANT(0.075189952565200002)*f[22] + CONSTANT(0.086019920779800002)*f[20] + CONSTANT(-0.065621187395299999)*f[8]; + tg = CONSTANT(0.227318461243000010)*g[6] + CONSTANT(0.075189952565200002)*g[22] + CONSTANT(0.086019920779800002)*g[20] + CONSTANT(-0.065621187395299999)*g[8]; + y[13] += tf*g[31] + tg*f[31]; + y[31] += tf*g[13] + tg*f[13]; + t = f[13] * g[31] + f[31] * g[13]; + y[6] += CONSTANT(0.227318461243000010)*t; + y[22] += CONSTANT(0.075189952565200002)*t; + y[20] += CONSTANT(0.086019920779800002)*t; + y[8] += CONSTANT(-0.065621187395299999)*t; + + // [14,14]: 0,20,24, + tf = CONSTANT(0.282094791771999980)*f[0] + CONSTANT(-0.179514867494000000)*f[20] + CONSTANT(0.151717754049000010)*f[24]; + tg = CONSTANT(0.282094791771999980)*g[0] + CONSTANT(-0.179514867494000000)*g[20] + CONSTANT(0.151717754049000010)*g[24]; + y[14] += tf*g[14] + tg*f[14]; + t = f[14] * g[14]; + y[0] += CONSTANT(0.282094791771999980)*t; + y[20] += CONSTANT(-0.179514867494000000)*t; + y[24] += CONSTANT(0.151717754049000010)*t; + + // [14,17]: 11,1,25,29, + tf = CONSTANT(0.067850242288500007)*f[11] + CONSTANT(0.199471140196999990)*f[1] + CONSTANT(0.149911525925999990)*f[25] + CONSTANT(-0.113793659092000000)*f[29]; + tg = CONSTANT(0.067850242288500007)*g[11] + CONSTANT(0.199471140196999990)*g[1] + CONSTANT(0.149911525925999990)*g[25] + CONSTANT(-0.113793659092000000)*g[29]; + y[14] += tf*g[17] + tg*f[17]; + y[17] += tf*g[14] + tg*f[14]; + t = f[14] * g[17] + f[17] * g[14]; + y[11] += CONSTANT(0.067850242288500007)*t; + y[1] += CONSTANT(0.199471140196999990)*t; + y[25] += CONSTANT(0.149911525925999990)*t; + y[29] += CONSTANT(-0.113793659092000000)*t; + + // [14,22]: 12,2,30,34, + tf = CONSTANT(-0.044418410173299998)*f[12] + CONSTANT(0.213243618621000000)*f[2] + CONSTANT(-0.171327458205000000)*f[30] + CONSTANT(0.101358691177000000)*f[34]; + tg = CONSTANT(-0.044418410173299998)*g[12] + CONSTANT(0.213243618621000000)*g[2] + CONSTANT(-0.171327458205000000)*g[30] + CONSTANT(0.101358691177000000)*g[34]; + y[14] += tf*g[22] + tg*f[22]; + y[22] += tf*g[14] + tg*f[14]; + t = f[14] * g[22] + f[22] * g[14]; + y[12] += CONSTANT(-0.044418410173299998)*t; + y[2] += CONSTANT(0.213243618621000000)*t; + y[30] += CONSTANT(-0.171327458205000000)*t; + y[34] += CONSTANT(0.101358691177000000)*t; + + // [14,23]: 13,3,31,35, + tf = CONSTANT(0.067850242288500007)*f[13] + CONSTANT(0.199471140196999990)*f[3] + CONSTANT(-0.113793659092000000)*f[31] + CONSTANT(0.149911525925999990)*f[35]; + tg = CONSTANT(0.067850242288500007)*g[13] + CONSTANT(0.199471140196999990)*g[3] + CONSTANT(-0.113793659092000000)*g[31] + CONSTANT(0.149911525925999990)*g[35]; + y[14] += tf*g[23] + tg*f[23]; + y[23] += tf*g[14] + tg*f[14]; + t = f[14] * g[23] + f[23] * g[14]; + y[13] += CONSTANT(0.067850242288500007)*t; + y[3] += CONSTANT(0.199471140196999990)*t; + y[31] += CONSTANT(-0.113793659092000000)*t; + y[35] += CONSTANT(0.149911525925999990)*t; + + // [14,32]: 20,6,24, + tf = CONSTANT(-0.065426753820500005)*f[20] + CONSTANT(0.190188269814000000)*f[6] + CONSTANT(-0.077413979109600004)*f[24]; + tg = CONSTANT(-0.065426753820500005)*g[20] + CONSTANT(0.190188269814000000)*g[6] + CONSTANT(-0.077413979109600004)*g[24]; + y[14] += tf*g[32] + tg*f[32]; + y[32] += tf*g[14] + tg*f[14]; + t = f[14] * g[32] + f[32] * g[14]; + y[20] += CONSTANT(-0.065426753820500005)*t; + y[6] += CONSTANT(0.190188269814000000)*t; + y[24] += CONSTANT(-0.077413979109600004)*t; + + // [15,15]: 
0,6,20, + tf = CONSTANT(0.282094791766999970)*f[0] + CONSTANT(-0.210261043508000010)*f[6] + CONSTANT(0.076934943209800002)*f[20]; + tg = CONSTANT(0.282094791766999970)*g[0] + CONSTANT(-0.210261043508000010)*g[6] + CONSTANT(0.076934943209800002)*g[20]; + y[15] += tf*g[15] + tg*f[15]; + t = f[15] * g[15]; + y[0] += CONSTANT(0.282094791766999970)*t; + y[6] += CONSTANT(-0.210261043508000010)*t; + y[20] += CONSTANT(0.076934943209800002)*t; + + // [15,21]: 14,32,34, + tf = CONSTANT(-0.099322584600699995)*f[14] + CONSTANT(0.126698363970000010)*f[32] + CONSTANT(-0.131668802180999990)*f[34]; + tg = CONSTANT(-0.099322584600699995)*g[14] + CONSTANT(0.126698363970000010)*g[32] + CONSTANT(-0.131668802180999990)*g[34]; + y[15] += tf*g[21] + tg*f[21]; + y[21] += tf*g[15] + tg*f[15]; + t = f[15] * g[21] + f[21] * g[15]; + y[14] += CONSTANT(-0.099322584600699995)*t; + y[32] += CONSTANT(0.126698363970000010)*t; + y[34] += CONSTANT(-0.131668802180999990)*t; + + // [15,22]: 13,3,31,35, + tf = CONSTANT(0.133255230518000010)*f[13] + CONSTANT(-0.043528171378199997)*f[3] + CONSTANT(-0.101584686311000000)*f[31] + CONSTANT(-0.098140130732499997)*f[35]; + tg = CONSTANT(0.133255230518000010)*g[13] + CONSTANT(-0.043528171378199997)*g[3] + CONSTANT(-0.101584686311000000)*g[31] + CONSTANT(-0.098140130732499997)*g[35]; + y[15] += tf*g[22] + tg*f[22]; + y[22] += tf*g[15] + tg*f[15]; + t = f[15] * g[22] + f[22] * g[15]; + y[13] += CONSTANT(0.133255230518000010)*t; + y[3] += CONSTANT(-0.043528171378199997)*t; + y[31] += CONSTANT(-0.101584686311000000)*t; + y[35] += CONSTANT(-0.098140130732499997)*t; + + // [15,23]: 12,2,30, + tf = CONSTANT(-0.203550726872999990)*f[12] + CONSTANT(0.162867503964999990)*f[2] + CONSTANT(0.098140130728100003)*f[30]; + tg = CONSTANT(-0.203550726872999990)*g[12] + CONSTANT(0.162867503964999990)*g[2] + CONSTANT(0.098140130728100003)*g[30]; + y[15] += tf*g[23] + tg*f[23]; + y[23] += tf*g[15] + tg*f[15]; + t = f[15] * g[23] + f[23] * g[15]; + y[12] += CONSTANT(-0.203550726872999990)*t; + y[2] += CONSTANT(0.162867503964999990)*t; + y[30] += CONSTANT(0.098140130728100003)*t; + + // [15,33]: 6,20, + tf = CONSTANT(0.126792179874999990)*f[6] + CONSTANT(-0.196280261464999990)*f[20]; + tg = CONSTANT(0.126792179874999990)*g[6] + CONSTANT(-0.196280261464999990)*g[20]; + y[15] += tf*g[33] + tg*f[33]; + y[33] += tf*g[15] + tg*f[15]; + t = f[15] * g[33] + f[33] * g[15]; + y[6] += CONSTANT(0.126792179874999990)*t; + y[20] += CONSTANT(-0.196280261464999990)*t; + + // [16,16]: 0,6,20, + tf = CONSTANT(0.282094791763999990)*f[0] + CONSTANT(-0.229375683829000000)*f[6] + CONSTANT(0.106525305981000000)*f[20]; + tg = CONSTANT(0.282094791763999990)*g[0] + CONSTANT(-0.229375683829000000)*g[6] + CONSTANT(0.106525305981000000)*g[20]; + y[16] += tf*g[16] + tg*f[16]; + t = f[16] * g[16]; + y[0] += CONSTANT(0.282094791763999990)*t; + y[6] += CONSTANT(-0.229375683829000000)*t; + y[20] += CONSTANT(0.106525305981000000)*t; + + // [16,18]: 8,22, + tf = CONSTANT(-0.075080816693699995)*f[8] + CONSTANT(0.135045473380000000)*f[22]; + tg = CONSTANT(-0.075080816693699995)*g[8] + CONSTANT(0.135045473380000000)*g[22]; + y[16] += tf*g[18] + tg*f[18]; + y[18] += tf*g[16] + tg*f[16]; + t = f[16] * g[18] + f[18] * g[16]; + y[8] += CONSTANT(-0.075080816693699995)*t; + y[22] += CONSTANT(0.135045473380000000)*t; + + // [16,23]: 19,5, + tf = CONSTANT(-0.119098912754999990)*f[19] + CONSTANT(0.140463346187999990)*f[5]; + tg = CONSTANT(-0.119098912754999990)*g[19] + CONSTANT(0.140463346187999990)*g[5]; + y[16] += tf*g[23] + tg*f[23]; + y[23] += 
tf*g[16] + tg*f[16]; + t = f[16] * g[23] + f[23] * g[16]; + y[19] += CONSTANT(-0.119098912754999990)*t; + y[5] += CONSTANT(0.140463346187999990)*t; + + // [16,26]: 12,2,30, + tf = CONSTANT(-0.207723503645000000)*f[12] + CONSTANT(0.147319200325000010)*f[2] + CONSTANT(0.130197596199999990)*f[30]; + tg = CONSTANT(-0.207723503645000000)*g[12] + CONSTANT(0.147319200325000010)*g[2] + CONSTANT(0.130197596199999990)*g[30]; + y[16] += tf*g[26] + tg*f[26]; + y[26] += tf*g[16] + tg*f[16]; + t = f[16] * g[26] + f[26] * g[16]; + y[12] += CONSTANT(-0.207723503645000000)*t; + y[2] += CONSTANT(0.147319200325000010)*t; + y[30] += CONSTANT(0.130197596199999990)*t; + + // [16,28]: 14,32, + tf = CONSTANT(-0.077413979111300005)*f[14] + CONSTANT(0.128376561115000010)*f[32]; + tg = CONSTANT(-0.077413979111300005)*g[14] + CONSTANT(0.128376561115000010)*g[32]; + y[16] += tf*g[28] + tg*f[28]; + y[28] += tf*g[16] + tg*f[16]; + t = f[16] * g[28] + f[28] * g[16]; + y[14] += CONSTANT(-0.077413979111300005)*t; + y[32] += CONSTANT(0.128376561115000010)*t; + + // [16,29]: 15,33,35, + tf = CONSTANT(0.035835708931099997)*f[15] + CONSTANT(-0.118853600623999990)*f[33] + CONSTANT(-0.053152946071899999)*f[35]; + tg = CONSTANT(0.035835708931099997)*g[15] + CONSTANT(-0.118853600623999990)*g[33] + CONSTANT(-0.053152946071899999)*g[35]; + y[16] += tf*g[29] + tg*f[29]; + y[29] += tf*g[16] + tg*f[16]; + t = f[16] * g[29] + f[29] * g[16]; + y[15] += CONSTANT(0.035835708931099997)*t; + y[33] += CONSTANT(-0.118853600623999990)*t; + y[35] += CONSTANT(-0.053152946071899999)*t; + + // [16,31]: 27,9,25, + tf = CONSTANT(-0.118853600623999990)*f[27] + CONSTANT(0.035835708931099997)*f[9] + CONSTANT(0.053152946071899999)*f[25]; + tg = CONSTANT(-0.118853600623999990)*g[27] + CONSTANT(0.035835708931099997)*g[9] + CONSTANT(0.053152946071899999)*g[25]; + y[16] += tf*g[31] + tg*f[31]; + y[31] += tf*g[16] + tg*f[16]; + t = f[16] * g[31] + f[31] * g[16]; + y[27] += CONSTANT(-0.118853600623999990)*t; + y[9] += CONSTANT(0.035835708931099997)*t; + y[25] += CONSTANT(0.053152946071899999)*t; + + // [17,17]: 0,6,20, + tf = CONSTANT(0.282094791768999990)*f[0] + CONSTANT(-0.057343920955899998)*f[6] + CONSTANT(-0.159787958979000000)*f[20]; + tg = CONSTANT(0.282094791768999990)*g[0] + CONSTANT(-0.057343920955899998)*g[6] + CONSTANT(-0.159787958979000000)*g[20]; + y[17] += tf*g[17] + tg*f[17]; + t = f[17] * g[17]; + y[0] += CONSTANT(0.282094791768999990)*t; + y[6] += CONSTANT(-0.057343920955899998)*t; + y[20] += CONSTANT(-0.159787958979000000)*t; + + // [17,19]: 8,22,24, + tf = CONSTANT(-0.112621225039000000)*f[8] + CONSTANT(0.045015157794100001)*f[22] + CONSTANT(0.119098912753000000)*f[24]; + tg = CONSTANT(-0.112621225039000000)*g[8] + CONSTANT(0.045015157794100001)*g[22] + CONSTANT(0.119098912753000000)*g[24]; + y[17] += tf*g[19] + tg*f[19]; + y[19] += tf*g[17] + tg*f[17]; + t = f[17] * g[19] + f[19] * g[17]; + y[8] += CONSTANT(-0.112621225039000000)*t; + y[22] += CONSTANT(0.045015157794100001)*t; + y[24] += CONSTANT(0.119098912753000000)*t; + + // [17,21]: 16,4,18, + tf = CONSTANT(-0.119098912754999990)*f[16] + CONSTANT(-0.112621225039000000)*f[4] + CONSTANT(0.045015157794399997)*f[18]; + tg = CONSTANT(-0.119098912754999990)*g[16] + CONSTANT(-0.112621225039000000)*g[4] + CONSTANT(0.045015157794399997)*g[18]; + y[17] += tf*g[21] + tg*f[21]; + y[21] += tf*g[17] + tg*f[17]; + t = f[17] * g[21] + f[21] * g[17]; + y[16] += CONSTANT(-0.119098912754999990)*t; + y[4] += CONSTANT(-0.112621225039000000)*t; + y[18] += CONSTANT(0.045015157794399997)*t; + + // [17,26]: 
3,13,31, + tf = CONSTANT(0.208340811096000000)*f[3] + CONSTANT(0.029982305185199998)*f[13] + CONSTANT(-0.118853600623999990)*f[31]; + tg = CONSTANT(0.208340811096000000)*g[3] + CONSTANT(0.029982305185199998)*g[13] + CONSTANT(-0.118853600623999990)*g[31]; + y[17] += tf*g[26] + tg*f[26]; + y[26] += tf*g[17] + tg*f[17]; + t = f[17] * g[26] + f[26] * g[17]; + y[3] += CONSTANT(0.208340811096000000)*t; + y[13] += CONSTANT(0.029982305185199998)*t; + y[31] += CONSTANT(-0.118853600623999990)*t; + + // [17,27]: 12,2,30, + tf = CONSTANT(-0.103861751821000010)*f[12] + CONSTANT(0.196425600433000000)*f[2] + CONSTANT(-0.130197596204999990)*f[30]; + tg = CONSTANT(-0.103861751821000010)*g[12] + CONSTANT(0.196425600433000000)*g[2] + CONSTANT(-0.130197596204999990)*g[30]; + y[17] += tf*g[27] + tg*f[27]; + y[27] += tf*g[17] + tg*f[17]; + t = f[17] * g[27] + f[27] * g[17]; + y[12] += CONSTANT(-0.103861751821000010)*t; + y[2] += CONSTANT(0.196425600433000000)*t; + y[30] += CONSTANT(-0.130197596204999990)*t; + + // [17,28]: 13,3,31,35, + tf = CONSTANT(0.121172043789000000)*f[13] + CONSTANT(-0.060142811686500000)*f[3] + CONSTANT(0.034310079156700000)*f[31] + CONSTANT(0.099440056652200001)*f[35]; + tg = CONSTANT(0.121172043789000000)*g[13] + CONSTANT(-0.060142811686500000)*g[3] + CONSTANT(0.034310079156700000)*g[31] + CONSTANT(0.099440056652200001)*g[35]; + y[17] += tf*g[28] + tg*f[28]; + y[28] += tf*g[17] + tg*f[17]; + t = f[17] * g[28] + f[28] * g[17]; + y[13] += CONSTANT(0.121172043789000000)*t; + y[3] += CONSTANT(-0.060142811686500000)*t; + y[31] += CONSTANT(0.034310079156700000)*t; + y[35] += CONSTANT(0.099440056652200001)*t; + + // [17,32]: 11,1,25,29, + tf = CONSTANT(0.121172043788000010)*f[11] + CONSTANT(-0.060142811686900000)*f[1] + CONSTANT(-0.099440056652700004)*f[25] + CONSTANT(0.034310079156599997)*f[29]; + tg = CONSTANT(0.121172043788000010)*g[11] + CONSTANT(-0.060142811686900000)*g[1] + CONSTANT(-0.099440056652700004)*g[25] + CONSTANT(0.034310079156599997)*g[29]; + y[17] += tf*g[32] + tg*f[32]; + y[32] += tf*g[17] + tg*f[17]; + t = f[17] * g[32] + f[32] * g[17]; + y[11] += CONSTANT(0.121172043788000010)*t; + y[1] += CONSTANT(-0.060142811686900000)*t; + y[25] += CONSTANT(-0.099440056652700004)*t; + y[29] += CONSTANT(0.034310079156599997)*t; + + // [17,34]: 29,11,1, + tf = CONSTANT(0.118853600623000000)*f[29] + CONSTANT(-0.029982305185400002)*f[11] + CONSTANT(-0.208340811100000000)*f[1]; + tg = CONSTANT(0.118853600623000000)*g[29] + CONSTANT(-0.029982305185400002)*g[11] + CONSTANT(-0.208340811100000000)*g[1]; + y[17] += tf*g[34] + tg*f[34]; + y[34] += tf*g[17] + tg*f[17]; + t = f[17] * g[34] + f[34] * g[17]; + y[29] += CONSTANT(0.118853600623000000)*t; + y[11] += CONSTANT(-0.029982305185400002)*t; + y[1] += CONSTANT(-0.208340811100000000)*t; + + // [18,18]: 6,0,20,24, + tf = CONSTANT(0.065535909662600006)*f[6] + CONSTANT(0.282094791771999980)*f[0] + CONSTANT(-0.083698454702400005)*f[20] + CONSTANT(-0.135045473384000000)*f[24]; + tg = CONSTANT(0.065535909662600006)*g[6] + CONSTANT(0.282094791771999980)*g[0] + CONSTANT(-0.083698454702400005)*g[20] + CONSTANT(-0.135045473384000000)*g[24]; + y[18] += tf*g[18] + tg*f[18]; + t = f[18] * g[18]; + y[6] += CONSTANT(0.065535909662600006)*t; + y[0] += CONSTANT(0.282094791771999980)*t; + y[20] += CONSTANT(-0.083698454702400005)*t; + y[24] += CONSTANT(-0.135045473384000000)*t; + + // [18,19]: 7,21,23, + tf = CONSTANT(0.090297865407399994)*f[7] + CONSTANT(0.102084782359000000)*f[21] + CONSTANT(-0.045015157794399997)*f[23]; + tg = CONSTANT(0.090297865407399994)*g[7] 
+ CONSTANT(0.102084782359000000)*g[21] + CONSTANT(-0.045015157794399997)*g[23]; + y[18] += tf*g[19] + tg*f[19]; + y[19] += tf*g[18] + tg*f[18]; + t = f[18] * g[19] + f[19] * g[18]; + y[7] += CONSTANT(0.090297865407399994)*t; + y[21] += CONSTANT(0.102084782359000000)*t; + y[23] += CONSTANT(-0.045015157794399997)*t; + + // [18,25]: 15,33, + tf = CONSTANT(-0.098140130731999994)*f[15] + CONSTANT(0.130197596202000000)*f[33]; + tg = CONSTANT(-0.098140130731999994)*g[15] + CONSTANT(0.130197596202000000)*g[33]; + y[18] += tf*g[25] + tg*f[25]; + y[25] += tf*g[18] + tg*f[18]; + t = f[18] * g[25] + f[25] * g[18]; + y[15] += CONSTANT(-0.098140130731999994)*t; + y[33] += CONSTANT(0.130197596202000000)*t; + + // [18,26]: 14,32, + tf = CONSTANT(0.101358691174000000)*f[14] + CONSTANT(0.084042186965900004)*f[32]; + tg = CONSTANT(0.101358691174000000)*g[14] + CONSTANT(0.084042186965900004)*g[32]; + y[18] += tf*g[26] + tg*f[26]; + y[26] += tf*g[18] + tg*f[18]; + t = f[18] * g[26] + f[26] * g[18]; + y[14] += CONSTANT(0.101358691174000000)*t; + y[32] += CONSTANT(0.084042186965900004)*t; + + // [18,27]: 13,3,35, + tf = CONSTANT(0.101990215611000000)*f[13] + CONSTANT(0.183739324705999990)*f[3] + CONSTANT(-0.130197596202000000)*f[35]; + tg = CONSTANT(0.101990215611000000)*g[13] + CONSTANT(0.183739324705999990)*g[3] + CONSTANT(-0.130197596202000000)*g[35]; + y[18] += tf*g[27] + tg*f[27]; + y[27] += tf*g[18] + tg*f[18]; + t = f[18] * g[27] + f[27] * g[18]; + y[13] += CONSTANT(0.101990215611000000)*t; + y[3] += CONSTANT(0.183739324705999990)*t; + y[35] += CONSTANT(-0.130197596202000000)*t; + + // [18,28]: 2,12,30,34, + tf = CONSTANT(0.225033795606000010)*f[2] + CONSTANT(0.022664492358099999)*f[12] + CONSTANT(-0.099440056651100006)*f[30] + CONSTANT(-0.084042186968800003)*f[34]; + tg = CONSTANT(0.225033795606000010)*g[2] + CONSTANT(0.022664492358099999)*g[12] + CONSTANT(-0.099440056651100006)*g[30] + CONSTANT(-0.084042186968800003)*g[34]; + y[18] += tf*g[28] + tg*f[28]; + y[28] += tf*g[18] + tg*f[18]; + t = f[18] * g[28] + f[28] * g[18]; + y[2] += CONSTANT(0.225033795606000010)*t; + y[12] += CONSTANT(0.022664492358099999)*t; + y[30] += CONSTANT(-0.099440056651100006)*t; + y[34] += CONSTANT(-0.084042186968800003)*t; + + // [18,29]: 3,13,15,31, + tf = CONSTANT(-0.085054779966799998)*f[3] + CONSTANT(0.075189952564900006)*f[13] + CONSTANT(0.101584686310000010)*f[15] + CONSTANT(0.097043558538999999)*f[31]; + tg = CONSTANT(-0.085054779966799998)*g[3] + CONSTANT(0.075189952564900006)*g[13] + CONSTANT(0.101584686310000010)*g[15] + CONSTANT(0.097043558538999999)*g[31]; + y[18] += tf*g[29] + tg*f[29]; + y[29] += tf*g[18] + tg*f[18]; + t = f[18] * g[29] + f[29] * g[18]; + y[3] += CONSTANT(-0.085054779966799998)*t; + y[13] += CONSTANT(0.075189952564900006)*t; + y[15] += CONSTANT(0.101584686310000010)*t; + y[31] += CONSTANT(0.097043558538999999)*t; + + // [19,19]: 6,8,0,20,22, + tf = CONSTANT(0.139263808033999990)*f[6] + CONSTANT(-0.141889406570999990)*f[8] + CONSTANT(0.282094791773999990)*f[0] + CONSTANT(0.068480553847200004)*f[20] + CONSTANT(-0.102084782360000000)*f[22]; + tg = CONSTANT(0.139263808033999990)*g[6] + CONSTANT(-0.141889406570999990)*g[8] + CONSTANT(0.282094791773999990)*g[0] + CONSTANT(0.068480553847200004)*g[20] + CONSTANT(-0.102084782360000000)*g[22]; + y[19] += tf*g[19] + tg*f[19]; + t = f[19] * g[19]; + y[6] += CONSTANT(0.139263808033999990)*t; + y[8] += CONSTANT(-0.141889406570999990)*t; + y[0] += CONSTANT(0.282094791773999990)*t; + y[20] += CONSTANT(0.068480553847200004)*t; + y[22] += 
CONSTANT(-0.102084782360000000)*t; + + // [19,25]: 34, + tf = CONSTANT(-0.130197596205999990)*f[34]; + tg = CONSTANT(-0.130197596205999990)*g[34]; + y[19] += tf*g[25] + tg*f[25]; + y[25] += tf*g[19] + tg*f[19]; + t = f[19] * g[25] + f[25] * g[19]; + y[34] += CONSTANT(-0.130197596205999990)*t; + + // [19,26]: 15,35, + tf = CONSTANT(-0.131668802182000000)*f[15] + CONSTANT(0.130197596204999990)*f[35]; + tg = CONSTANT(-0.131668802182000000)*g[15] + CONSTANT(0.130197596204999990)*g[35]; + y[19] += tf*g[26] + tg*f[26]; + y[26] += tf*g[19] + tg*f[19]; + t = f[19] * g[26] + f[26] * g[19]; + y[15] += CONSTANT(-0.131668802182000000)*t; + y[35] += CONSTANT(0.130197596204999990)*t; + + // [19,27]: 14,32, + tf = CONSTANT(0.025339672793899998)*f[14] + CONSTANT(0.084042186967699994)*f[32]; + tg = CONSTANT(0.025339672793899998)*g[14] + CONSTANT(0.084042186967699994)*g[32]; + y[19] += tf*g[27] + tg*f[27]; + y[27] += tf*g[19] + tg*f[19]; + t = f[19] * g[27] + f[27] * g[19]; + y[14] += CONSTANT(0.025339672793899998)*t; + y[32] += CONSTANT(0.084042186967699994)*t; + + // [19,28]: 13,3,15,31,33, + tf = CONSTANT(0.104682806111000000)*f[13] + CONSTANT(0.159122922869999990)*f[3] + CONSTANT(-0.126698363970000010)*f[15] + CONSTANT(0.090775936911399999)*f[31] + CONSTANT(-0.084042186968400004)*f[33]; + tg = CONSTANT(0.104682806111000000)*g[13] + CONSTANT(0.159122922869999990)*g[3] + CONSTANT(-0.126698363970000010)*g[15] + CONSTANT(0.090775936911399999)*g[31] + CONSTANT(-0.084042186968400004)*g[33]; + y[19] += tf*g[28] + tg*f[28]; + y[28] += tf*g[19] + tg*f[19]; + t = f[19] * g[28] + f[28] * g[19]; + y[13] += CONSTANT(0.104682806111000000)*t; + y[3] += CONSTANT(0.159122922869999990)*t; + y[15] += CONSTANT(-0.126698363970000010)*t; + y[31] += CONSTANT(0.090775936911399999)*t; + y[33] += CONSTANT(-0.084042186968400004)*t; + + // [19,29]: 12,14,2,30,32, + tf = CONSTANT(0.115089467124000010)*f[12] + CONSTANT(-0.097749909977199997)*f[14] + CONSTANT(0.240571246744999990)*f[2] + CONSTANT(0.053152946072499999)*f[30] + CONSTANT(-0.090775936912099994)*f[32]; + tg = CONSTANT(0.115089467124000010)*g[12] + CONSTANT(-0.097749909977199997)*g[14] + CONSTANT(0.240571246744999990)*g[2] + CONSTANT(0.053152946072499999)*g[30] + CONSTANT(-0.090775936912099994)*g[32]; + y[19] += tf*g[29] + tg*f[29]; + y[29] += tf*g[19] + tg*f[19]; + t = f[19] * g[29] + f[29] * g[19]; + y[12] += CONSTANT(0.115089467124000010)*t; + y[14] += CONSTANT(-0.097749909977199997)*t; + y[2] += CONSTANT(0.240571246744999990)*t; + y[30] += CONSTANT(0.053152946072499999)*t; + y[32] += CONSTANT(-0.090775936912099994)*t; + + // [20,20]: 6,0,20, + tf = CONSTANT(0.163839797503000010)*f[6] + CONSTANT(0.282094802232000010)*f[0]; + tg = CONSTANT(0.163839797503000010)*g[6] + CONSTANT(0.282094802232000010)*g[0]; + y[20] += tf*g[20] + tg*f[20]; + t = f[20] * g[20]; + y[6] += CONSTANT(0.163839797503000010)*t; + y[0] += CONSTANT(0.282094802232000010)*t; + y[20] += CONSTANT(0.136961139005999990)*t; + + // [21,21]: 6,20,0,8,22, + tf = CONSTANT(0.139263808033999990)*f[6] + CONSTANT(0.068480553847200004)*f[20] + CONSTANT(0.282094791773999990)*f[0] + CONSTANT(0.141889406570999990)*f[8] + CONSTANT(0.102084782360000000)*f[22]; + tg = CONSTANT(0.139263808033999990)*g[6] + CONSTANT(0.068480553847200004)*g[20] + CONSTANT(0.282094791773999990)*g[0] + CONSTANT(0.141889406570999990)*g[8] + CONSTANT(0.102084782360000000)*g[22]; + y[21] += tf*g[21] + tg*f[21]; + t = f[21] * g[21]; + y[6] += CONSTANT(0.139263808033999990)*t; + y[20] += CONSTANT(0.068480553847200004)*t; + y[0] += 
CONSTANT(0.282094791773999990)*t; + y[8] += CONSTANT(0.141889406570999990)*t; + y[22] += CONSTANT(0.102084782360000000)*t; + + // [21,23]: 8,22,24, + tf = CONSTANT(-0.112621225039000000)*f[8] + CONSTANT(0.045015157794100001)*f[22] + CONSTANT(-0.119098912753000000)*f[24]; + tg = CONSTANT(-0.112621225039000000)*g[8] + CONSTANT(0.045015157794100001)*g[22] + CONSTANT(-0.119098912753000000)*g[24]; + y[21] += tf*g[23] + tg*f[23]; + y[23] += tf*g[21] + tg*f[21]; + t = f[21] * g[23] + f[23] * g[21]; + y[8] += CONSTANT(-0.112621225039000000)*t; + y[22] += CONSTANT(0.045015157794100001)*t; + y[24] += CONSTANT(-0.119098912753000000)*t; + + // [21,26]: 9,25, + tf = CONSTANT(-0.131668802182000000)*f[9] + CONSTANT(-0.130197596204999990)*f[25]; + tg = CONSTANT(-0.131668802182000000)*g[9] + CONSTANT(-0.130197596204999990)*g[25]; + y[21] += tf*g[26] + tg*f[26]; + y[26] += tf*g[21] + tg*f[21]; + t = f[21] * g[26] + f[26] * g[21]; + y[9] += CONSTANT(-0.131668802182000000)*t; + y[25] += CONSTANT(-0.130197596204999990)*t; + + // [21,28]: 27,1,11,9,29, + tf = CONSTANT(0.084042186968400004)*f[27] + CONSTANT(0.159122922869999990)*f[1] + CONSTANT(0.104682806111000000)*f[11] + CONSTANT(0.126698363970000010)*f[9] + CONSTANT(0.090775936911399999)*f[29]; + tg = CONSTANT(0.084042186968400004)*g[27] + CONSTANT(0.159122922869999990)*g[1] + CONSTANT(0.104682806111000000)*g[11] + CONSTANT(0.126698363970000010)*g[9] + CONSTANT(0.090775936911399999)*g[29]; + y[21] += tf*g[28] + tg*f[28]; + y[28] += tf*g[21] + tg*f[21]; + t = f[21] * g[28] + f[28] * g[21]; + y[27] += CONSTANT(0.084042186968400004)*t; + y[1] += CONSTANT(0.159122922869999990)*t; + y[11] += CONSTANT(0.104682806111000000)*t; + y[9] += CONSTANT(0.126698363970000010)*t; + y[29] += CONSTANT(0.090775936911399999)*t; + + // [21,31]: 14,2,30,12,32, + tf = CONSTANT(0.097749909977199997)*f[14] + CONSTANT(0.240571246744999990)*f[2] + CONSTANT(0.053152946072499999)*f[30] + CONSTANT(0.115089467124000010)*f[12] + CONSTANT(0.090775936912099994)*f[32]; + tg = CONSTANT(0.097749909977199997)*g[14] + CONSTANT(0.240571246744999990)*g[2] + CONSTANT(0.053152946072499999)*g[30] + CONSTANT(0.115089467124000010)*g[12] + CONSTANT(0.090775936912099994)*g[32]; + y[21] += tf*g[31] + tg*f[31]; + y[31] += tf*g[21] + tg*f[21]; + t = f[21] * g[31] + f[31] * g[21]; + y[14] += CONSTANT(0.097749909977199997)*t; + y[2] += CONSTANT(0.240571246744999990)*t; + y[30] += CONSTANT(0.053152946072499999)*t; + y[12] += CONSTANT(0.115089467124000010)*t; + y[32] += CONSTANT(0.090775936912099994)*t; + + // [21,33]: 32,14, + tf = CONSTANT(0.084042186967699994)*f[32] + CONSTANT(0.025339672793899998)*f[14]; + tg = CONSTANT(0.084042186967699994)*g[32] + CONSTANT(0.025339672793899998)*g[14]; + y[21] += tf*g[33] + tg*f[33]; + y[33] += tf*g[21] + tg*f[21]; + t = f[21] * g[33] + f[33] * g[21]; + y[32] += CONSTANT(0.084042186967699994)*t; + y[14] += CONSTANT(0.025339672793899998)*t; + + // [21,34]: 35, + tf = CONSTANT(-0.130197596205999990)*f[35]; + tg = CONSTANT(-0.130197596205999990)*g[35]; + y[21] += tf*g[34] + tg*f[34]; + y[34] += tf*g[21] + tg*f[21]; + t = f[21] * g[34] + f[34] * g[21]; + y[35] += CONSTANT(-0.130197596205999990)*t; + + // [22,22]: 6,20,0,24, + tf = CONSTANT(0.065535909662600006)*f[6] + CONSTANT(-0.083698454702400005)*f[20] + CONSTANT(0.282094791771999980)*f[0] + CONSTANT(0.135045473384000000)*f[24]; + tg = CONSTANT(0.065535909662600006)*g[6] + CONSTANT(-0.083698454702400005)*g[20] + CONSTANT(0.282094791771999980)*g[0] + CONSTANT(0.135045473384000000)*g[24]; + y[22] += tf*g[22] + tg*f[22]; + t = 
f[22] * g[22]; + y[6] += CONSTANT(0.065535909662600006)*t; + y[20] += CONSTANT(-0.083698454702400005)*t; + y[0] += CONSTANT(0.282094791771999980)*t; + y[24] += CONSTANT(0.135045473384000000)*t; + + // [22,26]: 10,28, + tf = CONSTANT(0.101358691174000000)*f[10] + CONSTANT(0.084042186965900004)*f[28]; + tg = CONSTANT(0.101358691174000000)*g[10] + CONSTANT(0.084042186965900004)*g[28]; + y[22] += tf*g[26] + tg*f[26]; + y[26] += tf*g[22] + tg*f[22]; + t = f[22] * g[26] + f[26] * g[22]; + y[10] += CONSTANT(0.101358691174000000)*t; + y[28] += CONSTANT(0.084042186965900004)*t; + + // [22,27]: 1,11,25, + tf = CONSTANT(0.183739324704000010)*f[1] + CONSTANT(0.101990215611000000)*f[11] + CONSTANT(0.130197596200999990)*f[25]; + tg = CONSTANT(0.183739324704000010)*g[1] + CONSTANT(0.101990215611000000)*g[11] + CONSTANT(0.130197596200999990)*g[25]; + y[22] += tf*g[27] + tg*f[27]; + y[27] += tf*g[22] + tg*f[22]; + t = f[22] * g[27] + f[27] * g[22]; + y[1] += CONSTANT(0.183739324704000010)*t; + y[11] += CONSTANT(0.101990215611000000)*t; + y[25] += CONSTANT(0.130197596200999990)*t; + + // [22,32]: 2,30,12,34, + tf = CONSTANT(0.225033795606000010)*f[2] + CONSTANT(-0.099440056651100006)*f[30] + CONSTANT(0.022664492358099999)*f[12] + CONSTANT(0.084042186968800003)*f[34]; + tg = CONSTANT(0.225033795606000010)*g[2] + CONSTANT(-0.099440056651100006)*g[30] + CONSTANT(0.022664492358099999)*g[12] + CONSTANT(0.084042186968800003)*g[34]; + y[22] += tf*g[32] + tg*f[32]; + y[32] += tf*g[22] + tg*f[22]; + t = f[22] * g[32] + f[32] * g[22]; + y[2] += CONSTANT(0.225033795606000010)*t; + y[30] += CONSTANT(-0.099440056651100006)*t; + y[12] += CONSTANT(0.022664492358099999)*t; + y[34] += CONSTANT(0.084042186968800003)*t; + + // [22,33]: 3,13,35, + tf = CONSTANT(0.183739324704000010)*f[3] + CONSTANT(0.101990215611000000)*f[13] + CONSTANT(0.130197596200999990)*f[35]; + tg = CONSTANT(0.183739324704000010)*g[3] + CONSTANT(0.101990215611000000)*g[13] + CONSTANT(0.130197596200999990)*g[35]; + y[22] += tf*g[33] + tg*f[33]; + y[33] += tf*g[22] + tg*f[22]; + t = f[22] * g[33] + f[33] * g[22]; + y[3] += CONSTANT(0.183739324704000010)*t; + y[13] += CONSTANT(0.101990215611000000)*t; + y[35] += CONSTANT(0.130197596200999990)*t; + + // [23,23]: 6,20,0, + tf = CONSTANT(-0.057343920955899998)*f[6] + CONSTANT(-0.159787958979000000)*f[20] + CONSTANT(0.282094791768999990)*f[0]; + tg = CONSTANT(-0.057343920955899998)*g[6] + CONSTANT(-0.159787958979000000)*g[20] + CONSTANT(0.282094791768999990)*g[0]; + y[23] += tf*g[23] + tg*f[23]; + t = f[23] * g[23]; + y[6] += CONSTANT(-0.057343920955899998)*t; + y[20] += CONSTANT(-0.159787958979000000)*t; + y[0] += CONSTANT(0.282094791768999990)*t; + + // [23,26]: 1,11,29, + tf = CONSTANT(0.208340811096000000)*f[1] + CONSTANT(0.029982305185199998)*f[11] + CONSTANT(-0.118853600623999990)*f[29]; + tg = CONSTANT(0.208340811096000000)*g[1] + CONSTANT(0.029982305185199998)*g[11] + CONSTANT(-0.118853600623999990)*g[29]; + y[23] += tf*g[26] + tg*f[26]; + y[26] += tf*g[23] + tg*f[23]; + t = f[23] * g[26] + f[26] * g[23]; + y[1] += CONSTANT(0.208340811096000000)*t; + y[11] += CONSTANT(0.029982305185199998)*t; + y[29] += CONSTANT(-0.118853600623999990)*t; + + // [23,28]: 25,11,1,29, + tf = CONSTANT(-0.099440056652200001)*f[25] + CONSTANT(-0.121172043789000000)*f[11] + CONSTANT(0.060142811686500000)*f[1] + CONSTANT(-0.034310079156700000)*f[29]; + tg = CONSTANT(-0.099440056652200001)*g[25] + CONSTANT(-0.121172043789000000)*g[11] + CONSTANT(0.060142811686500000)*g[1] + CONSTANT(-0.034310079156700000)*g[29]; + y[23] += 
tf*g[28] + tg*f[28]; + y[28] += tf*g[23] + tg*f[23]; + t = f[23] * g[28] + f[28] * g[23]; + y[25] += CONSTANT(-0.099440056652200001)*t; + y[11] += CONSTANT(-0.121172043789000000)*t; + y[1] += CONSTANT(0.060142811686500000)*t; + y[29] += CONSTANT(-0.034310079156700000)*t; + + // [23,32]: 31,13,3,35, + tf = CONSTANT(0.034310079156599997)*f[31] + CONSTANT(0.121172043788000010)*f[13] + CONSTANT(-0.060142811686900000)*f[3] + CONSTANT(-0.099440056652700004)*f[35]; + tg = CONSTANT(0.034310079156599997)*g[31] + CONSTANT(0.121172043788000010)*g[13] + CONSTANT(-0.060142811686900000)*g[3] + CONSTANT(-0.099440056652700004)*g[35]; + y[23] += tf*g[32] + tg*f[32]; + y[32] += tf*g[23] + tg*f[23]; + t = f[23] * g[32] + f[32] * g[23]; + y[31] += CONSTANT(0.034310079156599997)*t; + y[13] += CONSTANT(0.121172043788000010)*t; + y[3] += CONSTANT(-0.060142811686900000)*t; + y[35] += CONSTANT(-0.099440056652700004)*t; + + // [23,33]: 2,30,12, + tf = CONSTANT(0.196425600433000000)*f[2] + CONSTANT(-0.130197596204999990)*f[30] + CONSTANT(-0.103861751821000010)*f[12]; + tg = CONSTANT(0.196425600433000000)*g[2] + CONSTANT(-0.130197596204999990)*g[30] + CONSTANT(-0.103861751821000010)*g[12]; + y[23] += tf*g[33] + tg*f[33]; + y[33] += tf*g[23] + tg*f[23]; + t = f[23] * g[33] + f[33] * g[23]; + y[2] += CONSTANT(0.196425600433000000)*t; + y[30] += CONSTANT(-0.130197596204999990)*t; + y[12] += CONSTANT(-0.103861751821000010)*t; + + // [23,34]: 3,13,31, + tf = CONSTANT(0.208340811100000000)*f[3] + CONSTANT(0.029982305185400002)*f[13] + CONSTANT(-0.118853600623000000)*f[31]; + tg = CONSTANT(0.208340811100000000)*g[3] + CONSTANT(0.029982305185400002)*g[13] + CONSTANT(-0.118853600623000000)*g[31]; + y[23] += tf*g[34] + tg*f[34]; + y[34] += tf*g[23] + tg*f[23]; + t = f[23] * g[34] + f[34] * g[23]; + y[3] += CONSTANT(0.208340811100000000)*t; + y[13] += CONSTANT(0.029982305185400002)*t; + y[31] += CONSTANT(-0.118853600623000000)*t; + + // [24,24]: 6,0,20, + tf = CONSTANT(-0.229375683829000000)*f[6] + CONSTANT(0.282094791763999990)*f[0] + CONSTANT(0.106525305981000000)*f[20]; + tg = CONSTANT(-0.229375683829000000)*g[6] + CONSTANT(0.282094791763999990)*g[0] + CONSTANT(0.106525305981000000)*g[20]; + y[24] += tf*g[24] + tg*f[24]; + t = f[24] * g[24]; + y[6] += CONSTANT(-0.229375683829000000)*t; + y[0] += CONSTANT(0.282094791763999990)*t; + y[20] += CONSTANT(0.106525305981000000)*t; + + // [24,29]: 9,27,25, + tf = CONSTANT(-0.035835708931400000)*f[9] + CONSTANT(0.118853600623000000)*f[27] + CONSTANT(0.053152946071199997)*f[25]; + tg = CONSTANT(-0.035835708931400000)*g[9] + CONSTANT(0.118853600623000000)*g[27] + CONSTANT(0.053152946071199997)*g[25]; + y[24] += tf*g[29] + tg*f[29]; + y[29] += tf*g[24] + tg*f[24]; + t = f[24] * g[29] + f[29] * g[24]; + y[9] += CONSTANT(-0.035835708931400000)*t; + y[27] += CONSTANT(0.118853600623000000)*t; + y[25] += CONSTANT(0.053152946071199997)*t; + + // [24,31]: 15,33,35, + tf = CONSTANT(0.035835708931400000)*f[15] + CONSTANT(-0.118853600623000000)*f[33] + CONSTANT(0.053152946071199997)*f[35]; + tg = CONSTANT(0.035835708931400000)*g[15] + CONSTANT(-0.118853600623000000)*g[33] + CONSTANT(0.053152946071199997)*g[35]; + y[24] += tf*g[31] + tg*f[31]; + y[31] += tf*g[24] + tg*f[24]; + t = f[24] * g[31] + f[31] * g[24]; + y[15] += CONSTANT(0.035835708931400000)*t; + y[33] += CONSTANT(-0.118853600623000000)*t; + y[35] += CONSTANT(0.053152946071199997)*t; + + // [24,34]: 12,30,2, + tf = CONSTANT(-0.207723503645000000)*f[12] + CONSTANT(0.130197596199999990)*f[30] + CONSTANT(0.147319200325000010)*f[2]; + tg = 
CONSTANT(-0.207723503645000000)*g[12] + CONSTANT(0.130197596199999990)*g[30] + CONSTANT(0.147319200325000010)*g[2]; + y[24] += tf*g[34] + tg*f[34]; + y[34] += tf*g[24] + tg*f[24]; + t = f[24] * g[34] + f[34] * g[24]; + y[12] += CONSTANT(-0.207723503645000000)*t; + y[30] += CONSTANT(0.130197596199999990)*t; + y[2] += CONSTANT(0.147319200325000010)*t; + + // [25,25]: 0,6,20, + tf = CONSTANT(0.282094791761999970)*f[0] + CONSTANT(-0.242608896358999990)*f[6] + CONSTANT(0.130197596198000000)*f[20]; + tg = CONSTANT(0.282094791761999970)*g[0] + CONSTANT(-0.242608896358999990)*g[6] + CONSTANT(0.130197596198000000)*g[20]; + y[25] += tf*g[25] + tg*f[25]; + t = f[25] * g[25]; + y[0] += CONSTANT(0.282094791761999970)*t; + y[6] += CONSTANT(-0.242608896358999990)*t; + y[20] += CONSTANT(0.130197596198000000)*t; + + // [26,26]: 6,20,0, + tf = CONSTANT(-0.097043558542400002)*f[6] + CONSTANT(-0.130197596207000000)*f[20] + CONSTANT(0.282094791766000000)*f[0]; + tg = CONSTANT(-0.097043558542400002)*g[6] + CONSTANT(-0.130197596207000000)*g[20] + CONSTANT(0.282094791766000000)*g[0]; + y[26] += tf*g[26] + tg*f[26]; + t = f[26] * g[26]; + y[6] += CONSTANT(-0.097043558542400002)*t; + y[20] += CONSTANT(-0.130197596207000000)*t; + y[0] += CONSTANT(0.282094791766000000)*t; + + // [27,27]: 0,20,6, + tf = CONSTANT(0.282094791770000020)*f[0] + CONSTANT(-0.130197596204999990)*f[20] + CONSTANT(0.016173926423100001)*f[6]; + tg = CONSTANT(0.282094791770000020)*g[0] + CONSTANT(-0.130197596204999990)*g[20] + CONSTANT(0.016173926423100001)*g[6]; + y[27] += tf*g[27] + tg*f[27]; + t = f[27] * g[27]; + y[0] += CONSTANT(0.282094791770000020)*t; + y[20] += CONSTANT(-0.130197596204999990)*t; + y[6] += CONSTANT(0.016173926423100001)*t; + + // [28,28]: 6,0,20,24, + tf = CONSTANT(0.097043558538800007)*f[6] + CONSTANT(0.282094791771999980)*f[0] + CONSTANT(-0.021699599367299999)*f[20] + CONSTANT(-0.128376561118000000)*f[24]; + tg = CONSTANT(0.097043558538800007)*g[6] + CONSTANT(0.282094791771999980)*g[0] + CONSTANT(-0.021699599367299999)*g[20] + CONSTANT(-0.128376561118000000)*g[24]; + y[28] += tf*g[28] + tg*f[28]; + t = f[28] * g[28]; + y[6] += CONSTANT(0.097043558538800007)*t; + y[0] += CONSTANT(0.282094791771999980)*t; + y[20] += CONSTANT(-0.021699599367299999)*t; + y[24] += CONSTANT(-0.128376561118000000)*t; + + // [29,29]: 20,6,0,22,8, + tf = CONSTANT(0.086798397468799998)*f[20] + CONSTANT(0.145565337808999990)*f[6] + CONSTANT(0.282094791773999990)*f[0] + CONSTANT(-0.097043558539500002)*f[22] + CONSTANT(-0.140070311615000000)*f[8]; + tg = CONSTANT(0.086798397468799998)*g[20] + CONSTANT(0.145565337808999990)*g[6] + CONSTANT(0.282094791773999990)*g[0] + CONSTANT(-0.097043558539500002)*g[22] + CONSTANT(-0.140070311615000000)*g[8]; + y[29] += tf*g[29] + tg*f[29]; + t = f[29] * g[29]; + y[20] += CONSTANT(0.086798397468799998)*t; + y[6] += CONSTANT(0.145565337808999990)*t; + y[0] += CONSTANT(0.282094791773999990)*t; + y[22] += CONSTANT(-0.097043558539500002)*t; + y[8] += CONSTANT(-0.140070311615000000)*t; + + // [30,30]: 0,20,6, + tf = CONSTANT(0.282094804531000000)*f[0] + CONSTANT(0.130197634486000000)*f[20] + CONSTANT(0.161739292769000010)*f[6]; + tg = CONSTANT(0.282094804531000000)*g[0] + CONSTANT(0.130197634486000000)*g[20] + CONSTANT(0.161739292769000010)*g[6]; + y[30] += tf*g[30] + tg*f[30]; + t = f[30] * g[30]; + y[0] += CONSTANT(0.282094804531000000)*t; + y[20] += CONSTANT(0.130197634486000000)*t; + y[6] += CONSTANT(0.161739292769000010)*t; + + // [31,31]: 6,8,20,22,0, + tf = CONSTANT(0.145565337808999990)*f[6] + 
CONSTANT(0.140070311615000000)*f[8] + CONSTANT(0.086798397468799998)*f[20] + CONSTANT(0.097043558539500002)*f[22] + CONSTANT(0.282094791773999990)*f[0]; + tg = CONSTANT(0.145565337808999990)*g[6] + CONSTANT(0.140070311615000000)*g[8] + CONSTANT(0.086798397468799998)*g[20] + CONSTANT(0.097043558539500002)*g[22] + CONSTANT(0.282094791773999990)*g[0]; + y[31] += tf*g[31] + tg*f[31]; + t = f[31] * g[31]; + y[6] += CONSTANT(0.145565337808999990)*t; + y[8] += CONSTANT(0.140070311615000000)*t; + y[20] += CONSTANT(0.086798397468799998)*t; + y[22] += CONSTANT(0.097043558539500002)*t; + y[0] += CONSTANT(0.282094791773999990)*t; + + // [32,32]: 0,24,20,6, + tf = CONSTANT(0.282094791771999980)*f[0] + CONSTANT(0.128376561118000000)*f[24] + CONSTANT(-0.021699599367299999)*f[20] + CONSTANT(0.097043558538800007)*f[6]; + tg = CONSTANT(0.282094791771999980)*g[0] + CONSTANT(0.128376561118000000)*g[24] + CONSTANT(-0.021699599367299999)*g[20] + CONSTANT(0.097043558538800007)*g[6]; + y[32] += tf*g[32] + tg*f[32]; + t = f[32] * g[32]; + y[0] += CONSTANT(0.282094791771999980)*t; + y[24] += CONSTANT(0.128376561118000000)*t; + y[20] += CONSTANT(-0.021699599367299999)*t; + y[6] += CONSTANT(0.097043558538800007)*t; + + // [33,33]: 6,20,0, + tf = CONSTANT(0.016173926423100001)*f[6] + CONSTANT(-0.130197596204999990)*f[20] + CONSTANT(0.282094791770000020)*f[0]; + tg = CONSTANT(0.016173926423100001)*g[6] + CONSTANT(-0.130197596204999990)*g[20] + CONSTANT(0.282094791770000020)*g[0]; + y[33] += tf*g[33] + tg*f[33]; + t = f[33] * g[33]; + y[6] += CONSTANT(0.016173926423100001)*t; + y[20] += CONSTANT(-0.130197596204999990)*t; + y[0] += CONSTANT(0.282094791770000020)*t; + + // [34,34]: 20,6,0, + tf = CONSTANT(-0.130197596207000000)*f[20] + CONSTANT(-0.097043558542400002)*f[6] + CONSTANT(0.282094791766000000)*f[0]; + tg = CONSTANT(-0.130197596207000000)*g[20] + CONSTANT(-0.097043558542400002)*g[6] + CONSTANT(0.282094791766000000)*g[0]; + y[34] += tf*g[34] + tg*f[34]; + t = f[34] * g[34]; + y[20] += CONSTANT(-0.130197596207000000)*t; + y[6] += CONSTANT(-0.097043558542400002)*t; + y[0] += CONSTANT(0.282094791766000000)*t; + + // [35,35]: 6,0,20, + tf = CONSTANT(-0.242608896358999990)*f[6] + CONSTANT(0.282094791761999970)*f[0] + CONSTANT(0.130197596198000000)*f[20]; + tg = CONSTANT(-0.242608896358999990)*g[6] + CONSTANT(0.282094791761999970)*g[0] + CONSTANT(0.130197596198000000)*g[20]; + y[35] += tf*g[35] + tg*f[35]; + t = f[35] * g[35]; + y[6] += CONSTANT(-0.242608896358999990)*t; + y[0] += CONSTANT(0.282094791761999970)*t; + y[20] += CONSTANT(0.130197596198000000)*t; + + // multiply count=2527 + + return y; +} + + +//------------------------------------------------------------------------------------- +// Evaluates a directional light and returns spectral SH data. The output +// vector is computed so that if the intensity of R/G/B is unit the resulting +// exit radiance of a point directly under the light on a diffuse object with +// an albedo of 1 would be 1.0. This will compute 3 spectral samples, resultR +// has to be specified, while resultG and resultB are optional. 
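A minimal usage sketch of the directional-light evaluator described above (illustration only, not part of the patch; assumes DirectXSH.h is included and `using namespace DirectX;` is in effect):

    float shR[9], shG[9], shB[9];                               // order*order coefficients per channel (order 3)
    XMVECTOR lightDir = XMVector3Normalize(XMVectorSet(0.3f, -1.0f, 0.2f, 0.0f));
    XMVECTOR lightClr = XMVectorSet(1.0f, 1.0f, 1.0f, 0.0f);    // unit-intensity white
    if (XMSHEvalDirectionalLight(3, lightDir, lightClr, shR, shG, shB))
    {
        // Each array now holds 9 SH coefficients, scaled so that a unit-albedo
        // diffuse surface directly under the light exits radiance 1.0 per channel.
    }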
+// +// http://msdn.microsoft.com/en-us/library/windows/desktop/bb204988.aspx +//------------------------------------------------------------------------------------- +_Use_decl_annotations_ +bool XM_CALLCONV DirectX::XMSHEvalDirectionalLight( + size_t order, + FXMVECTOR dir, + FXMVECTOR color, + float *resultR, + float *resultG, + float *resultB) noexcept +{ + if (!resultR) + return false; + + if (order < XM_SH_MINORDER || order > XM_SH_MAXORDER) + return false; + + XMFLOAT3A clr; + XMStoreFloat3A(&clr, color); + + float fTmp[XM_SH_MAXORDER * XM_SH_MAXORDER]; + + XMSHEvalDirection(fTmp, order, dir); // evaluate the BF in this direction... + + // now compute "normalization" and scale vector for each valid spectral band + const float fNorm = XM_PI / CosWtInt(order); + + const size_t numcoeff = order*order; + + const float fRScale = fNorm * clr.x; + + for (size_t i = 0; i < numcoeff; ++i) + { + resultR[i] = fTmp[i] * fRScale; + } + + if (resultG) + { + const float fGScale = fNorm * clr.y; + + for (size_t i = 0; i < numcoeff; ++i) + { + resultG[i] = fTmp[i] * fGScale; + } + } + + if (resultB) + { + const float fBScale = fNorm * clr.z; + + for (size_t i = 0; i < numcoeff; ++i) + { + resultB[i] = fTmp[i] * fBScale; + } + } + + return true; +} + + +//------------------------------------------------------------------------------------ +// Evaluates a spherical light and returns spectral SH data. There is no +// normalization of the intensity of the light like there is for directional +// lights, care has to be taken when specifiying the intensities. This will +// compute 3 spectral samples, resultR has to be specified, while resultG and +// resultB are optional. +// +// http://msdn.microsoft.com/en-us/library/windows/desktop/bb205451.aspx +//------------------------------------------------------------------------------------- +_Use_decl_annotations_ +bool XM_CALLCONV DirectX::XMSHEvalSphericalLight( + size_t order, + FXMVECTOR pos, + float radius, + FXMVECTOR color, + float *resultR, + float *resultG, + float *resultB) noexcept +{ + if (!resultR) + return false; + + if (radius < 0.f) + return false; + + const float fDist = XMVectorGetX(XMVector3Length(pos)); + + // WARNING: fDist should not be < radius - otherwise light contains origin + + //const float fSinConeAngle = (fDist <= radius) ? 0.99999f : radius/fDist; + const float fConeAngle = (fDist <= radius) ? (XM_PIDIV2) : asinf(radius / fDist); + + XMVECTOR dir = XMVector3Normalize(pos); + + float fTmpDir[XM_SH_MAXORDER* XM_SH_MAXORDER]; // rotation "vector" + float fTmpL0[XM_SH_MAXORDER]; + + // + // Sphere at distance fDist, the cone angle is determined by looking at the + // right triangle with one side (the hypotenuse) beind the vector from the + // origin to the center of the sphere, another side is from the origin to + // a point on the sphere whose normal is perpendicular to the given side (this + // is one of the points on the cone that is defined by the projection of the sphere + // through the origin - we want to find the angle of this cone) and the final + // side being from the center of the sphere to the point of tagency (the two + // sides conected to this are at a right angle by construction.) + // From trig we know that sin(theta) = ||opposite||/||hypotenuse||, where + // ||opposite|| = Radius, ||hypotenuse|| = fDist + // theta is the angle of the cone that subtends the sphere from the origin + // + + // no default normalization is done for this case, have to be careful how + // you represent the coefficients... 
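To make the trigonometry above concrete: sin(theta) = radius / fDist, so fConeAngle = asinf(radius / fDist). For example, a sphere of radius 1 at distance fDist = 2 subtends a half-angle of asinf(0.5), roughly 0.5236 rad (30 degrees); when fDist <= radius the origin lies inside or on the sphere and the code clamps the angle to XM_PIDIV2.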
+ + const float fNewNorm = 1.0f;///(fSinConeAngle*fSinConeAngle); + + ComputeCapInt(order, fConeAngle, fTmpL0); + + XMFLOAT3A vd; + XMStoreFloat3(&vd, dir); + + const float fX = vd.x; + const float fY = vd.y; + const float fZ = vd.z; + + switch (order) + { + case 2: + sh_eval_basis_1(fX, fY, fZ, fTmpDir); + break; + + case 3: + sh_eval_basis_2(fX, fY, fZ, fTmpDir); + break; + + case 4: + sh_eval_basis_3(fX, fY, fZ, fTmpDir); + break; + + case 5: + sh_eval_basis_4(fX, fY, fZ, fTmpDir); + break; + + case 6: + sh_eval_basis_5(fX, fY, fZ, fTmpDir); + break; + + default: + assert(order < XM_SH_MINORDER || order > XM_SH_MAXORDER); + return false; + } + + XMFLOAT3A clr; + XMStoreFloat3A(&clr, color); + + for (size_t i = 0; i < order; ++i) + { + const size_t cNumCoefs = 2 * i + 1; + const size_t cStart = i*i; + const float fValUse = fTmpL0[i] * clr.x*fNewNorm*fExtraNormFac[i]; + for (size_t j = 0; j < cNumCoefs; ++j) resultR[cStart + j] = fTmpDir[cStart + j] * fValUse; + } + + if (resultG) + { + for (size_t i = 0; i < order; ++i) + { + const size_t cNumCoefs = 2 * i + 1; + const size_t cStart = i*i; + const float fValUse = fTmpL0[i] * clr.y*fNewNorm*fExtraNormFac[i]; + for (size_t j = 0; j < cNumCoefs; ++j) resultG[cStart + j] = fTmpDir[cStart + j] * fValUse; + } + } + + if (resultB) + { + for (size_t i = 0; i < order; ++i) + { + const size_t cNumCoefs = 2 * i + 1; + const size_t cStart = i*i; + const float fValUse = fTmpL0[i] * clr.z*fNewNorm*fExtraNormFac[i]; + for (size_t j = 0; j < cNumCoefs; ++j) resultB[cStart + j] = fTmpDir[cStart + j] * fValUse; + } + } + + return true; +} + + +//------------------------------------------------------------------------------------- +// Evaluates a light that is a cone of constant intensity and returns spectral +// SH data. The output vector is computed so that if the intensity of R/G/B is +// unit the resulting exit radiance of a point directly under the light oriented +// in the cone direction on a diffuse object with an albedo of 1 would be 1.0. +// This will compute 3 spectral samples, resultR has to be specified, while resultG +// and resultB are optional. +// +// http://msdn.microsoft.com/en-us/library/windows/desktop/bb204986.aspx +//------------------------------------------------------------------------------------- +_Use_decl_annotations_ +bool XM_CALLCONV DirectX::XMSHEvalConeLight( + size_t order, + FXMVECTOR dir, + float radius, + FXMVECTOR color, + float *resultR, + float *resultG, + float *resultB) noexcept +{ + if (!resultR) + return false; + + if (radius < 0.f || radius >(XM_PI*1.00001f)) + return false; + + if (radius < 0.0001f) + { + // turn it into a pure directional light... + return XMSHEvalDirectionalLight(order, dir, color, resultR, resultG, resultB); + } + else + { + float fTmpL0[XM_SH_MAXORDER]; + float fTmpDir[XM_SH_MAXORDER * XM_SH_MAXORDER]; + + const float fConeAngle = radius; + const float fAngCheck = (fConeAngle > XM_PIDIV2) ? 
(XM_PIDIV2) : fConeAngle; + + const float fNewNorm = 1.0f / (sinf(fAngCheck)*sinf(fAngCheck)); + + ComputeCapInt(order, fConeAngle, fTmpL0); + + XMFLOAT3A vd; + XMStoreFloat3(&vd, dir); + + const float fX = vd.x; + const float fY = vd.y; + const float fZ = vd.z; + + switch (order) + { + case 2: + sh_eval_basis_1(fX, fY, fZ, fTmpDir); + break; + + case 3: + sh_eval_basis_2(fX, fY, fZ, fTmpDir); + break; + + case 4: + sh_eval_basis_3(fX, fY, fZ, fTmpDir); + break; + + case 5: + sh_eval_basis_4(fX, fY, fZ, fTmpDir); + break; + + case 6: + sh_eval_basis_5(fX, fY, fZ, fTmpDir); + break; + + default: + assert(order < XM_SH_MINORDER || order > XM_SH_MAXORDER); + return false; + } + + XMFLOAT3A clr; + XMStoreFloat3A(&clr, color); + + for (size_t i = 0; i < order; ++i) + { + const size_t cNumCoefs = 2 * i + 1; + const size_t cStart = i*i; + const float fValUse = fTmpL0[i] * clr.x*fNewNorm*fExtraNormFac[i]; + for (size_t j = 0; j < cNumCoefs; ++j) + resultR[cStart + j] = fTmpDir[cStart + j] * fValUse; + } + + if (resultG) + { + for (size_t i = 0; i < order; ++i) + { + const size_t cNumCoefs = 2 * i + 1; + const size_t cStart = i*i; + const float fValUse = fTmpL0[i] * clr.y*fNewNorm*fExtraNormFac[i]; + for (size_t j = 0; j < cNumCoefs; ++j) + resultG[cStart + j] = fTmpDir[cStart + j] * fValUse; + } + } + + if (resultB) + { + for (size_t i = 0; i < order; ++i) + { + const size_t cNumCoefs = 2 * i + 1; + const size_t cStart = i*i; + const float fValUse = fTmpL0[i] * clr.z*fNewNorm*fExtraNormFac[i]; + for (size_t j = 0; j < cNumCoefs; ++j) + resultB[cStart + j] = fTmpDir[cStart + j] * fValUse; + } + } + } + + return true; +} + + +//------------------------------------------------------------------------------------ +// Evaluates a light that is a linear interpolant between two colors over the +// sphere. The interpolant is linear along the axis of the two points, not +// over the surface of the sphere (ie: if the axis was (0,0,1) it is linear in +// Z, not in the azimuthal angle.) The resulting spherical lighting function +// is normalized so that a point on a perfectly diffuse surface with no +// shadowing and a normal pointed in the direction pDir would result in exit +// radiance with a value of 1 if the top color was white and the bottom color +// was black. This is a very simple model where topColor represents the intensity +// of the "sky" and bottomColor represents the intensity of the "ground". +// +// http://msdn.microsoft.com/en-us/library/windows/desktop/bb204989.aspx +//------------------------------------------------------------------------------------- +_Use_decl_annotations_ +bool XM_CALLCONV DirectX::XMSHEvalHemisphereLight( + size_t order, + FXMVECTOR dir, + FXMVECTOR topColor, + FXMVECTOR bottomColor, + float *resultR, + float *resultG, + float *resultB) noexcept +{ + if (!resultR) + return false; + + if (order < XM_SH_MINORDER || order > XM_SH_MAXORDER) + return false; + + // seperate "R/G/B colors... + + float fTmpDir[XM_SH_MAXORDER * XM_SH_MAXORDER]; // rotation "vector" + float fTmpL0[XM_SH_MAXORDER]; + + const float fNewNorm = 3.0f / 2.0f; // normalizes things for 1 sky color, 0 ground color... 
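A brief usage sketch of the hemisphere model (illustration only, not part of the patch; assumes DirectXSH.h and `using namespace DirectX;`), matching the white-sky/black-ground case that the comment above calibrates to exit radiance 1.0:

    float shR[9], shG[9], shB[9];
    XMVECTOR up     = XMVectorSet(0.0f, 1.0f, 0.0f, 0.0f);
    XMVECTOR sky    = XMVectorSet(1.0f, 1.0f, 1.0f, 0.0f);     // topColor
    XMVECTOR ground = XMVectorSet(0.0f, 0.0f, 0.0f, 0.0f);     // bottomColor
    XMSHEvalHemisphereLight(3, up, sky, ground, shR, shG, shB);
    // Only the first two SH bands are populated; the remaining coefficients are zeroed.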
+ + XMFLOAT3A vd; + XMStoreFloat3(&vd, dir); + + const float fX = vd.x; + const float fY = vd.y; + const float fZ = vd.z; + + sh_eval_basis_1(fX, fY, fZ, fTmpDir); + + XMFLOAT3A clrTop; + XMStoreFloat3A(&clrTop, topColor); + + XMFLOAT3A clrBottom; + XMStoreFloat3A(&clrBottom, bottomColor); + + float fA = clrTop.x; + float fAvrg = (clrTop.x + clrBottom.x)*0.5f; + + fTmpL0[0] = fAvrg*2.0f*SHEvalHemisphereLight_fSqrtPi; + fTmpL0[1] = (fA - fAvrg)*2.0f*SHEvalHemisphereLight_fSqrtPi3; + + size_t i = 0; + for (; i < 2; ++i) + { + _Analysis_assume_(i < order); + const size_t cNumCoefs = 2 * i + 1; + const size_t cStart = i*i; + const float fValUse = fTmpL0[i] * fNewNorm*fExtraNormFac[i]; + for (size_t j = 0; j < cNumCoefs; ++j) resultR[cStart + j] = fTmpDir[cStart + j] * fValUse; + } + + for (; i < order; ++i) + { + const size_t cNumCoefs = 2 * i + 1; + const size_t cStart = i*i; + for (size_t j = 0; j < cNumCoefs; ++j) resultR[cStart + j] = 0.0f; + } + + if (resultG) + { + fA = clrTop.y; + fAvrg = (clrTop.y + clrBottom.y)*0.5f; + + fTmpL0[0] = fAvrg*2.0f*SHEvalHemisphereLight_fSqrtPi; + fTmpL0[1] = (fA - fAvrg)*2.0f*SHEvalHemisphereLight_fSqrtPi3; + + for (i = 0; i < 2; ++i) + { + _Analysis_assume_(i < order); + const size_t cNumCoefs = 2 * i + 1; + const size_t cStart = i*i; + const float fValUse = fTmpL0[i] * fNewNorm*fExtraNormFac[i]; + for (size_t j = 0; j < cNumCoefs; ++j) resultG[cStart + j] = fTmpDir[cStart + j] * fValUse; + } + + for (; i < order; ++i) + { + const size_t cNumCoefs = 2 * i + 1; + const size_t cStart = i*i; + for (size_t j = 0; j < cNumCoefs; ++j) resultG[cStart + j] = 0.0f; + } + } + + if (resultB) + { + fA = clrTop.z; + fAvrg = (clrTop.z + clrBottom.z)*0.5f; + + fTmpL0[0] = fAvrg*2.0f*SHEvalHemisphereLight_fSqrtPi; + fTmpL0[1] = (fA - fAvrg)*2.0f*SHEvalHemisphereLight_fSqrtPi3; + + for (i = 0; i < 2; ++i) + { + _Analysis_assume_(i < order); + const size_t cNumCoefs = 2 * i + 1; + const size_t cStart = i*i; + const float fValUse = fTmpL0[i] * fNewNorm*fExtraNormFac[i]; + for (size_t j = 0; j < cNumCoefs; ++j) resultB[cStart + j] = fTmpDir[cStart + j] * fValUse; + } + + for (; i < order; ++i) + { + const size_t cNumCoefs = 2 * i + 1; + const size_t cStart = i*i; + for (size_t j = 0; j < cNumCoefs; ++j) resultB[cStart + j] = 0.0f; + } + } + + return true; +} diff --git a/src/thirdparty/DirectXMath-dec2022/SHMath/DirectXSH.h b/src/thirdparty/DirectXMath-dec2022/SHMath/DirectXSH.h new file mode 100644 index 000000000..9f5183553 --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/SHMath/DirectXSH.h @@ -0,0 +1,72 @@ +//------------------------------------------------------------------------------------- +// DirectXSH.h -- C++ Spherical Harmonics Math Library +// +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
+// +// http://go.microsoft.com/fwlink/p/?LinkId=262885 +//------------------------------------------------------------------------------------- + +#pragma once + +#define DIRECTX_SHMATH_VERSION 106 + +#include + +namespace DirectX +{ + constexpr size_t XM_SH_MINORDER = 2; + constexpr size_t XM_SH_MAXORDER = 6; + + float* XM_CALLCONV XMSHEvalDirection(_Out_writes_(order*order) float *result, _In_ size_t order, _In_ FXMVECTOR dir) noexcept; + + float* XM_CALLCONV XMSHRotate(_Out_writes_(order*order) float *result, _In_ size_t order, _In_ FXMMATRIX rotMatrix, _In_reads_(order*order) const float *input) noexcept; + + float* XMSHRotateZ(_Out_writes_(order*order) float *result, _In_ size_t order, _In_ float angle, _In_reads_(order*order) const float *input) noexcept; + + float* XMSHAdd(_Out_writes_(order*order) float *result, _In_ size_t order, _In_reads_(order*order) const float *inputA, _In_reads_(order*order) const float *inputB) noexcept; + + float* XMSHScale(_Out_writes_(order*order) float *result, _In_ size_t order, _In_reads_(order*order) const float *input, _In_ float scale) noexcept; + + float XMSHDot(_In_ size_t order, _In_reads_(order*order) const float *inputA, _In_reads_(order*order) const float *inputB) noexcept; + + float* XMSHMultiply(_Out_writes_(order*order) float *result, _In_ size_t order, _In_reads_(order*order) const float *inputF, _In_reads_(order*order) const float *inputG) noexcept; + + float* XMSHMultiply2(_Out_writes_(4) float *result, _In_reads_(4) const float *inputF, _In_reads_(4) const float *inputG) noexcept; + + float* XMSHMultiply3(_Out_writes_(9) float *result, _In_reads_(9) const float *inputF, _In_reads_(9) const float *inputG) noexcept; + + float* XMSHMultiply4(_Out_writes_(16) float *result, _In_reads_(16) const float *inputF, _In_reads_(16) const float *inputG) noexcept; + + float* XMSHMultiply5(_Out_writes_(25) float *result, _In_reads_(25) const float *inputF, _In_reads_(25) const float *inputG) noexcept; + + float* XMSHMultiply6(_Out_writes_(36) float *result, _In_reads_(36) const float *inputF, _In_reads_(36) const float *inputG) noexcept; + + bool XM_CALLCONV XMSHEvalDirectionalLight( + _In_ size_t order, _In_ FXMVECTOR dir, _In_ FXMVECTOR color, + _Out_writes_(order*order) float *resultR, _Out_writes_opt_(order*order) float *resultG, _Out_writes_opt_(order*order) float *resultB) noexcept; + + bool XM_CALLCONV XMSHEvalSphericalLight( + _In_ size_t order, _In_ FXMVECTOR pos, _In_ float radius, _In_ FXMVECTOR color, + _Out_writes_(order*order) float *resultR, _Out_writes_opt_(order*order) float *resultG, _Out_writes_opt_(order*order) float *resultB) noexcept; + + bool XM_CALLCONV XMSHEvalConeLight( + _In_ size_t order, _In_ FXMVECTOR dir, _In_ float radius, _In_ FXMVECTOR color, + _Out_writes_(order*order) float *resultR, _Out_writes_opt_(order*order) float *resultG, _Out_writes_opt_(order*order) float *resultB) noexcept; + + bool XM_CALLCONV XMSHEvalHemisphereLight( + _In_ size_t order, _In_ FXMVECTOR dir, _In_ FXMVECTOR topColor, _In_ FXMVECTOR bottomColor, + _Out_writes_(order*order) float *resultR, _Out_writes_opt_(order*order) float *resultG, _Out_writes_opt_(order*order) float *resultB) noexcept; + + #if defined(__d3d11_h__) || defined(__d3d11_x_h__) + HRESULT SHProjectCubeMap( + _In_ ID3D11DeviceContext *context, _In_ size_t order, _In_ ID3D11Texture2D *cubeMap, + _Out_writes_opt_(order*order) float *resultR, _Out_writes_opt_(order*order) float *resultG, _Out_writes_opt_(order*order) float *resultB) noexcept; + #endif + + #if 
defined(__d3d12_h__) || defined(__d3d12_x_h__) || defined(__XBOX_D3D12_X__) + HRESULT SHProjectCubeMap( + _In_ size_t order, _In_ const D3D12_RESOURCE_DESC& desc, _In_ const D3D12_SUBRESOURCE_DATA cubeMap[6], + _Out_writes_opt_(order*order) float *resultR, _Out_writes_opt_(order*order) float *resultG, _Out_writes_opt_(order*order) float *resultB) noexcept; + #endif +} // namespace DirectX diff --git a/src/thirdparty/DirectXMath-dec2022/SHMath/DirectXSHD3D11.cpp b/src/thirdparty/DirectXMath-dec2022/SHMath/DirectXSHD3D11.cpp new file mode 100644 index 000000000..a2e4e0bba --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/SHMath/DirectXSHD3D11.cpp @@ -0,0 +1,383 @@ +//------------------------------------------------------------------------------------- +// DirectXSHD3D11.cpp -- C++ Spherical Harmonics Math Library +// +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. +// +// http://go.microsoft.com/fwlink/p/?LinkId=262885 +//------------------------------------------------------------------------------------- + +#ifdef _MSC_VER +#pragma warning( disable : 4616 4619 4061 4265 4626 5039 ) +// C4616/C4619 #pragma warning warnings +// C4061 numerator 'identifier' in switch of enum 'enumeration' is not explicitly handled by a case label +// C4265 class has virtual functions, but destructor is not virtual +// C4626 assignment operator was implicitly defined as deleted +// C5039 pointer or reference to potentially throwing function passed to extern C function under - EHc + +#pragma warning(push) +#pragma warning(disable: 4365) +#endif +#include +#ifdef _MSC_VER +#pragma warning(pop) +#endif + +#include "DirectXSH.h" + +#include + +#include +#include +#include + +#include + +#ifdef __clang__ +#pragma clang diagnostic ignored "-Wcovered-switch-default" +#pragma clang diagnostic ignored "-Wswitch-enum" +#endif + +using namespace DirectX; + +using Microsoft::WRL::ComPtr; + +namespace +{ + struct aligned_deleter { void operator()(void* p) { _aligned_free(p); } }; + + using ScopedAlignedArrayXMVECTOR = std::unique_ptr; + + //------------------------------------------------------------------------------------- + // This code is lifted from DirectXTex http://go.microsoft.com/fwlink/?LinkId=248926 + // If you need additional DXGI format support, see DirectXTexConvert.cpp + //------------------------------------------------------------------------------------- +#define LOAD_SCANLINE( type, func )\ + if ( size >= sizeof(type) )\ + {\ + const type * __restrict sPtr = reinterpret_cast(pSource);\ + for( size_t icount = 0; icount < ( size - sizeof(type) + 1 ); icount += sizeof(type) )\ + {\ + if ( dPtr >= ePtr ) break;\ + *(dPtr++) = func( sPtr++ );\ + }\ + return true;\ + }\ + return false; + +#define LOAD_SCANLINE3( type, func, defvec )\ + if ( size >= sizeof(type) )\ + {\ + const type * __restrict sPtr = reinterpret_cast(pSource);\ + for( size_t icount = 0; icount < ( size - sizeof(type) + 1 ); icount += sizeof(type) )\ + {\ + XMVECTOR v = func( sPtr++ );\ + if ( dPtr >= ePtr ) break;\ + *(dPtr++) = XMVectorSelect( defvec, v, g_XMSelect1110 );\ + }\ + return true;\ + }\ + return false; + +#define LOAD_SCANLINE2( type, func, defvec )\ + if ( size >= sizeof(type) )\ + {\ + const type * __restrict sPtr = reinterpret_cast(pSource);\ + for( size_t icount = 0; icount < ( size - sizeof(type) + 1 ); icount += sizeof(type) )\ + {\ + XMVECTOR v = func( sPtr++ );\ + if ( dPtr >= ePtr ) break;\ + *(dPtr++) = XMVectorSelect( defvec, v, g_XMSelect1100 );\ + }\ + return true;\ + }\ + return 
false; + +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 6101) +#endif + _Success_(return) + bool LoadScanline( + _Out_writes_(count) DirectX::XMVECTOR* pDestination, + size_t count, + _In_reads_bytes_(size) LPCVOID pSource, + size_t size, + DXGI_FORMAT format) + { + assert(pDestination && count > 0 && ((reinterpret_cast(pDestination) & 0xF) == 0)); + assert(pSource && size > 0); + + using namespace DirectX::PackedVector; + + XMVECTOR* __restrict dPtr = pDestination; + if (!dPtr) + return false; + + const XMVECTOR* ePtr = pDestination + count; + + switch (format) + { + case DXGI_FORMAT_R32G32B32A32_FLOAT: + { + size_t msize = (size > (sizeof(XMVECTOR)*count)) ? (sizeof(XMVECTOR)*count) : size; + memcpy_s(dPtr, sizeof(XMVECTOR)*count, pSource, msize); + } + return true; + + case DXGI_FORMAT_R32G32B32_FLOAT: + LOAD_SCANLINE3(XMFLOAT3, XMLoadFloat3, g_XMIdentityR3) + + case DXGI_FORMAT_R16G16B16A16_FLOAT: + LOAD_SCANLINE(XMHALF4, XMLoadHalf4) + + case DXGI_FORMAT_R32G32_FLOAT: + LOAD_SCANLINE2(XMFLOAT2, XMLoadFloat2, g_XMIdentityR3) + + case DXGI_FORMAT_R11G11B10_FLOAT: + LOAD_SCANLINE3(XMFLOAT3PK, XMLoadFloat3PK, g_XMIdentityR3) + + case DXGI_FORMAT_R16G16_FLOAT: + LOAD_SCANLINE2(XMHALF2, XMLoadHalf2, g_XMIdentityR3) + + case DXGI_FORMAT_R32_FLOAT: + if (size >= sizeof(float)) + { + const float* __restrict sPtr = reinterpret_cast(pSource); + for (size_t icount = 0; icount < size; icount += sizeof(float)) + { + XMVECTOR v = XMLoadFloat(sPtr++); + if (dPtr >= ePtr) break; + *(dPtr++) = XMVectorSelect(g_XMIdentityR3, v, g_XMSelect1000); + } + return true; + } + return false; + + case DXGI_FORMAT_R16_FLOAT: + if (size >= sizeof(HALF)) + { + const HALF * __restrict sPtr = reinterpret_cast(pSource); + for (size_t icount = 0; icount < size; icount += sizeof(HALF)) + { + if (dPtr >= ePtr) break; + *(dPtr++) = XMVectorSet(XMConvertHalfToFloat(*sPtr++), 0.f, 0.f, 1.f); + } + return true; + } + return false; + + default: + return false; + } + } +#ifdef _MSC_VER +#pragma warning(pop) +#endif +} // namespace anonymous + +//------------------------------------------------------------------------------------- +// Projects a function represented in a cube map into spherical harmonics. 
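In outline (editorial sketch of the routine below, not additional patch content): every texel of the six faces is mapped to a direction on the cube, its color is weighted by an approximate differential solid angle

    fDiffSolid = 4.0f / ((1.0f + u*u + v*v) * sqrtf(1.0f + u*u + v*v));

the weighted sample is accumulated into the per-channel coefficient arrays via XMSHEvalDirection, XMSHScale and XMSHAdd, and the totals are finally rescaled by 4*pi divided by the sum of all the weights so that the discrete weights integrate to the full sphere.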
+// +// http://msdn.microsoft.com/en-us/library/windows/desktop/ff476300.aspx +//------------------------------------------------------------------------------------- +_Use_decl_annotations_ +HRESULT DirectX::SHProjectCubeMap( + ID3D11DeviceContext *context, + size_t order, + ID3D11Texture2D *cubeMap, + float *resultR, + float *resultG, + float* resultB) noexcept +{ + if (!context || !cubeMap) + return E_INVALIDARG; + + if (order < XM_SH_MINORDER || order > XM_SH_MAXORDER) + return E_INVALIDARG; + + D3D11_TEXTURE2D_DESC desc; + cubeMap->GetDesc(&desc); + + if ((desc.ArraySize != 6) + || (desc.Width != desc.Height) + || (desc.SampleDesc.Count > 1)) + return E_FAIL; + + switch (desc.Format) + { + case DXGI_FORMAT_R32G32B32A32_FLOAT: + case DXGI_FORMAT_R32G32B32_FLOAT: + case DXGI_FORMAT_R16G16B16A16_FLOAT: + case DXGI_FORMAT_R32G32_FLOAT: + case DXGI_FORMAT_R11G11B10_FLOAT: + case DXGI_FORMAT_R16G16_FLOAT: + case DXGI_FORMAT_R32_FLOAT: + case DXGI_FORMAT_R16_FLOAT: + // See LoadScanline to support more pixel formats + break; + + default: + return E_FAIL; + } + + //--- Create a staging resource copy (if needed) to be able to read data + ID3D11Texture2D* texture = nullptr; + + ComPtr staging; + if (!(desc.CPUAccessFlags & D3D11_CPU_ACCESS_READ)) + { + D3D11_TEXTURE2D_DESC sdesc = desc; + sdesc.BindFlags = 0; + sdesc.CPUAccessFlags = D3D11_CPU_ACCESS_READ; + sdesc.Usage = D3D11_USAGE_STAGING; + + ComPtr device; + context->GetDevice(&device); + + HRESULT hr = device->CreateTexture2D(&sdesc, nullptr, &staging); + if (FAILED(hr)) + return hr; + + context->CopyResource(staging.Get(), cubeMap); + + texture = staging.Get(); + } + else + texture = cubeMap; + + assert(texture != nullptr); + + //--- Setup for SH projection + ScopedAlignedArrayXMVECTOR scanline(reinterpret_cast(_aligned_malloc(sizeof(XMVECTOR)*desc.Width, 16))); + if (!scanline) + return E_OUTOFMEMORY; + + assert(desc.Width > 0); + float fSize = static_cast(desc.Width); + float fPicSize = 1.0f / fSize; + + // index from [0,W-1], f(0) maps to -1 + 1/W, f(W-1) maps to 1 - 1/w + // linear function x*S +B, 1st constraint means B is (-1+1/W), plug into + // second and solve for S: S = 2*(1-1/W)/(W-1). The old code that did + // this was incorrect - but only for computing the differential solid + // angle, where the final value was 1.0 instead of 1-1/w... + + float fB = -1.0f + 1.0f / fSize; + float fS = (desc.Width > 1) ? 
(2.0f*(1.0f - 1.0f / fSize) / (fSize - 1.0f)) : 0.f; + + // clear out accumulation variables + float fWt = 0.0f; + + if (resultR) + memset(resultR, 0, sizeof(float)*order*order); + if (resultG) + memset(resultG, 0, sizeof(float)*order*order); + if (resultB) + memset(resultB, 0, sizeof(float)*order*order); + + float shBuff[XM_SH_MAXORDER*XM_SH_MAXORDER] = {}; + float shBuffB[XM_SH_MAXORDER*XM_SH_MAXORDER] = {}; + + //--- Process each face of the cubemap + for (UINT face = 0; face < 6; ++face) + { + UINT dindex = D3D11CalcSubresource(0, face, desc.MipLevels); + + D3D11_MAPPED_SUBRESOURCE mapped; + HRESULT hr = context->Map(texture, dindex, D3D11_MAP_READ, 0, &mapped); + if (FAILED(hr)) + return hr; + + const uint8_t *pSrc = reinterpret_cast(mapped.pData); + for (UINT y = 0; y < desc.Height; ++y) + { + XMVECTOR* ptr = scanline.get(); + if (!LoadScanline(ptr, desc.Width, pSrc, mapped.RowPitch, desc.Format)) + { + context->Unmap(texture, dindex); + return E_FAIL; + } + + const float v = float(y) * fS + fB; + + XMVECTOR* pixel = ptr; + for (UINT x = 0; x < desc.Width; ++x, ++pixel) + { + const float u = float(x) * fS + fB; + + float ix, iy, iz; + switch (face) + { + case 0: // Positive X + iz = 1.0f - (2.0f * float(x) + 1.0f) * fPicSize; + iy = 1.0f - (2.0f * float(y) + 1.0f) * fPicSize; + ix = 1.0f; + break; + + case 1: // Negative X + iz = -1.0f + (2.0f * float(x) + 1.0f) * fPicSize; + iy = 1.0f - (2.0f * float(y) + 1.0f) * fPicSize; + ix = -1; + break; + + case 2: // Positive Y + iz = -1.0f + (2.0f * float(y) + 1.0f) * fPicSize; + iy = 1.0f; + ix = -1.0f + (2.0f * float(x) + 1.0f) * fPicSize; + break; + + case 3: // Negative Y + iz = 1.0f - (2.0f * float(y) + 1.0f) * fPicSize; + iy = -1.0f; + ix = -1.0f + (2.0f * float(x) + 1.0f) * fPicSize; + break; + + case 4: // Positive Z + iz = 1.0f; + iy = 1.0f - (2.0f * float(y) + 1.0f) * fPicSize; + ix = -1.0f + (2.0f * float(x) + 1.0f) * fPicSize; + break; + + case 5: // Negative Z + iz = -1.0f; + iy = 1.0f - (2.0f * float(y) + 1.0f) * fPicSize; + ix = 1.0f - (2.0f * float(x) + 1.0f) * fPicSize; + break; + + default: + ix = iy = iz = 0.f; + assert(false); + break; + } + + XMVECTOR dir = XMVectorSet(ix, iy, iz, 0); + dir = XMVector3Normalize(dir); + + const float fDiffSolid = 4.0f / ((1.0f + u * u + v * v)*sqrtf(1.0f + u * u + v * v)); + fWt += fDiffSolid; + + XMSHEvalDirection(shBuff, order, dir); + + XMFLOAT3A clr; + XMStoreFloat3A(&clr, *pixel); + + if (resultR) XMSHAdd(resultR, order, resultR, XMSHScale(shBuffB, order, shBuff, clr.x*fDiffSolid)); + if (resultG) XMSHAdd(resultG, order, resultG, XMSHScale(shBuffB, order, shBuff, clr.y*fDiffSolid)); + if (resultB) XMSHAdd(resultB, order, resultB, XMSHScale(shBuffB, order, shBuff, clr.z*fDiffSolid)); + } + + pSrc += mapped.RowPitch; + } + + context->Unmap(texture, dindex); + } + + const float fNormProj = (4.0f*XM_PI) / fWt; + + if (resultR) XMSHScale(resultR, order, resultR, fNormProj); + if (resultG) XMSHScale(resultG, order, resultG, fNormProj); + if (resultB) XMSHScale(resultB, order, resultB, fNormProj); + + return S_OK; +} diff --git a/src/thirdparty/DirectXMath-dec2022/SHMath/DirectXSHD3D12.cpp b/src/thirdparty/DirectXMath-dec2022/SHMath/DirectXSHD3D12.cpp new file mode 100644 index 000000000..5ef93527d --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/SHMath/DirectXSHD3D12.cpp @@ -0,0 +1,339 @@ +//------------------------------------------------------------------------------------- +// DirectXSHD3D12.cpp -- C++ Spherical Harmonics Math Library +// +// Copyright (c) Microsoft Corporation. 
+// Licensed under the MIT License. +// +// http://go.microsoft.com/fwlink/p/?LinkId=262885 +//------------------------------------------------------------------------------------- + +#ifdef _MSC_VER +#pragma warning( disable : 4616 4619 4061 4265 4626 5039 ) +// C4616/C4619 #pragma warning warnings +// C4061 numerator 'identifier' in switch of enum 'enumeration' is not explicitly handled by a case label +// C4265 class has virtual functions, but destructor is not virtual +// C4626 assignment operator was implicitly defined as deleted +// C5039 pointer or reference to potentially throwing function passed to extern C function under - EHc +#endif + +#include + +#include "DirectXSH.h" + +#include + +#include +#include +#include + +#include + +#ifdef __clang__ +#pragma clang diagnostic ignored "-Wcovered-switch-default" +#pragma clang diagnostic ignored "-Wswitch-enum" +#endif + +using namespace DirectX; + +using Microsoft::WRL::ComPtr; + +namespace +{ + struct aligned_deleter { void operator()(void* p) { _aligned_free(p); } }; + + using ScopedAlignedArrayXMVECTOR = std::unique_ptr; + + //------------------------------------------------------------------------------------- + // This code is lifted from DirectXTex http://go.microsoft.com/fwlink/?LinkId=248926 + // If you need additional DXGI format support, see DirectXTexConvert.cpp + //------------------------------------------------------------------------------------- +#define LOAD_SCANLINE( type, func )\ + if ( size >= sizeof(type) )\ + {\ + const type * __restrict sPtr = reinterpret_cast(pSource);\ + for( size_t icount = 0; icount < ( size - sizeof(type) + 1 ); icount += sizeof(type) )\ + {\ + if ( dPtr >= ePtr ) break;\ + *(dPtr++) = func( sPtr++ );\ + }\ + return true;\ + }\ + return false; + +#define LOAD_SCANLINE3( type, func, defvec )\ + if ( size >= sizeof(type) )\ + {\ + const type * __restrict sPtr = reinterpret_cast(pSource);\ + for( size_t icount = 0; icount < ( size - sizeof(type) + 1 ); icount += sizeof(type) )\ + {\ + XMVECTOR v = func( sPtr++ );\ + if ( dPtr >= ePtr ) break;\ + *(dPtr++) = XMVectorSelect( defvec, v, g_XMSelect1110 );\ + }\ + return true;\ + }\ + return false; + +#define LOAD_SCANLINE2( type, func, defvec )\ + if ( size >= sizeof(type) )\ + {\ + const type * __restrict sPtr = reinterpret_cast(pSource);\ + for( size_t icount = 0; icount < ( size - sizeof(type) + 1 ); icount += sizeof(type) )\ + {\ + XMVECTOR v = func( sPtr++ );\ + if ( dPtr >= ePtr ) break;\ + *(dPtr++) = XMVectorSelect( defvec, v, g_XMSelect1100 );\ + }\ + return true;\ + }\ + return false; + +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 6101) +#endif + _Success_(return) + bool LoadScanline( + _Out_writes_(count) DirectX::XMVECTOR* pDestination, + size_t count, + _In_reads_bytes_(size) LPCVOID pSource, + size_t size, + DXGI_FORMAT format) + { + assert(pDestination && count > 0 && ((reinterpret_cast(pDestination) & 0xF) == 0)); + assert(pSource && size > 0); + + using namespace DirectX::PackedVector; + + XMVECTOR* __restrict dPtr = pDestination; + if (!dPtr) + return false; + + const XMVECTOR* ePtr = pDestination + count; + + switch (format) + { + case DXGI_FORMAT_R32G32B32A32_FLOAT: + { + size_t msize = (size > (sizeof(XMVECTOR)*count)) ? 
(sizeof(XMVECTOR)*count) : size; + memcpy_s(dPtr, sizeof(XMVECTOR)*count, pSource, msize); + } + return true; + + case DXGI_FORMAT_R32G32B32_FLOAT: + LOAD_SCANLINE3(XMFLOAT3, XMLoadFloat3, g_XMIdentityR3) + + case DXGI_FORMAT_R16G16B16A16_FLOAT: + LOAD_SCANLINE(XMHALF4, XMLoadHalf4) + + case DXGI_FORMAT_R32G32_FLOAT: + LOAD_SCANLINE2(XMFLOAT2, XMLoadFloat2, g_XMIdentityR3) + + case DXGI_FORMAT_R11G11B10_FLOAT: + LOAD_SCANLINE3(XMFLOAT3PK, XMLoadFloat3PK, g_XMIdentityR3) + + case DXGI_FORMAT_R16G16_FLOAT: + LOAD_SCANLINE2(XMHALF2, XMLoadHalf2, g_XMIdentityR3) + + case DXGI_FORMAT_R32_FLOAT: + if (size >= sizeof(float)) + { + const float* __restrict sPtr = reinterpret_cast(pSource); + for (size_t icount = 0; icount < size; icount += sizeof(float)) + { + XMVECTOR v = XMLoadFloat(sPtr++); + if (dPtr >= ePtr) break; + *(dPtr++) = XMVectorSelect(g_XMIdentityR3, v, g_XMSelect1000); + } + return true; + } + return false; + + case DXGI_FORMAT_R16_FLOAT: + if (size >= sizeof(HALF)) + { + const HALF * __restrict sPtr = reinterpret_cast(pSource); + for (size_t icount = 0; icount < size; icount += sizeof(HALF)) + { + if (dPtr >= ePtr) break; + *(dPtr++) = XMVectorSet(XMConvertHalfToFloat(*sPtr++), 0.f, 0.f, 1.f); + } + return true; + } + return false; + + default: + return false; + } + } +#ifdef _MSC_VER +#pragma warning(pop) +#endif +} // namespace anonymous + +//------------------------------------------------------------------------------------- +// Projects a function represented in a cube map into spherical harmonics. +// +// http://msdn.microsoft.com/en-us/library/windows/desktop/ff476300.aspx +//------------------------------------------------------------------------------------- +_Use_decl_annotations_ +HRESULT DirectX::SHProjectCubeMap( + size_t order, + const D3D12_RESOURCE_DESC& desc, + const D3D12_SUBRESOURCE_DATA cubeMap[6], + float *resultR, + float *resultG, + float *resultB) noexcept +{ + if (order < XM_SH_MINORDER || order > XM_SH_MAXORDER) + return E_INVALIDARG; + + if (desc.Dimension != D3D12_RESOURCE_DIMENSION_TEXTURE2D + || (desc.DepthOrArraySize != 6) + || (desc.Width != desc.Height) + || (desc.SampleDesc.Count > 1)) + return E_FAIL; + + switch (desc.Format) + { + case DXGI_FORMAT_R32G32B32A32_FLOAT: + case DXGI_FORMAT_R32G32B32_FLOAT: + case DXGI_FORMAT_R16G16B16A16_FLOAT: + case DXGI_FORMAT_R32G32_FLOAT: + case DXGI_FORMAT_R11G11B10_FLOAT: + case DXGI_FORMAT_R16G16_FLOAT: + case DXGI_FORMAT_R32_FLOAT: + case DXGI_FORMAT_R16_FLOAT: + // See LoadScanline to support more pixel formats + break; + + default: + return E_FAIL; + } + + //--- Setup for SH projection + ScopedAlignedArrayXMVECTOR scanline(reinterpret_cast(_aligned_malloc(static_cast(sizeof(XMVECTOR)*desc.Width), 16))); + if (!scanline) + return E_OUTOFMEMORY; + + assert(desc.Width > 0); + float fSize = static_cast(desc.Width); + float fPicSize = 1.0f / fSize; + + // index from [0,W-1], f(0) maps to -1 + 1/W, f(W-1) maps to 1 - 1/w + // linear function x*S +B, 1st constraint means B is (-1+1/W), plug into + // second and solve for S: S = 2*(1-1/W)/(W-1). The old code that did + // this was incorrect - but only for computing the differential solid + // angle, where the final value was 1.0 instead of 1-1/w... + + float fB = -1.0f + 1.0f / fSize; + float fS = (desc.Width > 1) ? 
(2.0f*(1.0f - 1.0f / fSize) / (fSize - 1.0f)) : 0.f; + + // clear out accumulation variables + float fWt = 0.0f; + + if (resultR) + memset(resultR, 0, sizeof(float)*order*order); + if (resultG) + memset(resultG, 0, sizeof(float)*order*order); + if (resultB) + memset(resultB, 0, sizeof(float)*order*order); + + float shBuff[XM_SH_MAXORDER*XM_SH_MAXORDER] = {}; + float shBuffB[XM_SH_MAXORDER*XM_SH_MAXORDER] = {}; + + //--- Process each face of the cubemap + for (UINT face = 0; face < 6; ++face) + { + if (!cubeMap[face].pData) + return E_POINTER; + + const uint8_t *pSrc = reinterpret_cast(cubeMap[face].pData); + for (UINT y = 0; y < desc.Height; ++y) + { + XMVECTOR* ptr = scanline.get(); + if (!LoadScanline(ptr, static_cast(desc.Width), pSrc, static_cast(cubeMap[face].RowPitch), desc.Format)) + { + return E_FAIL; + } + + const float v = float(y) * fS + fB; + + XMVECTOR* pixel = ptr; + for (UINT x = 0; x < desc.Width; ++x, ++pixel) + { + const float u = float(x) * fS + fB; + + float ix, iy, iz; + switch (face) + { + case 0: // Positive X + iz = 1.0f - (2.0f * float(x) + 1.0f) * fPicSize; + iy = 1.0f - (2.0f * float(y) + 1.0f) * fPicSize; + ix = 1.0f; + break; + + case 1: // Negative X + iz = -1.0f + (2.0f * float(x) + 1.0f) * fPicSize; + iy = 1.0f - (2.0f * float(y) + 1.0f) * fPicSize; + ix = -1; + break; + + case 2: // Positive Y + iz = -1.0f + (2.0f * float(y) + 1.0f) * fPicSize; + iy = 1.0f; + ix = -1.0f + (2.0f * float(x) + 1.0f) * fPicSize; + break; + + case 3: // Negative Y + iz = 1.0f - (2.0f * float(y) + 1.0f) * fPicSize; + iy = -1.0f; + ix = -1.0f + (2.0f * float(x) + 1.0f) * fPicSize; + break; + + case 4: // Positive Z + iz = 1.0f; + iy = 1.0f - (2.0f * float(y) + 1.0f) * fPicSize; + ix = -1.0f + (2.0f * float(x) + 1.0f) * fPicSize; + break; + + case 5: // Negative Z + iz = -1.0f; + iy = 1.0f - (2.0f * float(y) + 1.0f) * fPicSize; + ix = 1.0f - (2.0f * float(x) + 1.0f) * fPicSize; + break; + + default: + ix = iy = iz = 0.f; + assert(false); + break; + } + + XMVECTOR dir = XMVectorSet(ix, iy, iz, 0); + dir = XMVector3Normalize(dir); + + const float fDiffSolid = 4.0f / ((1.0f + u * u + v * v)*sqrtf(1.0f + u * u + v * v)); + fWt += fDiffSolid; + + XMSHEvalDirection(shBuff, order, dir); + + XMFLOAT3A clr; + XMStoreFloat3A(&clr, *pixel); + + if (resultR) XMSHAdd(resultR, order, resultR, XMSHScale(shBuffB, order, shBuff, clr.x*fDiffSolid)); + if (resultG) XMSHAdd(resultG, order, resultG, XMSHScale(shBuffB, order, shBuff, clr.y*fDiffSolid)); + if (resultB) XMSHAdd(resultB, order, resultB, XMSHScale(shBuffB, order, shBuff, clr.z*fDiffSolid)); + } + + pSrc += cubeMap[face].RowPitch; + } + } + + const float fNormProj = (4.0f*XM_PI) / fWt; + + if (resultR) XMSHScale(resultR, order, resultR, fNormProj); + if (resultG) XMSHScale(resultG, order, resultG, fNormProj); + if (resultB) XMSHScale(resultB, order, resultB, fNormProj); + + return S_OK; +} diff --git a/src/thirdparty/DirectXMath-dec2022/Stereo3D/Stereo3DMatrixHelper.cpp b/src/thirdparty/DirectXMath-dec2022/Stereo3D/Stereo3DMatrixHelper.cpp new file mode 100644 index 000000000..6e49b6cad --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/Stereo3D/Stereo3DMatrixHelper.cpp @@ -0,0 +1,257 @@ +//------------------------------------------------------------------------------------- +// Stereo3DMatrixHelper.cpp -- SIMD C++ Math helper for Stereo 3D matricies +// +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
+//------------------------------------------------------------------------------------- + +#include "Stereo3DMatrixHelper.h" + +using namespace DirectX; + +namespace +{ + inline bool StereoProjectionHelper + ( + const STEREO_PARAMETERS& stereoParameters, + _Out_ float* fVirtualProjection, + _Out_ float* zNearWidth, + _Out_ float* zNearHeight, + float FovAngleY, + float AspectRatio, + float NearZ + ) + { + // note that most people have difficulty fusing images into 3D + // if the separation equals even just the human average. by + // reducing the separation (interocular distance) by 1/2, we + // guarantee a larger subset of people will see full 3D + + // the conservative setting should always be used. the only problem + // with the conservative setting is that the 3D effect will be less + // impressive on smaller screens (which makes sense, since your eye + // cannot be tricked as easily based on the smaller fov). to simulate + // the effect of a larger screen, use the liberal settings (debug only) + + // Conservative Settings: * max acuity angle: 0.8f degrees * interoc distance: 1.25 inches + + // Liberal Settings: * max acuity angle: 1.6f degrees * interoc distance: 2.5f inches + + // maximum visual accuity angle allowed is 3.2 degrees for + // a physical scene, and 1.6 degrees for a virtual one. + // thus we cannot allow an object to appear any closer to + // the viewer than 1.6 degrees (divided by two for most + // half-angle calculations) + + static const float fMaxStereoDistance = 780; // inches (should be between 10 and 20m) + static const float fMaxVisualAcuityAngle = 1.6f * (XM_PI / 180.0f); // radians + static const float fInterocularDistance = 1.25f; // inches + + float fDisplayHeight = stereoParameters.fDisplaySizeInches / sqrtf(AspectRatio * AspectRatio + 1.0f); + float fDisplayWidth = fDisplayHeight * AspectRatio; + float fHalfInterocular = 0.5f * fInterocularDistance * stereoParameters.fStereoExaggerationFactor; + float fHalfPixelWidth = fDisplayWidth / stereoParameters.fPixelResolutionWidth * 0.5f; + float fHalfMaximumAcuityAngle = fMaxVisualAcuityAngle * 0.5f * stereoParameters.fStereoExaggerationFactor; + // float fHalfWidth = fDisplayWidth * 0.5f; + + float fMaxSeparationAcuityAngle = atanf(fHalfInterocular / fMaxStereoDistance); + float fMaxSeparationDistance = fHalfPixelWidth / tanf(fMaxSeparationAcuityAngle); + float fRefinedMaxStereoDistance = fMaxStereoDistance - fMaxSeparationDistance; + float fFovHalfAngle = FovAngleY / 2.0f; + + bool ComfortableResult = true; + if (fRefinedMaxStereoDistance < 0.0f || fMaxSeparationDistance > 0.1f * fMaxStereoDistance) + { + // Pixel resolution is too low to offer a comfortable stereo experience + ComfortableResult = false; + } + + float fRefinedMaxSeparationAcuityAngle = atanf(fHalfInterocular / (fRefinedMaxStereoDistance)); + float fPhysicalZNearDistance = fHalfInterocular / tanf(fHalfMaximumAcuityAngle); + // float fScalingFactor = fHalfMaximumAcuityAngle / atanf(fHalfInterocular / stereoParameters.fViewerDistanceInches); + + float fNearZSeparation = tanf(fRefinedMaxSeparationAcuityAngle) * (fRefinedMaxStereoDistance - fPhysicalZNearDistance); + // float fNearZSeparation2 = fHalfInterocular * (fRefinedMaxStereoDistance - fPhysicalZNearDistance) / fRefinedMaxStereoDistance; + + (*zNearHeight) = cosf(fFovHalfAngle) / sinf(fFovHalfAngle); + (*zNearWidth) = (*zNearHeight) / AspectRatio; + (*fVirtualProjection) = (fNearZSeparation * NearZ * (*zNearWidth * 4.0f)) / (2.0f * NearZ); + + return ComfortableResult; + } +} + 
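A usage sketch for the public helpers defined below (illustration only, not part of the patch); passing nullptr for the stereo parameters falls back to StereoCreateDefaultParameters:

    XMMATRIX projL = StereoProjectionFovLH(nullptr, STEREO_CHANNEL_LEFT,
                                           XM_PIDIV4, 16.0f / 9.0f, 0.1f, 100.0f);
    XMMATRIX projR = StereoProjectionFovLH(nullptr, STEREO_CHANNEL_RIGHT,
                                           XM_PIDIV4, 16.0f / 9.0f, 0.1f, 100.0f);
    // Render the scene once with each matrix to produce the left-eye and
    // right-eye images; STEREO_MODE_NORMAL is the default final argument.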
+//------------------------------------------------------------------------------ + +void DirectX::StereoCreateDefaultParameters +( + STEREO_PARAMETERS& stereoParameters +) +{ + // Default assumption is 1920x1200 resolution, a 22" LCD monitor, and a 2' viewing distance + stereoParameters.fViewerDistanceInches = 24.0f; + stereoParameters.fPixelResolutionWidth = 1920.0f; + stereoParameters.fPixelResolutionHeight = 1200.0f; + stereoParameters.fDisplaySizeInches = 22.0f; + + stereoParameters.fStereoSeparationFactor = 1.0f; + stereoParameters.fStereoExaggerationFactor = 1.0f; +} + +//------------------------------------------------------------------------------ + +XMMATRIX DirectX::StereoProjectionFovLH +( + _In_opt_ const STEREO_PARAMETERS* pStereoParameters, + STEREO_CHANNEL Channel, + float FovAngleY, + float AspectRatio, + float NearZ, + float FarZ, + STEREO_MODE StereoMode +) +{ + assert(Channel == STEREO_CHANNEL_LEFT || Channel == STEREO_CHANNEL_RIGHT); + assert(StereoMode == STEREO_MODE_NORMAL || StereoMode == STEREO_MODE_INVERTED); + assert(!XMScalarNearEqual(FovAngleY, 0.0f, 0.00001f * 2.0f)); + assert(!XMScalarNearEqual(AspectRatio, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + + STEREO_PARAMETERS DefaultParameters = {}; + if (pStereoParameters == nullptr) + { + StereoCreateDefaultParameters(DefaultParameters); + pStereoParameters = &DefaultParameters; + } + + assert(pStereoParameters->fStereoSeparationFactor >= 0.0f && pStereoParameters->fStereoSeparationFactor <= 1.0f); + assert(pStereoParameters->fStereoExaggerationFactor >= 1.0f && pStereoParameters->fStereoExaggerationFactor <= 2.0f); + + float fVirtualProjection = 0.0f; + float zNearWidth = 0.0f; + float zNearHeight = 0.0f; + StereoProjectionHelper(*pStereoParameters, &fVirtualProjection, &zNearWidth, &zNearHeight, FovAngleY, AspectRatio, NearZ); + + fVirtualProjection *= pStereoParameters->fStereoSeparationFactor; // incorporate developer defined bias + + // + // By applying a translation, we are forcing our cameras to be parallel + // + + float fInvertedAngle = atanf(fVirtualProjection / (2.0f * NearZ)); + + XMMATRIX proj = XMMatrixPerspectiveFovLH(FovAngleY, AspectRatio, NearZ, FarZ); + + XMMATRIX patchedProjection; + if (Channel == STEREO_CHANNEL_LEFT) + { + if (StereoMode > STEREO_MODE_NORMAL) + { + XMMATRIX rots = XMMatrixRotationY(fInvertedAngle); + XMMATRIX trans = XMMatrixTranslation(-fVirtualProjection, 0, 0); + patchedProjection = XMMatrixMultiply(XMMatrixMultiply(rots, trans), proj); + } + else + { + XMMATRIX trans = XMMatrixTranslation(-fVirtualProjection, 0, 0); + patchedProjection = XMMatrixMultiply(trans, proj); + } + } + else + { + if (StereoMode > STEREO_MODE_NORMAL) + { + XMMATRIX rots = XMMatrixRotationY(-fInvertedAngle); + XMMATRIX trans = XMMatrixTranslation(fVirtualProjection, 0, 0); + patchedProjection = XMMatrixMultiply(XMMatrixMultiply(rots, trans), proj); + } + else + { + XMMATRIX trans = XMMatrixTranslation(fVirtualProjection, 0, 0); + patchedProjection = XMMatrixMultiply(trans, proj); + } + } + + return patchedProjection; +} + +//------------------------------------------------------------------------------ + +XMMATRIX DirectX::StereoProjectionFovRH +( + _In_opt_ const STEREO_PARAMETERS* pStereoParameters, + STEREO_CHANNEL Channel, + float FovAngleY, + float AspectRatio, + float NearZ, + float FarZ, + STEREO_MODE StereoMode +) +{ + assert(Channel == STEREO_CHANNEL_LEFT || Channel == STEREO_CHANNEL_RIGHT); + assert(StereoMode == STEREO_MODE_NORMAL || StereoMode == 
STEREO_MODE_INVERTED); + assert(!XMScalarNearEqual(FovAngleY, 0.0f, 0.00001f * 2.0f)); + assert(!XMScalarNearEqual(AspectRatio, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + + STEREO_PARAMETERS DefaultParameters = {}; + if (pStereoParameters == nullptr) + { + StereoCreateDefaultParameters(DefaultParameters); + pStereoParameters = &DefaultParameters; + } + + assert(pStereoParameters->fStereoSeparationFactor >= 0.0f && pStereoParameters->fStereoSeparationFactor <= 1.0f); + assert(pStereoParameters->fStereoExaggerationFactor >= 1.0f && pStereoParameters->fStereoExaggerationFactor <= 2.0f); + + float fVirtualProjection = 0.0f; + float zNearWidth = 0.0f; + float zNearHeight = 0.0f; + StereoProjectionHelper(*pStereoParameters, &fVirtualProjection, &zNearWidth, &zNearHeight, FovAngleY, AspectRatio, NearZ); + + fVirtualProjection *= pStereoParameters->fStereoSeparationFactor; // incorporate developer defined bias + + // + // By applying a translation, we are forcing our cameras to be parallel + // + + float fInvertedAngle = atanf(fVirtualProjection / (2.0f * NearZ)); + + XMMATRIX proj = XMMatrixPerspectiveFovRH(FovAngleY, AspectRatio, NearZ, FarZ); + + // + // By applying a translation, we are forcing our cameras to be parallel + // + + XMMATRIX patchedProjection; + if (Channel == STEREO_CHANNEL_LEFT) + { + if (StereoMode > STEREO_MODE_NORMAL) + { + XMMATRIX rots = XMMatrixRotationY(fInvertedAngle); + XMMATRIX trans = XMMatrixTranslation(-fVirtualProjection, 0, 0); + patchedProjection = XMMatrixMultiply(XMMatrixMultiply(rots, trans), proj); + } + else + { + XMMATRIX trans = XMMatrixTranslation(-fVirtualProjection, 0, 0); + patchedProjection = XMMatrixMultiply(trans, proj); + } + } + else + { + if (StereoMode > STEREO_MODE_NORMAL) + { + XMMATRIX rots = XMMatrixRotationY(-fInvertedAngle); + XMMATRIX trans = XMMatrixTranslation(fVirtualProjection, 0, 0); + patchedProjection = XMMatrixMultiply(XMMatrixMultiply(rots, trans), proj); + } + else + { + XMMATRIX trans = XMMatrixTranslation(fVirtualProjection, 0, 0); + patchedProjection = XMMatrixMultiply(trans, proj); + } + } + + return patchedProjection; +} diff --git a/src/thirdparty/DirectXMath-dec2022/Stereo3D/Stereo3DMatrixHelper.h b/src/thirdparty/DirectXMath-dec2022/Stereo3D/Stereo3DMatrixHelper.h new file mode 100644 index 000000000..412d0350a --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/Stereo3D/Stereo3DMatrixHelper.h @@ -0,0 +1,64 @@ +//------------------------------------------------------------------------------------- +// Stereo3DMatrixHelper.h -- SIMD C++ Math helper for Stereo 3D matrices +// +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. +//------------------------------------------------------------------------------------- + +#pragma once + +#include "DirectXMath.h" + +namespace DirectX +{ + // Enumeration for stereo channels (left and right). + enum STEREO_CHANNEL + { + STEREO_CHANNEL_LEFT = 0, + STEREO_CHANNEL_RIGHT + }; + + // Enumeration for stereo mode (normal or inverted). + enum STEREO_MODE + { + STEREO_MODE_NORMAL = 0, + STEREO_MODE_INVERTED, + }; + + //------------------------------------------------------------------------------ + // + // Stereo calibration settings + // + // * Viewer distance to the display + // * Physical display size + // * Render resolution + // + // The stereo separation factor indicates how much separation is between the left and right + // eyes. 0 is no separation, 1 is full separation. It defaults to 1.0. 
+    //
+    // The debug stereo exaggeration factor indicates how much to increase the interocular spacing and
+    // maximum acuity angle from comfortable defaults. For retail builds, this value should always
+    // be 1.0, but during development, on small screens, this value can be raised to up to 2.0 in
+    // order to exaggerate the 3D effect. Values over 1.0 may cause discomfort on normal sized
+    // displays. It defaults to 1.0.
+    //
+    struct STEREO_PARAMETERS
+    {
+        float fViewerDistanceInches;
+        float fDisplaySizeInches;
+        float fPixelResolutionWidth;
+        float fPixelResolutionHeight;
+        float fStereoSeparationFactor;
+        float fStereoExaggerationFactor;
+    };
+
+    void StereoCreateDefaultParameters(STEREO_PARAMETERS& stereoParameters);
+
+    XMMATRIX StereoProjectionFovLH(_In_opt_ const STEREO_PARAMETERS* pStereoParameters,
+        STEREO_CHANNEL Channel, float FovAngleY, float AspectRatio, float NearZ, float FarZ,
+        STEREO_MODE StereoMode = STEREO_MODE_NORMAL);
+
+    XMMATRIX StereoProjectionFovRH(_In_opt_ const STEREO_PARAMETERS* pStereoParameters,
+        STEREO_CHANNEL Channel, float FovAngleY, float AspectRatio, float NearZ, float FarZ,
+        STEREO_MODE StereoMode = STEREO_MODE_NORMAL);
+}
\ No newline at end of file
diff --git a/src/thirdparty/DirectXMath-dec2022/XDSP/XDSP.h b/src/thirdparty/DirectXMath-dec2022/XDSP/XDSP.h
new file mode 100644
index 000000000..eabd64ab0
--- /dev/null
+++ b/src/thirdparty/DirectXMath-dec2022/XDSP/XDSP.h
@@ -0,0 +1,871 @@
+//--------------------------------------------------------------------------------------
+// File: XDSP.h
+//
+// DirectXMath based Digital Signal Processing (DSP) functions for audio,
+// primarily Fast Fourier Transform (FFT)
+//
+// All buffer parameters must be 16-byte aligned
+//
+// All FFT functions support only single-precision floating-point audio
+//
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+//
+// http://go.microsoft.com/fwlink/?LinkID=615557
+//--------------------------------------------------------------------------------------
+
+#pragma once
+
+#include <cassert>
+#include <cstdint>
+
+#include <DirectXMath.h>
+#include <DirectXPackedVector.h>
+
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable: 6001 6262)
+#endif
+
+namespace XDSP
+{
+    using XMVECTOR = DirectX::XMVECTOR;
+    using FXMVECTOR = DirectX::FXMVECTOR;
+    using GXMVECTOR = DirectX::GXMVECTOR;
+    using CXMVECTOR = DirectX::CXMVECTOR;
+    using XMFLOAT4A = DirectX::XMFLOAT4A;
+
+    inline bool ISPOWEROF2(size_t n) { return (((n)&((n)-1)) == 0 && (n) != 0); }
+
+    // Parallel multiplication of four complex numbers, assuming real and imaginary values are stored in separate vectors.
+ inline void XM_CALLCONV vmulComplex( + _Out_ XMVECTOR& rResult, _Out_ XMVECTOR& iResult, + _In_ FXMVECTOR r1, _In_ FXMVECTOR i1, _In_ FXMVECTOR r2, _In_ GXMVECTOR i2) noexcept + { + using namespace DirectX; + // (r1, i1) * (r2, i2) = (r1r2 - i1i2, r1i2 + r2i1) + const XMVECTOR vr1r2 = XMVectorMultiply(r1, r2); + const XMVECTOR vr1i2 = XMVectorMultiply(r1, i2); + rResult = XMVectorNegativeMultiplySubtract(i1, i2, vr1r2); // real: (r1*r2 - i1*i2) + iResult = XMVectorMultiplyAdd(r2, i1, vr1i2); // imaginary: (r1*i2 + r2*i1) + } + + inline void XM_CALLCONV vmulComplex( + _Inout_ XMVECTOR& r1, _Inout_ XMVECTOR& i1, _In_ FXMVECTOR r2, _In_ FXMVECTOR i2) noexcept + { + using namespace DirectX; + // (r1, i1) * (r2, i2) = (r1r2 - i1i2, r1i2 + r2i1) + const XMVECTOR vr1r2 = XMVectorMultiply(r1, r2); + const XMVECTOR vr1i2 = XMVectorMultiply(r1, i2); + r1 = XMVectorNegativeMultiplySubtract(i1, i2, vr1r2); // real: (r1*r2 - i1*i2) + i1 = XMVectorMultiplyAdd(r2, i1, vr1i2); // imaginary: (r1*i2 + r2*i1) + } + + //---------------------------------------------------------------------------------- + // Radix-4 decimation-in-time FFT butterfly. + // This version assumes that all four elements of the butterfly are + // adjacent in a single vector. + // + // Compute the product of the complex input vector and the + // 4-element DFT matrix: + // | 1 1 1 1 | | (r1X,i1X) | + // | 1 -j -1 j | | (r1Y,i1Y) | + // | 1 -1 1 -1 | | (r1Z,i1Z) | + // | 1 j -1 -j | | (r1W,i1W) | + // + // This matrix can be decomposed into two simpler ones to reduce the + // number of additions needed. The decomposed matrices look like this: + // | 1 0 1 0 | | 1 0 1 0 | + // | 0 1 0 -j | | 1 0 -1 0 | + // | 1 0 -1 0 | | 0 1 0 1 | + // | 0 1 0 j | | 0 1 0 -1 | + // + // Combine as follows: + // | 1 0 1 0 | | (r1X,i1X) | | (r1X + r1Z, i1X + i1Z) | + // Temp = | 1 0 -1 0 | * | (r1Y,i1Y) | = | (r1X - r1Z, i1X - i1Z) | + // | 0 1 0 1 | | (r1Z,i1Z) | | (r1Y + r1W, i1Y + i1W) | + // | 0 1 0 -1 | | (r1W,i1W) | | (r1Y - r1W, i1Y - i1W) | + // + // | 1 0 1 0 | | (rTempX,iTempX) | | (rTempX + rTempZ, iTempX + iTempZ) | + // Result = | 0 1 0 -j | * | (rTempY,iTempY) | = | (rTempY + iTempW, iTempY - rTempW) | + // | 1 0 -1 0 | | (rTempZ,iTempZ) | | (rTempX - rTempZ, iTempX - iTempZ) | + // | 0 1 0 j | | (rTempW,iTempW) | | (rTempY - iTempW, iTempY + rTempW) | + //---------------------------------------------------------------------------------- + inline void ButterflyDIT4_1 (_Inout_ XMVECTOR& r1, _Inout_ XMVECTOR& i1) noexcept + { + using namespace DirectX; + + // sign constants for radix-4 butterflies + static const XMVECTORF32 vDFT4SignBits1 = { { { 1.0f, -1.0f, 1.0f, -1.0f } } }; + static const XMVECTORF32 vDFT4SignBits2 = { { { 1.0f, 1.0f, -1.0f, -1.0f } } }; + static const XMVECTORF32 vDFT4SignBits3 = { { { 1.0f, -1.0f, -1.0f, 1.0f } } }; + + // calculating Temp + // [r1X| r1X|r1Y| r1Y] + [r1Z|-r1Z|r1W|-r1W] + // [i1X| i1X|i1Y| i1Y] + [i1Z|-i1Z|i1W|-i1W] + const XMVECTOR r1L = XMVectorSwizzle<0, 0, 1, 1>(r1); + const XMVECTOR r1H = XMVectorSwizzle<2, 2, 3, 3>(r1); + + const XMVECTOR i1L = XMVectorSwizzle<0, 0, 1, 1>(i1); + const XMVECTOR i1H = XMVectorSwizzle<2, 2, 3, 3>(i1); + + const XMVECTOR rTemp = XMVectorMultiplyAdd(r1H, vDFT4SignBits1, r1L); + const XMVECTOR iTemp = XMVectorMultiplyAdd(i1H, vDFT4SignBits1, i1L); + + // calculating Result + const XMVECTOR rZrWiZiW = XMVectorPermute<2, 3, 6, 7>(rTemp, iTemp); // [rTempZ|rTempW|iTempZ|iTempW] + const XMVECTOR rZiWrZiW = XMVectorSwizzle<0, 3, 0, 3>(rZrWiZiW); // 
[rTempZ|iTempW|rTempZ|iTempW] + const XMVECTOR iZrWiZrW = XMVectorSwizzle<2, 1, 2, 1>(rZrWiZiW); // [rTempZ|iTempW|rTempZ|iTempW] + + // [rTempX| rTempY| rTempX| rTempY] + [rTempZ| iTempW|-rTempZ|-iTempW] + // [iTempX| iTempY| iTempX| iTempY] + // [iTempZ|-rTempW|-iTempZ| rTempW] + const XMVECTOR rTempL = XMVectorSwizzle<0, 1, 0, 1>(rTemp); + const XMVECTOR iTempL = XMVectorSwizzle<0, 1, 0, 1>(iTemp); + + r1 = XMVectorMultiplyAdd(rZiWrZiW, vDFT4SignBits2, rTempL); + i1 = XMVectorMultiplyAdd(iZrWiZrW, vDFT4SignBits3, iTempL); + } + + //---------------------------------------------------------------------------------- + // Radix-4 decimation-in-time FFT butterfly. + // This version assumes that elements of the butterfly are + // in different vectors, so that each vector in the input + // contains elements from four different butterflies. + // The four separate butterflies are processed in parallel. + // + // The calculations here are the same as the ones in the single-vector + // radix-4 DFT, but instead of being done on a single vector (X,Y,Z,W) + // they are done in parallel on sixteen independent complex values. + // There is no interdependence between the vector elements: + // | 1 0 1 0 | | (rIn0,iIn0) | | (rIn0 + rIn2, iIn0 + iIn2) | + // | 1 0 -1 0 | * | (rIn1,iIn1) | = Temp = | (rIn0 - rIn2, iIn0 - iIn2) | + // | 0 1 0 1 | | (rIn2,iIn2) | | (rIn1 + rIn3, iIn1 + iIn3) | + // | 0 1 0 -1 | | (rIn3,iIn3) | | (rIn1 - rIn3, iIn1 - iIn3) | + // + // | 1 0 1 0 | | (rTemp0,iTemp0) | | (rTemp0 + rTemp2, iTemp0 + iTemp2) | + // Result = | 0 1 0 -j | * | (rTemp1,iTemp1) | = | (rTemp1 + iTemp3, iTemp1 - rTemp3) | + // | 1 0 -1 0 | | (rTemp2,iTemp2) | | (rTemp0 - rTemp2, iTemp0 - iTemp2) | + // | 0 1 0 j | | (rTemp3,iTemp3) | | (rTemp1 - iTemp3, iTemp1 + rTemp3) | + //---------------------------------------------------------------------------------- + inline void ButterflyDIT4_4( + _Inout_ XMVECTOR& r0, + _Inout_ XMVECTOR& r1, + _Inout_ XMVECTOR& r2, + _Inout_ XMVECTOR& r3, + _Inout_ XMVECTOR& i0, + _Inout_ XMVECTOR& i1, + _Inout_ XMVECTOR& i2, + _Inout_ XMVECTOR& i3, + _In_reads_(uStride * 4) const XMVECTOR* __restrict pUnityTableReal, + _In_reads_(uStride * 4) const XMVECTOR* __restrict pUnityTableImaginary, + _In_ size_t uStride, + _In_ const bool fLast) noexcept + { + using namespace DirectX; + + assert(pUnityTableReal); + assert(pUnityTableImaginary); + assert(reinterpret_cast(pUnityTableReal) % 16 == 0); + assert(reinterpret_cast(pUnityTableImaginary) % 16 == 0); + assert(ISPOWEROF2(uStride)); + + // calculating Temp + const XMVECTOR rTemp0 = XMVectorAdd(r0, r2); + const XMVECTOR iTemp0 = XMVectorAdd(i0, i2); + + const XMVECTOR rTemp2 = XMVectorAdd(r1, r3); + const XMVECTOR iTemp2 = XMVectorAdd(i1, i3); + + const XMVECTOR rTemp1 = XMVectorSubtract(r0, r2); + const XMVECTOR iTemp1 = XMVectorSubtract(i0, i2); + + const XMVECTOR rTemp3 = XMVectorSubtract(r1, r3); + const XMVECTOR iTemp3 = XMVectorSubtract(i1, i3); + + XMVECTOR rTemp4 = XMVectorAdd(rTemp0, rTemp2); + XMVECTOR iTemp4 = XMVectorAdd(iTemp0, iTemp2); + + XMVECTOR rTemp5 = XMVectorAdd(rTemp1, iTemp3); + XMVECTOR iTemp5 = XMVectorSubtract(iTemp1, rTemp3); + + XMVECTOR rTemp6 = XMVectorSubtract(rTemp0, rTemp2); + XMVECTOR iTemp6 = XMVectorSubtract(iTemp0, iTemp2); + + XMVECTOR rTemp7 = XMVectorSubtract(rTemp1, iTemp3); + XMVECTOR iTemp7 = XMVectorAdd(iTemp1, rTemp3); + + // calculating Result + // vmulComplex(rTemp0, iTemp0, rTemp0, iTemp0, pUnityTableReal[0], pUnityTableImaginary[0]); // first one is always trivial + 
vmulComplex(rTemp5, iTemp5, pUnityTableReal[uStride], pUnityTableImaginary[uStride]); + vmulComplex(rTemp6, iTemp6, pUnityTableReal[uStride * 2], pUnityTableImaginary[uStride * 2]); + vmulComplex(rTemp7, iTemp7, pUnityTableReal[uStride * 3], pUnityTableImaginary[uStride * 3]); + + if (fLast) + { + ButterflyDIT4_1(rTemp4, iTemp4); + ButterflyDIT4_1(rTemp5, iTemp5); + ButterflyDIT4_1(rTemp6, iTemp6); + ButterflyDIT4_1(rTemp7, iTemp7); + } + + r0 = rTemp4; i0 = iTemp4; + r1 = rTemp5; i1 = iTemp5; + r2 = rTemp6; i2 = iTemp6; + r3 = rTemp7; i3 = iTemp7; + } + + //================================================================================== + // F-U-N-C-T-I-O-N-S + //================================================================================== + + //---------------------------------------------------------------------------------- + // DESCRIPTION: + // 4-sample FFT. + // + // PARAMETERS: + // pReal - [inout] real components, must have at least uCount elements + // pImaginary - [inout] imaginary components, must have at least uCount elements + // uCount - [in] number of FFT iterations + //---------------------------------------------------------------------------------- + inline void FFT4( + _Inout_updates_(uCount) XMVECTOR* __restrict pReal, + _Inout_updates_(uCount) XMVECTOR* __restrict pImaginary, + const size_t uCount = 1) noexcept + { + assert(pReal); + assert(pImaginary); + assert(reinterpret_cast(pReal) % 16 == 0); + assert(reinterpret_cast(pImaginary) % 16 == 0); + assert(ISPOWEROF2(uCount)); + + for (size_t uIndex = 0; uIndex < uCount; ++uIndex) + { + ButterflyDIT4_1(pReal[uIndex], pImaginary[uIndex]); + } + } + + //---------------------------------------------------------------------------------- + // DESCRIPTION: + // 8-sample FFT. + // + // PARAMETERS: + // pReal - [inout] real components, must have at least uCount*2 elements + // pImaginary - [inout] imaginary components, must have at least uCount*2 elements + // uCount - [in] number of FFT iterations + //---------------------------------------------------------------------------------- + inline void FFT8( + _Inout_updates_(uCount * 2) XMVECTOR* __restrict pReal, + _Inout_updates_(uCount * 2) XMVECTOR* __restrict pImaginary, + _In_ const size_t uCount = 1) noexcept + { + using namespace DirectX; + + assert(pReal); + assert(pImaginary); + assert(reinterpret_cast(pReal) % 16 == 0); + assert(reinterpret_cast(pImaginary) % 16 == 0); + assert(ISPOWEROF2(uCount)); + + static const XMVECTORF32 wr1 = { { { 1.0f, 0.70710677f, 0.0f, -0.70710677f } } }; + static const XMVECTORF32 wi1 = { { { 0.0f, -0.70710677f, -1.0f, -0.70710677f } } }; + static const XMVECTORF32 wr2 = { { { -1.0f, -0.70710677f, 0.0f, 0.70710677f } } }; + static const XMVECTORF32 wi2 = { { { 0.0f, 0.70710677f, 1.0f, 0.70710677f } } }; + + for (size_t uIndex = 0; uIndex < uCount; ++uIndex) + { + XMVECTOR* __restrict pR = pReal + uIndex * 2; + XMVECTOR* __restrict pI = pImaginary + uIndex * 2; + + XMVECTOR oddsR = XMVectorPermute<1, 3, 5, 7>(pR[0], pR[1]); + XMVECTOR evensR = XMVectorPermute<0, 2, 4, 6>(pR[0], pR[1]); + XMVECTOR oddsI = XMVectorPermute<1, 3, 5, 7>(pI[0], pI[1]); + XMVECTOR evensI = XMVectorPermute<0, 2, 4, 6>(pI[0], pI[1]); + ButterflyDIT4_1(oddsR, oddsI); + ButterflyDIT4_1(evensR, evensI); + + XMVECTOR r, i; + vmulComplex(r, i, oddsR, oddsI, wr1, wi1); + pR[0] = XMVectorAdd(evensR, r); + pI[0] = XMVectorAdd(evensI, i); + + vmulComplex(r, i, oddsR, oddsI, wr2, wi2); + pR[1] = XMVectorAdd(evensR, r); + pI[1] = XMVectorAdd(evensI, i); + } + } + + 
//---------------------------------------------------------------------------------- + // DESCRIPTION: + // 16-sample FFT. + // + // PARAMETERS: + // pReal - [inout] real components, must have at least uCount*4 elements + // pImaginary - [inout] imaginary components, must have at least uCount*4 elements + // uCount - [in] number of FFT iterations + //---------------------------------------------------------------------------------- + inline void FFT16( + _Inout_updates_(uCount * 4) XMVECTOR* __restrict pReal, + _Inout_updates_(uCount * 4) XMVECTOR* __restrict pImaginary, + _In_ const size_t uCount = 1) noexcept + { + using namespace DirectX; + + assert(pReal); + assert(pImaginary); + assert(reinterpret_cast(pReal) % 16 == 0); + assert(reinterpret_cast(pImaginary) % 16 == 0); + assert(ISPOWEROF2(uCount)); + + static const XMVECTORF32 aUnityTableReal[4] = { + { { { 1.0f, 1.0f, 1.0f, 1.0f } } }, + { { { 1.0f, 0.92387950f, 0.70710677f, 0.38268343f } } }, + { { { 1.0f, 0.70710677f, -4.3711388e-008f, -0.70710677f } } }, + { { { 1.0f, 0.38268343f, -0.70710677f, -0.92387950f } } } + }; + static const XMVECTORF32 aUnityTableImaginary[4] = + { + { { { -0.0f, -0.0f, -0.0f, -0.0f } } }, + { { { -0.0f, -0.38268343f, -0.70710677f, -0.92387950f } } }, + { { { -0.0f, -0.70710677f, -1.0f, -0.70710677f } } }, + { { { -0.0f, -0.92387950f, -0.70710677f, 0.38268343f } } } + }; + + for (size_t uIndex = 0; uIndex < uCount; ++uIndex) + { + ButterflyDIT4_4(pReal[uIndex * 4], + pReal[uIndex * 4 + 1], + pReal[uIndex * 4 + 2], + pReal[uIndex * 4 + 3], + pImaginary[uIndex * 4], + pImaginary[uIndex * 4 + 1], + pImaginary[uIndex * 4 + 2], + pImaginary[uIndex * 4 + 3], + reinterpret_cast(aUnityTableReal), + reinterpret_cast(aUnityTableImaginary), + 1, true); + } + } + + //---------------------------------------------------------------------------------- + // DESCRIPTION: + // 2^N-sample FFT. + // + // REMARKS: + // For FFTs length 16 and below, call FFT16(), FFT8(), or FFT4(). 
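+ //  The routine performs one radix-4 stage across every butterfly in the buffer, then recurses on
+ //  sub-transforms of length uLength/4 (with uCount*4 iterations) until the hard-coded 4/8/16-point
+ //  kernels above finish the job.
+ //  A minimal 64-point usage sketch (single iteration; all buffers 16-byte aligned, and unswizzle
+ //  output must not overlap its input):
+ //      XMVECTOR unity[64];
+ //      XDSP::FFTInitializeUnityTable(unity, 64);
+ //      XMVECTOR re[16], im[16] = {};            // 64 real samples, imaginary part zeroed
+ //      // ...fill re with time-domain data...
+ //      XDSP::FFT(re, im, unity, 64);
+ //      XMVECTOR reOrdered[16], imOrdered[16];
+ //      XDSP::FFTUnswizzle(reOrdered, re, 6);    // 6 == log2(64)
+ //      XDSP::FFTUnswizzle(imOrdered, im, 6);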
+ // + // PARAMETERS: + // pReal - [inout] real components, must have at least (uLength*uCount)/4 elements + // pImaginary - [inout] imaginary components, must have at least (uLength*uCount)/4 elements + // pUnityTable - [in] unity table, must have at least uLength*uCount elements, see FFTInitializeUnityTable() + // uLength - [in] FFT length in samples, must be a power of 2 > 16 + // uCount - [in] number of FFT iterations + //---------------------------------------------------------------------------------- + inline void FFT ( + _Inout_updates_((uLength * uCount) / 4) XMVECTOR* __restrict pReal, + _Inout_updates_((uLength * uCount) / 4) XMVECTOR* __restrict pImaginary, + _In_reads_(uLength * uCount) const XMVECTOR* __restrict pUnityTable, + _In_ const size_t uLength, + _In_ const size_t uCount = 1) noexcept + { + assert(pReal); + assert(pImaginary); + assert(pUnityTable); + assert(reinterpret_cast(pReal) % 16 == 0); + assert(reinterpret_cast(pImaginary) % 16 == 0); + assert(reinterpret_cast(pUnityTable) % 16 == 0); + assert(uLength > 16); + _Analysis_assume_(uLength > 16); + assert(ISPOWEROF2(uLength)); + assert(ISPOWEROF2(uCount)); + + const XMVECTOR* __restrict pUnityTableReal = pUnityTable; + const XMVECTOR* __restrict pUnityTableImaginary = pUnityTable + (uLength >> 2); + const size_t uTotal = uCount * uLength; + const size_t uTotal_vectors = uTotal >> 2; + const size_t uStage_vectors = uLength >> 2; + const size_t uStage_vectors_mask = uStage_vectors - 1; + const size_t uStride = uLength >> 4; // stride between butterfly elements + const size_t uStrideMask = uStride - 1; + const size_t uStride2 = uStride * 2; + const size_t uStride3 = uStride * 3; + const size_t uStrideInvMask = ~uStrideMask; + + for (size_t uIndex=0; uIndex < (uTotal_vectors >> 2); ++uIndex) + { + const size_t n = ((uIndex & uStrideInvMask) << 2) + (uIndex & uStrideMask); + ButterflyDIT4_4(pReal[n], + pReal[n + uStride], + pReal[n + uStride2], + pReal[n + uStride3], + pImaginary[n ], + pImaginary[n + uStride], + pImaginary[n + uStride2], + pImaginary[n + uStride3], + pUnityTableReal + (n & uStage_vectors_mask), + pUnityTableImaginary + (n & uStage_vectors_mask), + uStride, false); + } + + if (uLength > 16 * 4) + { + FFT(pReal, pImaginary, pUnityTable + (uLength >> 1), uLength >> 2, uCount * 4); + } + else if (uLength == 16 * 4) + { + FFT16(pReal, pImaginary, uCount * 4); + } + else if (uLength == 8 * 4) + { + FFT8(pReal, pImaginary, uCount * 4); + } + else if (uLength == 4 * 4) + { + FFT4(pReal, pImaginary, uCount * 4); + } + } + + //---------------------------------------------------------------------------------- + // DESCRIPTION: + // Initializes unity roots lookup table used by FFT functions. + // Once initialized, the table need not be initialized again unless a + // different FFT length is desired. + // + // REMARKS: + // The unity tables of FFT length 16 and below are hard coded into the + // respective FFT functions and so need not be initialized. 
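+ //  For each recursion level of sample length L the table stores L/4 vectors of real twiddle
+ //  factors followed by L/4 vectors of imaginary ones; the sub-table for the next level (length
+ //  L/4) follows immediately afterwards, which is what FFT() indexes via pUnityTable + (uLength >> 1).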
+ // + // PARAMETERS: + // pUnityTable - [out] unity table, receives unity roots lookup table, must have at least uLength elements + // uLength - [in] FFT length in frames, must be a power of 2 > 16 + //---------------------------------------------------------------------------------- + inline void FFTInitializeUnityTable (_Out_writes_(uLength) XMVECTOR* __restrict pUnityTable, _In_ size_t uLength) noexcept + { + using namespace DirectX; + + assert(pUnityTable); + assert(uLength > 16); + _Analysis_assume_(uLength > 16); + assert(ISPOWEROF2(uLength)); + + // initialize unity table for recursive FFT lengths: uLength, uLength/4, uLength/16... > 16 + // pUnityTable[0 to uLength*4-1] contains real components for current FFT length + // pUnityTable[uLength*4 to uLength*8-1] contains imaginary components for current FFT length + static const XMVECTORF32 vXM0123 = { { { 0.0f, 1.0f, 2.0f, 3.0f } } }; + uLength >>= 2; + XMVECTOR vlStep = XMVectorReplicate(XM_PIDIV2 / float(uLength)); + do + { + uLength >>= 2; + XMVECTOR vJP = vXM0123; + for (size_t j = 0; j < uLength; ++j) + { + XMVECTOR vSin, vCos; + XMVECTOR viJP, vlS; + + pUnityTable[j] = g_XMOne; + pUnityTable[j + uLength * 4] = XMVectorZero(); + + vlS = XMVectorMultiply(vJP, vlStep); + XMVectorSinCos(&vSin, &vCos, vlS); + pUnityTable[j + uLength] = vCos; + pUnityTable[j + uLength * 5] = XMVectorMultiply(vSin, g_XMNegativeOne); + + viJP = XMVectorAdd(vJP, vJP); + vlS = XMVectorMultiply(viJP, vlStep); + XMVectorSinCos(&vSin, &vCos, vlS); + pUnityTable[j + uLength * 2] = vCos; + pUnityTable[j + uLength * 6] = XMVectorMultiply(vSin, g_XMNegativeOne); + + viJP = XMVectorAdd(viJP, vJP); + vlS = XMVectorMultiply(viJP, vlStep); + XMVectorSinCos(&vSin, &vCos, vlS); + pUnityTable[j + uLength * 3] = vCos; + pUnityTable[j + uLength * 7] = XMVectorMultiply(vSin, g_XMNegativeOne); + + vJP = XMVectorAdd(vJP, g_XMFour); + } + vlStep = XMVectorMultiply(vlStep, g_XMFour); + pUnityTable += uLength * 8; + } while (uLength > 4); + } + + //---------------------------------------------------------------------------------- + // DESCRIPTION: + // The FFT functions generate output in bit reversed order. + // Use this function to re-arrange them into order of increasing frequency. + // + // REMARKS: + // Exponential values and bits correspond, so the reversed upper index can be omitted depending on the number of exponents. 
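+ //  Concretely, each output index is the uLog2Length-bit reversal of the input index; the
+ //  byte-wide lookup table below performs that reversal, with separate paths for even and odd
+ //  uLog2Length.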
+ // + // PARAMETERS: + // pOutput - [out] output buffer, receives samples in order of increasing frequency, cannot overlap pInput, must have at least (1<= 2 + //---------------------------------------------------------------------------------- + inline void FFTUnswizzle ( + _Out_writes_((1 << uLog2Length) / 4) XMVECTOR* __restrict pOutput, + _In_reads_((1 << uLog2Length) / 4) const XMVECTOR* __restrict pInput, + _In_ const size_t uLog2Length) noexcept + { + assert(pOutput); + assert(pInput); + assert(uLog2Length >= 2); + _Analysis_assume_(uLog2Length >= 2); + + float* __restrict pfOutput = reinterpret_cast(pOutput); + const size_t uLength = size_t(1) << (uLog2Length - 2); + + static const unsigned char cSwizzleTable[256] = { + 0x00, 0x40, 0x80, 0xC0, 0x10, 0x50, 0x90, 0xD0, 0x20, 0x60, 0xA0, 0xE0, 0x30, 0x70, 0xB0, 0xF0, + 0x04, 0x44, 0x84, 0xC4, 0x14, 0x54, 0x94, 0xD4, 0x24, 0x64, 0xA4, 0xE4, 0x34, 0x74, 0xB4, 0xF4, + 0x08, 0x48, 0x88, 0xC8, 0x18, 0x58, 0x98, 0xD8, 0x28, 0x68, 0xA8, 0xE8, 0x38, 0x78, 0xB8, 0xF8, + 0x0C, 0x4C, 0x8C, 0xCC, 0x1C, 0x5C, 0x9C, 0xDC, 0x2C, 0x6C, 0xAC, 0xEC, 0x3C, 0x7C, 0xBC, 0xFC, + 0x01, 0x41, 0x81, 0xC1, 0x11, 0x51, 0x91, 0xD1, 0x21, 0x61, 0xA1, 0xE1, 0x31, 0x71, 0xB1, 0xF1, + 0x05, 0x45, 0x85, 0xC5, 0x15, 0x55, 0x95, 0xD5, 0x25, 0x65, 0xA5, 0xE5, 0x35, 0x75, 0xB5, 0xF5, + 0x09, 0x49, 0x89, 0xC9, 0x19, 0x59, 0x99, 0xD9, 0x29, 0x69, 0xA9, 0xE9, 0x39, 0x79, 0xB9, 0xF9, + 0x0D, 0x4D, 0x8D, 0xCD, 0x1D, 0x5D, 0x9D, 0xDD, 0x2D, 0x6D, 0xAD, 0xED, 0x3D, 0x7D, 0xBD, 0xFD, + 0x02, 0x42, 0x82, 0xC2, 0x12, 0x52, 0x92, 0xD2, 0x22, 0x62, 0xA2, 0xE2, 0x32, 0x72, 0xB2, 0xF2, + 0x06, 0x46, 0x86, 0xC6, 0x16, 0x56, 0x96, 0xD6, 0x26, 0x66, 0xA6, 0xE6, 0x36, 0x76, 0xB6, 0xF6, + 0x0A, 0x4A, 0x8A, 0xCA, 0x1A, 0x5A, 0x9A, 0xDA, 0x2A, 0x6A, 0xAA, 0xEA, 0x3A, 0x7A, 0xBA, 0xFA, + 0x0E, 0x4E, 0x8E, 0xCE, 0x1E, 0x5E, 0x9E, 0xDE, 0x2E, 0x6E, 0xAE, 0xEE, 0x3E, 0x7E, 0xBE, 0xFE, + 0x03, 0x43, 0x83, 0xC3, 0x13, 0x53, 0x93, 0xD3, 0x23, 0x63, 0xA3, 0xE3, 0x33, 0x73, 0xB3, 0xF3, + 0x07, 0x47, 0x87, 0xC7, 0x17, 0x57, 0x97, 0xD7, 0x27, 0x67, 0xA7, 0xE7, 0x37, 0x77, 0xB7, 0xF7, + 0x0B, 0x4B, 0x8B, 0xCB, 0x1B, 0x5B, 0x9B, 0xDB, 0x2B, 0x6B, 0xAB, 0xEB, 0x3B, 0x7B, 0xBB, 0xFB, + 0x0F, 0x4F, 0x8F, 0xCF, 0x1F, 0x5F, 0x9F, 0xDF, 0x2F, 0x6F, 0xAF, 0xEF, 0x3F, 0x7F, 0xBF, 0xFF + }; + if ((uLog2Length & 1) == 0) + { + // even powers of two + const size_t uRev32 = 32 - uLog2Length; + for (size_t uIndex = 0; uIndex < uLength; ++uIndex) + { + XMFLOAT4A f4a; + XMStoreFloat4A(&f4a, pInput[uIndex]); + const size_t n = uIndex * 4; + const size_t uAddr = (static_cast(cSwizzleTable[n & 0xff]) << 24) | + (static_cast(cSwizzleTable[(n >> 8) & 0xff]) << 16) | + (static_cast(cSwizzleTable[(n >> 16) & 0xff]) << 8) | + (static_cast(cSwizzleTable[(n >> 24)])); + pfOutput[uAddr >> uRev32] = f4a.x; + pfOutput[(0x40000000 | uAddr) >> uRev32] = f4a.y; + pfOutput[(0x80000000 | uAddr) >> uRev32] = f4a.z; + pfOutput[(0xC0000000 | uAddr) >> uRev32] = f4a.w; + } + } + else + { + // odd powers of two + const size_t uRev7 = size_t(1) << (uLog2Length - 3); + const size_t uRev32 = 32 - (uLog2Length - 3); + for (size_t uIndex = 0; uIndex < uLength; ++uIndex) + { + XMFLOAT4A f4a; + XMStoreFloat4A(&f4a, pInput[uIndex]); + const size_t n = (uIndex >> 1); + size_t uAddr = (((static_cast(cSwizzleTable[n & 0xff]) << 24) | + (static_cast(cSwizzleTable[(n >> 8) & 0xff]) << 16) | + (static_cast(cSwizzleTable[(n >> 16) & 0xff]) << 8) | + (static_cast(cSwizzleTable[(n >> 24)]))) >> uRev32) | + ((uIndex & 1) * uRev7 * 4); + pfOutput[uAddr] = 
f4a.x; + uAddr += uRev7; + pfOutput[uAddr] = f4a.y; + uAddr += uRev7; + pfOutput[uAddr] = f4a.z; + uAddr += uRev7; + pfOutput[uAddr] = f4a.w; + } + } + } + + //---------------------------------------------------------------------------------- + // DESCRIPTION: + // Convert complex components to polar form. + // + // PARAMETERS: + // pOutput - [out] output buffer, receives samples in polar form, must have at least uLength/4 elements + // pInputReal - [in] input buffer (real components), must have at least uLength/4 elements + // pInputImaginary - [in] input buffer (imaginary components), must have at least uLength/4 elements + // uLength - [in] FFT length in samples, must be a power of 2 >= 4 + //---------------------------------------------------------------------------------- +#ifdef _MSC_VER +#pragma warning(suppress: 6101) +#endif + inline void FFTPolar( + _Out_writes_(uLength / 4) XMVECTOR* __restrict pOutput, + _In_reads_(uLength / 4) const XMVECTOR* __restrict pInputReal, + _In_reads_(uLength / 4) const XMVECTOR* __restrict pInputImaginary, + _In_ const size_t uLength) noexcept + { + using namespace DirectX; + + assert(pOutput); + assert(pInputReal); + assert(pInputImaginary); + assert(uLength >= 4); + _Analysis_assume_(uLength >= 4); + assert(ISPOWEROF2(uLength)); + + const float flOneOverLength = 1.0f / float(uLength); + + // result = sqrtf((real/uLength)^2 + (imaginary/uLength)^2) * 2 + const XMVECTOR vOneOverLength = XMVectorReplicate(flOneOverLength); + + for (size_t uIndex = 0; uIndex < (uLength >> 2); ++uIndex) + { + XMVECTOR vReal = XMVectorMultiply(pInputReal[uIndex], vOneOverLength); + XMVECTOR vImaginary = XMVectorMultiply(pInputImaginary[uIndex], vOneOverLength); + XMVECTOR vRR = XMVectorMultiply(vReal, vReal); + XMVECTOR vII = XMVectorMultiply(vImaginary, vImaginary); + XMVECTOR vRRplusII = XMVectorAdd(vRR, vII); + XMVECTOR vTotal = XMVectorSqrt(vRRplusII); + pOutput[uIndex] = XMVectorAdd(vTotal, vTotal); + } + } + + //---------------------------------------------------------------------------------- + // DESCRIPTION: + // Deinterleaves audio samples + // + // REMARKS: + // For example, audio of the form [LRLRLR] becomes [LLLRRR]. 
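+ //  Channel c of the output occupies frames [c*uFrameCount, (c+1)*uFrameCount).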
+ // + // PARAMETERS: + // pOutput - [out] output buffer, receives samples in deinterleaved form, cannot overlap pInput, must have at least (uChannelCount*uFrameCount)/4 elements + // pInput - [in] input buffer, cannot overlap pOutput, must have at least (uChannelCount*uFrameCount)/4 elements + // uChannelCount - [in] number of channels, must be > 1 + // uFrameCount - [in] number of frames of valid data, must be > 0 + //---------------------------------------------------------------------------------- + inline void Deinterleave ( + _Out_writes_((uChannelCount * uFrameCount) / 4) XMVECTOR* __restrict pOutput, + _In_reads_((uChannelCount * uFrameCount) / 4) const XMVECTOR* __restrict pInput, + _In_ const size_t uChannelCount, + _In_ const size_t uFrameCount) noexcept + { + assert(pOutput); + assert(pInput); + assert(uChannelCount > 1); + assert(uFrameCount > 0); + + float* __restrict pfOutput = reinterpret_cast(pOutput); + const float* __restrict pfInput = reinterpret_cast(pInput); + + for (size_t uChannel = 0; uChannel < uChannelCount; ++uChannel) + { + for (size_t uFrame = 0; uFrame < uFrameCount; ++uFrame) + { + pfOutput[uChannel * uFrameCount + uFrame] = pfInput[uFrame * uChannelCount + uChannel]; + } + } + } + + //---------------------------------------------------------------------------------- + // DESCRIPTION: + // Interleaves audio samples + // + // REMARKS: + // For example, audio of the form [LLLRRR] becomes [LRLRLR]. + // + // PARAMETERS: + // pOutput - [out] output buffer, receives samples in interleaved form, cannot overlap pInput, must have at least (uChannelCount*uFrameCount)/4 elements + // pInput - [in] input buffer, cannot overlap pOutput, must have at least (uChannelCount*uFrameCount)/4 elements + // uChannelCount - [in] number of channels, must be > 1 + // uFrameCount - [in] number of frames of valid data, must be > 0 + //---------------------------------------------------------------------------------- + inline void Interleave( + _Out_writes_((uChannelCount * uFrameCount) / 4) XMVECTOR* __restrict pOutput, + _In_reads_((uChannelCount * uFrameCount) / 4) const XMVECTOR* __restrict pInput, + _In_ const size_t uChannelCount, + _In_ const size_t uFrameCount) noexcept + { + assert(pOutput); + assert(pInput); + assert(uChannelCount > 1); + assert(uFrameCount > 0); + + float* __restrict pfOutput = reinterpret_cast(pOutput); + const float* __restrict pfInput = reinterpret_cast(pInput); + + for (size_t uChannel = 0; uChannel < uChannelCount; ++uChannel) + { + for (size_t uFrame = 0; uFrame < uFrameCount; ++uFrame) + { + pfOutput[uFrame * uChannelCount + uChannel] = pfInput[uChannel * uFrameCount + uFrame]; + } + } + } + + //---------------------------------------------------------------------------------- + // DESCRIPTION: + // This function applies a 2^N-sample FFT and unswizzles the result such + // that the samples are in order of increasing frequency. + // Audio is first deinterleaved if multichannel. 
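+ //  The 768-vector scratch buffers used internally are sized for the maximum supported case of
+ //  6 channels at 2^9 samples per channel.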
+    //
+    // PARAMETERS:
+    //  pReal - [inout] real components, must have at least (1<<uLog2Length)*uChannelCount/4 elements
+    //  pImaginary - [out] imaginary components, must have at least (1<<uLog2Length)*uChannelCount/4 elements
+    //  pUnityTable - [in] unity table, must have at least (1<<uLog2Length) elements, see FFTInitializeUnityTable()
+    //  uChannelCount - [in] number of channels, must be > 0
+    //  uLog2Length - [in] LOG (base 2) of FFT length in frames, must within [2, 9]
+    //----------------------------------------------------------------------------------
+    inline void FFTInterleaved(
+        _Inout_updates_(((1 << uLog2Length) * uChannelCount) / 4) XMVECTOR* __restrict pReal,
+        _Out_writes_(((1 << uLog2Length) * uChannelCount) / 4) XMVECTOR* __restrict pImaginary,
+        _In_reads_(1 << uLog2Length) const XMVECTOR* __restrict pUnityTable,
+        _In_ const size_t uChannelCount,
+        _In_ const size_t uLog2Length) noexcept
+    {
+        using namespace DirectX;
+
+        assert(pReal);
+        assert(pImaginary);
+        assert(pUnityTable);
+        assert(reinterpret_cast<uintptr_t>(pReal) % 16 == 0);
+        assert(reinterpret_cast<uintptr_t>(pImaginary) % 16 == 0);
+        assert(reinterpret_cast<uintptr_t>(pUnityTable) % 16 == 0);
+        assert(uChannelCount > 0 && uChannelCount <= 6);
+        assert(uLog2Length >= 2 && uLog2Length <= 9);
+
+        XMVECTOR vRealTemp[768];
+        XMVECTOR vImaginaryTemp[768];
+        const size_t uLength = size_t(1) << uLog2Length;
+
+        if (uChannelCount > 1)
+        {
+            Deinterleave(vRealTemp, pReal, uChannelCount, uLength);
+        }
+        else
+        {
+            memcpy_s(vRealTemp, sizeof(vRealTemp), pReal, (uLength >> 2) * sizeof(XMVECTOR));
+        }
+
+        memset(vImaginaryTemp, 0, (uChannelCount * (uLength >> 2)) * sizeof(XMVECTOR));
+
+        if (uLength > 16)
+        {
+            for (size_t uChannel = 0; uChannel < uChannelCount; ++uChannel)
+            {
+                FFT(&vRealTemp[uChannel * (uLength >> 2)], &vImaginaryTemp[uChannel * (uLength >> 2)], pUnityTable, uLength);
+            }
+        }
+        else if (uLength == 16)
+        {
+            for (size_t uChannel = 0; uChannel < uChannelCount; ++uChannel)
+            {
+                FFT16(&vRealTemp[uChannel * (uLength >> 2)], &vImaginaryTemp[uChannel * (uLength >> 2)]);
+            }
+        }
+        else if (uLength == 8)
+        {
+            for (size_t uChannel = 0; uChannel < uChannelCount; ++uChannel)
+            {
+                FFT8(&vRealTemp[uChannel * (uLength >> 2)], &vImaginaryTemp[uChannel * (uLength >> 2)]);
+            }
+        }
+        else if (uLength == 4)
+        {
+            for (size_t uChannel = 0; uChannel < uChannelCount; ++uChannel)
+            {
+                FFT4(&vRealTemp[uChannel * (uLength >> 2)], &vImaginaryTemp[uChannel * (uLength >> 2)]);
+            }
+        }
+
+        for (size_t uChannel = 0; uChannel < uChannelCount; ++uChannel)
+        {
+            FFTUnswizzle(&pReal[uChannel * (uLength >> 2)], &vRealTemp[uChannel * (uLength >> 2)], uLog2Length);
+            FFTUnswizzle(&pImaginary[uChannel * (uLength >> 2)], &vImaginaryTemp[uChannel * (uLength >> 2)], uLog2Length);
+        }
+    }
+
+    //----------------------------------------------------------------------------------
+    // DESCRIPTION:
+    //  This function applies a 2^N-sample inverse FFT.
+    //  Audio is interleaved if multichannel.
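+ //  The inverse is computed by scaling the input by 1/N, negating the imaginary part (a complex
+ //  conjugate), and reusing the forward FFT and unswizzle kernels; only the real result is kept.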
+ // + // PARAMETERS: + // pReal - [inout] real components, must have at least (1< 0 + // uLog2Length - [in] LOG (base 2) of FFT length in frames, must within [2, 9] + //---------------------------------------------------------------------------------- + inline void IFFTDeinterleaved( + _Inout_updates_(((1 << uLog2Length) * uChannelCount) / 4) XMVECTOR* __restrict pReal, + _In_reads_(((1 << uLog2Length) * uChannelCount) / 4) const XMVECTOR* __restrict pImaginary, + _In_reads_(1 << uLog2Length) const XMVECTOR* __restrict pUnityTable, + _In_ const size_t uChannelCount, + _In_ const size_t uLog2Length) noexcept + { + using namespace DirectX; + + assert(pReal); + assert(pImaginary); + assert(pUnityTable); + assert(reinterpret_cast(pReal) % 16 == 0); + assert(reinterpret_cast(pImaginary) % 16 == 0); + assert(reinterpret_cast(pUnityTable) % 16 == 0); + assert(uChannelCount > 0 && uChannelCount <= 6); + _Analysis_assume_(uChannelCount > 0 && uChannelCount <= 6); + assert(uLog2Length >= 2 && uLog2Length <= 9); + _Analysis_assume_(uLog2Length >= 2 && uLog2Length <= 9); + + XMVECTOR vRealTemp[768] = {}; + XMVECTOR vImaginaryTemp[768] = {}; + + const size_t uLength = size_t(1) << uLog2Length; + + const XMVECTOR vRnp = XMVectorReplicate(1.0f / float(uLength)); + const XMVECTOR vRnm = XMVectorReplicate(-1.0f / float(uLength)); + for (size_t u = 0; u < uChannelCount * (uLength >> 2); u++) + { + vRealTemp[u] = XMVectorMultiply(pReal[u], vRnp); + vImaginaryTemp[u] = XMVectorMultiply(pImaginary[u], vRnm); + } + + if (uLength > 16) + { + for (size_t uChannel = 0; uChannel < uChannelCount; ++uChannel) + { + FFT(&vRealTemp[uChannel * (uLength >> 2)], &vImaginaryTemp[uChannel * (uLength >> 2)], pUnityTable, uLength); + } + } + else if (uLength == 16) + { + for (size_t uChannel = 0; uChannel < uChannelCount; ++uChannel) + { + FFT16(&vRealTemp[uChannel * (uLength >> 2)], &vImaginaryTemp[uChannel * (uLength >> 2)]); + } + } + else if (uLength == 8) + { + for (size_t uChannel = 0; uChannel < uChannelCount; ++uChannel) + { + FFT8(&vRealTemp[uChannel * (uLength >> 2)], &vImaginaryTemp[uChannel * (uLength >> 2)]); + } + } + else if (uLength == 4) + { + for (size_t uChannel = 0; uChannel < uChannelCount; ++uChannel) + { + FFT4(&vRealTemp[uChannel * (uLength >> 2)], &vImaginaryTemp[uChannel * (uLength >> 2)]); + } + } + + for (size_t uChannel = 0; uChannel < uChannelCount; ++uChannel) + { + FFTUnswizzle(&vImaginaryTemp[uChannel * (uLength >> 2)], &vRealTemp[uChannel * (uLength >> 2)], uLog2Length); + } + + if (uChannelCount > 1) + { + Interleave(pReal, vImaginaryTemp, uChannelCount, uLength); + } + else + { + memcpy_s(pReal, uLength * uChannelCount * sizeof(float), vImaginaryTemp, (uLength >> 2) * sizeof(XMVECTOR)); + } + } + +} // namespace XDSP + +#ifdef _MSC_VER +#pragma warning(pop) +#endif diff --git a/src/thirdparty/DirectXMath-dec2022/build/DirectXMath-GitHub-CMake-Dev17.yml b/src/thirdparty/DirectXMath-dec2022/build/DirectXMath-GitHub-CMake-Dev17.yml new file mode 100644 index 000000000..4cff817cf --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/build/DirectXMath-GitHub-CMake-Dev17.yml @@ -0,0 +1,119 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# +# http://go.microsoft.com/fwlink/?LinkID=615560 + +# Builds the library and test suite using CMake. 
+ +schedules: +- cron: "0 0 * * *" + displayName: 'Nightly build' + branches: + include: + - main + +resources: + repositories: + - repository: self + type: git + ref: refs/heads/main + trigger: none + +name: $(Year:yyyy).$(Month).$(DayOfMonth)$(Rev:.r) + +variables: + VS_GENERATOR: 'Visual Studio 17 2022' + WIN10_SDK: '10.0.19041.0' + WIN11_SDK: '10.0.22000.0' + GITHUB_PAT: $(GITHUBPUBLICTOKEN) + +pool: + vmImage: windows-2022 + +jobs: +- job: CMAKE_BUILD + displayName: CMake using VS Generator BUILD_TESTING=ON + cancelTimeoutInMinutes: 1 + steps: + - checkout: self + clean: true + fetchTags: false + - task: CmdLine@2 + displayName: Fetch Tests + inputs: + script: git clone --quiet https://%GITHUB_PAT%@github.com/walbourn/directxmathtest.git Tests + - task: CMake@1 + displayName: 'CMake (MSVC): Config x64' + inputs: + cwd: '$(Build.SourcesDirectory)' + cmakeArgs: '-G "$(VS_GENERATOR)" -A x64 -B out -DCMAKE_SYSTEM_VERSION=$(WIN10_SDK)' + - task: CMake@1 + displayName: 'CMake (MSVC): Build x64 Debug' + inputs: + cwd: '$(Build.SourcesDirectory)' + cmakeArgs: --build out -v --config Debug + - task: CMake@1 + displayName: 'CMake (MSVC): Build x64 Release' + inputs: + cwd: '$(Build.SourcesDirectory)' + cmakeArgs: --build out -v --config RelWithDebInfo + - task: CMake@1 + displayName: 'CMake (MSVC): Config x86' + inputs: + cwd: '$(Build.SourcesDirectory)' + cmakeArgs: '-G "$(VS_GENERATOR)" -A Win32 -B out2 -DCMAKE_SYSTEM_VERSION=$(WIN10_SDK)' + - task: CMake@1 + displayName: 'CMake (MSVC): Build x86 Debug' + inputs: + cwd: '$(Build.SourcesDirectory)' + cmakeArgs: --build out2 -v --config Debug + - task: CMake@1 + displayName: 'CMake (MSVC): Build x86 Release' + inputs: + cwd: '$(Build.SourcesDirectory)' + cmakeArgs: --build out2 -v --config RelWithDebInfo + - task: CMake@1 + displayName: 'CMake (MSVC): Config ARM64' + inputs: + cwd: '$(Build.SourcesDirectory)' + cmakeArgs: '-G "$(VS_GENERATOR)" -A ARM64 -B out3 -DCMAKE_SYSTEM_VERSION=$(WIN10_SDK)' + - task: CMake@1 + displayName: 'CMake (MSVC): Build ARM64 Debug' + inputs: + cwd: '$(Build.SourcesDirectory)' + cmakeArgs: --build out3 -v --config Debug + - task: CMake@1 + displayName: 'CMake (MSVC): Build ARM64 Release' + inputs: + cwd: '$(Build.SourcesDirectory)' + cmakeArgs: --build out3 -v --config RelWithDebInfo + - task: CMake@1 + displayName: 'CMake (ClangCl): Config x64' + inputs: + cwd: '$(Build.SourcesDirectory)' + cmakeArgs: '-G "$(VS_GENERATOR)" -A x64 -T clangcl -B out4 -DCMAKE_SYSTEM_VERSION=$(WIN10_SDK)' + - task: CMake@1 + displayName: 'CMake (ClangCl): Build x64 Debug' + inputs: + cwd: '$(Build.SourcesDirectory)' + cmakeArgs: --build out4 -v --config Debug + - task: CMake@1 + displayName: 'CMake (ClangCl): Build x64 Release' + inputs: + cwd: '$(Build.SourcesDirectory)' + cmakeArgs: --build out4 -v --config RelWithDebInfo + - task: CMake@1 + displayName: 'CMake (ClangCl): Config ARM64' + inputs: + cwd: '$(Build.SourcesDirectory)' + cmakeArgs: '-G "$(VS_GENERATOR)" -A ARM64 -T clangcl -B out5 -DCMAKE_SYSTEM_VERSION=$(WIN11_SDK)' + - task: CMake@1 + displayName: 'CMake (ClangCl): Build ARM64 Debug' + inputs: + cwd: '$(Build.SourcesDirectory)' + cmakeArgs: --build out5 -v --config Debug + - task: CMake@1 + displayName: 'CMake (ClangCl): Build ARM64 Release' + inputs: + cwd: '$(Build.SourcesDirectory)' + cmakeArgs: --build out5 -v --config RelWithDebInfo diff --git a/src/thirdparty/DirectXMath-dec2022/build/DirectXMath-GitHub-CMake.yml b/src/thirdparty/DirectXMath-dec2022/build/DirectXMath-GitHub-CMake.yml new file mode 100644 index 
000000000..1c4e4cd43 --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/build/DirectXMath-GitHub-CMake.yml @@ -0,0 +1,103 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# +# http://go.microsoft.com/fwlink/?LinkID=615560 + +# Builds the library and test suite using CMake. + +schedules: +- cron: "0 0 * * *" + displayName: 'Nightly build' + branches: + include: + - main + +resources: + repositories: + - repository: self + type: git + ref: refs/heads/main + trigger: none + +name: $(Year:yyyy).$(Month).$(DayOfMonth)$(Rev:.r) + +variables: + VS_GENERATOR: 'Visual Studio 16 2019' + WIN10_SDK: '10.0.19041.0' + GITHUB_PAT: $(GITHUBPUBLICTOKEN) + +pool: + vmImage: windows-2019 + +jobs: +- job: CMAKE_BUILD + displayName: CMake using VS Generator + cancelTimeoutInMinutes: 1 + steps: + - checkout: self + clean: true + fetchTags: false + - task: CmdLine@2 + displayName: Fetch Tests + inputs: + script: git clone --quiet https://%GITHUB_PAT%@github.com/walbourn/directxmathtest.git Tests + - task: CMake@1 + displayName: CMake (MSVC x64) + inputs: + cwd: '$(Build.SourcesDirectory)' + cmakeArgs: '-G "$(VS_GENERATOR)" -A x64 -B out -DCMAKE_SYSTEM_VERSION=$(WIN10_SDK)' + - task: CMake@1 + displayName: CMake (Build x64) + inputs: + cwd: '$(Build.SourcesDirectory)' + cmakeArgs: --build out -v + - task: CMake@1 + displayName: CMake Test (MSVC x64) + inputs: + cwd: Tests + cmakeArgs: '-G "$(VS_GENERATOR)" -A x64 -B out -DCMAKE_SYSTEM_VERSION=$(WIN10_SDK)' + - task: CMake@1 + displayName: CMake Test (Build x64) + inputs: + cwd: Tests + cmakeArgs: --build out -v + - task: CMake@1 + displayName: CMake (MSVC ARM64) + inputs: + cwd: '$(Build.SourcesDirectory)' + cmakeArgs: '-G "$(VS_GENERATOR)" -A ARM64 -B out2 -DCMAKE_SYSTEM_VERSION=$(WIN10_SDK)' + - task: CMake@1 + displayName: CMake (Build ARM64) + inputs: + cwd: '$(Build.SourcesDirectory)' + cmakeArgs: --build out2 -v + - task: CMake@1 + displayName: CMake Test (MSVC ARM64) + inputs: + cwd: Tests + cmakeArgs: '-G "$(VS_GENERATOR)" -A ARM64 -B out2 -DCMAKE_SYSTEM_VERSION=$(WIN10_SDK)' + - task: CMake@1 + displayName: CMake Test (Build ARM64) + inputs: + cwd: Tests + cmakeArgs: --build out2 -v + - task: CMake@1 + displayName: CMake (ClangCl) + inputs: + cwd: '$(Build.SourcesDirectory)' + cmakeArgs: '-G "$(VS_GENERATOR)" -A x64 -T clangcl -B out3 -DCMAKE_SYSTEM_VERSION=$(WIN10_SDK)' + - task: CMake@1 + displayName: CMake (Build) + inputs: + cwd: '$(Build.SourcesDirectory)' + cmakeArgs: --build out3 -v + - task: CMake@1 + displayName: CMake Test (ClangCL) + inputs: + cwd: Tests + cmakeArgs: '-G "$(VS_GENERATOR)" -A x64 -T clangcl -B out3 -DCMAKE_SYSTEM_VERSION=$(WIN10_SDK)' + - task: CMake@1 + displayName: CMake Test (Build) + inputs: + cwd: Tests + cmakeArgs: --build out3 -v diff --git a/src/thirdparty/DirectXMath-dec2022/build/DirectXMath-GitHub-Dev17.yml b/src/thirdparty/DirectXMath-dec2022/build/DirectXMath-GitHub-Dev17.yml new file mode 100644 index 000000000..ae7ea23d2 --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/build/DirectXMath-GitHub-Dev17.yml @@ -0,0 +1,296 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# +# http://go.microsoft.com/fwlink/?LinkID=615560 + +# Builds the math3 test suite for DirectXMath. 
+ +schedules: +- cron: "0 0 * * *" + displayName: 'Nightly build' + branches: + include: + - main + +resources: + repositories: + - repository: self + type: git + ref: refs/heads/main + trigger: + branches: + include: + - main + paths: + exclude: + - README.md + - HISTORY.md + - SECURITY.md + +name: $(Year:yyyy).$(Month).$(DayOfMonth)$(Rev:.r) + +pool: + vmImage: windows-2022 + +variables: + GITHUB_PAT: $(GITHUBPUBLICTOKEN) + +jobs: +- job: BUILD_DEV17 + displayName: 'Visual Studio 2022 (v143)' + cancelTimeoutInMinutes: 1 + steps: + - checkout: self + clean: true + fetchTags: false + - task: DeleteFiles@1 + displayName: Delete files from Tests + inputs: + SourceFolder: Tests + Contents: '**' + RemoveSourceFolder: true + RemoveDotFiles: true + - task: CmdLine@2 + displayName: Fetch Tests + inputs: + script: git clone --quiet https://%GITHUB_PAT%@github.com/walbourn/directxmathtest.git Tests + - task: VSBuild@1 + displayName: Build solution math3_2022.sln x86dbg + inputs: + solution: Tests/math3/math3_2022.sln + vsVersion: 17.0 + platform: x86 + configuration: Debug + msbuildArchitecture: x64 + - task: VSBuild@1 + displayName: Build solution math3_2022.sln x86rel + inputs: + solution: Tests/math3/math3_2022.sln + vsVersion: 17.0 + platform: x86 + configuration: Release + msbuildArchitecture: x64 + - task: VSBuild@1 + displayName: Build solution math3_2022.sln x64dbg + inputs: + solution: Tests/math3/math3_2022.sln + vsVersion: 17.0 + platform: x64 + configuration: Debug + msbuildArchitecture: x64 + - task: VSBuild@1 + displayName: Build solution math3_2022.sln x64rel + inputs: + solution: Tests/math3/math3_2022.sln + vsVersion: 17.0 + platform: x64 + configuration: Release + msbuildArchitecture: x64 + - task: VSBuild@1 + displayName: Build solution math3_2022.sln arm64dbg + inputs: + solution: Tests/math3/math3_2022.sln + vsVersion: 17.0 + platform: ARM64 + configuration: Debug + msbuildArchitecture: x64 + - task: VSBuild@1 + displayName: Build solution math3_2022.sln arm64rel + inputs: + solution: Tests/math3/math3_2022.sln + vsVersion: 17.0 + platform: ARM64 + configuration: Release + msbuildArchitecture: x64 + - task: VSBuild@1 + displayName: Build solution math3_2022.sln x86dbg sse3 + inputs: + solution: Tests/math3/math3_2022.sln + vsVersion: 17.0 + platform: x86 + configuration: SSE3 Debug + msbuildArchitecture: x64 + - task: VSBuild@1 + displayName: Build solution math3_2022.sln x86rel sse3 + inputs: + solution: Tests/math3/math3_2022.sln + vsVersion: 17.0 + platform: x86 + configuration: SSE3 Release + msbuildArchitecture: x64 + - task: VSBuild@1 + displayName: Build solution math3_2022.sln x64dbg sse3 + inputs: + solution: Tests/math3/math3_2022.sln + vsVersion: 17.0 + platform: x64 + configuration: SSE3 Debug + msbuildArchitecture: x64 + - task: VSBuild@1 + displayName: Build solution math3_2022.sln x64rel sse3 + inputs: + solution: Tests/math3/math3_2022.sln + vsVersion: 17.0 + platform: x64 + configuration: SSE3 Release + msbuildArchitecture: x64 + - task: VSBuild@1 + displayName: Build solution math3_2022.sln x86dbg sse4 + inputs: + solution: Tests/math3/math3_2022.sln + vsVersion: 17.0 + platform: x86 + configuration: SSE4 Debug + msbuildArchitecture: x64 + - task: VSBuild@1 + displayName: Build solution math3_2022.sln x86rel sse4 + inputs: + solution: Tests/math3/math3_2022.sln + vsVersion: 17.0 + platform: x86 + configuration: SSE4 Release + msbuildArchitecture: x64 + - task: VSBuild@1 + displayName: Build solution math3_2022.sln x64dbg sse4 + inputs: + solution: 
Tests/math3/math3_2022.sln + vsVersion: 17.0 + platform: x64 + configuration: SSE4 Debug + msbuildArchitecture: x64 + - task: VSBuild@1 + displayName: Build solution math3_2022.sln x64rel sse4 + inputs: + solution: Tests/math3/math3_2022.sln + vsVersion: 17.0 + platform: x64 + configuration: SSE4 Release + msbuildArchitecture: x64 + - task: VSBuild@1 + displayName: Build solution math3_2022.sln x86dbg avx + inputs: + solution: Tests/math3/math3_2022.sln + vsVersion: 17.0 + platform: x86 + configuration: AVX Debug + msbuildArchitecture: x64 + - task: VSBuild@1 + displayName: Build solution math3_2022.sln x86rel avx + inputs: + solution: Tests/math3/math3_2022.sln + vsVersion: 17.0 + platform: x86 + configuration: AVX Release + msbuildArchitecture: x64 + - task: VSBuild@1 + displayName: Build solution math3_2022.sln x64dbg avx + inputs: + solution: Tests/math3/math3_2022.sln + vsVersion: 17.0 + platform: x64 + configuration: AVX Debug + msbuildArchitecture: x64 + - task: VSBuild@1 + displayName: Build solution math3_2022.sln x64rel avx + inputs: + solution: Tests/math3/math3_2022.sln + vsVersion: 17.0 + platform: x64 + configuration: AVX Release + msbuildArchitecture: x64 + - task: VSBuild@1 + displayName: Build solution math3_2022.sln x86dbg avx2 + inputs: + solution: Tests/math3/math3_2022.sln + vsVersion: 17.0 + platform: x86 + configuration: AVX2 Debug + msbuildArchitecture: x64 + - task: VSBuild@1 + displayName: Build solution math3_2022.sln x86rel avx2 + inputs: + solution: Tests/math3/math3_2022.sln + vsVersion: 17.0 + platform: x86 + configuration: AVX2 Release + msbuildArchitecture: x64 + - task: VSBuild@1 + displayName: Build solution math3_2022.sln x64dbg avx2 + inputs: + solution: Tests/math3/math3_2022.sln + vsVersion: 17.0 + platform: x64 + configuration: AVX2 Debug + msbuildArchitecture: x64 + - task: VSBuild@1 + displayName: Build solution math3_2022.sln x64rel avx2 + inputs: + solution: Tests/math3/math3_2022.sln + vsVersion: 17.0 + platform: x64 + configuration: AVX2 Release + msbuildArchitecture: x64 + - task: VSBuild@1 + displayName: Build solution math3_2022.sln x86dbg nointrinsics + inputs: + solution: Tests/math3/math3_2022.sln + vsVersion: 17.0 + platform: x86 + configuration: NI Debug + msbuildArchitecture: x64 + - task: VSBuild@1 + displayName: Build solution math3_2022.sln x86rel nointrinsics + inputs: + solution: Tests/math3/math3_2022.sln + vsVersion: 17.0 + platform: x86 + configuration: NI Release + msbuildArchitecture: x64 + - task: VSBuild@1 + displayName: Build solution math3_2022.sln x64dbg nointrinsics + inputs: + solution: Tests/math3/math3_2022.sln + vsVersion: 17.0 + platform: x64 + configuration: NI Debug + msbuildArchitecture: x64 + - task: VSBuild@1 + displayName: Build solution math3_2022.sln x64rel nointrinsics + inputs: + solution: Tests/math3/math3_2022.sln + vsVersion: 17.0 + platform: x64 + configuration: NI Release + msbuildArchitecture: x64 + - task: VSBuild@1 + displayName: Build solution math3_2022.sln arm64dbg nointrinsics + inputs: + solution: Tests/math3/math3_2022.sln + vsVersion: 17.0 + platform: ARM64 + configuration: NI Debug + msbuildArchitecture: x64 + - task: VSBuild@1 + displayName: Build solution math3_2022.sln arm86rel nointrinsics + inputs: + solution: Tests/math3/math3_2022.sln + vsVersion: 17.0 + platform: ARM64 + configuration: NI Release + msbuildArchitecture: x64 + - task: VSBuild@1 + displayName: Build solution math3_2022.sln x86dbg x87 + inputs: + solution: Tests/math3/math3_2022.sln + vsVersion: 17.0 + platform: x86 + 
configuration: x87 Debug + msbuildArchitecture: x64 + - task: VSBuild@1 + displayName: Build solution math3_2022.sln x86rel x87 + inputs: + solution: Tests/math3/math3_2022.sln + vsVersion: 17.0 + platform: x86 + configuration: x87 Release + msbuildArchitecture: x64 diff --git a/src/thirdparty/DirectXMath-dec2022/build/DirectXMath-GitHub-MinGW.yml b/src/thirdparty/DirectXMath-dec2022/build/DirectXMath-GitHub-MinGW.yml new file mode 100644 index 000000000..309e8d76c --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/build/DirectXMath-GitHub-MinGW.yml @@ -0,0 +1,170 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# +# http://go.microsoft.com/fwlink/?LinkID=615560 + +# Builds the library and test suite using the MinGW compiler. + +schedules: +- cron: "0 0 * * *" + displayName: 'Nightly build' + branches: + include: + - main + +resources: + repositories: + - repository: self + type: git + ref: refs/heads/main + trigger: none + +name: $(Year:yyyy).$(Month).$(DayOfMonth)$(Rev:.r) + +pool: + vmImage: windows-2022 + +variables: + GITHUB_PAT: $(GITHUBPUBLICTOKEN) + URL_MINGW32: https://github.com/brechtsanders/winlibs_mingw/releases/download/12.2.0-14.0.6-10.0.0-ucrt-r2/winlibs-i686-posix-dwarf-gcc-12.2.0-llvm-14.0.6-mingw-w64ucrt-10.0.0-r2.zip + HASH_MINGW32: 'fcd1e11b896190da01c83d5b5fb0d37b7c61585e53446c2dab0009debc3915e757213882c35e35396329338de6f0222ba012e23a5af86932db45186a225d1272' + URL_MINGW64: https://github.com/brechtsanders/winlibs_mingw/releases/download/12.2.0-14.0.6-10.0.0-ucrt-r2/winlibs-x86_64-posix-seh-gcc-12.2.0-llvm-14.0.6-mingw-w64ucrt-10.0.0-r2.zip + HASH_MINGW64: '6694e552d73195b57f283645ab78cb0180f4d957b5501a83e6b4f2679dfad13a8e85e1df6f7b061ea4431fbd2bb0c8f2ac3a1dd810489c1a8d1665b226df8092' + +jobs: +- job: MINGW32_BUILD + displayName: 'Minimalist GNU for Windows (MinGW32)' + steps: + - checkout: self + clean: true + fetchTags: false + - task: CmdLine@2 + displayName: Fetch Tests + inputs: + script: git clone --quiet https://%GITHUB_PAT%@github.com/walbourn/directxmathtest.git Tests + workingDirectory: $(Build.SourcesDirectory) + - task: PowerShell@2 + # We install GCC 12.2 as the MS Hosted only offers 11.2 + displayName: Install MinGW32 + inputs: + targetType: inline + script: | + $ProgressPreference = 'SilentlyContinue' + Write-Host "Downloading winlibs..." + Invoke-WebRequest -Uri "$(URL_MINGW32)" -OutFile "gw32.zip" + Write-Host "Downloaded." + $fileHash = Get-FileHash -Algorithm SHA512 gw32.zip | ForEach { $_.Hash} | Out-String + $filehash = $fileHash.Trim() + Write-Host "##[debug]SHA512: " $fileHash + if ($fileHash -ne '$(HASH_MINGW32)') { + Write-Error -Message "##[error]Computed hash does not match!" -ErrorAction Stop + } + Write-Host "Extracting winlibs..." + Expand-Archive -LiteralPath 'gw32.zip' + Write-Host "Extracted." 
+ Write-Host "Added to path: $env:BUILD_SOURCESDIRECTORY\gw32\mingw32\bin" + Write-Host "##vso[task.prependpath]$env:BUILD_SOURCESDIRECTORY\gw32\mingw32\bin" + + workingDirectory: $(Build.SourcesDirectory) + - task: CmdLine@2 + displayName: GCC version + inputs: + script: g++ --version + - task: CMake@1 + displayName: CMake (MinGW32) Dbg + inputs: + cwd: Tests + cmakeArgs: -B out -DCMAKE_BUILD_TYPE="Debug" -DDXMATH_ARCHITECTURE=x86 -DCMAKE_CXX_COMPILER="g++.exe" -G "MinGW Makefiles" + - task: CMake@1 + displayName: CMake (MinGW32) Build Dbg + inputs: + cwd: Tests + cmakeArgs: --build out + - task: CMake@1 + displayName: CMake (MinGW32) Rel + inputs: + cwd: Tests + cmakeArgs: -B out2 -DCMAKE_BUILD_TYPE="RelWithDebInfo" -DDXMATH_ARCHITECTURE=x86 -DCMAKE_CXX_COMPILER="g++.exe" -G "MinGW Makefiles" + - task: CMake@1 + displayName: CMake (MinGW32) Build Rel + inputs: + cwd: Tests + cmakeArgs: --build out2 + - task: CMake@1 + displayName: CMake (MinGW32) Dbg NI + inputs: + cwd: Tests + cmakeArgs: -B out3 -DCMAKE_BUILD_TYPE="Debug" -DBUILD_NO_INTRINSICS=ON -DDXMATH_ARCHITECTURE=x86 -DCMAKE_CXX_COMPILER="g++.exe" -G "MinGW Makefiles" + - task: CMake@1 + displayName: CMake (MinGW32) Build Dbg NI + inputs: + cwd: Tests + cmakeArgs: --build out3 + +- job: MINGW64_BUILD + displayName: 'Minimalist GNU for Windows (MinGW-W64) BUILD_TESTING=ON' + steps: + - checkout: self + clean: true + fetchTags: false + - task: CmdLine@2 + displayName: Fetch Tests + inputs: + script: git clone --quiet https://%GITHUB_PAT%@github.com/walbourn/directxmathtest.git Tests + workingDirectory: $(Build.SourcesDirectory) + - task: PowerShell@2 + displayName: Install MinGW-W64 + inputs: + targetType: inline + script: | + $ProgressPreference = 'SilentlyContinue' + Write-Host "Downloading winlibs..." + Invoke-WebRequest -Uri "$(URL_MINGW64)" -OutFile "gw64.zip" + Write-Host "Downloaded." + $fileHash = Get-FileHash -Algorithm SHA512 gw64.zip | ForEach { $_.Hash} | Out-String + $filehash = $fileHash.Trim() + Write-Host "##[debug]SHA512: " $fileHash + if ($fileHash -ne '$(HASH_MINGW64)') { + Write-Error -Message "##[error]Computed hash does not match!" -ErrorAction Stop + } + Write-Host "Extracting winlibs..." + Expand-Archive -LiteralPath 'gw64.zip' + Write-Host "Extracted." 
+ Write-Host "Added to path: $env:BUILD_SOURCESDIRECTORY\gw64\mingw64\bin" + Write-Host "##vso[task.prependpath]$env:BUILD_SOURCESDIRECTORY\gw64\mingw64\bin" + + workingDirectory: $(Build.SourcesDirectory) + - task: CmdLine@2 + displayName: GCC version + inputs: + script: g++ --version + - task: CMake@1 + displayName: CMake (MinGW-W64) Dbg + inputs: + cwd: Tests + cmakeArgs: -B out -DCMAKE_BUILD_TYPE="Debug" -DDXMATH_ARCHITECTURE=x64 -DCMAKE_CXX_COMPILER="g++.exe" -G "MinGW Makefiles" + - task: CMake@1 + displayName: CMake (MinGW-W64) Build Dbg + inputs: + cwd: Tests + cmakeArgs: --build out + - task: CMake@1 + displayName: CMake (MinGW-W64) Rel + inputs: + cwd: Tests + cmakeArgs: -B out2 -DCMAKE_BUILD_TYPE="RelWithDebInfo" -DDXMATH_ARCHITECTURE=x64 -DCMAKE_CXX_COMPILER="g++.exe" -G "MinGW Makefiles" + - task: CMake@1 + displayName: CMake (MinGW-W64) Build Rel + inputs: + cwd: Tests + cmakeArgs: --build out2 + - task: CMake@1 + displayName: CMake (MinGW-W64) Dbg NI + inputs: + cwd: Tests + cmakeArgs: -B out3 -DCMAKE_BUILD_TYPE="Debug" -DBUILD_NO_INTRINSICS=ON -DDXMATH_ARCHITECTURE=x64 -DCMAKE_CXX_COMPILER="g++.exe" -G "MinGW Makefiles" + - task: CMake@1 + displayName: CMake (MinGW-W64) Build Dbg NI + inputs: + cwd: Tests + cmakeArgs: --build out3 diff --git a/src/thirdparty/DirectXMath-dec2022/build/DirectXMath-GitHub-WSL-11.yml b/src/thirdparty/DirectXMath-dec2022/build/DirectXMath-GitHub-WSL-11.yml new file mode 100644 index 000000000..c8f7b3c6f --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/build/DirectXMath-GitHub-WSL-11.yml @@ -0,0 +1,64 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# +# http://go.microsoft.com/fwlink/?LinkID=615560 + +# Builds the math3 test suite for Windows Subsystem for Linux (WSL) + +schedules: +- cron: "0 3 * * *" + displayName: 'Nightly build' + branches: + include: + - main + +resources: + repositories: + - repository: self + type: git + ref: refs/heads/main + trigger: none + +name: $(Year:yyyy).$(Month).$(DayOfMonth)$(Rev:.r) + +pool: + vmImage: ubuntu-22.04 + +variables: + GITHUB_PAT: $(GITHUBPUBLICTOKEN) + +jobs: +- job: BUILD_WSL + displayName: 'Windows Subsystem for Linux (WSL)' + steps: + - checkout: self + clean: true + fetchTags: false + - task: CmdLine@2 + displayName: Fetch tests + inputs: + script: git clone --quiet https://%GITHUB_PAT%@github.com/walbourn/directxmathtest.git Tests + - task: CMake@1 + displayName: DirectXMath Tests + inputs: + cwd: Tests + cmakeArgs: . + - task: PowerShell@2 + displayName: Fetch SAL.H + inputs: + targetType: inline + script: | + $ProgressPreference = 'SilentlyContinue' + Invoke-WebRequest -Uri https://raw.githubusercontent.com/dotnet/corert/master/src/Native/inc/unix/sal.h -o $(Build.SourcesDirectory)/Inc/sal.h + $fileHash = Get-FileHash -Algorithm SHA512 $(Build.SourcesDirectory)/Inc/sal.h | ForEach { $_.Hash} | Out-String + $filehash = $fileHash.Trim() + Write-Host "##[debug]SHA512: " $filehash + if ($fileHash -ne "1643571673195d9eb892d2f2ac76eac7113ef7aa0ca116d79f3e4d3dc9df8a31600a9668b7e7678dfbe5a76906f9e0734ef8d6db0903ccc68fc742dd8238d8b0") { + Write-Error -Message "##[error]Computed hash does not match!" -ErrorAction Stop + } + + - task: CMake@1 + displayName: DirectXMath Tests Build + inputs: + cwd: Tests + cmakeArgs: --build . 
-v diff --git a/src/thirdparty/DirectXMath-dec2022/build/DirectXMath-GitHub-WSL.yml b/src/thirdparty/DirectXMath-dec2022/build/DirectXMath-GitHub-WSL.yml new file mode 100644 index 000000000..05d6c1117 --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/build/DirectXMath-GitHub-WSL.yml @@ -0,0 +1,64 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# +# http://go.microsoft.com/fwlink/?LinkID=615560 + +# Builds the math3 test suite for Windows Subsystem for Linux (WSL) + +schedules: +- cron: "0 3 * * *" + displayName: 'Nightly build' + branches: + include: + - main + +resources: + repositories: + - repository: self + type: git + ref: refs/heads/main + trigger: none + +name: $(Year:yyyy).$(Month).$(DayOfMonth)$(Rev:.r) + +pool: + vmImage: ubuntu-20.04 + +variables: + GITHUB_PAT: $(GITHUBPUBLICTOKEN) + +jobs: +- job: BUILD_WSL + displayName: 'Windows Subsystem for Linux (WSL)' + steps: + - checkout: self + clean: true + fetchTags: false + - task: CmdLine@2 + displayName: Fetch tests + inputs: + script: git clone --quiet https://%GITHUB_PAT%@github.com/walbourn/directxmathtest.git Tests + - task: CMake@1 + displayName: DirectXMath Tests + inputs: + cwd: Tests + cmakeArgs: . + - task: PowerShell@2 + displayName: Fetch SAL.H + inputs: + targetType: inline + script: | + $ProgressPreference = 'SilentlyContinue' + Invoke-WebRequest -Uri https://raw.githubusercontent.com/dotnet/corert/master/src/Native/inc/unix/sal.h -o $(Build.SourcesDirectory)/Inc/sal.h + $fileHash = Get-FileHash -Algorithm SHA512 $(Build.SourcesDirectory)/Inc/sal.h | ForEach { $_.Hash} | Out-String + $filehash = $fileHash.Trim() + Write-Host "##[debug]SHA512: " $filehash + if ($fileHash -ne "1643571673195d9eb892d2f2ac76eac7113ef7aa0ca116d79f3e4d3dc9df8a31600a9668b7e7678dfbe5a76906f9e0734ef8d6db0903ccc68fc742dd8238d8b0") { + Write-Error -Message "##[error]Computed hash does not match!" -ErrorAction Stop + } + + - task: CMake@1 + displayName: DirectXMath Tests Build + inputs: + cwd: Tests + cmakeArgs: --build . -v diff --git a/src/thirdparty/DirectXMath-dec2022/build/DirectXMath-GitHub.yml b/src/thirdparty/DirectXMath-dec2022/build/DirectXMath-GitHub.yml new file mode 100644 index 000000000..393762bf0 --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/build/DirectXMath-GitHub.yml @@ -0,0 +1,543 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# +# http://go.microsoft.com/fwlink/?LinkID=615560 + +# Builds the math3 test suite for DirectXMath. 
+ +schedules: +- cron: "0 0 * * *" + displayName: 'Nightly build' + branches: + include: + - main + +resources: + repositories: + - repository: self + type: git + ref: refs/heads/main + trigger: + branches: + include: + - main + paths: + exclude: + - README.md + - HISTORY.md + - SECURITY.md + +name: $(Year:yyyy).$(Month).$(DayOfMonth)$(Rev:.r) + +pool: + vmImage: windows-2019 + +variables: + GITHUB_PAT: $(GITHUBPUBLICTOKEN) + +jobs: +- job: BUILD_DEV16 + displayName: 'Visual Studio 2019 (v142)' + cancelTimeoutInMinutes: 1 + steps: + - checkout: self + clean: true + fetchTags: false + - task: DeleteFiles@1 + displayName: Delete files from Tests + inputs: + SourceFolder: Tests + Contents: '**' + RemoveSourceFolder: true + RemoveDotFiles: true + - task: CmdLine@2 + displayName: Fetch Tests + inputs: + script: git clone --quiet https://%GITHUB_PAT%@github.com/walbourn/directxmathtest.git Tests + - task: VSBuild@1 + displayName: Build solution math3_2019.sln x86dbg + inputs: + solution: Tests/math3/math3_2019.sln + vsVersion: 16.0 + platform: x86 + configuration: Debug + - task: VSBuild@1 + displayName: Build solution math3_2019.sln x86rel + inputs: + solution: Tests/math3/math3_2019.sln + vsVersion: 16.0 + platform: x86 + configuration: Release + - task: VSBuild@1 + displayName: Build solution math3_2019.sln x64dbg + inputs: + solution: Tests/math3/math3_2019.sln + vsVersion: 16.0 + platform: x64 + configuration: Debug + - task: VSBuild@1 + displayName: Build solution math3_2019.sln x64rel + inputs: + solution: Tests/math3/math3_2019.sln + vsVersion: 16.0 + platform: x64 + configuration: Release + - task: VSBuild@1 + displayName: Build solution math3_2019.sln arm64dbg + inputs: + solution: Tests/math3/math3_2019.sln + vsVersion: 16.0 + platform: ARM64 + configuration: Debug + - task: VSBuild@1 + displayName: Build solution math3_2019.sln arm64rel + inputs: + solution: Tests/math3/math3_2019.sln + vsVersion: 16.0 + platform: ARM64 + configuration: Release + - task: VSBuild@1 + displayName: Build solution math3_2019.sln x86dbg sse3 + inputs: + solution: Tests/math3/math3_2019.sln + vsVersion: 16.0 + platform: x86 + configuration: SSE3 Debug + - task: VSBuild@1 + displayName: Build solution math3_2019.sln x86rel sse3 + inputs: + solution: Tests/math3/math3_2019.sln + vsVersion: 16.0 + platform: x86 + configuration: SSE3 Release + - task: VSBuild@1 + displayName: Build solution math3_2019.sln x64dbg sse3 + inputs: + solution: Tests/math3/math3_2019.sln + vsVersion: 16.0 + platform: x64 + configuration: SSE3 Debug + - task: VSBuild@1 + displayName: Build solution math3_2019.sln x64rel sse3 + inputs: + solution: Tests/math3/math3_2019.sln + vsVersion: 16.0 + platform: x64 + configuration: SSE3 Release + - task: VSBuild@1 + displayName: Build solution math3_2019.sln x86dbg sse4 + inputs: + solution: Tests/math3/math3_2019.sln + vsVersion: 16.0 + platform: x86 + configuration: SSE4 Debug + - task: VSBuild@1 + displayName: Build solution math3_2019.sln x86rel sse4 + inputs: + solution: Tests/math3/math3_2019.sln + vsVersion: 16.0 + platform: x86 + configuration: SSE4 Release + - task: VSBuild@1 + displayName: Build solution math3_2019.sln x64dbg sse4 + inputs: + solution: Tests/math3/math3_2019.sln + vsVersion: 16.0 + platform: x64 + configuration: SSE4 Debug + - task: VSBuild@1 + displayName: Build solution math3_2019.sln x64rel sse4 + inputs: + solution: Tests/math3/math3_2019.sln + vsVersion: 16.0 + platform: x64 + configuration: SSE4 Release + - task: VSBuild@1 + displayName: Build solution 
math3_2019.sln x86dbg avx + inputs: + solution: Tests/math3/math3_2019.sln + vsVersion: 16.0 + platform: x86 + configuration: AVX Debug + - task: VSBuild@1 + displayName: Build solution math3_2019.sln x86rel avx + inputs: + solution: Tests/math3/math3_2019.sln + vsVersion: 16.0 + platform: x86 + configuration: AVX Release + - task: VSBuild@1 + displayName: Build solution math3_2019.sln x64dbg avx + inputs: + solution: Tests/math3/math3_2019.sln + vsVersion: 16.0 + platform: x64 + configuration: AVX Debug + - task: VSBuild@1 + displayName: Build solution math3_2019.sln x64rel avx + inputs: + solution: Tests/math3/math3_2019.sln + vsVersion: 16.0 + platform: x64 + configuration: AVX Release + - task: VSBuild@1 + displayName: Build solution math3_2019.sln x86dbg avx2 + inputs: + solution: Tests/math3/math3_2019.sln + vsVersion: 16.0 + platform: x86 + configuration: AVX2 Debug + - task: VSBuild@1 + displayName: Build solution math3_2019.sln x86rel avx2 + inputs: + solution: Tests/math3/math3_2019.sln + vsVersion: 16.0 + platform: x86 + configuration: AVX2 Release + - task: VSBuild@1 + displayName: Build solution math3_2019.sln x64dbg avx2 + inputs: + solution: Tests/math3/math3_2019.sln + vsVersion: 16.0 + platform: x64 + configuration: AVX2 Debug + - task: VSBuild@1 + displayName: Build solution math3_2019.sln x64rel avx2 + inputs: + solution: Tests/math3/math3_2019.sln + vsVersion: 16.0 + platform: x64 + configuration: AVX2 Release + - task: VSBuild@1 + displayName: Build solution math3_2019.sln x86dbg nointrinsics + inputs: + solution: Tests/math3/math3_2019.sln + vsVersion: 16.0 + platform: x86 + configuration: NI Debug + - task: VSBuild@1 + displayName: Build solution math3_2019.sln x86rel nointrinsics + inputs: + solution: Tests/math3/math3_2019.sln + vsVersion: 16.0 + platform: x86 + configuration: NI Release + - task: VSBuild@1 + displayName: Build solution math3_2019.sln x64dbg nointrinsics + inputs: + solution: Tests/math3/math3_2019.sln + vsVersion: 16.0 + platform: x64 + configuration: NI Debug + - task: VSBuild@1 + displayName: Build solution math3_2019.sln x64rel nointrinsics + inputs: + solution: Tests/math3/math3_2019.sln + vsVersion: 16.0 + platform: x64 + configuration: NI Release + - task: VSBuild@1 + displayName: Build solution math3_2019.sln arm64dbg nointrinsics + inputs: + solution: Tests/math3/math3_2019.sln + vsVersion: 16.0 + platform: ARM64 + configuration: NI Debug + - task: VSBuild@1 + displayName: Build solution math3_2019.sln arm86rel nointrinsics + inputs: + solution: Tests/math3/math3_2019.sln + vsVersion: 16.0 + platform: ARM64 + configuration: NI Release + - task: VSBuild@1 + displayName: Build solution math3_2019.sln x86dbg x87 + inputs: + solution: Tests/math3/math3_2019.sln + vsVersion: 16.0 + platform: x86 + configuration: x87 Debug + - task: VSBuild@1 + displayName: Build solution math3_2019.sln x86rel x87 + inputs: + solution: Tests/math3/math3_2019.sln + vsVersion: 16.0 + platform: x86 + configuration: x87 Release + - task: VSBuild@1 + displayName: Build solution shmath_2019.sln x64dbg + inputs: + solution: Tests/shmath/shmath_2019.sln + vsVersion: 16.0 + platform: x64 + configuration: Debug + - task: VSBuild@1 + displayName: Build solution shmath_2019.sln x64rel + inputs: + solution: Tests/shmath/shmath_2019.sln + vsVersion: 16.0 + platform: x64 + configuration: Release + - task: VSBuild@1 + displayName: Build solution shmath_2019.sln arm64dbg + inputs: + solution: Tests/shmath/shmath_2019.sln + vsVersion: 16.0 + platform: ARM64 + configuration: Debug 
+ - task: VSBuild@1 + displayName: Build solution shmath_2019.sln arm64rel + inputs: + solution: Tests/shmath/shmath_2019.sln + vsVersion: 16.0 + platform: ARM64 + configuration: Release + - task: VSBuild@1 + displayName: Build solution XDSPTest_2019 x64dbg + inputs: + solution: Tests/xdsp/XDSPTest_2019.sln + vsVersion: 16.0 + platform: x64 + configuration: Debug + - task: VSBuild@1 + displayName: Build solution XDSPTest_2019 x64rel + inputs: + solution: Tests/xdsp/XDSPTest_2019.sln + vsVersion: 16.0 + platform: x64 + configuration: Release + - task: VSBuild@1 + displayName: Build solution XDSPTest_2019 arm64dbg + inputs: + solution: Tests/xdsp/XDSPTest_2019.sln + vsVersion: 16.0 + platform: ARM64 + configuration: Debug + - task: VSBuild@1 + displayName: Build solution XDSPTest_2019 arm64rel + inputs: + solution: Tests/xdsp/XDSPTest_2019.sln + vsVersion: 16.0 + platform: ARM64 + configuration: Release + +- job: BUILD_DEV15 + displayName: 'Visual Studio 2019 (v141)' + steps: + - checkout: self + clean: true + fetchTags: false + - task: CmdLine@2 + displayName: Fetch Tests + inputs: + script: git clone --quiet https://%GITHUB_PAT%@github.com/walbourn/directxmathtest.git Tests + - task: VSBuild@1 + displayName: Build solution math3_2017.sln x86dbg + inputs: + solution: Tests/math3/math3_2017.sln + vsVersion: 16.0 + platform: x86 + configuration: Debug + - task: VSBuild@1 + displayName: Build solution math3_2017.sln x86rel + inputs: + solution: Tests/math3/math3_2017.sln + vsVersion: 16.0 + platform: x86 + configuration: Release + - task: VSBuild@1 + displayName: Build solution math3_2017.sln x64dbg + inputs: + solution: Tests/math3/math3_2017.sln + vsVersion: 16.0 + platform: x64 + configuration: Debug + - task: VSBuild@1 + displayName: Build solution math3_2017.sln x64rel + inputs: + solution: Tests/math3/math3_2017.sln + vsVersion: 16.0 + platform: x64 + configuration: Release + - task: VSBuild@1 + displayName: Build solution math3_2017.sln x86dbg sse3 + inputs: + solution: Tests/math3/math3_2017.sln + vsVersion: 16.0 + platform: x86 + configuration: SSE3 Debug + - task: VSBuild@1 + displayName: Build solution math3_2017.sln x86rel sse3 + inputs: + solution: Tests/math3/math3_2017.sln + vsVersion: 16.0 + platform: x86 + configuration: SSE3 Release + - task: VSBuild@1 + displayName: Build solution math3_2017.sln x64dbg sse3 + inputs: + solution: Tests/math3/math3_2017.sln + vsVersion: 16.0 + platform: x64 + configuration: SSE3 Debug + - task: VSBuild@1 + displayName: Build solution math3_2017.sln x64rel sse3 + inputs: + solution: Tests/math3/math3_2017.sln + vsVersion: 16.0 + platform: x64 + configuration: SSE3 Release + - task: VSBuild@1 + displayName: Build solution math3_2017.sln x86dbg sse4 + inputs: + solution: Tests/math3/math3_2017.sln + vsVersion: 16.0 + platform: x86 + configuration: SSE4 Debug + - task: VSBuild@1 + displayName: Build solution math3_2017.sln x86rel sse4 + inputs: + solution: Tests/math3/math3_2017.sln + vsVersion: 16.0 + platform: x86 + configuration: SSE4 Release + - task: VSBuild@1 + displayName: Build solution math3_2017.sln x64dbg sse4 + inputs: + solution: Tests/math3/math3_2017.sln + vsVersion: 16.0 + platform: x64 + configuration: SSE4 Debug + - task: VSBuild@1 + displayName: Build solution math3_2017.sln x64rel sse4 + inputs: + solution: Tests/math3/math3_2017.sln + vsVersion: 16.0 + platform: x64 + configuration: SSE4 Release + - task: VSBuild@1 + displayName: Build solution math3_2017.sln x86dbg avx + inputs: + solution: Tests/math3/math3_2017.sln + 
vsVersion: 16.0 + platform: x86 + configuration: AVX Debug + - task: VSBuild@1 + displayName: Build solution math3_2017.sln x86rel avx + inputs: + solution: Tests/math3/math3_2017.sln + vsVersion: 16.0 + platform: x86 + configuration: AVX Release + - task: VSBuild@1 + displayName: Build solution math3_2017.sln x64dbg avx + inputs: + solution: Tests/math3/math3_2017.sln + vsVersion: 16.0 + platform: x64 + configuration: AVX Debug + - task: VSBuild@1 + displayName: Build solution math3_2017.sln x64rel avx + inputs: + solution: Tests/math3/math3_2017.sln + vsVersion: 16.0 + platform: x64 + configuration: AVX Release + - task: VSBuild@1 + displayName: Build solution math3_2017.sln x86dbg avx2 + inputs: + solution: Tests/math3/math3_2017.sln + vsVersion: 16.0 + platform: x86 + configuration: AVX2 Debug + - task: VSBuild@1 + displayName: Build solution math3_2017.sln x86rel avx2 + inputs: + solution: Tests/math3/math3_2017.sln + vsVersion: 16.0 + platform: x86 + configuration: AVX2 Release + - task: VSBuild@1 + displayName: Build solution math3_2017.sln x64dbg avx2 + inputs: + solution: Tests/math3/math3_2017.sln + vsVersion: 16.0 + platform: x64 + configuration: AVX2 Debug + - task: VSBuild@1 + displayName: Build solution math3_2017.sln x64rel avx2 + inputs: + solution: Tests/math3/math3_2017.sln + vsVersion: 16.0 + platform: x64 + configuration: AVX2 Release + - task: VSBuild@1 + displayName: Build solution math3_2017.sln x86dbg nointrinsics + inputs: + solution: Tests/math3/math3_2017.sln + vsVersion: 16.0 + platform: x86 + configuration: NI Debug + - task: VSBuild@1 + displayName: Build solution math3_2017.sln x86rel nointrinsics + inputs: + solution: Tests/math3/math3_2017.sln + vsVersion: 16.0 + platform: x86 + configuration: NI Release + - task: VSBuild@1 + displayName: Build solution math3_2017.sln x64dbg nointrinsics + inputs: + solution: Tests/math3/math3_2017.sln + vsVersion: 16.0 + platform: x64 + configuration: NI Debug + - task: VSBuild@1 + displayName: Build solution math3_2017.sln x64rel nointrinsics + inputs: + solution: Tests/math3/math3_2017.sln + vsVersion: 16.0 + platform: x64 + configuration: NI Release + - task: VSBuild@1 + displayName: Build solution math3_2017.sln x86dbg x87 + inputs: + solution: Tests/math3/math3_2017.sln + vsVersion: 16.0 + platform: x86 + configuration: x87 Debug + - task: VSBuild@1 + displayName: Build solution math3_2017.sln x86rel x87 + inputs: + solution: Tests/math3/math3_2017.sln + vsVersion: 16.0 + platform: x86 + configuration: x87 Release + - task: VSBuild@1 + displayName: Build solution shmath_2017.sln x64dbg + inputs: + solution: Tests/shmath/shmath_2017.sln + vsVersion: 16.0 + platform: x64 + configuration: Debug + - task: VSBuild@1 + displayName: Build solution shmath_2017.sln x64rel + inputs: + solution: Tests/shmath/shmath_2017.sln + vsVersion: 16.0 + platform: x64 + configuration: Release + - task: VSBuild@1 + displayName: Build solution XDSPTest_2017 x64dbg + inputs: + solution: Tests/xdsp/XDSPTest_2017.sln + vsVersion: 16.0 + platform: x64 + configuration: Debug + - task: VSBuild@1 + displayName: Build solution XDSPTest_2017 x64rel + inputs: + solution: Tests/xdsp/XDSPTest_2017.sln + vsVersion: 16.0 + platform: x64 + configuration: Release diff --git a/src/thirdparty/DirectXMath-dec2022/build/DirectXMath-config.cmake.in b/src/thirdparty/DirectXMath-dec2022/build/DirectXMath-config.cmake.in new file mode 100644 index 000000000..2a485225c --- /dev/null +++ b/src/thirdparty/DirectXMath-dec2022/build/DirectXMath-config.cmake.in @@ -0,0 
+1,5 @@ +@PACKAGE_INIT@ + +include(${CMAKE_CURRENT_LIST_DIR}/@PROJECT_NAME@-targets.cmake) + +check_required_components("@PROJECT_NAME@") diff --git a/src/thirdparty/dotnetrt/sal.h b/src/thirdparty/dotnetrt/sal.h new file mode 100644 index 000000000..2e0457140 --- /dev/null +++ b/src/thirdparty/dotnetrt/sal.h @@ -0,0 +1,2953 @@ +// VALVE EDIT: +// taken from https://github.com/dotnet/runtime/blob/main/src/coreclr/pal/inc/rt/sal.h +// used for DirectXMath compatibly on POSIX + +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +/*** +*sal.h - markers for documenting the semantics of APIs +* + +* +*Purpose: +* sal.h provides a set of annotations to describe how a function uses its +* parameters - the assumptions it makes about them, and the guarantees it makes +* upon finishing. +****/ +#pragma once + +/*========================================================================== + + The comments in this file are intended to give basic understanding of + the usage of SAL, the Microsoft Source Code Annotation Language. + For more details, please see http://go.microsoft.com/fwlink/?LinkID=242134 + + The macros are defined in 3 layers, plus the structural set: + + _In_/_Out_/_Ret_ Layer: + ---------------------- + This layer provides the highest abstraction and its macros should be used + in most cases. These macros typically start with: + _In_ : input parameter to a function, unmodified by called function + _Out_ : output parameter, written to by called function, pointed-to + location not expected to be initialized prior to call + _Outptr_ : like _Out_ when returned variable is a pointer type + (so param is pointer-to-pointer type). Called function + provides/allocated space. + _Outref_ : like _Outptr_, except param is reference-to-pointer type. + _Inout_ : inout parameter, read from and potentially modified by + called function. + _Ret_ : for return values + _Field_ : class/struct field invariants + For common usage, this class of SAL provides the most concise annotations. + Note that _In_/_Out_/_Inout_/_Outptr_ annotations are designed to be used + with a parameter target. Using them with _At_ to specify non-parameter + targets may yield unexpected results. + + This layer also includes a number of other properties that can be specified + to extend the ability of code analysis, most notably: + -- Designating parameters as format strings for printf/scanf/scanf_s + -- Requesting stricter type checking for C enum parameters + + _Pre_/_Post_ Layer: + ------------------ + The macros of this layer only should be used when there is no suitable macro + in the _In_/_Out_ layer. Its macros start with _Pre_ or _Post_. + This layer provides the most flexibility for annotations. + + Implementation Abstraction Layer: + -------------------------------- + Macros from this layer should never be used directly. The layer only exists + to hide the implementation of the annotation macros. + + Structural Layer: + ---------------- + These annotations, like _At_ and _When_, are used with annotations from + any of the other layers as modifiers, indicating exactly when and where + the annotations apply. + + + Common syntactic conventions: + ---------------------------- + + Usage: + ----- + _In_, _Out_, _Inout_, _Pre_, _Post_, are for formal parameters. + _Ret_, _Deref_ret_ must be used for return values. + + Nullness: + -------- + If the parameter can be NULL as a precondition to the function, the + annotation contains _opt. 
If the macro does not contain '_opt' the + parameter cannot be NULL. + + If an out/inout parameter returns a null pointer as a postcondition, this is + indicated by _Ret_maybenull_ or _result_maybenull_. If the macro is not + of this form, then the result will not be NULL as a postcondition. + _Outptr_ - output value is not NULL + _Outptr_result_maybenull_ - output value might be NULL + + String Type: + ----------- + _z: NullTerminated string + for _In_ parameters the buffer must have the specified stringtype before the call + for _Out_ parameters the buffer must have the specified stringtype after the call + for _Inout_ parameters both conditions apply + + Extent Syntax: + ------------- + Buffer sizes are expressed as element counts, unless the macro explicitly + contains _byte_ or _bytes_. Some annotations specify two buffer sizes, in + which case the second is used to indicate how much of the buffer is valid + as a postcondition. This table outlines the precondition buffer allocation + size, precondition number of valid elements, postcondition allocation size, + and postcondition number of valid elements for representative buffer size + annotations: + Pre | Pre | Post | Post + alloc | valid | alloc | valid + Annotation elems | elems | elems | elems + ---------- ------------------------------------ + _In_reads_(s) s | s | s | s + _Inout_updates_(s) s | s | s | s + _Inout_updates_to_(s,c) s | s | s | c + _Out_writes_(s) s | 0 | s | s + _Out_writes_to_(s,c) s | 0 | s | c + _Outptr_result_buffer_(s) ? | ? | s | s + _Outptr_result_buffer_to_(s,c) ? | ? | s | c + + For the _Outptr_ annotations, the buffer in question is at one level of + dereference. The called function is responsible for supplying the buffer. + + Success and failure: + ------------------- + The SAL concept of success allows functions to define expressions that can + be tested by the caller, which if it evaluates to non-zero, indicates the + function succeeded, which means that its postconditions are guaranteed to + hold. Otherwise, if the expression evaluates to zero, the function is + considered to have failed, and the postconditions are not guaranteed. + + The success criteria can be specified with the _Success_(expr) annotation: + _Success_(return != FALSE) BOOL + PathCanonicalizeA(_Out_writes_(MAX_PATH) LPSTR pszBuf, LPCSTR pszPath) : + pszBuf is only guaranteed to be NULL-terminated when TRUE is returned, + and FALSE indicates failure. In common practice, callers check for zero + vs. non-zero returns, so it is preferable to express the success + criteria in terms of zero/non-zero, not checked for exactly TRUE. + + Functions can specify that some postconditions will still hold, even when + the function fails, using _On_failure_(anno-list), or postconditions that + hold regardless of success or failure using _Always_(anno-list). + + The annotation _Return_type_success_(expr) may be used with a typedef to + give a default _Success_ criteria to all functions returning that type. + This is the case for common Windows API status types, including + HRESULT and NTSTATUS. This may be overridden on a per-function basis by + specifying a _Success_ annotation locally. 
+ +============================================================================*/ + +#define __ATTR_SAL + +#ifndef _SAL_VERSION /*IFSTRIP=IGN*/ +#define _SAL_VERSION 20 +#endif + +#ifdef _PREFAST_ // [ + +// choose attribute or __declspec implementation +#ifndef _USE_DECLSPECS_FOR_SAL // [ +#define _USE_DECLSPECS_FOR_SAL 1 +#endif // ] + +#if _USE_DECLSPECS_FOR_SAL // [ +#undef _USE_ATTRIBUTES_FOR_SAL +#define _USE_ATTRIBUTES_FOR_SAL 0 +#elif !defined(_USE_ATTRIBUTES_FOR_SAL) // ][ +#if _MSC_VER >= 1400 /*IFSTRIP=IGN*/ // [ +#define _USE_ATTRIBUTES_FOR_SAL 1 +#else // ][ +#define _USE_ATTRIBUTES_FOR_SAL 0 +#endif // ] +#endif // ] + + +#if !_USE_DECLSPECS_FOR_SAL // [ +#if !_USE_ATTRIBUTES_FOR_SAL // [ +#if _MSC_VER >= 1400 /*IFSTRIP=IGN*/ // [ +#undef _USE_ATTRIBUTES_FOR_SAL +#define _USE_ATTRIBUTES_FOR_SAL 1 +#else // ][ +#undef _USE_DECLSPECS_FOR_SAL +#define _USE_DECLSPECS_FOR_SAL 1 +#endif // ] +#endif // ] +#endif // ] + +#else + +// Disable expansion of SAL macros in non-Prefast mode to +// improve compiler throughput. +#ifndef _USE_DECLSPECS_FOR_SAL // [ +#define _USE_DECLSPECS_FOR_SAL 0 +#endif // ] +#ifndef _USE_ATTRIBUTES_FOR_SAL // [ +#define _USE_ATTRIBUTES_FOR_SAL 0 +#endif // ] + +#endif // ] + +// safeguard for MIDL and RC builds +#if _USE_DECLSPECS_FOR_SAL && ( defined( MIDL_PASS ) || defined(__midl) || defined(RC_INVOKED) || !defined(_PREFAST_) ) /*IFSTRIP=IGN*/ // [ +#undef _USE_DECLSPECS_FOR_SAL +#define _USE_DECLSPECS_FOR_SAL 0 +#endif // ] +#if _USE_ATTRIBUTES_FOR_SAL && ( !defined(_MSC_EXTENSIONS) || defined( MIDL_PASS ) || defined(__midl) || defined(RC_INVOKED) ) /*IFSTRIP=IGN*/ // [ +#undef _USE_ATTRIBUTES_FOR_SAL +#define _USE_ATTRIBUTES_FOR_SAL 0 +#endif // ] + +#if _USE_DECLSPECS_FOR_SAL || _USE_ATTRIBUTES_FOR_SAL + +// Special enum type for Y/N/M +enum __SAL_YesNo {_SAL_notpresent, _SAL_no, _SAL_maybe, _SAL_yes, _SAL_default}; + +#endif + +#if defined(BUILD_WINDOWS) && !_USE_ATTRIBUTES_FOR_SAL /*IFSTRIP=IGN*/ +#define _SAL1_Source_(Name, args, annotes) _SA_annotes3(SAL_name, #Name, "", "1") _GrouP_(annotes _SAL_nop_impl_) +#define _SAL1_1_Source_(Name, args, annotes) _SA_annotes3(SAL_name, #Name, "", "1.1") _GrouP_(annotes _SAL_nop_impl_) +#define _SAL1_2_Source_(Name, args, annotes) _SA_annotes3(SAL_name, #Name, "", "1.2") _GrouP_(annotes _SAL_nop_impl_) +#define _SAL2_Source_(Name, args, annotes) _SA_annotes3(SAL_name, #Name, "", "2") _GrouP_(annotes _SAL_nop_impl_) +#else +#define _SAL1_Source_(Name, args, annotes) _SA_annotes3(SAL_name, #Name, "", "1") _Group_(annotes _SAL_nop_impl_) +#define _SAL1_1_Source_(Name, args, annotes) _SA_annotes3(SAL_name, #Name, "", "1.1") _Group_(annotes _SAL_nop_impl_) +#define _SAL1_2_Source_(Name, args, annotes) _SA_annotes3(SAL_name, #Name, "", "1.2") _Group_(annotes _SAL_nop_impl_) +#define _SAL2_Source_(Name, args, annotes) _SA_annotes3(SAL_name, #Name, "", "2") _Group_(annotes _SAL_nop_impl_) +#endif + +//============================================================================ +// Structural SAL: +// These annotations modify the use of other annotations. They may +// express the annotation target (i.e. what parameter/field the annotation +// applies to) or the condition under which the annotation is applicable. +//============================================================================ + +// _At_(target, annos) specifies that the annotations listed in 'annos' is to +// be applied to 'target' rather than to the identifier which is the current +// lexical target. 
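+// EDITOR'S NOTE (illustrative sketch only, not part of the upstream sal.h):
+// _At_ retargets an annotation onto an expression. Assuming a hypothetical
+// helper ResetCount(), the postcondition below applies to *pCount rather than
+// to the pointer parameter itself:
+//   void ResetCount(_At_(*pCount, _Post_equal_to_(0)) int* pCount);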
+#define _At_(target, annos) _At_impl_(target, annos _SAL_nop_impl_) + +// _At_buffer_(target, iter, bound, annos) is similar to _At_, except that +// target names a buffer, and each annotation in annos is applied to each +// element of target up to bound, with the variable named in iter usable +// by the annotations to refer to relevant offsets within target. +#define _At_buffer_(target, iter, bound, annos) _At_buffer_impl_(target, iter, bound, annos _SAL_nop_impl_) + +// _When_(expr, annos) specifies that the annotations listed in 'annos' only +// apply when 'expr' evaluates to non-zero. +#define _When_(expr, annos) _When_impl_(expr, annos _SAL_nop_impl_) +#define _Group_(annos) _Group_impl_(annos _SAL_nop_impl_) +#define _GrouP_(annos) _GrouP_impl_(annos _SAL_nop_impl_) + +// indicates whether normal post conditions apply to a function +#define _Success_(expr) _SAL2_Source_(_Success_, (expr), _Success_impl_(expr)) + +// indicates whether post conditions apply to a function returning +// the type that this annotation is applied to +#define _Return_type_success_(expr) _SAL2_Source_(_Return_type_success_, (expr), _Success_impl_(expr)) + +// Establish postconditions that apply only if the function does not succeed +#define _On_failure_(annos) _On_failure_impl_(annos _SAL_nop_impl_) + +// Establish postconditions that apply in both success and failure cases. +// Only applicable with functions that have _Success_ or _Return_type_succss_. +#define _Always_(annos) _Always_impl_(annos _SAL_nop_impl_) + +// Usable on a function definition. Asserts that a function declaration is +// in scope, and its annotations are to be used. There are no other annotations +// allowed on the function definition. +#define _Use_decl_annotations_ _Use_decl_anno_impl_ + +// _Notref_ may precede a _Deref_ or "real" annotation, and removes one +// level of dereference if the parameter is a C++ reference (&). If the +// net deref on a "real" annotation is negative, it is simply discarded. +#define _Notref_ _Notref_impl_ + +// Annotations for defensive programming styles. +#define _Pre_defensive_ _SA_annotes0(SAL_pre_defensive) +#define _Post_defensive_ _SA_annotes0(SAL_post_defensive) + +#define _In_defensive_(annotes) _Pre_defensive_ _Group_(annotes) +#define _Out_defensive_(annotes) _Post_defensive_ _Group_(annotes) +#define _Inout_defensive_(annotes) _Pre_defensive_ _Post_defensive_ _Group_(annotes) + +//============================================================================ +// _In_\_Out_ Layer: +//============================================================================ + +// Reserved pointer parameters, must always be NULL. +#define _Reserved_ _SAL2_Source_(_Reserved_, (), _Pre1_impl_(__null_impl)) + +// _Const_ allows specification that any namable memory location is considered +// readonly for a given call. +#define _Const_ _SAL2_Source_(_Const_, (), _Pre1_impl_(__readaccess_impl_notref)) + + +// Input parameters -------------------------- + +// _In_ - Annotations for parameters where data is passed into the function, but not modified. +// _In_ by itself can be used with non-pointer types (although it is redundant). + +// e.g. void SetPoint( _In_ const POINT* pPT ); +#define _In_ _SAL2_Source_(_In_, (), _Pre1_impl_(__notnull_impl_notref) _Pre_valid_impl_ _Deref_pre1_impl_(__readaccess_impl_notref)) +#define _In_opt_ _SAL2_Source_(_In_opt_, (), _Pre1_impl_(__maybenull_impl_notref) _Pre_valid_impl_ _Deref_pre_readonly_) + +// nullterminated 'in' parameters. +// e.g. 
void CopyStr( _In_z_ const char* szFrom, _Out_z_cap_(cchTo) char* szTo, size_t cchTo ); +#define _In_z_ _SAL2_Source_(_In_z_, (), _In_ _Pre1_impl_(__zterm_impl)) +#define _In_opt_z_ _SAL2_Source_(_In_opt_z_, (), _In_opt_ _Pre1_impl_(__zterm_impl)) + + +// 'input' buffers with given size + +#define _In_reads_(size) _SAL2_Source_(_In_reads_, (size), _Pre_count_(size) _Deref_pre_readonly_) +#define _In_reads_opt_(size) _SAL2_Source_(_In_reads_opt_, (size), _Pre_opt_count_(size) _Deref_pre_readonly_) +#define _In_reads_bytes_(size) _SAL2_Source_(_In_reads_bytes_, (size), _Pre_bytecount_(size) _Deref_pre_readonly_) +#define _In_reads_bytes_opt_(size) _SAL2_Source_(_In_reads_bytes_opt_, (size), _Pre_opt_bytecount_(size) _Deref_pre_readonly_) +#define _In_reads_z_(size) _SAL2_Source_(_In_reads_z_, (size), _In_reads_(size) _Pre_z_) +#define _In_reads_opt_z_(size) _SAL2_Source_(_In_reads_opt_z_, (size), _Pre_opt_count_(size) _Deref_pre_readonly_ _Pre_opt_z_) +#define _In_reads_or_z_(size) _SAL2_Source_(_In_reads_or_z_, (size), _In_ _When_(_String_length_(_Curr_) < (size), _Pre_z_) _When_(_String_length_(_Curr_) >= (size), _Pre1_impl_(__count_impl(size)))) +#define _In_reads_or_z_opt_(size) _SAL2_Source_(_In_reads_or_z_opt_, (size), _In_opt_ _When_(_String_length_(_Curr_) < (size), _Pre_z_) _When_(_String_length_(_Curr_) >= (size), _Pre1_impl_(__count_impl(size)))) + + +// 'input' buffers valid to the given end pointer + +#define _In_reads_to_ptr_(ptr) _SAL2_Source_(_In_reads_to_ptr_, (ptr), _Pre_ptrdiff_count_(ptr) _Deref_pre_readonly_) +#define _In_reads_to_ptr_opt_(ptr) _SAL2_Source_(_In_reads_to_ptr_opt_, (ptr), _Pre_opt_ptrdiff_count_(ptr) _Deref_pre_readonly_) +#define _In_reads_to_ptr_z_(ptr) _SAL2_Source_(_In_reads_to_ptr_z_, (ptr), _In_reads_to_ptr_(ptr) _Pre_z_) +#define _In_reads_to_ptr_opt_z_(ptr) _SAL2_Source_(_In_reads_to_ptr_opt_z_, (ptr), _Pre_opt_ptrdiff_count_(ptr) _Deref_pre_readonly_ _Pre_opt_z_) + + + +// Output parameters -------------------------- + +// _Out_ - Annotations for pointer or reference parameters where data passed back to the caller. +// These are mostly used where the pointer/reference is to a non-pointer type. +// _Outptr_/_Outref) (see below) are typically used to return pointers via parameters. + +// e.g. 
void GetPoint( _Out_ POINT* pPT ); +#define _Out_ _SAL2_Source_(_Out_, (), _Out_impl_) +#define _Out_opt_ _SAL2_Source_(_Out_opt_, (), _Out_opt_impl_) + +#define _Out_writes_(size) _SAL2_Source_(_Out_writes_, (size), _Pre_cap_(size) _Post_valid_impl_) +#define _Out_writes_opt_(size) _SAL2_Source_(_Out_writes_opt_, (size), _Pre_opt_cap_(size) _Post_valid_impl_) +#define _Out_writes_bytes_(size) _SAL2_Source_(_Out_writes_bytes_, (size), _Pre_bytecap_(size) _Post_valid_impl_) +#define _Out_writes_bytes_opt_(size) _SAL2_Source_(_Out_writes_bytes_opt_, (size), _Pre_opt_bytecap_(size) _Post_valid_impl_) +#define _Out_writes_z_(size) _SAL2_Source_(_Out_writes_z_, (size), _Pre_cap_(size) _Post_valid_impl_ _Post_z_) +#define _Out_writes_opt_z_(size) _SAL2_Source_(_Out_writes_opt_z_, (size), _Pre_opt_cap_(size) _Post_valid_impl_ _Post_z_) + +#define _Out_writes_to_(size,count) _SAL2_Source_(_Out_writes_to_, (size,count), _Pre_cap_(size) _Post_valid_impl_ _Post_count_(count)) +#define _Out_writes_to_opt_(size,count) _SAL2_Source_(_Out_writes_to_opt_, (size,count), _Pre_opt_cap_(size) _Post_valid_impl_ _Post_count_(count)) +#define _Out_writes_all_(size) _SAL2_Source_(_Out_writes_all_, (size), _Out_writes_to_(_Old_(size), _Old_(size))) +#define _Out_writes_all_opt_(size) _SAL2_Source_(_Out_writes_all_opt_, (size), _Out_writes_to_opt_(_Old_(size), _Old_(size))) + +#define _Out_writes_bytes_to_(size,count) _SAL2_Source_(_Out_writes_bytes_to_, (size,count), _Pre_bytecap_(size) _Post_valid_impl_ _Post_bytecount_(count)) +#define _Out_writes_bytes_to_opt_(size,count) _SAL2_Source_(_Out_writes_bytes_to_opt_, (size,count), _Pre_opt_bytecap_(size) _Post_valid_impl_ _Post_bytecount_(count)) +#define _Out_writes_bytes_all_(size) _SAL2_Source_(_Out_writes_bytes_all_, (size), _Out_writes_bytes_to_(_Old_(size), _Old_(size))) +#define _Out_writes_bytes_all_opt_(size) _SAL2_Source_(_Out_writes_bytes_all_opt_, (size), _Out_writes_bytes_to_opt_(_Old_(size), _Old_(size))) + +#define _Out_writes_to_ptr_(ptr) _SAL2_Source_(_Out_writes_to_ptr_, (ptr), _Pre_ptrdiff_cap_(ptr) _Post_valid_impl_) +#define _Out_writes_to_ptr_opt_(ptr) _SAL2_Source_(_Out_writes_to_ptr_opt_, (ptr), _Pre_opt_ptrdiff_cap_(ptr) _Post_valid_impl_) +#define _Out_writes_to_ptr_z_(ptr) _SAL2_Source_(_Out_writes_to_ptr_z_, (ptr), _Pre_ptrdiff_cap_(ptr) _Post_valid_impl_ Post_z_) +#define _Out_writes_to_ptr_opt_z_(ptr) _SAL2_Source_(_Out_writes_to_ptr_opt_z_, (ptr), _Pre_opt_ptrdiff_cap_(ptr) _Post_valid_impl_ Post_z_) + + +// Inout parameters ---------------------------- + +// _Inout_ - Annotations for pointer or reference parameters where data is passed in and +// potentially modified. 
+// void ModifyPoint( _Inout_ POINT* pPT ); +// void ModifyPointByRef( _Inout_ POINT& pPT ); + +#define _Inout_ _SAL2_Source_(_Inout_, (), _Prepost_valid_) +#define _Inout_opt_ _SAL2_Source_(_Inout_opt_, (), _Prepost_opt_valid_) + +// For modifying string buffers +// void toupper( _Inout_z_ char* sz ); +#define _Inout_z_ _SAL2_Source_(_Inout_z_, (), _Prepost_z_) +#define _Inout_opt_z_ _SAL2_Source_(_Inout_opt_z_, (), _Prepost_opt_z_) + +// For modifying buffers with explicit element size +#define _Inout_updates_(size) _SAL2_Source_(_Inout_updates_, (size), _Pre_cap_(size) _Pre_valid_impl_ _Post_valid_impl_) +#define _Inout_updates_opt_(size) _SAL2_Source_(_Inout_updates_opt_, (size), _Pre_opt_cap_(size) _Pre_valid_impl_ _Post_valid_impl_) +#define _Inout_updates_z_(size) _SAL2_Source_(_Inout_updates_z_, (size), _Pre_cap_(size) _Pre_valid_impl_ _Post_valid_impl_ _Pre1_impl_(__zterm_impl) _Post1_impl_(__zterm_impl)) +#define _Inout_updates_opt_z_(size) _SAL2_Source_(_Inout_updates_opt_z_, (size), _Pre_opt_cap_(size) _Pre_valid_impl_ _Post_valid_impl_ _Pre1_impl_(__zterm_impl) _Post1_impl_(__zterm_impl)) + +#define _Inout_updates_to_(size,count) _SAL2_Source_(_Inout_updates_to_, (size,count), _Out_writes_to_(size,count) _Pre_valid_impl_ _Pre1_impl_(__count_impl(count))) +#define _Inout_updates_to_opt_(size,count) _SAL2_Source_(_Inout_updates_to_opt_, (size,count), _Out_writes_to_opt_(size,count) _Pre_valid_impl_ _Pre1_impl_(__count_impl(count))) + +#define _Inout_updates_all_(size) _SAL2_Source_(_Inout_updates_all_, (size), _Inout_updates_to_(_Old_(size), _Old_(size))) +#define _Inout_updates_all_opt_(size) _SAL2_Source_(_Inout_updates_all_opt_, (size), _Inout_updates_to_opt_(_Old_(size), _Old_(size))) + +// For modifying buffers with explicit byte size +#define _Inout_updates_bytes_(size) _SAL2_Source_(_Inout_updates_bytes_, (size), _Pre_bytecap_(size) _Pre_valid_impl_ _Post_valid_impl_) +#define _Inout_updates_bytes_opt_(size) _SAL2_Source_(_Inout_updates_bytes_opt_, (size), _Pre_opt_bytecap_(size) _Pre_valid_impl_ _Post_valid_impl_) + +#define _Inout_updates_bytes_to_(size,count) _SAL2_Source_(_Inout_updates_bytes_to_, (size,count), _Out_writes_bytes_to_(size,count) _Pre_valid_impl_ _Pre1_impl_(__bytecount_impl(count))) +#define _Inout_updates_bytes_to_opt_(size,count) _SAL2_Source_(_Inout_updates_bytes_to_opt_, (size,count), _Out_writes_bytes_to_opt_(size,count) _Pre_valid_impl_ _Pre1_impl_(__bytecount_impl(count))) + +#define _Inout_updates_bytes_all_(size) _SAL2_Source_(_Inout_updates_bytes_all_, (size), _Inout_updates_bytes_to_(_Old_(size), _Old_(size))) +#define _Inout_updates_bytes_all_opt_(size) _SAL2_Source_(_Inout_updates_bytes_all_opt_, (size), _Inout_updates_bytes_to_opt_(_Old_(size), _Old_(size))) + + +// Pointer to pointer parameters ------------------------- + +// _Outptr_ - Annotations for output params returning pointers +// These describe parameters where the called function provides the buffer: +// HRESULT SHStrDupW(_In_ LPCWSTR psz, _Outptr_ LPWSTR *ppwsz); +// The caller passes the address of an LPWSTR variable as ppwsz, and SHStrDupW allocates +// and initializes memory and returns the pointer to the new LPWSTR in *ppwsz. +// +// _Outptr_opt_ - describes parameters that are allowed to be NULL. +// _Outptr_*_result_maybenull_ - describes parameters where the called function might return NULL to the caller. 
+// +// Example: +// void MyFunc(_Outptr_opt_ int **ppData1, _Outptr_result_maybenull_ int **ppData2); +// Callers: +// MyFunc(NULL, NULL); // error: parameter 2, ppData2, should not be NULL +// MyFunc(&pData1, &pData2); // ok: both non-NULL +// if (*pData1 == *pData2) ... // error: pData2 might be NULL after call + +#define _Outptr_ _SAL2_Source_(_Outptr_, (), _Out_impl_ _Deref_post2_impl_(__notnull_impl_notref, __count_impl(1))) +#define _Outptr_result_maybenull_ _SAL2_Source_(_Outptr_result_maybenull_, (), _Out_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __count_impl(1))) +#define _Outptr_opt_ _SAL2_Source_(_Outptr_opt_, (), _Out_opt_impl_ _Deref_post2_impl_(__notnull_impl_notref, __count_impl(1))) +#define _Outptr_opt_result_maybenull_ _SAL2_Source_(_Outptr_opt_result_maybenull_, (), _Out_opt_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __count_impl(1))) + +// Annotations for _Outptr_ parameters returning pointers to null terminated strings. + +#define _Outptr_result_z_ _SAL2_Source_(_Outptr_result_z_, (), _Out_impl_ _Deref_post_z_) +#define _Outptr_opt_result_z_ _SAL2_Source_(_Outptr_opt_result_z_, (), _Out_opt_impl_ _Deref_post_z_) +#define _Outptr_result_maybenull_z_ _SAL2_Source_(_Outptr_result_maybenull_z_, (), _Out_impl_ _Deref_post_opt_z_) +#define _Outptr_opt_result_maybenull_z_ _SAL2_Source_(_Outptr_opt_result_maybenull_z_, (), _Out_opt_impl_ _Deref_post_opt_z_) + +// Annotations for _Outptr_ parameters where the output pointer is set to NULL if the function fails. + +#define _Outptr_result_nullonfailure_ _SAL2_Source_(_Outptr_result_nullonfailure_, (), _Outptr_ _On_failure_(_Deref_post_null_)) +#define _Outptr_opt_result_nullonfailure_ _SAL2_Source_(_Outptr_opt_result_nullonfailure_, (), _Outptr_opt_ _On_failure_(_Deref_post_null_)) + +// Annotations for _Outptr_ parameters which return a pointer to a ref-counted COM object, +// following the COM convention of setting the output to NULL on failure. +// The current implementation is identical to _Outptr_result_nullonfailure_. +// For pointers to types that are not COM objects, _Outptr_result_nullonfailure_ is preferred. 
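+// EDITOR'S NOTE (illustrative sketch only, not part of the upstream sal.h):
+// a hypothetical COM-style factory annotated with _COM_Outptr_. On success
+// *ppWidget receives a non-NULL interface pointer; on failure it is set to
+// NULL, matching the COM convention described above. IWidget is a placeholder type.
+//   HRESULT CreateWidget(_COM_Outptr_ IWidget** ppWidget);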
+ +#define _COM_Outptr_ _SAL2_Source_(_COM_Outptr_, (), _Outptr_ _On_failure_(_Deref_post_null_)) +#define _COM_Outptr_result_maybenull_ _SAL2_Source_(_COM_Outptr_result_maybenull_, (), _Outptr_result_maybenull_ _On_failure_(_Deref_post_null_)) +#define _COM_Outptr_opt_ _SAL2_Source_(_COM_Outptr_opt_, (), _Outptr_opt_ _On_failure_(_Deref_post_null_)) +#define _COM_Outptr_opt_result_maybenull_ _SAL2_Source_(_COM_Outptr_opt_result_maybenull_, (), _Outptr_opt_result_maybenull_ _On_failure_(_Deref_post_null_)) + +// Annotations for _Outptr_ parameters returning a pointer to buffer with a specified number of elements/bytes + +#define _Outptr_result_buffer_(size) _SAL2_Source_(_Outptr_result_buffer_, (size), _Out_impl_ _Deref_post2_impl_(__notnull_impl_notref, __cap_impl(size))) +#define _Outptr_opt_result_buffer_(size) _SAL2_Source_(_Outptr_opt_result_buffer_, (size), _Out_opt_impl_ _Deref_post2_impl_(__notnull_impl_notref, __cap_impl(size))) +#define _Outptr_result_buffer_to_(size, count) _SAL2_Source_(_Outptr_result_buffer_to_, (size, count), _Out_impl_ _Deref_post3_impl_(__notnull_impl_notref, __cap_impl(size), __count_impl(count))) +#define _Outptr_opt_result_buffer_to_(size, count) _SAL2_Source_(_Outptr_opt_result_buffer_to_, (size, count), _Out_opt_impl_ _Deref_post3_impl_(__notnull_impl_notref, __cap_impl(size), __count_impl(count))) + +#define _Outptr_result_buffer_all_(size) _SAL2_Source_(_Outptr_result_buffer_all_, (size), _Out_impl_ _Deref_post2_impl_(__notnull_impl_notref, __count_impl(size))) +#define _Outptr_opt_result_buffer_all_(size) _SAL2_Source_(_Outptr_opt_result_buffer_all_, (size), _Out_opt_impl_ _Deref_post2_impl_(__notnull_impl_notref, __count_impl(size))) + +#define _Outptr_result_buffer_maybenull_(size) _SAL2_Source_(_Outptr_result_buffer_maybenull_, (size), _Out_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __cap_impl(size))) +#define _Outptr_opt_result_buffer_maybenull_(size) _SAL2_Source_(_Outptr_opt_result_buffer_maybenull_, (size), _Out_opt_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __cap_impl(size))) +#define _Outptr_result_buffer_to_maybenull_(size, count) _SAL2_Source_(_Outptr_result_buffer_to_maybenull_, (size, count), _Out_impl_ _Deref_post3_impl_(__maybenull_impl_notref, __cap_impl(size), __count_impl(count))) +#define _Outptr_opt_result_buffer_to_maybenull_(size, count) _SAL2_Source_(_Outptr_opt_result_buffer_to_maybenull_, (size, count), _Out_opt_impl_ _Deref_post3_impl_(__maybenull_impl_notref, __cap_impl(size), __count_impl(count))) + +#define _Outptr_result_buffer_all_maybenull_(size) _SAL2_Source_(_Outptr_result_buffer_all_maybenull_, (size), _Out_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __count_impl(size))) +#define _Outptr_opt_result_buffer_all_maybenull_(size) _SAL2_Source_(_Outptr_opt_result_buffer_all_maybenull_, (size), _Out_opt_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __count_impl(size))) + +#define _Outptr_result_bytebuffer_(size) _SAL2_Source_(_Outptr_result_bytebuffer_, (size), _Out_impl_ _Deref_post2_impl_(__notnull_impl_notref, __bytecap_impl(size))) +#define _Outptr_opt_result_bytebuffer_(size) _SAL2_Source_(_Outptr_opt_result_bytebuffer_, (size), _Out_opt_impl_ _Deref_post2_impl_(__notnull_impl_notref, __bytecap_impl(size))) +#define _Outptr_result_bytebuffer_to_(size, count) _SAL2_Source_(_Outptr_result_bytebuffer_to_, (size, count), _Out_impl_ _Deref_post3_impl_(__notnull_impl_notref, __bytecap_impl(size), __bytecount_impl(count))) +#define _Outptr_opt_result_bytebuffer_to_(size, count) 
_SAL2_Source_(_Outptr_opt_result_bytebuffer_to_, (size, count), _Out_opt_impl_ _Deref_post3_impl_(__notnull_impl_notref, __bytecap_impl(size), __bytecount_impl(count))) + +#define _Outptr_result_bytebuffer_all_(size) _SAL2_Source_(_Outptr_result_bytebuffer_all_, (size), _Out_impl_ _Deref_post2_impl_(__notnull_impl_notref, __bytecount_impl(size))) +#define _Outptr_opt_result_bytebuffer_all_(size) _SAL2_Source_(_Outptr_opt_result_bytebuffer_all_, (size), _Out_opt_impl_ _Deref_post2_impl_(__notnull_impl_notref, __bytecount_impl(size))) + +#define _Outptr_result_bytebuffer_maybenull_(size) _SAL2_Source_(_Outptr_result_bytebuffer_maybenull_, (size), _Out_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __bytecap_impl(size))) +#define _Outptr_opt_result_bytebuffer_maybenull_(size) _SAL2_Source_(_Outptr_opt_result_bytebuffer_maybenull_, (size), _Out_opt_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __bytecap_impl(size))) +#define _Outptr_result_bytebuffer_to_maybenull_(size, count) _SAL2_Source_(_Outptr_result_bytebuffer_to_maybenull_, (size, count), _Out_impl_ _Deref_post3_impl_(__maybenull_impl_notref, __bytecap_impl(size), __bytecount_impl(count))) +#define _Outptr_opt_result_bytebuffer_to_maybenull_(size, count) _SAL2_Source_(_Outptr_opt_result_bytebuffer_to_maybenull_, (size, count), _Out_opt_impl_ _Deref_post3_impl_(__maybenull_impl_notref, __bytecap_impl(size), __bytecount_impl(count))) + +#define _Outptr_result_bytebuffer_all_maybenull_(size) _SAL2_Source_(_Outptr_result_bytebuffer_all_maybenull_, (size), _Out_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __bytecount_impl(size))) +#define _Outptr_opt_result_bytebuffer_all_maybenull_(size) _SAL2_Source_(_Outptr_opt_result_bytebuffer_all_maybenull_, (size), _Out_opt_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __bytecount_impl(size))) + +// Annotations for output reference to pointer parameters. 
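+// EDITOR'S NOTE (illustrative sketch only, not part of the upstream sal.h):
+// a hypothetical accessor that hands back a writable buffer through a C++
+// reference-to-pointer parameter, annotated with _Outref_result_buffer_ so the
+// analyzer knows pBuffer points to cb writable elements after the call:
+//   void GetScratch(_Outref_result_buffer_(cb) unsigned char*& pBuffer, size_t cb);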
+ +#define _Outref_ _SAL2_Source_(_Outref_, (), _Out_impl_ _Post_notnull_) +#define _Outref_result_maybenull_ _SAL2_Source_(_Outref_result_maybenull_, (), _Pre2_impl_(__notnull_impl_notref, __cap_c_one_notref_impl) _Post_maybenull_ _Post_valid_impl_) + +#define _Outref_result_buffer_(size) _SAL2_Source_(_Outref_result_buffer_, (size), _Outref_ _Post1_impl_(__cap_impl(size))) +#define _Outref_result_bytebuffer_(size) _SAL2_Source_(_Outref_result_bytebuffer_, (size), _Outref_ _Post1_impl_(__bytecap_impl(size))) +#define _Outref_result_buffer_to_(size, count) _SAL2_Source_(_Outref_result_buffer_to_, (size, count), _Outref_result_buffer_(size) _Post1_impl_(__count_impl(count))) +#define _Outref_result_bytebuffer_to_(size, count) _SAL2_Source_(_Outref_result_bytebuffer_to_, (size, count), _Outref_result_bytebuffer_(size) _Post1_impl_(__bytecount_impl(count))) +#define _Outref_result_buffer_all_(size) _SAL2_Source_(_Outref_result_buffer_all_, (size), _Outref_result_buffer_to_(size, _Old_(size))) +#define _Outref_result_bytebuffer_all_(size) _SAL2_Source_(_Outref_result_bytebuffer_all_, (size), _Outref_result_bytebuffer_to_(size, _Old_(size))) + +#define _Outref_result_buffer_maybenull_(size) _SAL2_Source_(_Outref_result_buffer_maybenull_, (size), _Outref_result_maybenull_ _Post1_impl_(__cap_impl(size))) +#define _Outref_result_bytebuffer_maybenull_(size) _SAL2_Source_(_Outref_result_bytebuffer_maybenull_, (size), _Outref_result_maybenull_ _Post1_impl_(__bytecap_impl(size))) +#define _Outref_result_buffer_to_maybenull_(size, count) _SAL2_Source_(_Outref_result_buffer_to_maybenull_, (size, count), _Outref_result_buffer_maybenull_(size) _Post1_impl_(__count_impl(count))) +#define _Outref_result_bytebuffer_to_maybenull_(size, count) _SAL2_Source_(_Outref_result_bytebuffer_to_maybenull_, (size, count), _Outref_result_bytebuffer_maybenull_(size) _Post1_impl_(__bytecount_impl(count))) +#define _Outref_result_buffer_all_maybenull_(size) _SAL2_Source_(_Outref_result_buffer_all_maybenull_, (size), _Outref_result_buffer_to_maybenull_(size, _Old_(size))) +#define _Outref_result_bytebuffer_all_maybenull_(size) _SAL2_Source_(_Outref_result_bytebuffer_all_maybenull_, (size), _Outref_result_bytebuffer_to_maybenull_(size, _Old_(size))) + +// Annotations for output reference to pointer parameters that guarantee +// that the pointer is set to NULL on failure. +#define _Outref_result_nullonfailure_ _SAL2_Source_(_Outref_result_nullonfailure_, (), _Outref_ _On_failure_(_Post_null_)) + +// Generic annotations to set output value of a by-pointer or by-reference parameter to null/zero on failure. +#define _Result_nullonfailure_ _SAL2_Source_(_Result_nullonfailure_, (), _On_failure_(_Notref_impl_ _Deref_impl_ _Post_null_)) +#define _Result_zeroonfailure_ _SAL2_Source_(_Result_zeroonfailure_, (), _On_failure_(_Notref_impl_ _Deref_impl_ _Out_range_(==, 0))) + + +// return values ------------------------------- + +// +// _Ret_ annotations +// +// describing conditions that hold for return values after the call + +// e.g. 
_Ret_z_ CString::operator const WCHAR*() const throw(); +#define _Ret_z_ _SAL2_Source_(_Ret_z_, (), _Ret2_impl_(__notnull_impl, __zterm_impl) _Ret_valid_impl_) +#define _Ret_maybenull_z_ _SAL2_Source_(_Ret_maybenull_z_, (), _Ret2_impl_(__maybenull_impl,__zterm_impl) _Ret_valid_impl_) + +// used with allocated but not yet initialized objects +#define _Ret_notnull_ _SAL2_Source_(_Ret_notnull_, (), _Ret1_impl_(__notnull_impl)) +#define _Ret_maybenull_ _SAL2_Source_(_Ret_maybenull_, (), _Ret1_impl_(__maybenull_impl)) +#define _Ret_null_ _SAL2_Source_(_Ret_null_, (), _Ret1_impl_(__null_impl)) + +// used with allocated and initialized objects +// returns single valid object +#define _Ret_valid_ _SAL2_Source_(_Ret_valid_, (), _Ret1_impl_(__notnull_impl_notref) _Ret_valid_impl_) + +// returns pointer to initialized buffer of specified size +#define _Ret_writes_(size) _SAL2_Source_(_Ret_writes_, (size), _Ret2_impl_(__notnull_impl, __count_impl(size)) _Ret_valid_impl_) +#define _Ret_writes_z_(size) _SAL2_Source_(_Ret_writes_z_, (size), _Ret3_impl_(__notnull_impl, __count_impl(size), __zterm_impl) _Ret_valid_impl_) +#define _Ret_writes_bytes_(size) _SAL2_Source_(_Ret_writes_bytes_, (size), _Ret2_impl_(__notnull_impl, __bytecount_impl(size)) _Ret_valid_impl_) +#define _Ret_writes_maybenull_(size) _SAL2_Source_(_Ret_writes_maybenull_, (size), _Ret2_impl_(__maybenull_impl,__count_impl(size)) _Ret_valid_impl_) +#define _Ret_writes_maybenull_z_(size) _SAL2_Source_(_Ret_writes_maybenull_z_, (size), _Ret3_impl_(__maybenull_impl,__count_impl(size),__zterm_impl) _Ret_valid_impl_) +#define _Ret_writes_bytes_maybenull_(size) _SAL2_Source_(_Ret_writes_bytes_maybenull_, (size), _Ret2_impl_(__maybenull_impl,__bytecount_impl(size)) _Ret_valid_impl_) + +// returns pointer to partially initialized buffer, with total size 'size' and initialized size 'count' +#define _Ret_writes_to_(size,count) _SAL2_Source_(_Ret_writes_to_, (size,count), _Ret3_impl_(__notnull_impl, __cap_impl(size), __count_impl(count)) _Ret_valid_impl_) +#define _Ret_writes_bytes_to_(size,count) _SAL2_Source_(_Ret_writes_bytes_to_, (size,count), _Ret3_impl_(__notnull_impl, __bytecap_impl(size), __bytecount_impl(count)) _Ret_valid_impl_) +#define _Ret_writes_to_maybenull_(size,count) _SAL2_Source_(_Ret_writes_to_maybenull_, (size,count), _Ret3_impl_(__maybenull_impl, __cap_impl(size), __count_impl(count)) _Ret_valid_impl_) +#define _Ret_writes_bytes_to_maybenull_(size,count) _SAL2_Source_(_Ret_writes_bytes_to_maybenull_, (size,count), _Ret3_impl_(__maybenull_impl, __bytecap_impl(size), __bytecount_impl(count)) _Ret_valid_impl_) + + +// Annotations for strict type checking +#define _Points_to_data_ _SAL2_Source_(_Points_to_data_, (), _Pre_ _Points_to_data_impl_) +#define _Literal_ _SAL2_Source_(_Literal_, (), _Pre_ _Literal_impl_) +#define _Notliteral_ _SAL2_Source_(_Notliteral_, (), _Pre_ _Notliteral_impl_) + +// Check the return value of a function e.g. _Check_return_ ErrorCode Foo(); +#define _Check_return_ _SAL2_Source_(_Check_return_, (), _Check_return_impl_) +#define _Must_inspect_result_ _SAL2_Source_(_Must_inspect_result_, (), _Must_inspect_impl_ _Check_return_impl_) + +// e.g. MyPrintF( _Printf_format_string_ const WCHAR* wzFormat, ... 
); +#define _Printf_format_string_ _SAL2_Source_(_Printf_format_string_, (), _Printf_format_string_impl_) +#define _Scanf_format_string_ _SAL2_Source_(_Scanf_format_string_, (), _Scanf_format_string_impl_) +#define _Scanf_s_format_string_ _SAL2_Source_(_Scanf_s_format_string_, (), _Scanf_s_format_string_impl_) + +#define _Format_string_impl_(kind,where) _SA_annotes2(SAL_IsFormatString2, kind, where) +#define _Printf_format_string_params_(x) _SAL2_Source_(_Printf_format_string_params_, (x), _Format_string_impl_("printf", x)) +#define _Scanf_format_string_params_(x) _SAL2_Source_(_Scanf_format_string_params_, (x), _Format_string_impl_("scanf", x)) +#define _Scanf_s_format_string_params_(x) _SAL2_Source_(_Scanf_s_format_string_params_, (x), _Format_string_impl_("scanf_s", x)) + +// annotations to express value of integral or pointer parameter +#define _In_range_(lb,ub) _SAL2_Source_(_In_range_, (lb,ub), _In_range_impl_(lb,ub)) +#define _Out_range_(lb,ub) _SAL2_Source_(_Out_range_, (lb,ub), _Out_range_impl_(lb,ub)) +#define _Ret_range_(lb,ub) _SAL2_Source_(_Ret_range_, (lb,ub), _Ret_range_impl_(lb,ub)) +#define _Deref_in_range_(lb,ub) _SAL2_Source_(_Deref_in_range_, (lb,ub), _Deref_in_range_impl_(lb,ub)) +#define _Deref_out_range_(lb,ub) _SAL2_Source_(_Deref_out_range_, (lb,ub), _Deref_out_range_impl_(lb,ub)) +#define _Deref_ret_range_(lb,ub) _SAL2_Source_(_Deref_ret_range_, (lb,ub), _Deref_ret_range_impl_(lb,ub)) +#define _Pre_equal_to_(expr) _SAL2_Source_(_Pre_equal_to_, (expr), _In_range_(==, expr)) +#define _Post_equal_to_(expr) _SAL2_Source_(_Post_equal_to_, (expr), _Out_range_(==, expr)) + +// annotation to express that a value (usually a field of a mutable class) +// is not changed by a function call +#define _Unchanged_(e) _SAL2_Source_(_Unchanged_, (e), _At_(e, _Post_equal_to_(_Old_(e)) _Const_)) + +// Annotations to allow expressing generalized pre and post conditions. +// 'cond' may be any valid SAL expression that is considered to be true as a precondition +// or postcondition (respsectively). 
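+// EDITOR'S NOTE (illustrative sketch only, not part of the upstream sal.h):
+// a hypothetical writer whose byte count must be a multiple of four, expressed
+// as a generalized precondition the analyzer may assume holds at the call site:
+//   void WriteWords(_Pre_satisfies_(cbBuffer % 4 == 0) size_t cbBuffer);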
+#define _Pre_satisfies_(cond) _SAL2_Source_(_Pre_satisfies_, (cond), _Pre_satisfies_impl_(cond)) +#define _Post_satisfies_(cond) _SAL2_Source_(_Post_satisfies_, (cond), _Post_satisfies_impl_(cond)) + +// Annotations to express struct, class and field invariants +#define _Struct_size_bytes_(size) _SAL2_Source_(_Struct_size_bytes_, (size), _Writable_bytes_(size)) + +#define _Field_size_(size) _SAL2_Source_(_Field_size_, (size), _Notnull_ _Writable_elements_(size)) +#define _Field_size_opt_(size) _SAL2_Source_(_Field_size_opt_, (size), _Maybenull_ _Writable_elements_(size)) +#define _Field_size_part_(size, count) _SAL2_Source_(_Field_size_part_, (size, count), _Notnull_ _Writable_elements_(size) _Readable_elements_(count)) +#define _Field_size_part_opt_(size, count) _SAL2_Source_(_Field_size_part_opt_, (size, count), _Maybenull_ _Writable_elements_(size) _Readable_elements_(count)) +#define _Field_size_full_(size) _SAL2_Source_(_Field_size_full_, (size), _Field_size_part_(size, size)) +#define _Field_size_full_opt_(size) _SAL2_Source_(_Field_size_full_opt_, (size), _Field_size_part_opt_(size, size)) + +#define _Field_size_bytes_(size) _SAL2_Source_(_Field_size_bytes_, (size), _Notnull_ _Writable_bytes_(size)) +#define _Field_size_bytes_opt_(size) _SAL2_Source_(_Field_size_bytes_opt_, (size), _Maybenull_ _Writable_bytes_(size)) +#define _Field_size_bytes_part_(size, count) _SAL2_Source_(_Field_size_bytes_part_, (size, count), _Notnull_ _Writable_bytes_(size) _Readable_bytes_(count)) +#define _Field_size_bytes_part_opt_(size, count) _SAL2_Source_(_Field_size_bytes_part_opt_, (size, count), _Maybenull_ _Writable_bytes_(size) _Readable_bytes_(count)) +#define _Field_size_bytes_full_(size) _SAL2_Source_(_Field_size_bytes_full_, (size), _Field_size_bytes_part_(size, size)) +#define _Field_size_bytes_full_opt_(size) _SAL2_Source_(_Field_size_bytes_full_opt_, (size), _Field_size_bytes_part_opt_(size, size)) + +#define _Field_z_ _SAL2_Source_(_Field_z_, (), _Null_terminated_) + +#define _Field_range_(min,max) _SAL2_Source_(_Field_range_, (min,max), _Field_range_impl_(min,max)) + +//============================================================================ +// _Pre_\_Post_ Layer: +//============================================================================ + +// +// Raw Pre/Post for declaring custom pre/post conditions +// + +#define _Pre_ _Pre_impl_ +#define _Post_ _Post_impl_ + +// +// Validity property +// + +#define _Valid_ _Valid_impl_ +#define _Notvalid_ _Notvalid_impl_ +#define _Maybevalid_ _Maybevalid_impl_ + +// +// Buffer size properties +// + +// Expressing buffer sizes without specifying pre or post condition +#define _Readable_bytes_(size) _SAL2_Source_(_Readable_bytes_, (size), _Readable_bytes_impl_(size)) +#define _Readable_elements_(size) _SAL2_Source_(_Readable_elements_, (size), _Readable_elements_impl_(size)) +#define _Writable_bytes_(size) _SAL2_Source_(_Writable_bytes_, (size), _Writable_bytes_impl_(size)) +#define _Writable_elements_(size) _SAL2_Source_(_Writable_elements_, (size), _Writable_elements_impl_(size)) + +#define _Null_terminated_ _SAL2_Source_(_Null_terminated_, (), _Null_terminated_impl_) +#define _NullNull_terminated_ _SAL2_Source_(_NullNull_terminated_, (), _NullNull_terminated_impl_) + +// Expressing buffer size as pre or post condition +#define _Pre_readable_size_(size) _SAL2_Source_(_Pre_readable_size_, (size), _Pre1_impl_(__count_impl(size)) _Pre_valid_impl_) +#define _Pre_writable_size_(size) _SAL2_Source_(_Pre_writable_size_, (size), 
_Pre1_impl_(__cap_impl(size))) +#define _Pre_readable_byte_size_(size) _SAL2_Source_(_Pre_readable_byte_size_, (size), _Pre1_impl_(__bytecount_impl(size)) _Pre_valid_impl_) +#define _Pre_writable_byte_size_(size) _SAL2_Source_(_Pre_writable_byte_size_, (size), _Pre1_impl_(__bytecap_impl(size))) + +#define _Post_readable_size_(size) _SAL2_Source_(_Post_readable_size_, (size), _Post1_impl_(__count_impl(size)) _Post_valid_impl_) +#define _Post_writable_size_(size) _SAL2_Source_(_Post_writable_size_, (size), _Post1_impl_(__cap_impl(size))) +#define _Post_readable_byte_size_(size) _SAL2_Source_(_Post_readable_byte_size_, (size), _Post1_impl_(__bytecount_impl(size)) _Post_valid_impl_) +#define _Post_writable_byte_size_(size) _SAL2_Source_(_Post_writable_byte_size_, (size), _Post1_impl_(__bytecap_impl(size))) + +// +// Pointer null-ness properties +// +#define _Null_ _Null_impl_ +#define _Notnull_ _Notnull_impl_ +#define _Maybenull_ _Maybenull_impl_ + +// +// _Pre_ annotations --- +// +// describing conditions that must be met before the call of the function + +// e.g. int strlen( _Pre_z_ const char* sz ); +// buffer is a zero terminated string +#define _Pre_z_ _SAL2_Source_(_Pre_z_, (), _Pre1_impl_(__zterm_impl) _Pre_valid_impl_) + +// valid size unknown or indicated by type (e.g.:LPSTR) +#define _Pre_valid_ _SAL2_Source_(_Pre_valid_, (), _Pre1_impl_(__notnull_impl_notref) _Pre_valid_impl_) +#define _Pre_opt_valid_ _SAL2_Source_(_Pre_opt_valid_, (), _Pre1_impl_(__maybenull_impl_notref) _Pre_valid_impl_) + +#define _Pre_invalid_ _SAL2_Source_(_Pre_invalid_, (), _Deref_pre1_impl_(__notvalid_impl)) + +// Overrides recursive valid when some field is not yet initialized when using _Inout_ +#define _Pre_unknown_ _SAL2_Source_(_Pre_unknown_, (), _Pre1_impl_(__maybevalid_impl)) + +// used with allocated but not yet initialized objects +#define _Pre_notnull_ _SAL2_Source_(_Pre_notnull_, (), _Pre1_impl_(__notnull_impl_notref)) +#define _Pre_maybenull_ _SAL2_Source_(_Pre_maybenull_, (), _Pre1_impl_(__maybenull_impl_notref)) +#define _Pre_null_ _SAL2_Source_(_Pre_null_, (), _Pre1_impl_(__null_impl_notref)) + +// +// _Post_ annotations --- +// +// describing conditions that hold after the function call + +// void CopyStr( _In_z_ const char* szFrom, _Pre_cap_(cch) _Post_z_ char* szFrom, size_t cchFrom ); +// buffer will be a zero-terminated string after the call +#define _Post_z_ _SAL2_Source_(_Post_z_, (), _Post1_impl_(__zterm_impl) _Post_valid_impl_) + +// e.g. HRESULT InitStruct( _Post_valid_ Struct* pobj ); +#define _Post_valid_ _SAL2_Source_(_Post_valid_, (), _Post_valid_impl_) +#define _Post_invalid_ _SAL2_Source_(_Post_invalid_, (), _Deref_post1_impl_(__notvalid_impl)) + +// e.g. void free( _Post_ptr_invalid_ void* pv ); +#define _Post_ptr_invalid_ _SAL2_Source_(_Post_ptr_invalid_, (), _Post1_impl_(__notvalid_impl)) + +// e.g. void ThrowExceptionIfNull( _Post_notnull_ const void* pv ); +#define _Post_notnull_ _SAL2_Source_(_Post_notnull_, (), _Post1_impl_(__notnull_impl)) + +// e.g. 
HRESULT GetObject(_Outptr_ _On_failure_(_At_(*p, _Post_null_)) T **p); +#define _Post_null_ _SAL2_Source_(_Post_null_, (), _Post1_impl_(__null_impl)) + +#define _Post_maybenull_ _SAL2_Source_(_Post_maybenull_, (), _Post1_impl_(__maybenull_impl)) + +#define _Prepost_z_ _SAL2_Source_(_Prepost_z_, (), _Pre_z_ _Post_z_) + + +// #pragma region Input Buffer SAL 1 compatibility macros + +/*========================================================================== + + This section contains definitions for macros defined for VS2010 and earlier. + Usage of these macros is still supported, but the SAL 2 macros defined above + are recommended instead. This comment block is retained to assist in + understanding SAL that still uses the older syntax. + + The macros are defined in 3 layers: + + _In_\_Out_ Layer: + ---------------- + This layer provides the highest abstraction and its macros should be used + in most cases. Its macros start with _In_, _Out_ or _Inout_. For the + typical case they provide the most concise annotations. + + _Pre_\_Post_ Layer: + ------------------ + The macros of this layer only should be used when there is no suitable macro + in the _In_\_Out_ layer. Its macros start with _Pre_, _Post_, _Ret_, + _Deref_pre_ _Deref_post_ and _Deref_ret_. This layer provides the most + flexibility for annotations. + + Implementation Abstraction Layer: + -------------------------------- + Macros from this layer should never be used directly. The layer only exists + to hide the implementation of the annotation macros. + + + Annotation Syntax: + |--------------|----------|----------------|-----------------------------| + | Usage | Nullness | ZeroTerminated | Extent | + |--------------|----------|----------------|-----------------------------| + | _In_ | <> | <> | <> | + | _Out_ | opt_ | z_ | [byte]cap_[c_|x_]( size ) | + | _Inout_ | | | [byte]count_[c_|x_]( size ) | + | _Deref_out_ | | | ptrdiff_cap_( ptr ) | + |--------------| | | ptrdiff_count_( ptr ) | + | _Ret_ | | | | + | _Deref_ret_ | | | | + |--------------| | | | + | _Pre_ | | | | + | _Post_ | | | | + | _Deref_pre_ | | | | + | _Deref_post_ | | | | + |--------------|----------|----------------|-----------------------------| + + Usage: + ----- + _In_, _Out_, _Inout_, _Pre_, _Post_, _Deref_pre_, _Deref_post_ are for + formal parameters. + _Ret_, _Deref_ret_ must be used for return values. + + Nullness: + -------- + If the pointer can be NULL the annotation contains _opt. If the macro + does not contain '_opt' the pointer may not be NULL. + + String Type: + ----------- + _z: NullTerminated string + for _In_ parameters the buffer must have the specified stringtype before the call + for _Out_ parameters the buffer must have the specified stringtype after the call + for _Inout_ parameters both conditions apply + + Extent Syntax: + |------|---------------|---------------| + | Unit | Writ\Readable | Argument Type | + |------|---------------|---------------| + | <> | cap_ | <> | + | byte | count_ | c_ | + | | | x_ | + |------|---------------|---------------| + + 'cap' (capacity) describes the writable size of the buffer and is typically used + with _Out_. The default unit is elements. Use 'bytecap' if the size is given in bytes + 'count' describes the readable size of the buffer and is typically used with _In_. + The default unit is elements. Use 'bytecount' if the size is given in bytes. + + Argument syntax for cap_, bytecap_, count_, bytecount_: + (|return)[+n] e.g. 
cch, return, cb+2 + + If the buffer size is a constant expression use the c_ postfix. + E.g. cap_c_(20), count_c_(MAX_PATH), bytecount_c_(16) + + If the buffer size is given by a limiting pointer use the ptrdiff_ versions + of the macros. + + If the buffer size is neither a parameter nor a constant expression use the x_ + postfix. e.g. bytecount_x_(num*size) x_ annotations accept any arbitrary string. + No analysis can be done for x_ annotations but they at least tell the tool that + the buffer has some sort of extent description. x_ annotations might be supported + by future compiler versions. + +============================================================================*/ + +// e.g. void SetCharRange( _In_count_(cch) const char* rgch, size_t cch ) +// valid buffer extent described by another parameter +#define _In_count_(size) _SAL1_1_Source_(_In_count_, (size), _Pre_count_(size) _Deref_pre_readonly_) +#define _In_opt_count_(size) _SAL1_1_Source_(_In_opt_count_, (size), _Pre_opt_count_(size) _Deref_pre_readonly_) +#define _In_bytecount_(size) _SAL1_1_Source_(_In_bytecount_, (size), _Pre_bytecount_(size) _Deref_pre_readonly_) +#define _In_opt_bytecount_(size) _SAL1_1_Source_(_In_opt_bytecount_, (size), _Pre_opt_bytecount_(size) _Deref_pre_readonly_) + +// valid buffer extent described by a constant extression +#define _In_count_c_(size) _SAL1_1_Source_(_In_count_c_, (size), _Pre_count_c_(size) _Deref_pre_readonly_) +#define _In_opt_count_c_(size) _SAL1_1_Source_(_In_opt_count_c_, (size), _Pre_opt_count_c_(size) _Deref_pre_readonly_) +#define _In_bytecount_c_(size) _SAL1_1_Source_(_In_bytecount_c_, (size), _Pre_bytecount_c_(size) _Deref_pre_readonly_) +#define _In_opt_bytecount_c_(size) _SAL1_1_Source_(_In_opt_bytecount_c_, (size), _Pre_opt_bytecount_c_(size) _Deref_pre_readonly_) + +// nullterminated 'input' buffers with given size + +// e.g. void SetCharRange( _In_count_(cch) const char* rgch, size_t cch ) +// nullterminated valid buffer extent described by another parameter +#define _In_z_count_(size) _SAL1_1_Source_(_In_z_count_, (size), _Pre_z_ _Pre_count_(size) _Deref_pre_readonly_) +#define _In_opt_z_count_(size) _SAL1_1_Source_(_In_opt_z_count_, (size), _Pre_opt_z_ _Pre_opt_count_(size) _Deref_pre_readonly_) +#define _In_z_bytecount_(size) _SAL1_1_Source_(_In_z_bytecount_, (size), _Pre_z_ _Pre_bytecount_(size) _Deref_pre_readonly_) +#define _In_opt_z_bytecount_(size) _SAL1_1_Source_(_In_opt_z_bytecount_, (size), _Pre_opt_z_ _Pre_opt_bytecount_(size) _Deref_pre_readonly_) + +// nullterminated valid buffer extent described by a constant extression +#define _In_z_count_c_(size) _SAL1_1_Source_(_In_z_count_c_, (size), _Pre_z_ _Pre_count_c_(size) _Deref_pre_readonly_) +#define _In_opt_z_count_c_(size) _SAL1_1_Source_(_In_opt_z_count_c_, (size), _Pre_opt_z_ _Pre_opt_count_c_(size) _Deref_pre_readonly_) +#define _In_z_bytecount_c_(size) _SAL1_1_Source_(_In_z_bytecount_c_, (size), _Pre_z_ _Pre_bytecount_c_(size) _Deref_pre_readonly_) +#define _In_opt_z_bytecount_c_(size) _SAL1_1_Source_(_In_opt_z_bytecount_c_, (size), _Pre_opt_z_ _Pre_opt_bytecount_c_(size) _Deref_pre_readonly_) + +// buffer capacity is described by another pointer +// e.g. 
void Foo( _In_ptrdiff_count_(pchMax) const char* pch, const char* pchMax ) { while pch < pchMax ) pch++; } +#define _In_ptrdiff_count_(size) _SAL1_1_Source_(_In_ptrdiff_count_, (size), _Pre_ptrdiff_count_(size) _Deref_pre_readonly_) +#define _In_opt_ptrdiff_count_(size) _SAL1_1_Source_(_In_opt_ptrdiff_count_, (size), _Pre_opt_ptrdiff_count_(size) _Deref_pre_readonly_) + +// 'x' version for complex expressions that are not supported by the current compiler version +// e.g. void Set3ColMatrix( _In_count_x_(3*cRows) const Elem* matrix, int cRows ); +#define _In_count_x_(size) _SAL1_1_Source_(_In_count_x_, (size), _Pre_count_x_(size) _Deref_pre_readonly_) +#define _In_opt_count_x_(size) _SAL1_1_Source_(_In_opt_count_x_, (size), _Pre_opt_count_x_(size) _Deref_pre_readonly_) +#define _In_bytecount_x_(size) _SAL1_1_Source_(_In_bytecount_x_, (size), _Pre_bytecount_x_(size) _Deref_pre_readonly_) +#define _In_opt_bytecount_x_(size) _SAL1_1_Source_(_In_opt_bytecount_x_, (size), _Pre_opt_bytecount_x_(size) _Deref_pre_readonly_) + + +// 'out' with buffer size +// e.g. void GetIndices( _Out_cap_(cIndices) int* rgIndices, size_t cIndices ); +// buffer capacity is described by another parameter +#define _Out_cap_(size) _SAL1_1_Source_(_Out_cap_, (size), _Pre_cap_(size) _Post_valid_impl_) +#define _Out_opt_cap_(size) _SAL1_1_Source_(_Out_opt_cap_, (size), _Pre_opt_cap_(size) _Post_valid_impl_) +#define _Out_bytecap_(size) _SAL1_1_Source_(_Out_bytecap_, (size), _Pre_bytecap_(size) _Post_valid_impl_) +#define _Out_opt_bytecap_(size) _SAL1_1_Source_(_Out_opt_bytecap_, (size), _Pre_opt_bytecap_(size) _Post_valid_impl_) + +// buffer capacity is described by a constant expression +#define _Out_cap_c_(size) _SAL1_1_Source_(_Out_cap_c_, (size), _Pre_cap_c_(size) _Post_valid_impl_) +#define _Out_opt_cap_c_(size) _SAL1_1_Source_(_Out_opt_cap_c_, (size), _Pre_opt_cap_c_(size) _Post_valid_impl_) +#define _Out_bytecap_c_(size) _SAL1_1_Source_(_Out_bytecap_c_, (size), _Pre_bytecap_c_(size) _Post_valid_impl_) +#define _Out_opt_bytecap_c_(size) _SAL1_1_Source_(_Out_opt_bytecap_c_, (size), _Pre_opt_bytecap_c_(size) _Post_valid_impl_) + +// buffer capacity is described by another parameter multiplied by a constant expression +#define _Out_cap_m_(mult,size) _SAL1_1_Source_(_Out_cap_m_, (mult,size), _Pre_cap_m_(mult,size) _Post_valid_impl_) +#define _Out_opt_cap_m_(mult,size) _SAL1_1_Source_(_Out_opt_cap_m_, (mult,size), _Pre_opt_cap_m_(mult,size) _Post_valid_impl_) +#define _Out_z_cap_m_(mult,size) _SAL1_1_Source_(_Out_z_cap_m_, (mult,size), _Pre_cap_m_(mult,size) _Post_valid_impl_ _Post_z_) +#define _Out_opt_z_cap_m_(mult,size) _SAL1_1_Source_(_Out_opt_z_cap_m_, (mult,size), _Pre_opt_cap_m_(mult,size) _Post_valid_impl_ _Post_z_) + +// buffer capacity is described by another pointer +// e.g. 
void Foo( _Out_ptrdiff_cap_(pchMax) char* pch, const char* pchMax ) { while pch < pchMax ) pch++; } +#define _Out_ptrdiff_cap_(size) _SAL1_1_Source_(_Out_ptrdiff_cap_, (size), _Pre_ptrdiff_cap_(size) _Post_valid_impl_) +#define _Out_opt_ptrdiff_cap_(size) _SAL1_1_Source_(_Out_opt_ptrdiff_cap_, (size), _Pre_opt_ptrdiff_cap_(size) _Post_valid_impl_) + +// buffer capacity is described by a complex expression +#define _Out_cap_x_(size) _SAL1_1_Source_(_Out_cap_x_, (size), _Pre_cap_x_(size) _Post_valid_impl_) +#define _Out_opt_cap_x_(size) _SAL1_1_Source_(_Out_opt_cap_x_, (size), _Pre_opt_cap_x_(size) _Post_valid_impl_) +#define _Out_bytecap_x_(size) _SAL1_1_Source_(_Out_bytecap_x_, (size), _Pre_bytecap_x_(size) _Post_valid_impl_) +#define _Out_opt_bytecap_x_(size) _SAL1_1_Source_(_Out_opt_bytecap_x_, (size), _Pre_opt_bytecap_x_(size) _Post_valid_impl_) + +// a zero terminated string is filled into a buffer of given capacity +// e.g. void CopyStr( _In_z_ const char* szFrom, _Out_z_cap_(cchTo) char* szTo, size_t cchTo ); +// buffer capacity is described by another parameter +#define _Out_z_cap_(size) _SAL1_1_Source_(_Out_z_cap_, (size), _Pre_cap_(size) _Post_valid_impl_ _Post_z_) +#define _Out_opt_z_cap_(size) _SAL1_1_Source_(_Out_opt_z_cap_, (size), _Pre_opt_cap_(size) _Post_valid_impl_ _Post_z_) +#define _Out_z_bytecap_(size) _SAL1_1_Source_(_Out_z_bytecap_, (size), _Pre_bytecap_(size) _Post_valid_impl_ _Post_z_) +#define _Out_opt_z_bytecap_(size) _SAL1_1_Source_(_Out_opt_z_bytecap_, (size), _Pre_opt_bytecap_(size) _Post_valid_impl_ _Post_z_) + +// buffer capacity is described by a constant expression +#define _Out_z_cap_c_(size) _SAL1_1_Source_(_Out_z_cap_c_, (size), _Pre_cap_c_(size) _Post_valid_impl_ _Post_z_) +#define _Out_opt_z_cap_c_(size) _SAL1_1_Source_(_Out_opt_z_cap_c_, (size), _Pre_opt_cap_c_(size) _Post_valid_impl_ _Post_z_) +#define _Out_z_bytecap_c_(size) _SAL1_1_Source_(_Out_z_bytecap_c_, (size), _Pre_bytecap_c_(size) _Post_valid_impl_ _Post_z_) +#define _Out_opt_z_bytecap_c_(size) _SAL1_1_Source_(_Out_opt_z_bytecap_c_, (size), _Pre_opt_bytecap_c_(size) _Post_valid_impl_ _Post_z_) + +// buffer capacity is described by a complex expression +#define _Out_z_cap_x_(size) _SAL1_1_Source_(_Out_z_cap_x_, (size), _Pre_cap_x_(size) _Post_valid_impl_ _Post_z_) +#define _Out_opt_z_cap_x_(size) _SAL1_1_Source_(_Out_opt_z_cap_x_, (size), _Pre_opt_cap_x_(size) _Post_valid_impl_ _Post_z_) +#define _Out_z_bytecap_x_(size) _SAL1_1_Source_(_Out_z_bytecap_x_, (size), _Pre_bytecap_x_(size) _Post_valid_impl_ _Post_z_) +#define _Out_opt_z_bytecap_x_(size) _SAL1_1_Source_(_Out_opt_z_bytecap_x_, (size), _Pre_opt_bytecap_x_(size) _Post_valid_impl_ _Post_z_) + +// a zero terminated string is filled into a buffer of given capacity +// e.g. 
size_t CopyCharRange( _In_count_(cchFrom) const char* rgchFrom, size_t cchFrom, _Out_cap_post_count_(cchTo,return)) char* rgchTo, size_t cchTo ); +#define _Out_cap_post_count_(cap,count) _SAL1_1_Source_(_Out_cap_post_count_, (cap,count), _Pre_cap_(cap) _Post_valid_impl_ _Post_count_(count)) +#define _Out_opt_cap_post_count_(cap,count) _SAL1_1_Source_(_Out_opt_cap_post_count_, (cap,count), _Pre_opt_cap_(cap) _Post_valid_impl_ _Post_count_(count)) +#define _Out_bytecap_post_bytecount_(cap,count) _SAL1_1_Source_(_Out_bytecap_post_bytecount_, (cap,count), _Pre_bytecap_(cap) _Post_valid_impl_ _Post_bytecount_(count)) +#define _Out_opt_bytecap_post_bytecount_(cap,count) _SAL1_1_Source_(_Out_opt_bytecap_post_bytecount_, (cap,count), _Pre_opt_bytecap_(cap) _Post_valid_impl_ _Post_bytecount_(count)) + +// a zero terminated string is filled into a buffer of given capacity +// e.g. size_t CopyStr( _In_z_ const char* szFrom, _Out_z_cap_post_count_(cchTo,return+1) char* szTo, size_t cchTo ); +#define _Out_z_cap_post_count_(cap,count) _SAL1_1_Source_(_Out_z_cap_post_count_, (cap,count), _Pre_cap_(cap) _Post_valid_impl_ _Post_z_count_(count)) +#define _Out_opt_z_cap_post_count_(cap,count) _SAL1_1_Source_(_Out_opt_z_cap_post_count_, (cap,count), _Pre_opt_cap_(cap) _Post_valid_impl_ _Post_z_count_(count)) +#define _Out_z_bytecap_post_bytecount_(cap,count) _SAL1_1_Source_(_Out_z_bytecap_post_bytecount_, (cap,count), _Pre_bytecap_(cap) _Post_valid_impl_ _Post_z_bytecount_(count)) +#define _Out_opt_z_bytecap_post_bytecount_(cap,count) _SAL1_1_Source_(_Out_opt_z_bytecap_post_bytecount_, (cap,count), _Pre_opt_bytecap_(cap) _Post_valid_impl_ _Post_z_bytecount_(count)) + +// only use with dereferenced arguments e.g. '*pcch' +#define _Out_capcount_(capcount) _SAL1_1_Source_(_Out_capcount_, (capcount), _Pre_cap_(capcount) _Post_valid_impl_ _Post_count_(capcount)) +#define _Out_opt_capcount_(capcount) _SAL1_1_Source_(_Out_opt_capcount_, (capcount), _Pre_opt_cap_(capcount) _Post_valid_impl_ _Post_count_(capcount)) +#define _Out_bytecapcount_(capcount) _SAL1_1_Source_(_Out_bytecapcount_, (capcount), _Pre_bytecap_(capcount) _Post_valid_impl_ _Post_bytecount_(capcount)) +#define _Out_opt_bytecapcount_(capcount) _SAL1_1_Source_(_Out_opt_bytecapcount_, (capcount), _Pre_opt_bytecap_(capcount) _Post_valid_impl_ _Post_bytecount_(capcount)) + +#define _Out_capcount_x_(capcount) _SAL1_1_Source_(_Out_capcount_x_, (capcount), _Pre_cap_x_(capcount) _Post_valid_impl_ _Post_count_x_(capcount)) +#define _Out_opt_capcount_x_(capcount) _SAL1_1_Source_(_Out_opt_capcount_x_, (capcount), _Pre_opt_cap_x_(capcount) _Post_valid_impl_ _Post_count_x_(capcount)) +#define _Out_bytecapcount_x_(capcount) _SAL1_1_Source_(_Out_bytecapcount_x_, (capcount), _Pre_bytecap_x_(capcount) _Post_valid_impl_ _Post_bytecount_x_(capcount)) +#define _Out_opt_bytecapcount_x_(capcount) _SAL1_1_Source_(_Out_opt_bytecapcount_x_, (capcount), _Pre_opt_bytecap_x_(capcount) _Post_valid_impl_ _Post_bytecount_x_(capcount)) + +// e.g. 
GetString( _Out_z_capcount_(*pLen+1) char* sz, size_t* pLen ); +#define _Out_z_capcount_(capcount) _SAL1_1_Source_(_Out_z_capcount_, (capcount), _Pre_cap_(capcount) _Post_valid_impl_ _Post_z_count_(capcount)) +#define _Out_opt_z_capcount_(capcount) _SAL1_1_Source_(_Out_opt_z_capcount_, (capcount), _Pre_opt_cap_(capcount) _Post_valid_impl_ _Post_z_count_(capcount)) +#define _Out_z_bytecapcount_(capcount) _SAL1_1_Source_(_Out_z_bytecapcount_, (capcount), _Pre_bytecap_(capcount) _Post_valid_impl_ _Post_z_bytecount_(capcount)) +#define _Out_opt_z_bytecapcount_(capcount) _SAL1_1_Source_(_Out_opt_z_bytecapcount_, (capcount), _Pre_opt_bytecap_(capcount) _Post_valid_impl_ _Post_z_bytecount_(capcount)) + + +// 'inout' buffers with initialized elements before and after the call +// e.g. void ModifyIndices( _Inout_count_(cIndices) int* rgIndices, size_t cIndices ); +#define _Inout_count_(size) _SAL1_1_Source_(_Inout_count_, (size), _Prepost_count_(size)) +#define _Inout_opt_count_(size) _SAL1_1_Source_(_Inout_opt_count_, (size), _Prepost_opt_count_(size)) +#define _Inout_bytecount_(size) _SAL1_1_Source_(_Inout_bytecount_, (size), _Prepost_bytecount_(size)) +#define _Inout_opt_bytecount_(size) _SAL1_1_Source_(_Inout_opt_bytecount_, (size), _Prepost_opt_bytecount_(size)) + +#define _Inout_count_c_(size) _SAL1_1_Source_(_Inout_count_c_, (size), _Prepost_count_c_(size)) +#define _Inout_opt_count_c_(size) _SAL1_1_Source_(_Inout_opt_count_c_, (size), _Prepost_opt_count_c_(size)) +#define _Inout_bytecount_c_(size) _SAL1_1_Source_(_Inout_bytecount_c_, (size), _Prepost_bytecount_c_(size)) +#define _Inout_opt_bytecount_c_(size) _SAL1_1_Source_(_Inout_opt_bytecount_c_, (size), _Prepost_opt_bytecount_c_(size)) + +// nullterminated 'inout' buffers with initialized elements before and after the call +// e.g. 
void ModifyIndices( _Inout_count_(cIndices) int* rgIndices, size_t cIndices ); +#define _Inout_z_count_(size) _SAL1_1_Source_(_Inout_z_count_, (size), _Prepost_z_ _Prepost_count_(size)) +#define _Inout_opt_z_count_(size) _SAL1_1_Source_(_Inout_opt_z_count_, (size), _Prepost_z_ _Prepost_opt_count_(size)) +#define _Inout_z_bytecount_(size) _SAL1_1_Source_(_Inout_z_bytecount_, (size), _Prepost_z_ _Prepost_bytecount_(size)) +#define _Inout_opt_z_bytecount_(size) _SAL1_1_Source_(_Inout_opt_z_bytecount_, (size), _Prepost_z_ _Prepost_opt_bytecount_(size)) + +#define _Inout_z_count_c_(size) _SAL1_1_Source_(_Inout_z_count_c_, (size), _Prepost_z_ _Prepost_count_c_(size)) +#define _Inout_opt_z_count_c_(size) _SAL1_1_Source_(_Inout_opt_z_count_c_, (size), _Prepost_z_ _Prepost_opt_count_c_(size)) +#define _Inout_z_bytecount_c_(size) _SAL1_1_Source_(_Inout_z_bytecount_c_, (size), _Prepost_z_ _Prepost_bytecount_c_(size)) +#define _Inout_opt_z_bytecount_c_(size) _SAL1_1_Source_(_Inout_opt_z_bytecount_c_, (size), _Prepost_z_ _Prepost_opt_bytecount_c_(size)) + +#define _Inout_ptrdiff_count_(size) _SAL1_1_Source_(_Inout_ptrdiff_count_, (size), _Pre_ptrdiff_count_(size)) +#define _Inout_opt_ptrdiff_count_(size) _SAL1_1_Source_(_Inout_opt_ptrdiff_count_, (size), _Pre_opt_ptrdiff_count_(size)) + +#define _Inout_count_x_(size) _SAL1_1_Source_(_Inout_count_x_, (size), _Prepost_count_x_(size)) +#define _Inout_opt_count_x_(size) _SAL1_1_Source_(_Inout_opt_count_x_, (size), _Prepost_opt_count_x_(size)) +#define _Inout_bytecount_x_(size) _SAL1_1_Source_(_Inout_bytecount_x_, (size), _Prepost_bytecount_x_(size)) +#define _Inout_opt_bytecount_x_(size) _SAL1_1_Source_(_Inout_opt_bytecount_x_, (size), _Prepost_opt_bytecount_x_(size)) + +// e.g. void AppendToLPSTR( _In_ LPCSTR szFrom, _Inout_cap_(cchTo) LPSTR* szTo, size_t cchTo ); +#define _Inout_cap_(size) _SAL1_1_Source_(_Inout_cap_, (size), _Pre_valid_cap_(size) _Post_valid_) +#define _Inout_opt_cap_(size) _SAL1_1_Source_(_Inout_opt_cap_, (size), _Pre_opt_valid_cap_(size) _Post_valid_) +#define _Inout_bytecap_(size) _SAL1_1_Source_(_Inout_bytecap_, (size), _Pre_valid_bytecap_(size) _Post_valid_) +#define _Inout_opt_bytecap_(size) _SAL1_1_Source_(_Inout_opt_bytecap_, (size), _Pre_opt_valid_bytecap_(size) _Post_valid_) + +#define _Inout_cap_c_(size) _SAL1_1_Source_(_Inout_cap_c_, (size), _Pre_valid_cap_c_(size) _Post_valid_) +#define _Inout_opt_cap_c_(size) _SAL1_1_Source_(_Inout_opt_cap_c_, (size), _Pre_opt_valid_cap_c_(size) _Post_valid_) +#define _Inout_bytecap_c_(size) _SAL1_1_Source_(_Inout_bytecap_c_, (size), _Pre_valid_bytecap_c_(size) _Post_valid_) +#define _Inout_opt_bytecap_c_(size) _SAL1_1_Source_(_Inout_opt_bytecap_c_, (size), _Pre_opt_valid_bytecap_c_(size) _Post_valid_) + +#define _Inout_cap_x_(size) _SAL1_1_Source_(_Inout_cap_x_, (size), _Pre_valid_cap_x_(size) _Post_valid_) +#define _Inout_opt_cap_x_(size) _SAL1_1_Source_(_Inout_opt_cap_x_, (size), _Pre_opt_valid_cap_x_(size) _Post_valid_) +#define _Inout_bytecap_x_(size) _SAL1_1_Source_(_Inout_bytecap_x_, (size), _Pre_valid_bytecap_x_(size) _Post_valid_) +#define _Inout_opt_bytecap_x_(size) _SAL1_1_Source_(_Inout_opt_bytecap_x_, (size), _Pre_opt_valid_bytecap_x_(size) _Post_valid_) + +// inout string buffers with writable size +// e.g. 
void AppendStr( _In_z_ const char* szFrom, _Inout_z_cap_(cchTo) char* szTo, size_t cchTo ); +#define _Inout_z_cap_(size) _SAL1_1_Source_(_Inout_z_cap_, (size), _Pre_z_cap_(size) _Post_z_) +#define _Inout_opt_z_cap_(size) _SAL1_1_Source_(_Inout_opt_z_cap_, (size), _Pre_opt_z_cap_(size) _Post_z_) +#define _Inout_z_bytecap_(size) _SAL1_1_Source_(_Inout_z_bytecap_, (size), _Pre_z_bytecap_(size) _Post_z_) +#define _Inout_opt_z_bytecap_(size) _SAL1_1_Source_(_Inout_opt_z_bytecap_, (size), _Pre_opt_z_bytecap_(size) _Post_z_) + +#define _Inout_z_cap_c_(size) _SAL1_1_Source_(_Inout_z_cap_c_, (size), _Pre_z_cap_c_(size) _Post_z_) +#define _Inout_opt_z_cap_c_(size) _SAL1_1_Source_(_Inout_opt_z_cap_c_, (size), _Pre_opt_z_cap_c_(size) _Post_z_) +#define _Inout_z_bytecap_c_(size) _SAL1_1_Source_(_Inout_z_bytecap_c_, (size), _Pre_z_bytecap_c_(size) _Post_z_) +#define _Inout_opt_z_bytecap_c_(size) _SAL1_1_Source_(_Inout_opt_z_bytecap_c_, (size), _Pre_opt_z_bytecap_c_(size) _Post_z_) + +#define _Inout_z_cap_x_(size) _SAL1_1_Source_(_Inout_z_cap_x_, (size), _Pre_z_cap_x_(size) _Post_z_) +#define _Inout_opt_z_cap_x_(size) _SAL1_1_Source_(_Inout_opt_z_cap_x_, (size), _Pre_opt_z_cap_x_(size) _Post_z_) +#define _Inout_z_bytecap_x_(size) _SAL1_1_Source_(_Inout_z_bytecap_x_, (size), _Pre_z_bytecap_x_(size) _Post_z_) +#define _Inout_opt_z_bytecap_x_(size) _SAL1_1_Source_(_Inout_opt_z_bytecap_x_, (size), _Pre_opt_z_bytecap_x_(size) _Post_z_) + + +// returning pointers to valid objects +#define _Ret_ _SAL1_1_Source_(_Ret_, (), _Ret_valid_) +#define _Ret_opt_ _SAL1_1_Source_(_Ret_opt_, (), _Ret_opt_valid_) + +// annotations to express 'boundedness' of integral value parameter +#define _In_bound_ _SAL1_1_Source_(_In_bound_, (), _In_bound_impl_) +#define _Out_bound_ _SAL1_1_Source_(_Out_bound_, (), _Out_bound_impl_) +#define _Ret_bound_ _SAL1_1_Source_(_Ret_bound_, (), _Ret_bound_impl_) +#define _Deref_in_bound_ _SAL1_1_Source_(_Deref_in_bound_, (), _Deref_in_bound_impl_) +#define _Deref_out_bound_ _SAL1_1_Source_(_Deref_out_bound_, (), _Deref_out_bound_impl_) +#define _Deref_inout_bound_ _SAL1_1_Source_(_Deref_inout_bound_, (), _Deref_in_bound_ _Deref_out_bound_) +#define _Deref_ret_bound_ _SAL1_1_Source_(_Deref_ret_bound_, (), _Deref_ret_bound_impl_) + +// e.g. HRESULT HrCreatePoint( _Deref_out_opt_ POINT** ppPT ); +#define _Deref_out_ _SAL1_1_Source_(_Deref_out_, (), _Out_ _Deref_post_valid_) +#define _Deref_out_opt_ _SAL1_1_Source_(_Deref_out_opt_, (), _Out_ _Deref_post_opt_valid_) +#define _Deref_opt_out_ _SAL1_1_Source_(_Deref_opt_out_, (), _Out_opt_ _Deref_post_valid_) +#define _Deref_opt_out_opt_ _SAL1_1_Source_(_Deref_opt_out_opt_, (), _Out_opt_ _Deref_post_opt_valid_) + +// e.g. void CloneString( _In_z_ const WCHAR* wzFrom, _Deref_out_z_ WCHAR** pWzTo ); +#define _Deref_out_z_ _SAL1_1_Source_(_Deref_out_z_, (), _Out_ _Deref_post_z_) +#define _Deref_out_opt_z_ _SAL1_1_Source_(_Deref_out_opt_z_, (), _Out_ _Deref_post_opt_z_) +#define _Deref_opt_out_z_ _SAL1_1_Source_(_Deref_opt_out_z_, (), _Out_opt_ _Deref_post_z_) +#define _Deref_opt_out_opt_z_ _SAL1_1_Source_(_Deref_opt_out_opt_z_, (), _Out_opt_ _Deref_post_opt_z_) + +// +// _Deref_pre_ --- +// +// describing conditions for array elements of dereferenced pointer parameters that must be met before the call + +// e.g. 
void SaveStringArray( _In_count_(cStrings) _Deref_pre_z_ const WCHAR* const rgpwch[] ); +#define _Deref_pre_z_ _SAL1_1_Source_(_Deref_pre_z_, (), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__zterm_impl) _Pre_valid_impl_) +#define _Deref_pre_opt_z_ _SAL1_1_Source_(_Deref_pre_opt_z_, (), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__zterm_impl) _Pre_valid_impl_) + +// e.g. void FillInArrayOfStr32( _In_count_(cStrings) _Deref_pre_cap_c_(32) _Deref_post_z_ WCHAR* const rgpwch[] ); +// buffer capacity is described by another parameter +#define _Deref_pre_cap_(size) _SAL1_1_Source_(_Deref_pre_cap_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__cap_impl(size))) +#define _Deref_pre_opt_cap_(size) _SAL1_1_Source_(_Deref_pre_opt_cap_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__cap_impl(size))) +#define _Deref_pre_bytecap_(size) _SAL1_1_Source_(_Deref_pre_bytecap_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__bytecap_impl(size))) +#define _Deref_pre_opt_bytecap_(size) _SAL1_1_Source_(_Deref_pre_opt_bytecap_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__bytecap_impl(size))) + +// buffer capacity is described by a constant expression +#define _Deref_pre_cap_c_(size) _SAL1_1_Source_(_Deref_pre_cap_c_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__cap_c_impl(size))) +#define _Deref_pre_opt_cap_c_(size) _SAL1_1_Source_(_Deref_pre_opt_cap_c_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__cap_c_impl(size))) +#define _Deref_pre_bytecap_c_(size) _SAL1_1_Source_(_Deref_pre_bytecap_c_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__bytecap_c_impl(size))) +#define _Deref_pre_opt_bytecap_c_(size) _SAL1_1_Source_(_Deref_pre_opt_bytecap_c_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__bytecap_c_impl(size))) + +// buffer capacity is described by a complex condition +#define _Deref_pre_cap_x_(size) _SAL1_1_Source_(_Deref_pre_cap_x_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__cap_x_impl(size))) +#define _Deref_pre_opt_cap_x_(size) _SAL1_1_Source_(_Deref_pre_opt_cap_x_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__cap_x_impl(size))) +#define _Deref_pre_bytecap_x_(size) _SAL1_1_Source_(_Deref_pre_bytecap_x_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__bytecap_x_impl(size))) +#define _Deref_pre_opt_bytecap_x_(size) _SAL1_1_Source_(_Deref_pre_opt_bytecap_x_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__bytecap_x_impl(size))) + +// convenience macros for nullterminated buffers with given capacity +#define _Deref_pre_z_cap_(size) _SAL1_1_Source_(_Deref_pre_z_cap_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__cap_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_opt_z_cap_(size) _SAL1_1_Source_(_Deref_pre_opt_z_cap_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__cap_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_z_bytecap_(size) _SAL1_1_Source_(_Deref_pre_z_bytecap_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__bytecap_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_opt_z_bytecap_(size) _SAL1_1_Source_(_Deref_pre_opt_z_bytecap_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__bytecap_impl(size)) _Pre_valid_impl_) + +#define 
_Deref_pre_z_cap_c_(size) _SAL1_1_Source_(_Deref_pre_z_cap_c_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__cap_c_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_opt_z_cap_c_(size) _SAL1_1_Source_(_Deref_pre_opt_z_cap_c_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__cap_c_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_z_bytecap_c_(size) _SAL1_1_Source_(_Deref_pre_z_bytecap_c_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__bytecap_c_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_opt_z_bytecap_c_(size) _SAL1_1_Source_(_Deref_pre_opt_z_bytecap_c_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__bytecap_c_impl(size)) _Pre_valid_impl_) + +#define _Deref_pre_z_cap_x_(size) _SAL1_1_Source_(_Deref_pre_z_cap_x_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__cap_x_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_opt_z_cap_x_(size) _SAL1_1_Source_(_Deref_pre_opt_z_cap_x_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__cap_x_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_z_bytecap_x_(size) _SAL1_1_Source_(_Deref_pre_z_bytecap_x_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__bytecap_x_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_opt_z_bytecap_x_(size) _SAL1_1_Source_(_Deref_pre_opt_z_bytecap_x_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__bytecap_x_impl(size)) _Pre_valid_impl_) + +// known capacity and valid but unknown readable extent +#define _Deref_pre_valid_cap_(size) _SAL1_1_Source_(_Deref_pre_valid_cap_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__cap_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_opt_valid_cap_(size) _SAL1_1_Source_(_Deref_pre_opt_valid_cap_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__cap_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_valid_bytecap_(size) _SAL1_1_Source_(_Deref_pre_valid_bytecap_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__bytecap_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_opt_valid_bytecap_(size) _SAL1_1_Source_(_Deref_pre_opt_valid_bytecap_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__bytecap_impl(size)) _Pre_valid_impl_) + +#define _Deref_pre_valid_cap_c_(size) _SAL1_1_Source_(_Deref_pre_valid_cap_c_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__cap_c_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_opt_valid_cap_c_(size) _SAL1_1_Source_(_Deref_pre_opt_valid_cap_c_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__cap_c_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_valid_bytecap_c_(size) _SAL1_1_Source_(_Deref_pre_valid_bytecap_c_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__bytecap_c_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_opt_valid_bytecap_c_(size) _SAL1_1_Source_(_Deref_pre_opt_valid_bytecap_c_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__bytecap_c_impl(size)) _Pre_valid_impl_) + +#define _Deref_pre_valid_cap_x_(size) _SAL1_1_Source_(_Deref_pre_valid_cap_x_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__cap_x_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_opt_valid_cap_x_(size) _SAL1_1_Source_(_Deref_pre_opt_valid_cap_x_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) 
_Deref_pre1_impl_(__cap_x_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_valid_bytecap_x_(size) _SAL1_1_Source_(_Deref_pre_valid_bytecap_x_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__bytecap_x_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_opt_valid_bytecap_x_(size) _SAL1_1_Source_(_Deref_pre_opt_valid_bytecap_x_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__bytecap_x_impl(size)) _Pre_valid_impl_) + +// e.g. void SaveMatrix( _In_count_(n) _Deref_pre_count_(n) const Elem** matrix, size_t n ); +// valid buffer extent is described by another parameter +#define _Deref_pre_count_(size) _SAL1_1_Source_(_Deref_pre_count_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__count_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_opt_count_(size) _SAL1_1_Source_(_Deref_pre_opt_count_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__count_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_bytecount_(size) _SAL1_1_Source_(_Deref_pre_bytecount_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__bytecount_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_opt_bytecount_(size) _SAL1_1_Source_(_Deref_pre_opt_bytecount_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__bytecount_impl(size)) _Pre_valid_impl_) + +// valid buffer extent is described by a constant expression +#define _Deref_pre_count_c_(size) _SAL1_1_Source_(_Deref_pre_count_c_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__count_c_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_opt_count_c_(size) _SAL1_1_Source_(_Deref_pre_opt_count_c_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__count_c_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_bytecount_c_(size) _SAL1_1_Source_(_Deref_pre_bytecount_c_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__bytecount_c_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_opt_bytecount_c_(size) _SAL1_1_Source_(_Deref_pre_opt_bytecount_c_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__bytecount_c_impl(size)) _Pre_valid_impl_) + +// valid buffer extent is described by a complex expression +#define _Deref_pre_count_x_(size) _SAL1_1_Source_(_Deref_pre_count_x_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__count_x_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_opt_count_x_(size) _SAL1_1_Source_(_Deref_pre_opt_count_x_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__count_x_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_bytecount_x_(size) _SAL1_1_Source_(_Deref_pre_bytecount_x_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__bytecount_x_impl(size)) _Pre_valid_impl_) +#define _Deref_pre_opt_bytecount_x_(size) _SAL1_1_Source_(_Deref_pre_opt_bytecount_x_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__bytecount_x_impl(size)) _Pre_valid_impl_) + +// e.g. 
void PrintStringArray( _In_count_(cElems) _Deref_pre_valid_ LPCSTR rgStr[], size_t cElems ); +#define _Deref_pre_valid_ _SAL1_1_Source_(_Deref_pre_valid_, (), _Deref_pre1_impl_(__notnull_impl_notref) _Pre_valid_impl_) +#define _Deref_pre_opt_valid_ _SAL1_1_Source_(_Deref_pre_opt_valid_, (), _Deref_pre1_impl_(__maybenull_impl_notref) _Pre_valid_impl_) +#define _Deref_pre_invalid_ _SAL1_1_Source_(_Deref_pre_invalid_, (), _Deref_pre1_impl_(__notvalid_impl)) + +#define _Deref_pre_notnull_ _SAL1_1_Source_(_Deref_pre_notnull_, (), _Deref_pre1_impl_(__notnull_impl_notref)) +#define _Deref_pre_maybenull_ _SAL1_1_Source_(_Deref_pre_maybenull_, (), _Deref_pre1_impl_(__maybenull_impl_notref)) +#define _Deref_pre_null_ _SAL1_1_Source_(_Deref_pre_null_, (), _Deref_pre1_impl_(__null_impl_notref)) + +// restrict access rights +#define _Deref_pre_readonly_ _SAL1_1_Source_(_Deref_pre_readonly_, (), _Deref_pre1_impl_(__readaccess_impl_notref)) +#define _Deref_pre_writeonly_ _SAL1_1_Source_(_Deref_pre_writeonly_, (), _Deref_pre1_impl_(__writeaccess_impl_notref)) + +// +// _Deref_post_ --- +// +// describing conditions for array elements or dereferenced pointer parameters that hold after the call + +// e.g. void CloneString( _In_z_ const Wchar_t* wzIn _Out_ _Deref_post_z_ WCHAR** pWzOut ); +#define _Deref_post_z_ _SAL1_1_Source_(_Deref_post_z_, (), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__zterm_impl) _Post_valid_impl_) +#define _Deref_post_opt_z_ _SAL1_1_Source_(_Deref_post_opt_z_, (), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__zterm_impl) _Post_valid_impl_) + +// e.g. HRESULT HrAllocateMemory( size_t cb, _Out_ _Deref_post_bytecap_(cb) void** ppv ); +// buffer capacity is described by another parameter +#define _Deref_post_cap_(size) _SAL1_1_Source_(_Deref_post_cap_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__cap_impl(size))) +#define _Deref_post_opt_cap_(size) _SAL1_1_Source_(_Deref_post_opt_cap_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__cap_impl(size))) +#define _Deref_post_bytecap_(size) _SAL1_1_Source_(_Deref_post_bytecap_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__bytecap_impl(size))) +#define _Deref_post_opt_bytecap_(size) _SAL1_1_Source_(_Deref_post_opt_bytecap_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__bytecap_impl(size))) + +// buffer capacity is described by a constant expression +#define _Deref_post_cap_c_(size) _SAL1_1_Source_(_Deref_post_cap_c_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__cap_c_impl(size))) +#define _Deref_post_opt_cap_c_(size) _SAL1_1_Source_(_Deref_post_opt_cap_c_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__cap_c_impl(size))) +#define _Deref_post_bytecap_c_(size) _SAL1_1_Source_(_Deref_post_bytecap_c_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__bytecap_c_impl(size))) +#define _Deref_post_opt_bytecap_c_(size) _SAL1_1_Source_(_Deref_post_opt_bytecap_c_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__bytecap_c_impl(size))) + +// buffer capacity is described by a complex expression +#define _Deref_post_cap_x_(size) _SAL1_1_Source_(_Deref_post_cap_x_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__cap_x_impl(size))) +#define _Deref_post_opt_cap_x_(size) _SAL1_1_Source_(_Deref_post_opt_cap_x_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__cap_x_impl(size))) 
+#define _Deref_post_bytecap_x_(size) _SAL1_1_Source_(_Deref_post_bytecap_x_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__bytecap_x_impl(size))) +#define _Deref_post_opt_bytecap_x_(size) _SAL1_1_Source_(_Deref_post_opt_bytecap_x_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__bytecap_x_impl(size))) + +// convenience macros for nullterminated buffers with given capacity +#define _Deref_post_z_cap_(size) _SAL1_1_Source_(_Deref_post_z_cap_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post2_impl_(__zterm_impl,__cap_impl(size)) _Post_valid_impl_) +#define _Deref_post_opt_z_cap_(size) _SAL1_1_Source_(_Deref_post_opt_z_cap_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post2_impl_(__zterm_impl,__cap_impl(size)) _Post_valid_impl_) +#define _Deref_post_z_bytecap_(size) _SAL1_1_Source_(_Deref_post_z_bytecap_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post2_impl_(__zterm_impl,__bytecap_impl(size)) _Post_valid_impl_) +#define _Deref_post_opt_z_bytecap_(size) _SAL1_1_Source_(_Deref_post_opt_z_bytecap_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post2_impl_(__zterm_impl,__bytecap_impl(size)) _Post_valid_impl_) + +#define _Deref_post_z_cap_c_(size) _SAL1_1_Source_(_Deref_post_z_cap_c_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post2_impl_(__zterm_impl,__cap_c_impl(size)) _Post_valid_impl_) +#define _Deref_post_opt_z_cap_c_(size) _SAL1_1_Source_(_Deref_post_opt_z_cap_c_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post2_impl_(__zterm_impl,__cap_c_impl(size)) _Post_valid_impl_) +#define _Deref_post_z_bytecap_c_(size) _SAL1_1_Source_(_Deref_post_z_bytecap_c_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post2_impl_(__zterm_impl,__bytecap_c_impl(size)) _Post_valid_impl_) +#define _Deref_post_opt_z_bytecap_c_(size) _SAL1_1_Source_(_Deref_post_opt_z_bytecap_c_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post2_impl_(__zterm_impl,__bytecap_c_impl(size)) _Post_valid_impl_) + +#define _Deref_post_z_cap_x_(size) _SAL1_1_Source_(_Deref_post_z_cap_x_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post2_impl_(__zterm_impl,__cap_x_impl(size)) _Post_valid_impl_) +#define _Deref_post_opt_z_cap_x_(size) _SAL1_1_Source_(_Deref_post_opt_z_cap_x_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post2_impl_(__zterm_impl,__cap_x_impl(size)) _Post_valid_impl_) +#define _Deref_post_z_bytecap_x_(size) _SAL1_1_Source_(_Deref_post_z_bytecap_x_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post2_impl_(__zterm_impl,__bytecap_x_impl(size)) _Post_valid_impl_) +#define _Deref_post_opt_z_bytecap_x_(size) _SAL1_1_Source_(_Deref_post_opt_z_bytecap_x_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post2_impl_(__zterm_impl,__bytecap_x_impl(size)) _Post_valid_impl_) + +// known capacity and valid but unknown readable extent +#define _Deref_post_valid_cap_(size) _SAL1_1_Source_(_Deref_post_valid_cap_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__cap_impl(size)) _Post_valid_impl_) +#define _Deref_post_opt_valid_cap_(size) _SAL1_1_Source_(_Deref_post_opt_valid_cap_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__cap_impl(size)) _Post_valid_impl_) +#define _Deref_post_valid_bytecap_(size) _SAL1_1_Source_(_Deref_post_valid_bytecap_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__bytecap_impl(size)) _Post_valid_impl_) +#define 
_Deref_post_opt_valid_bytecap_(size) _SAL1_1_Source_(_Deref_post_opt_valid_bytecap_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__bytecap_impl(size)) _Post_valid_impl_) + +#define _Deref_post_valid_cap_c_(size) _SAL1_1_Source_(_Deref_post_valid_cap_c_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__cap_c_impl(size)) _Post_valid_impl_) +#define _Deref_post_opt_valid_cap_c_(size) _SAL1_1_Source_(_Deref_post_opt_valid_cap_c_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__cap_c_impl(size)) _Post_valid_impl_) +#define _Deref_post_valid_bytecap_c_(size) _SAL1_1_Source_(_Deref_post_valid_bytecap_c_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__bytecap_c_impl(size)) _Post_valid_impl_) +#define _Deref_post_opt_valid_bytecap_c_(size) _SAL1_1_Source_(_Deref_post_opt_valid_bytecap_c_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__bytecap_c_impl(size)) _Post_valid_impl_) + +#define _Deref_post_valid_cap_x_(size) _SAL1_1_Source_(_Deref_post_valid_cap_x_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__cap_x_impl(size)) _Post_valid_impl_) +#define _Deref_post_opt_valid_cap_x_(size) _SAL1_1_Source_(_Deref_post_opt_valid_cap_x_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__cap_x_impl(size)) _Post_valid_impl_) +#define _Deref_post_valid_bytecap_x_(size) _SAL1_1_Source_(_Deref_post_valid_bytecap_x_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__bytecap_x_impl(size)) _Post_valid_impl_) +#define _Deref_post_opt_valid_bytecap_x_(size) _SAL1_1_Source_(_Deref_post_opt_valid_bytecap_x_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__bytecap_x_impl(size)) _Post_valid_impl_) + +// e.g. 
HRESULT HrAllocateZeroInitializedMemory( size_t cb, _Out_ _Deref_post_bytecount_(cb) void** ppv ); +// valid buffer extent is described by another parameter +#define _Deref_post_count_(size) _SAL1_1_Source_(_Deref_post_count_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__count_impl(size)) _Post_valid_impl_) +#define _Deref_post_opt_count_(size) _SAL1_1_Source_(_Deref_post_opt_count_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__count_impl(size)) _Post_valid_impl_) +#define _Deref_post_bytecount_(size) _SAL1_1_Source_(_Deref_post_bytecount_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__bytecount_impl(size)) _Post_valid_impl_) +#define _Deref_post_opt_bytecount_(size) _SAL1_1_Source_(_Deref_post_opt_bytecount_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__bytecount_impl(size)) _Post_valid_impl_) + +// buffer capacity is described by a constant expression +#define _Deref_post_count_c_(size) _SAL1_1_Source_(_Deref_post_count_c_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__count_c_impl(size)) _Post_valid_impl_) +#define _Deref_post_opt_count_c_(size) _SAL1_1_Source_(_Deref_post_opt_count_c_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__count_c_impl(size)) _Post_valid_impl_) +#define _Deref_post_bytecount_c_(size) _SAL1_1_Source_(_Deref_post_bytecount_c_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__bytecount_c_impl(size)) _Post_valid_impl_) +#define _Deref_post_opt_bytecount_c_(size) _SAL1_1_Source_(_Deref_post_opt_bytecount_c_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__bytecount_c_impl(size)) _Post_valid_impl_) + +// buffer capacity is described by a complex expression +#define _Deref_post_count_x_(size) _SAL1_1_Source_(_Deref_post_count_x_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__count_x_impl(size)) _Post_valid_impl_) +#define _Deref_post_opt_count_x_(size) _SAL1_1_Source_(_Deref_post_opt_count_x_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__count_x_impl(size)) _Post_valid_impl_) +#define _Deref_post_bytecount_x_(size) _SAL1_1_Source_(_Deref_post_bytecount_x_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__bytecount_x_impl(size)) _Post_valid_impl_) +#define _Deref_post_opt_bytecount_x_(size) _SAL1_1_Source_(_Deref_post_opt_bytecount_x_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__bytecount_x_impl(size)) _Post_valid_impl_) + +// e.g. 
void GetStrings( _Out_count_(cElems) _Deref_post_valid_ LPSTR const rgStr[], size_t cElems ); +#define _Deref_post_valid_ _SAL1_1_Source_(_Deref_post_valid_, (), _Deref_post1_impl_(__notnull_impl_notref) _Post_valid_impl_) +#define _Deref_post_opt_valid_ _SAL1_1_Source_(_Deref_post_opt_valid_, (), _Deref_post1_impl_(__maybenull_impl_notref) _Post_valid_impl_) + +#define _Deref_post_notnull_ _SAL1_1_Source_(_Deref_post_notnull_, (), _Deref_post1_impl_(__notnull_impl_notref)) +#define _Deref_post_maybenull_ _SAL1_1_Source_(_Deref_post_maybenull_, (), _Deref_post1_impl_(__maybenull_impl_notref)) +#define _Deref_post_null_ _SAL1_1_Source_(_Deref_post_null_, (), _Deref_post1_impl_(__null_impl_notref)) + +// +// _Deref_ret_ --- +// + +#define _Deref_ret_z_ _SAL1_1_Source_(_Deref_ret_z_, (), _Deref_ret1_impl_(__notnull_impl_notref) _Deref_ret1_impl_(__zterm_impl)) +#define _Deref_ret_opt_z_ _SAL1_1_Source_(_Deref_ret_opt_z_, (), _Deref_ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__zterm_impl)) + +// +// special _Deref_ --- +// +#define _Deref2_pre_readonly_ _SAL1_1_Source_(_Deref2_pre_readonly_, (), _Deref2_pre1_impl_(__readaccess_impl_notref)) + +// +// _Ret_ --- +// + +// e.g. _Ret_opt_valid_ LPSTR void* CloneSTR( _Pre_valid_ LPSTR src ); +#define _Ret_opt_valid_ _SAL1_1_Source_(_Ret_opt_valid_, (), _Ret1_impl_(__maybenull_impl_notref) _Ret_valid_impl_) +#define _Ret_opt_z_ _SAL1_1_Source_(_Ret_opt_z_, (), _Ret2_impl_(__maybenull_impl,__zterm_impl) _Ret_valid_impl_) + +// e.g. _Ret_opt_bytecap_(cb) void* AllocateMemory( size_t cb ); +// Buffer capacity is described by another parameter +#define _Ret_cap_(size) _SAL1_1_Source_(_Ret_cap_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__cap_impl(size))) +#define _Ret_opt_cap_(size) _SAL1_1_Source_(_Ret_opt_cap_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__cap_impl(size))) +#define _Ret_bytecap_(size) _SAL1_1_Source_(_Ret_bytecap_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__bytecap_impl(size))) +#define _Ret_opt_bytecap_(size) _SAL1_1_Source_(_Ret_opt_bytecap_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__bytecap_impl(size))) + +// Buffer capacity is described by a constant expression +#define _Ret_cap_c_(size) _SAL1_1_Source_(_Ret_cap_c_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__cap_c_impl(size))) +#define _Ret_opt_cap_c_(size) _SAL1_1_Source_(_Ret_opt_cap_c_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__cap_c_impl(size))) +#define _Ret_bytecap_c_(size) _SAL1_1_Source_(_Ret_bytecap_c_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__bytecap_c_impl(size))) +#define _Ret_opt_bytecap_c_(size) _SAL1_1_Source_(_Ret_opt_bytecap_c_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__bytecap_c_impl(size))) + +// Buffer capacity is described by a complex condition +#define _Ret_cap_x_(size) _SAL1_1_Source_(_Ret_cap_x_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__cap_x_impl(size))) +#define _Ret_opt_cap_x_(size) _SAL1_1_Source_(_Ret_opt_cap_x_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__cap_x_impl(size))) +#define _Ret_bytecap_x_(size) _SAL1_1_Source_(_Ret_bytecap_x_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__bytecap_x_impl(size))) +#define _Ret_opt_bytecap_x_(size) _SAL1_1_Source_(_Ret_opt_bytecap_x_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__bytecap_x_impl(size))) + +// return value is nullterminated and capacity is given by another parameter +#define _Ret_z_cap_(size) 
_SAL1_1_Source_(_Ret_z_cap_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret2_impl_(__zterm_impl,__cap_impl(size)) _Ret_valid_impl_) +#define _Ret_opt_z_cap_(size) _SAL1_1_Source_(_Ret_opt_z_cap_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret2_impl_(__zterm_impl,__cap_impl(size)) _Ret_valid_impl_) +#define _Ret_z_bytecap_(size) _SAL1_1_Source_(_Ret_z_bytecap_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret2_impl_(__zterm_impl,__bytecap_impl(size)) _Ret_valid_impl_) +#define _Ret_opt_z_bytecap_(size) _SAL1_1_Source_(_Ret_opt_z_bytecap_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret2_impl_(__zterm_impl,__bytecap_impl(size)) _Ret_valid_impl_) + +// e.g. _Ret_opt_bytecount_(cb) void* AllocateZeroInitializedMemory( size_t cb ); +// Valid Buffer extent is described by another parameter +#define _Ret_count_(size) _SAL1_1_Source_(_Ret_count_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__count_impl(size)) _Ret_valid_impl_) +#define _Ret_opt_count_(size) _SAL1_1_Source_(_Ret_opt_count_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__count_impl(size)) _Ret_valid_impl_) +#define _Ret_bytecount_(size) _SAL1_1_Source_(_Ret_bytecount_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__bytecount_impl(size)) _Ret_valid_impl_) +#define _Ret_opt_bytecount_(size) _SAL1_1_Source_(_Ret_opt_bytecount_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__bytecount_impl(size)) _Ret_valid_impl_) + +// Valid Buffer extent is described by a constant expression +#define _Ret_count_c_(size) _SAL1_1_Source_(_Ret_count_c_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__count_c_impl(size)) _Ret_valid_impl_) +#define _Ret_opt_count_c_(size) _SAL1_1_Source_(_Ret_opt_count_c_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__count_c_impl(size)) _Ret_valid_impl_) +#define _Ret_bytecount_c_(size) _SAL1_1_Source_(_Ret_bytecount_c_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__bytecount_c_impl(size)) _Ret_valid_impl_) +#define _Ret_opt_bytecount_c_(size) _SAL1_1_Source_(_Ret_opt_bytecount_c_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__bytecount_c_impl(size)) _Ret_valid_impl_) + +// Valid Buffer extent is described by a complex expression +#define _Ret_count_x_(size) _SAL1_1_Source_(_Ret_count_x_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__count_x_impl(size)) _Ret_valid_impl_) +#define _Ret_opt_count_x_(size) _SAL1_1_Source_(_Ret_opt_count_x_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__count_x_impl(size)) _Ret_valid_impl_) +#define _Ret_bytecount_x_(size) _SAL1_1_Source_(_Ret_bytecount_x_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__bytecount_x_impl(size)) _Ret_valid_impl_) +#define _Ret_opt_bytecount_x_(size) _SAL1_1_Source_(_Ret_opt_bytecount_x_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__bytecount_x_impl(size)) _Ret_valid_impl_) + +// return value is nullterminated and length is given by another parameter +#define _Ret_z_count_(size) _SAL1_1_Source_(_Ret_z_count_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret2_impl_(__zterm_impl,__count_impl(size)) _Ret_valid_impl_) +#define _Ret_opt_z_count_(size) _SAL1_1_Source_(_Ret_opt_z_count_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret2_impl_(__zterm_impl,__count_impl(size)) _Ret_valid_impl_) +#define _Ret_z_bytecount_(size) _SAL1_1_Source_(_Ret_z_bytecount_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret2_impl_(__zterm_impl,__bytecount_impl(size)) _Ret_valid_impl_) +#define _Ret_opt_z_bytecount_(size) 
_SAL1_1_Source_(_Ret_opt_z_bytecount_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret2_impl_(__zterm_impl,__bytecount_impl(size)) _Ret_valid_impl_) + + +// _Pre_ annotations --- +#define _Pre_opt_z_ _SAL1_1_Source_(_Pre_opt_z_, (), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__zterm_impl) _Pre_valid_impl_) + +// restrict access rights +#define _Pre_readonly_ _SAL1_1_Source_(_Pre_readonly_, (), _Pre1_impl_(__readaccess_impl_notref)) +#define _Pre_writeonly_ _SAL1_1_Source_(_Pre_writeonly_, (), _Pre1_impl_(__writeaccess_impl_notref)) + +// e.g. void FreeMemory( _Pre_bytecap_(cb) _Post_ptr_invalid_ void* pv, size_t cb ); +// buffer capacity described by another parameter +#define _Pre_cap_(size) _SAL1_1_Source_(_Pre_cap_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_impl(size))) +#define _Pre_opt_cap_(size) _SAL1_1_Source_(_Pre_opt_cap_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_impl(size))) +#define _Pre_bytecap_(size) _SAL1_1_Source_(_Pre_bytecap_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__bytecap_impl(size))) +#define _Pre_opt_bytecap_(size) _SAL1_1_Source_(_Pre_opt_bytecap_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__bytecap_impl(size))) + +// buffer capacity described by a constant expression +#define _Pre_cap_c_(size) _SAL1_1_Source_(_Pre_cap_c_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_c_impl(size))) +#define _Pre_opt_cap_c_(size) _SAL1_1_Source_(_Pre_opt_cap_c_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_c_impl(size))) +#define _Pre_bytecap_c_(size) _SAL1_1_Source_(_Pre_bytecap_c_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__bytecap_c_impl(size))) +#define _Pre_opt_bytecap_c_(size) _SAL1_1_Source_(_Pre_opt_bytecap_c_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__bytecap_c_impl(size))) +#define _Pre_cap_c_one_ _SAL1_1_Source_(_Pre_cap_c_one_, (), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_c_one_notref_impl)) +#define _Pre_opt_cap_c_one_ _SAL1_1_Source_(_Pre_opt_cap_c_one_, (), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_c_one_notref_impl)) + +// buffer capacity is described by another parameter multiplied by a constant expression +#define _Pre_cap_m_(mult,size) _SAL1_1_Source_(_Pre_cap_m_, (mult,size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__mult_impl(mult,size))) +#define _Pre_opt_cap_m_(mult,size) _SAL1_1_Source_(_Pre_opt_cap_m_, (mult,size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__mult_impl(mult,size))) + +// buffer capacity described by size of other buffer, only used by dangerous legacy APIs +// e.g. 
int strcpy(_Pre_cap_for_(src) char* dst, const char* src); +#define _Pre_cap_for_(param) _SAL1_1_Source_(_Pre_cap_for_, (param), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_for_impl(param))) +#define _Pre_opt_cap_for_(param) _SAL1_1_Source_(_Pre_opt_cap_for_, (param), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_for_impl(param))) + +// buffer capacity described by a complex condition +#define _Pre_cap_x_(size) _SAL1_1_Source_(_Pre_cap_x_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_x_impl(size))) +#define _Pre_opt_cap_x_(size) _SAL1_1_Source_(_Pre_opt_cap_x_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_x_impl(size))) +#define _Pre_bytecap_x_(size) _SAL1_1_Source_(_Pre_bytecap_x_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__bytecap_x_impl(size))) +#define _Pre_opt_bytecap_x_(size) _SAL1_1_Source_(_Pre_opt_bytecap_x_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__bytecap_x_impl(size))) + +// buffer capacity described by the difference to another pointer parameter +#define _Pre_ptrdiff_cap_(ptr) _SAL1_1_Source_(_Pre_ptrdiff_cap_, (ptr), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_x_impl(__ptrdiff(ptr)))) +#define _Pre_opt_ptrdiff_cap_(ptr) _SAL1_1_Source_(_Pre_opt_ptrdiff_cap_, (ptr), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_x_impl(__ptrdiff(ptr)))) + +// e.g. void AppendStr( _Pre_z_ const char* szFrom, _Pre_z_cap_(cchTo) _Post_z_ char* szTo, size_t cchTo ); +#define _Pre_z_cap_(size) _SAL1_1_Source_(_Pre_z_cap_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre2_impl_(__zterm_impl,__cap_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_z_cap_(size) _SAL1_1_Source_(_Pre_opt_z_cap_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre2_impl_(__zterm_impl,__cap_impl(size)) _Pre_valid_impl_) +#define _Pre_z_bytecap_(size) _SAL1_1_Source_(_Pre_z_bytecap_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre2_impl_(__zterm_impl,__bytecap_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_z_bytecap_(size) _SAL1_1_Source_(_Pre_opt_z_bytecap_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre2_impl_(__zterm_impl,__bytecap_impl(size)) _Pre_valid_impl_) + +#define _Pre_z_cap_c_(size) _SAL1_1_Source_(_Pre_z_cap_c_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre2_impl_(__zterm_impl,__cap_c_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_z_cap_c_(size) _SAL1_1_Source_(_Pre_opt_z_cap_c_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre2_impl_(__zterm_impl,__cap_c_impl(size)) _Pre_valid_impl_) +#define _Pre_z_bytecap_c_(size) _SAL1_1_Source_(_Pre_z_bytecap_c_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre2_impl_(__zterm_impl,__bytecap_c_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_z_bytecap_c_(size) _SAL1_1_Source_(_Pre_opt_z_bytecap_c_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre2_impl_(__zterm_impl,__bytecap_c_impl(size)) _Pre_valid_impl_) + +#define _Pre_z_cap_x_(size) _SAL1_1_Source_(_Pre_z_cap_x_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre2_impl_(__zterm_impl,__cap_x_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_z_cap_x_(size) _SAL1_1_Source_(_Pre_opt_z_cap_x_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre2_impl_(__zterm_impl,__cap_x_impl(size)) _Pre_valid_impl_) +#define _Pre_z_bytecap_x_(size) _SAL1_1_Source_(_Pre_z_bytecap_x_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre2_impl_(__zterm_impl,__bytecap_x_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_z_bytecap_x_(size) _SAL1_1_Source_(_Pre_opt_z_bytecap_x_, (size), _Pre1_impl_(__maybenull_impl_notref) 
_Pre2_impl_(__zterm_impl,__bytecap_x_impl(size)) _Pre_valid_impl_) + +// known capacity and valid but unknown readable extent +#define _Pre_valid_cap_(size) _SAL1_1_Source_(_Pre_valid_cap_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_valid_cap_(size) _SAL1_1_Source_(_Pre_opt_valid_cap_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_impl(size)) _Pre_valid_impl_) +#define _Pre_valid_bytecap_(size) _SAL1_1_Source_(_Pre_valid_bytecap_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__bytecap_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_valid_bytecap_(size) _SAL1_1_Source_(_Pre_opt_valid_bytecap_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__bytecap_impl(size)) _Pre_valid_impl_) + +#define _Pre_valid_cap_c_(size) _SAL1_1_Source_(_Pre_valid_cap_c_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_c_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_valid_cap_c_(size) _SAL1_1_Source_(_Pre_opt_valid_cap_c_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_c_impl(size)) _Pre_valid_impl_) +#define _Pre_valid_bytecap_c_(size) _SAL1_1_Source_(_Pre_valid_bytecap_c_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__bytecap_c_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_valid_bytecap_c_(size) _SAL1_1_Source_(_Pre_opt_valid_bytecap_c_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__bytecap_c_impl(size)) _Pre_valid_impl_) + +#define _Pre_valid_cap_x_(size) _SAL1_1_Source_(_Pre_valid_cap_x_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_x_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_valid_cap_x_(size) _SAL1_1_Source_(_Pre_opt_valid_cap_x_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_x_impl(size)) _Pre_valid_impl_) +#define _Pre_valid_bytecap_x_(size) _SAL1_1_Source_(_Pre_valid_bytecap_x_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__bytecap_x_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_valid_bytecap_x_(size) _SAL1_1_Source_(_Pre_opt_valid_bytecap_x_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__bytecap_x_impl(size)) _Pre_valid_impl_) + +// e.g. 
void AppendCharRange( _Pre_count_(cchFrom) const char* rgFrom, size_t cchFrom, _Out_z_cap_(cchTo) char* szTo, size_t cchTo ); +// Valid buffer extent described by another parameter +#define _Pre_count_(size) _SAL1_1_Source_(_Pre_count_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__count_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_count_(size) _SAL1_1_Source_(_Pre_opt_count_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__count_impl(size)) _Pre_valid_impl_) +#define _Pre_bytecount_(size) _SAL1_1_Source_(_Pre_bytecount_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__bytecount_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_bytecount_(size) _SAL1_1_Source_(_Pre_opt_bytecount_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__bytecount_impl(size)) _Pre_valid_impl_) + +// Valid buffer extent described by a constant expression +#define _Pre_count_c_(size) _SAL1_1_Source_(_Pre_count_c_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__count_c_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_count_c_(size) _SAL1_1_Source_(_Pre_opt_count_c_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__count_c_impl(size)) _Pre_valid_impl_) +#define _Pre_bytecount_c_(size) _SAL1_1_Source_(_Pre_bytecount_c_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__bytecount_c_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_bytecount_c_(size) _SAL1_1_Source_(_Pre_opt_bytecount_c_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__bytecount_c_impl(size)) _Pre_valid_impl_) + +// Valid buffer extent described by a complex expression +#define _Pre_count_x_(size) _SAL1_1_Source_(_Pre_count_x_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__count_x_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_count_x_(size) _SAL1_1_Source_(_Pre_opt_count_x_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__count_x_impl(size)) _Pre_valid_impl_) +#define _Pre_bytecount_x_(size) _SAL1_1_Source_(_Pre_bytecount_x_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__bytecount_x_impl(size)) _Pre_valid_impl_) +#define _Pre_opt_bytecount_x_(size) _SAL1_1_Source_(_Pre_opt_bytecount_x_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__bytecount_x_impl(size)) _Pre_valid_impl_) + +// Valid buffer extent described by the difference to another pointer parameter +#define _Pre_ptrdiff_count_(ptr) _SAL1_1_Source_(_Pre_ptrdiff_count_, (ptr), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__count_x_impl(__ptrdiff(ptr))) _Pre_valid_impl_) +#define _Pre_opt_ptrdiff_count_(ptr) _SAL1_1_Source_(_Pre_opt_ptrdiff_count_, (ptr), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__count_x_impl(__ptrdiff(ptr))) _Pre_valid_impl_) + + +// char * strncpy(_Out_cap_(_Count) _Post_maybez_ char * _Dest, _In_z_ const char * _Source, _In_ size_t _Count) +// buffer maybe zero-terminated after the call +#define _Post_maybez_ _SAL1_1_Source_(_Post_maybez_, (), _Post1_impl_(__maybezterm_impl)) + +// e.g. SIZE_T HeapSize( _In_ HANDLE hHeap, DWORD dwFlags, _Pre_notnull_ _Post_bytecap_(return) LPCVOID lpMem ); +#define _Post_cap_(size) _SAL1_1_Source_(_Post_cap_, (size), _Post1_impl_(__cap_impl(size))) +#define _Post_bytecap_(size) _SAL1_1_Source_(_Post_bytecap_, (size), _Post1_impl_(__bytecap_impl(size))) + +// e.g. 
int strlen( _In_z_ _Post_count_(return+1) const char* sz ); +#define _Post_count_(size) _SAL1_1_Source_(_Post_count_, (size), _Post1_impl_(__count_impl(size)) _Post_valid_impl_) +#define _Post_bytecount_(size) _SAL1_1_Source_(_Post_bytecount_, (size), _Post1_impl_(__bytecount_impl(size)) _Post_valid_impl_) +#define _Post_count_c_(size) _SAL1_1_Source_(_Post_count_c_, (size), _Post1_impl_(__count_c_impl(size)) _Post_valid_impl_) +#define _Post_bytecount_c_(size) _SAL1_1_Source_(_Post_bytecount_c_, (size), _Post1_impl_(__bytecount_c_impl(size)) _Post_valid_impl_) +#define _Post_count_x_(size) _SAL1_1_Source_(_Post_count_x_, (size), _Post1_impl_(__count_x_impl(size)) _Post_valid_impl_) +#define _Post_bytecount_x_(size) _SAL1_1_Source_(_Post_bytecount_x_, (size), _Post1_impl_(__bytecount_x_impl(size)) _Post_valid_impl_) + +// e.g. size_t CopyStr( _In_z_ const char* szFrom, _Pre_cap_(cch) _Post_z_count_(return+1) char* szFrom, size_t cchFrom ); +#define _Post_z_count_(size) _SAL1_1_Source_(_Post_z_count_, (size), _Post2_impl_(__zterm_impl,__count_impl(size)) _Post_valid_impl_) +#define _Post_z_bytecount_(size) _SAL1_1_Source_(_Post_z_bytecount_, (size), _Post2_impl_(__zterm_impl,__bytecount_impl(size)) _Post_valid_impl_) +#define _Post_z_count_c_(size) _SAL1_1_Source_(_Post_z_count_c_, (size), _Post2_impl_(__zterm_impl,__count_c_impl(size)) _Post_valid_impl_) +#define _Post_z_bytecount_c_(size) _SAL1_1_Source_(_Post_z_bytecount_c_, (size), _Post2_impl_(__zterm_impl,__bytecount_c_impl(size)) _Post_valid_impl_) +#define _Post_z_count_x_(size) _SAL1_1_Source_(_Post_z_count_x_, (size), _Post2_impl_(__zterm_impl,__count_x_impl(size)) _Post_valid_impl_) +#define _Post_z_bytecount_x_(size) _SAL1_1_Source_(_Post_z_bytecount_x_, (size), _Post2_impl_(__zterm_impl,__bytecount_x_impl(size)) _Post_valid_impl_) + +// +// _Prepost_ --- +// +// describing conditions that hold before and after the function call + +#define _Prepost_opt_z_ _SAL1_1_Source_(_Prepost_opt_z_, (), _Pre_opt_z_ _Post_z_) + +#define _Prepost_count_(size) _SAL1_1_Source_(_Prepost_count_, (size), _Pre_count_(size) _Post_count_(size)) +#define _Prepost_opt_count_(size) _SAL1_1_Source_(_Prepost_opt_count_, (size), _Pre_opt_count_(size) _Post_count_(size)) +#define _Prepost_bytecount_(size) _SAL1_1_Source_(_Prepost_bytecount_, (size), _Pre_bytecount_(size) _Post_bytecount_(size)) +#define _Prepost_opt_bytecount_(size) _SAL1_1_Source_(_Prepost_opt_bytecount_, (size), _Pre_opt_bytecount_(size) _Post_bytecount_(size)) +#define _Prepost_count_c_(size) _SAL1_1_Source_(_Prepost_count_c_, (size), _Pre_count_c_(size) _Post_count_c_(size)) +#define _Prepost_opt_count_c_(size) _SAL1_1_Source_(_Prepost_opt_count_c_, (size), _Pre_opt_count_c_(size) _Post_count_c_(size)) +#define _Prepost_bytecount_c_(size) _SAL1_1_Source_(_Prepost_bytecount_c_, (size), _Pre_bytecount_c_(size) _Post_bytecount_c_(size)) +#define _Prepost_opt_bytecount_c_(size) _SAL1_1_Source_(_Prepost_opt_bytecount_c_, (size), _Pre_opt_bytecount_c_(size) _Post_bytecount_c_(size)) +#define _Prepost_count_x_(size) _SAL1_1_Source_(_Prepost_count_x_, (size), _Pre_count_x_(size) _Post_count_x_(size)) +#define _Prepost_opt_count_x_(size) _SAL1_1_Source_(_Prepost_opt_count_x_, (size), _Pre_opt_count_x_(size) _Post_count_x_(size)) +#define _Prepost_bytecount_x_(size) _SAL1_1_Source_(_Prepost_bytecount_x_, (size), _Pre_bytecount_x_(size) _Post_bytecount_x_(size)) +#define _Prepost_opt_bytecount_x_(size) _SAL1_1_Source_(_Prepost_opt_bytecount_x_, (size), _Pre_opt_bytecount_x_(size) 
_Post_bytecount_x_(size)) + +#define _Prepost_valid_ _SAL1_1_Source_(_Prepost_valid_, (), _Pre_valid_ _Post_valid_) +#define _Prepost_opt_valid_ _SAL1_1_Source_(_Prepost_opt_valid_, (), _Pre_opt_valid_ _Post_valid_) + +// +// _Deref_ --- +// +// short version for _Deref_pre_ _Deref_post_ +// describing conditions for array elements or dereferenced pointer parameters that hold before and after the call + +#define _Deref_prepost_z_ _SAL1_1_Source_(_Deref_prepost_z_, (), _Deref_pre_z_ _Deref_post_z_) +#define _Deref_prepost_opt_z_ _SAL1_1_Source_(_Deref_prepost_opt_z_, (), _Deref_pre_opt_z_ _Deref_post_opt_z_) + +#define _Deref_prepost_cap_(size) _SAL1_1_Source_(_Deref_prepost_cap_, (size), _Deref_pre_cap_(size) _Deref_post_cap_(size)) +#define _Deref_prepost_opt_cap_(size) _SAL1_1_Source_(_Deref_prepost_opt_cap_, (size), _Deref_pre_opt_cap_(size) _Deref_post_opt_cap_(size)) +#define _Deref_prepost_bytecap_(size) _SAL1_1_Source_(_Deref_prepost_bytecap_, (size), _Deref_pre_bytecap_(size) _Deref_post_bytecap_(size)) +#define _Deref_prepost_opt_bytecap_(size) _SAL1_1_Source_(_Deref_prepost_opt_bytecap_, (size), _Deref_pre_opt_bytecap_(size) _Deref_post_opt_bytecap_(size)) + +#define _Deref_prepost_cap_x_(size) _SAL1_1_Source_(_Deref_prepost_cap_x_, (size), _Deref_pre_cap_x_(size) _Deref_post_cap_x_(size)) +#define _Deref_prepost_opt_cap_x_(size) _SAL1_1_Source_(_Deref_prepost_opt_cap_x_, (size), _Deref_pre_opt_cap_x_(size) _Deref_post_opt_cap_x_(size)) +#define _Deref_prepost_bytecap_x_(size) _SAL1_1_Source_(_Deref_prepost_bytecap_x_, (size), _Deref_pre_bytecap_x_(size) _Deref_post_bytecap_x_(size)) +#define _Deref_prepost_opt_bytecap_x_(size) _SAL1_1_Source_(_Deref_prepost_opt_bytecap_x_, (size), _Deref_pre_opt_bytecap_x_(size) _Deref_post_opt_bytecap_x_(size)) + +#define _Deref_prepost_z_cap_(size) _SAL1_1_Source_(_Deref_prepost_z_cap_, (size), _Deref_pre_z_cap_(size) _Deref_post_z_cap_(size)) +#define _Deref_prepost_opt_z_cap_(size) _SAL1_1_Source_(_Deref_prepost_opt_z_cap_, (size), _Deref_pre_opt_z_cap_(size) _Deref_post_opt_z_cap_(size)) +#define _Deref_prepost_z_bytecap_(size) _SAL1_1_Source_(_Deref_prepost_z_bytecap_, (size), _Deref_pre_z_bytecap_(size) _Deref_post_z_bytecap_(size)) +#define _Deref_prepost_opt_z_bytecap_(size) _SAL1_1_Source_(_Deref_prepost_opt_z_bytecap_, (size), _Deref_pre_opt_z_bytecap_(size) _Deref_post_opt_z_bytecap_(size)) + +#define _Deref_prepost_valid_cap_(size) _SAL1_1_Source_(_Deref_prepost_valid_cap_, (size), _Deref_pre_valid_cap_(size) _Deref_post_valid_cap_(size)) +#define _Deref_prepost_opt_valid_cap_(size) _SAL1_1_Source_(_Deref_prepost_opt_valid_cap_, (size), _Deref_pre_opt_valid_cap_(size) _Deref_post_opt_valid_cap_(size)) +#define _Deref_prepost_valid_bytecap_(size) _SAL1_1_Source_(_Deref_prepost_valid_bytecap_, (size), _Deref_pre_valid_bytecap_(size) _Deref_post_valid_bytecap_(size)) +#define _Deref_prepost_opt_valid_bytecap_(size) _SAL1_1_Source_(_Deref_prepost_opt_valid_bytecap_, (size), _Deref_pre_opt_valid_bytecap_(size) _Deref_post_opt_valid_bytecap_(size)) + +#define _Deref_prepost_valid_cap_x_(size) _SAL1_1_Source_(_Deref_prepost_valid_cap_x_, (size), _Deref_pre_valid_cap_x_(size) _Deref_post_valid_cap_x_(size)) +#define _Deref_prepost_opt_valid_cap_x_(size) _SAL1_1_Source_(_Deref_prepost_opt_valid_cap_x_, (size), _Deref_pre_opt_valid_cap_x_(size) _Deref_post_opt_valid_cap_x_(size)) +#define _Deref_prepost_valid_bytecap_x_(size) _SAL1_1_Source_(_Deref_prepost_valid_bytecap_x_, (size), _Deref_pre_valid_bytecap_x_(size) 
_Deref_post_valid_bytecap_x_(size)) +#define _Deref_prepost_opt_valid_bytecap_x_(size) _SAL1_1_Source_(_Deref_prepost_opt_valid_bytecap_x_, (size), _Deref_pre_opt_valid_bytecap_x_(size) _Deref_post_opt_valid_bytecap_x_(size)) + +#define _Deref_prepost_count_(size) _SAL1_1_Source_(_Deref_prepost_count_, (size), _Deref_pre_count_(size) _Deref_post_count_(size)) +#define _Deref_prepost_opt_count_(size) _SAL1_1_Source_(_Deref_prepost_opt_count_, (size), _Deref_pre_opt_count_(size) _Deref_post_opt_count_(size)) +#define _Deref_prepost_bytecount_(size) _SAL1_1_Source_(_Deref_prepost_bytecount_, (size), _Deref_pre_bytecount_(size) _Deref_post_bytecount_(size)) +#define _Deref_prepost_opt_bytecount_(size) _SAL1_1_Source_(_Deref_prepost_opt_bytecount_, (size), _Deref_pre_opt_bytecount_(size) _Deref_post_opt_bytecount_(size)) + +#define _Deref_prepost_count_x_(size) _SAL1_1_Source_(_Deref_prepost_count_x_, (size), _Deref_pre_count_x_(size) _Deref_post_count_x_(size)) +#define _Deref_prepost_opt_count_x_(size) _SAL1_1_Source_(_Deref_prepost_opt_count_x_, (size), _Deref_pre_opt_count_x_(size) _Deref_post_opt_count_x_(size)) +#define _Deref_prepost_bytecount_x_(size) _SAL1_1_Source_(_Deref_prepost_bytecount_x_, (size), _Deref_pre_bytecount_x_(size) _Deref_post_bytecount_x_(size)) +#define _Deref_prepost_opt_bytecount_x_(size) _SAL1_1_Source_(_Deref_prepost_opt_bytecount_x_, (size), _Deref_pre_opt_bytecount_x_(size) _Deref_post_opt_bytecount_x_(size)) + +#define _Deref_prepost_valid_ _SAL1_1_Source_(_Deref_prepost_valid_, (), _Deref_pre_valid_ _Deref_post_valid_) +#define _Deref_prepost_opt_valid_ _SAL1_1_Source_(_Deref_prepost_opt_valid_, (), _Deref_pre_opt_valid_ _Deref_post_opt_valid_) + +// +// _Deref_ +// +// used with references to arrays + +#define _Deref_out_z_cap_c_(size) _SAL1_1_Source_(_Deref_out_z_cap_c_, (size), _Deref_pre_cap_c_(size) _Deref_post_z_) +#define _Deref_inout_z_cap_c_(size) _SAL1_1_Source_(_Deref_inout_z_cap_c_, (size), _Deref_pre_z_cap_c_(size) _Deref_post_z_) +#define _Deref_out_z_bytecap_c_(size) _SAL1_1_Source_(_Deref_out_z_bytecap_c_, (size), _Deref_pre_bytecap_c_(size) _Deref_post_z_) +#define _Deref_inout_z_bytecap_c_(size) _SAL1_1_Source_(_Deref_inout_z_bytecap_c_, (size), _Deref_pre_z_bytecap_c_(size) _Deref_post_z_) +#define _Deref_inout_z_ _SAL1_1_Source_(_Deref_inout_z_, (), _Deref_prepost_z_) + +// #pragma endregion Input Buffer SAL 1 compatibility macros + + +//============================================================================ +// Implementation Layer: +//============================================================================ + + +// Naming conventions: +// A symbol the begins with _SA_ is for the machinery of creating any +// annotations; many of those come from sourceannotations.h in the case +// of attributes. + +// A symbol that ends with _impl is the very lowest level macro. It is +// not required to be a legal standalone annotation, and in the case +// of attribute annotations, usually is not. (In the case of some declspec +// annotations, it might be, but it should not be assumed so.) Those +// symols will be used in the _PreN..., _PostN... and _RetN... annotations +// to build up more complete annotations. + +// A symbol ending in _impl_ is reserved to the implementation as well, +// but it does form a complete annotation; usually they are used to build +// up even higher level annotations. 
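+
+// Illustrative sketch (editor's example, not part of the upstream sal.h header;
+// a rough composition only): under _USE_ATTRIBUTES_FOR_SAL with _MSC_VER >= 1610,
+// a public annotation such as
+//
+//     void FillBuffer( _Pre_cap_(cch) wchar_t* pBuf, size_t cch );
+//
+// layers together, omitting the _SAL1_1_Source_ bookkeeping wrapper, roughly as
+//
+//     _Pre_cap_(cch)
+//       -> _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_impl(cch))
+//       -> [SA_Pre(Null=SA_No,Notref=1)] [SA_Pre(WritableElements="\n" "cch")]
+//
+// i.e. the *_impl_ machinery defined in this layer supplies the attribute
+// plumbing, while the public macro names the caller-visible contract.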
+ + +#if _USE_ATTRIBUTES_FOR_SAL || _USE_DECLSPECS_FOR_SAL // [ +// Sharable "_impl" macros: these can be shared between the various annotation +// forms but are part of the implementation of the macros. These are collected +// here to assure that only necessary differences in the annotations +// exist. + +#define _Always_impl_(annos) _Group_(annos _SAL_nop_impl_) _On_failure_impl_(annos _SAL_nop_impl_) +#define _Bound_impl_ _SA_annotes0(SAL_bound) +#define _Field_range_impl_(min,max) _Range_impl_(min,max) +#define _Literal_impl_ _SA_annotes1(SAL_constant, __yes) +#define _Maybenull_impl_ _SA_annotes1(SAL_null, __maybe) +#define _Maybevalid_impl_ _SA_annotes1(SAL_valid, __maybe) +#define _Must_inspect_impl_ _Post_impl_ _SA_annotes0(SAL_mustInspect) +#define _Notliteral_impl_ _SA_annotes1(SAL_constant, __no) +#define _Notnull_impl_ _SA_annotes1(SAL_null, __no) +#define _Notvalid_impl_ _SA_annotes1(SAL_valid, __no) +#define _NullNull_terminated_impl_ _Group_(_SA_annotes1(SAL_nullTerminated, __yes) _SA_annotes1(SAL_readableTo,inexpressibleCount("NullNull terminated string"))) +#define _Null_impl_ _SA_annotes1(SAL_null, __yes) +#define _Null_terminated_impl_ _SA_annotes1(SAL_nullTerminated, __yes) +#define _Out_impl_ _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_c_one_notref_impl) _Post_valid_impl_ +#define _Out_opt_impl_ _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_c_one_notref_impl) _Post_valid_impl_ +#define _Points_to_data_impl_ _At_(*_Curr_, _SA_annotes1(SAL_mayBePointer, __no)) +#define _Post_satisfies_impl_(cond) _Post_impl_ _Satisfies_impl_(cond) +#define _Post_valid_impl_ _Post1_impl_(__valid_impl) +#define _Pre_satisfies_impl_(cond) _Pre_impl_ _Satisfies_impl_(cond) +#define _Pre_valid_impl_ _Pre1_impl_(__valid_impl) +#define _Range_impl_(min,max) _SA_annotes2(SAL_range, min, max) +#define _Readable_bytes_impl_(size) _SA_annotes1(SAL_readableTo, byteCount(size)) +#define _Readable_elements_impl_(size) _SA_annotes1(SAL_readableTo, elementCount(size)) +#define _Ret_valid_impl_ _Ret1_impl_(__valid_impl) +#define _Satisfies_impl_(cond) _SA_annotes1(SAL_satisfies, cond) +#define _Valid_impl_ _SA_annotes1(SAL_valid, __yes) +#define _Writable_bytes_impl_(size) _SA_annotes1(SAL_writableTo, byteCount(size)) +#define _Writable_elements_impl_(size) _SA_annotes1(SAL_writableTo, elementCount(size)) + +#define _In_range_impl_(min,max) _Pre_impl_ _Range_impl_(min,max) +#define _Out_range_impl_(min,max) _Post_impl_ _Range_impl_(min,max) +#define _Ret_range_impl_(min,max) _Post_impl_ _Range_impl_(min,max) +#define _Deref_in_range_impl_(min,max) _Deref_pre_impl_ _Range_impl_(min,max) +#define _Deref_out_range_impl_(min,max) _Deref_post_impl_ _Range_impl_(min,max) +#define _Deref_ret_range_impl_(min,max) _Deref_post_impl_ _Range_impl_(min,max) + +#define _Deref_pre_impl_ _Pre_impl_ _Notref_impl_ _Deref_impl_ +#define _Deref_post_impl_ _Post_impl_ _Notref_impl_ _Deref_impl_ + +// The following are for the implementation machinery, and are not +// suitable for annotating general code. +// We're tying to phase this out, someday. The parser quotes the param. +#define __AuToQuOtE _SA_annotes0(SAL_AuToQuOtE) + +// Normally the parser does some simple type checking of annotation params, +// defer that check to the plugin. 
+#define __deferTypecheck _SA_annotes0(SAL_deferTypecheck) + +#define _SA_SPECSTRIZE( x ) #x +#define _SAL_nop_impl_ /* nothing */ +#define __nop_impl(x) x +#endif + + +#if _USE_ATTRIBUTES_FOR_SAL // [ + +// Using attributes for sal + +#include "codeanalysis\sourceannotations.h" + + +#define _SA_annotes0(n) [SAL_annotes(Name=#n)] +#define _SA_annotes1(n,pp1) [SAL_annotes(Name=#n, p1=_SA_SPECSTRIZE(pp1))] +#define _SA_annotes2(n,pp1,pp2) [SAL_annotes(Name=#n, p1=_SA_SPECSTRIZE(pp1), p2=_SA_SPECSTRIZE(pp2))] +#define _SA_annotes3(n,pp1,pp2,pp3) [SAL_annotes(Name=#n, p1=_SA_SPECSTRIZE(pp1), p2=_SA_SPECSTRIZE(pp2), p3=_SA_SPECSTRIZE(pp3))] + +#define _Pre_impl_ [SAL_pre] +#define _Post_impl_ [SAL_post] +#define _Deref_impl_ [SAL_deref] +#define _Notref_impl_ [SAL_notref] + + +// Declare a function to be an annotation or primop (respectively). +// Done this way so that they don't appear in the regular compiler's +// namespace. +#define __ANNOTATION(fun) _SA_annotes0(SAL_annotation) void __SA_##fun; +#define __PRIMOP(type, fun) _SA_annotes0(SAL_primop) type __SA_##fun; +#define __QUALIFIER(fun) _SA_annotes0(SAL_qualifier) void __SA_##fun; + +// Benign declspec needed here for WindowsPREfast +#define __In_impl_ [SA_Pre(Valid=SA_Yes)] [SA_Pre(Deref=1, Notref=1, Access=SA_Read)] __declspec("SAL_pre SAL_valid") + +#elif _USE_DECLSPECS_FOR_SAL // ][ + +// Using declspecs for sal + +#define _SA_annotes0(n) __declspec(#n) +#define _SA_annotes1(n,pp1) __declspec(#n "(" _SA_SPECSTRIZE(pp1) ")" ) +#define _SA_annotes2(n,pp1,pp2) __declspec(#n "(" _SA_SPECSTRIZE(pp1) "," _SA_SPECSTRIZE(pp2) ")") +#define _SA_annotes3(n,pp1,pp2,pp3) __declspec(#n "(" _SA_SPECSTRIZE(pp1) "," _SA_SPECSTRIZE(pp2) "," _SA_SPECSTRIZE(pp3) ")") + +#define _Pre_impl_ _SA_annotes0(SAL_pre) +#define _Post_impl_ _SA_annotes0(SAL_post) +#define _Deref_impl_ _SA_annotes0(SAL_deref) +#define _Notref_impl_ _SA_annotes0(SAL_notref) + +// Declare a function to be an annotation or primop (respectively). +// Done this way so that they don't appear in the regular compiler's +// namespace. +#define __ANNOTATION(fun) _SA_annotes0(SAL_annotation) void __SA_##fun + +#define __PRIMOP(type, fun) _SA_annotes0(SAL_primop) type __SA_##fun + +#define __QUALIFIER(fun) _SA_annotes0(SAL_qualifier) void __SA_##fun; + +#define __In_impl_ _Pre_impl_ _SA_annotes0(SAL_valid) _Pre_impl_ _Deref_impl_ _Notref_impl_ _SA_annotes0(SAL_readonly) + +#else // ][ + +// Using "nothing" for sal + +#define _SA_annotes0(n) +#define _SA_annotes1(n,pp1) +#define _SA_annotes2(n,pp1,pp2) +#define _SA_annotes3(n,pp1,pp2,pp3) + +#define __ANNOTATION(fun) +#define __PRIMOP(type, fun) +#define __QUALIFIER(type, fun) + +#endif // ] + +#if _USE_ATTRIBUTES_FOR_SAL || _USE_DECLSPECS_FOR_SAL // [ + +// Declare annotations that need to be declared. +__ANNOTATION(SAL_useHeader(void)); +__ANNOTATION(SAL_bound(void)); +__ANNOTATION(SAL_allocator(void)); //??? 
resolve with PFD +__ANNOTATION(SAL_file_parser(__AuToQuOtE __In_impl_ char *, __In_impl_ char *)); +__ANNOTATION(SAL_source_code_content(__In_impl_ char *)); +__ANNOTATION(SAL_analysisHint(__AuToQuOtE __In_impl_ char *)); +__ANNOTATION(SAL_untrusted_data_source(__AuToQuOtE __In_impl_ char *)); +__ANNOTATION(SAL_untrusted_data_source_this(__AuToQuOtE __In_impl_ char *)); +__ANNOTATION(SAL_validated(__AuToQuOtE __In_impl_ char *)); +__ANNOTATION(SAL_validated_this(__AuToQuOtE __In_impl_ char *)); +__ANNOTATION(SAL_encoded(void)); +__ANNOTATION(SAL_adt(__AuToQuOtE __In_impl_ char *, __AuToQuOtE __In_impl_ char *)); +__ANNOTATION(SAL_add_adt_property(__AuToQuOtE __In_impl_ char *, __AuToQuOtE __In_impl_ char *)); +__ANNOTATION(SAL_remove_adt_property(__AuToQuOtE __In_impl_ char *, __AuToQuOtE __In_impl_ char *)); +__ANNOTATION(SAL_transfer_adt_property_from(__AuToQuOtE __In_impl_ char *)); +__ANNOTATION(SAL_post_type(__AuToQuOtE __In_impl_ char *)); +__ANNOTATION(SAL_volatile(void)); +__ANNOTATION(SAL_nonvolatile(void)); +__ANNOTATION(SAL_entrypoint(__AuToQuOtE __In_impl_ char *, __AuToQuOtE __In_impl_ char *)); +__ANNOTATION(SAL_blocksOn(__In_impl_ void*)); +__ANNOTATION(SAL_mustInspect(void)); + +// Only appears in model files, but needs to be declared. +__ANNOTATION(SAL_TypeName(__AuToQuOtE __In_impl_ char *)); + +// To be declared well-known soon. +__ANNOTATION(SAL_interlocked(void);) + +#pragma warning (suppress: 28227 28241) +__ANNOTATION(SAL_name(__In_impl_ char *, __In_impl_ char *, __In_impl_ char *);) + +__PRIMOP(char *, _Macro_value_(__In_impl_ char *)); +__PRIMOP(int, _Macro_defined_(__In_impl_ char *)); +__PRIMOP(char *, _Strstr_(__In_impl_ char *, __In_impl_ char *)); + +#endif // ] + +#if _USE_ATTRIBUTES_FOR_SAL // [ + +#define _Check_return_impl_ [SA_Post(MustCheck=SA_Yes)] + +#define _Success_impl_(expr) [SA_Success(Condition=#expr)] +#define _On_failure_impl_(annos) [SAL_context(p1="SAL_failed")] _Group_(_Post_impl_ _Group_(annos _SAL_nop_impl_)) + +#define _Printf_format_string_impl_ [SA_FormatString(Style="printf")] +#define _Scanf_format_string_impl_ [SA_FormatString(Style="scanf")] +#define _Scanf_s_format_string_impl_ [SA_FormatString(Style="scanf_s")] + +#define _In_bound_impl_ [SA_PreBound(Deref=0)] +#define _Out_bound_impl_ [SA_PostBound(Deref=0)] +#define _Ret_bound_impl_ [SA_PostBound(Deref=0)] +#define _Deref_in_bound_impl_ [SA_PreBound(Deref=1)] +#define _Deref_out_bound_impl_ [SA_PostBound(Deref=1)] +#define _Deref_ret_bound_impl_ [SA_PostBound(Deref=1)] + +#define __valid_impl Valid=SA_Yes +#define __maybevalid_impl Valid=SA_Maybe +#define __notvalid_impl Valid=SA_No + +#define __null_impl Null=SA_Yes +#define __maybenull_impl Null=SA_Maybe +#define __notnull_impl Null=SA_No + +#define __null_impl_notref Null=SA_Yes,Notref=1 +#define __maybenull_impl_notref Null=SA_Maybe,Notref=1 +#define __notnull_impl_notref Null=SA_No,Notref=1 + +#define __zterm_impl NullTerminated=SA_Yes +#define __maybezterm_impl NullTerminated=SA_Maybe +#define __maybzterm_impl NullTerminated=SA_Maybe +#define __notzterm_impl NullTerminated=SA_No + +#define __readaccess_impl Access=SA_Read +#define __writeaccess_impl Access=SA_Write +#define __allaccess_impl Access=SA_ReadWrite + +#define __readaccess_impl_notref Access=SA_Read,Notref=1 +#define __writeaccess_impl_notref Access=SA_Write,Notref=1 +#define __allaccess_impl_notref Access=SA_ReadWrite,Notref=1 + +#if _MSC_VER >= 1610 /*IFSTRIP=IGN*/ // [ + +// For SAL2, we need to expect general expressions. 
+ +#define __cap_impl(size) WritableElements="\n"#size +#define __bytecap_impl(size) WritableBytes="\n"#size +#define __bytecount_impl(size) ValidBytes="\n"#size +#define __count_impl(size) ValidElements="\n"#size + +#else // ][ + +#define __cap_impl(size) WritableElements=#size +#define __bytecap_impl(size) WritableBytes=#size +#define __bytecount_impl(size) ValidBytes=#size +#define __count_impl(size) ValidElements=#size + +#endif // ] + +#define __cap_c_impl(size) WritableElementsConst=size +#define __cap_c_one_notref_impl WritableElementsConst=1,Notref=1 +#define __cap_for_impl(param) WritableElementsLength=#param +#define __cap_x_impl(size) WritableElements="\n@"#size + +#define __bytecap_c_impl(size) WritableBytesConst=size +#define __bytecap_x_impl(size) WritableBytes="\n@"#size + +#define __mult_impl(mult,size) __cap_impl((mult)*(size)) + +#define __count_c_impl(size) ValidElementsConst=size +#define __count_x_impl(size) ValidElements="\n@"#size + +#define __bytecount_c_impl(size) ValidBytesConst=size +#define __bytecount_x_impl(size) ValidBytes="\n@"#size + + +#define _At_impl_(target, annos) [SAL_at(p1=#target)] _Group_(annos) +#define _At_buffer_impl_(target, iter, bound, annos) [SAL_at_buffer(p1=#target, p2=#iter, p3=#bound)] _Group_(annos) +#define _When_impl_(expr, annos) [SAL_when(p1=#expr)] _Group_(annos) + +#define _Group_impl_(annos) [SAL_begin] annos [SAL_end] +#define _GrouP_impl_(annos) [SAL_BEGIN] annos [SAL_END] + +#define _Use_decl_anno_impl_ _SA_annotes0(SAL_useHeader) // this is a special case! + +#define _Pre1_impl_(p1) [SA_Pre(p1)] +#define _Pre2_impl_(p1,p2) [SA_Pre(p1,p2)] +#define _Pre3_impl_(p1,p2,p3) [SA_Pre(p1,p2,p3)] + +#define _Post1_impl_(p1) [SA_Post(p1)] +#define _Post2_impl_(p1,p2) [SA_Post(p1,p2)] +#define _Post3_impl_(p1,p2,p3) [SA_Post(p1,p2,p3)] + +#define _Ret1_impl_(p1) [SA_Post(p1)] +#define _Ret2_impl_(p1,p2) [SA_Post(p1,p2)] +#define _Ret3_impl_(p1,p2,p3) [SA_Post(p1,p2,p3)] + +#define _Deref_pre1_impl_(p1) [SA_Pre(Deref=1,p1)] +#define _Deref_pre2_impl_(p1,p2) [SA_Pre(Deref=1,p1,p2)] +#define _Deref_pre3_impl_(p1,p2,p3) [SA_Pre(Deref=1,p1,p2,p3)] + + +#define _Deref_post1_impl_(p1) [SA_Post(Deref=1,p1)] +#define _Deref_post2_impl_(p1,p2) [SA_Post(Deref=1,p1,p2)] +#define _Deref_post3_impl_(p1,p2,p3) [SA_Post(Deref=1,p1,p2,p3)] + +#define _Deref_ret1_impl_(p1) [SA_Post(Deref=1,p1)] +#define _Deref_ret2_impl_(p1,p2) [SA_Post(Deref=1,p1,p2)] +#define _Deref_ret3_impl_(p1,p2,p3) [SA_Post(Deref=1,p1,p2,p3)] + +#define _Deref2_pre1_impl_(p1) [SA_Pre(Deref=2,Notref=1,p1)] +#define _Deref2_post1_impl_(p1) [SA_Post(Deref=2,Notref=1,p1)] +#define _Deref2_ret1_impl_(p1) [SA_Post(Deref=2,Notref=1,p1)] + +// Obsolete -- may be needed for transition to attributes. 
+#define __inner_typefix(ctype) [SAL_typefix(p1=_SA_SPECSTRIZE(ctype))] +#define __inner_exceptthat [SAL_except] + + +#elif _USE_DECLSPECS_FOR_SAL // ][ + +#define _Check_return_impl_ __post _SA_annotes0(SAL_checkReturn) + +#define _Success_impl_(expr) _SA_annotes1(SAL_success, expr) +#define _On_failure_impl_(annos) _SA_annotes1(SAL_context, SAL_failed) _Group_(_Post_impl_ _Group_(_SAL_nop_impl_ annos)) + +#define _Printf_format_string_impl_ _SA_annotes1(SAL_IsFormatString, "printf") +#define _Scanf_format_string_impl_ _SA_annotes1(SAL_IsFormatString, "scanf") +#define _Scanf_s_format_string_impl_ _SA_annotes1(SAL_IsFormatString, "scanf_s") + +#define _In_bound_impl_ _Pre_impl_ _Bound_impl_ +#define _Out_bound_impl_ _Post_impl_ _Bound_impl_ +#define _Ret_bound_impl_ _Post_impl_ _Bound_impl_ +#define _Deref_in_bound_impl_ _Deref_pre_impl_ _Bound_impl_ +#define _Deref_out_bound_impl_ _Deref_post_impl_ _Bound_impl_ +#define _Deref_ret_bound_impl_ _Deref_post_impl_ _Bound_impl_ + + +#define __null_impl _SA_annotes0(SAL_null) // _SA_annotes1(SAL_null, __yes) +#define __notnull_impl _SA_annotes0(SAL_notnull) // _SA_annotes1(SAL_null, __no) +#define __maybenull_impl _SA_annotes0(SAL_maybenull) // _SA_annotes1(SAL_null, __maybe) + +#define __valid_impl _SA_annotes0(SAL_valid) // _SA_annotes1(SAL_valid, __yes) +#define __notvalid_impl _SA_annotes0(SAL_notvalid) // _SA_annotes1(SAL_valid, __no) +#define __maybevalid_impl _SA_annotes0(SAL_maybevalid) // _SA_annotes1(SAL_valid, __maybe) + +#define __null_impl_notref _Notref_ _Null_impl_ +#define __maybenull_impl_notref _Notref_ _Maybenull_impl_ +#define __notnull_impl_notref _Notref_ _Notnull_impl_ + +#define __zterm_impl _SA_annotes1(SAL_nullTerminated, __yes) +#define __maybezterm_impl _SA_annotes1(SAL_nullTerminated, __maybe) +#define __maybzterm_impl _SA_annotes1(SAL_nullTerminated, __maybe) +#define __notzterm_impl _SA_annotes1(SAL_nullTerminated, __no) + +#define __readaccess_impl _SA_annotes1(SAL_access, 0x1) +#define __writeaccess_impl _SA_annotes1(SAL_access, 0x2) +#define __allaccess_impl _SA_annotes1(SAL_access, 0x3) + +#define __readaccess_impl_notref _Notref_ _SA_annotes1(SAL_access, 0x1) +#define __writeaccess_impl_notref _Notref_ _SA_annotes1(SAL_access, 0x2) +#define __allaccess_impl_notref _Notref_ _SA_annotes1(SAL_access, 0x3) + +#define __cap_impl(size) _SA_annotes1(SAL_writableTo,elementCount(size)) +#define __cap_c_impl(size) _SA_annotes1(SAL_writableTo,elementCount(size)) +#define __cap_c_one_notref_impl _Notref_ _SA_annotes1(SAL_writableTo,elementCount(1)) +#define __cap_for_impl(param) _SA_annotes1(SAL_writableTo,inexpressibleCount(sizeof(param))) +#define __cap_x_impl(size) _SA_annotes1(SAL_writableTo,inexpressibleCount(#size)) + +#define __bytecap_impl(size) _SA_annotes1(SAL_writableTo,byteCount(size)) +#define __bytecap_c_impl(size) _SA_annotes1(SAL_writableTo,byteCount(size)) +#define __bytecap_x_impl(size) _SA_annotes1(SAL_writableTo,inexpressibleCount(#size)) + +#define __mult_impl(mult,size) _SA_annotes1(SAL_writableTo,(mult)*(size)) + +#define __count_impl(size) _SA_annotes1(SAL_readableTo,elementCount(size)) +#define __count_c_impl(size) _SA_annotes1(SAL_readableTo,elementCount(size)) +#define __count_x_impl(size) _SA_annotes1(SAL_readableTo,inexpressibleCount(#size)) + +#define __bytecount_impl(size) _SA_annotes1(SAL_readableTo,byteCount(size)) +#define __bytecount_c_impl(size) _SA_annotes1(SAL_readableTo,byteCount(size)) +#define __bytecount_x_impl(size) _SA_annotes1(SAL_readableTo,inexpressibleCount(#size)) + 
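+// Illustrative example (editor's sketch, not from the original header): in the
+// declspec form, an extent macro such as
+//
+//     __count_impl(cch)
+//
+// expands through _SA_annotes1 into adjacent string literals,
+//
+//     __declspec("SAL_readableTo" "(" "elementCount(cch)" ")")
+//
+// which the compiler concatenates to __declspec("SAL_readableTo(elementCount(cch))"),
+// so the whole annotation travels as a single declspec string.
+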
+#define _At_impl_(target, annos) _SA_annotes0(SAL_at(target)) _Group_(annos) +#define _At_buffer_impl_(target, iter, bound, annos) _SA_annotes3(SAL_at_buffer, target, iter, bound) _Group_(annos) +#define _Group_impl_(annos) _SA_annotes0(SAL_begin) annos _SA_annotes0(SAL_end) +#define _GrouP_impl_(annos) _SA_annotes0(SAL_BEGIN) annos _SA_annotes0(SAL_END) +#define _When_impl_(expr, annos) _SA_annotes0(SAL_when(expr)) _Group_(annos) + +#define _Use_decl_anno_impl_ __declspec("SAL_useHeader()") // this is a special case! + +#define _Pre1_impl_(p1) _Pre_impl_ p1 +#define _Pre2_impl_(p1,p2) _Pre_impl_ p1 _Pre_impl_ p2 +#define _Pre3_impl_(p1,p2,p3) _Pre_impl_ p1 _Pre_impl_ p2 _Pre_impl_ p3 + +#define _Post1_impl_(p1) _Post_impl_ p1 +#define _Post2_impl_(p1,p2) _Post_impl_ p1 _Post_impl_ p2 +#define _Post3_impl_(p1,p2,p3) _Post_impl_ p1 _Post_impl_ p2 _Post_impl_ p3 + +#define _Ret1_impl_(p1) _Post_impl_ p1 +#define _Ret2_impl_(p1,p2) _Post_impl_ p1 _Post_impl_ p2 +#define _Ret3_impl_(p1,p2,p3) _Post_impl_ p1 _Post_impl_ p2 _Post_impl_ p3 + +#define _Deref_pre1_impl_(p1) _Deref_pre_impl_ p1 +#define _Deref_pre2_impl_(p1,p2) _Deref_pre_impl_ p1 _Deref_pre_impl_ p2 +#define _Deref_pre3_impl_(p1,p2,p3) _Deref_pre_impl_ p1 _Deref_pre_impl_ p2 _Deref_pre_impl_ p3 + +#define _Deref_post1_impl_(p1) _Deref_post_impl_ p1 +#define _Deref_post2_impl_(p1,p2) _Deref_post_impl_ p1 _Deref_post_impl_ p2 +#define _Deref_post3_impl_(p1,p2,p3) _Deref_post_impl_ p1 _Deref_post_impl_ p2 _Deref_post_impl_ p3 + +#define _Deref_ret1_impl_(p1) _Deref_post_impl_ p1 +#define _Deref_ret2_impl_(p1,p2) _Deref_post_impl_ p1 _Deref_post_impl_ p2 +#define _Deref_ret3_impl_(p1,p2,p3) _Deref_post_impl_ p1 _Deref_post_impl_ p2 _Deref_post_impl_ p3 + +#define _Deref2_pre1_impl_(p1) _Deref_pre_impl_ _Notref_impl_ _Deref_impl_ p1 +#define _Deref2_post1_impl_(p1) _Deref_post_impl_ _Notref_impl_ _Deref_impl_ p1 +#define _Deref2_ret1_impl_(p1) _Deref_post_impl_ _Notref_impl_ _Deref_impl_ p1 + +#define __inner_typefix(ctype) _SA_annotes1(SAL_typefix, ctype) +#define __inner_exceptthat _SA_annotes0(SAL_except) + +#elif defined(_MSC_EXTENSIONS) && !defined( MIDL_PASS ) && !defined(__midl) && !defined(RC_INVOKED) && defined(_PFT_VER) && _MSC_VER >= 1400 /*IFSTRIP=IGN*/ // ][ + +// minimum attribute expansion for foreground build + +#pragma push_macro( "SA" ) +#pragma push_macro( "REPEATABLE" ) + +#ifdef __cplusplus // [ +#define SA( id ) id +#define REPEATABLE [repeatable] +#else // !__cplusplus // ][ +#define SA( id ) SA_##id +#define REPEATABLE +#endif // !__cplusplus // ] + +REPEATABLE +[source_annotation_attribute( SA( Parameter ) )] +struct __P_impl +{ +#ifdef __cplusplus // [ + __P_impl(); +#endif // ] + int __d_; +}; +typedef struct __P_impl __P_impl; + +REPEATABLE +[source_annotation_attribute( SA( ReturnValue ) )] +struct __R_impl +{ +#ifdef __cplusplus // [ + __R_impl(); +#endif // ] + int __d_; +}; +typedef struct __R_impl __R_impl; + +[source_annotation_attribute( SA( Method ) )] +struct __M_ +{ +#ifdef __cplusplus // [ + __M_(); +#endif // ] + int __d_; +}; +typedef struct __M_ __M_; + +[source_annotation_attribute( SA( All ) )] +struct __A_ +{ +#ifdef __cplusplus // [ + __A_(); +#endif // ] + int __d_; +}; +typedef struct __A_ __A_; + +[source_annotation_attribute( SA( Field ) )] +struct __F_ +{ +#ifdef __cplusplus // [ + __F_(); +#endif // ] + int __d_; +}; +typedef struct __F_ __F_; + +#pragma pop_macro( "REPEATABLE" ) +#pragma pop_macro( "SA" ) + + +#define _SAL_nop_impl_ + +#define _At_impl_(target, annos) [__A_(__d_=0)] 
+#define _At_buffer_impl_(target, iter, bound, annos) [__A_(__d_=0)] +#define _When_impl_(expr, annos) annos +#define _Group_impl_(annos) annos +#define _GrouP_impl_(annos) annos +#define _Use_decl_anno_impl_ [__M_(__d_=0)] + +#define _Points_to_data_impl_ [__P_impl(__d_=0)] +#define _Literal_impl_ [__P_impl(__d_=0)] +#define _Notliteral_impl_ [__P_impl(__d_=0)] + +#define _Pre_valid_impl_ [__P_impl(__d_=0)] +#define _Post_valid_impl_ [__P_impl(__d_=0)] +#define _Ret_valid_impl_ [__R_impl(__d_=0)] + +#define _Check_return_impl_ [__R_impl(__d_=0)] +#define _Must_inspect_impl_ [__R_impl(__d_=0)] + +#define _Success_impl_(expr) [__M_(__d_=0)] +#define _On_failure_impl_(expr) [__M_(__d_=0)] +#define _Always_impl_(expr) [__M_(__d_=0)] + +#define _Printf_format_string_impl_ [__P_impl(__d_=0)] +#define _Scanf_format_string_impl_ [__P_impl(__d_=0)] +#define _Scanf_s_format_string_impl_ [__P_impl(__d_=0)] + +#define _Raises_SEH_exception_impl_ [__M_(__d_=0)] +#define _Maybe_raises_SEH_exception_impl_ [__M_(__d_=0)] + +#define _In_bound_impl_ [__P_impl(__d_=0)] +#define _Out_bound_impl_ [__P_impl(__d_=0)] +#define _Ret_bound_impl_ [__R_impl(__d_=0)] +#define _Deref_in_bound_impl_ [__P_impl(__d_=0)] +#define _Deref_out_bound_impl_ [__P_impl(__d_=0)] +#define _Deref_ret_bound_impl_ [__R_impl(__d_=0)] + +#define _Range_impl_(min,max) [__P_impl(__d_=0)] +#define _In_range_impl_(min,max) [__P_impl(__d_=0)] +#define _Out_range_impl_(min,max) [__P_impl(__d_=0)] +#define _Ret_range_impl_(min,max) [__R_impl(__d_=0)] +#define _Deref_in_range_impl_(min,max) [__P_impl(__d_=0)] +#define _Deref_out_range_impl_(min,max) [__P_impl(__d_=0)] +#define _Deref_ret_range_impl_(min,max) [__R_impl(__d_=0)] + +#define _Field_range_impl_(min,max) [__F_(__d_=0)] + +#define _Pre_satisfies_impl_(cond) [__A_(__d_=0)] +#define _Post_satisfies_impl_(cond) [__A_(__d_=0)] +#define _Satisfies_impl_(cond) [__A_(__d_=0)] + +#define _Null_impl_ [__A_(__d_=0)] +#define _Notnull_impl_ [__A_(__d_=0)] +#define _Maybenull_impl_ [__A_(__d_=0)] + +#define _Valid_impl_ [__A_(__d_=0)] +#define _Notvalid_impl_ [__A_(__d_=0)] +#define _Maybevalid_impl_ [__A_(__d_=0)] + +#define _Readable_bytes_impl_(size) [__A_(__d_=0)] +#define _Readable_elements_impl_(size) [__A_(__d_=0)] +#define _Writable_bytes_impl_(size) [__A_(__d_=0)] +#define _Writable_elements_impl_(size) [__A_(__d_=0)] + +#define _Null_terminated_impl_ [__A_(__d_=0)] +#define _NullNull_terminated_impl_ [__A_(__d_=0)] + +#define _Pre_impl_ [__P_impl(__d_=0)] +#define _Pre1_impl_(p1) [__P_impl(__d_=0)] +#define _Pre2_impl_(p1,p2) [__P_impl(__d_=0)] +#define _Pre3_impl_(p1,p2,p3) [__P_impl(__d_=0)] + +#define _Post_impl_ [__P_impl(__d_=0)] +#define _Post1_impl_(p1) [__P_impl(__d_=0)] +#define _Post2_impl_(p1,p2) [__P_impl(__d_=0)] +#define _Post3_impl_(p1,p2,p3) [__P_impl(__d_=0)] + +#define _Ret1_impl_(p1) [__R_impl(__d_=0)] +#define _Ret2_impl_(p1,p2) [__R_impl(__d_=0)] +#define _Ret3_impl_(p1,p2,p3) [__R_impl(__d_=0)] + +#define _Deref_pre1_impl_(p1) [__P_impl(__d_=0)] +#define _Deref_pre2_impl_(p1,p2) [__P_impl(__d_=0)] +#define _Deref_pre3_impl_(p1,p2,p3) [__P_impl(__d_=0)] + +#define _Deref_post1_impl_(p1) [__P_impl(__d_=0)] +#define _Deref_post2_impl_(p1,p2) [__P_impl(__d_=0)] +#define _Deref_post3_impl_(p1,p2,p3) [__P_impl(__d_=0)] + +#define _Deref_ret1_impl_(p1) [__R_impl(__d_=0)] +#define _Deref_ret2_impl_(p1,p2) [__R_impl(__d_=0)] +#define _Deref_ret3_impl_(p1,p2,p3) [__R_impl(__d_=0)] + +#define _Deref2_pre1_impl_(p1) //[__P_impl(__d_=0)] +#define _Deref2_post1_impl_(p1) 
//[__P_impl(__d_=0)] +#define _Deref2_ret1_impl_(p1) //[__P_impl(__d_=0)] + +#else // ][ + + +#define _SAL_nop_impl_ X + +#define _At_impl_(target, annos) +#define _When_impl_(expr, annos) +#define _Group_impl_(annos) +#define _GrouP_impl_(annos) +#define _At_buffer_impl_(target, iter, bound, annos) +#define _Use_decl_anno_impl_ +#define _Points_to_data_impl_ +#define _Literal_impl_ +#define _Notliteral_impl_ +#define _Notref_impl_ + +#define _Pre_valid_impl_ +#define _Post_valid_impl_ +#define _Ret_valid_impl_ + +#define _Check_return_impl_ +#define _Must_inspect_impl_ + +#define _Success_impl_(expr) +#define _On_failure_impl_(annos) +#define _Always_impl_(annos) + +#define _Printf_format_string_impl_ +#define _Scanf_format_string_impl_ +#define _Scanf_s_format_string_impl_ + +#define _In_bound_impl_ +#define _Out_bound_impl_ +#define _Ret_bound_impl_ +#define _Deref_in_bound_impl_ +#define _Deref_out_bound_impl_ +#define _Deref_ret_bound_impl_ + +#define _Range_impl_(min,max) +#define _In_range_impl_(min,max) +#define _Out_range_impl_(min,max) +#define _Ret_range_impl_(min,max) +#define _Deref_in_range_impl_(min,max) +#define _Deref_out_range_impl_(min,max) +#define _Deref_ret_range_impl_(min,max) + +#define _Satisfies_impl_(expr) +#define _Pre_satisfies_impl_(expr) +#define _Post_satisfies_impl_(expr) + +#define _Null_impl_ +#define _Notnull_impl_ +#define _Maybenull_impl_ + +#define _Valid_impl_ +#define _Notvalid_impl_ +#define _Maybevalid_impl_ + +#define _Field_range_impl_(min,max) + +#define _Pre_impl_ +#define _Pre1_impl_(p1) +#define _Pre2_impl_(p1,p2) +#define _Pre3_impl_(p1,p2,p3) + +#define _Post_impl_ +#define _Post1_impl_(p1) +#define _Post2_impl_(p1,p2) +#define _Post3_impl_(p1,p2,p3) + +#define _Ret1_impl_(p1) +#define _Ret2_impl_(p1,p2) +#define _Ret3_impl_(p1,p2,p3) + +#define _Deref_pre1_impl_(p1) +#define _Deref_pre2_impl_(p1,p2) +#define _Deref_pre3_impl_(p1,p2,p3) + +#define _Deref_post1_impl_(p1) +#define _Deref_post2_impl_(p1,p2) +#define _Deref_post3_impl_(p1,p2,p3) + +#define _Deref_ret1_impl_(p1) +#define _Deref_ret2_impl_(p1,p2) +#define _Deref_ret3_impl_(p1,p2,p3) + +#define _Deref2_pre1_impl_(p1) +#define _Deref2_post1_impl_(p1) +#define _Deref2_ret1_impl_(p1) + +#define _Readable_bytes_impl_(size) +#define _Readable_elements_impl_(size) +#define _Writable_bytes_impl_(size) +#define _Writable_elements_impl_(size) + +#define _Null_terminated_impl_ +#define _NullNull_terminated_impl_ + +// Obsolete -- may be needed for transition to attributes. +#define __inner_typefix(ctype) +#define __inner_exceptthat + +#endif // ] + +// This section contains the deprecated annotations + +/* + ------------------------------------------------------------------------------- + Introduction + + sal.h provides a set of annotations to describe how a function uses its + parameters - the assumptions it makes about them, and the guarantees it makes + upon finishing. + + Annotations may be placed before either a function parameter's type or its return + type, and describe the function's behavior regarding the parameter or return value. + There are two classes of annotations: buffer annotations and advanced annotations. + Buffer annotations describe how functions use their pointer parameters, and + advanced annotations either describe complex/unusual buffer behavior, or provide + additional information about a parameter that is not otherwise expressible. 
+ + ------------------------------------------------------------------------------- + Buffer Annotations + + The most important annotations in sal.h provide a consistent way to annotate + buffer parameters or return values for a function. Each of these annotations describes + a single buffer (which could be a string, a fixed-length or variable-length array, + or just a pointer) that the function interacts with: where it is, how large it is, + how much is initialized, and what the function does with it. + + The appropriate macro for a given buffer can be constructed using the table below. + Just pick the appropriate values from each category, and combine them together + with a leading underscore. Some combinations of values do not make sense as buffer + annotations. Only meaningful annotations can be added to your code; for a list of + these, see the buffer annotation definitions section. + + Only a single buffer annotation should be used for each parameter. + + |------------|------------|---------|--------|----------|----------|---------------| + | Level | Usage | Size | Output | NullTerm | Optional | Parameters | + |------------|------------|---------|--------|----------|----------|---------------| + | <> | <> | <> | <> | _z | <> | <> | + | _deref | _in | _ecount | _full | _nz | _opt | (size) | + | _deref_opt | _out | _bcount | _part | | | (size,length) | + | | _inout | | | | | | + | | | | | | | | + |------------|------------|---------|--------|----------|----------|---------------| + + Level: Describes the buffer pointer's level of indirection from the parameter or + return value 'p'. + + <> : p is the buffer pointer. + _deref : *p is the buffer pointer. p must not be NULL. + _deref_opt : *p may be the buffer pointer. p may be NULL, in which case the rest of + the annotation is ignored. + + Usage: Describes how the function uses the buffer. + + <> : The buffer is not accessed. If used on the return value or with _deref, the + function will provide the buffer, and it will be uninitialized at exit. + Otherwise, the caller must provide the buffer. This should only be used + for alloc and free functions. + _in : The function will only read from the buffer. The caller must provide the + buffer and initialize it. Cannot be used with _deref. + _out : The function will only write to the buffer. If used on the return value or + with _deref, the function will provide the buffer and initialize it. + Otherwise, the caller must provide the buffer, and the function will + initialize it. + _inout : The function may freely read from and write to the buffer. The caller must + provide the buffer and initialize it. If used with _deref, the buffer may + be reallocated by the function. + + Size: Describes the total size of the buffer. This may be less than the space actually + allocated for the buffer, in which case it describes the accessible amount. + + <> : No buffer size is given. If the type specifies the buffer size (such as + with LPSTR and LPWSTR), that amount is used. Otherwise, the buffer is one + element long. Must be used with _in, _out, or _inout. + _ecount : The buffer size is an explicit element count. + _bcount : The buffer size is an explicit byte count. + + Output: Describes how much of the buffer will be initialized by the function. For + _inout buffers, this also describes how much is initialized at entry. Omit this + category for _in buffers; they must be fully initialized by the caller. + + <> : The type specifies how much is initialized. 
For instance, a function initializing + an LPWSTR must NULL-terminate the string. + _full : The function initializes the entire buffer. + _part : The function initializes part of the buffer, and explicitly indicates how much. + + NullTerm: States if the present of a '\0' marks the end of valid elements in the buffer. + _z : A '\0' indicated the end of the buffer + _nz : The buffer may not be null terminated and a '\0' does not indicate the end of the + buffer. + Optional: Describes if the buffer itself is optional. + + <> : The pointer to the buffer must not be NULL. + _opt : The pointer to the buffer might be NULL. It will be checked before being dereferenced. + + Parameters: Gives explicit counts for the size and length of the buffer. + + <> : There is no explicit count. Use when neither _ecount nor _bcount is used. + (size) : Only the buffer's total size is given. Use with _ecount or _bcount but not _part. + (size,length) : The buffer's total size and initialized length are given. Use with _ecount_part + and _bcount_part. + + ------------------------------------------------------------------------------- + Buffer Annotation Examples + + LWSTDAPI_(BOOL) StrToIntExA( + __in LPCSTR pszString, + DWORD dwFlags, + __out int *piRet -- A pointer whose dereference will be filled in. + ); + + void MyPaintingFunction( + __in HWND hwndControl, -- An initialized read-only parameter. + __in_opt HDC hdcOptional, -- An initialized read-only parameter that might be NULL. + __inout IPropertyStore *ppsStore -- An initialized parameter that may be freely used + -- and modified. + ); + + LWSTDAPI_(BOOL) PathCompactPathExA( + __out_ecount(cchMax) LPSTR pszOut, -- A string buffer with cch elements that will + -- be NULL terminated on exit. + __in LPCSTR pszSrc, + UINT cchMax, + DWORD dwFlags + ); + + HRESULT SHLocalAllocBytes( + size_t cb, + __deref_bcount(cb) T **ppv -- A pointer whose dereference will be set to an + -- uninitialized buffer with cb bytes. + ); + + __inout_bcount_full(cb) : A buffer with cb elements that is fully initialized at + entry and exit, and may be written to by this function. + + __out_ecount_part(count, *countOut) : A buffer with count elements that will be + partially initialized by this function. The function indicates how much it + initialized by setting *countOut. + + ------------------------------------------------------------------------------- + Advanced Annotations + + Advanced annotations describe behavior that is not expressible with the regular + buffer macros. These may be used either to annotate buffer parameters that involve + complex or conditional behavior, or to enrich existing annotations with additional + information. + + __success(expr) f : + indicates whether function f succeeded or not. If is true at exit, + all the function's guarantees (as given by other annotations) must hold. If + is false at exit, the caller should not expect any of the function's guarantees + to hold. If not used, the function must always satisfy its guarantees. Added + automatically to functions that indicate success in standard ways, such as by + returning an HRESULT. + + __nullterminated p : + Pointer p is a buffer that may be read or written up to and including the first + NULL character or pointer. May be used on typedefs, which marks valid (properly + initialized) instances of that type as being NULL-terminated. + + __nullnullterminated p : + Pointer p is a buffer that may be read or written up to and including the first + sequence of two NULL characters or pointers. 
May be used on typedefs, which marks + valid instances of that type as being double-NULL terminated. + + __reserved v : + Value v must be 0/NULL, reserved for future use. + + __checkReturn v : + Return value v must not be ignored by callers of this function. + + __typefix(ctype) v : + Value v should be treated as an instance of ctype, rather than its declared type. + + __override f : + Specify C#-style 'override' behaviour for overriding virtual methods. + + __callback f : + Function f can be used as a function pointer. + + __format_string p : + Pointer p is a string that contains % markers in the style of printf. + + __blocksOn(resource) f : + Function f blocks on the resource 'resource'. + + FALLTHROUGH : + Annotates switch statement labels where fall-through is desired, to distinguish + from forgotten break statements. + + ------------------------------------------------------------------------------- + Advanced Annotation Examples + + __success(return != FALSE) LWSTDAPI_(BOOL) + PathCanonicalizeA(__out_ecount(MAX_PATH) LPSTR pszBuf, LPCSTR pszPath) : + pszBuf is only guaranteed to be NULL-terminated when TRUE is returned. + + typedef __nullterminated WCHAR* LPWSTR : Initialized LPWSTRs are NULL-terminated strings. + + __out_ecount(cch) __typefix(LPWSTR) void *psz : psz is a buffer parameter which will be + a NULL-terminated WCHAR string at exit, and which initially contains cch WCHARs. + + ------------------------------------------------------------------------------- +*/ + +#define __specstrings + +#ifdef __cplusplus // [ +#ifndef __nothrow // [ +# define __nothrow NOTHROW_DECL +#endif // ] +extern "C" { +#else // ][ +#ifndef __nothrow // [ +# define __nothrow +#endif // ] +#endif /* #ifdef __cplusplus */ // ] + + +/* + ------------------------------------------------------------------------------- + Helper Macro Definitions + + These express behavior common to many of the high-level annotations. + DO NOT USE THESE IN YOUR CODE. + ------------------------------------------------------------------------------- +*/ + +/* + The helper annotations are only understood by the compiler version used by + various defect detection tools. When the regular compiler is running, they + are defined into nothing, and do not affect the compiled code. +*/ + +#if !defined(__midl) && defined(_PREFAST_) // [ + + /* + In the primitive "SAL_*" annotations "SAL" stands for Standard + Annotation Language. These "SAL_*" annotations are the + primitives the compiler understands and high-level MACROs + will decompose into these primivates. + */ + + #define _SA_SPECSTRIZE( x ) #x + + /* + __null p + __notnull p + __maybenull p + + Annotates a pointer p. States that pointer p is null. Commonly used + in the negated form __notnull or the possibly null form __maybenull. + */ + +#ifndef PAL_STDCPP_COMPAT + #define __null _Null_impl_ + #define __notnull _Notnull_impl_ + #define __maybenull _Maybenull_impl_ +#endif // !PAL_STDCPP_COMPAT + + /* + __readonly l + __notreadonly l + __maybereadonly l + + Annotates a location l. States that location l is not modified after + this point. If the annotation is placed on the precondition state of + a function, the restriction only applies until the postcondition state + of the function. __maybereadonly states that the annotated location + may be modified, whereas __notreadonly states that a location must be + modified. 
+ */ + + #define __readonly _Pre1_impl_(__readaccess_impl) + #define __notreadonly _Pre1_impl_(__allaccess_impl) + #define __maybereadonly _Pre1_impl_(__readaccess_impl) + + /* + __valid v + __notvalid v + __maybevalid v + + Annotates any value v. States that the value satisfies all properties of + valid values of its type. For example, for a string buffer, valid means + that the buffer pointer is either NULL or points to a NULL-terminated string. + */ + + #define __valid _Valid_impl_ + #define __notvalid _Notvalid_impl_ + #define __maybevalid _Maybevalid_impl_ + + /* + __readableTo(extent) p + + Annotates a buffer pointer p. If the buffer can be read, extent describes + how much of the buffer is readable. For a reader of the buffer, this is + an explicit permission to read up to that amount, rather than a restriction to + read only up to it. + */ + + #define __readableTo(extent) _SA_annotes1(SAL_readableTo, extent) + + /* + + __elem_readableTo(size) + + Annotates a buffer pointer p as being readable to size elements. + */ + + #define __elem_readableTo(size) _SA_annotes1(SAL_readableTo, elementCount( size )) + + /* + __byte_readableTo(size) + + Annotates a buffer pointer p as being readable to size bytes. + */ + #define __byte_readableTo(size) _SA_annotes1(SAL_readableTo, byteCount(size)) + + /* + __writableTo(extent) p + + Annotates a buffer pointer p. If the buffer can be modified, extent + describes how much of the buffer is writable (usually the allocation + size). For a writer of the buffer, this is an explicit permission to + write up to that amount, rather than a restriction to write only up to it. + */ + #define __writableTo(size) _SA_annotes1(SAL_writableTo, size) + + /* + __elem_writableTo(size) + + Annotates a buffer pointer p as being writable to size elements. + */ + #define __elem_writableTo(size) _SA_annotes1(SAL_writableTo, elementCount( size )) + + /* + __byte_writableTo(size) + + Annotates a buffer pointer p as being writable to size bytes. + */ + #define __byte_writableTo(size) _SA_annotes1(SAL_writableTo, byteCount( size)) + + /* + __deref p + + Annotates a pointer p. The next annotation applies one dereference down + in the type. If readableTo(p, size) then the next annotation applies to + all elements *(p+i) for which i satisfies the size. If p is a pointer + to a struct, the next annotation applies to all fields of the struct. + */ + #define __deref _Deref_impl_ + + /* + __pre __next_annotation + + The next annotation applies in the precondition state + */ + #define __pre _Pre_impl_ + + /* + __post __next_annotation + + The next annotation applies in the postcondition state + */ + #define __post _Post_impl_ + + /* + __precond() + + When is true, the next annotation applies in the precondition state + (currently not enabled) + */ + #define __precond(expr) __pre + + /* + __postcond() + + When is true, the next annotation applies in the postcondition state + (currently not enabled) + */ + #define __postcond(expr) __post + + /* + __exceptthat + + Given a set of annotations Q containing __exceptthat maybeP, the effect of + the except clause is to erase any P or notP annotations (explicit or + implied) within Q at the same level of dereferencing that the except + clause appears, and to replace it with maybeP. + + Example 1: __valid __pre_except_maybenull on a pointer p means that the + pointer may be null, and is otherwise valid, thus overriding + the implicit notnull annotation implied by __valid on + pointers. 
+ + Example 2: __valid __deref __pre_except_maybenull on an int **p means + that p is not null (implied by valid), but the elements + pointed to by p could be null, and are otherwise valid. + */ + #define __exceptthat __inner_exceptthat + + /* + _refparam + + Added to all out parameter macros to indicate that they are all reference + parameters. + */ + #define __refparam _Notref_ __deref __notreadonly + + /* + __inner_* + + Helper macros that directly correspond to certain high-level annotations. + + */ + + /* + Macros to classify the entrypoints and indicate their category. + + Pre-defined control point categories include: RPC, LPC, DeviceDriver, UserToKernel, ISAPI, COM. + + */ + #define __inner_control_entrypoint(category) _SA_annotes2(SAL_entrypoint, controlEntry, category) + + + /* + Pre-defined data entry point categories include: Registry, File, Network. + */ + #define __inner_data_entrypoint(category) _SA_annotes2(SAL_entrypoint, dataEntry, category) + + #define __inner_override _SA_annotes0(__override) + #define __inner_callback _SA_annotes0(__callback) + #define __inner_blocksOn(resource) _SA_annotes1(SAL_blocksOn, resource) + + #define __post_except_maybenull __post __inner_exceptthat _Maybenull_impl_ + #define __pre_except_maybenull __pre __inner_exceptthat _Maybenull_impl_ + + #define __post_deref_except_maybenull __post __deref __inner_exceptthat _Maybenull_impl_ + #define __pre_deref_except_maybenull __pre __deref __inner_exceptthat _Maybenull_impl_ + + #define __inexpressible_readableTo(size) _Readable_elements_impl_(_Inexpressible_(size)) + #define __inexpressible_writableTo(size) _Writable_elements_impl_(_Inexpressible_(size)) + + +#else // ][ +#ifndef PAL_STDCPP_COMPAT + #define __null + #define __notnull + #define __deref +#endif // !PAL_STDCPP_COMPAT + #define __maybenull + #define __readonly + #define __notreadonly + #define __maybereadonly + #define __valid + #define __notvalid + #define __maybevalid + #define __readableTo(extent) + #define __elem_readableTo(size) + #define __byte_readableTo(size) + #define __writableTo(size) + #define __elem_writableTo(size) + #define __byte_writableTo(size) + #define __pre + #define __post + #define __precond(expr) + #define __postcond(expr) + #define __exceptthat + #define __inner_override + #define __inner_callback + #define __inner_blocksOn(resource) + #define __refparam + #define __inner_control_entrypoint(category) + #define __inner_data_entrypoint(category) + + #define __post_except_maybenull + #define __pre_except_maybenull + #define __post_deref_except_maybenull + #define __pre_deref_except_maybenull + + #define __inexpressible_readableTo(size) + #define __inexpressible_writableTo(size) + +#endif /* #if !defined(__midl) && defined(_PREFAST_) */ // ] + +/* +------------------------------------------------------------------------------- +Buffer Annotation Definitions + +Any of these may be used to directly annotate functions, but only one should +be used for each parameter. To determine which annotation to use for a given +buffer, use the table in the buffer annotations section. 
+------------------------------------------------------------------------------- +*/ + +#define __ecount(size) _SAL1_Source_(__ecount, (size), __notnull __elem_writableTo(size)) +#define __bcount(size) _SAL1_Source_(__bcount, (size), __notnull __byte_writableTo(size)) +#define __in_ecount(size) _SAL1_Source_(__in_ecount, (size), _In_reads_(size)) +#define __in_bcount(size) _SAL1_Source_(__in_bcount, (size), _In_reads_bytes_(size)) +#define __in_z _SAL1_Source_(__in_z, (), _In_z_) +#define __in_ecount_z(size) _SAL1_Source_(__in_ecount_z, (size), _In_reads_z_(size)) +#define __in_bcount_z(size) _SAL1_Source_(__in_bcount_z, (size), __in_bcount(size) __pre __nullterminated) +#define __in_nz _SAL1_Source_(__in_nz, (), __in) +#define __in_ecount_nz(size) _SAL1_Source_(__in_ecount_nz, (size), __in_ecount(size)) +#define __in_bcount_nz(size) _SAL1_Source_(__in_bcount_nz, (size), __in_bcount(size)) +#define __out_ecount(size) _SAL1_Source_(__out_ecount, (size), _Out_writes_(size)) +#define __out_bcount(size) _SAL1_Source_(__out_bcount, (size), _Out_writes_bytes_(size)) +#define __out_ecount_part(size,length) _SAL1_Source_(__out_ecount_part, (size,length), _Out_writes_to_(size,length)) +#define __out_bcount_part(size,length) _SAL1_Source_(__out_bcount_part, (size,length), _Out_writes_bytes_to_(size,length)) +#define __out_ecount_full(size) _SAL1_Source_(__out_ecount_full, (size), _Out_writes_all_(size)) +#define __out_bcount_full(size) _SAL1_Source_(__out_bcount_full, (size), _Out_writes_bytes_all_(size)) +#define __out_z _SAL1_Source_(__out_z, (), __post __valid __refparam __post __nullterminated) +#define __out_z_opt _SAL1_Source_(__out_z_opt, (), __post __valid __refparam __post __nullterminated __pre_except_maybenull) +#define __out_ecount_z(size) _SAL1_Source_(__out_ecount_z, (size), __ecount(size) __post __valid __refparam __post __nullterminated) +#define __out_bcount_z(size) _SAL1_Source_(__out_bcount_z, (size), __bcount(size) __post __valid __refparam __post __nullterminated) +#define __out_ecount_part_z(size,length) _SAL1_Source_(__out_ecount_part_z, (size,length), __out_ecount_part(size,length) __post __nullterminated) +#define __out_bcount_part_z(size,length) _SAL1_Source_(__out_bcount_part_z, (size,length), __out_bcount_part(size,length) __post __nullterminated) +#define __out_ecount_full_z(size) _SAL1_Source_(__out_ecount_full_z, (size), __out_ecount_full(size) __post __nullterminated) +#define __out_bcount_full_z(size) _SAL1_Source_(__out_bcount_full_z, (size), __out_bcount_full(size) __post __nullterminated) +#define __out_nz _SAL1_Source_(__out_nz, (), __post __valid __refparam) +#define __out_nz_opt _SAL1_Source_(__out_nz_opt, (), __post __valid __refparam __post_except_maybenull_) +#define __out_ecount_nz(size) _SAL1_Source_(__out_ecount_nz, (size), __ecount(size) __post __valid __refparam) +#define __out_bcount_nz(size) _SAL1_Source_(__out_bcount_nz, (size), __bcount(size) __post __valid __refparam) +#define __inout _SAL1_Source_(__inout, (), _Inout_) +#define __inout_ecount(size) _SAL1_Source_(__inout_ecount, (size), _Inout_updates_(size)) +#define __inout_bcount(size) _SAL1_Source_(__inout_bcount, (size), _Inout_updates_bytes_(size)) +#define __inout_ecount_part(size,length) _SAL1_Source_(__inout_ecount_part, (size,length), _Inout_updates_to_(size,length)) +#define __inout_bcount_part(size,length) _SAL1_Source_(__inout_bcount_part, (size,length), _Inout_updates_bytes_to_(size,length)) +#define __inout_ecount_full(size) _SAL1_Source_(__inout_ecount_full, (size), 
_Inout_updates_all_(size)) +#define __inout_bcount_full(size) _SAL1_Source_(__inout_bcount_full, (size), _Inout_updates_bytes_all_(size)) +#define __inout_z _SAL1_Source_(__inout_z, (), _Inout_z_) +#define __inout_ecount_z(size) _SAL1_Source_(__inout_ecount_z, (size), _Inout_updates_z_(size)) +#define __inout_bcount_z(size) _SAL1_Source_(__inout_bcount_z, (size), __inout_bcount(size) __pre __nullterminated __post __nullterminated) +#define __inout_nz _SAL1_Source_(__inout_nz, (), __inout) +#define __inout_ecount_nz(size) _SAL1_Source_(__inout_ecount_nz, (size), __inout_ecount(size)) +#define __inout_bcount_nz(size) _SAL1_Source_(__inout_bcount_nz, (size), __inout_bcount(size)) +#define __ecount_opt(size) _SAL1_Source_(__ecount_opt, (size), __ecount(size) __pre_except_maybenull) +#define __bcount_opt(size) _SAL1_Source_(__bcount_opt, (size), __bcount(size) __pre_except_maybenull) +#define __in_opt _SAL1_Source_(__in_opt, (), _In_opt_) +#define __in_ecount_opt(size) _SAL1_Source_(__in_ecount_opt, (size), _In_reads_opt_(size)) +#define __in_bcount_opt(size) _SAL1_Source_(__in_bcount_opt, (size), _In_reads_bytes_opt_(size)) +#define __in_z_opt _SAL1_Source_(__in_z_opt, (), _In_opt_z_) +#define __in_ecount_z_opt(size) _SAL1_Source_(__in_ecount_z_opt, (size), __in_ecount_opt(size) __pre __nullterminated) +#define __in_bcount_z_opt(size) _SAL1_Source_(__in_bcount_z_opt, (size), __in_bcount_opt(size) __pre __nullterminated) +#define __in_nz_opt _SAL1_Source_(__in_nz_opt, (), __in_opt) +#define __in_ecount_nz_opt(size) _SAL1_Source_(__in_ecount_nz_opt, (size), __in_ecount_opt(size)) +#define __in_bcount_nz_opt(size) _SAL1_Source_(__in_bcount_nz_opt, (size), __in_bcount_opt(size)) +#define __out_opt _SAL1_Source_(__out_opt, (), _Out_opt_) +#define __out_ecount_opt(size) _SAL1_Source_(__out_ecount_opt, (size), _Out_writes_opt_(size)) +#define __out_bcount_opt(size) _SAL1_Source_(__out_bcount_opt, (size), _Out_writes_bytes_opt_(size)) +#define __out_ecount_part_opt(size,length) _SAL1_Source_(__out_ecount_part_opt, (size,length), __out_ecount_part(size,length) __pre_except_maybenull) +#define __out_bcount_part_opt(size,length) _SAL1_Source_(__out_bcount_part_opt, (size,length), __out_bcount_part(size,length) __pre_except_maybenull) +#define __out_ecount_full_opt(size) _SAL1_Source_(__out_ecount_full_opt, (size), __out_ecount_full(size) __pre_except_maybenull) +#define __out_bcount_full_opt(size) _SAL1_Source_(__out_bcount_full_opt, (size), __out_bcount_full(size) __pre_except_maybenull) +#define __out_ecount_z_opt(size) _SAL1_Source_(__out_ecount_z_opt, (size), __out_ecount_opt(size) __post __nullterminated) +#define __out_bcount_z_opt(size) _SAL1_Source_(__out_bcount_z_opt, (size), __out_bcount_opt(size) __post __nullterminated) +#define __out_ecount_part_z_opt(size,length) _SAL1_Source_(__out_ecount_part_z_opt, (size,length), __out_ecount_part_opt(size,length) __post __nullterminated) +#define __out_bcount_part_z_opt(size,length) _SAL1_Source_(__out_bcount_part_z_opt, (size,length), __out_bcount_part_opt(size,length) __post __nullterminated) +#define __out_ecount_full_z_opt(size) _SAL1_Source_(__out_ecount_full_z_opt, (size), __out_ecount_full_opt(size) __post __nullterminated) +#define __out_bcount_full_z_opt(size) _SAL1_Source_(__out_bcount_full_z_opt, (size), __out_bcount_full_opt(size) __post __nullterminated) +#define __out_ecount_nz_opt(size) _SAL1_Source_(__out_ecount_nz_opt, (size), __out_ecount_opt(size) __post __nullterminated) +#define __out_bcount_nz_opt(size) 
_SAL1_Source_(__out_bcount_nz_opt, (size), __out_bcount_opt(size) __post __nullterminated) +#define __inout_opt _SAL1_Source_(__inout_opt, (), _Inout_opt_) +#define __inout_ecount_opt(size) _SAL1_Source_(__inout_ecount_opt, (size), __inout_ecount(size) __pre_except_maybenull) +#define __inout_bcount_opt(size) _SAL1_Source_(__inout_bcount_opt, (size), __inout_bcount(size) __pre_except_maybenull) +#define __inout_ecount_part_opt(size,length) _SAL1_Source_(__inout_ecount_part_opt, (size,length), __inout_ecount_part(size,length) __pre_except_maybenull) +#define __inout_bcount_part_opt(size,length) _SAL1_Source_(__inout_bcount_part_opt, (size,length), __inout_bcount_part(size,length) __pre_except_maybenull) +#define __inout_ecount_full_opt(size) _SAL1_Source_(__inout_ecount_full_opt, (size), __inout_ecount_full(size) __pre_except_maybenull) +#define __inout_bcount_full_opt(size) _SAL1_Source_(__inout_bcount_full_opt, (size), __inout_bcount_full(size) __pre_except_maybenull) +#define __inout_z_opt _SAL1_Source_(__inout_z_opt, (), __inout_opt __pre __nullterminated __post __nullterminated) +#define __inout_ecount_z_opt(size) _SAL1_Source_(__inout_ecount_z_opt, (size), __inout_ecount_opt(size) __pre __nullterminated __post __nullterminated) +#define __inout_ecount_z_opt(size) _SAL1_Source_(__inout_ecount_z_opt, (size), __inout_ecount_opt(size) __pre __nullterminated __post __nullterminated) +#define __inout_bcount_z_opt(size) _SAL1_Source_(__inout_bcount_z_opt, (size), __inout_bcount_opt(size)) +#define __inout_nz_opt _SAL1_Source_(__inout_nz_opt, (), __inout_opt) +#define __inout_ecount_nz_opt(size) _SAL1_Source_(__inout_ecount_nz_opt, (size), __inout_ecount_opt(size)) +#define __inout_bcount_nz_opt(size) _SAL1_Source_(__inout_bcount_nz_opt, (size), __inout_bcount_opt(size)) +#define __deref_ecount(size) _SAL1_Source_(__deref_ecount, (size), _Notref_ __ecount(1) __post _Notref_ __elem_readableTo(1) __post _Notref_ __deref _Notref_ __notnull __post __deref __elem_writableTo(size)) +#define __deref_bcount(size) _SAL1_Source_(__deref_bcount, (size), _Notref_ __ecount(1) __post _Notref_ __elem_readableTo(1) __post _Notref_ __deref _Notref_ __notnull __post __deref __byte_writableTo(size)) +#define __deref_out _SAL1_Source_(__deref_out, (), _Outptr_) +#define __deref_out_ecount(size) _SAL1_Source_(__deref_out_ecount, (size), _Outptr_result_buffer_(size)) +#define __deref_out_bcount(size) _SAL1_Source_(__deref_out_bcount, (size), _Outptr_result_bytebuffer_(size)) +#define __deref_out_ecount_part(size,length) _SAL1_Source_(__deref_out_ecount_part, (size,length), _Outptr_result_buffer_to_(size,length)) +#define __deref_out_bcount_part(size,length) _SAL1_Source_(__deref_out_bcount_part, (size,length), _Outptr_result_bytebuffer_to_(size,length)) +#define __deref_out_ecount_full(size) _SAL1_Source_(__deref_out_ecount_full, (size), __deref_out_ecount_part(size,size)) +#define __deref_out_bcount_full(size) _SAL1_Source_(__deref_out_bcount_full, (size), __deref_out_bcount_part(size,size)) +#define __deref_out_z _SAL1_Source_(__deref_out_z, (), _Outptr_result_z_) +#define __deref_out_ecount_z(size) _SAL1_Source_(__deref_out_ecount_z, (size), __deref_out_ecount(size) __post __deref __nullterminated) +#define __deref_out_bcount_z(size) _SAL1_Source_(__deref_out_bcount_z, (size), __deref_out_bcount(size) __post __deref __nullterminated) +#define __deref_out_nz _SAL1_Source_(__deref_out_nz, (), __deref_out) +#define __deref_out_ecount_nz(size) _SAL1_Source_(__deref_out_ecount_nz, (size), __deref_out_ecount(size)) 
+#define __deref_out_bcount_nz(size) _SAL1_Source_(__deref_out_bcount_nz, (size), __deref_out_ecount(size)) +#define __deref_inout _SAL1_Source_(__deref_inout, (), _Notref_ __notnull _Notref_ __elem_readableTo(1) __pre __deref __valid __post _Notref_ __deref __valid __refparam) +#define __deref_inout_z _SAL1_Source_(__deref_inout_z, (), __deref_inout __pre __deref __nullterminated __post _Notref_ __deref __nullterminated) +#define __deref_inout_ecount(size) _SAL1_Source_(__deref_inout_ecount, (size), __deref_inout __pre __deref __elem_writableTo(size) __post _Notref_ __deref __elem_writableTo(size)) +#define __deref_inout_bcount(size) _SAL1_Source_(__deref_inout_bcount, (size), __deref_inout __pre __deref __byte_writableTo(size) __post _Notref_ __deref __byte_writableTo(size)) +#define __deref_inout_ecount_part(size,length) _SAL1_Source_(__deref_inout_ecount_part, (size,length), __deref_inout_ecount(size) __pre __deref __elem_readableTo(length) __post __deref __elem_readableTo(length)) +#define __deref_inout_bcount_part(size,length) _SAL1_Source_(__deref_inout_bcount_part, (size,length), __deref_inout_bcount(size) __pre __deref __byte_readableTo(length) __post __deref __byte_readableTo(length)) +#define __deref_inout_ecount_full(size) _SAL1_Source_(__deref_inout_ecount_full, (size), __deref_inout_ecount_part(size,size)) +#define __deref_inout_bcount_full(size) _SAL1_Source_(__deref_inout_bcount_full, (size), __deref_inout_bcount_part(size,size)) +#define __deref_inout_ecount_z(size) _SAL1_Source_(__deref_inout_ecount_z, (size), __deref_inout_ecount(size) __pre __deref __nullterminated __post __deref __nullterminated) +#define __deref_inout_bcount_z(size) _SAL1_Source_(__deref_inout_bcount_z, (size), __deref_inout_bcount(size) __pre __deref __nullterminated __post __deref __nullterminated) +#define __deref_inout_nz _SAL1_Source_(__deref_inout_nz, (), __deref_inout) +#define __deref_inout_ecount_nz(size) _SAL1_Source_(__deref_inout_ecount_nz, (size), __deref_inout_ecount(size)) +#define __deref_inout_bcount_nz(size) _SAL1_Source_(__deref_inout_bcount_nz, (size), __deref_inout_ecount(size)) +#define __deref_ecount_opt(size) _SAL1_Source_(__deref_ecount_opt, (size), __deref_ecount(size) __post_deref_except_maybenull) +#define __deref_bcount_opt(size) _SAL1_Source_(__deref_bcount_opt, (size), __deref_bcount(size) __post_deref_except_maybenull) +#define __deref_out_opt _SAL1_Source_(__deref_out_opt, (), __deref_out __post_deref_except_maybenull) +#define __deref_out_ecount_opt(size) _SAL1_Source_(__deref_out_ecount_opt, (size), __deref_out_ecount(size) __post_deref_except_maybenull) +#define __deref_out_bcount_opt(size) _SAL1_Source_(__deref_out_bcount_opt, (size), __deref_out_bcount(size) __post_deref_except_maybenull) +#define __deref_out_ecount_part_opt(size,length) _SAL1_Source_(__deref_out_ecount_part_opt, (size,length), __deref_out_ecount_part(size,length) __post_deref_except_maybenull) +#define __deref_out_bcount_part_opt(size,length) _SAL1_Source_(__deref_out_bcount_part_opt, (size,length), __deref_out_bcount_part(size,length) __post_deref_except_maybenull) +#define __deref_out_ecount_full_opt(size) _SAL1_Source_(__deref_out_ecount_full_opt, (size), __deref_out_ecount_full(size) __post_deref_except_maybenull) +#define __deref_out_bcount_full_opt(size) _SAL1_Source_(__deref_out_bcount_full_opt, (size), __deref_out_bcount_full(size) __post_deref_except_maybenull) +#define __deref_out_z_opt _SAL1_Source_(__deref_out_z_opt, (), _Outptr_result_maybenull_z_) +#define 
__deref_out_ecount_z_opt(size) _SAL1_Source_(__deref_out_ecount_z_opt, (size), __deref_out_ecount_opt(size) __post __deref __nullterminated) +#define __deref_out_bcount_z_opt(size) _SAL1_Source_(__deref_out_bcount_z_opt, (size), __deref_out_bcount_opt(size) __post __deref __nullterminated) +#define __deref_out_nz_opt _SAL1_Source_(__deref_out_nz_opt, (), __deref_out_opt) +#define __deref_out_ecount_nz_opt(size) _SAL1_Source_(__deref_out_ecount_nz_opt, (size), __deref_out_ecount_opt(size)) +#define __deref_out_bcount_nz_opt(size) _SAL1_Source_(__deref_out_bcount_nz_opt, (size), __deref_out_bcount_opt(size)) +#define __deref_inout_opt _SAL1_Source_(__deref_inout_opt, (), __deref_inout __pre_deref_except_maybenull __post_deref_except_maybenull) +#define __deref_inout_ecount_opt(size) _SAL1_Source_(__deref_inout_ecount_opt, (size), __deref_inout_ecount(size) __pre_deref_except_maybenull __post_deref_except_maybenull) +#define __deref_inout_bcount_opt(size) _SAL1_Source_(__deref_inout_bcount_opt, (size), __deref_inout_bcount(size) __pre_deref_except_maybenull __post_deref_except_maybenull) +#define __deref_inout_ecount_part_opt(size,length) _SAL1_Source_(__deref_inout_ecount_part_opt, (size,length), __deref_inout_ecount_part(size,length) __pre_deref_except_maybenull __post_deref_except_maybenull) +#define __deref_inout_bcount_part_opt(size,length) _SAL1_Source_(__deref_inout_bcount_part_opt, (size,length), __deref_inout_bcount_part(size,length) __pre_deref_except_maybenull __post_deref_except_maybenull) +#define __deref_inout_ecount_full_opt(size) _SAL1_Source_(__deref_inout_ecount_full_opt, (size), __deref_inout_ecount_full(size) __pre_deref_except_maybenull __post_deref_except_maybenull) +#define __deref_inout_bcount_full_opt(size) _SAL1_Source_(__deref_inout_bcount_full_opt, (size), __deref_inout_bcount_full(size) __pre_deref_except_maybenull __post_deref_except_maybenull) +#define __deref_inout_z_opt _SAL1_Source_(__deref_inout_z_opt, (), __deref_inout_opt __pre __deref __nullterminated __post __deref __nullterminated) +#define __deref_inout_ecount_z_opt(size) _SAL1_Source_(__deref_inout_ecount_z_opt, (size), __deref_inout_ecount_opt(size) __pre __deref __nullterminated __post __deref __nullterminated) +#define __deref_inout_bcount_z_opt(size) _SAL1_Source_(__deref_inout_bcount_z_opt, (size), __deref_inout_bcount_opt(size) __pre __deref __nullterminated __post __deref __nullterminated) +#define __deref_inout_nz_opt _SAL1_Source_(__deref_inout_nz_opt, (), __deref_inout_opt) +#define __deref_inout_ecount_nz_opt(size) _SAL1_Source_(__deref_inout_ecount_nz_opt, (size), __deref_inout_ecount_opt(size)) +#define __deref_inout_bcount_nz_opt(size) _SAL1_Source_(__deref_inout_bcount_nz_opt, (size), __deref_inout_bcount_opt(size)) +#define __deref_opt_ecount(size) _SAL1_Source_(__deref_opt_ecount, (size), __deref_ecount(size) __pre_except_maybenull) +#define __deref_opt_bcount(size) _SAL1_Source_(__deref_opt_bcount, (size), __deref_bcount(size) __pre_except_maybenull) +#define __deref_opt_out _SAL1_Source_(__deref_opt_out, (), _Outptr_opt_) +#define __deref_opt_out_z _SAL1_Source_(__deref_opt_out_z, (), _Outptr_opt_result_z_) +#define __deref_opt_out_ecount(size) _SAL1_Source_(__deref_opt_out_ecount, (size), __deref_out_ecount(size) __pre_except_maybenull) +#define __deref_opt_out_bcount(size) _SAL1_Source_(__deref_opt_out_bcount, (size), __deref_out_bcount(size) __pre_except_maybenull) +#define __deref_opt_out_ecount_part(size,length) _SAL1_Source_(__deref_opt_out_ecount_part, (size,length), 
__deref_out_ecount_part(size,length) __pre_except_maybenull) +#define __deref_opt_out_bcount_part(size,length) _SAL1_Source_(__deref_opt_out_bcount_part, (size,length), __deref_out_bcount_part(size,length) __pre_except_maybenull) +#define __deref_opt_out_ecount_full(size) _SAL1_Source_(__deref_opt_out_ecount_full, (size), __deref_out_ecount_full(size) __pre_except_maybenull) +#define __deref_opt_out_bcount_full(size) _SAL1_Source_(__deref_opt_out_bcount_full, (size), __deref_out_bcount_full(size) __pre_except_maybenull) +#define __deref_opt_inout _SAL1_Source_(__deref_opt_inout, (), _Inout_opt_) +#define __deref_opt_inout_ecount(size) _SAL1_Source_(__deref_opt_inout_ecount, (size), __deref_inout_ecount(size) __pre_except_maybenull) +#define __deref_opt_inout_bcount(size) _SAL1_Source_(__deref_opt_inout_bcount, (size), __deref_inout_bcount(size) __pre_except_maybenull) +#define __deref_opt_inout_ecount_part(size,length) _SAL1_Source_(__deref_opt_inout_ecount_part, (size,length), __deref_inout_ecount_part(size,length) __pre_except_maybenull) +#define __deref_opt_inout_bcount_part(size,length) _SAL1_Source_(__deref_opt_inout_bcount_part, (size,length), __deref_inout_bcount_part(size,length) __pre_except_maybenull) +#define __deref_opt_inout_ecount_full(size) _SAL1_Source_(__deref_opt_inout_ecount_full, (size), __deref_inout_ecount_full(size) __pre_except_maybenull) +#define __deref_opt_inout_bcount_full(size) _SAL1_Source_(__deref_opt_inout_bcount_full, (size), __deref_inout_bcount_full(size) __pre_except_maybenull) +#define __deref_opt_inout_z _SAL1_Source_(__deref_opt_inout_z, (), __deref_opt_inout __pre __deref __nullterminated __post __deref __nullterminated) +#define __deref_opt_inout_ecount_z(size) _SAL1_Source_(__deref_opt_inout_ecount_z, (size), __deref_opt_inout_ecount(size) __pre __deref __nullterminated __post __deref __nullterminated) +#define __deref_opt_inout_bcount_z(size) _SAL1_Source_(__deref_opt_inout_bcount_z, (size), __deref_opt_inout_bcount(size) __pre __deref __nullterminated __post __deref __nullterminated) +#define __deref_opt_inout_nz _SAL1_Source_(__deref_opt_inout_nz, (), __deref_opt_inout) +#define __deref_opt_inout_ecount_nz(size) _SAL1_Source_(__deref_opt_inout_ecount_nz, (size), __deref_opt_inout_ecount(size)) +#define __deref_opt_inout_bcount_nz(size) _SAL1_Source_(__deref_opt_inout_bcount_nz, (size), __deref_opt_inout_bcount(size)) +#define __deref_opt_ecount_opt(size) _SAL1_Source_(__deref_opt_ecount_opt, (size), __deref_ecount_opt(size) __pre_except_maybenull) +#define __deref_opt_bcount_opt(size) _SAL1_Source_(__deref_opt_bcount_opt, (size), __deref_bcount_opt(size) __pre_except_maybenull) +#define __deref_opt_out_opt _SAL1_Source_(__deref_opt_out_opt, (), _Outptr_opt_result_maybenull_) +#define __deref_opt_out_ecount_opt(size) _SAL1_Source_(__deref_opt_out_ecount_opt, (size), __deref_out_ecount_opt(size) __pre_except_maybenull) +#define __deref_opt_out_bcount_opt(size) _SAL1_Source_(__deref_opt_out_bcount_opt, (size), __deref_out_bcount_opt(size) __pre_except_maybenull) +#define __deref_opt_out_ecount_part_opt(size,length) _SAL1_Source_(__deref_opt_out_ecount_part_opt, (size,length), __deref_out_ecount_part_opt(size,length) __pre_except_maybenull) +#define __deref_opt_out_bcount_part_opt(size,length) _SAL1_Source_(__deref_opt_out_bcount_part_opt, (size,length), __deref_out_bcount_part_opt(size,length) __pre_except_maybenull) +#define __deref_opt_out_ecount_full_opt(size) _SAL1_Source_(__deref_opt_out_ecount_full_opt, (size), 
__deref_out_ecount_full_opt(size) __pre_except_maybenull) +#define __deref_opt_out_bcount_full_opt(size) _SAL1_Source_(__deref_opt_out_bcount_full_opt, (size), __deref_out_bcount_full_opt(size) __pre_except_maybenull) +#define __deref_opt_out_z_opt _SAL1_Source_(__deref_opt_out_z_opt, (), __post __deref __valid __refparam __pre_except_maybenull __pre_deref_except_maybenull __post_deref_except_maybenull __post __deref __nullterminated) +#define __deref_opt_out_ecount_z_opt(size) _SAL1_Source_(__deref_opt_out_ecount_z_opt, (size), __deref_opt_out_ecount_opt(size) __post __deref __nullterminated) +#define __deref_opt_out_bcount_z_opt(size) _SAL1_Source_(__deref_opt_out_bcount_z_opt, (size), __deref_opt_out_bcount_opt(size) __post __deref __nullterminated) +#define __deref_opt_out_nz_opt _SAL1_Source_(__deref_opt_out_nz_opt, (), __deref_opt_out_opt) +#define __deref_opt_out_ecount_nz_opt(size) _SAL1_Source_(__deref_opt_out_ecount_nz_opt, (size), __deref_opt_out_ecount_opt(size)) +#define __deref_opt_out_bcount_nz_opt(size) _SAL1_Source_(__deref_opt_out_bcount_nz_opt, (size), __deref_opt_out_bcount_opt(size)) +#define __deref_opt_inout_opt _SAL1_Source_(__deref_opt_inout_opt, (), __deref_inout_opt __pre_except_maybenull) +#define __deref_opt_inout_ecount_opt(size) _SAL1_Source_(__deref_opt_inout_ecount_opt, (size), __deref_inout_ecount_opt(size) __pre_except_maybenull) +#define __deref_opt_inout_bcount_opt(size) _SAL1_Source_(__deref_opt_inout_bcount_opt, (size), __deref_inout_bcount_opt(size) __pre_except_maybenull) +#define __deref_opt_inout_ecount_part_opt(size,length) _SAL1_Source_(__deref_opt_inout_ecount_part_opt, (size,length), __deref_inout_ecount_part_opt(size,length) __pre_except_maybenull) +#define __deref_opt_inout_bcount_part_opt(size,length) _SAL1_Source_(__deref_opt_inout_bcount_part_opt, (size,length), __deref_inout_bcount_part_opt(size,length) __pre_except_maybenull) +#define __deref_opt_inout_ecount_full_opt(size) _SAL1_Source_(__deref_opt_inout_ecount_full_opt, (size), __deref_inout_ecount_full_opt(size) __pre_except_maybenull) +#define __deref_opt_inout_bcount_full_opt(size) _SAL1_Source_(__deref_opt_inout_bcount_full_opt, (size), __deref_inout_bcount_full_opt(size) __pre_except_maybenull) +#define __deref_opt_inout_z_opt _SAL1_Source_(__deref_opt_inout_z_opt, (), __deref_opt_inout_opt __pre __deref __nullterminated __post __deref __nullterminated) +#define __deref_opt_inout_ecount_z_opt(size) _SAL1_Source_(__deref_opt_inout_ecount_z_opt, (size), __deref_opt_inout_ecount_opt(size) __pre __deref __nullterminated __post __deref __nullterminated) +#define __deref_opt_inout_bcount_z_opt(size) _SAL1_Source_(__deref_opt_inout_bcount_z_opt, (size), __deref_opt_inout_bcount_opt(size) __pre __deref __nullterminated __post __deref __nullterminated) +#define __deref_opt_inout_nz_opt _SAL1_Source_(__deref_opt_inout_nz_opt, (), __deref_opt_inout_opt) +#define __deref_opt_inout_ecount_nz_opt(size) _SAL1_Source_(__deref_opt_inout_ecount_nz_opt, (size), __deref_opt_inout_ecount_opt(size)) +#define __deref_opt_inout_bcount_nz_opt(size) _SAL1_Source_(__deref_opt_inout_bcount_nz_opt, (size), __deref_opt_inout_bcount_opt(size)) + +/* +------------------------------------------------------------------------------- +Advanced Annotation Definitions + +Any of these may be used to directly annotate functions, and may be used in +combination with each other or with regular buffer macros. For an explanation +of each annotation, see the advanced annotations section. 
+------------------------------------------------------------------------------- +*/ + +#define __success(expr) _Success_(expr) +#define __nullterminated _Null_terminated_ +#define __nullnullterminated +#define __clr_reserved _SAL1_Source_(__reserved, (), _Reserved_) +#define __checkReturn _SAL1_Source_(__checkReturn, (), _Check_return_) +#define __typefix(ctype) _SAL1_Source_(__typefix, (ctype), __inner_typefix(ctype)) +#define __override __inner_override +#define __callback __inner_callback +#define __format_string _Printf_format_string_ +#define __blocksOn(resource) __inner_blocksOn(resource) +#define __control_entrypoint(category) __inner_control_entrypoint(category) +#define __data_entrypoint(category) __inner_data_entrypoint(category) +#define __useHeader _Use_decl_anno_impl_ +#define __on_failure(annotes) _On_failure_impl_(annotes _SAL_nop_impl_) + +#ifndef __has_cpp_attribute +#define __has_cpp_attribute(x) (0) +#endif + +#ifndef __fallthrough // [ +#if __has_cpp_attribute(fallthrough) +#define __fallthrough [[fallthrough]] +#else +#define __fallthrough +#endif +#endif // ] + +#ifndef __analysis_assume // [ +#ifdef _PREFAST_ // [ +#define __analysis_assume(expr) __assume(expr) +#else // ][ +#define __analysis_assume(expr) +#endif // ] +#endif // ] + +#ifndef _Analysis_assume_ // [ +#ifdef _PREFAST_ // [ +#define _Analysis_assume_(expr) __assume(expr) +#else // ][ +#define _Analysis_assume_(expr) +#endif // ] +#endif // ] + +#define _Analysis_noreturn_ _SAL2_Source_(_Analysis_noreturn_, (), _SA_annotes0(SAL_terminates)) + +#ifdef _PREFAST_ // [ +__inline __nothrow +void __AnalysisAssumeNullterminated(_Post_ __nullterminated void *p); + +#define _Analysis_assume_nullterminated_(x) __AnalysisAssumeNullterminated(x) +#else // ][ +#define _Analysis_assume_nullterminated_(x) +#endif // ] + +// +// Set the analysis mode (global flags to analysis). +// They take effect at the point of declaration; use at global scope +// as a declaration. +// + +// Synthesize a unique symbol. +#define ___MKID(x, y) x ## y +#define __MKID(x, y) ___MKID(x, y) +#define __GENSYM(x) __MKID(x, __COUNTER__) + +__ANNOTATION(SAL_analysisMode(__AuToQuOtE __In_impl_ char *mode);) + +#define _Analysis_mode_impl_(mode) _SA_annotes1(SAL_analysisMode, #mode) + +#define _Analysis_mode_(mode) \ + typedef _Analysis_mode_impl_(mode) int \ + __GENSYM(__prefast_analysis_mode_flag); + +// The following are predefined: +// _Analysis_operator_new_throw_ (operator new throws) +// _Analysis_operator_new_null_ (operator new returns null) +// _Analysis_operator_new_never_fails_ (operator new never fails) +// + +// Function class annotations. 
+__ANNOTATION(SAL_functionClassNew(__In_impl_ char*);) +__PRIMOP(int, _In_function_class_(__In_impl_ char*);) +#define _In_function_class_(x) _In_function_class_(#x) + +#define _Function_class_(x) _SA_annotes1(SAL_functionClassNew, #x) + +/* + * interlocked operand used in interlocked instructions + */ +//#define _Interlocked_operand_ _Pre_ _SA_annotes0(SAL_interlocked) + +#define _Enum_is_bitflag_ _SA_annotes0(SAL_enumIsBitflag) +#define _Strict_type_match_ _SA_annotes0(SAL_strictType2) + +#define _Maybe_raises_SEH_exception_ _Pre_ _SA_annotes1(SAL_inTry,__yes) +#define _Raises_SEH_exception_ _Group_(_Maybe_raises_SEH_exception_ _Analysis_noreturn_) + +#ifdef __cplusplus // [ +} +#endif // ] From f76d081c8282c4b3471d9681c7a95b85028bac7c Mon Sep 17 00:00:00 2001 From: mastercoms Date: Sat, 4 Mar 2023 17:32:14 -0500 Subject: [PATCH 04/42] perf: fix code bugs with SSE2 compiler with vectorization, more math optimizations, and alignment, the compiler is more sensitive to some forms of code bugs --- src/engine/gl_lightmap.cpp | 8 ++++---- src/mathlib/mathlib_base.cpp | 2 +- src/public/mathlib/mathlib.h | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/engine/gl_lightmap.cpp b/src/engine/gl_lightmap.cpp index 5619120cd..4ee8b0a63 100644 --- a/src/engine/gl_lightmap.cpp +++ b/src/engine/gl_lightmap.cpp @@ -1339,7 +1339,7 @@ static void SortSurfacesByLightmapID( SurfaceHandle_t *pToSort, int iSurfaceCoun SurfaceHandle_t *pSortTemp = (SurfaceHandle_t *)stackalloc( sizeof( SurfaceHandle_t ) * iSurfaceCount ); //radix sort - for( int radix = 0; radix != 4; ++radix ) + for( int radix = 0; radix < 4; ++radix ) { //swap the inputs for the next pass { @@ -1350,7 +1350,7 @@ static void SortSurfacesByLightmapID( SurfaceHandle_t *pToSort, int iSurfaceCoun int iCounts[256] = { 0 }; int iBitOffset = radix * 8; - for( int i = 0; i != iSurfaceCount; ++i ) + for( int i = 0; i < iSurfaceCount; ++i ) { uint8 val = (materialSortInfoArray[MSurf_MaterialSortID( pSortTemp[i] )].lightmapPageID >> iBitOffset) & 0xFF; ++iCounts[val]; @@ -1358,12 +1358,12 @@ static void SortSurfacesByLightmapID( SurfaceHandle_t *pToSort, int iSurfaceCoun int iOffsetTable[256]; iOffsetTable[0] = 0; - for( int i = 0; i != 255; ++i ) + for( int i = 0; i < 255; ++i ) { iOffsetTable[i + 1] = iOffsetTable[i] + iCounts[i]; } - for( int i = 0; i != iSurfaceCount; ++i ) + for( int i = 0; i < iSurfaceCount; ++i ) { uint8 val = (materialSortInfoArray[MSurf_MaterialSortID( pSortTemp[i] )].lightmapPageID >> iBitOffset) & 0xFF; int iWriteIndex = iOffsetTable[val]; diff --git a/src/mathlib/mathlib_base.cpp b/src/mathlib/mathlib_base.cpp index 64069a7be..6503072cb 100644 --- a/src/mathlib/mathlib_base.cpp +++ b/src/mathlib/mathlib_base.cpp @@ -1488,7 +1488,7 @@ float SmoothCurve( float x ) inline float MovePeak( float x, float flPeakPos ) { // Todo: make this higher-order? 
- if( x < flPeakPos ) + if ( (x < flPeakPos || flPeakPos == 1) && flPeakPos != 0 ) return x * 0.5f / flPeakPos; else return 0.5 + 0.5 * (x - flPeakPos) / (1 - flPeakPos); diff --git a/src/public/mathlib/mathlib.h b/src/public/mathlib/mathlib.h index a6d302ff1..42317632b 100644 --- a/src/public/mathlib/mathlib.h +++ b/src/public/mathlib/mathlib.h @@ -1004,7 +1004,7 @@ void BuildGammaTable( float gamma, float texGamma, float brightness, int overbri // convert texture to linear 0..1 value inline float TexLightToLinear( int c, int exponent ) { - extern float power2_n[256]; + extern ALIGN128 float power2_n[256]; Assert( exponent >= -128 && exponent <= 127 ); return ( float )c * power2_n[exponent+128]; } From 0bd3752e3b26519eb2c40a577db96d0eefa62292 Mon Sep 17 00:00:00 2001 From: mastercoms Date: Sat, 4 Mar 2023 23:49:45 -0500 Subject: [PATCH 05/42] perf: implement SSE2 math using DXMath and CSGO backports * remove redirecting fast math calls as SSE2 is now guaranteed this is a pretty destructive change and makes the diff a bit ugly, but I'm sure it's for the best. * replace SinCos with the DirectXMath minimax polynomial approximation This could have regressions, since it's not as accurate as sin and cos calls, but I found it to be sufficient in accuracy from experience. * remove SSE2_SinCos (Extended precision modular arithmetic). It seems to share lineage with sse_mathfun (and thus cephes), and I don't see a reason to use it over the much simpler minimax polynomial approximation however, we could reimplement it using sse_mathfun and finally introduce that code! * implement new fast math calls I profiled these and found them to be faster, particularly for the animation code which was a big win, especially in large fights. * remove SinCos table since it was only being used by a single effect no point in using memory, potentially slowing down cache for a single effect's lookup table for cos, which probably doesn't even save time * remove SSE clamp, it generates heavier assembly * implement various mathlib calls with DXMath some of these aren't used, but I figured I'd implement them anyway in case some more backports/other introduce usage of them. * use SIMD versions of Angle functions (except for AngleVectors, I couldn't really find a perf win on that one.) * optimize SSE rounding functions More efficient way to get around the round-to-nearest-even on 0.5 also, one of them was missing mitigation for this behavior. * enable SIMD Quaternion (faster on SSE2) I implemented some functions with DirectXMath. Works well with hlmv/other tools, in-game. * backport FourQuaternions from CSGO * backport SlerpBonesSpeedy from CSGO (needed to adjust QuaternionAligned usage for this one) this is another big win in large fights, especially with the new math calls. 
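
For reference, this is roughly the shape of the minimax-polynomial SinCos this
change switches to. It is a minimal scalar sketch of the approach only; the
actual code routes through DirectXMath, and the helper name (FastSinCosSketch)
and exact constants here are illustrative rather than the engine's routine.

    #include <cmath>

    // Sketch: range-reduce to [-pi, pi], fold into [-pi/2, pi/2],
    // then evaluate short minimax polynomials for sin and cos.
    inline void FastSinCosSketch( float x, float *s, float *c )
    {
        const float kPi       = 3.14159265f;
        const float kTwoPi    = 6.28318531f;
        const float kInvTwoPi = 0.159154943f;
        const float kHalfPi   = 1.57079633f;

        // x = 2*pi*q + y, with y in [-pi, pi]
        float q = x * kInvTwoPi;
        q = ( x >= 0.0f ) ? floorf( q + 0.5f ) : ceilf( q - 0.5f );
        float y = x - kTwoPi * q;

        // fold into [-pi/2, pi/2]; sin keeps its sign, cos flips
        float sign = 1.0f;
        if ( y > kHalfPi )       { y = kPi - y;  sign = -1.0f; }
        else if ( y < -kHalfPi ) { y = -kPi - y; sign = -1.0f; }

        float y2 = y * y;

        // odd polynomial for sin, even polynomial for cos
        *s = ( ( ( ( ( -2.3889859e-08f * y2 + 2.7525562e-06f ) * y2
                     - 1.9840874e-04f ) * y2 + 8.3333310e-03f ) * y2
                 - 1.6666667e-01f ) * y2 + 1.0f ) * y;
        float p = ( ( ( ( -2.6051615e-07f * y2 + 2.4760495e-05f ) * y2
                      - 1.3888378e-03f ) * y2 + 4.1666638e-02f ) * y2
                    - 0.5f ) * y2 + 1.0f;
        *c = sign * p;
    }

Accuracy over the reduced range is slightly below the CRT sin/cos, which is
the regression risk called out above.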
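
On the rounding-function bullet: cvtps2dq / _mm_cvtps_epi32 converts using the
current MXCSR rounding mode, which defaults to round-to-nearest-even, so 0.5f
becomes 0 and 2.5f becomes 2. One common mitigation, shown here only as a
sketch (the helper name is made up and the patch's exact sequence may differ),
is to add a sign-matched 0.5 bias and truncate toward zero, giving
conventional round-half-away-from-zero:

    #include <emmintrin.h>  // SSE2

    // Sketch: round four floats to the nearest integer, halves away
    // from zero, independent of the round-to-nearest-even default.
    static inline __m128i RoundHalfAwayFromZero( __m128 x )
    {
        const __m128 signMask = _mm_set1_ps( -0.0f );   // 0x80000000 per lane
        const __m128 half     = _mm_set1_ps( 0.5f );
        // copy the sign of x onto 0.5, add, then truncate toward zero
        __m128 bias = _mm_or_ps( half, _mm_and_ps( x, signMask ) );
        return _mm_cvttps_epi32( _mm_add_ps( x, bias ) );
    }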
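
On the QuaternionAligned changes: aligned SSE loads (movaps / _mm_load_ps)
require 16-byte-aligned addresses, which is why the bone arrays in SetupBones
and RagdollInfo_t switch from Quaternion to QuaternionAligned. A stand-in
illustration of the difference (QuatSketch and the load helpers below are
hypothetical, not the engine's actual types):

    #include <emmintrin.h>

    struct alignas(16) QuatSketch { float x, y, z, w; };

    static inline __m128 LoadAligned( const QuatSketch &q )
    {
        return _mm_load_ps( &q.x );   // OK: 16-byte alignment guaranteed
    }

    static inline __m128 LoadUnaligned( const float *q )
    {
        return _mm_loadu_ps( q );     // works for any address, but slower here
    }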
--- src/engine/initmathlib.cpp | 45 +- src/game/client/c_baseanimating.cpp | 2 +- src/game/client/c_baseanimating.h | 2 +- src/game/client/c_smokestack.cpp | 2 +- src/game/server/baseanimating.cpp | 2 +- src/mathlib/3dnow.cpp | 197 -- src/mathlib/3dnow.h | 16 - src/mathlib/mathlib.vpc | 4 - src/mathlib/mathlib_base.cpp | 176 +- src/mathlib/sse.cpp | 1107 ------ src/mathlib/sse.h | 27 - src/mathlib/sseconst.cpp | 4 + src/public/bone_setup.cpp | 522 ++- src/public/mathlib/dxmath.h | 20 + src/public/mathlib/math_pfns.h | 56 +- src/public/mathlib/mathlib.h | 68 +- src/public/mathlib/ssemath.h | 90 +- src/public/mathlib/ssequaternion.h | 842 ++++- src/public/mathlib/vector.h | 52 +- src/public/mathlib/vector4d.h | 34 +- .../DirectXMath-dec2022/Inc/DirectXMath.h | 6 +- src/thirdparty/dotnetrt/sal.h | 3130 ++--------------- src/thirdparty/sse_mathfun/sse_mathfun.h | 710 ++++ src/tier2/util_init.cpp | 2 +- src/tier3/mdlutils.cpp | 4 +- 25 files changed, 2517 insertions(+), 4603 deletions(-) delete mode 100644 src/mathlib/3dnow.cpp delete mode 100644 src/mathlib/3dnow.h delete mode 100644 src/mathlib/sse.cpp delete mode 100644 src/mathlib/sse.h create mode 100644 src/public/mathlib/dxmath.h create mode 100644 src/thirdparty/sse_mathfun/sse_mathfun.h diff --git a/src/engine/initmathlib.cpp b/src/engine/initmathlib.cpp index 962830534..f5da3a7ec 100644 --- a/src/engine/initmathlib.cpp +++ b/src/engine/initmathlib.cpp @@ -15,53 +15,10 @@ // memdbgon must be the last include file in a .cpp file!!! #include "tier0/memdbgon.h" -static bool s_bAllow3DNow = true; -static bool s_bAllowSSE2 = true; - void InitMathlib( void ) { MathLib_Init( 2.2f, // v_gamma.GetFloat() 2.2f, // v_texgamma.GetFloat() 0.0f /*v_brightness.GetFloat() */, - 2.0f /*mat_overbright.GetInt() */, s_bAllow3DNow, true, s_bAllowSSE2, true ); -} - -/* -=============== -R_SSE2 -=============== -*/ -CON_COMMAND( r_sse2, "Enable/disable SSE2 code" ) -{ - if (args.ArgC() == 1) - { - s_bAllowSSE2 = true; - } - else - { - s_bAllowSSE2 = atoi( args[1] ) ? true : false; - } - - InitMathlib(); - ConMsg( "SSE2 code is %s\n", MathLib_SSE2Enabled() ? "enabled" : "disabled" ); + 2.0f /*mat_overbright.GetInt() */ ); } - -/* -=============== -R_3DNow -=============== -*/ -CON_COMMAND( r_3dnow, "Enable/disable 3DNow code" ) -{ - if (args.ArgC() == 1) - { - s_bAllow3DNow = true; - } - else - { - s_bAllow3DNow = atoi( args[1] ) ? true : false; - } - - InitMathlib(); - ConMsg( "3DNow code is %s\n", MathLib_3DNowEnabled() ? "enabled" : "disabled" ); -} \ No newline at end of file diff --git a/src/game/client/c_baseanimating.cpp b/src/game/client/c_baseanimating.cpp index 1dac8e4ff..c78f8582f 100644 --- a/src/game/client/c_baseanimating.cpp +++ b/src/game/client/c_baseanimating.cpp @@ -2950,7 +2950,7 @@ bool C_BaseAnimating::SetupBones( matrix3x4_t *pBoneToWorldOut, int nMaxBones, i } Vector pos[MAXSTUDIOBONES]; - Quaternion q[MAXSTUDIOBONES]; + QuaternionAligned q[MAXSTUDIOBONES]; #if defined(FP_EXCEPTIONS_ENABLED) || defined(DBGFLAG_ASSERT) // Having these uninitialized means that some bugs are very hard // to reproduce. A memset of 0xFF is a simple way of getting NaNs. 
diff --git a/src/game/client/c_baseanimating.h b/src/game/client/c_baseanimating.h index c6715fb3f..1c8a74b30 100644 --- a/src/game/client/c_baseanimating.h +++ b/src/game/client/c_baseanimating.h @@ -68,7 +68,7 @@ struct RagdollInfo_t float m_flSaveTime; int m_nNumBones; Vector m_rgBonePos[MAXSTUDIOBONES]; - Quaternion m_rgBoneQuaternion[MAXSTUDIOBONES]; + QuaternionAligned m_rgBoneQuaternion[MAXSTUDIOBONES]; }; diff --git a/src/game/client/c_smokestack.cpp b/src/game/client/c_smokestack.cpp index c2c0a9f0d..7d2eac62d 100644 --- a/src/game/client/c_smokestack.cpp +++ b/src/game/client/c_smokestack.cpp @@ -406,7 +406,7 @@ void C_SmokeStack::RenderParticles( CParticleRenderIterator *pIterator ) // makes it get translucent and fade out for a longer time. //float alpha = cosf( -M_PI_F + tLifetime * M_PI_F * 2.f ) * 0.5f + 0.5f; float tLifetime = pParticle->m_Lifetime * m_InvLifetime; - float alpha = TableCos( -M_PI_F + tLifetime * M_PI_F * 2.f ) * 0.5f + 0.5f; + float alpha = FastCos( -M_PI_F + tLifetime * M_PI_F * 2.f ) * 0.5f + 0.5f; if( tLifetime > 0.5f ) alpha *= alpha; diff --git a/src/game/server/baseanimating.cpp b/src/game/server/baseanimating.cpp index f321b4205..e8f035929 100644 --- a/src/game/server/baseanimating.cpp +++ b/src/game/server/baseanimating.cpp @@ -1798,7 +1798,7 @@ void CBaseAnimating::SetupBones( matrix3x4_t *pBoneToWorld, int boneMask ) AddEFlags( EFL_SETTING_UP_BONES ); Vector pos[MAXSTUDIOBONES]; - Quaternion q[MAXSTUDIOBONES]; + QuaternionAligned q[MAXSTUDIOBONES]; // adjust hit boxes based on IK driven offset Vector adjOrigin = GetAbsOrigin() + Vector( 0, 0, m_flEstIkOffset ); diff --git a/src/mathlib/3dnow.cpp b/src/mathlib/3dnow.cpp deleted file mode 100644 index db17c8c10..000000000 --- a/src/mathlib/3dnow.cpp +++ /dev/null @@ -1,197 +0,0 @@ -//========= Copyright Valve Corporation, All rights reserved. ============// -// -// Purpose: 3DNow Math primitives. -// -//=====================================================================================// - -#include -#include // Needed for FLT_EPSILON -#include "basetypes.h" -#include -#include "tier0/dbg.h" -#include "mathlib/mathlib.h" -#include "mathlib/amd3dx.h" -#include "mathlib/vector.h" - -// memdbgon must be the last include file in a .cpp file!!! -#include "tier0/memdbgon.h" - -#if !defined(COMPILER_MSVC64) && !defined(LINUX) -// Implement for 64-bit Windows if needed. -// Clang hits "fatal error: error in backend:" and other errors when trying -// to compile the inline assembly below. 3DNow support is highly unlikely to -// be useful/used, so it's not worth spending time on fixing. 
- -#pragma warning(disable:4244) // "conversion from 'const int' to 'float', possible loss of data" -#pragma warning(disable:4730) // "mixing _m64 and floating point expressions may result in incorrect code" - -//----------------------------------------------------------------------------- -// 3D Now Implementations of optimized routines: -//----------------------------------------------------------------------------- -float _3DNow_Sqrt(float x) -{ - Assert( s_bMathlibInitialized ); - float root = 0.f; -#ifdef _WIN32 - _asm - { - femms - movd mm0, x - PFRSQRT (mm1,mm0) - punpckldq mm0, mm0 - PFMUL (mm0, mm1) - movd root, mm0 - femms - } -#elif LINUX - __asm __volatile__( "femms" ); - __asm __volatile__ - ( - "pfrsqrt %y0, %y1 \n\t" - "punpckldq %y1, %y1 \n\t" - "pfmul %y1, %y0 \n\t" - : "=y" (root), "=y" (x) - :"0" (x) - ); - __asm __volatile__( "femms" ); -#else -#error -#endif - - return root; -} - -// NJS FIXME: Need to test Recripricol squareroot performance and accuraccy -// on AMD's before using the specialized instruction. -float _3DNow_RSqrt(float x) -{ - Assert( s_bMathlibInitialized ); - - return 1.f / _3DNow_Sqrt(x); -} - - -float FASTCALL _3DNow_VectorNormalize (Vector& vec) -{ - Assert( s_bMathlibInitialized ); - float *v = &vec[0]; - float radius = 0.f; - - if ( v[0] || v[1] || v[2] ) - { -#ifdef _WIN32 - _asm - { - mov eax, v - femms - movq mm0, QWORD PTR [eax] - movd mm1, DWORD PTR [eax+8] - movq mm2, mm0 - movq mm3, mm1 - PFMUL (mm0, mm0) - PFMUL (mm1, mm1) - PFACC (mm0, mm0) - PFADD (mm1, mm0) - PFRSQRT (mm0, mm1) - punpckldq mm1, mm1 - PFMUL (mm1, mm0) - PFMUL (mm2, mm0) - PFMUL (mm3, mm0) - movq QWORD PTR [eax], mm2 - movd DWORD PTR [eax+8], mm3 - movd radius, mm1 - femms - } -#elif LINUX - long long a,c; - int b,d; - memcpy(&a,&vec[0],sizeof(a)); - memcpy(&b,&vec[2],sizeof(b)); - memcpy(&c,&vec[0],sizeof(c)); - memcpy(&d,&vec[2],sizeof(d)); - - __asm __volatile__( "femms" ); - __asm __volatile__ - ( - "pfmul %y3, %y3\n\t" - "pfmul %y0, %y0 \n\t" - "pfacc %y3, %y3 \n\t" - "pfadd %y3, %y0 \n\t" - "pfrsqrt %y0, %y3 \n\t" - "punpckldq %y0, %y0 \n\t" - "pfmul %y3, %y0 \n\t" - "pfmul %y3, %y2 \n\t" - "pfmul %y3, %y1 \n\t" - : "=y" (radius), "=y" (c), "=y" (d) - : "y" (a), "0" (b), "1" (c), "2" (d) - ); - memcpy(&vec[0],&c,sizeof(c)); - memcpy(&vec[2],&d,sizeof(d)); - __asm __volatile__( "femms" ); - -#else -#error -#endif - } - return radius; -} - - -void FASTCALL _3DNow_VectorNormalizeFast (Vector& vec) -{ - _3DNow_VectorNormalize( vec ); -} - - -// JAY: This complains with the latest processor pack -#pragma warning(disable: 4730) - -float _3DNow_InvRSquared(const float* v) -{ - Assert( s_bMathlibInitialized ); - float r2 = 1.f; -#ifdef _WIN32 - _asm { // AMD 3DNow only routine - mov eax, v - femms - movq mm0, QWORD PTR [eax] - movd mm1, DWORD PTR [eax+8] - movd mm2, [r2] - PFMUL (mm0, mm0) - PFMUL (mm1, mm1) - PFACC (mm0, mm0) - PFADD (mm1, mm0) - PFMAX (mm1, mm2) - PFRCP (mm0, mm1) - movd [r2], mm0 - femms - } -#elif LINUX - long long a,c; - int b; - memcpy(&a,&v[0],sizeof(a)); - memcpy(&b,&v[2],sizeof(b)); - memcpy(&c,&v[0],sizeof(c)); - - __asm __volatile__( "femms" ); - __asm __volatile__ - ( - "PFMUL %y2, %y2 \n\t" - "PFMUL %y3, %y3 \n\t" - "PFACC %y2, %y2 \n\t" - "PFADD %y2, %y3 \n\t" - "PFMAX %y3, %y4 \n\t" - "PFRCP %y3, %y2 \n\t" - "movq %y2, %y0 \n\t" - : "=y" (r2) - : "0" (r2), "y" (a), "y" (b), "y" (c) - ); - __asm __volatile__( "femms" ); -#else -#error -#endif - - return r2; -} - -#endif // COMPILER_MSVC64 diff --git a/src/mathlib/3dnow.h b/src/mathlib/3dnow.h 
deleted file mode 100644 index c39b2ec5c..000000000 --- a/src/mathlib/3dnow.h +++ /dev/null @@ -1,16 +0,0 @@ -//========= Copyright Valve Corporation, All rights reserved. ============// -// -// Purpose: -// -//=====================================================================================// - -#ifndef _3DNOW_H -#define _3DNOW_H - -float _3DNow_Sqrt(float x); -float _3DNow_RSqrt(float x); -float FASTCALL _3DNow_VectorNormalize (Vector& vec); -void FASTCALL _3DNow_VectorNormalizeFast (Vector& vec); -float _3DNow_InvRSquared(const float* v); - -#endif // _3DNOW_H diff --git a/src/mathlib/mathlib.vpc b/src/mathlib/mathlib.vpc index 17021025f..ae3c6c757 100644 --- a/src/mathlib/mathlib.vpc +++ b/src/mathlib/mathlib.vpc @@ -27,9 +27,7 @@ $Project "mathlib" $File "powsse.cpp" $File "sparse_convolution_noise.cpp" $File "sseconst.cpp" - $File "sse.cpp" [$WINDOWS||$POSIX] $File "ssenoise.cpp" - $File "3dnow.cpp" [$WINDOWS||$LINUX] $File "anorms.cpp" $File "bumpvects.cpp" $File "IceKey.cpp" @@ -76,7 +74,5 @@ $Project "mathlib" $Folder "Header Files" { $File "noisedata.h" - $File "sse.h" [$WINDOWS||$POSIX] - $File "3dnow.h" [$WINDOWS||$LINUX] } } diff --git a/src/mathlib/mathlib_base.cpp b/src/mathlib/mathlib_base.cpp index 6503072cb..4fe9d8fae 100644 --- a/src/mathlib/mathlib_base.cpp +++ b/src/mathlib/mathlib_base.cpp @@ -21,13 +21,6 @@ #include "mathlib/mathlib.h" #include "mathlib/vector.h" -#if !defined( _X360 ) -#include "mathlib/amd3dx.h" -#ifndef OSX -#include "3dnow.h" -#endif -#include "sse.h" -#endif #include "mathlib/ssemath.h" #include "mathlib/ssequaternion.h" @@ -47,83 +40,6 @@ const QAngle vec3_angle(0,0,0); const Vector vec3_invalid( FLT_MAX, FLT_MAX, FLT_MAX ); const int nanmask = 255<<23; -//----------------------------------------------------------------------------- -// Standard C implementations of optimized routines: -//----------------------------------------------------------------------------- -float _sqrtf(float _X) -{ - Assert( s_bMathlibInitialized ); - return sqrtf(_X); -} - -float _rsqrtf(float x) -{ - Assert( s_bMathlibInitialized ); - - return 1.f / _sqrtf( x ); -} - -float FASTCALL _VectorNormalize (Vector& vec) -{ -#ifdef _VPROF_MATHLIB - VPROF_BUDGET( "_VectorNormalize", "Mathlib" ); -#endif - Assert( s_bMathlibInitialized ); - float radius = sqrtf(vec.x*vec.x + vec.y*vec.y + vec.z*vec.z); - - // FLT_EPSILON is added to the radius to eliminate the possibility of divide by zero. - float iradius = 1.f / ( radius + FLT_EPSILON ); - - vec.x *= iradius; - vec.y *= iradius; - vec.z *= iradius; - - return radius; -} - -// TODO: Add fast C VectorNormalizeFast. -// Perhaps use approximate rsqrt trick, if the accuracy isn't too bad. -void FASTCALL _VectorNormalizeFast (Vector& vec) -{ - Assert( s_bMathlibInitialized ); - - // FLT_EPSILON is added to the radius to eliminate the possibility of divide by zero. - float iradius = 1.f / ( sqrtf(vec.x*vec.x + vec.y*vec.y + vec.z*vec.z) + FLT_EPSILON ); - - vec.x *= iradius; - vec.y *= iradius; - vec.z *= iradius; - -} - -float _InvRSquared(const float* v) -{ - Assert( s_bMathlibInitialized ); - float r2 = DotProduct(v, v); - return r2 < 1.f ? 
1.f : 1/r2; -} - -//----------------------------------------------------------------------------- -// Function pointers selecting the appropriate implementation -//----------------------------------------------------------------------------- -float (*pfSqrt)(float x) = _sqrtf; -float (*pfRSqrt)(float x) = _rsqrtf; -float (*pfRSqrtFast)(float x) = _rsqrtf; -float (FASTCALL *pfVectorNormalize)(Vector& v) = _VectorNormalize; -void (FASTCALL *pfVectorNormalizeFast)(Vector& v) = _VectorNormalizeFast; -float (*pfInvRSquared)(const float* v) = _InvRSquared; -void (*pfFastSinCos)(float x, float* s, float* c) = SinCos; -float (*pfFastCos)(float x) = cosf; - -float SinCosTable[SIN_TABLE_SIZE]; -void InitSinCosTable() -{ - for( int i = 0; i < SIN_TABLE_SIZE; i++ ) - { - SinCosTable[i] = sin(i * 2.0 * M_PI / SIN_TABLE_SIZE); - } -} - qboolean VectorsEqual( const float *v1, const float *v2 ) { Assert( s_bMathlibInitialized ); @@ -1200,7 +1116,7 @@ void AngleMatrix( const QAngle &angles, matrix3x4_t& matrix ) float sr, sp, sy, cr, cp, cy; -#ifdef _X360 +#if defined(_X360) || USE_DXMATH fltx4 radians, scale, sine, cosine; radians = LoadUnaligned3SIMD( angles.Base() ); scale = ReplicateX4( M_PI_F / 180.f ); @@ -2024,7 +1940,7 @@ void AngleQuaternion( const RadianEuler &angles, Quaternion &outQuat ) float sr, sp, sy, cr, cp, cy; -#ifdef _X360 +#if defined(_X360) || USE_DXMATH fltx4 radians, scale, sine, cosine; radians = LoadUnaligned3SIMD( &angles.x ); scale = ReplicateX4( 0.5f ); @@ -2068,7 +1984,7 @@ void AngleQuaternion( const QAngle &angles, Quaternion &outQuat ) float sr, sp, sy, cr, cp, cy; -#ifdef _X360 +#if defined(_X360) fltx4 radians, scale, sine, cosine; radians = LoadUnaligned3SIMD( angles.Base() ); scale = ReplicateX4( 0.5f * M_PI_F / 180.f ); @@ -3317,92 +3233,14 @@ void MathLib_Init( float gamma, float texGamma, float brightness, int overbright // FIXME: Hook SSE into VectorAligned + Vector4DAligned #if !defined( _X360 ) - // Grab the processor information: - const CPUInformation& pi = *GetCPUInformation(); - - // Select the default generic routines. - pfSqrt = _sqrtf; - pfRSqrt = _rsqrtf; - pfRSqrtFast = _rsqrtf; - pfVectorNormalize = _VectorNormalize; - pfVectorNormalizeFast = _VectorNormalizeFast; - pfInvRSquared = _InvRSquared; - pfFastSinCos = SinCos; - pfFastCos = cosf; - - if ( bAllowMMX && pi.m_bMMX ) - { - // Select the MMX specific routines if available - // (MMX routines were used by SW span fillers - not currently used for HW) - s_bMMXEnabled = true; - } - else - { - s_bMMXEnabled = false; - } - - // SSE Generally performs better than 3DNow when present, so this is placed - // first to allow SSE to override these settings. -#if !defined( OSX ) && !defined( PLATFORM_WINDOWS_PC64 ) && !defined(LINUX) - if ( bAllow3DNow && pi.m_b3DNow ) - { - s_b3DNowEnabled = true; - - // Select the 3DNow specific routines if available; - pfVectorNormalize = _3DNow_VectorNormalize; - pfVectorNormalizeFast = _3DNow_VectorNormalizeFast; - pfInvRSquared = _3DNow_InvRSquared; - pfSqrt = _3DNow_Sqrt; - pfRSqrt = _3DNow_RSqrt; - pfRSqrtFast = _3DNow_RSqrt; - } - else -#endif - { - s_b3DNowEnabled = false; - } - - if ( bAllowSSE && pi.m_bSSE ) - { - s_bSSEEnabled = true; - -#ifndef PLATFORM_WINDOWS_PC64 - // These are not yet available. 
- // Select the SSE specific routines if available - pfVectorNormalize = _VectorNormalize; - pfVectorNormalizeFast = _SSE_VectorNormalizeFast; - pfInvRSquared = _SSE_InvRSquared; - pfSqrt = _SSE_Sqrt; - pfRSqrt = _SSE_RSqrtAccurate; - pfRSqrtFast = _SSE_RSqrtFast; -#endif -#ifdef PLATFORM_WINDOWS_PC32 - pfFastSinCos = _SSE_SinCos; - pfFastCos = _SSE_cos; -#endif - } - else - { - s_bSSEEnabled = false; - } - - if ( bAllowSSE2 && pi.m_bSSE2 ) - { - s_bSSE2Enabled = true; -#ifdef PLATFORM_WINDOWS_PC32 - pfFastSinCos = _SSE2_SinCos; - pfFastCos = _SSE2_cos; -#endif - } - else - { - s_bSSE2Enabled = false; - } + s_b3DNowEnabled = false; + s_bMMXEnabled = false; + s_bSSEEnabled = true; + s_bSSE2Enabled = true; #endif // !_X360 s_bMathlibInitialized = true; - InitSinCosTable(); BuildGammaTable( gamma, texGamma, brightness, overbright ); } diff --git a/src/mathlib/sse.cpp b/src/mathlib/sse.cpp deleted file mode 100644 index 018a7a5b9..000000000 --- a/src/mathlib/sse.cpp +++ /dev/null @@ -1,1107 +0,0 @@ -//========= Copyright Valve Corporation, All rights reserved. ============// -// -// Purpose: SSE Math primitives. -// -//=====================================================================================// - -#include -#include // Needed for FLT_EPSILON -#include "basetypes.h" -#include -#include "tier0/dbg.h" -#include "mathlib/mathlib.h" -#include "mathlib/vector.h" -#include "sse.h" - -// memdbgon must be the last include file in a .cpp file!!! -#include "tier0/memdbgon.h" - -#ifndef COMPILER_MSVC64 -// Implement for 64-bit Windows if needed. - -static const uint32 _sincos_masks[] = { (uint32)0x0, (uint32)~0x0 }; -static const uint32 _sincos_inv_masks[] = { (uint32)~0x0, (uint32)0x0 }; - -//----------------------------------------------------------------------------- -// Macros and constants required by some of the SSE assembly: -//----------------------------------------------------------------------------- - -#ifdef _WIN32 - #define _PS_EXTERN_CONST(Name, Val) \ - const __declspec(align(16)) float _ps_##Name[4] = { Val, Val, Val, Val } - - #define _PS_EXTERN_CONST_TYPE(Name, Type, Val) \ - const __declspec(align(16)) Type _ps_##Name[4] = { Val, Val, Val, Val }; \ - - #define _EPI32_CONST(Name, Val) \ - static const __declspec(align(16)) __int32 _epi32_##Name[4] = { Val, Val, Val, Val } - - #define _PS_CONST(Name, Val) \ - static const __declspec(align(16)) float _ps_##Name[4] = { Val, Val, Val, Val } -#elif POSIX - #define _PS_EXTERN_CONST(Name, Val) \ - const float _ps_##Name[4] __attribute__((aligned(16))) = { Val, Val, Val, Val } - - #define _PS_EXTERN_CONST_TYPE(Name, Type, Val) \ - const Type _ps_##Name[4] __attribute__((aligned(16))) = { Val, Val, Val, Val }; \ - - #define _EPI32_CONST(Name, Val) \ - static const int32 _epi32_##Name[4] __attribute__((aligned(16))) = { Val, Val, Val, Val } - - #define _PS_CONST(Name, Val) \ - static const float _ps_##Name[4] __attribute__((aligned(16))) = { Val, Val, Val, Val } -#endif - -_PS_EXTERN_CONST(am_0, 0.0f); -_PS_EXTERN_CONST(am_1, 1.0f); -_PS_EXTERN_CONST(am_m1, -1.0f); -_PS_EXTERN_CONST(am_0p5, 0.5f); -_PS_EXTERN_CONST(am_1p5, 1.5f); -_PS_EXTERN_CONST(am_pi, (float)M_PI); -_PS_EXTERN_CONST(am_pi_o_2, (float)(M_PI / 2.0)); -_PS_EXTERN_CONST(am_2_o_pi, (float)(2.0 / M_PI)); -_PS_EXTERN_CONST(am_pi_o_4, (float)(M_PI / 4.0)); -_PS_EXTERN_CONST(am_4_o_pi, (float)(4.0 / M_PI)); -_PS_EXTERN_CONST_TYPE(am_sign_mask, uint32, 0x80000000); -_PS_EXTERN_CONST_TYPE(am_inv_sign_mask, uint32, ~0x80000000); -_PS_EXTERN_CONST_TYPE(am_min_norm_pos,uint32, 
0x00800000); -_PS_EXTERN_CONST_TYPE(am_mant_mask, uint32, 0x7f800000); -_PS_EXTERN_CONST_TYPE(am_inv_mant_mask, int32, ~0x7f800000); - -_EPI32_CONST(1, 1); -_EPI32_CONST(2, 2); - -_PS_CONST(sincos_p0, 0.15707963267948963959e1f); -_PS_CONST(sincos_p1, -0.64596409750621907082e0f); -_PS_CONST(sincos_p2, 0.7969262624561800806e-1f); -_PS_CONST(sincos_p3, -0.468175413106023168e-2f); - -#ifdef PFN_VECTORMA -void __cdecl _SSE_VectorMA( const float *start, float scale, const float *direction, float *dest ); -#endif - -//----------------------------------------------------------------------------- -// SSE implementations of optimized routines: -//----------------------------------------------------------------------------- -float _SSE_Sqrt(float x) -{ - Assert( s_bMathlibInitialized ); - float root = 0.f; -#ifdef _WIN32 - _asm - { - sqrtss xmm0, x - movss root, xmm0 - } -#elif POSIX - _mm_store_ss( &root, _mm_sqrt_ss( _mm_load_ss( &x ) ) ); -#endif - return root; -} - -// Single iteration NewtonRaphson reciprocal square root: -// 0.5 * rsqrtps * (3 - x * rsqrtps(x) * rsqrtps(x)) -// Very low error, and fine to use in place of 1.f / sqrtf(x). -#if 0 -float _SSE_RSqrtAccurate(float x) -{ - Assert( s_bMathlibInitialized ); - - float rroot; - _asm - { - rsqrtss xmm0, x - movss rroot, xmm0 - } - - return (0.5f * rroot) * (3.f - (x * rroot) * rroot); -} -#else - -#ifdef POSIX -const __m128 f3 = _mm_set_ss(3.0f); // 3 as SSE value -const __m128 f05 = _mm_set_ss(0.5f); // 0.5 as SSE value -#endif - -// Intel / Kipps SSE RSqrt. Significantly faster than above. -float _SSE_RSqrtAccurate(float a) -{ - -#ifdef _WIN32 - float x; - float half = 0.5f; - float three = 3.f; - - __asm - { - movss xmm3, a; - movss xmm1, half; - movss xmm2, three; - rsqrtss xmm0, xmm3; - - mulss xmm3, xmm0; - mulss xmm1, xmm0; - mulss xmm3, xmm0; - subss xmm2, xmm3; - mulss xmm1, xmm2; - - movss x, xmm1; - } - - return x; -#elif POSIX - __m128 xx = _mm_load_ss( &a ); - __m128 xr = _mm_rsqrt_ss( xx ); - __m128 xt; - - xt = _mm_mul_ss( xr, xr ); - xt = _mm_mul_ss( xt, xx ); - xt = _mm_sub_ss( f3, xt ); - xt = _mm_mul_ss( xt, f05 ); - xr = _mm_mul_ss( xr, xt ); - - _mm_store_ss( &a, xr ); - return a; -#else - #error "Not Implemented" -#endif - -} -#endif - -// Simple SSE rsqrt. Usually accurate to around 6 (relative) decimal places -// or so, so ok for closed transforms. (ie, computing lighting normals) -float _SSE_RSqrtFast(float x) -{ - Assert( s_bMathlibInitialized ); - - float rroot; -#ifdef _WIN32 - _asm - { - rsqrtss xmm0, x - movss rroot, xmm0 - } -#elif POSIX - __asm__ __volatile__( "rsqrtss %0, %1" : "=x" (rroot) : "x" (x) ); -#else -#error -#endif - - return rroot; -} - -float FASTCALL _SSE_VectorNormalize (Vector& vec) -{ - Assert( s_bMathlibInitialized ); - - // NOTE: This is necessary to prevent an memory overwrite... - // sice vec only has 3 floats, we can't "movaps" directly into it. -#ifdef _WIN32 - __declspec(align(16)) float result[4]; -#elif POSIX - float result[4] __attribute__((aligned(16))); -#endif - - float *v = &vec[0]; -#ifdef _WIN32 - float *r = &result[0]; -#endif - - float radius = 0.f; - // Blah, get rid of these comparisons ... in reality, if you have all 3 as zero, it shouldn't - // be much of a performance win, considering you will very likely miss 3 branch predicts in a row. 
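For readability, here is a rough intrinsics transcription of what the inline assembly below computes (a sketch only; the function name is illustrative and not part of this file):

    #include <xmmintrin.h>

    static float NormalizeSketch( float v[3] )
    {
        __m128 x   = _mm_set_ps( 0.0f, v[2], v[1], v[0] );            // vz, vy, vx in lanes 2..0
        __m128 sq  = _mm_mul_ps( x, x );
        float len2 = _mm_cvtss_f32( sq )                               // vx*vx
                   + _mm_cvtss_f32( _mm_shuffle_ps( sq, sq, 1 ) )      // + vy*vy
                   + _mm_cvtss_f32( _mm_movehl_ps( sq, sq ) );         // + vz*vz
        float len  = _mm_cvtss_f32( _mm_sqrt_ss( _mm_set_ss( len2 ) ) );
        float inv  = _mm_cvtss_f32( _mm_rcp_ss( _mm_set_ss( len ) ) ); // rcpss: approximate 1/len
        v[0] *= inv; v[1] *= inv; v[2] *= inv;
        return len;
    }

Like the rcpss in the assembly, the approximate reciprocal trades a little precision in the scale for speed, while the returned length itself comes from a full sqrtss.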
- if ( v[0] || v[1] || v[2] ) - { -#ifdef _WIN32 - _asm - { - mov eax, v - mov edx, r -#ifdef ALIGNED_VECTOR - movaps xmm4, [eax] // r4 = vx, vy, vz, X - movaps xmm1, xmm4 // r1 = r4 -#else - movups xmm4, [eax] // r4 = vx, vy, vz, X - movaps xmm1, xmm4 // r1 = r4 -#endif - mulps xmm1, xmm4 // r1 = vx * vx, vy * vy, vz * vz, X - movhlps xmm3, xmm1 // r3 = vz * vz, X, X, X - movaps xmm2, xmm1 // r2 = r1 - shufps xmm2, xmm2, 1 // r2 = vy * vy, X, X, X - addss xmm1, xmm2 // r1 = (vx * vx) + (vy * vy), X, X, X - addss xmm1, xmm3 // r1 = (vx * vx) + (vy * vy) + (vz * vz), X, X, X - sqrtss xmm1, xmm1 // r1 = sqrt((vx * vx) + (vy * vy) + (vz * vz)), X, X, X - movss radius, xmm1 // radius = sqrt((vx * vx) + (vy * vy) + (vz * vz)) - rcpss xmm1, xmm1 // r1 = 1/radius, X, X, X - shufps xmm1, xmm1, 0 // r1 = 1/radius, 1/radius, 1/radius, X - mulps xmm4, xmm1 // r4 = vx * 1/radius, vy * 1/radius, vz * 1/radius, X - movaps [edx], xmm4 // v = vx * 1/radius, vy * 1/radius, vz * 1/radius, X - } -#elif POSIX - __asm__ __volatile__( -#ifdef ALIGNED_VECTOR - "movaps %2, %%xmm4 \n\t" - "movaps %%xmm4, %%xmm1 \n\t" -#else - "movups %2, %%xmm4 \n\t" - "movaps %%xmm4, %%xmm1 \n\t" -#endif - "mulps %%xmm4, %%xmm1 \n\t" - "movhlps %%xmm1, %%xmm3 \n\t" - "movaps %%xmm1, %%xmm2 \n\t" - "shufps $1, %%xmm2, %%xmm2 \n\t" - "addss %%xmm2, %%xmm1 \n\t" - "addss %%xmm3, %%xmm1 \n\t" - "sqrtss %%xmm1, %%xmm1 \n\t" - "movss %%xmm1, %0 \n\t" - "rcpss %%xmm1, %%xmm1 \n\t" - "shufps $0, %%xmm1, %%xmm1 \n\t" - "mulps %%xmm1, %%xmm4 \n\t" - "movaps %%xmm4, %1 \n\t" - : "=m" (radius), "=m" (result) - : "m" (*v) - : "xmm1", "xmm2", "xmm3", "xmm4" - ); -#else - #error "Not Implemented" -#endif - vec.x = result[0]; - vec.y = result[1]; - vec.z = result[2]; - - } - - return radius; -} - -void FASTCALL _SSE_VectorNormalizeFast (Vector& vec) -{ - float ool = _SSE_RSqrtAccurate( FLT_EPSILON + vec.x * vec.x + vec.y * vec.y + vec.z * vec.z ); - - vec.x *= ool; - vec.y *= ool; - vec.z *= ool; -} - -float _SSE_InvRSquared(const float* v) -{ - float inv_r2 = 1.f; -#ifdef _WIN32 - _asm { // Intel SSE only routine - mov eax, v - movss xmm5, inv_r2 // x5 = 1.0, 0, 0, 0 -#ifdef ALIGNED_VECTOR - movaps xmm4, [eax] // x4 = vx, vy, vz, X -#else - movups xmm4, [eax] // x4 = vx, vy, vz, X -#endif - movaps xmm1, xmm4 // x1 = x4 - mulps xmm1, xmm4 // x1 = vx * vx, vy * vy, vz * vz, X - movhlps xmm3, xmm1 // x3 = vz * vz, X, X, X - movaps xmm2, xmm1 // x2 = x1 - shufps xmm2, xmm2, 1 // x2 = vy * vy, X, X, X - addss xmm1, xmm2 // x1 = (vx * vx) + (vy * vy), X, X, X - addss xmm1, xmm3 // x1 = (vx * vx) + (vy * vy) + (vz * vz), X, X, X - maxss xmm1, xmm5 // x1 = max( 1.0, x1 ) - rcpss xmm0, xmm1 // x0 = 1 / max( 1.0, x1 ) - movss inv_r2, xmm0 // inv_r2 = x0 - } -#elif POSIX - __asm__ __volatile__( - "movss %0, %%xmm5 \n\t" -#ifdef ALIGNED_VECTOR - "movaps %1, %%xmm4 \n\t" -#else - "movups %1, %%xmm4 \n\t" -#endif - "movaps %%xmm4, %%xmm1 \n\t" - "mulps %%xmm4, %%xmm1 \n\t" - "movhlps %%xmm1, %%xmm3 \n\t" - "movaps %%xmm1, %%xmm2 \n\t" - "shufps $1, %%xmm2, %%xmm2 \n\t" - "addss %%xmm2, %%xmm1 \n\t" - "addss %%xmm3, %%xmm1 \n\t" - "maxss %%xmm5, %%xmm1 \n\t" - "rcpss %%xmm1, %%xmm0 \n\t" - "movss %%xmm0, %0 \n\t" - : "+m" (inv_r2) - : "m" (*v) - : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); -#else - #error "Not Implemented" -#endif - - return inv_r2; -} - - -#ifdef POSIX -// #define _PS_CONST(Name, Val) static const ALIGN16 float _ps_##Name[4] ALIGN16_POST = { Val, Val, Val, Val } -#define _PS_CONST_TYPE(Name, Type, Val) static const ALIGN16 Type 
_ps_##Name[4] ALIGN16_POST = { Val, Val, Val, Val } - -_PS_CONST_TYPE(sign_mask, int, (int)0x80000000); -_PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000); - - -#define _PI32_CONST(Name, Val) static const ALIGN16 int _pi32_##Name[4] ALIGN16_POST = { Val, Val, Val, Val } - -_PI32_CONST(1, 1); -_PI32_CONST(inv1, ~1); -_PI32_CONST(2, 2); -_PI32_CONST(4, 4); -_PI32_CONST(0x7f, 0x7f); -_PS_CONST(1 , 1.0f); -_PS_CONST(0p5, 0.5f); - -_PS_CONST(minus_cephes_DP1, -0.78515625); -_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4); -_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8); -_PS_CONST(sincof_p0, -1.9515295891E-4); -_PS_CONST(sincof_p1, 8.3321608736E-3); -_PS_CONST(sincof_p2, -1.6666654611E-1); -_PS_CONST(coscof_p0, 2.443315711809948E-005); -_PS_CONST(coscof_p1, -1.388731625493765E-003); -_PS_CONST(coscof_p2, 4.166664568298827E-002); -_PS_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI - -typedef union xmm_mm_union { - __m128 xmm; - __m64 mm[2]; -} xmm_mm_union; - -#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) { xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm; } - -typedef __m128 v4sf; // vector of 4 float (sse1) -typedef __m64 v2si; // vector of 2 int (mmx) - -#endif - -void _SSE_SinCos(float x, float* s, float* c) -{ -#ifdef _WIN32 - float t4, t8, t12; - - __asm - { - movss xmm0, x - movss t12, xmm0 - movss xmm1, _ps_am_inv_sign_mask - mov eax, t12 - mulss xmm0, _ps_am_2_o_pi - andps xmm0, xmm1 - and eax, 0x80000000 - - cvttss2si edx, xmm0 - mov ecx, edx - mov t12, esi - mov esi, edx - add edx, 0x1 - shl ecx, (31 - 1) - shl edx, (31 - 1) - - movss xmm4, _ps_am_1 - cvtsi2ss xmm3, esi - mov t8, eax - and esi, 0x1 - - subss xmm0, xmm3 - movss xmm3, _sincos_inv_masks[esi * 4] - minss xmm0, xmm4 - - subss xmm4, xmm0 - - movss xmm6, xmm4 - andps xmm4, xmm3 - and ecx, 0x80000000 - movss xmm2, xmm3 - andnps xmm3, xmm0 - and edx, 0x80000000 - movss xmm7, t8 - andps xmm0, xmm2 - mov t8, ecx - mov t4, edx - orps xmm4, xmm3 - - mov eax, s //mov eax, [esp + 4 + 16] - mov edx, c //mov edx, [esp + 4 + 16 + 4] - - andnps xmm2, xmm6 - orps xmm0, xmm2 - - movss xmm2, t8 - movss xmm1, xmm0 - movss xmm5, xmm4 - xorps xmm7, xmm2 - movss xmm3, _ps_sincos_p3 - mulss xmm0, xmm0 - mulss xmm4, xmm4 - movss xmm2, xmm0 - movss xmm6, xmm4 - orps xmm1, xmm7 - movss xmm7, _ps_sincos_p2 - mulss xmm0, xmm3 - mulss xmm4, xmm3 - movss xmm3, _ps_sincos_p1 - addss xmm0, xmm7 - addss xmm4, xmm7 - movss xmm7, _ps_sincos_p0 - mulss xmm0, xmm2 - mulss xmm4, xmm6 - addss xmm0, xmm3 - addss xmm4, xmm3 - movss xmm3, t4 - mulss xmm0, xmm2 - mulss xmm4, xmm6 - orps xmm5, xmm3 - mov esi, t12 - addss xmm0, xmm7 - addss xmm4, xmm7 - mulss xmm0, xmm1 - mulss xmm4, xmm5 - - // use full stores since caller might reload with full loads - movss [eax], xmm0 - movss [edx], xmm4 - } -#elif POSIX - - Assert( "Needs testing, verify impl!\n" ); - - v4sf xx = _mm_load_ss( &x ); - - v4sf xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y; - v2si mm0, mm1, mm2, mm3, mm4, mm5; - sign_bit_sin = xx; - /* take the absolute value */ - xx = _mm_and_ps(xx, *(v4sf*)_ps_inv_sign_mask); - /* extract the sign bit (upper one) */ - sign_bit_sin = _mm_and_ps(sign_bit_sin, *(v4sf*)_ps_sign_mask); - - /* scale by 4/Pi */ - y = _mm_mul_ps(xx, *(v4sf*)_ps_cephes_FOPI); - - /* store the integer part of y in mm2:mm3 */ - xmm3 = _mm_movehl_ps(xmm3, y); - mm2 = _mm_cvttps_pi32(y); - mm3 = _mm_cvttps_pi32(xmm3); - - /* j=(j+1) & (~1) (see the cephes sources) */ - mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1); - mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1); - mm2 = 
_mm_and_si64(mm2, *(v2si*)_pi32_inv1); - mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1); - - y = _mm_cvtpi32x2_ps(mm2, mm3); - - mm4 = mm2; - mm5 = mm3; - - /* get the swap sign flag for the sine */ - mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4); - mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4); - mm0 = _mm_slli_pi32(mm0, 29); - mm1 = _mm_slli_pi32(mm1, 29); - v4sf swap_sign_bit_sin; - COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin); - - /* get the polynom selection mask for the sine */ - - mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2); - mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2); - mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64()); - mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64()); - v4sf poly_mask; - COPY_MM_TO_XMM(mm2, mm3, poly_mask); - - /* The magic pass: "Extended precision modular arithmetic" - x = ((x - y * DP1) - y * DP2) - y * DP3; */ - xmm1 = *(v4sf*)_ps_minus_cephes_DP1; - xmm2 = *(v4sf*)_ps_minus_cephes_DP2; - xmm3 = *(v4sf*)_ps_minus_cephes_DP3; - xmm1 = _mm_mul_ps(y, xmm1); - xmm2 = _mm_mul_ps(y, xmm2); - xmm3 = _mm_mul_ps(y, xmm3); - xx = _mm_add_ps(xx, xmm1); - xx = _mm_add_ps(xx, xmm2); - xx = _mm_add_ps(xx, xmm3); - - /* get the sign flag for the cosine */ - mm4 = _mm_sub_pi32(mm4, *(v2si*)_pi32_2); - mm5 = _mm_sub_pi32(mm5, *(v2si*)_pi32_2); - mm4 = _mm_andnot_si64(mm4, *(v2si*)_pi32_4); - mm5 = _mm_andnot_si64(mm5, *(v2si*)_pi32_4); - mm4 = _mm_slli_pi32(mm4, 29); - mm5 = _mm_slli_pi32(mm5, 29); - v4sf sign_bit_cos; - COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos); - _mm_empty(); /* good-bye mmx */ - - sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin); - - - /* Evaluate the first polynom (0 <= x <= Pi/4) */ - v4sf z = _mm_mul_ps(xx,xx); - y = *(v4sf*)_ps_coscof_p0; - - y = _mm_mul_ps(y, z); - y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1); - y = _mm_mul_ps(y, z); - y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2); - y = _mm_mul_ps(y, z); - y = _mm_mul_ps(y, z); - v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5); - y = _mm_sub_ps(y, tmp); - y = _mm_add_ps(y, *(v4sf*)_ps_1); - - /* Evaluate the second polynom (Pi/4 <= x <= 0) */ - - v4sf y2 = *(v4sf*)_ps_sincof_p0; - y2 = _mm_mul_ps(y2, z); - y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1); - y2 = _mm_mul_ps(y2, z); - y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2); - y2 = _mm_mul_ps(y2, z); - y2 = _mm_mul_ps(y2, xx); - y2 = _mm_add_ps(y2, xx); - - /* select the correct result from the two polynoms */ - xmm3 = poly_mask; - v4sf ysin2 = _mm_and_ps(xmm3, y2); - v4sf ysin1 = _mm_andnot_ps(xmm3, y); - y2 = _mm_sub_ps(y2,ysin2); - y = _mm_sub_ps(y, ysin1); - - xmm1 = _mm_add_ps(ysin1,ysin2); - xmm2 = _mm_add_ps(y,y2); - - /* update the sign */ - _mm_store_ss( s, _mm_xor_ps(xmm1, sign_bit_sin) ); - _mm_store_ss( c, _mm_xor_ps(xmm2, sign_bit_cos) ); - -#else - #error "Not Implemented" -#endif -} - -float _SSE_cos( float x ) -{ -#ifdef _WIN32 - float temp; - __asm - { - movss xmm0, x - movss xmm1, _ps_am_inv_sign_mask - andps xmm0, xmm1 - addss xmm0, _ps_am_pi_o_2 - mulss xmm0, _ps_am_2_o_pi - - cvttss2si ecx, xmm0 - movss xmm5, _ps_am_1 - mov edx, ecx - shl edx, (31 - 1) - cvtsi2ss xmm1, ecx - and edx, 0x80000000 - and ecx, 0x1 - - subss xmm0, xmm1 - movss xmm6, _sincos_masks[ecx * 4] - minss xmm0, xmm5 - - movss xmm1, _ps_sincos_p3 - subss xmm5, xmm0 - - andps xmm5, xmm6 - movss xmm7, _ps_sincos_p2 - andnps xmm6, xmm0 - mov temp, edx - orps xmm5, xmm6 - movss xmm0, xmm5 - - mulss xmm5, xmm5 - movss xmm4, _ps_sincos_p1 - movss xmm2, xmm5 - mulss xmm5, xmm1 - movss xmm1, _ps_sincos_p0 - addss xmm5, xmm7 - mulss xmm5, xmm2 - movss xmm3, temp - addss xmm5, xmm4 - mulss xmm5, xmm2 - orps 
xmm0, xmm3 - addss xmm5, xmm1 - mulss xmm0, xmm5 - - movss x, xmm0 - - } -#elif POSIX - - Assert( "Needs testing, verify impl!\n" ); - - v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, y; - v2si mm0, mm1, mm2, mm3; - /* take the absolute value */ - v4sf xx = _mm_load_ss( &x ); - - xx = _mm_and_ps(xx, *(v4sf*)_ps_inv_sign_mask); - - /* scale by 4/Pi */ - y = _mm_mul_ps(xx, *(v4sf*)_ps_cephes_FOPI); - - /* store the integer part of y in mm0:mm1 */ - xmm2 = _mm_movehl_ps(xmm2, y); - mm2 = _mm_cvttps_pi32(y); - mm3 = _mm_cvttps_pi32(xmm2); - - /* j=(j+1) & (~1) (see the cephes sources) */ - mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1); - mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1); - mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1); - mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1); - - y = _mm_cvtpi32x2_ps(mm2, mm3); - - - mm2 = _mm_sub_pi32(mm2, *(v2si*)_pi32_2); - mm3 = _mm_sub_pi32(mm3, *(v2si*)_pi32_2); - - /* get the swap sign flag in mm0:mm1 and the - polynom selection mask in mm2:mm3 */ - - mm0 = _mm_andnot_si64(mm2, *(v2si*)_pi32_4); - mm1 = _mm_andnot_si64(mm3, *(v2si*)_pi32_4); - mm0 = _mm_slli_pi32(mm0, 29); - mm1 = _mm_slli_pi32(mm1, 29); - - mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2); - mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2); - - mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64()); - mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64()); - - v4sf sign_bit, poly_mask; - COPY_MM_TO_XMM(mm0, mm1, sign_bit); - COPY_MM_TO_XMM(mm2, mm3, poly_mask); - _mm_empty(); /* good-bye mmx */ - - /* The magic pass: "Extended precision modular arithmetic" - x = ((x - y * DP1) - y * DP2) - y * DP3; */ - xmm1 = *(v4sf*)_ps_minus_cephes_DP1; - xmm2 = *(v4sf*)_ps_minus_cephes_DP2; - xmm3 = *(v4sf*)_ps_minus_cephes_DP3; - xmm1 = _mm_mul_ps(y, xmm1); - xmm2 = _mm_mul_ps(y, xmm2); - xmm3 = _mm_mul_ps(y, xmm3); - xx = _mm_add_ps(xx, xmm1); - xx = _mm_add_ps(xx, xmm2); - xx = _mm_add_ps(xx, xmm3); - - /* Evaluate the first polynom (0 <= x <= Pi/4) */ - y = *(v4sf*)_ps_coscof_p0; - v4sf z = _mm_mul_ps(xx,xx); - - y = _mm_mul_ps(y, z); - y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1); - y = _mm_mul_ps(y, z); - y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2); - y = _mm_mul_ps(y, z); - y = _mm_mul_ps(y, z); - v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5); - y = _mm_sub_ps(y, tmp); - y = _mm_add_ps(y, *(v4sf*)_ps_1); - - /* Evaluate the second polynom (Pi/4 <= x <= 0) */ - - v4sf y2 = *(v4sf*)_ps_sincof_p0; - y2 = _mm_mul_ps(y2, z); - y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1); - y2 = _mm_mul_ps(y2, z); - y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2); - y2 = _mm_mul_ps(y2, z); - y2 = _mm_mul_ps(y2, xx); - y2 = _mm_add_ps(y2, xx); - - /* select the correct result from the two polynoms */ - xmm3 = poly_mask; - y2 = _mm_and_ps(xmm3, y2); //, xmm3); - y = _mm_andnot_ps(xmm3, y); - y = _mm_add_ps(y,y2); - /* update the sign */ - - _mm_store_ss( &x, _mm_xor_ps(y, sign_bit) ); - -#else - #error "Not Implemented" -#endif - - return x; -} - -//----------------------------------------------------------------------------- -// SSE2 implementations of optimized routines: -//----------------------------------------------------------------------------- -#ifdef PLATFORM_WINDOWS_PC32 -void _SSE2_SinCos(float x, float* s, float* c) // any x -{ -#ifdef _WIN32 - __asm - { - movss xmm0, x - movaps xmm7, xmm0 - movss xmm1, _ps_am_inv_sign_mask - movss xmm2, _ps_am_sign_mask - movss xmm3, _ps_am_2_o_pi - andps xmm0, xmm1 - andps xmm7, xmm2 - mulss xmm0, xmm3 - - pxor xmm3, xmm3 - movd xmm5, _epi32_1 - movss xmm4, _ps_am_1 - - cvttps2dq xmm2, xmm0 - pand xmm5, xmm2 - movd xmm1, _epi32_2 - 
pcmpeqd xmm5, xmm3 - movd xmm3, _epi32_1 - cvtdq2ps xmm6, xmm2 - paddd xmm3, xmm2 - pand xmm2, xmm1 - pand xmm3, xmm1 - subss xmm0, xmm6 - pslld xmm2, (31 - 1) - minss xmm0, xmm4 - - mov eax, s // mov eax, [esp + 4 + 16] - mov edx, c // mov edx, [esp + 4 + 16 + 4] - - subss xmm4, xmm0 - pslld xmm3, (31 - 1) - - movaps xmm6, xmm4 - xorps xmm2, xmm7 - movaps xmm7, xmm5 - andps xmm6, xmm7 - andnps xmm7, xmm0 - andps xmm0, xmm5 - andnps xmm5, xmm4 - movss xmm4, _ps_sincos_p3 - orps xmm6, xmm7 - orps xmm0, xmm5 - movss xmm5, _ps_sincos_p2 - - movaps xmm1, xmm0 - movaps xmm7, xmm6 - mulss xmm0, xmm0 - mulss xmm6, xmm6 - orps xmm1, xmm2 - orps xmm7, xmm3 - movaps xmm2, xmm0 - movaps xmm3, xmm6 - mulss xmm0, xmm4 - mulss xmm6, xmm4 - movss xmm4, _ps_sincos_p1 - addss xmm0, xmm5 - addss xmm6, xmm5 - movss xmm5, _ps_sincos_p0 - mulss xmm0, xmm2 - mulss xmm6, xmm3 - addss xmm0, xmm4 - addss xmm6, xmm4 - mulss xmm0, xmm2 - mulss xmm6, xmm3 - addss xmm0, xmm5 - addss xmm6, xmm5 - mulss xmm0, xmm1 - mulss xmm6, xmm7 - - // use full stores since caller might reload with full loads - movss [eax], xmm0 - movss [edx], xmm6 - } -#elif POSIX - #warning "_SSE2_SinCos NOT implemented!" - Assert( 0 ); -#else - #error "Not Implemented" -#endif -} -#endif // PLATFORM_WINDOWS_PC32 - -#ifdef PLATFORM_WINDOWS_PC32 -float _SSE2_cos(float x) -{ -#ifdef _WIN32 - __asm - { - movss xmm0, x - movss xmm1, _ps_am_inv_sign_mask - movss xmm2, _ps_am_pi_o_2 - movss xmm3, _ps_am_2_o_pi - andps xmm0, xmm1 - addss xmm0, xmm2 - mulss xmm0, xmm3 - - pxor xmm3, xmm3 - movd xmm5, _epi32_1 - movss xmm4, _ps_am_1 - cvttps2dq xmm2, xmm0 - pand xmm5, xmm2 - movd xmm1, _epi32_2 - pcmpeqd xmm5, xmm3 - cvtdq2ps xmm6, xmm2 - pand xmm2, xmm1 - pslld xmm2, (31 - 1) - - subss xmm0, xmm6 - movss xmm3, _ps_sincos_p3 - minss xmm0, xmm4 - subss xmm4, xmm0 - andps xmm0, xmm5 - andnps xmm5, xmm4 - orps xmm0, xmm5 - - movaps xmm1, xmm0 - movss xmm4, _ps_sincos_p2 - mulss xmm0, xmm0 - movss xmm5, _ps_sincos_p1 - orps xmm1, xmm2 - movaps xmm7, xmm0 - mulss xmm0, xmm3 - movss xmm6, _ps_sincos_p0 - addss xmm0, xmm4 - mulss xmm0, xmm7 - addss xmm0, xmm5 - mulss xmm0, xmm7 - addss xmm0, xmm6 - mulss xmm0, xmm1 - movss x, xmm0 - } -#elif POSIX - #warning "_SSE2_cos NOT implemented!" 
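The scalar SinCos/cos routines deleted in this file are superseded later in this patch by DirectXMath and sse_mathfun. For orientation, the replacement call shape looks like this (sketch; assumes DirectXMath is reachable through the new dxmath.h header):

    float s, c;
    DirectX::XMScalarSinCos( &s, &c, radians );   // one call computes both sine and cosine

    fltx4 sine, cosine;                           // and the four-wide case, radians4 being a fltx4 of angles
    SinCosSIMD( sine, cosine, radians4 );         // forwards to DirectX::XMVectorSinCos when USE_DXMATH is set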
- Assert( 0 ); -#else - #error "Not Implemented" -#endif - - return x; -} -#endif // PLATFORM_WINDOWS_PC32 - -#if 0 -// SSE Version of VectorTransform -void VectorTransformSSE(const float *in1, const matrix3x4_t& in2, float *out1) -{ - Assert( s_bMathlibInitialized ); - Assert( in1 != out1 ); - -#ifdef _WIN32 - __asm - { - mov eax, in1; - mov ecx, in2; - mov edx, out1; - - movss xmm0, [eax]; - mulss xmm0, [ecx]; - movss xmm1, [eax+4]; - mulss xmm1, [ecx+4]; - movss xmm2, [eax+8]; - mulss xmm2, [ecx+8]; - addss xmm0, xmm1; - addss xmm0, xmm2; - addss xmm0, [ecx+12] - movss [edx], xmm0; - add ecx, 16; - - movss xmm0, [eax]; - mulss xmm0, [ecx]; - movss xmm1, [eax+4]; - mulss xmm1, [ecx+4]; - movss xmm2, [eax+8]; - mulss xmm2, [ecx+8]; - addss xmm0, xmm1; - addss xmm0, xmm2; - addss xmm0, [ecx+12] - movss [edx+4], xmm0; - add ecx, 16; - - movss xmm0, [eax]; - mulss xmm0, [ecx]; - movss xmm1, [eax+4]; - mulss xmm1, [ecx+4]; - movss xmm2, [eax+8]; - mulss xmm2, [ecx+8]; - addss xmm0, xmm1; - addss xmm0, xmm2; - addss xmm0, [ecx+12] - movss [edx+8], xmm0; - } -#elif POSIX - #warning "VectorTransformSSE C implementation only" - out1[0] = DotProduct(in1, in2[0]) + in2[0][3]; - out1[1] = DotProduct(in1, in2[1]) + in2[1][3]; - out1[2] = DotProduct(in1, in2[2]) + in2[2][3]; -#else - #error "Not Implemented" -#endif -} -#endif - -#if 0 -void VectorRotateSSE( const float *in1, const matrix3x4_t& in2, float *out1 ) -{ - Assert( s_bMathlibInitialized ); - Assert( in1 != out1 ); - -#ifdef _WIN32 - __asm - { - mov eax, in1; - mov ecx, in2; - mov edx, out1; - - movss xmm0, [eax]; - mulss xmm0, [ecx]; - movss xmm1, [eax+4]; - mulss xmm1, [ecx+4]; - movss xmm2, [eax+8]; - mulss xmm2, [ecx+8]; - addss xmm0, xmm1; - addss xmm0, xmm2; - movss [edx], xmm0; - add ecx, 16; - - movss xmm0, [eax]; - mulss xmm0, [ecx]; - movss xmm1, [eax+4]; - mulss xmm1, [ecx+4]; - movss xmm2, [eax+8]; - mulss xmm2, [ecx+8]; - addss xmm0, xmm1; - addss xmm0, xmm2; - movss [edx+4], xmm0; - add ecx, 16; - - movss xmm0, [eax]; - mulss xmm0, [ecx]; - movss xmm1, [eax+4]; - mulss xmm1, [ecx+4]; - movss xmm2, [eax+8]; - mulss xmm2, [ecx+8]; - addss xmm0, xmm1; - addss xmm0, xmm2; - movss [edx+8], xmm0; - } -#elif POSIX - #warning "VectorRotateSSE C implementation only" - out1[0] = DotProduct( in1, in2[0] ); - out1[1] = DotProduct( in1, in2[1] ); - out1[2] = DotProduct( in1, in2[2] ); -#else - #error "Not Implemented" -#endif -} -#endif - -#ifdef _WIN32 -void _declspec(naked) _SSE_VectorMA( const float *start, float scale, const float *direction, float *dest ) -{ - // FIXME: This don't work!! 
It will overwrite memory in the write to dest - Assert(0); - - Assert( s_bMathlibInitialized ); - _asm { // Intel SSE only routine - mov eax, DWORD PTR [esp+0x04] ; *start, s0..s2 - mov ecx, DWORD PTR [esp+0x0c] ; *direction, d0..d2 - mov edx, DWORD PTR [esp+0x10] ; *dest - movss xmm2, [esp+0x08] ; x2 = scale, 0, 0, 0 -#ifdef ALIGNED_VECTOR - movaps xmm3, [ecx] ; x3 = dir0,dir1,dir2,X - pshufd xmm2, xmm2, 0 ; x2 = scale, scale, scale, scale - movaps xmm1, [eax] ; x1 = start1, start2, start3, X - mulps xmm3, xmm2 ; x3 *= x2 - addps xmm3, xmm1 ; x3 += x1 - movaps [edx], xmm3 ; *dest = x3 -#else - movups xmm3, [ecx] ; x3 = dir0,dir1,dir2,X - pshufd xmm2, xmm2, 0 ; x2 = scale, scale, scale, scale - movups xmm1, [eax] ; x1 = start1, start2, start3, X - mulps xmm3, xmm2 ; x3 *= x2 - addps xmm3, xmm1 ; x3 += x1 - movups [edx], xmm3 ; *dest = x3 -#endif - } -} -#endif - -#ifdef _WIN32 -#ifdef PFN_VECTORMA -void _declspec(naked) __cdecl _SSE_VectorMA( const Vector &start, float scale, const Vector &direction, Vector &dest ) -{ - // FIXME: This don't work!! It will overwrite memory in the write to dest - Assert(0); - - Assert( s_bMathlibInitialized ); - _asm - { - // Intel SSE only routine - mov eax, DWORD PTR [esp+0x04] ; *start, s0..s2 - mov ecx, DWORD PTR [esp+0x0c] ; *direction, d0..d2 - mov edx, DWORD PTR [esp+0x10] ; *dest - movss xmm2, [esp+0x08] ; x2 = scale, 0, 0, 0 -#ifdef ALIGNED_VECTOR - movaps xmm3, [ecx] ; x3 = dir0,dir1,dir2,X - pshufd xmm2, xmm2, 0 ; x2 = scale, scale, scale, scale - movaps xmm1, [eax] ; x1 = start1, start2, start3, X - mulps xmm3, xmm2 ; x3 *= x2 - addps xmm3, xmm1 ; x3 += x1 - movaps [edx], xmm3 ; *dest = x3 -#else - movups xmm3, [ecx] ; x3 = dir0,dir1,dir2,X - pshufd xmm2, xmm2, 0 ; x2 = scale, scale, scale, scale - movups xmm1, [eax] ; x1 = start1, start2, start3, X - mulps xmm3, xmm2 ; x3 *= x2 - addps xmm3, xmm1 ; x3 += x1 - movups [edx], xmm3 ; *dest = x3 -#endif - } -} -float (__cdecl *pfVectorMA)(Vector& v) = _VectorMA; -#endif -#endif - - -// SSE DotProduct -- it's a smidgen faster than the asm DotProduct... -// Should be validated too! :) -// NJS: (Nov 1 2002) -NOT- faster. may time a couple cycles faster in a single function like -// this, but when inlined, and instruction scheduled, the C version is faster. -// Verified this via VTune -/* -vec_t DotProduct (const vec_t *a, const vec_t *c) -{ - vec_t temp; - - __asm - { - mov eax, a; - mov ecx, c; - mov edx, DWORD PTR [temp] - movss xmm0, [eax]; - mulss xmm0, [ecx]; - movss xmm1, [eax+4]; - mulss xmm1, [ecx+4]; - movss xmm2, [eax+8]; - mulss xmm2, [ecx+8]; - addss xmm0, xmm1; - addss xmm0, xmm2; - movss [edx], xmm0; - fld DWORD PTR [edx]; - ret - } -} -*/ - -#endif // COMPILER_MSVC64 diff --git a/src/mathlib/sse.h b/src/mathlib/sse.h deleted file mode 100644 index 1b49c50c1..000000000 --- a/src/mathlib/sse.h +++ /dev/null @@ -1,27 +0,0 @@ -//========= Copyright Valve Corporation, All rights reserved. 
============// -// -// Purpose: -// -//=====================================================================================// - -#ifndef _SSE_H -#define _SSE_H - -float _SSE_Sqrt(float x); -float _SSE_RSqrtAccurate(float a); -float _SSE_RSqrtFast(float x); -float FASTCALL _SSE_VectorNormalize(Vector& vec); -void FASTCALL _SSE_VectorNormalizeFast(Vector& vec); -float _SSE_InvRSquared(const float* v); -void _SSE_SinCos(float x, float* s, float* c); -float _SSE_cos( float x); -#ifdef PLATFORM_WINDOWS_PC32 -void _SSE2_SinCos(float x, float* s, float* c); -float _SSE2_cos(float x); -#endif -#if 0 -void VectorTransformSSE(const float *in1, const matrix3x4_t& in2, float *out1); -void VectorRotateSSE( const float *in1, const matrix3x4_t& in2, float *out1 ); -#endif - -#endif // _SSE_H diff --git a/src/mathlib/sseconst.cpp b/src/mathlib/sseconst.cpp index d68588fdd..6c2c83ca1 100644 --- a/src/mathlib/sseconst.cpp +++ b/src/mathlib/sseconst.cpp @@ -17,6 +17,8 @@ const fltx4 Four_Threes={3.0,3.0,3.0,3.0}; const fltx4 Four_Fours={4.0,4.0,4.0,4.0}; const fltx4 Four_Origin={0,0,0,1}; const fltx4 Four_NegativeOnes={-1,-1,-1,-1}; +const fltx4 Four_DegToRad = { ((float)(M_PI_F / 180.f)), ((float)(M_PI_F / 180.f)), ((float)(M_PI_F / 180.f)), ((float)(M_PI_F / 180.f)) }; +const fltx4 Four_360 = { 360.f, 360.f, 360.f, 360.f }; const fltx4 Four_2ToThe21s={ (float) (1<<21), (float) (1<<21), (float) (1<<21), (float)(1<<21) }; const fltx4 Four_2ToThe22s={ (float) (1<<22), (float) (1<<22), (float) (1<<22), (float)(1<<22) }; @@ -58,6 +60,8 @@ const uint32 ALIGN16 g_SIMD_SkipTailMask[4][4] ALIGN16_POST = { 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000 }, }; +const int32 ALIGN16 g_SIMD_EveryOtherMask[4] = { 0, ~0, 0, ~0 }; + // FUNCTIONS // NOTE: WHY YOU **DO NOT** WANT TO PUT FUNCTIONS HERE diff --git a/src/public/bone_setup.cpp b/src/public/bone_setup.cpp index 35d72d5e3..27ab95dcf 100644 --- a/src/public/bone_setup.cpp +++ b/src/public/bone_setup.cpp @@ -430,7 +430,7 @@ void CalcBoneQuaternion( int frame, float s, AngleQuaternion( angle1, q1 ); AngleQuaternion( angle2, q2 ); - #ifdef _X360 + #if defined(_X360) || USE_DXMATH fltx4 q1simd, q2simd, qsimd; q1simd = LoadAlignedSIMD( q1 ); q2simd = LoadAlignedSIMD( q2 ); @@ -1370,7 +1370,476 @@ void WorldSpaceSlerp( g_MatrixPool.Free( targetBoneToWorld ); } +void SlerpBonesSpeedy( + const CStudioHdr* pStudioHdr, + QuaternionAligned q1[MAXSTUDIOBONES], + Vector pos1[MAXSTUDIOBONES], + mstudioseqdesc_t& seqdesc, // source of q2 and pos2 + int sequence, + const QuaternionAligned q2[MAXSTUDIOBONES], + const Vector pos2[MAXSTUDIOBONES], + float s, + int boneMask) +{ + if (s <= 0.0f) + return; + if (s > 1.0f) + { + s = 1.0f; + } + + if (seqdesc.flags & STUDIO_WORLD) + { + WorldSpaceSlerp(pStudioHdr, q1, pos1, seqdesc, sequence, q2, pos2, s, boneMask); + return; + } + int i; + virtualmodel_t* pVModel = pStudioHdr->GetVirtualModel(); + const virtualgroup_t* RESTRICT pSeqGroup = NULL; + if (pVModel) + { + pSeqGroup = pVModel->pSeqGroup(sequence); + } + + // Build weightlist for all bones + int nBoneCount = pStudioHdr->numbones(); + float* RESTRICT pS2 = (float*)stackalloc(nBoneCount * sizeof(float)); // 16-byte aligned + + + if (pSeqGroup) // hoist this branch outside of the inner loop for speed (even correctly predicted branches are an eight cycle latency) + { + for (i = 0; i < nBoneCount; i++) + { + // skip unused bones + if (!(pStudioHdr->boneFlags(i) & boneMask) || + pSeqGroup->boneMap[i] < 0) + { + pS2[i] = 0.0f; + } + else + { + // boneMap[i] is not a float, don't be 
lured by the siren call of fcmp + pS2[i] = s * seqdesc.weight(pSeqGroup->boneMap[i]); + } + } + } + else // !pSeqGroup + { + for (i = 0; i < nBoneCount; i++) + { + // skip unused bones + if (!(pStudioHdr->boneFlags(i) & boneMask)) + { + pS2[i] = 0.0f; + } + else + { + pS2[i] = s * seqdesc.weight(i); // blend in based on this bones weight + } + } + } + + float weight; + int nBoneCountRoundedFour = (nBoneCount) & (~(3)); + if (seqdesc.flags & STUDIO_DELTA) + { + // do as many as we can four at a time, then take care of stragglers. + for (i = 0; i < nBoneCountRoundedFour; i += 4) + { + // drag the next cache line in + PREFETCH360(q1, i * 16 + 128); + PREFETCH360(pos1, i * 16 + 128); + PREFETCH360(q2, i * 16 + 128); + PREFETCH360(pos2, i * 16 + 128); + + fltx4 weightfour = LoadAlignedSIMD(pS2 + i); // four weights + + FourQuaternions q1four, q2four; + FourQuaternions result; + + q1four.LoadAndSwizzleAligned(q1 + i); // four quaternions + q2four.LoadAndSwizzleAligned(q2 + i); // four quaternions + + if (seqdesc.flags & STUDIO_POST) + { + + // result = q1 * ( weight * q2 ) + result = q1four.MulAc(weightfour, q2four); + } + else + { + + // result = ( s * q1 ) * q2 + result = q2four.ScaleMul(weightfour, q1four); + } + + // mask out unused channels, replacing them with original data + { + fltx4 tinyScales = CmpLeSIMD(weightfour, Four_Zeros); + result.x = MaskedAssign(tinyScales, q1four.x, result.x); + result.y = MaskedAssign(tinyScales, q1four.y, result.y); + result.z = MaskedAssign(tinyScales, q1four.z, result.z); + result.w = MaskedAssign(tinyScales, q1four.w, result.w); + } + + + result.SwizzleAndStoreAlignedMasked(q1 + i, CmpGtSIMD(weightfour, Four_Zeros)); + + fltx4 originalpos1simd[4], pos1simd[4], pos2simd[4]; + originalpos1simd[0] = pos1simd[0] = LoadUnalignedSIMD(pos1[i + 0].Base()); + originalpos1simd[1] = pos1simd[1] = LoadUnalignedSIMD(pos1[i + 1].Base()); + originalpos1simd[2] = pos1simd[2] = LoadUnalignedSIMD(pos1[i + 2].Base()); + originalpos1simd[3] = pos1simd[3] = LoadUnalignedSIMD(pos1[i + 3].Base()); + pos2simd[0] = LoadUnalignedSIMD(pos2[i + 0].Base()); + pos2simd[1] = LoadUnalignedSIMD(pos2[i + 1].Base()); + pos2simd[2] = LoadUnalignedSIMD(pos2[i + 2].Base()); + pos2simd[3] = LoadUnalignedSIMD(pos2[i + 3].Base()); + + fltx4 splatweights[4] = { SplatXSIMD(weightfour), + SplatYSIMD(weightfour), + SplatZSIMD(weightfour), + SplatWSIMD(weightfour) }; + + fltx4 Zero = Four_Zeros; + pos1simd[0] = MaddSIMD(pos2simd[0], splatweights[0], pos1simd[0]); + splatweights[0] = (fltx4)CmpGtSIMD(splatweights[0], Zero); + pos1simd[1] = MaddSIMD(pos2simd[1], splatweights[1], pos1simd[1]); + splatweights[1] = (fltx4)CmpGtSIMD(splatweights[1], Zero); + pos1simd[2] = MaddSIMD(pos2simd[2], splatweights[2], pos1simd[2]); + splatweights[2] = (fltx4)CmpGtSIMD(splatweights[2], Zero); + pos1simd[3] = MaddSIMD(pos2simd[3], splatweights[3], pos1simd[3]); + splatweights[3] = (fltx4)CmpGtSIMD(splatweights[3], Zero); + + // mask out unweighted bones + /* + if (pS2[i+0] > 0) + StoreUnaligned3SIMD( pos1[i + 0].Base(), pos1simd[0] ); + if (pS2[i+1] > 0) + StoreUnaligned3SIMD( pos1[i + 1].Base(), pos1simd[1] ); + if (pS2[i+2] > 0) + StoreUnaligned3SIMD( pos1[i + 2].Base(), pos1simd[2] ); + if (pS2[i+3] > 0) + StoreUnaligned3SIMD( pos1[i + 3].Base(), pos1simd[3] ); + */ + StoreUnaligned3SIMD(pos1[i + 0].Base(), MaskedAssign((fltx4)splatweights[0], pos1simd[0], originalpos1simd[0])); + StoreUnaligned3SIMD(pos1[i + 1].Base(), MaskedAssign((fltx4)splatweights[1], pos1simd[1], originalpos1simd[1])); + 
StoreUnaligned3SIMD(pos1[i + 2].Base(), MaskedAssign((fltx4)splatweights[2], pos1simd[2], originalpos1simd[2])); + StoreUnaligned3SIMD(pos1[i + 3].Base(), MaskedAssign((fltx4)splatweights[3], pos1simd[3], originalpos1simd[3])); + + } + + // take care of stragglers + for (false; i < nBoneCount; i++) + { + weight = pS2[i]; + if (weight <= 0.0f) + continue; + + if (seqdesc.flags & STUDIO_POST) + { +#if !defined(_X360) && !USE_DXMATH + QuaternionMA(q1[i], weight, q2[i], q1[i]); +#else + fltx4 q1simd = LoadUnalignedSIMD(q1[i].Base()); + fltx4 q2simd = LoadAlignedSIMD(q2[i]); + fltx4 result = QuaternionMASIMD(q1simd, weight, q2simd); + StoreUnalignedSIMD(q1[i].Base(), result); +#endif + } + else + { +#if !defined(_X360) && !USE_DXMATH + QuaternionSM(weight, q2[i], q1[i], q1[i]); +#else + fltx4 q1simd = LoadUnalignedSIMD(q1[i].Base()); + fltx4 q2simd = LoadAlignedSIMD(q2[i]); + fltx4 result = QuaternionSMSIMD(weight, q2simd, q1simd); + StoreUnalignedSIMD(q1[i].Base(), result); +#endif + } + // do this explicitly to make the scheduling better + // (otherwise it might think pos1 and pos2 overlap, + // and thus save one before starting the next) + float x, y, z; + x = pos1[i][0] + pos2[i][0] * weight; + y = pos1[i][1] + pos2[i][1] * weight; + z = pos1[i][2] + pos2[i][2] * weight; + pos1[i][0] = x; + pos1[i][1] = y; + pos1[i][2] = z; + } + return; + } + + //// SLERP PHASE + + // Some bones need to be slerped with alignment. + // Others do not. + // Some need to be ignored altogether. + // Build arrays indicating which are which. + // This is the corral approach. Another approach + // would be to compute both the aligned and unaligned + // slerps of each bone in the first pass through the + // array, and then do a masked selection of each + // based on the masks. However there really isn't + // a convenient way to turn the int flags that + // specify which approach to take, into fltx4 masks. + + // float * RESTRICT pS2 = (float*)stackalloc( nBoneCount * sizeof(float) ); + int* RESTRICT aBonesSlerpAlign = (int*)stackalloc(nBoneCount * sizeof(int)); + float* RESTRICT aBonesSlerpAlignWeights = (float*)stackalloc(nBoneCount * sizeof(float)); + int* RESTRICT aBonesSlerpNoAlign = (int*)stackalloc(nBoneCount * sizeof(int)); + float* RESTRICT aBonesSlerpNoAlignWeights = (float*)stackalloc(nBoneCount * sizeof(float)); + int numBonesSlerpAlign = 0; + int numBonesSlerpNoAlign = 0; + + // BoneQuaternionAligned * RESTRICT testOutput = (BoneQuaternionAligned *)stackalloc(nBoneCount * sizeof(BoneQuaternionAligned)); + + // sweep forward through the array and determine where to corral each bone. + for (i = 0; i < nBoneCount; ++i) + { + float weight = pS2[i]; + if (weight == 1.0f) + { + q1[i] = q2[i]; + pos1[i] = pos2[i]; + } + else if (weight > 0.0f) // ignore small bones + { + if (pStudioHdr->boneFlags(i) & BONE_FIXED_ALIGNMENT) + { + aBonesSlerpNoAlign[numBonesSlerpNoAlign] = i; + aBonesSlerpNoAlignWeights[numBonesSlerpNoAlign] = weight; + ++numBonesSlerpNoAlign; + } + else + { + aBonesSlerpAlign[numBonesSlerpAlign] = i; + aBonesSlerpAlignWeights[numBonesSlerpAlign] = weight; + ++numBonesSlerpAlign; + } + } + } + + // okay, compute all the aligned, and all the unaligned bones, four at + // a time if possible. + const fltx4 One = Four_Ones; + ///////////////// + // // // Aligned! 
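One detail that helps when reading the two loops below: FourQuaternions is a structure-of-arrays container, so LoadAndSwizzleAligned transposes four quaternions into four fltx4 fields (a sketch of the layout, assuming the usual ssequaternion.h definition):

    // after q1four.LoadAndSwizzleAligned( qa, qb, qc, qd ):
    //   q1four.x == { qa->x, qb->x, qc->x, qd->x }
    //   q1four.y == { qa->y, qb->y, qc->y, qd->y }   ... and likewise for z and w
    //
    // which is why one LoadAlignedSIMD of four packed weights, plus SplatX/Y/Z/W,
    // is enough to drive all four bones in a single iteration.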
+ nBoneCountRoundedFour = (numBonesSlerpAlign) & ~3; + for (i = 0; i < nBoneCountRoundedFour; i += 4) + { + // drag the next cache line in + PREFETCH360(q1, i * 16 + 128); + PREFETCH360(pos1, i * sizeof(*pos1) + 128); + PREFETCH360(q2, i * 16 + 128); + PREFETCH360(pos2, i * sizeof(*pos2) + 128); + + fltx4 weights = LoadAlignedSIMD(aBonesSlerpAlignWeights + i); + fltx4 oneMinusWeight = SubSIMD(One, weights); + + // position component: + // pos1[i][0] = pos1[i][0] * s1 + pos2[i][0] * weight; + fltx4 pos1simd[4]; + fltx4 pos2simd[4]; + pos1simd[0] = LoadUnaligned3SIMD(pos1[aBonesSlerpAlign[i + 0]].Base()); + pos1simd[1] = LoadUnaligned3SIMD(pos1[aBonesSlerpAlign[i + 1]].Base()); + pos1simd[2] = LoadUnaligned3SIMD(pos1[aBonesSlerpAlign[i + 2]].Base()); + pos1simd[3] = LoadUnaligned3SIMD(pos1[aBonesSlerpAlign[i + 3]].Base()); + pos2simd[0] = LoadUnaligned3SIMD(pos2[aBonesSlerpAlign[i + 0]].Base()); + pos2simd[1] = LoadUnaligned3SIMD(pos2[aBonesSlerpAlign[i + 1]].Base()); + pos2simd[2] = LoadUnaligned3SIMD(pos2[aBonesSlerpAlign[i + 2]].Base()); + pos2simd[3] = LoadUnaligned3SIMD(pos2[aBonesSlerpAlign[i + 3]].Base()); + + pos1simd[0] = MulSIMD(SplatXSIMD(oneMinusWeight), pos1simd[0]); + pos1simd[1] = MulSIMD(SplatYSIMD(oneMinusWeight), pos1simd[1]); + pos1simd[2] = MulSIMD(SplatZSIMD(oneMinusWeight), pos1simd[2]); + pos1simd[3] = MulSIMD(SplatWSIMD(oneMinusWeight), pos1simd[3]); + + fltx4 posWriteMasks[4]; // don't overwrite where there was zero weight + { + fltx4 splatweights[4]; + fltx4 Zero = Four_Zeros; + splatweights[0] = SplatXSIMD(weights); + splatweights[1] = SplatYSIMD(weights); + splatweights[2] = SplatZSIMD(weights); + splatweights[3] = SplatWSIMD(weights); + + pos1simd[0] = MaddSIMD(splatweights[0], pos2simd[0], pos1simd[0]); + posWriteMasks[0] = (fltx4)CmpGtSIMD(splatweights[0], Zero); + pos1simd[1] = MaddSIMD(splatweights[1], pos2simd[1], pos1simd[1]); + posWriteMasks[1] = (fltx4)CmpGtSIMD(splatweights[1], Zero); + pos1simd[2] = MaddSIMD(splatweights[2], pos2simd[2], pos1simd[2]); + posWriteMasks[2] = (fltx4)CmpGtSIMD(splatweights[2], Zero); + pos1simd[3] = MaddSIMD(splatweights[3], pos2simd[3], pos1simd[3]); + posWriteMasks[3] = (fltx4)CmpGtSIMD(splatweights[3], Zero); + } + + + FourQuaternions q1four, q2four, result; + q1four.LoadAndSwizzleAligned(q1 + aBonesSlerpAlign[i + 0], + q1 + aBonesSlerpAlign[i + 1], + q1 + aBonesSlerpAlign[i + 2], + q1 + aBonesSlerpAlign[i + 3]); + +#if 0 + // FIXME: the SIMD slerp doesn't handle quaternions that have opposite signs + q2four.LoadAndSwizzleAligned(q2 + aBonesSlerpAlign[i + 0], + q2 + aBonesSlerpAlign[i + 1], + q2 + aBonesSlerpAlign[i + 2], + q2 + aBonesSlerpAlign[i + 3]); + result = q2four.Slerp(q1four, oneMinusWeight); +#else + // force the quaternions to be the same sign (< 180 degree separation) + QuaternionAligned q20, q21, q22, q23; + QuaternionAlign(q1[aBonesSlerpAlign[i + 0]], q2[aBonesSlerpAlign[i + 0]], q20); + QuaternionAlign(q1[aBonesSlerpAlign[i + 1]], q2[aBonesSlerpAlign[i + 1]], q21); + QuaternionAlign(q1[aBonesSlerpAlign[i + 2]], q2[aBonesSlerpAlign[i + 2]], q22); + QuaternionAlign(q1[aBonesSlerpAlign[i + 3]], q2[aBonesSlerpAlign[i + 3]], q23); + q2four.LoadAndSwizzleAligned(&q20, &q21, &q22, &q23); + result = q2four.SlerpNoAlign(q1four, oneMinusWeight); +#endif + + result.SwizzleAndStoreAligned(q1 + aBonesSlerpAlign[i + 0], + q1 + aBonesSlerpAlign[i + 1], + q1 + aBonesSlerpAlign[i + 2], + q1 + aBonesSlerpAlign[i + 3]); + + StoreUnaligned3SIMD(pos1[aBonesSlerpAlign[i + 0]].Base(), pos1simd[0]); + 
StoreUnaligned3SIMD(pos1[aBonesSlerpAlign[i + 1]].Base(), pos1simd[1]); + StoreUnaligned3SIMD(pos1[aBonesSlerpAlign[i + 2]].Base(), pos1simd[2]); + StoreUnaligned3SIMD(pos1[aBonesSlerpAlign[i + 3]].Base(), pos1simd[3]); + } + + // handle stragglers + for (i; i < numBonesSlerpAlign; ++i) + { + QuaternionAligned q3; + weight = aBonesSlerpAlignWeights[i]; + int k = aBonesSlerpAlign[i]; + + float s1 = 1.0 - weight; + +#if defined(_X360) || USE_DXMATH + fltx4 q1simd, q2simd, result; + q1simd = LoadAlignedSIMD(q1[k].Base()); + q2simd = LoadAlignedSIMD(q2[k]); +#endif + +#if !defined(_X360) && !USE_DXMATH + QuaternionSlerp(q2[k], q1[k], s1, q3); +#else + result = QuaternionSlerpSIMD(q2simd, q1simd, s1); +#endif + +#if !defined(_X360) && !USE_DXMATH + q1[k][0] = q3[0]; + q1[k][1] = q3[1]; + q1[k][2] = q3[2]; + q1[k][3] = q3[3]; +#else + StoreAlignedSIMD(q1[k].Base(), result); +#endif + + pos1[k][0] = pos1[k][0] * s1 + pos2[k][0] * weight; + pos1[k][1] = pos1[k][1] * s1 + pos2[k][1] * weight; + pos1[k][2] = pos1[k][2] * s1 + pos2[k][2] * weight; + } + /////////////////// + // // // Unaligned! + nBoneCountRoundedFour = (numBonesSlerpNoAlign) & ~3; + for (i = 0; i < nBoneCountRoundedFour; i += 4) + { + // drag the next cache line in + PREFETCH360(q1, i * 16 + 128); + PREFETCH360(pos1, i * sizeof(*pos1) + 128); + PREFETCH360(q2, i * 16 + 128); + PREFETCH360(pos2, i * sizeof(*pos2) + 128); + + fltx4 weights = LoadAlignedSIMD(aBonesSlerpNoAlignWeights + i); + fltx4 oneMinusWeight = SubSIMD(One, weights); + + // position component: + // pos1[i][0] = pos1[i][0] * s1 + pos2[i][0] * weight; + fltx4 pos1simd[4]; + fltx4 pos2simd[4]; + pos1simd[0] = LoadUnaligned3SIMD(pos1[aBonesSlerpNoAlign[i + 0]].Base()); + pos1simd[1] = LoadUnaligned3SIMD(pos1[aBonesSlerpNoAlign[i + 1]].Base()); + pos1simd[2] = LoadUnaligned3SIMD(pos1[aBonesSlerpNoAlign[i + 2]].Base()); + pos1simd[3] = LoadUnaligned3SIMD(pos1[aBonesSlerpNoAlign[i + 3]].Base()); + pos2simd[0] = LoadUnaligned3SIMD(pos2[aBonesSlerpNoAlign[i + 0]].Base()); + pos2simd[1] = LoadUnaligned3SIMD(pos2[aBonesSlerpNoAlign[i + 1]].Base()); + pos2simd[2] = LoadUnaligned3SIMD(pos2[aBonesSlerpNoAlign[i + 2]].Base()); + pos2simd[3] = LoadUnaligned3SIMD(pos2[aBonesSlerpNoAlign[i + 3]].Base()); + + pos1simd[0] = MulSIMD(SplatXSIMD(oneMinusWeight), pos1simd[0]); + pos1simd[1] = MulSIMD(SplatYSIMD(oneMinusWeight), pos1simd[1]); + pos1simd[2] = MulSIMD(SplatZSIMD(oneMinusWeight), pos1simd[2]); + pos1simd[3] = MulSIMD(SplatWSIMD(oneMinusWeight), pos1simd[3]); + + pos1simd[0] = MaddSIMD(SplatXSIMD(weights), pos2simd[0], pos1simd[0]); + pos1simd[1] = MaddSIMD(SplatYSIMD(weights), pos2simd[1], pos1simd[1]); + pos1simd[2] = MaddSIMD(SplatZSIMD(weights), pos2simd[2], pos1simd[2]); + pos1simd[3] = MaddSIMD(SplatWSIMD(weights), pos2simd[3], pos1simd[3]); + + FourQuaternions q1four, q2four, result; + q1four.LoadAndSwizzleAligned(q1 + aBonesSlerpNoAlign[i + 0], + q1 + aBonesSlerpNoAlign[i + 1], + q1 + aBonesSlerpNoAlign[i + 2], + q1 + aBonesSlerpNoAlign[i + 3]); + q2four.LoadAndSwizzleAligned(q2 + aBonesSlerpNoAlign[i + 0], + q2 + aBonesSlerpNoAlign[i + 1], + q2 + aBonesSlerpNoAlign[i + 2], + q2 + aBonesSlerpNoAlign[i + 3]); + + result = q2four.SlerpNoAlign(q1four, oneMinusWeight); + + result.SwizzleAndStoreAligned(q1 + aBonesSlerpNoAlign[i + 0], + q1 + aBonesSlerpNoAlign[i + 1], + q1 + aBonesSlerpNoAlign[i + 2], + q1 + aBonesSlerpNoAlign[i + 3]); + + StoreUnaligned3SIMD(pos1[aBonesSlerpNoAlign[i + 0]].Base(), pos1simd[0]); + StoreUnaligned3SIMD(pos1[aBonesSlerpNoAlign[i + 1]].Base(), 
pos1simd[1]); + StoreUnaligned3SIMD(pos1[aBonesSlerpNoAlign[i + 2]].Base(), pos1simd[2]); + StoreUnaligned3SIMD(pos1[aBonesSlerpNoAlign[i + 3]].Base(), pos1simd[3]); + } + // handle stragglers + for (i; i < numBonesSlerpNoAlign; ++i) + { + weight = aBonesSlerpNoAlignWeights[i]; + int k = aBonesSlerpNoAlign[i]; + + float s1 = 1.0 - weight; + +#if defined(_X360) || USE_DXMATH + fltx4 q1simd, q2simd, result; + q1simd = LoadAlignedSIMD(q1[k].Base()); + q2simd = LoadAlignedSIMD(q2[k]); +#endif + +#if !defined(_X360) && !USE_DXMATH + QuaternionAligned q3; + QuaternionSlerpNoAlign(q2[k], q1[k], s1, q3); +#else + result = QuaternionSlerpNoAlignSIMD(q2simd, q1simd, s1); +#endif + +#if !defined(_X360) && !USE_DXMATH + q1[k][0] = q3[0]; + q1[k][1] = q3[1]; + q1[k][2] = q3[2]; + q1[k][3] = q3[3]; +#else + StoreAlignedSIMD(q1[k].Base(), result); +#endif + + pos1[k][0] = pos1[k][0] * s1 + pos2[k][0] * weight; + pos1[k][1] = pos1[k][1] * s1 + pos2[k][1] * weight; + pos1[k][2] = pos1[k][2] * s1 + pos2[k][2] * weight; + } +} //----------------------------------------------------------------------------- // Purpose: blend together q1,pos1 with q2,pos2. Return result in q1,pos1. @@ -1387,6 +1856,22 @@ void SlerpBones( float s, int boneMask ) { + // Test for 16-byte alignment, and if present, use the speedy SIMD version. + if ((reinterpret_cast(q1) & 0x0F) == 0 && + (reinterpret_cast(q2) & 0x0F) == 0) + { + return SlerpBonesSpeedy(pStudioHdr, + reinterpret_cast(q1), + pos1, + seqdesc, + sequence, + q2, + pos2, + s, + boneMask + ); + } + if (s <= 0.0f) return; if (s > 1.0f) @@ -1448,7 +1933,7 @@ void SlerpBones( if ( seqdesc.flags & STUDIO_POST ) { -#ifndef _X360 +#if !defined(_X360) && !USE_DXMATH QuaternionMA( q1[i], s2, q2[i], q1[i] ); #else fltx4 q1simd = LoadUnalignedSIMD( q1[i].Base() ); @@ -1456,14 +1941,10 @@ void SlerpBones( fltx4 result = QuaternionMASIMD( q1simd, s2, q2simd ); StoreUnalignedSIMD( q1[i].Base(), result ); #endif - // FIXME: are these correct? - pos1[i][0] = pos1[i][0] + pos2[i][0] * s2; - pos1[i][1] = pos1[i][1] + pos2[i][1] * s2; - pos1[i][2] = pos1[i][2] + pos2[i][2] * s2; } else { -#ifndef _X360 +#if !defined(_X360) && !USE_DXMATH QuaternionSM( s2, q2[i], q1[i], q1[i] ); #else fltx4 q1simd = LoadUnalignedSIMD( q1[i].Base() ); @@ -1471,12 +1952,17 @@ void SlerpBones( fltx4 result = QuaternionSMSIMD( s2, q2simd, q1simd ); StoreUnalignedSIMD( q1[i].Base(), result ); #endif - - // FIXME: are these correct? 
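The three position updates deleted just below come back later in this hunk rewritten through local temporaries. The reason, spelled out: q1/pos1 and pos2 are plain pointers the compiler cannot prove distinct, so storing into pos1[i][0] forces it to assume pos2[i][1] may have changed and reload it; gathering x, y and z first lets all the loads schedule ahead of the stores. Declaring the arrays RESTRICT, as SlerpBonesSpeedy above does for its scratch buffers, would be another way to hand the optimizer the same guarantee.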
- pos1[i][0] = pos1[i][0] + pos2[i][0] * s2; - pos1[i][1] = pos1[i][1] + pos2[i][1] * s2; - pos1[i][2] = pos1[i][2] + pos2[i][2] * s2; } + // do this explicitly to make the scheduling better + // (otherwise it might think pos1 and pos2 overlap, + // and thus save one before starting the next) + float x, y, z; + x = pos1[i][0] + pos2[i][0] * s2; + y = pos1[i][1] + pos2[i][1] * s2; + z = pos1[i][2] + pos2[i][2] * s2; + pos1[i][0] = x; + pos1[i][1] = y; + pos1[i][2] = z; } return; } @@ -1490,14 +1976,14 @@ void SlerpBones( s1 = 1.0 - s2; -#ifdef _X360 +#if defined(_X360) || USE_DXMATH fltx4 q1simd, q2simd, result; q1simd = LoadUnalignedSIMD( q1[i].Base() ); q2simd = LoadAlignedSIMD( q2[i] ); #endif if ( pStudioHdr->boneFlags(i) & BONE_FIXED_ALIGNMENT ) { -#ifndef _X360 +#if !defined(_X360) && !USE_DXMATH QuaternionSlerpNoAlign( q2[i], q1[i], s1, q3 ); #else result = QuaternionSlerpNoAlignSIMD( q2simd, q1simd, s1 ); @@ -1505,14 +1991,14 @@ void SlerpBones( } else { -#ifndef _X360 +#if !defined(_X360) && !USE_DXMATH QuaternionSlerp( q2[i], q1[i], s1, q3 ); #else result = QuaternionSlerpSIMD( q2simd, q1simd, s1 ); #endif } -#ifndef _X360 +#if !defined(_X360) && !USE_DXMATH q1[i][0] = q3[0]; q1[i][1] = q3[1]; q1[i][2] = q3[2]; @@ -2632,14 +3118,14 @@ class CIKSolver X[i] = P[i]; normalize(X); -// Its y axis is perpendicular to P, so Y = unit( E - X(E·X) ). +// Its y axis is perpendicular to P, so Y = unit( E - X(E�X) ). float dDOTx = dot(D,X); for (i = 0 ; i < 3 ; i++) Y[i] = D[i] - dDOTx * X[i]; normalize(Y); -// Its z axis is perpendicular to both X and Y, so Z = X×Y. +// Its z axis is perpendicular to both X and Y, so Z = X�Y. cross(X,Y,Z); diff --git a/src/public/mathlib/dxmath.h b/src/public/mathlib/dxmath.h new file mode 100644 index 000000000..1af932614 --- /dev/null +++ b/src/public/mathlib/dxmath.h @@ -0,0 +1,20 @@ +//========= Copyright Valve Corporation, All rights reserved. ============// +// +// Purpose: +// +//===========================================================================// + +#pragma once + +#define USE_DXMATH 1 + +#if USE_DXMATH +#if defined(_WIN32) +#include "../thirdparty/DirectXMath-dec2022/Inc/DirectXMath.h" +#elif defined(POSIX) +#include "../thirdparty/dotnetrt/sal.h" +#include "../thirdparty/DirectXMath-dec2022/Inc/DirectXMath.h" +#else +#undef USE_DXMATH +#endif +#endif diff --git a/src/public/mathlib/math_pfns.h b/src/public/mathlib/math_pfns.h index d43411ce8..e50675395 100644 --- a/src/public/mathlib/math_pfns.h +++ b/src/public/mathlib/math_pfns.h @@ -9,32 +9,50 @@ #if defined( _X360 ) #include +#else +#include "dxmath.h" +#if !USE_DXMATH +#include +#endif +#define USE_SSE2 +#include "../thirdparty/sse_mathfun/sse_mathfun.h" #endif #if !defined( _X360 ) -// These globals are initialized by mathlib and redirected based on available fpu features -extern float (*pfSqrt)(float x); -extern float (*pfRSqrt)(float x); -extern float (*pfRSqrtFast)(float x); -extern void (*pfFastSinCos)(float x, float *s, float *c); -extern float (*pfFastCos)(float x); +FORCEINLINE float RSqrt(float x) +{ + // The compiler will generate ideal instructions for a Newton-Raphson + // Specifying it directly results in worse assembly. 
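// For reference, the hand-written form the comment above is weighing against looks
// roughly like this (sketch only, deliberately not used here):
//     __m128 v = _mm_set_ss( x );
//     __m128 r = _mm_rsqrt_ss( v );                          // ~12-bit estimate
//     r = _mm_mul_ss( r, _mm_sub_ss( _mm_set_ss( 1.5f ),
//             _mm_mul_ss( _mm_set_ss( 0.5f * x ), _mm_mul_ss( r, r ) ) ) );
//     return _mm_cvtss_f32( r );                             // one Newton-Raphson step
// Whether 1.0f / sqrtf(x) actually lowers to rsqrtss plus a refinement depends on the
// floating-point model; under MSVC's default /fp:precise it tends to become sqrtss plus
// a divide, which is exact but slower than the estimate-and-refine sequence.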
+ return 1.0f / sqrtf(x); +} + +FORCEINLINE float RSqrtFast(float x) +{ + // This results in the compiler simplifying down to a plain rsqrtss + const __m128 vec = _mm_set_ss( x ); + const __m128 r = _mm_rsqrt_ps( vec ); + float temp; + _mm_store_ss(&temp, r); + return temp; +} + +FORCEINLINE float CosFast(float x) +{ + // Compiler doesn't optimize ::cosf call, use a vectorized cos. This is better than DirectX::XMScalarCos + const __m128 vec = _mm_set_ss( x ); + const __m128 r = cos_ps(vec); + float temp; + _mm_store_ss(&temp, r); + return temp; +} // The following are not declared as macros because they are often used in limiting situations, // and sometimes the compiler simply refuses to inline them for some reason -#define FastSqrt(x) (*pfSqrt)(x) -#define FastRSqrt(x) (*pfRSqrt)(x) -#define FastRSqrtFast(x) (*pfRSqrtFast)(x) -#define FastSinCos(x,s,c) (*pfFastSinCos)(x,s,c) -#define FastCos(x) (*pfFastCos)(x) - -#if defined(__i386__) || defined(_M_IX86) -// On x86, the inline FPU or SSE sqrt instruction is faster than -// the overhead of setting up a function call and saving/restoring -// the FPU or SSE register state and can be scheduled better, too. -#undef FastSqrt -#define FastSqrt(x) ::sqrtf(x) -#endif +#define FastSqrt(x) ::sqrtf(x) // sqrt is optimized to an efficient SSE call with modern compilers +#define FastRSqrt(x) RSqrt(x) +#define FastRSqrtFast(x) RSqrtFast(x) +#define FastCos(x) CosFast(x) #endif // !_X360 diff --git a/src/public/mathlib/mathlib.h b/src/public/mathlib/mathlib.h index 42317632b..fa7486dfa 100644 --- a/src/public/mathlib/mathlib.h +++ b/src/public/mathlib/mathlib.h @@ -7,6 +7,8 @@ #ifndef MATH_LIB_H #define MATH_LIB_H +#include "dxmath.h" + #include #include "minmax.h" #include "tier0/basetypes.h" @@ -95,7 +97,7 @@ class FPExceptionEnabler -#ifdef DEBUG // stop crashing edit-and-continue +#ifdef DEBUG // stop crashing edit-and-continue FORCEINLINE float clamp( float val, float minVal, float maxVal ) { if ( maxVal < minVal ) @@ -438,10 +440,12 @@ inline vec_t RoundInt (vec_t in) int Q_log2(int val); // Math routines done in optimized assembly math package routines -void inline SinCos( float radians, float *sine, float *cosine ) +void FORCEINLINE SinCos( float radians, float *sine, float *cosine ) { #if defined( _X360 ) XMScalarSinCos( sine, cosine, radians ); +#elif USE_DXMATH + DirectX::XMScalarSinCos( sine, cosine, radians ); #elif defined( PLATFORM_WINDOWS_PC32 ) _asm { @@ -466,35 +470,7 @@ void inline SinCos( float radians, float *sine, float *cosine ) #endif } -#define SIN_TABLE_SIZE 256 -#define FTOIBIAS 12582912.f -extern float SinCosTable[SIN_TABLE_SIZE]; - -inline float TableCos( float theta ) -{ - union - { - int i; - float f; - } ftmp; - - // ideally, the following should compile down to: theta * constant + constant, changing any of these constants from defines sometimes fubars this. 
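// A worked instance of the magic-number trick in the line below: FTOIBIAS is
// 12582912.0f == 1.5 * 2^23, so for theta = pi/2 with SIN_TABLE_SIZE = 256:
//     theta * ( 256 / (2*pi) )          == 64.0
//     64.0 + ( FTOIBIAS + 64 )          == 12583040.0
// and the low mantissa bits of that float hold the integer 128, so
//     ftmp.i & ( SIN_TABLE_SIZE - 1 )   == 128,  SinCosTable[128] == sin(pi) == 0 == cos(pi/2)
// i.e. the table index falls straight out of the float's bit pattern, no cvttss2si needed.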
- ftmp.f = theta * ( float )( SIN_TABLE_SIZE / ( 2.0f * M_PI ) ) + ( FTOIBIAS + ( SIN_TABLE_SIZE / 4 ) ); - return SinCosTable[ ftmp.i & ( SIN_TABLE_SIZE - 1 ) ]; -} - -inline float TableSin( float theta ) -{ - union - { - int i; - float f; - } ftmp; - - // ideally, the following should compile down to: theta * constant + constant - ftmp.f = theta * ( float )( SIN_TABLE_SIZE / ( 2.0f * M_PI ) ) + FTOIBIAS; - return SinCosTable[ ftmp.i & ( SIN_TABLE_SIZE - 1 ) ]; -} +#define FastSinCos( angle, s, c) SinCos(angle, s, c) template FORCEINLINE T Square( T const &a ) @@ -1205,7 +1181,7 @@ inline float SimpleSplineRemapValClamped( float val, float A, float B, float C, FORCEINLINE int RoundFloatToInt(float f) { #if defined(__i386__) || defined(_M_IX86) || defined( PLATFORM_WINDOWS_PC64 ) || defined(__x86_64__) - return _mm_cvtss_si32(_mm_load_ss(&f)); + return _mm_cvt_ss2si(_mm_set_ss(f + f + 0.5f)) >> 1; #elif defined( _X360 ) #ifdef Assert Assert( IsFPUControlWordSet() ); @@ -1310,17 +1286,11 @@ FORCEINLINE int Float2Int( float a ) // Over 15x faster than: (int)floor(value) inline int Floor2Int( float a ) { - int RetVal; -#if defined( __i386__ ) - // Convert to int and back, compare, subtract one if too big - __m128 a128 = _mm_set_ss(a); - RetVal = _mm_cvtss_si32(a128); - __m128 rounded128 = _mm_cvt_si2ss(_mm_setzero_ps(), RetVal); - RetVal -= _mm_comigt_ss( rounded128, a128 ); +#if defined( _X360 ) + return static_cast( floor(a) ); #else - RetVal = static_cast( floor(a) ); + return _mm_cvt_ss2si(_mm_set_ss(a + a - 0.5f)) >> 1; #endif - return RetVal; } //----------------------------------------------------------------------------- @@ -1366,18 +1336,12 @@ inline float ClampToMsec( float in ) // Over 15x faster than: (int)ceil(value) inline int Ceil2Int( float a ) -{ - int RetVal; -#if defined( __i386__ ) - // Convert to int and back, compare, add one if too small - __m128 a128 = _mm_load_ss(&a); - RetVal = _mm_cvtss_si32(a128); - __m128 rounded128 = _mm_cvt_si2ss(_mm_setzero_ps(), RetVal); - RetVal += _mm_comilt_ss( rounded128, a128 ); +{ +#if defined( _X360 ) + return static_cast( ceil(a) ); #else - RetVal = static_cast( ceil(a) ); + return -(_mm_cvt_ss2si(_mm_set_ss(-0.5f - (a + a))) >> 1); #endif - return RetVal; } @@ -2169,7 +2133,7 @@ inline bool CloseEnough( const Vector &a, const Vector &b, float epsilon = EQUAL // Fast compare // maxUlps is the maximum error in terms of Units in the Last Place. This // specifies how big an error we are willing to accept in terms of the value -// of the least significant digit of the floating point number’s +// of the least significant digit of the floating point number's // representation. maxUlps can also be interpreted in terms of how many // representable floats we are willing to accept between A and B. // This function will allow maxUlps-1 floats between A and B. diff --git a/src/public/mathlib/ssemath.h b/src/public/mathlib/ssemath.h index c2ff48d75..3bcda408a 100644 --- a/src/public/mathlib/ssemath.h +++ b/src/public/mathlib/ssemath.h @@ -149,6 +149,8 @@ extern const fltx4 Four_2ToThe23s; // (1<<23).. extern const fltx4 Four_2ToThe24s; // (1<<24).. 
extern const fltx4 Four_Origin; // 0 0 0 1 (origin point, like vr0 on the PS2) extern const fltx4 Four_NegativeOnes; // -1 -1 -1 -1 +extern const fltx4 Four_DegToRad; // (float)(M_PI_F / 180.f) times four +extern const fltx4 Four_360; // 360 360 360 360 #else #define Four_Zeros XMVectorZero() // 0 0 0 0 #define Four_Ones XMVectorSplatOne() // 1 1 1 1 @@ -164,6 +166,8 @@ extern const fltx4 Four_2ToThe23s; // (1<<23).. extern const fltx4 Four_2ToThe24s; // (1<<24).. extern const fltx4 Four_Origin; // 0 0 0 1 (origin point, like vr0 on the PS2) extern const fltx4 Four_NegativeOnes; // -1 -1 -1 -1 +extern const fltx4 Four_DegToRad; // (float)(M_PI_F / 180.f) times four +extern const fltx4 Four_360; // 360 360 360 360 #endif extern const fltx4 Four_FLT_MAX; // FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX extern const fltx4 Four_Negative_FLT_MAX; // -FLT_MAX, -FLT_MAX, -FLT_MAX, -FLT_MAX @@ -182,6 +186,8 @@ extern const ALIGN16 uint32 g_SIMD_Low16BitsMask[] ALIGN16_POST; // 0xffff x 4 // to mask out the tail, g_SIMD_SkipTailMask[N & 3] what you want to use for the last iteration. extern const uint32 ALIGN16 g_SIMD_SkipTailMask[4][4] ALIGN16_POST; +extern const int32 ALIGN16 g_SIMD_EveryOtherMask[]; // 0, ~0, 0, ~0 + // Define prefetch macros. // The characteristics of cache and prefetch are completely // different between the different platforms, so you DO NOT @@ -1951,6 +1957,34 @@ FORCEINLINE fltx4 RotateRight2( const fltx4 & a ) return _mm_shuffle_ps( a, a, _MM_SHUFFLE( 1, 0, 3, 2 ) ); } +// a={ a.x, b.x, c.x, d.x } +// combine 4 fltx4s by throwing away 3/4s of the fields +FORCEINLINE fltx4 Compress4SIMD(fltx4 const a, fltx4 const& b, fltx4 const& c, fltx4 const& d) +{ + fltx4 aacc = _mm_shuffle_ps(a, c, MM_SHUFFLE_REV(0, 0, 0, 0)); + fltx4 bbdd = _mm_shuffle_ps(b, d, MM_SHUFFLE_REV(0, 0, 0, 0)); + return MaskedAssign(LoadAlignedSIMD(g_SIMD_EveryOtherMask), bbdd, aacc); +} + +// outa={a.x, a.x, a.y, a.y}, outb = a.z, a.z, a.w, a.w } +FORCEINLINE void ExpandSIMD(fltx4 const& a, fltx4& fl4OutA, fltx4& fl4OutB) +{ + fl4OutA = _mm_shuffle_ps(a, a, MM_SHUFFLE_REV(0, 0, 1, 1)); + fl4OutB = _mm_shuffle_ps(a, a, MM_SHUFFLE_REV(2, 2, 3, 3)); + +} + +// construct a fltx4 from four different scalars, which are assumed to be neither aligned nor contiguous +FORCEINLINE fltx4 LoadGatherSIMD(const float& x, const float& y, const float& z, const float& w) +{ + // load the float into the low word of each vector register (this exploits the unaligned load op) + fltx4 vx = _mm_load_ss(&x); + fltx4 vy = _mm_load_ss(&y); + fltx4 vz = _mm_load_ss(&z); + fltx4 vw = _mm_load_ss(&w); + return Compress4SIMD(vx, vy, vz, vw); +} + FORCEINLINE fltx4 AddSIMD( const fltx4 & a, const fltx4 & b ) // a+b { @@ -1984,49 +2018,74 @@ FORCEINLINE fltx4 MsubSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c ) FORCEINLINE fltx4 Dot3SIMD( const fltx4 &a, const fltx4 &b ) { +#if USE_DXMATH + return DirectX::XMVector3Dot(a, b); +#else fltx4 m = MulSIMD( a, b ); float flDot = SubFloat( m, 0 ) + SubFloat( m, 1 ) + SubFloat( m, 2 ); return ReplicateX4( flDot ); +#endif } FORCEINLINE fltx4 Dot4SIMD( const fltx4 &a, const fltx4 &b ) { +#if USE_DXMATH + return DirectX::XMVector4Dot(a, b); +#else fltx4 m = MulSIMD( a, b ); float flDot = SubFloat( m, 0 ) + SubFloat( m, 1 ) + SubFloat( m, 2 ) + SubFloat( m, 3 ); return ReplicateX4( flDot ); +#endif } -//TODO: implement as four-way Taylor series (see xbox implementation) FORCEINLINE fltx4 SinSIMD( const fltx4 &radians ) { +#if USE_DXMATH + return DirectX::XMVectorSin( radians ); +#else + //TODO: 
implement as four-way Taylor series (see xbox implementation) + // FIXME: Make a fast SSE version fltx4 result; SubFloat( result, 0 ) = sin( SubFloat( radians, 0 ) ); SubFloat( result, 1 ) = sin( SubFloat( radians, 1 ) ); SubFloat( result, 2 ) = sin( SubFloat( radians, 2 ) ); SubFloat( result, 3 ) = sin( SubFloat( radians, 3 ) ); return result; +#endif } FORCEINLINE void SinCos3SIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians ) { +#if USE_DXMATH + DirectX::XMVectorSinCos( &sine, &cosine, radians ); +#else // FIXME: Make a fast SSE version SinCos( SubFloat( radians, 0 ), &SubFloat( sine, 0 ), &SubFloat( cosine, 0 ) ); SinCos( SubFloat( radians, 1 ), &SubFloat( sine, 1 ), &SubFloat( cosine, 1 ) ); SinCos( SubFloat( radians, 2 ), &SubFloat( sine, 2 ), &SubFloat( cosine, 2 ) ); +#endif } FORCEINLINE void SinCosSIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians ) // a*b + c { +#if USE_DXMATH + DirectX::XMVectorSinCos( &sine, &cosine, radians ); +#else // FIXME: Make a fast SSE version SinCos( SubFloat( radians, 0 ), &SubFloat( sine, 0 ), &SubFloat( cosine, 0 ) ); SinCos( SubFloat( radians, 1 ), &SubFloat( sine, 1 ), &SubFloat( cosine, 1 ) ); SinCos( SubFloat( radians, 2 ), &SubFloat( sine, 2 ), &SubFloat( cosine, 2 ) ); SinCos( SubFloat( radians, 3 ), &SubFloat( sine, 3 ), &SubFloat( cosine, 3 ) ); +#endif } -//TODO: implement as four-way Taylor series (see xbox implementation) + FORCEINLINE fltx4 ArcSinSIMD( const fltx4 &sine ) { +#if USE_DXMATH + return DirectX::XMVectorASin( sine ); +#else + //TODO: implement as four-way Taylor series (see xbox implementation) // FIXME: Make a fast SSE version fltx4 result; SubFloat( result, 0 ) = asin( SubFloat( sine, 0 ) ); @@ -2034,27 +2093,36 @@ FORCEINLINE fltx4 ArcSinSIMD( const fltx4 &sine ) SubFloat( result, 2 ) = asin( SubFloat( sine, 2 ) ); SubFloat( result, 3 ) = asin( SubFloat( sine, 3 ) ); return result; +#endif } FORCEINLINE fltx4 ArcCosSIMD( const fltx4 &cs ) { +#if USE_DXMATH + return DirectX::XMVectorACos( cs ); +#else fltx4 result; SubFloat( result, 0 ) = acos( SubFloat( cs, 0 ) ); SubFloat( result, 1 ) = acos( SubFloat( cs, 1 ) ); SubFloat( result, 2 ) = acos( SubFloat( cs, 2 ) ); SubFloat( result, 3 ) = acos( SubFloat( cs, 3 ) ); return result; +#endif } // tan^1(a/b) .. 
ie, pass sin in as a and cos in as b FORCEINLINE fltx4 ArcTan2SIMD( const fltx4 &a, const fltx4 &b ) { +#if USE_DXMATH + return DirectX::XMVectorATan2( a, b ); +#else fltx4 result; SubFloat( result, 0 ) = atan2( SubFloat( a, 0 ), SubFloat( b, 0 ) ); SubFloat( result, 1 ) = atan2( SubFloat( a, 1 ), SubFloat( b, 1 ) ); SubFloat( result, 2 ) = atan2( SubFloat( a, 2 ), SubFloat( b, 2 ) ); SubFloat( result, 3 ) = atan2( SubFloat( a, 3 ), SubFloat( b, 3 ) ); return result; +#endif } FORCEINLINE fltx4 NegSIMD(const fltx4 &a) // negate: -a @@ -2142,16 +2210,20 @@ FORCEINLINE fltx4 MaxSIMD( const fltx4 & a, const fltx4 & b ) // max(a,b) // Round towards positive infinity FORCEINLINE fltx4 CeilSIMD( const fltx4 &a ) { +#if USE_DXMATH + return DirectX::XMVectorCeiling(a); +#else fltx4 retVal; SubFloat( retVal, 0 ) = ceil( SubFloat( a, 0 ) ); SubFloat( retVal, 1 ) = ceil( SubFloat( a, 1 ) ); SubFloat( retVal, 2 ) = ceil( SubFloat( a, 2 ) ); SubFloat( retVal, 3 ) = ceil( SubFloat( a, 3 ) ); return retVal; - +#endif } fltx4 fabs( const fltx4 & x ); + // Round towards negative infinity // This is the implementation that was here before; it assumes // you are in round-to-floor mode, which I guess is usually the @@ -2244,6 +2316,9 @@ FORCEINLINE fltx4 ReciprocalSaturateSIMD( const fltx4 & a ) // 2^x for all values (the antilog) FORCEINLINE fltx4 ExpSIMD( const fltx4 &toPower ) { +#if USE_DXMATH + return DirectX::XMVectorExp(toPower); +#else fltx4 retval; SubFloat( retval, 0 ) = powf( 2, SubFloat(toPower, 0) ); SubFloat( retval, 1 ) = powf( 2, SubFloat(toPower, 1) ); @@ -2251,6 +2326,7 @@ FORCEINLINE fltx4 ExpSIMD( const fltx4 &toPower ) SubFloat( retval, 3 ) = powf( 2, SubFloat(toPower, 3) ); return retval; +#endif } // Clamps the components of a vector to a specified minimum and maximum range. @@ -2354,12 +2430,16 @@ FORCEINLINE void StoreUnalignedIntSIMD( int32 * RESTRICT pSIMD, const fltx4 & a // fixed point conversion is done. FORCEINLINE fltx4 UnsignedIntConvertToFltSIMD( const u32x4 &vSrcA ) { +#if USE_DXMATH + return DirectX::XMConvertVectorUIntToFloat(vSrcA, 0); +#else fltx4 retval; SubFloat( retval, 0 ) = ( (float) SubInt( retval, 0 ) ); SubFloat( retval, 1 ) = ( (float) SubInt( retval, 1 ) ); SubFloat( retval, 2 ) = ( (float) SubInt( retval, 2 ) ); SubFloat( retval, 3 ) = ( (float) SubInt( retval, 3 ) ); return retval; +#endif } @@ -2368,12 +2448,16 @@ FORCEINLINE fltx4 UnsignedIntConvertToFltSIMD( const u32x4 &vSrcA ) // fixed point conversion is done. FORCEINLINE fltx4 SignedIntConvertToFltSIMD( const i32x4 &vSrcA ) { +#if USE_DXMATH + return DirectX::XMConvertVectorIntToFloat(vSrcA, 0); +#else fltx4 retval; SubFloat( retval, 0 ) = ( (float) (reinterpret_cast(&vSrcA)[0])); SubFloat( retval, 1 ) = ( (float) (reinterpret_cast(&vSrcA)[1])); SubFloat( retval, 2 ) = ( (float) (reinterpret_cast(&vSrcA)[2])); SubFloat( retval, 3 ) = ( (float) (reinterpret_cast(&vSrcA)[3])); return retval; +#endif } /* diff --git a/src/public/mathlib/ssequaternion.h b/src/public/mathlib/ssequaternion.h index 825a9e45f..5548fa39a 100644 --- a/src/public/mathlib/ssequaternion.h +++ b/src/public/mathlib/ssequaternion.h @@ -38,9 +38,8 @@ // the SSE2 registers, which lessens this problem a little. // permitted only on 360, as we've done careful tuning on its Altivec math: -#ifdef _X360 -#define ALLOW_SIMD_QUATERNION_MATH 1 // not on PC! 
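For reference, a usage sketch of the DirectXMath-backed wrappers added to ssemath.h above; SinCosDegrees4 is a hypothetical helper, everything it calls comes from this patch (assuming USE_DXMATH is defined).

// Hypothetical helper: evaluate sin/cos of four angles (in degrees) with one SIMD call.
inline void SinCosDegrees4( float a0, float a1, float a2, float a3, fltx4 &s, fltx4 &c )
{
	fltx4 anglesDeg = LoadGatherSIMD( a0, a1, a2, a3 );    // gather four scalars into one fltx4
	fltx4 anglesRad = MulSIMD( anglesDeg, Four_DegToRad ); // constant added in this patch
	SinCosSIMD( s, c, anglesRad );                         // DirectX::XMVectorSinCos when USE_DXMATH
}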
-#endif +// UNDONE: we've enabled SSE2 +#define ALLOW_SIMD_QUATERNION_MATH 1 @@ -48,7 +47,6 @@ // Load/store quaternions //--------------------------------------------------------------------- #ifndef _X360 -#if ALLOW_SIMD_QUATERNION_MATH // Using STDC or SSE FORCEINLINE fltx4 LoadAlignedSIMD( const QuaternionAligned & pSIMD ) { @@ -58,7 +56,7 @@ FORCEINLINE fltx4 LoadAlignedSIMD( const QuaternionAligned & pSIMD ) FORCEINLINE fltx4 LoadAlignedSIMD( const QuaternionAligned * RESTRICT pSIMD ) { - fltx4 retval = LoadAlignedSIMD( pSIMD ); + fltx4 retval = LoadAlignedSIMD( pSIMD->Base() ); return retval; } @@ -66,7 +64,6 @@ FORCEINLINE void StoreAlignedSIMD( QuaternionAligned * RESTRICT pSIMD, const flt { StoreAlignedSIMD( pSIMD->Base(), a ); } -#endif #else // for the transitional class -- load a QuaternionAligned @@ -177,6 +174,10 @@ FORCEINLINE fltx4 QuaternionBlendSIMD( const fltx4 &p, const fltx4 &q, float t ) // SSE and STDC FORCEINLINE fltx4 QuaternionMultSIMD( const fltx4 &p, const fltx4 &q ) { +#if USE_DXMATH + fltx4 q2 = QuaternionAlignSIMD(p, q); + return DirectX::XMQuaternionMultiply(q2, p); +#else // decide if one of the quaternions is backwards fltx4 q2, result; q2 = QuaternionAlignSIMD( p, q ); @@ -185,6 +186,7 @@ FORCEINLINE fltx4 QuaternionMultSIMD( const fltx4 &p, const fltx4 &q ) SubFloat( result, 2 ) = SubFloat( p, 0 ) * SubFloat( q2, 1 ) - SubFloat( p, 1 ) * SubFloat( q2, 0 ) + SubFloat( p, 2 ) * SubFloat( q2, 3 ) + SubFloat( p, 3 ) * SubFloat( q2, 2 ); SubFloat( result, 3 ) = -SubFloat( p, 0 ) * SubFloat( q2, 0 ) - SubFloat( p, 1 ) * SubFloat( q2, 1 ) - SubFloat( p, 2 ) * SubFloat( q2, 2 ) + SubFloat( p, 3 ) * SubFloat( q2, 3 ); return result; +#endif } #else @@ -224,6 +226,36 @@ FORCEINLINE fltx4 QuaternionMultSIMD( const fltx4 &p, const fltx4 &q ) //--------------------------------------------------------------------- #ifndef _X360 +#if USE_DXMATH +// DirectXMath +FORCEINLINE fltx4 QuaternionScaleSIMD(const fltx4& p, float t) +{ + fltx4 sinom = Dot3SIMD(p, p); + sinom = SqrtSIMD(sinom); + sinom = MinSIMD(sinom, Four_Ones); + fltx4 sinsom = ArcSinSIMD(sinom); + fltx4 t4 = ReplicateX4(t); + sinsom = MulSIMD(sinsom, t4); + sinsom = SinSIMD(sinsom); + sinom = AddSIMD(sinom, Four_Epsilons); + sinom = ReciprocalSIMD(sinom); + t4 = MulSIMD(sinsom, sinom); + fltx4 result = MulSIMD(p, t4); + + // rescale rotation + sinsom = MulSIMD(sinsom, sinsom); + fltx4 r = SubSIMD(Four_Ones, sinsom); + r = MaxSIMD(r, Four_Zeros); + r = SqrtSIMD(r); + + // keep sign of rotation + fltx4 cmp = CmpGeSIMD(p, Four_Zeros); + r = MaskedAssign(cmp, r, NegSIMD(r)); + + result = SetWSIMD(result, r); + return result; +} +#else // SSE and STDC FORCEINLINE fltx4 QuaternionScaleSIMD( const fltx4 &p, float t ) { @@ -254,6 +286,7 @@ FORCEINLINE fltx4 QuaternionScaleSIMD( const fltx4 &p, float t ) SubFloat( q, 3 ) = fsel( SubFloat( p, 3 ), r, -r ); return q; } +#endif #else @@ -294,6 +327,13 @@ FORCEINLINE fltx4 QuaternionScaleSIMD( const fltx4 &p, float t ) //----------------------------------------------------------------------------- #ifndef _X360 +#if USE_DXMATH +// DXMath +FORCEINLINE fltx4 QuaternionSlerpNoAlignSIMD(const fltx4& p, const fltx4& q, float t) +{ + return DirectX::XMQuaternionSlerp(p, q, t); +} +#else // SSE and STDC FORCEINLINE fltx4 QuaternionSlerpNoAlignSIMD( const fltx4 &p, const fltx4 &q, float t ) { @@ -340,6 +380,7 @@ FORCEINLINE fltx4 QuaternionSlerpNoAlignSIMD( const fltx4 &p, const fltx4 &q, fl return result; } +#endif #else @@ -360,8 +401,795 @@ FORCEINLINE fltx4 
QuaternionSlerpSIMD( const fltx4 &p, const fltx4 &q, float t ) return result; } - #endif // ALLOW_SIMD_QUATERNION_MATH +/// class FourVectors stores 4 independent vectors for use in SIMD processing. These vectors are +/// stored in the format x x x x y y y y z z z z so that they can be efficiently SIMD-accelerated. +class ALIGN16 FourQuaternions +{ +public: + fltx4 x, y, z, w; + + FourQuaternions(void) + { + } + + FourQuaternions(const fltx4& _x, + const fltx4& _y, + const fltx4& _z, + const fltx4& _w) + : x(_x), y(_y), z(_z), w(_w) + {} + +#if !defined(__SPU__) + // four rotations around the same axis. angles should be in radians. + FourQuaternions(const fltx4& axis, + const float& angle0, const float& angle1, const float& angle2, const float& angle3) + { + FromAxisAndAngles(axis, angle0, angle1, angle2, angle3); + } +#endif + + FourQuaternions(FourQuaternions const& src) + { + x = src.x; + y = src.y; + z = src.z; + w = src.w; + } + + FORCEINLINE void operator=(FourQuaternions const& src) + { + x = src.x; + y = src.y; + z = src.z; + w = src.w; + } + + /// this = this * q; + FORCEINLINE FourQuaternions Mul(FourQuaternions const& q) const; + + /// negate the vector part + FORCEINLINE FourQuaternions Conjugate() const; + + /// for a quaternion representing a rotation of angle theta, return + /// one of angle s*theta + /// scale is four floats -- one for each quat + FORCEINLINE FourQuaternions ScaleAngle(const fltx4& scale) const; + + /// ret = this * ( s * q ) + /// In other words, for a quaternion representing a rotation of angle theta, return + /// one of angle s*theta + /// s is four floats in a fltx4 -- one for each quaternion + FORCEINLINE FourQuaternions MulAc(const fltx4& s, const FourQuaternions& q) const; + + /// ret = ( s * this ) * q + FORCEINLINE FourQuaternions ScaleMul(const fltx4& s, const FourQuaternions& q) const; + + /// Slerp four quaternions at once, FROM me TO the specified out. + FORCEINLINE FourQuaternions Slerp(const FourQuaternions& to, const fltx4& t); + + FORCEINLINE FourQuaternions SlerpNoAlign(const FourQuaternions& originalto, const fltx4& t); + +#if !defined(__SPU__) + /// given an axis and four angles, populate this quaternion with the equivalent rotations + /// (ie, make these four quaternions represent four different rotations around the same axis) + /// angles should be in RADIANS + FORCEINLINE FourQuaternions& FromAxisAndAngles(const fltx4& axis, + const float& angle0, const float& angle1, const float& angle2, const float& angle3); + FORCEINLINE FourQuaternions& FromAxisAndAngles(const fltx4& axis, const fltx4& angles); + // one convenience imp if you're doing this in degrees + FORCEINLINE FourQuaternions& FromAxisAndAnglesInDegrees(const fltx4& axis, const fltx4& angles) + { + return FromAxisAndAngles(axis, MulSIMD(angles, Four_DegToRad)); + } +#endif + + // rotate (in place) a FourVectors by this quaternion. there's a corresponding RotateBy in FourVectors. + FORCEINLINE void RotateFourVectors(FourVectors* RESTRICT vecs) const RESTRICT; + + + /// LoadAndSwizzleAligned - load 4 QuaternionAligneds into a FourQuaternions, performing transpose op. 
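+	/// (the transpose converts the four AoS quaternions in memory into this class's SoA x/y/z/w layout)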
+ /// all 4 vectors must be 128 bit boundary + FORCEINLINE void LoadAndSwizzleAligned(const float* RESTRICT a, const float* RESTRICT b, const float* RESTRICT c, const float* RESTRICT d) + { +#if defined( _X360 ) + fltx4 tx = LoadAlignedSIMD(a); + fltx4 ty = LoadAlignedSIMD(b); + fltx4 tz = LoadAlignedSIMD(c); + fltx4 tw = LoadAlignedSIMD(d); + fltx4 r0 = __vmrghw(tx, tz); + fltx4 r1 = __vmrghw(ty, tw); + fltx4 r2 = __vmrglw(tx, tz); + fltx4 r3 = __vmrglw(ty, tw); + + x = __vmrghw(r0, r1); + y = __vmrglw(r0, r1); + z = __vmrghw(r2, r3); + w = __vmrglw(r2, r3); +#else + x = LoadAlignedSIMD(a); + y = LoadAlignedSIMD(b); + z = LoadAlignedSIMD(c); + w = LoadAlignedSIMD(d); + // now, matrix is: + // x y z w + // x y z w + // x y z w + // x y z w + TransposeSIMD(x, y, z, w); +#endif + } + + FORCEINLINE void LoadAndSwizzleAligned(const QuaternionAligned* RESTRICT a, + const QuaternionAligned* RESTRICT b, + const QuaternionAligned* RESTRICT c, + const QuaternionAligned* RESTRICT d) + { + LoadAndSwizzleAligned(a->Base(), b->Base(), c->Base(), d->Base()); + } + + + /// LoadAndSwizzleAligned - load 4 consecutive QuaternionAligneds into a FourQuaternions, + /// performing transpose op. + /// all 4 vectors must be 128 bit boundary + FORCEINLINE void LoadAndSwizzleAligned(const QuaternionAligned* qs) + { +#if defined( _X360 ) + fltx4 tx = LoadAlignedSIMD(qs++); + fltx4 ty = LoadAlignedSIMD(qs++); + fltx4 tz = LoadAlignedSIMD(qs++); + fltx4 tw = LoadAlignedSIMD(qs); + fltx4 r0 = __vmrghw(tx, tz); + fltx4 r1 = __vmrghw(ty, tw); + fltx4 r2 = __vmrglw(tx, tz); + fltx4 r3 = __vmrglw(ty, tw); + + x = __vmrghw(r0, r1); + y = __vmrglw(r0, r1); + z = __vmrghw(r2, r3); + w = __vmrglw(r2, r3); +#else + x = LoadAlignedSIMD(qs++); + y = LoadAlignedSIMD(qs++); + z = LoadAlignedSIMD(qs++); + w = LoadAlignedSIMD(qs++); + // now, matrix is: + // x y z w + // x y z w + // x y z w + // x y z w + TransposeSIMD(x, y, z, w); +#endif + } + + // Store the FourQuaternions out to four nonconsecutive ordinary quaternions in memory. + FORCEINLINE void SwizzleAndStoreAligned(QuaternionAligned* a, QuaternionAligned* b, QuaternionAligned* c, QuaternionAligned* d) + { +#if defined( _X360 ) + fltx4 r0 = __vmrghw(x, z); + fltx4 r1 = __vmrghw(y, w); + fltx4 r2 = __vmrglw(x, z); + fltx4 r3 = __vmrglw(y, w); + + fltx4 rx = __vmrghw(r0, r1); + fltx4 ry = __vmrglw(r0, r1); + fltx4 rz = __vmrghw(r2, r3); + fltx4 rw = __vmrglw(r2, r3); + + StoreAlignedSIMD(a, rx); + StoreAlignedSIMD(b, ry); + StoreAlignedSIMD(c, rz); + StoreAlignedSIMD(d, rw); +#else + fltx4 dupes[4] = { x, y, z, w }; + TransposeSIMD(dupes[0], dupes[1], dupes[2], dupes[3]); + StoreAlignedSIMD(a, dupes[0]); + StoreAlignedSIMD(b, dupes[1]); + StoreAlignedSIMD(c, dupes[2]); + StoreAlignedSIMD(d, dupes[3]); +#endif + } + + // Store the FourQuaternions out to four consecutive ordinary quaternions in memory. + FORCEINLINE void SwizzleAndStoreAligned(QuaternionAligned* qs) + { +#if defined( _X360 ) + fltx4 r0 = __vmrghw(x, z); + fltx4 r1 = __vmrghw(y, w); + fltx4 r2 = __vmrglw(x, z); + fltx4 r3 = __vmrglw(y, w); + + fltx4 rx = __vmrghw(r0, r1); + fltx4 ry = __vmrglw(r0, r1); + fltx4 rz = __vmrghw(r2, r3); + fltx4 rw = __vmrglw(r2, r3); + + StoreAlignedSIMD(qs, rx); + StoreAlignedSIMD(++qs, ry); + StoreAlignedSIMD(++qs, rz); + StoreAlignedSIMD(++qs, rw); +#else + SwizzleAndStoreAligned(qs, qs + 1, qs + 2, qs + 3); +#endif + } + + // Store the FourQuaternions out to four consecutive ordinary quaternions in memory. 
+ // The mask specifies which of the quaternions are actually written out -- each + // word in the fltx4 should be all binary ones or zeros. Ones means the corresponding + // quat will be written. + FORCEINLINE void SwizzleAndStoreAlignedMasked(QuaternionAligned* RESTRICT qs, const fltx4& controlMask) + { + fltx4 originals[4]; + originals[0] = LoadAlignedSIMD(qs); + originals[1] = LoadAlignedSIMD(qs + 1); + originals[2] = LoadAlignedSIMD(qs + 2); + originals[3] = LoadAlignedSIMD(qs + 3); + + fltx4 masks[4] = { SplatXSIMD(controlMask), + SplatYSIMD(controlMask), + SplatZSIMD(controlMask), + SplatWSIMD(controlMask) }; + +#if defined( _X360 ) + fltx4 r0 = __vmrghw(x, z); + fltx4 r1 = __vmrghw(y, w); + fltx4 r2 = __vmrglw(x, z); + fltx4 r3 = __vmrglw(y, w); + + fltx4 rx = __vmrghw(r0, r1); + fltx4 ry = __vmrglw(r0, r1); + fltx4 rz = __vmrghw(r2, r3); + fltx4 rw = __vmrglw(r2, r3); +#else + fltx4 rx = x; + fltx4 ry = y; + fltx4 rz = z; + fltx4 rw = w; + TransposeSIMD(rx, ry, rz, rw); +#endif + + StoreAlignedSIMD(qs + 0, MaskedAssign(masks[0], rx, originals[0])); + StoreAlignedSIMD(qs + 1, MaskedAssign(masks[1], ry, originals[1])); + StoreAlignedSIMD(qs + 2, MaskedAssign(masks[2], rz, originals[2])); + StoreAlignedSIMD(qs + 3, MaskedAssign(masks[3], rw, originals[3])); + } +}; + + + +FORCEINLINE FourQuaternions FourQuaternions::Conjugate() const +{ + return FourQuaternions(NegSIMD(x), NegSIMD(y), NegSIMD(z), w); +} + +FORCEINLINE const fltx4 Dot(const FourQuaternions& a, const FourQuaternions& b) +{ + return + MaddSIMD(a.x, b.x, + MaddSIMD(a.y, b.y, + MaddSIMD(a.z, b.z, MulSIMD(a.w, b.w)) + ) + ); +} + + +FORCEINLINE const FourQuaternions Madd(const FourQuaternions& a, const fltx4& scale, const FourQuaternions& c) +{ + FourQuaternions ret; + ret.x = MaddSIMD(a.x, scale, c.x); + ret.y = MaddSIMD(a.y, scale, c.y); + ret.z = MaddSIMD(a.z, scale, c.z); + ret.w = MaddSIMD(a.w, scale, c.w); + return ret; +} + +FORCEINLINE const FourQuaternions Mul(const FourQuaternions& a, const fltx4& scale) +{ + FourQuaternions ret; + ret.x = MulSIMD(a.x, scale); + ret.y = MulSIMD(a.y, scale); + ret.z = MulSIMD(a.z, scale); + ret.w = MulSIMD(a.w, scale); + return ret; +} + +FORCEINLINE const FourQuaternions Add(const FourQuaternions& a, const FourQuaternions& b) +{ + FourQuaternions ret; + ret.x = AddSIMD(a.x, b.x); + ret.y = AddSIMD(a.y, b.y); + ret.z = AddSIMD(a.z, b.z); + ret.w = AddSIMD(a.w, b.w); + return ret; +} + +FORCEINLINE const FourQuaternions Sub(const FourQuaternions& a, const FourQuaternions& b) +{ + FourQuaternions ret; + ret.x = SubSIMD(a.x, b.x); + ret.y = SubSIMD(a.y, b.y); + ret.z = SubSIMD(a.z, b.z); + ret.w = SubSIMD(a.w, b.w); + return ret; +} + +FORCEINLINE const FourQuaternions Neg(const FourQuaternions& q) +{ + FourQuaternions ret; + ret.x = NegSIMD(q.x); + ret.y = NegSIMD(q.y); + ret.z = NegSIMD(q.z); + ret.w = NegSIMD(q.w); + return ret; +} + +FORCEINLINE const FourQuaternions MaskedAssign(const fltx4& mask, const FourQuaternions& a, const FourQuaternions& b) +{ + FourQuaternions ret; + ret.x = MaskedAssign(mask, a.x, b.x); + ret.y = MaskedAssign(mask, a.y, b.y); + ret.z = MaskedAssign(mask, a.z, b.z); + ret.w = MaskedAssign(mask, a.w, b.w); + return ret; +} + +#ifdef DIFFERENT_NATIVE_VECTOR_TYPES +FORCEINLINE const FourQuaternions MaskedAssign(const fltx4& mask, const FourQuaternions& a, const FourQuaternions& b) +{ + return MaskedAssign((bi32x4)mask, a, b); +} +#endif + + +FORCEINLINE FourQuaternions QuaternionAlign(const FourQuaternions& p, const FourQuaternions& q) +{ + // decide if 
one of the quaternions is backwards + fltx4 cmp = CmpLtSIMD(Dot(p, q), Four_Zeros); + return MaskedAssign(cmp, Neg(q), q); +} + + +FORCEINLINE const FourQuaternions QuaternionNormalize(const FourQuaternions& q) +{ + fltx4 radius = Dot(q, q); + fltx4 mask = CmpEqSIMD(radius, Four_Zeros); // all ones iff radius = 0 + fltx4 invRadius = ReciprocalSqrtSIMD(radius); + + FourQuaternions ret = MaskedAssign(mask, q, Mul(q, invRadius)); + return ret; +} + + +#if !defined(__SPU__) +FORCEINLINE FourQuaternions& FourQuaternions::FromAxisAndAngles(const fltx4& axis, + const float& angle0, const float& angle1, const float& angle2, const float& angle3) +{ + return FromAxisAndAngles(axis, LoadGatherSIMD(angle0, angle1, angle2, angle3)); +} + +FORCEINLINE FourQuaternions& FourQuaternions::FromAxisAndAngles(const fltx4& axis, + const fltx4& angles) +{ + // compute the half theta + fltx4 theta = MulSIMD(angles, Four_PointFives); + // compute the sine and cosine of each angle simultaneously + fltx4 vsines; fltx4 vcoses; + SinCosSIMD(vsines, vcoses, theta); + // now the sines and coses vectors contain the results for four angles. + // for each of the angles, splat them out and then swizzle together so + // as to get a < cos, sin, sin, sin > coefficient vector + + x = MulSIMD(vsines, SplatXSIMD(axis)); // sin(t0) * x, sin(t1) * x, etc + y = MulSIMD(vsines, SplatYSIMD(axis)); + z = MulSIMD(vsines, SplatZSIMD(axis)); + w = vcoses; + + + return *this; +} +#endif + + +/// this = this * q; +FORCEINLINE FourQuaternions FourQuaternions::Mul(FourQuaternions const& q) const +{ + // W = w1w2 - x1x2 - y1y2 - z1z2 + FourQuaternions ret; + fltx4 signMask = LoadAlignedSIMD((float*)g_SIMD_signmask); + // as we do the multiplication, also do a dot product, so we know whether + // one of the quats is backwards and if we therefore have to negate at the end + fltx4 dotProduct = MulSIMD(w, q.w); + + ret.w = MulSIMD(w, q.w); // W = w1w2 + ret.x = MulSIMD(w, q.x); // X = w1x2 + ret.y = MulSIMD(w, q.y); // Y = w1y2 + ret.z = MulSIMD(w, q.z); // Z = w1z2 + + dotProduct = MaddSIMD(x, q.x, dotProduct); + ret.w = MsubSIMD(x, q.x, ret.w); // W = w1w2 - x1x2 + ret.x = MaddSIMD(x, q.w, ret.x); // X = w1x2 + x1w2 + ret.y = MsubSIMD(x, q.z, ret.y); // Y = w1y2 - x1z2 + ret.z = MaddSIMD(x, q.y, ret.z); // Z = w1z2 + x1y2 + + dotProduct = MaddSIMD(y, q.y, dotProduct); + ret.w = MsubSIMD(y, q.y, ret.w); // W = w1w2 - x1x2 - y1y2 + ret.x = MaddSIMD(y, q.z, ret.x); // X = w1x2 + x1w2 + y1z2 + ret.y = MaddSIMD(y, q.w, ret.y); // Y = w1y2 - x1z2 + y1w2 + ret.z = MsubSIMD(y, q.x, ret.z); // Z = w1z2 + x1y2 - y1x2 + + dotProduct = MaddSIMD(z, q.z, dotProduct); + ret.w = MsubSIMD(z, q.z, ret.w); // W = w1w2 - x1x2 - y1y2 - z1z2 + ret.x = MsubSIMD(z, q.y, ret.x); // X = w1x2 + x1w2 + y1z2 - z1y2 + ret.y = MaddSIMD(z, q.x, ret.y); // Y = w1y2 - x1z2 + y1w2 + z1x2 + ret.z = MaddSIMD(z, q.w, ret.z); // Z = w1z2 + x1y2 - y1x2 + z1w2 + + fltx4 Zero = Four_Zeros; + fltx4 control = CmpLtSIMD(dotProduct, Four_Zeros); + signMask = MaskedAssign(control, signMask, Zero); // negate quats where q1.q2 < 0 + ret.w = XorSIMD(signMask, ret.w); + ret.x = XorSIMD(signMask, ret.x); + ret.y = XorSIMD(signMask, ret.y); + ret.z = XorSIMD(signMask, ret.z); + + return ret; +} + + +FORCEINLINE void FourQuaternions::RotateFourVectors(FourVectors* RESTRICT vecs) const RESTRICT +{ + fltx4 tmpX, tmpY, tmpZ, tmpW; + fltx4 outX, outY, outZ; + + tmpX = SubSIMD(MaddSIMD(w, vecs->x, MulSIMD(y, vecs->z)), + MulSIMD(z, vecs->y)); + + tmpY = SubSIMD(MaddSIMD(w, vecs->y, MulSIMD(z, vecs->x)), 
+ MulSIMD(x, vecs->z)); + + tmpZ = SubSIMD(MaddSIMD(w, vecs->z, MulSIMD(x, vecs->y)), + MulSIMD(y, vecs->x)); + + tmpW = AddSIMD(MaddSIMD(x, vecs->x, MulSIMD(y, vecs->y)), + MulSIMD(z, vecs->z)); + + + outX = AddSIMD(SubSIMD(MaddSIMD(tmpW, x, MulSIMD(tmpX, w)), + MulSIMD(tmpY, z)), + MulSIMD(tmpZ, y)); + + outY = AddSIMD(SubSIMD(MaddSIMD(tmpW, y, MulSIMD(tmpY, w)), + MulSIMD(tmpZ, x)), + MulSIMD(tmpX, z)); + + outZ = AddSIMD(SubSIMD(MaddSIMD(tmpW, z, MulSIMD(tmpZ, w)), + MulSIMD(tmpX, y)), + MulSIMD(tmpY, x)); + + // although apparently redundant, assigning the results to intermediate local variables + // seems to improve code scheduling slightly in SN. + vecs->x = outX; + vecs->y = outY; + vecs->z = outZ; +} + + +/* +void QuaternionScale( const Quaternion &p, float t, Quaternion &q ) +{ + Assert( s_bMathlibInitialized ); + float r; + // FIXME: nick, this isn't overly sensitive to accuracy, and it may be faster to + // use the cos part (w) of the quaternion (sin(omega)*N,cos(omega)) to figure the new scale. + float sinom = sqrt( DotProduct( &p.x, &p.x ) ); + sinom = min( sinom, 1.f ); + float sinsom = sin( asin( sinom ) * t ); + t = sinsom / (sinom + FLT_EPSILON); + VectorScale( &p.x, t, &q.x ); + // rescale rotation + r = 1.0f - sinsom * sinsom; + // Assert( r >= 0 ); + if (r < 0.0f) + r = 0.0f; + r = sqrt( r ); + // keep sign of rotation + if (p.w < 0) + q.w = -r; + else + q.w = r; + Assert( q.IsValid() ); + return; +} +*/ + +FORCEINLINE FourQuaternions FourQuaternions::ScaleAngle(const fltx4& scale) const +{ + FourQuaternions ret; + static const fltx4 OneMinusEpsilon = { 1.0f - 0.000001f, 1.0f - 0.000001f, 1.0f - 0.000001f, 1.0f - 0.000001f }; + const fltx4 Zero = Four_Zeros; + fltx4 signMask = LoadAlignedSIMD((float*)g_SIMD_signmask); + // work out if there are any tiny scales or angles, which are unstable + fltx4 tinyAngles = CmpGtSIMD(w, OneMinusEpsilon); + fltx4 negativeRotations = CmpLtSIMD(w, Zero); // if any w's are <0, we will need to negate later down + + // figure out the theta + fltx4 angles = ArcCosSIMD(w); + + // test also if w > -1 + fltx4 negativeWs = XorSIMD(signMask, w); + tinyAngles = OrSIMD(CmpGtSIMD(negativeWs, OneMinusEpsilon), tinyAngles); + + // meanwhile start working on computing the dot product of the + // vector component, and trust in the scheduler to interleave them + fltx4 vLenSq = MulSIMD(x, x); + vLenSq = MaddSIMD(y, y, vLenSq); + vLenSq = MaddSIMD(z, z, vLenSq); + + // scale the angles + angles = MulSIMD(angles, scale); + + // clear out the sign mask where w>=0 + signMask = MaskedAssign(negativeRotations, signMask, Zero); + + // work out the new w component and vector length + fltx4 vLenRecip = ReciprocalSqrtSIMD(vLenSq); // interleave with Cos to hide latencies + fltx4 sine; + SinCosSIMD(sine, ret.w, angles); + ret.x = MulSIMD(x, vLenRecip); // renormalize so the vector length + w = 1 + ret.y = MulSIMD(y, vLenRecip); // renormalize so the vector length + w = 1 + ret.z = MulSIMD(z, vLenRecip); // renormalize so the vector length + w = 1 + ret.x = MulSIMD(ret.x, sine); + ret.y = MulSIMD(ret.y, sine); + ret.z = MulSIMD(ret.z, sine); + + // negate where necessary + ret.x = XorSIMD(ret.x, signMask); + ret.y = XorSIMD(ret.y, signMask); + ret.z = XorSIMD(ret.z, signMask); + ret.w = XorSIMD(ret.w, signMask); + + // finally, toss results from where cos(theta) is close to 1 -- these are non rotations. 
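+	// (those lanes have a near-zero vector part, so vLenRecip above can be huge or NaN;
+	//  passing the source quaternion through unchanged is the safe result, since the
+	//  rotation is already near identity)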
+ ret.x = MaskedAssign(tinyAngles, x, ret.x); + ret.y = MaskedAssign(tinyAngles, y, ret.y); + ret.z = MaskedAssign(tinyAngles, z, ret.z); + ret.w = MaskedAssign(tinyAngles, w, ret.w); + + return ret; +} + +//----------------------------------------------------------------------------- +// Purpose: return = this * ( s * q ) +// In other words, for a quaternion representing a rotation of angle theta, return +// one of angle s*theta +// s is four floats in a fltx4 -- one for each quaternion +//----------------------------------------------------------------------------- + +FORCEINLINE FourQuaternions FourQuaternions::MulAc(const fltx4& s, const FourQuaternions& q) const +{ + /* + void QuaternionMA( const Quaternion &p, float s, const Quaternion &q, Quaternion &qt ) + { + Quaternion p1, q1; + QuaternionScale( q, s, q1 ); + QuaternionMult( p, q1, p1 ); + QuaternionNormalize( p1 ); + qt[0] = p1[0]; + qt[1] = p1[1]; + qt[2] = p1[2]; + qt[3] = p1[3]; + } + */ + + return Mul(q.ScaleAngle(s)); +} + + +FORCEINLINE FourQuaternions FourQuaternions::ScaleMul(const fltx4& s, const FourQuaternions& q) const +{ + return ScaleAngle(s).Mul(q); +} + + +FORCEINLINE FourQuaternions FourQuaternions::Slerp(const FourQuaternions& originalto, const fltx4& t) +{ + FourQuaternions ret; + static const fltx4 OneMinusEpsilon = { 1.0f - 0.000001f, 1.0f - 0.000001f, 1.0f - 0.000001f, 1.0f - 0.000001f }; + + // align if necessary. + + // actually, before we even do that, start by computing the dot product of + // the quaternions. it has lots of dependent ops and we can sneak it into + // the pipeline bubbles as we figure out alignment. Of course we don't know + // yet if we need to realign, so compute them both -- there's plenty of + // space in the bubbles. They're roomy, those bubbles. + fltx4 cosineOmega; +#if 0 // Maybe I don't need to do alignment seperately, using the xb360 technique... + FourQuaternions to; + { + fltx4 diffs[4], sums[4], originalToNeg[4]; + fltx4 dotIfAligned, dotIfNotAligned; + + // compute negations of the TO quaternion. 
+ originalToNeg[0] = NegSIMD(originalto.x); + originalToNeg[1] = NegSIMD(originalto.y); + originalToNeg[2] = NegSIMD(originalto.z); + originalToNeg[3] = NegSIMD(originalto.w); + + dotIfAligned = MulSIMD(x, originalto.x); + dotIfNotAligned = MulSIMD(x, originalToNeg[0]); + + diffs[0] = SubSIMD(x, originalto.x); + diffs[1] = SubSIMD(y, originalto.y); + diffs[2] = SubSIMD(z, originalto.z); + diffs[3] = SubSIMD(w, originalto.w); + + sums[0] = AddSIMD(x, originalto.x); + sums[1] = AddSIMD(y, originalto.y); + sums[2] = AddSIMD(z, originalto.z); + sums[3] = AddSIMD(w, originalto.w); + + dotIfAligned = MaddSIMD(y, originalto.y, dotIfAligned); + dotIfNotAligned = MaddSIMD(y, originalToNeg[1], dotIfNotAligned); + + fltx4 diffsDot, sumsDot; + + diffsDot = MulSIMD(diffs[0], diffs[0]); // x^2 + sumsDot = MulSIMD(sums[0], sums[0]); // x^2 + // do some work on the dot products while letting the multiplies cook + dotIfAligned = MaddSIMD(z, originalto.z, dotIfAligned); + dotIfNotAligned = MaddSIMD(z, originalToNeg[2], dotIfNotAligned); + + diffsDot = MaddSIMD(diffs[1], diffs[1], diffsDot); // x^2 + y^2 + sumsDot = MaddSIMD(sums[1], sums[1], sumsDot); + diffsDot = MaddSIMD(diffs[2], diffs[2], diffsDot); // x^2 + y^2 + z^2 + sumsDot = MaddSIMD(sums[2], sums[2], sumsDot); + diffsDot = MaddSIMD(diffs[3], diffs[3], diffsDot); // x^2 + y^2 + z^2 + w^2 + sumsDot = MaddSIMD(sums[3], sums[3], sumsDot); + // do some work on the dot products while letting the multiplies cook + dotIfAligned = MaddSIMD(w, originalto.w, dotIfAligned); + dotIfNotAligned = MaddSIMD(w, originalToNeg[3], dotIfNotAligned); + + // are the differences greater than the sums? + // if so, we need to negate that quaternion + fltx4 mask = CmpGtSIMD(diffsDot, sumsDot); // 1 for diffs>0 and 0 elsewhere + to.x = MaskedAssign(mask, originalToNeg[0], originalto.x); + to.y = MaskedAssign(mask, originalToNeg[1], originalto.y); + to.z = MaskedAssign(mask, originalToNeg[2], originalto.z); + to.w = MaskedAssign(mask, originalToNeg[3], originalto.w); + + cosineOmega = MaskedAssign(mask, dotIfNotAligned, dotIfAligned); + } + + // right, now to is aligned to be the short way round, and we computed + // the dot product while we were figuring all that out. +#else + const FourQuaternions& to = originalto; + cosineOmega = MulSIMD(x, to.x); + cosineOmega = MaddSIMD(y, to.y, cosineOmega); + cosineOmega = MaddSIMD(z, to.z, cosineOmega); + cosineOmega = MaddSIMD(w, to.w, cosineOmega); +#endif + + fltx4 Zero = Four_Zeros; + fltx4 cosOmegaLessThanZero = CmpLtSIMD(cosineOmega, Zero); + // fltx4 shouldNegate = MaskedAssign(cosOmegaLessThanZero, Four_NegativeOnes , Four_Ones ); + fltx4 signMask = LoadAlignedSIMD((float*)g_SIMD_signmask); // contains a one in the sign bit -- xor against a number to negate it + fltx4 sinOmega = Four_Ones; + + // negate cosineOmega where necessary + cosineOmega = MaskedAssign(cosOmegaLessThanZero, XorSIMD(cosineOmega, signMask), cosineOmega); + fltx4 oneMinusT = SubSIMD(Four_Ones, t); + fltx4 bCosOmegaLessThanOne = CmpLtSIMD(cosineOmega, OneMinusEpsilon); // we'll use this to mask out null slerps + + // figure out the sin component of the diff quaternion. + // since sin^2(t) + cos^2(t) = 1... 
+ sinOmega = MsubSIMD(cosineOmega, cosineOmega, sinOmega); // = 1 - cos^2(t) = sin^2(t) + fltx4 invSinOmega = ReciprocalSqrtSIMD(sinOmega); // 1/sin(t) + sinOmega = MulSIMD(sinOmega, invSinOmega); // = sin^2(t) / sin(t) = sin(t) + + // use the arctangent technique to work out omega from tan^-1(sin/cos) + fltx4 omega = ArcTan2SIMD(sinOmega, cosineOmega); + + // alpha = sin(omega * (1-T))/sin(omega) + // beta = sin(omega * T)/sin(omega) + fltx4 alpha = MulSIMD(omega, oneMinusT); // w(1-T) + fltx4 beta = MulSIMD(omega, t); // w(T) + signMask = MaskedAssign(cosOmegaLessThanZero, signMask, Zero); + + alpha = SinSIMD(alpha); // sin(w(1-T)) + beta = SinSIMD(beta); // sin(wT) + + alpha = MulSIMD(alpha, invSinOmega); + beta = MulSIMD(beta, invSinOmega); + + // depending on whether the dot product was less than zero, negate beta, or not + beta = XorSIMD(beta, signMask); + + // mask out singularities (where omega = 1) + alpha = MaskedAssign(bCosOmegaLessThanOne, alpha, oneMinusT); + beta = MaskedAssign(bCosOmegaLessThanOne, beta, t); + + ret.x = MulSIMD(x, alpha); + ret.y = MulSIMD(y, alpha); + ret.z = MulSIMD(z, alpha); + ret.w = MulSIMD(w, alpha); + + ret.x = MaddSIMD(to.x, beta, ret.x); + ret.y = MaddSIMD(to.y, beta, ret.y); + ret.z = MaddSIMD(to.z, beta, ret.z); + ret.w = MaddSIMD(to.w, beta, ret.w); + + return ret; +} + + + +FORCEINLINE FourQuaternions FourQuaternions::SlerpNoAlign(const FourQuaternions& originalto, const fltx4& t) +{ + FourQuaternions ret; + static const fltx4 OneMinusEpsilon = { 1.0f - 0.000001f, 1.0f - 0.000001f, 1.0f - 0.000001f, 1.0f - 0.000001f }; + + // align if necessary. + + // actually, before we even do that, start by computing the dot product of + // the quaternions. it has lots of dependent ops and we can sneak it into + // the pipeline bubbles as we figure out alignment. Of course we don't know + // yet if we need to realign, so compute them both -- there's plenty of + // space in the bubbles. They're roomy, those bubbles. + fltx4 cosineOmega; + + const FourQuaternions& to = originalto; + cosineOmega = MulSIMD(x, to.x); + cosineOmega = MaddSIMD(y, to.y, cosineOmega); + cosineOmega = MaddSIMD(z, to.z, cosineOmega); + cosineOmega = MaddSIMD(w, to.w, cosineOmega); + + fltx4 sinOmega = Four_Ones; + + fltx4 oneMinusT = SubSIMD(Four_Ones, t); + fltx4 bCosOmegaLessThanOne = CmpLtSIMD(cosineOmega, OneMinusEpsilon); // we'll use this to mask out null slerps + + // figure out the sin component of the diff quaternion. + // since sin^2(t) + cos^2(t) = 1... 
+ sinOmega = MsubSIMD(cosineOmega, cosineOmega, sinOmega); // = 1 - cos^2(t) = sin^2(t) + fltx4 invSinOmega = ReciprocalSqrtSIMD(sinOmega); // 1/sin(t) + sinOmega = MulSIMD(sinOmega, invSinOmega); // = sin^2(t) / sin(t) = sin(t) + + // use the arctangent technique to work out omega from tan^-1(sin/cos) + fltx4 omega = ArcTan2SIMD(sinOmega, cosineOmega); + + // alpha = sin(omega * (1-T))/sin(omega) + // beta = sin(omega * T)/sin(omega) + fltx4 alpha = MulSIMD(omega, oneMinusT); // w(1-T) + fltx4 beta = MulSIMD(omega, t); // w(T) + alpha = SinSIMD(alpha); // sin(w(1-T)) + beta = SinSIMD(beta); // sin(wT) + alpha = MulSIMD(alpha, invSinOmega); + beta = MulSIMD(beta, invSinOmega); + + // mask out singularities (where omega = 1) + alpha = MaskedAssign(bCosOmegaLessThanOne, alpha, oneMinusT); + beta = MaskedAssign(bCosOmegaLessThanOne, beta, t); + + ret.x = MulSIMD(x, alpha); + ret.y = MulSIMD(y, alpha); + ret.z = MulSIMD(z, alpha); + ret.w = MulSIMD(w, alpha); + + ret.x = MaddSIMD(to.x, beta, ret.x); + ret.y = MaddSIMD(to.y, beta, ret.y); + ret.z = MaddSIMD(to.z, beta, ret.z); + ret.w = MaddSIMD(to.w, beta, ret.w); + + return ret; +} + +/***** removed because one of the SWIG permutations doesn't include ssequaternion.h, causing a missing symbol on this function: +inline void FourVectors::RotateBy( const FourQuaternions &quats ) +{ + quats.RotateFourVectors( this ); +} +*/ + #endif // SSEQUATMATH_H diff --git a/src/public/mathlib/vector.h b/src/public/mathlib/vector.h index c7654ba83..7ec2a0469 100644 --- a/src/public/mathlib/vector.h +++ b/src/public/mathlib/vector.h @@ -2177,55 +2177,26 @@ inline void AngularImpulseToQAngle( const AngularImpulse &impulse, QAngle &angle FORCEINLINE vec_t InvRSquared( float const *v ) { -#if defined(__i386__) || defined(_M_IX86) - float sqrlen = v[0]*v[0]+v[1]*v[1]+v[2]*v[2] + 1.0e-10f, result; - _mm_store_ss(&result, _mm_rcp_ss( _mm_max_ss( _mm_set_ss(1.0f), _mm_load_ss(&sqrlen) ) )); - return result; -#else - return 1.f/fpmax(1.f, v[0]*v[0]+v[1]*v[1]+v[2]*v[2]); -#endif + // The compiler will make it good + return 1.f / (v[0]*v[0]+v[1]*v[1]+v[2]*v[2] + 1.0e-10f); } FORCEINLINE vec_t InvRSquared( const Vector &v ) { - return InvRSquared(&v.x); -} - -#if defined(__i386__) || defined(_M_IX86) -inline void _SSE_RSqrtInline( float a, float* out ) -{ - __m128 xx = _mm_load_ss( &a ); - __m128 xr = _mm_rsqrt_ss( xx ); - __m128 xt; - xt = _mm_mul_ss( xr, xr ); - xt = _mm_mul_ss( xt, xx ); - xt = _mm_sub_ss( _mm_set_ss(3.f), xt ); - xt = _mm_mul_ss( xt, _mm_set_ss(0.5f) ); - xr = _mm_mul_ss( xr, xt ); - _mm_store_ss( out, xr ); + // The compiler will make it good + return 1.0f / (v.x*v.x + v.y*v.y + v.z*v.z + 1.0e-10f); } -#endif // FIXME: Change this back to a #define once we get rid of the vec_t version FORCEINLINE float VectorNormalize( Vector& vec ) { -#ifndef DEBUG // stop crashing my edit-and-continue! 
- #if defined(__i386__) || defined(_M_IX86) - #define DO_SSE_OPTIMIZATION - #endif -#endif - -#if defined( DO_SSE_OPTIMIZATION ) - float sqrlen = vec.LengthSqr() + 1.0e-10f, invlen; - _SSE_RSqrtInline(sqrlen, &invlen); + // The compiler will make it good + const float len = sqrtf(vec.x*vec.x + vec.y*vec.y + vec.z*vec.z + 1.0e-10f); + const float invlen = 1.0f / len; vec.x *= invlen; vec.y *= invlen; vec.z *= invlen; - return sqrlen * invlen; -#else - extern float (FASTCALL *pfVectorNormalize)(Vector& v); - return (*pfVectorNormalize)(vec); -#endif + return len; } // FIXME: Obsolete version of VectorNormalize, once we remove all the friggin float*s @@ -2236,7 +2207,11 @@ FORCEINLINE float VectorNormalize( float * v ) FORCEINLINE void VectorNormalizeFast( Vector &vec ) { - VectorNormalize(vec); + // The previous version just called VectorNormalize but it's significant to be able to do a rsqrtss here. + const float invlen = 1.0f / sqrtf(vec.x*vec.x + vec.y*vec.y + vec.z*vec.z + 1.0e-10f); + vec.x *= invlen; + vec.y *= invlen; + vec.z *= invlen; } #else @@ -2308,4 +2283,3 @@ inline bool Vector::IsLengthLessThan( float val ) const } #endif - diff --git a/src/public/mathlib/vector4d.h b/src/public/mathlib/vector4d.h index 2b20c8823..812348910 100644 --- a/src/public/mathlib/vector4d.h +++ b/src/public/mathlib/vector4d.h @@ -635,20 +635,17 @@ inline void Vector4DMultiplyAligned( Vector4DAligned const& a, Vector4DAligned c #endif } -inline void Vector4DWeightMAD( vec_t w, Vector4DAligned const& vInA, Vector4DAligned& vOutA, Vector4DAligned const& vInB, Vector4DAligned& vOutB ) +inline void Vector4DWeightMADSSE( vec_t w, Vector4DAligned const& vInA, Vector4DAligned& vOutA, Vector4DAligned const& vInB, Vector4DAligned& vOutB ) { Assert( vInA.IsValid() && vInB.IsValid() && IsFinite(w) ); #if !defined( _X360 ) - vOutA.x += vInA.x * w; - vOutA.y += vInA.y * w; - vOutA.z += vInA.z * w; - vOutA.w += vInA.w * w; + // Replicate scalar float out to 4 components + __m128 packed = _mm_set_ps1( w ); - vOutB.x += vInB.x * w; - vOutB.y += vInB.y * w; - vOutB.z += vInB.z * w; - vOutB.w += vInB.w * w; + // 4D SSE Vector MAD + vOutA.AsM128() = _mm_add_ps( vOutA.AsM128(), _mm_mul_ps( vInA.AsM128(), packed ) ); + vOutB.AsM128() = _mm_add_ps( vOutB.AsM128(), _mm_mul_ps( vInB.AsM128(), packed ) ); #else __vector4 temp; @@ -660,17 +657,24 @@ inline void Vector4DWeightMAD( vec_t w, Vector4DAligned const& vInA, Vector4DAli #endif } -inline void Vector4DWeightMADSSE( vec_t w, Vector4DAligned const& vInA, Vector4DAligned& vOutA, Vector4DAligned const& vInB, Vector4DAligned& vOutB ) +inline void Vector4DWeightMAD( vec_t w, Vector4DAligned const& vInA, Vector4DAligned& vOutA, Vector4DAligned const& vInB, Vector4DAligned& vOutB ) { Assert( vInA.IsValid() && vInB.IsValid() && IsFinite(w) ); #if !defined( _X360 ) - // Replicate scalar float out to 4 components - __m128 packed = _mm_set1_ps( w ); +#if 1 // Now using SSE2, so this is faster + Vector4DWeightMADSSE(w, vInA, vOutA, vInB, vOutB); +#else + vOutA.x += vInA.x * w; + vOutA.y += vInA.y * w; + vOutA.z += vInA.z * w; + vOutA.w += vInA.w * w; - // 4D SSE Vector MAD - vOutA.AsM128() = _mm_add_ps( vOutA.AsM128(), _mm_mul_ps( vInA.AsM128(), packed ) ); - vOutB.AsM128() = _mm_add_ps( vOutB.AsM128(), _mm_mul_ps( vInB.AsM128(), packed ) ); + vOutB.x += vInB.x * w; + vOutB.y += vInB.y * w; + vOutB.z += vInB.z * w; + vOutB.w += vInB.w * w; +#endif #else __vector4 temp; diff --git a/src/thirdparty/DirectXMath-dec2022/Inc/DirectXMath.h 
b/src/thirdparty/DirectXMath-dec2022/Inc/DirectXMath.h index 593aead5b..fd542388f 100644 --- a/src/thirdparty/DirectXMath-dec2022/Inc/DirectXMath.h +++ b/src/thirdparty/DirectXMath-dec2022/Inc/DirectXMath.h @@ -147,7 +147,11 @@ #endif #endif // !_XM_NO_INTRINSICS_ -#include "sal.h" +#ifdef _WIN32 +#include +#else +#include "../../dotnetrt/sal.h" +#endif #include #ifdef _MSC_VER diff --git a/src/thirdparty/dotnetrt/sal.h b/src/thirdparty/dotnetrt/sal.h index 2e0457140..a4a31fa44 100644 --- a/src/thirdparty/dotnetrt/sal.h +++ b/src/thirdparty/dotnetrt/sal.h @@ -1,2953 +1,327 @@ -// VALVE EDIT: -// taken from https://github.com/dotnet/runtime/blob/main/src/coreclr/pal/inc/rt/sal.h -// used for DirectXMath compatibly on POSIX - -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. - -/*** -*sal.h - markers for documenting the semantics of APIs -* - -* -*Purpose: -* sal.h provides a set of annotations to describe how a function uses its -* parameters - the assumptions it makes about them, and the guarantees it makes -* upon finishing. -****/ -#pragma once - -/*========================================================================== - - The comments in this file are intended to give basic understanding of - the usage of SAL, the Microsoft Source Code Annotation Language. - For more details, please see http://go.microsoft.com/fwlink/?LinkID=242134 - - The macros are defined in 3 layers, plus the structural set: - - _In_/_Out_/_Ret_ Layer: - ---------------------- - This layer provides the highest abstraction and its macros should be used - in most cases. These macros typically start with: - _In_ : input parameter to a function, unmodified by called function - _Out_ : output parameter, written to by called function, pointed-to - location not expected to be initialized prior to call - _Outptr_ : like _Out_ when returned variable is a pointer type - (so param is pointer-to-pointer type). Called function - provides/allocated space. - _Outref_ : like _Outptr_, except param is reference-to-pointer type. - _Inout_ : inout parameter, read from and potentially modified by - called function. - _Ret_ : for return values - _Field_ : class/struct field invariants - For common usage, this class of SAL provides the most concise annotations. - Note that _In_/_Out_/_Inout_/_Outptr_ annotations are designed to be used - with a parameter target. Using them with _At_ to specify non-parameter - targets may yield unexpected results. - - This layer also includes a number of other properties that can be specified - to extend the ability of code analysis, most notably: - -- Designating parameters as format strings for printf/scanf/scanf_s - -- Requesting stricter type checking for C enum parameters - - _Pre_/_Post_ Layer: - ------------------ - The macros of this layer only should be used when there is no suitable macro - in the _In_/_Out_ layer. Its macros start with _Pre_ or _Post_. - This layer provides the most flexibility for annotations. - - Implementation Abstraction Layer: - -------------------------------- - Macros from this layer should never be used directly. The layer only exists - to hide the implementation of the annotation macros. - - Structural Layer: - ---------------- - These annotations, like _At_ and _When_, are used with annotations from - any of the other layers as modifiers, indicating exactly when and where - the annotations apply. 
- - - Common syntactic conventions: - ---------------------------- - - Usage: - ----- - _In_, _Out_, _Inout_, _Pre_, _Post_, are for formal parameters. - _Ret_, _Deref_ret_ must be used for return values. - - Nullness: - -------- - If the parameter can be NULL as a precondition to the function, the - annotation contains _opt. If the macro does not contain '_opt' the - parameter cannot be NULL. - - If an out/inout parameter returns a null pointer as a postcondition, this is - indicated by _Ret_maybenull_ or _result_maybenull_. If the macro is not - of this form, then the result will not be NULL as a postcondition. - _Outptr_ - output value is not NULL - _Outptr_result_maybenull_ - output value might be NULL - - String Type: - ----------- - _z: NullTerminated string - for _In_ parameters the buffer must have the specified stringtype before the call - for _Out_ parameters the buffer must have the specified stringtype after the call - for _Inout_ parameters both conditions apply - - Extent Syntax: - ------------- - Buffer sizes are expressed as element counts, unless the macro explicitly - contains _byte_ or _bytes_. Some annotations specify two buffer sizes, in - which case the second is used to indicate how much of the buffer is valid - as a postcondition. This table outlines the precondition buffer allocation - size, precondition number of valid elements, postcondition allocation size, - and postcondition number of valid elements for representative buffer size - annotations: - Pre | Pre | Post | Post - alloc | valid | alloc | valid - Annotation elems | elems | elems | elems - ---------- ------------------------------------ - _In_reads_(s) s | s | s | s - _Inout_updates_(s) s | s | s | s - _Inout_updates_to_(s,c) s | s | s | c - _Out_writes_(s) s | 0 | s | s - _Out_writes_to_(s,c) s | 0 | s | c - _Outptr_result_buffer_(s) ? | ? | s | s - _Outptr_result_buffer_to_(s,c) ? | ? | s | c - - For the _Outptr_ annotations, the buffer in question is at one level of - dereference. The called function is responsible for supplying the buffer. - - Success and failure: - ------------------- - The SAL concept of success allows functions to define expressions that can - be tested by the caller, which if it evaluates to non-zero, indicates the - function succeeded, which means that its postconditions are guaranteed to - hold. Otherwise, if the expression evaluates to zero, the function is - considered to have failed, and the postconditions are not guaranteed. - - The success criteria can be specified with the _Success_(expr) annotation: - _Success_(return != FALSE) BOOL - PathCanonicalizeA(_Out_writes_(MAX_PATH) LPSTR pszBuf, LPCSTR pszPath) : - pszBuf is only guaranteed to be NULL-terminated when TRUE is returned, - and FALSE indicates failure. In common practice, callers check for zero - vs. non-zero returns, so it is preferable to express the success - criteria in terms of zero/non-zero, not checked for exactly TRUE. - - Functions can specify that some postconditions will still hold, even when - the function fails, using _On_failure_(anno-list), or postconditions that - hold regardless of success or failure using _Always_(anno-list). - - The annotation _Return_type_success_(expr) may be used with a typedef to - give a default _Success_ criteria to all functions returning that type. - This is the case for common Windows API status types, including - HRESULT and NTSTATUS. This may be overridden on a per-function basis by - specifying a _Success_ annotation locally. 
- -============================================================================*/ - -#define __ATTR_SAL - -#ifndef _SAL_VERSION /*IFSTRIP=IGN*/ -#define _SAL_VERSION 20 -#endif - -#ifdef _PREFAST_ // [ - -// choose attribute or __declspec implementation -#ifndef _USE_DECLSPECS_FOR_SAL // [ -#define _USE_DECLSPECS_FOR_SAL 1 -#endif // ] - -#if _USE_DECLSPECS_FOR_SAL // [ -#undef _USE_ATTRIBUTES_FOR_SAL -#define _USE_ATTRIBUTES_FOR_SAL 0 -#elif !defined(_USE_ATTRIBUTES_FOR_SAL) // ][ -#if _MSC_VER >= 1400 /*IFSTRIP=IGN*/ // [ -#define _USE_ATTRIBUTES_FOR_SAL 1 -#else // ][ -#define _USE_ATTRIBUTES_FOR_SAL 0 -#endif // ] -#endif // ] - - -#if !_USE_DECLSPECS_FOR_SAL // [ -#if !_USE_ATTRIBUTES_FOR_SAL // [ -#if _MSC_VER >= 1400 /*IFSTRIP=IGN*/ // [ -#undef _USE_ATTRIBUTES_FOR_SAL -#define _USE_ATTRIBUTES_FOR_SAL 1 -#else // ][ -#undef _USE_DECLSPECS_FOR_SAL -#define _USE_DECLSPECS_FOR_SAL 1 -#endif // ] -#endif // ] -#endif // ] - -#else - -// Disable expansion of SAL macros in non-Prefast mode to -// improve compiler throughput. -#ifndef _USE_DECLSPECS_FOR_SAL // [ -#define _USE_DECLSPECS_FOR_SAL 0 -#endif // ] -#ifndef _USE_ATTRIBUTES_FOR_SAL // [ -#define _USE_ATTRIBUTES_FOR_SAL 0 -#endif // ] - -#endif // ] - -// safeguard for MIDL and RC builds -#if _USE_DECLSPECS_FOR_SAL && ( defined( MIDL_PASS ) || defined(__midl) || defined(RC_INVOKED) || !defined(_PREFAST_) ) /*IFSTRIP=IGN*/ // [ -#undef _USE_DECLSPECS_FOR_SAL -#define _USE_DECLSPECS_FOR_SAL 0 -#endif // ] -#if _USE_ATTRIBUTES_FOR_SAL && ( !defined(_MSC_EXTENSIONS) || defined( MIDL_PASS ) || defined(__midl) || defined(RC_INVOKED) ) /*IFSTRIP=IGN*/ // [ -#undef _USE_ATTRIBUTES_FOR_SAL -#define _USE_ATTRIBUTES_FOR_SAL 0 -#endif // ] - -#if _USE_DECLSPECS_FOR_SAL || _USE_ATTRIBUTES_FOR_SAL - -// Special enum type for Y/N/M -enum __SAL_YesNo {_SAL_notpresent, _SAL_no, _SAL_maybe, _SAL_yes, _SAL_default}; - -#endif - -#if defined(BUILD_WINDOWS) && !_USE_ATTRIBUTES_FOR_SAL /*IFSTRIP=IGN*/ -#define _SAL1_Source_(Name, args, annotes) _SA_annotes3(SAL_name, #Name, "", "1") _GrouP_(annotes _SAL_nop_impl_) -#define _SAL1_1_Source_(Name, args, annotes) _SA_annotes3(SAL_name, #Name, "", "1.1") _GrouP_(annotes _SAL_nop_impl_) -#define _SAL1_2_Source_(Name, args, annotes) _SA_annotes3(SAL_name, #Name, "", "1.2") _GrouP_(annotes _SAL_nop_impl_) -#define _SAL2_Source_(Name, args, annotes) _SA_annotes3(SAL_name, #Name, "", "2") _GrouP_(annotes _SAL_nop_impl_) -#else -#define _SAL1_Source_(Name, args, annotes) _SA_annotes3(SAL_name, #Name, "", "1") _Group_(annotes _SAL_nop_impl_) -#define _SAL1_1_Source_(Name, args, annotes) _SA_annotes3(SAL_name, #Name, "", "1.1") _Group_(annotes _SAL_nop_impl_) -#define _SAL1_2_Source_(Name, args, annotes) _SA_annotes3(SAL_name, #Name, "", "1.2") _Group_(annotes _SAL_nop_impl_) -#define _SAL2_Source_(Name, args, annotes) _SA_annotes3(SAL_name, #Name, "", "2") _Group_(annotes _SAL_nop_impl_) -#endif - -//============================================================================ -// Structural SAL: -// These annotations modify the use of other annotations. They may -// express the annotation target (i.e. what parameter/field the annotation -// applies to) or the condition under which the annotation is applicable. -//============================================================================ - -// _At_(target, annos) specifies that the annotations listed in 'annos' is to -// be applied to 'target' rather than to the identifier which is the current -// lexical target. 
-#define _At_(target, annos) _At_impl_(target, annos _SAL_nop_impl_) - -// _At_buffer_(target, iter, bound, annos) is similar to _At_, except that -// target names a buffer, and each annotation in annos is applied to each -// element of target up to bound, with the variable named in iter usable -// by the annotations to refer to relevant offsets within target. -#define _At_buffer_(target, iter, bound, annos) _At_buffer_impl_(target, iter, bound, annos _SAL_nop_impl_) - -// _When_(expr, annos) specifies that the annotations listed in 'annos' only -// apply when 'expr' evaluates to non-zero. -#define _When_(expr, annos) _When_impl_(expr, annos _SAL_nop_impl_) -#define _Group_(annos) _Group_impl_(annos _SAL_nop_impl_) -#define _GrouP_(annos) _GrouP_impl_(annos _SAL_nop_impl_) - -// indicates whether normal post conditions apply to a function -#define _Success_(expr) _SAL2_Source_(_Success_, (expr), _Success_impl_(expr)) - -// indicates whether post conditions apply to a function returning -// the type that this annotation is applied to -#define _Return_type_success_(expr) _SAL2_Source_(_Return_type_success_, (expr), _Success_impl_(expr)) - -// Establish postconditions that apply only if the function does not succeed -#define _On_failure_(annos) _On_failure_impl_(annos _SAL_nop_impl_) - -// Establish postconditions that apply in both success and failure cases. -// Only applicable with functions that have _Success_ or _Return_type_succss_. -#define _Always_(annos) _Always_impl_(annos _SAL_nop_impl_) - -// Usable on a function definition. Asserts that a function declaration is -// in scope, and its annotations are to be used. There are no other annotations -// allowed on the function definition. -#define _Use_decl_annotations_ _Use_decl_anno_impl_ - -// _Notref_ may precede a _Deref_ or "real" annotation, and removes one -// level of dereference if the parameter is a C++ reference (&). If the -// net deref on a "real" annotation is negative, it is simply discarded. -#define _Notref_ _Notref_impl_ - -// Annotations for defensive programming styles. -#define _Pre_defensive_ _SA_annotes0(SAL_pre_defensive) -#define _Post_defensive_ _SA_annotes0(SAL_post_defensive) - -#define _In_defensive_(annotes) _Pre_defensive_ _Group_(annotes) -#define _Out_defensive_(annotes) _Post_defensive_ _Group_(annotes) -#define _Inout_defensive_(annotes) _Pre_defensive_ _Post_defensive_ _Group_(annotes) - -//============================================================================ -// _In_\_Out_ Layer: -//============================================================================ - -// Reserved pointer parameters, must always be NULL. -#define _Reserved_ _SAL2_Source_(_Reserved_, (), _Pre1_impl_(__null_impl)) - -// _Const_ allows specification that any namable memory location is considered -// readonly for a given call. -#define _Const_ _SAL2_Source_(_Const_, (), _Pre1_impl_(__readaccess_impl_notref)) - - -// Input parameters -------------------------- - -// _In_ - Annotations for parameters where data is passed into the function, but not modified. -// _In_ by itself can be used with non-pointer types (although it is redundant). - -// e.g. void SetPoint( _In_ const POINT* pPT ); -#define _In_ _SAL2_Source_(_In_, (), _Pre1_impl_(__notnull_impl_notref) _Pre_valid_impl_ _Deref_pre1_impl_(__readaccess_impl_notref)) -#define _In_opt_ _SAL2_Source_(_In_opt_, (), _Pre1_impl_(__maybenull_impl_notref) _Pre_valid_impl_ _Deref_pre_readonly_) - -// nullterminated 'in' parameters. -// e.g. 
void CopyStr( _In_z_ const char* szFrom, _Out_z_cap_(cchTo) char* szTo, size_t cchTo ); -#define _In_z_ _SAL2_Source_(_In_z_, (), _In_ _Pre1_impl_(__zterm_impl)) -#define _In_opt_z_ _SAL2_Source_(_In_opt_z_, (), _In_opt_ _Pre1_impl_(__zterm_impl)) - - -// 'input' buffers with given size - -#define _In_reads_(size) _SAL2_Source_(_In_reads_, (size), _Pre_count_(size) _Deref_pre_readonly_) -#define _In_reads_opt_(size) _SAL2_Source_(_In_reads_opt_, (size), _Pre_opt_count_(size) _Deref_pre_readonly_) -#define _In_reads_bytes_(size) _SAL2_Source_(_In_reads_bytes_, (size), _Pre_bytecount_(size) _Deref_pre_readonly_) -#define _In_reads_bytes_opt_(size) _SAL2_Source_(_In_reads_bytes_opt_, (size), _Pre_opt_bytecount_(size) _Deref_pre_readonly_) -#define _In_reads_z_(size) _SAL2_Source_(_In_reads_z_, (size), _In_reads_(size) _Pre_z_) -#define _In_reads_opt_z_(size) _SAL2_Source_(_In_reads_opt_z_, (size), _Pre_opt_count_(size) _Deref_pre_readonly_ _Pre_opt_z_) -#define _In_reads_or_z_(size) _SAL2_Source_(_In_reads_or_z_, (size), _In_ _When_(_String_length_(_Curr_) < (size), _Pre_z_) _When_(_String_length_(_Curr_) >= (size), _Pre1_impl_(__count_impl(size)))) -#define _In_reads_or_z_opt_(size) _SAL2_Source_(_In_reads_or_z_opt_, (size), _In_opt_ _When_(_String_length_(_Curr_) < (size), _Pre_z_) _When_(_String_length_(_Curr_) >= (size), _Pre1_impl_(__count_impl(size)))) - - -// 'input' buffers valid to the given end pointer - -#define _In_reads_to_ptr_(ptr) _SAL2_Source_(_In_reads_to_ptr_, (ptr), _Pre_ptrdiff_count_(ptr) _Deref_pre_readonly_) -#define _In_reads_to_ptr_opt_(ptr) _SAL2_Source_(_In_reads_to_ptr_opt_, (ptr), _Pre_opt_ptrdiff_count_(ptr) _Deref_pre_readonly_) -#define _In_reads_to_ptr_z_(ptr) _SAL2_Source_(_In_reads_to_ptr_z_, (ptr), _In_reads_to_ptr_(ptr) _Pre_z_) -#define _In_reads_to_ptr_opt_z_(ptr) _SAL2_Source_(_In_reads_to_ptr_opt_z_, (ptr), _Pre_opt_ptrdiff_count_(ptr) _Deref_pre_readonly_ _Pre_opt_z_) - - - -// Output parameters -------------------------- - -// _Out_ - Annotations for pointer or reference parameters where data passed back to the caller. -// These are mostly used where the pointer/reference is to a non-pointer type. -// _Outptr_/_Outref) (see below) are typically used to return pointers via parameters. - -// e.g. 
void GetPoint( _Out_ POINT* pPT ); -#define _Out_ _SAL2_Source_(_Out_, (), _Out_impl_) -#define _Out_opt_ _SAL2_Source_(_Out_opt_, (), _Out_opt_impl_) - -#define _Out_writes_(size) _SAL2_Source_(_Out_writes_, (size), _Pre_cap_(size) _Post_valid_impl_) -#define _Out_writes_opt_(size) _SAL2_Source_(_Out_writes_opt_, (size), _Pre_opt_cap_(size) _Post_valid_impl_) -#define _Out_writes_bytes_(size) _SAL2_Source_(_Out_writes_bytes_, (size), _Pre_bytecap_(size) _Post_valid_impl_) -#define _Out_writes_bytes_opt_(size) _SAL2_Source_(_Out_writes_bytes_opt_, (size), _Pre_opt_bytecap_(size) _Post_valid_impl_) -#define _Out_writes_z_(size) _SAL2_Source_(_Out_writes_z_, (size), _Pre_cap_(size) _Post_valid_impl_ _Post_z_) -#define _Out_writes_opt_z_(size) _SAL2_Source_(_Out_writes_opt_z_, (size), _Pre_opt_cap_(size) _Post_valid_impl_ _Post_z_) - -#define _Out_writes_to_(size,count) _SAL2_Source_(_Out_writes_to_, (size,count), _Pre_cap_(size) _Post_valid_impl_ _Post_count_(count)) -#define _Out_writes_to_opt_(size,count) _SAL2_Source_(_Out_writes_to_opt_, (size,count), _Pre_opt_cap_(size) _Post_valid_impl_ _Post_count_(count)) -#define _Out_writes_all_(size) _SAL2_Source_(_Out_writes_all_, (size), _Out_writes_to_(_Old_(size), _Old_(size))) -#define _Out_writes_all_opt_(size) _SAL2_Source_(_Out_writes_all_opt_, (size), _Out_writes_to_opt_(_Old_(size), _Old_(size))) - -#define _Out_writes_bytes_to_(size,count) _SAL2_Source_(_Out_writes_bytes_to_, (size,count), _Pre_bytecap_(size) _Post_valid_impl_ _Post_bytecount_(count)) -#define _Out_writes_bytes_to_opt_(size,count) _SAL2_Source_(_Out_writes_bytes_to_opt_, (size,count), _Pre_opt_bytecap_(size) _Post_valid_impl_ _Post_bytecount_(count)) -#define _Out_writes_bytes_all_(size) _SAL2_Source_(_Out_writes_bytes_all_, (size), _Out_writes_bytes_to_(_Old_(size), _Old_(size))) -#define _Out_writes_bytes_all_opt_(size) _SAL2_Source_(_Out_writes_bytes_all_opt_, (size), _Out_writes_bytes_to_opt_(_Old_(size), _Old_(size))) - -#define _Out_writes_to_ptr_(ptr) _SAL2_Source_(_Out_writes_to_ptr_, (ptr), _Pre_ptrdiff_cap_(ptr) _Post_valid_impl_) -#define _Out_writes_to_ptr_opt_(ptr) _SAL2_Source_(_Out_writes_to_ptr_opt_, (ptr), _Pre_opt_ptrdiff_cap_(ptr) _Post_valid_impl_) -#define _Out_writes_to_ptr_z_(ptr) _SAL2_Source_(_Out_writes_to_ptr_z_, (ptr), _Pre_ptrdiff_cap_(ptr) _Post_valid_impl_ Post_z_) -#define _Out_writes_to_ptr_opt_z_(ptr) _SAL2_Source_(_Out_writes_to_ptr_opt_z_, (ptr), _Pre_opt_ptrdiff_cap_(ptr) _Post_valid_impl_ Post_z_) - - -// Inout parameters ---------------------------- - -// _Inout_ - Annotations for pointer or reference parameters where data is passed in and -// potentially modified. 
-// void ModifyPoint( _Inout_ POINT* pPT ); -// void ModifyPointByRef( _Inout_ POINT& pPT ); - -#define _Inout_ _SAL2_Source_(_Inout_, (), _Prepost_valid_) -#define _Inout_opt_ _SAL2_Source_(_Inout_opt_, (), _Prepost_opt_valid_) - -// For modifying string buffers -// void toupper( _Inout_z_ char* sz ); -#define _Inout_z_ _SAL2_Source_(_Inout_z_, (), _Prepost_z_) -#define _Inout_opt_z_ _SAL2_Source_(_Inout_opt_z_, (), _Prepost_opt_z_) - -// For modifying buffers with explicit element size -#define _Inout_updates_(size) _SAL2_Source_(_Inout_updates_, (size), _Pre_cap_(size) _Pre_valid_impl_ _Post_valid_impl_) -#define _Inout_updates_opt_(size) _SAL2_Source_(_Inout_updates_opt_, (size), _Pre_opt_cap_(size) _Pre_valid_impl_ _Post_valid_impl_) -#define _Inout_updates_z_(size) _SAL2_Source_(_Inout_updates_z_, (size), _Pre_cap_(size) _Pre_valid_impl_ _Post_valid_impl_ _Pre1_impl_(__zterm_impl) _Post1_impl_(__zterm_impl)) -#define _Inout_updates_opt_z_(size) _SAL2_Source_(_Inout_updates_opt_z_, (size), _Pre_opt_cap_(size) _Pre_valid_impl_ _Post_valid_impl_ _Pre1_impl_(__zterm_impl) _Post1_impl_(__zterm_impl)) - -#define _Inout_updates_to_(size,count) _SAL2_Source_(_Inout_updates_to_, (size,count), _Out_writes_to_(size,count) _Pre_valid_impl_ _Pre1_impl_(__count_impl(count))) -#define _Inout_updates_to_opt_(size,count) _SAL2_Source_(_Inout_updates_to_opt_, (size,count), _Out_writes_to_opt_(size,count) _Pre_valid_impl_ _Pre1_impl_(__count_impl(count))) - -#define _Inout_updates_all_(size) _SAL2_Source_(_Inout_updates_all_, (size), _Inout_updates_to_(_Old_(size), _Old_(size))) -#define _Inout_updates_all_opt_(size) _SAL2_Source_(_Inout_updates_all_opt_, (size), _Inout_updates_to_opt_(_Old_(size), _Old_(size))) - -// For modifying buffers with explicit byte size -#define _Inout_updates_bytes_(size) _SAL2_Source_(_Inout_updates_bytes_, (size), _Pre_bytecap_(size) _Pre_valid_impl_ _Post_valid_impl_) -#define _Inout_updates_bytes_opt_(size) _SAL2_Source_(_Inout_updates_bytes_opt_, (size), _Pre_opt_bytecap_(size) _Pre_valid_impl_ _Post_valid_impl_) - -#define _Inout_updates_bytes_to_(size,count) _SAL2_Source_(_Inout_updates_bytes_to_, (size,count), _Out_writes_bytes_to_(size,count) _Pre_valid_impl_ _Pre1_impl_(__bytecount_impl(count))) -#define _Inout_updates_bytes_to_opt_(size,count) _SAL2_Source_(_Inout_updates_bytes_to_opt_, (size,count), _Out_writes_bytes_to_opt_(size,count) _Pre_valid_impl_ _Pre1_impl_(__bytecount_impl(count))) - -#define _Inout_updates_bytes_all_(size) _SAL2_Source_(_Inout_updates_bytes_all_, (size), _Inout_updates_bytes_to_(_Old_(size), _Old_(size))) -#define _Inout_updates_bytes_all_opt_(size) _SAL2_Source_(_Inout_updates_bytes_all_opt_, (size), _Inout_updates_bytes_to_opt_(_Old_(size), _Old_(size))) - - -// Pointer to pointer parameters ------------------------- - -// _Outptr_ - Annotations for output params returning pointers -// These describe parameters where the called function provides the buffer: -// HRESULT SHStrDupW(_In_ LPCWSTR psz, _Outptr_ LPWSTR *ppwsz); -// The caller passes the address of an LPWSTR variable as ppwsz, and SHStrDupW allocates -// and initializes memory and returns the pointer to the new LPWSTR in *ppwsz. -// -// _Outptr_opt_ - describes parameters that are allowed to be NULL. -// _Outptr_*_result_maybenull_ - describes parameters where the called function might return NULL to the caller. 
-// -// Example: -// void MyFunc(_Outptr_opt_ int **ppData1, _Outptr_result_maybenull_ int **ppData2); -// Callers: -// MyFunc(NULL, NULL); // error: parameter 2, ppData2, should not be NULL -// MyFunc(&pData1, &pData2); // ok: both non-NULL -// if (*pData1 == *pData2) ... // error: pData2 might be NULL after call - -#define _Outptr_ _SAL2_Source_(_Outptr_, (), _Out_impl_ _Deref_post2_impl_(__notnull_impl_notref, __count_impl(1))) -#define _Outptr_result_maybenull_ _SAL2_Source_(_Outptr_result_maybenull_, (), _Out_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __count_impl(1))) -#define _Outptr_opt_ _SAL2_Source_(_Outptr_opt_, (), _Out_opt_impl_ _Deref_post2_impl_(__notnull_impl_notref, __count_impl(1))) -#define _Outptr_opt_result_maybenull_ _SAL2_Source_(_Outptr_opt_result_maybenull_, (), _Out_opt_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __count_impl(1))) - -// Annotations for _Outptr_ parameters returning pointers to null terminated strings. - -#define _Outptr_result_z_ _SAL2_Source_(_Outptr_result_z_, (), _Out_impl_ _Deref_post_z_) -#define _Outptr_opt_result_z_ _SAL2_Source_(_Outptr_opt_result_z_, (), _Out_opt_impl_ _Deref_post_z_) -#define _Outptr_result_maybenull_z_ _SAL2_Source_(_Outptr_result_maybenull_z_, (), _Out_impl_ _Deref_post_opt_z_) -#define _Outptr_opt_result_maybenull_z_ _SAL2_Source_(_Outptr_opt_result_maybenull_z_, (), _Out_opt_impl_ _Deref_post_opt_z_) - -// Annotations for _Outptr_ parameters where the output pointer is set to NULL if the function fails. - -#define _Outptr_result_nullonfailure_ _SAL2_Source_(_Outptr_result_nullonfailure_, (), _Outptr_ _On_failure_(_Deref_post_null_)) -#define _Outptr_opt_result_nullonfailure_ _SAL2_Source_(_Outptr_opt_result_nullonfailure_, (), _Outptr_opt_ _On_failure_(_Deref_post_null_)) - -// Annotations for _Outptr_ parameters which return a pointer to a ref-counted COM object, -// following the COM convention of setting the output to NULL on failure. -// The current implementation is identical to _Outptr_result_nullonfailure_. -// For pointers to types that are not COM objects, _Outptr_result_nullonfailure_ is preferred. 
- -#define _COM_Outptr_ _SAL2_Source_(_COM_Outptr_, (), _Outptr_ _On_failure_(_Deref_post_null_)) -#define _COM_Outptr_result_maybenull_ _SAL2_Source_(_COM_Outptr_result_maybenull_, (), _Outptr_result_maybenull_ _On_failure_(_Deref_post_null_)) -#define _COM_Outptr_opt_ _SAL2_Source_(_COM_Outptr_opt_, (), _Outptr_opt_ _On_failure_(_Deref_post_null_)) -#define _COM_Outptr_opt_result_maybenull_ _SAL2_Source_(_COM_Outptr_opt_result_maybenull_, (), _Outptr_opt_result_maybenull_ _On_failure_(_Deref_post_null_)) - -// Annotations for _Outptr_ parameters returning a pointer to buffer with a specified number of elements/bytes - -#define _Outptr_result_buffer_(size) _SAL2_Source_(_Outptr_result_buffer_, (size), _Out_impl_ _Deref_post2_impl_(__notnull_impl_notref, __cap_impl(size))) -#define _Outptr_opt_result_buffer_(size) _SAL2_Source_(_Outptr_opt_result_buffer_, (size), _Out_opt_impl_ _Deref_post2_impl_(__notnull_impl_notref, __cap_impl(size))) -#define _Outptr_result_buffer_to_(size, count) _SAL2_Source_(_Outptr_result_buffer_to_, (size, count), _Out_impl_ _Deref_post3_impl_(__notnull_impl_notref, __cap_impl(size), __count_impl(count))) -#define _Outptr_opt_result_buffer_to_(size, count) _SAL2_Source_(_Outptr_opt_result_buffer_to_, (size, count), _Out_opt_impl_ _Deref_post3_impl_(__notnull_impl_notref, __cap_impl(size), __count_impl(count))) - -#define _Outptr_result_buffer_all_(size) _SAL2_Source_(_Outptr_result_buffer_all_, (size), _Out_impl_ _Deref_post2_impl_(__notnull_impl_notref, __count_impl(size))) -#define _Outptr_opt_result_buffer_all_(size) _SAL2_Source_(_Outptr_opt_result_buffer_all_, (size), _Out_opt_impl_ _Deref_post2_impl_(__notnull_impl_notref, __count_impl(size))) - -#define _Outptr_result_buffer_maybenull_(size) _SAL2_Source_(_Outptr_result_buffer_maybenull_, (size), _Out_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __cap_impl(size))) -#define _Outptr_opt_result_buffer_maybenull_(size) _SAL2_Source_(_Outptr_opt_result_buffer_maybenull_, (size), _Out_opt_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __cap_impl(size))) -#define _Outptr_result_buffer_to_maybenull_(size, count) _SAL2_Source_(_Outptr_result_buffer_to_maybenull_, (size, count), _Out_impl_ _Deref_post3_impl_(__maybenull_impl_notref, __cap_impl(size), __count_impl(count))) -#define _Outptr_opt_result_buffer_to_maybenull_(size, count) _SAL2_Source_(_Outptr_opt_result_buffer_to_maybenull_, (size, count), _Out_opt_impl_ _Deref_post3_impl_(__maybenull_impl_notref, __cap_impl(size), __count_impl(count))) - -#define _Outptr_result_buffer_all_maybenull_(size) _SAL2_Source_(_Outptr_result_buffer_all_maybenull_, (size), _Out_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __count_impl(size))) -#define _Outptr_opt_result_buffer_all_maybenull_(size) _SAL2_Source_(_Outptr_opt_result_buffer_all_maybenull_, (size), _Out_opt_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __count_impl(size))) - -#define _Outptr_result_bytebuffer_(size) _SAL2_Source_(_Outptr_result_bytebuffer_, (size), _Out_impl_ _Deref_post2_impl_(__notnull_impl_notref, __bytecap_impl(size))) -#define _Outptr_opt_result_bytebuffer_(size) _SAL2_Source_(_Outptr_opt_result_bytebuffer_, (size), _Out_opt_impl_ _Deref_post2_impl_(__notnull_impl_notref, __bytecap_impl(size))) -#define _Outptr_result_bytebuffer_to_(size, count) _SAL2_Source_(_Outptr_result_bytebuffer_to_, (size, count), _Out_impl_ _Deref_post3_impl_(__notnull_impl_notref, __bytecap_impl(size), __bytecount_impl(count))) -#define _Outptr_opt_result_bytebuffer_to_(size, count) 
_SAL2_Source_(_Outptr_opt_result_bytebuffer_to_, (size, count), _Out_opt_impl_ _Deref_post3_impl_(__notnull_impl_notref, __bytecap_impl(size), __bytecount_impl(count))) - -#define _Outptr_result_bytebuffer_all_(size) _SAL2_Source_(_Outptr_result_bytebuffer_all_, (size), _Out_impl_ _Deref_post2_impl_(__notnull_impl_notref, __bytecount_impl(size))) -#define _Outptr_opt_result_bytebuffer_all_(size) _SAL2_Source_(_Outptr_opt_result_bytebuffer_all_, (size), _Out_opt_impl_ _Deref_post2_impl_(__notnull_impl_notref, __bytecount_impl(size))) - -#define _Outptr_result_bytebuffer_maybenull_(size) _SAL2_Source_(_Outptr_result_bytebuffer_maybenull_, (size), _Out_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __bytecap_impl(size))) -#define _Outptr_opt_result_bytebuffer_maybenull_(size) _SAL2_Source_(_Outptr_opt_result_bytebuffer_maybenull_, (size), _Out_opt_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __bytecap_impl(size))) -#define _Outptr_result_bytebuffer_to_maybenull_(size, count) _SAL2_Source_(_Outptr_result_bytebuffer_to_maybenull_, (size, count), _Out_impl_ _Deref_post3_impl_(__maybenull_impl_notref, __bytecap_impl(size), __bytecount_impl(count))) -#define _Outptr_opt_result_bytebuffer_to_maybenull_(size, count) _SAL2_Source_(_Outptr_opt_result_bytebuffer_to_maybenull_, (size, count), _Out_opt_impl_ _Deref_post3_impl_(__maybenull_impl_notref, __bytecap_impl(size), __bytecount_impl(count))) - -#define _Outptr_result_bytebuffer_all_maybenull_(size) _SAL2_Source_(_Outptr_result_bytebuffer_all_maybenull_, (size), _Out_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __bytecount_impl(size))) -#define _Outptr_opt_result_bytebuffer_all_maybenull_(size) _SAL2_Source_(_Outptr_opt_result_bytebuffer_all_maybenull_, (size), _Out_opt_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __bytecount_impl(size))) - -// Annotations for output reference to pointer parameters. 
- -#define _Outref_ _SAL2_Source_(_Outref_, (), _Out_impl_ _Post_notnull_) -#define _Outref_result_maybenull_ _SAL2_Source_(_Outref_result_maybenull_, (), _Pre2_impl_(__notnull_impl_notref, __cap_c_one_notref_impl) _Post_maybenull_ _Post_valid_impl_) - -#define _Outref_result_buffer_(size) _SAL2_Source_(_Outref_result_buffer_, (size), _Outref_ _Post1_impl_(__cap_impl(size))) -#define _Outref_result_bytebuffer_(size) _SAL2_Source_(_Outref_result_bytebuffer_, (size), _Outref_ _Post1_impl_(__bytecap_impl(size))) -#define _Outref_result_buffer_to_(size, count) _SAL2_Source_(_Outref_result_buffer_to_, (size, count), _Outref_result_buffer_(size) _Post1_impl_(__count_impl(count))) -#define _Outref_result_bytebuffer_to_(size, count) _SAL2_Source_(_Outref_result_bytebuffer_to_, (size, count), _Outref_result_bytebuffer_(size) _Post1_impl_(__bytecount_impl(count))) -#define _Outref_result_buffer_all_(size) _SAL2_Source_(_Outref_result_buffer_all_, (size), _Outref_result_buffer_to_(size, _Old_(size))) -#define _Outref_result_bytebuffer_all_(size) _SAL2_Source_(_Outref_result_bytebuffer_all_, (size), _Outref_result_bytebuffer_to_(size, _Old_(size))) - -#define _Outref_result_buffer_maybenull_(size) _SAL2_Source_(_Outref_result_buffer_maybenull_, (size), _Outref_result_maybenull_ _Post1_impl_(__cap_impl(size))) -#define _Outref_result_bytebuffer_maybenull_(size) _SAL2_Source_(_Outref_result_bytebuffer_maybenull_, (size), _Outref_result_maybenull_ _Post1_impl_(__bytecap_impl(size))) -#define _Outref_result_buffer_to_maybenull_(size, count) _SAL2_Source_(_Outref_result_buffer_to_maybenull_, (size, count), _Outref_result_buffer_maybenull_(size) _Post1_impl_(__count_impl(count))) -#define _Outref_result_bytebuffer_to_maybenull_(size, count) _SAL2_Source_(_Outref_result_bytebuffer_to_maybenull_, (size, count), _Outref_result_bytebuffer_maybenull_(size) _Post1_impl_(__bytecount_impl(count))) -#define _Outref_result_buffer_all_maybenull_(size) _SAL2_Source_(_Outref_result_buffer_all_maybenull_, (size), _Outref_result_buffer_to_maybenull_(size, _Old_(size))) -#define _Outref_result_bytebuffer_all_maybenull_(size) _SAL2_Source_(_Outref_result_bytebuffer_all_maybenull_, (size), _Outref_result_bytebuffer_to_maybenull_(size, _Old_(size))) - -// Annotations for output reference to pointer parameters that guarantee -// that the pointer is set to NULL on failure. -#define _Outref_result_nullonfailure_ _SAL2_Source_(_Outref_result_nullonfailure_, (), _Outref_ _On_failure_(_Post_null_)) - -// Generic annotations to set output value of a by-pointer or by-reference parameter to null/zero on failure. -#define _Result_nullonfailure_ _SAL2_Source_(_Result_nullonfailure_, (), _On_failure_(_Notref_impl_ _Deref_impl_ _Post_null_)) -#define _Result_zeroonfailure_ _SAL2_Source_(_Result_zeroonfailure_, (), _On_failure_(_Notref_impl_ _Deref_impl_ _Out_range_(==, 0))) - - -// return values ------------------------------- - -// -// _Ret_ annotations -// -// describing conditions that hold for return values after the call - -// e.g. 
_Ret_z_ CString::operator const WCHAR*() const throw(); -#define _Ret_z_ _SAL2_Source_(_Ret_z_, (), _Ret2_impl_(__notnull_impl, __zterm_impl) _Ret_valid_impl_) -#define _Ret_maybenull_z_ _SAL2_Source_(_Ret_maybenull_z_, (), _Ret2_impl_(__maybenull_impl,__zterm_impl) _Ret_valid_impl_) - -// used with allocated but not yet initialized objects -#define _Ret_notnull_ _SAL2_Source_(_Ret_notnull_, (), _Ret1_impl_(__notnull_impl)) -#define _Ret_maybenull_ _SAL2_Source_(_Ret_maybenull_, (), _Ret1_impl_(__maybenull_impl)) -#define _Ret_null_ _SAL2_Source_(_Ret_null_, (), _Ret1_impl_(__null_impl)) - -// used with allocated and initialized objects -// returns single valid object -#define _Ret_valid_ _SAL2_Source_(_Ret_valid_, (), _Ret1_impl_(__notnull_impl_notref) _Ret_valid_impl_) - -// returns pointer to initialized buffer of specified size -#define _Ret_writes_(size) _SAL2_Source_(_Ret_writes_, (size), _Ret2_impl_(__notnull_impl, __count_impl(size)) _Ret_valid_impl_) -#define _Ret_writes_z_(size) _SAL2_Source_(_Ret_writes_z_, (size), _Ret3_impl_(__notnull_impl, __count_impl(size), __zterm_impl) _Ret_valid_impl_) -#define _Ret_writes_bytes_(size) _SAL2_Source_(_Ret_writes_bytes_, (size), _Ret2_impl_(__notnull_impl, __bytecount_impl(size)) _Ret_valid_impl_) -#define _Ret_writes_maybenull_(size) _SAL2_Source_(_Ret_writes_maybenull_, (size), _Ret2_impl_(__maybenull_impl,__count_impl(size)) _Ret_valid_impl_) -#define _Ret_writes_maybenull_z_(size) _SAL2_Source_(_Ret_writes_maybenull_z_, (size), _Ret3_impl_(__maybenull_impl,__count_impl(size),__zterm_impl) _Ret_valid_impl_) -#define _Ret_writes_bytes_maybenull_(size) _SAL2_Source_(_Ret_writes_bytes_maybenull_, (size), _Ret2_impl_(__maybenull_impl,__bytecount_impl(size)) _Ret_valid_impl_) - -// returns pointer to partially initialized buffer, with total size 'size' and initialized size 'count' -#define _Ret_writes_to_(size,count) _SAL2_Source_(_Ret_writes_to_, (size,count), _Ret3_impl_(__notnull_impl, __cap_impl(size), __count_impl(count)) _Ret_valid_impl_) -#define _Ret_writes_bytes_to_(size,count) _SAL2_Source_(_Ret_writes_bytes_to_, (size,count), _Ret3_impl_(__notnull_impl, __bytecap_impl(size), __bytecount_impl(count)) _Ret_valid_impl_) -#define _Ret_writes_to_maybenull_(size,count) _SAL2_Source_(_Ret_writes_to_maybenull_, (size,count), _Ret3_impl_(__maybenull_impl, __cap_impl(size), __count_impl(count)) _Ret_valid_impl_) -#define _Ret_writes_bytes_to_maybenull_(size,count) _SAL2_Source_(_Ret_writes_bytes_to_maybenull_, (size,count), _Ret3_impl_(__maybenull_impl, __bytecap_impl(size), __bytecount_impl(count)) _Ret_valid_impl_) - - -// Annotations for strict type checking -#define _Points_to_data_ _SAL2_Source_(_Points_to_data_, (), _Pre_ _Points_to_data_impl_) -#define _Literal_ _SAL2_Source_(_Literal_, (), _Pre_ _Literal_impl_) -#define _Notliteral_ _SAL2_Source_(_Notliteral_, (), _Pre_ _Notliteral_impl_) - -// Check the return value of a function e.g. _Check_return_ ErrorCode Foo(); -#define _Check_return_ _SAL2_Source_(_Check_return_, (), _Check_return_impl_) -#define _Must_inspect_result_ _SAL2_Source_(_Must_inspect_result_, (), _Must_inspect_impl_ _Check_return_impl_) - -// e.g. MyPrintF( _Printf_format_string_ const WCHAR* wzFormat, ... 
); -#define _Printf_format_string_ _SAL2_Source_(_Printf_format_string_, (), _Printf_format_string_impl_) -#define _Scanf_format_string_ _SAL2_Source_(_Scanf_format_string_, (), _Scanf_format_string_impl_) -#define _Scanf_s_format_string_ _SAL2_Source_(_Scanf_s_format_string_, (), _Scanf_s_format_string_impl_) - -#define _Format_string_impl_(kind,where) _SA_annotes2(SAL_IsFormatString2, kind, where) -#define _Printf_format_string_params_(x) _SAL2_Source_(_Printf_format_string_params_, (x), _Format_string_impl_("printf", x)) -#define _Scanf_format_string_params_(x) _SAL2_Source_(_Scanf_format_string_params_, (x), _Format_string_impl_("scanf", x)) -#define _Scanf_s_format_string_params_(x) _SAL2_Source_(_Scanf_s_format_string_params_, (x), _Format_string_impl_("scanf_s", x)) - -// annotations to express value of integral or pointer parameter -#define _In_range_(lb,ub) _SAL2_Source_(_In_range_, (lb,ub), _In_range_impl_(lb,ub)) -#define _Out_range_(lb,ub) _SAL2_Source_(_Out_range_, (lb,ub), _Out_range_impl_(lb,ub)) -#define _Ret_range_(lb,ub) _SAL2_Source_(_Ret_range_, (lb,ub), _Ret_range_impl_(lb,ub)) -#define _Deref_in_range_(lb,ub) _SAL2_Source_(_Deref_in_range_, (lb,ub), _Deref_in_range_impl_(lb,ub)) -#define _Deref_out_range_(lb,ub) _SAL2_Source_(_Deref_out_range_, (lb,ub), _Deref_out_range_impl_(lb,ub)) -#define _Deref_ret_range_(lb,ub) _SAL2_Source_(_Deref_ret_range_, (lb,ub), _Deref_ret_range_impl_(lb,ub)) -#define _Pre_equal_to_(expr) _SAL2_Source_(_Pre_equal_to_, (expr), _In_range_(==, expr)) -#define _Post_equal_to_(expr) _SAL2_Source_(_Post_equal_to_, (expr), _Out_range_(==, expr)) - -// annotation to express that a value (usually a field of a mutable class) -// is not changed by a function call -#define _Unchanged_(e) _SAL2_Source_(_Unchanged_, (e), _At_(e, _Post_equal_to_(_Old_(e)) _Const_)) - -// Annotations to allow expressing generalized pre and post conditions. -// 'cond' may be any valid SAL expression that is considered to be true as a precondition -// or postcondition (respsectively). 
-#define _Pre_satisfies_(cond) _SAL2_Source_(_Pre_satisfies_, (cond), _Pre_satisfies_impl_(cond)) -#define _Post_satisfies_(cond) _SAL2_Source_(_Post_satisfies_, (cond), _Post_satisfies_impl_(cond)) - -// Annotations to express struct, class and field invariants -#define _Struct_size_bytes_(size) _SAL2_Source_(_Struct_size_bytes_, (size), _Writable_bytes_(size)) - -#define _Field_size_(size) _SAL2_Source_(_Field_size_, (size), _Notnull_ _Writable_elements_(size)) -#define _Field_size_opt_(size) _SAL2_Source_(_Field_size_opt_, (size), _Maybenull_ _Writable_elements_(size)) -#define _Field_size_part_(size, count) _SAL2_Source_(_Field_size_part_, (size, count), _Notnull_ _Writable_elements_(size) _Readable_elements_(count)) -#define _Field_size_part_opt_(size, count) _SAL2_Source_(_Field_size_part_opt_, (size, count), _Maybenull_ _Writable_elements_(size) _Readable_elements_(count)) -#define _Field_size_full_(size) _SAL2_Source_(_Field_size_full_, (size), _Field_size_part_(size, size)) -#define _Field_size_full_opt_(size) _SAL2_Source_(_Field_size_full_opt_, (size), _Field_size_part_opt_(size, size)) - -#define _Field_size_bytes_(size) _SAL2_Source_(_Field_size_bytes_, (size), _Notnull_ _Writable_bytes_(size)) -#define _Field_size_bytes_opt_(size) _SAL2_Source_(_Field_size_bytes_opt_, (size), _Maybenull_ _Writable_bytes_(size)) -#define _Field_size_bytes_part_(size, count) _SAL2_Source_(_Field_size_bytes_part_, (size, count), _Notnull_ _Writable_bytes_(size) _Readable_bytes_(count)) -#define _Field_size_bytes_part_opt_(size, count) _SAL2_Source_(_Field_size_bytes_part_opt_, (size, count), _Maybenull_ _Writable_bytes_(size) _Readable_bytes_(count)) -#define _Field_size_bytes_full_(size) _SAL2_Source_(_Field_size_bytes_full_, (size), _Field_size_bytes_part_(size, size)) -#define _Field_size_bytes_full_opt_(size) _SAL2_Source_(_Field_size_bytes_full_opt_, (size), _Field_size_bytes_part_opt_(size, size)) - -#define _Field_z_ _SAL2_Source_(_Field_z_, (), _Null_terminated_) - -#define _Field_range_(min,max) _SAL2_Source_(_Field_range_, (min,max), _Field_range_impl_(min,max)) - -//============================================================================ -// _Pre_\_Post_ Layer: -//============================================================================ - -// -// Raw Pre/Post for declaring custom pre/post conditions -// - -#define _Pre_ _Pre_impl_ -#define _Post_ _Post_impl_ - -// -// Validity property -// - -#define _Valid_ _Valid_impl_ -#define _Notvalid_ _Notvalid_impl_ -#define _Maybevalid_ _Maybevalid_impl_ - -// -// Buffer size properties -// - -// Expressing buffer sizes without specifying pre or post condition -#define _Readable_bytes_(size) _SAL2_Source_(_Readable_bytes_, (size), _Readable_bytes_impl_(size)) -#define _Readable_elements_(size) _SAL2_Source_(_Readable_elements_, (size), _Readable_elements_impl_(size)) -#define _Writable_bytes_(size) _SAL2_Source_(_Writable_bytes_, (size), _Writable_bytes_impl_(size)) -#define _Writable_elements_(size) _SAL2_Source_(_Writable_elements_, (size), _Writable_elements_impl_(size)) - -#define _Null_terminated_ _SAL2_Source_(_Null_terminated_, (), _Null_terminated_impl_) -#define _NullNull_terminated_ _SAL2_Source_(_NullNull_terminated_, (), _NullNull_terminated_impl_) - -// Expressing buffer size as pre or post condition -#define _Pre_readable_size_(size) _SAL2_Source_(_Pre_readable_size_, (size), _Pre1_impl_(__count_impl(size)) _Pre_valid_impl_) -#define _Pre_writable_size_(size) _SAL2_Source_(_Pre_writable_size_, (size), 
_Pre1_impl_(__cap_impl(size))) -#define _Pre_readable_byte_size_(size) _SAL2_Source_(_Pre_readable_byte_size_, (size), _Pre1_impl_(__bytecount_impl(size)) _Pre_valid_impl_) -#define _Pre_writable_byte_size_(size) _SAL2_Source_(_Pre_writable_byte_size_, (size), _Pre1_impl_(__bytecap_impl(size))) - -#define _Post_readable_size_(size) _SAL2_Source_(_Post_readable_size_, (size), _Post1_impl_(__count_impl(size)) _Post_valid_impl_) -#define _Post_writable_size_(size) _SAL2_Source_(_Post_writable_size_, (size), _Post1_impl_(__cap_impl(size))) -#define _Post_readable_byte_size_(size) _SAL2_Source_(_Post_readable_byte_size_, (size), _Post1_impl_(__bytecount_impl(size)) _Post_valid_impl_) -#define _Post_writable_byte_size_(size) _SAL2_Source_(_Post_writable_byte_size_, (size), _Post1_impl_(__bytecap_impl(size))) - -// -// Pointer null-ness properties -// -#define _Null_ _Null_impl_ -#define _Notnull_ _Notnull_impl_ -#define _Maybenull_ _Maybenull_impl_ - -// -// _Pre_ annotations --- -// -// describing conditions that must be met before the call of the function - -// e.g. int strlen( _Pre_z_ const char* sz ); -// buffer is a zero terminated string -#define _Pre_z_ _SAL2_Source_(_Pre_z_, (), _Pre1_impl_(__zterm_impl) _Pre_valid_impl_) - -// valid size unknown or indicated by type (e.g.:LPSTR) -#define _Pre_valid_ _SAL2_Source_(_Pre_valid_, (), _Pre1_impl_(__notnull_impl_notref) _Pre_valid_impl_) -#define _Pre_opt_valid_ _SAL2_Source_(_Pre_opt_valid_, (), _Pre1_impl_(__maybenull_impl_notref) _Pre_valid_impl_) - -#define _Pre_invalid_ _SAL2_Source_(_Pre_invalid_, (), _Deref_pre1_impl_(__notvalid_impl)) - -// Overrides recursive valid when some field is not yet initialized when using _Inout_ -#define _Pre_unknown_ _SAL2_Source_(_Pre_unknown_, (), _Pre1_impl_(__maybevalid_impl)) - -// used with allocated but not yet initialized objects -#define _Pre_notnull_ _SAL2_Source_(_Pre_notnull_, (), _Pre1_impl_(__notnull_impl_notref)) -#define _Pre_maybenull_ _SAL2_Source_(_Pre_maybenull_, (), _Pre1_impl_(__maybenull_impl_notref)) -#define _Pre_null_ _SAL2_Source_(_Pre_null_, (), _Pre1_impl_(__null_impl_notref)) - -// -// _Post_ annotations --- -// -// describing conditions that hold after the function call - -// void CopyStr( _In_z_ const char* szFrom, _Pre_cap_(cch) _Post_z_ char* szFrom, size_t cchFrom ); -// buffer will be a zero-terminated string after the call -#define _Post_z_ _SAL2_Source_(_Post_z_, (), _Post1_impl_(__zterm_impl) _Post_valid_impl_) - -// e.g. HRESULT InitStruct( _Post_valid_ Struct* pobj ); -#define _Post_valid_ _SAL2_Source_(_Post_valid_, (), _Post_valid_impl_) -#define _Post_invalid_ _SAL2_Source_(_Post_invalid_, (), _Deref_post1_impl_(__notvalid_impl)) - -// e.g. void free( _Post_ptr_invalid_ void* pv ); -#define _Post_ptr_invalid_ _SAL2_Source_(_Post_ptr_invalid_, (), _Post1_impl_(__notvalid_impl)) - -// e.g. void ThrowExceptionIfNull( _Post_notnull_ const void* pv ); -#define _Post_notnull_ _SAL2_Source_(_Post_notnull_, (), _Post1_impl_(__notnull_impl)) - -// e.g. 
HRESULT GetObject(_Outptr_ _On_failure_(_At_(*p, _Post_null_)) T **p); -#define _Post_null_ _SAL2_Source_(_Post_null_, (), _Post1_impl_(__null_impl)) - -#define _Post_maybenull_ _SAL2_Source_(_Post_maybenull_, (), _Post1_impl_(__maybenull_impl)) - -#define _Prepost_z_ _SAL2_Source_(_Prepost_z_, (), _Pre_z_ _Post_z_) - - -// #pragma region Input Buffer SAL 1 compatibility macros - -/*========================================================================== - - This section contains definitions for macros defined for VS2010 and earlier. - Usage of these macros is still supported, but the SAL 2 macros defined above - are recommended instead. This comment block is retained to assist in - understanding SAL that still uses the older syntax. - - The macros are defined in 3 layers: - - _In_\_Out_ Layer: - ---------------- - This layer provides the highest abstraction and its macros should be used - in most cases. Its macros start with _In_, _Out_ or _Inout_. For the - typical case they provide the most concise annotations. - - _Pre_\_Post_ Layer: - ------------------ - The macros of this layer only should be used when there is no suitable macro - in the _In_\_Out_ layer. Its macros start with _Pre_, _Post_, _Ret_, - _Deref_pre_ _Deref_post_ and _Deref_ret_. This layer provides the most - flexibility for annotations. - - Implementation Abstraction Layer: - -------------------------------- - Macros from this layer should never be used directly. The layer only exists - to hide the implementation of the annotation macros. - - - Annotation Syntax: - |--------------|----------|----------------|-----------------------------| - | Usage | Nullness | ZeroTerminated | Extent | - |--------------|----------|----------------|-----------------------------| - | _In_ | <> | <> | <> | - | _Out_ | opt_ | z_ | [byte]cap_[c_|x_]( size ) | - | _Inout_ | | | [byte]count_[c_|x_]( size ) | - | _Deref_out_ | | | ptrdiff_cap_( ptr ) | - |--------------| | | ptrdiff_count_( ptr ) | - | _Ret_ | | | | - | _Deref_ret_ | | | | - |--------------| | | | - | _Pre_ | | | | - | _Post_ | | | | - | _Deref_pre_ | | | | - | _Deref_post_ | | | | - |--------------|----------|----------------|-----------------------------| - - Usage: - ----- - _In_, _Out_, _Inout_, _Pre_, _Post_, _Deref_pre_, _Deref_post_ are for - formal parameters. - _Ret_, _Deref_ret_ must be used for return values. - - Nullness: - -------- - If the pointer can be NULL the annotation contains _opt. If the macro - does not contain '_opt' the pointer may not be NULL. - - String Type: - ----------- - _z: NullTerminated string - for _In_ parameters the buffer must have the specified stringtype before the call - for _Out_ parameters the buffer must have the specified stringtype after the call - for _Inout_ parameters both conditions apply - - Extent Syntax: - |------|---------------|---------------| - | Unit | Writ\Readable | Argument Type | - |------|---------------|---------------| - | <> | cap_ | <> | - | byte | count_ | c_ | - | | | x_ | - |------|---------------|---------------| - - 'cap' (capacity) describes the writable size of the buffer and is typically used - with _Out_. The default unit is elements. Use 'bytecap' if the size is given in bytes - 'count' describes the readable size of the buffer and is typically used with _In_. - The default unit is elements. Use 'bytecount' if the size is given in bytes. - - Argument syntax for cap_, bytecap_, count_, bytecount_: - (|return)[+n] e.g. 
cch, return, cb+2 - - If the buffer size is a constant expression use the c_ postfix. - E.g. cap_c_(20), count_c_(MAX_PATH), bytecount_c_(16) - - If the buffer size is given by a limiting pointer use the ptrdiff_ versions - of the macros. - - If the buffer size is neither a parameter nor a constant expression use the x_ - postfix. e.g. bytecount_x_(num*size) x_ annotations accept any arbitrary string. - No analysis can be done for x_ annotations but they at least tell the tool that - the buffer has some sort of extent description. x_ annotations might be supported - by future compiler versions. - -============================================================================*/ - -// e.g. void SetCharRange( _In_count_(cch) const char* rgch, size_t cch ) -// valid buffer extent described by another parameter -#define _In_count_(size) _SAL1_1_Source_(_In_count_, (size), _Pre_count_(size) _Deref_pre_readonly_) -#define _In_opt_count_(size) _SAL1_1_Source_(_In_opt_count_, (size), _Pre_opt_count_(size) _Deref_pre_readonly_) -#define _In_bytecount_(size) _SAL1_1_Source_(_In_bytecount_, (size), _Pre_bytecount_(size) _Deref_pre_readonly_) -#define _In_opt_bytecount_(size) _SAL1_1_Source_(_In_opt_bytecount_, (size), _Pre_opt_bytecount_(size) _Deref_pre_readonly_) - -// valid buffer extent described by a constant extression -#define _In_count_c_(size) _SAL1_1_Source_(_In_count_c_, (size), _Pre_count_c_(size) _Deref_pre_readonly_) -#define _In_opt_count_c_(size) _SAL1_1_Source_(_In_opt_count_c_, (size), _Pre_opt_count_c_(size) _Deref_pre_readonly_) -#define _In_bytecount_c_(size) _SAL1_1_Source_(_In_bytecount_c_, (size), _Pre_bytecount_c_(size) _Deref_pre_readonly_) -#define _In_opt_bytecount_c_(size) _SAL1_1_Source_(_In_opt_bytecount_c_, (size), _Pre_opt_bytecount_c_(size) _Deref_pre_readonly_) - -// nullterminated 'input' buffers with given size - -// e.g. void SetCharRange( _In_count_(cch) const char* rgch, size_t cch ) -// nullterminated valid buffer extent described by another parameter -#define _In_z_count_(size) _SAL1_1_Source_(_In_z_count_, (size), _Pre_z_ _Pre_count_(size) _Deref_pre_readonly_) -#define _In_opt_z_count_(size) _SAL1_1_Source_(_In_opt_z_count_, (size), _Pre_opt_z_ _Pre_opt_count_(size) _Deref_pre_readonly_) -#define _In_z_bytecount_(size) _SAL1_1_Source_(_In_z_bytecount_, (size), _Pre_z_ _Pre_bytecount_(size) _Deref_pre_readonly_) -#define _In_opt_z_bytecount_(size) _SAL1_1_Source_(_In_opt_z_bytecount_, (size), _Pre_opt_z_ _Pre_opt_bytecount_(size) _Deref_pre_readonly_) - -// nullterminated valid buffer extent described by a constant extression -#define _In_z_count_c_(size) _SAL1_1_Source_(_In_z_count_c_, (size), _Pre_z_ _Pre_count_c_(size) _Deref_pre_readonly_) -#define _In_opt_z_count_c_(size) _SAL1_1_Source_(_In_opt_z_count_c_, (size), _Pre_opt_z_ _Pre_opt_count_c_(size) _Deref_pre_readonly_) -#define _In_z_bytecount_c_(size) _SAL1_1_Source_(_In_z_bytecount_c_, (size), _Pre_z_ _Pre_bytecount_c_(size) _Deref_pre_readonly_) -#define _In_opt_z_bytecount_c_(size) _SAL1_1_Source_(_In_opt_z_bytecount_c_, (size), _Pre_opt_z_ _Pre_opt_bytecount_c_(size) _Deref_pre_readonly_) - -// buffer capacity is described by another pointer -// e.g. 
void Foo( _In_ptrdiff_count_(pchMax) const char* pch, const char* pchMax ) { while pch < pchMax ) pch++; } -#define _In_ptrdiff_count_(size) _SAL1_1_Source_(_In_ptrdiff_count_, (size), _Pre_ptrdiff_count_(size) _Deref_pre_readonly_) -#define _In_opt_ptrdiff_count_(size) _SAL1_1_Source_(_In_opt_ptrdiff_count_, (size), _Pre_opt_ptrdiff_count_(size) _Deref_pre_readonly_) - -// 'x' version for complex expressions that are not supported by the current compiler version -// e.g. void Set3ColMatrix( _In_count_x_(3*cRows) const Elem* matrix, int cRows ); -#define _In_count_x_(size) _SAL1_1_Source_(_In_count_x_, (size), _Pre_count_x_(size) _Deref_pre_readonly_) -#define _In_opt_count_x_(size) _SAL1_1_Source_(_In_opt_count_x_, (size), _Pre_opt_count_x_(size) _Deref_pre_readonly_) -#define _In_bytecount_x_(size) _SAL1_1_Source_(_In_bytecount_x_, (size), _Pre_bytecount_x_(size) _Deref_pre_readonly_) -#define _In_opt_bytecount_x_(size) _SAL1_1_Source_(_In_opt_bytecount_x_, (size), _Pre_opt_bytecount_x_(size) _Deref_pre_readonly_) - - -// 'out' with buffer size -// e.g. void GetIndices( _Out_cap_(cIndices) int* rgIndices, size_t cIndices ); -// buffer capacity is described by another parameter -#define _Out_cap_(size) _SAL1_1_Source_(_Out_cap_, (size), _Pre_cap_(size) _Post_valid_impl_) -#define _Out_opt_cap_(size) _SAL1_1_Source_(_Out_opt_cap_, (size), _Pre_opt_cap_(size) _Post_valid_impl_) -#define _Out_bytecap_(size) _SAL1_1_Source_(_Out_bytecap_, (size), _Pre_bytecap_(size) _Post_valid_impl_) -#define _Out_opt_bytecap_(size) _SAL1_1_Source_(_Out_opt_bytecap_, (size), _Pre_opt_bytecap_(size) _Post_valid_impl_) - -// buffer capacity is described by a constant expression -#define _Out_cap_c_(size) _SAL1_1_Source_(_Out_cap_c_, (size), _Pre_cap_c_(size) _Post_valid_impl_) -#define _Out_opt_cap_c_(size) _SAL1_1_Source_(_Out_opt_cap_c_, (size), _Pre_opt_cap_c_(size) _Post_valid_impl_) -#define _Out_bytecap_c_(size) _SAL1_1_Source_(_Out_bytecap_c_, (size), _Pre_bytecap_c_(size) _Post_valid_impl_) -#define _Out_opt_bytecap_c_(size) _SAL1_1_Source_(_Out_opt_bytecap_c_, (size), _Pre_opt_bytecap_c_(size) _Post_valid_impl_) - -// buffer capacity is described by another parameter multiplied by a constant expression -#define _Out_cap_m_(mult,size) _SAL1_1_Source_(_Out_cap_m_, (mult,size), _Pre_cap_m_(mult,size) _Post_valid_impl_) -#define _Out_opt_cap_m_(mult,size) _SAL1_1_Source_(_Out_opt_cap_m_, (mult,size), _Pre_opt_cap_m_(mult,size) _Post_valid_impl_) -#define _Out_z_cap_m_(mult,size) _SAL1_1_Source_(_Out_z_cap_m_, (mult,size), _Pre_cap_m_(mult,size) _Post_valid_impl_ _Post_z_) -#define _Out_opt_z_cap_m_(mult,size) _SAL1_1_Source_(_Out_opt_z_cap_m_, (mult,size), _Pre_opt_cap_m_(mult,size) _Post_valid_impl_ _Post_z_) - -// buffer capacity is described by another pointer -// e.g. 
void Foo( _Out_ptrdiff_cap_(pchMax) char* pch, const char* pchMax ) { while pch < pchMax ) pch++; } -#define _Out_ptrdiff_cap_(size) _SAL1_1_Source_(_Out_ptrdiff_cap_, (size), _Pre_ptrdiff_cap_(size) _Post_valid_impl_) -#define _Out_opt_ptrdiff_cap_(size) _SAL1_1_Source_(_Out_opt_ptrdiff_cap_, (size), _Pre_opt_ptrdiff_cap_(size) _Post_valid_impl_) - -// buffer capacity is described by a complex expression -#define _Out_cap_x_(size) _SAL1_1_Source_(_Out_cap_x_, (size), _Pre_cap_x_(size) _Post_valid_impl_) -#define _Out_opt_cap_x_(size) _SAL1_1_Source_(_Out_opt_cap_x_, (size), _Pre_opt_cap_x_(size) _Post_valid_impl_) -#define _Out_bytecap_x_(size) _SAL1_1_Source_(_Out_bytecap_x_, (size), _Pre_bytecap_x_(size) _Post_valid_impl_) -#define _Out_opt_bytecap_x_(size) _SAL1_1_Source_(_Out_opt_bytecap_x_, (size), _Pre_opt_bytecap_x_(size) _Post_valid_impl_) - -// a zero terminated string is filled into a buffer of given capacity -// e.g. void CopyStr( _In_z_ const char* szFrom, _Out_z_cap_(cchTo) char* szTo, size_t cchTo ); -// buffer capacity is described by another parameter -#define _Out_z_cap_(size) _SAL1_1_Source_(_Out_z_cap_, (size), _Pre_cap_(size) _Post_valid_impl_ _Post_z_) -#define _Out_opt_z_cap_(size) _SAL1_1_Source_(_Out_opt_z_cap_, (size), _Pre_opt_cap_(size) _Post_valid_impl_ _Post_z_) -#define _Out_z_bytecap_(size) _SAL1_1_Source_(_Out_z_bytecap_, (size), _Pre_bytecap_(size) _Post_valid_impl_ _Post_z_) -#define _Out_opt_z_bytecap_(size) _SAL1_1_Source_(_Out_opt_z_bytecap_, (size), _Pre_opt_bytecap_(size) _Post_valid_impl_ _Post_z_) - -// buffer capacity is described by a constant expression -#define _Out_z_cap_c_(size) _SAL1_1_Source_(_Out_z_cap_c_, (size), _Pre_cap_c_(size) _Post_valid_impl_ _Post_z_) -#define _Out_opt_z_cap_c_(size) _SAL1_1_Source_(_Out_opt_z_cap_c_, (size), _Pre_opt_cap_c_(size) _Post_valid_impl_ _Post_z_) -#define _Out_z_bytecap_c_(size) _SAL1_1_Source_(_Out_z_bytecap_c_, (size), _Pre_bytecap_c_(size) _Post_valid_impl_ _Post_z_) -#define _Out_opt_z_bytecap_c_(size) _SAL1_1_Source_(_Out_opt_z_bytecap_c_, (size), _Pre_opt_bytecap_c_(size) _Post_valid_impl_ _Post_z_) - -// buffer capacity is described by a complex expression -#define _Out_z_cap_x_(size) _SAL1_1_Source_(_Out_z_cap_x_, (size), _Pre_cap_x_(size) _Post_valid_impl_ _Post_z_) -#define _Out_opt_z_cap_x_(size) _SAL1_1_Source_(_Out_opt_z_cap_x_, (size), _Pre_opt_cap_x_(size) _Post_valid_impl_ _Post_z_) -#define _Out_z_bytecap_x_(size) _SAL1_1_Source_(_Out_z_bytecap_x_, (size), _Pre_bytecap_x_(size) _Post_valid_impl_ _Post_z_) -#define _Out_opt_z_bytecap_x_(size) _SAL1_1_Source_(_Out_opt_z_bytecap_x_, (size), _Pre_opt_bytecap_x_(size) _Post_valid_impl_ _Post_z_) - -// a zero terminated string is filled into a buffer of given capacity -// e.g. 
size_t CopyCharRange( _In_count_(cchFrom) const char* rgchFrom, size_t cchFrom, _Out_cap_post_count_(cchTo,return)) char* rgchTo, size_t cchTo ); -#define _Out_cap_post_count_(cap,count) _SAL1_1_Source_(_Out_cap_post_count_, (cap,count), _Pre_cap_(cap) _Post_valid_impl_ _Post_count_(count)) -#define _Out_opt_cap_post_count_(cap,count) _SAL1_1_Source_(_Out_opt_cap_post_count_, (cap,count), _Pre_opt_cap_(cap) _Post_valid_impl_ _Post_count_(count)) -#define _Out_bytecap_post_bytecount_(cap,count) _SAL1_1_Source_(_Out_bytecap_post_bytecount_, (cap,count), _Pre_bytecap_(cap) _Post_valid_impl_ _Post_bytecount_(count)) -#define _Out_opt_bytecap_post_bytecount_(cap,count) _SAL1_1_Source_(_Out_opt_bytecap_post_bytecount_, (cap,count), _Pre_opt_bytecap_(cap) _Post_valid_impl_ _Post_bytecount_(count)) - -// a zero terminated string is filled into a buffer of given capacity -// e.g. size_t CopyStr( _In_z_ const char* szFrom, _Out_z_cap_post_count_(cchTo,return+1) char* szTo, size_t cchTo ); -#define _Out_z_cap_post_count_(cap,count) _SAL1_1_Source_(_Out_z_cap_post_count_, (cap,count), _Pre_cap_(cap) _Post_valid_impl_ _Post_z_count_(count)) -#define _Out_opt_z_cap_post_count_(cap,count) _SAL1_1_Source_(_Out_opt_z_cap_post_count_, (cap,count), _Pre_opt_cap_(cap) _Post_valid_impl_ _Post_z_count_(count)) -#define _Out_z_bytecap_post_bytecount_(cap,count) _SAL1_1_Source_(_Out_z_bytecap_post_bytecount_, (cap,count), _Pre_bytecap_(cap) _Post_valid_impl_ _Post_z_bytecount_(count)) -#define _Out_opt_z_bytecap_post_bytecount_(cap,count) _SAL1_1_Source_(_Out_opt_z_bytecap_post_bytecount_, (cap,count), _Pre_opt_bytecap_(cap) _Post_valid_impl_ _Post_z_bytecount_(count)) - -// only use with dereferenced arguments e.g. '*pcch' -#define _Out_capcount_(capcount) _SAL1_1_Source_(_Out_capcount_, (capcount), _Pre_cap_(capcount) _Post_valid_impl_ _Post_count_(capcount)) -#define _Out_opt_capcount_(capcount) _SAL1_1_Source_(_Out_opt_capcount_, (capcount), _Pre_opt_cap_(capcount) _Post_valid_impl_ _Post_count_(capcount)) -#define _Out_bytecapcount_(capcount) _SAL1_1_Source_(_Out_bytecapcount_, (capcount), _Pre_bytecap_(capcount) _Post_valid_impl_ _Post_bytecount_(capcount)) -#define _Out_opt_bytecapcount_(capcount) _SAL1_1_Source_(_Out_opt_bytecapcount_, (capcount), _Pre_opt_bytecap_(capcount) _Post_valid_impl_ _Post_bytecount_(capcount)) - -#define _Out_capcount_x_(capcount) _SAL1_1_Source_(_Out_capcount_x_, (capcount), _Pre_cap_x_(capcount) _Post_valid_impl_ _Post_count_x_(capcount)) -#define _Out_opt_capcount_x_(capcount) _SAL1_1_Source_(_Out_opt_capcount_x_, (capcount), _Pre_opt_cap_x_(capcount) _Post_valid_impl_ _Post_count_x_(capcount)) -#define _Out_bytecapcount_x_(capcount) _SAL1_1_Source_(_Out_bytecapcount_x_, (capcount), _Pre_bytecap_x_(capcount) _Post_valid_impl_ _Post_bytecount_x_(capcount)) -#define _Out_opt_bytecapcount_x_(capcount) _SAL1_1_Source_(_Out_opt_bytecapcount_x_, (capcount), _Pre_opt_bytecap_x_(capcount) _Post_valid_impl_ _Post_bytecount_x_(capcount)) - -// e.g. 
GetString( _Out_z_capcount_(*pLen+1) char* sz, size_t* pLen ); -#define _Out_z_capcount_(capcount) _SAL1_1_Source_(_Out_z_capcount_, (capcount), _Pre_cap_(capcount) _Post_valid_impl_ _Post_z_count_(capcount)) -#define _Out_opt_z_capcount_(capcount) _SAL1_1_Source_(_Out_opt_z_capcount_, (capcount), _Pre_opt_cap_(capcount) _Post_valid_impl_ _Post_z_count_(capcount)) -#define _Out_z_bytecapcount_(capcount) _SAL1_1_Source_(_Out_z_bytecapcount_, (capcount), _Pre_bytecap_(capcount) _Post_valid_impl_ _Post_z_bytecount_(capcount)) -#define _Out_opt_z_bytecapcount_(capcount) _SAL1_1_Source_(_Out_opt_z_bytecapcount_, (capcount), _Pre_opt_bytecap_(capcount) _Post_valid_impl_ _Post_z_bytecount_(capcount)) - - -// 'inout' buffers with initialized elements before and after the call -// e.g. void ModifyIndices( _Inout_count_(cIndices) int* rgIndices, size_t cIndices ); -#define _Inout_count_(size) _SAL1_1_Source_(_Inout_count_, (size), _Prepost_count_(size)) -#define _Inout_opt_count_(size) _SAL1_1_Source_(_Inout_opt_count_, (size), _Prepost_opt_count_(size)) -#define _Inout_bytecount_(size) _SAL1_1_Source_(_Inout_bytecount_, (size), _Prepost_bytecount_(size)) -#define _Inout_opt_bytecount_(size) _SAL1_1_Source_(_Inout_opt_bytecount_, (size), _Prepost_opt_bytecount_(size)) - -#define _Inout_count_c_(size) _SAL1_1_Source_(_Inout_count_c_, (size), _Prepost_count_c_(size)) -#define _Inout_opt_count_c_(size) _SAL1_1_Source_(_Inout_opt_count_c_, (size), _Prepost_opt_count_c_(size)) -#define _Inout_bytecount_c_(size) _SAL1_1_Source_(_Inout_bytecount_c_, (size), _Prepost_bytecount_c_(size)) -#define _Inout_opt_bytecount_c_(size) _SAL1_1_Source_(_Inout_opt_bytecount_c_, (size), _Prepost_opt_bytecount_c_(size)) - -// nullterminated 'inout' buffers with initialized elements before and after the call -// e.g. 
void ModifyIndices( _Inout_count_(cIndices) int* rgIndices, size_t cIndices ); -#define _Inout_z_count_(size) _SAL1_1_Source_(_Inout_z_count_, (size), _Prepost_z_ _Prepost_count_(size)) -#define _Inout_opt_z_count_(size) _SAL1_1_Source_(_Inout_opt_z_count_, (size), _Prepost_z_ _Prepost_opt_count_(size)) -#define _Inout_z_bytecount_(size) _SAL1_1_Source_(_Inout_z_bytecount_, (size), _Prepost_z_ _Prepost_bytecount_(size)) -#define _Inout_opt_z_bytecount_(size) _SAL1_1_Source_(_Inout_opt_z_bytecount_, (size), _Prepost_z_ _Prepost_opt_bytecount_(size)) - -#define _Inout_z_count_c_(size) _SAL1_1_Source_(_Inout_z_count_c_, (size), _Prepost_z_ _Prepost_count_c_(size)) -#define _Inout_opt_z_count_c_(size) _SAL1_1_Source_(_Inout_opt_z_count_c_, (size), _Prepost_z_ _Prepost_opt_count_c_(size)) -#define _Inout_z_bytecount_c_(size) _SAL1_1_Source_(_Inout_z_bytecount_c_, (size), _Prepost_z_ _Prepost_bytecount_c_(size)) -#define _Inout_opt_z_bytecount_c_(size) _SAL1_1_Source_(_Inout_opt_z_bytecount_c_, (size), _Prepost_z_ _Prepost_opt_bytecount_c_(size)) - -#define _Inout_ptrdiff_count_(size) _SAL1_1_Source_(_Inout_ptrdiff_count_, (size), _Pre_ptrdiff_count_(size)) -#define _Inout_opt_ptrdiff_count_(size) _SAL1_1_Source_(_Inout_opt_ptrdiff_count_, (size), _Pre_opt_ptrdiff_count_(size)) - -#define _Inout_count_x_(size) _SAL1_1_Source_(_Inout_count_x_, (size), _Prepost_count_x_(size)) -#define _Inout_opt_count_x_(size) _SAL1_1_Source_(_Inout_opt_count_x_, (size), _Prepost_opt_count_x_(size)) -#define _Inout_bytecount_x_(size) _SAL1_1_Source_(_Inout_bytecount_x_, (size), _Prepost_bytecount_x_(size)) -#define _Inout_opt_bytecount_x_(size) _SAL1_1_Source_(_Inout_opt_bytecount_x_, (size), _Prepost_opt_bytecount_x_(size)) - -// e.g. void AppendToLPSTR( _In_ LPCSTR szFrom, _Inout_cap_(cchTo) LPSTR* szTo, size_t cchTo ); -#define _Inout_cap_(size) _SAL1_1_Source_(_Inout_cap_, (size), _Pre_valid_cap_(size) _Post_valid_) -#define _Inout_opt_cap_(size) _SAL1_1_Source_(_Inout_opt_cap_, (size), _Pre_opt_valid_cap_(size) _Post_valid_) -#define _Inout_bytecap_(size) _SAL1_1_Source_(_Inout_bytecap_, (size), _Pre_valid_bytecap_(size) _Post_valid_) -#define _Inout_opt_bytecap_(size) _SAL1_1_Source_(_Inout_opt_bytecap_, (size), _Pre_opt_valid_bytecap_(size) _Post_valid_) - -#define _Inout_cap_c_(size) _SAL1_1_Source_(_Inout_cap_c_, (size), _Pre_valid_cap_c_(size) _Post_valid_) -#define _Inout_opt_cap_c_(size) _SAL1_1_Source_(_Inout_opt_cap_c_, (size), _Pre_opt_valid_cap_c_(size) _Post_valid_) -#define _Inout_bytecap_c_(size) _SAL1_1_Source_(_Inout_bytecap_c_, (size), _Pre_valid_bytecap_c_(size) _Post_valid_) -#define _Inout_opt_bytecap_c_(size) _SAL1_1_Source_(_Inout_opt_bytecap_c_, (size), _Pre_opt_valid_bytecap_c_(size) _Post_valid_) - -#define _Inout_cap_x_(size) _SAL1_1_Source_(_Inout_cap_x_, (size), _Pre_valid_cap_x_(size) _Post_valid_) -#define _Inout_opt_cap_x_(size) _SAL1_1_Source_(_Inout_opt_cap_x_, (size), _Pre_opt_valid_cap_x_(size) _Post_valid_) -#define _Inout_bytecap_x_(size) _SAL1_1_Source_(_Inout_bytecap_x_, (size), _Pre_valid_bytecap_x_(size) _Post_valid_) -#define _Inout_opt_bytecap_x_(size) _SAL1_1_Source_(_Inout_opt_bytecap_x_, (size), _Pre_opt_valid_bytecap_x_(size) _Post_valid_) - -// inout string buffers with writable size -// e.g. 
void AppendStr( _In_z_ const char* szFrom, _Inout_z_cap_(cchTo) char* szTo, size_t cchTo ); -#define _Inout_z_cap_(size) _SAL1_1_Source_(_Inout_z_cap_, (size), _Pre_z_cap_(size) _Post_z_) -#define _Inout_opt_z_cap_(size) _SAL1_1_Source_(_Inout_opt_z_cap_, (size), _Pre_opt_z_cap_(size) _Post_z_) -#define _Inout_z_bytecap_(size) _SAL1_1_Source_(_Inout_z_bytecap_, (size), _Pre_z_bytecap_(size) _Post_z_) -#define _Inout_opt_z_bytecap_(size) _SAL1_1_Source_(_Inout_opt_z_bytecap_, (size), _Pre_opt_z_bytecap_(size) _Post_z_) - -#define _Inout_z_cap_c_(size) _SAL1_1_Source_(_Inout_z_cap_c_, (size), _Pre_z_cap_c_(size) _Post_z_) -#define _Inout_opt_z_cap_c_(size) _SAL1_1_Source_(_Inout_opt_z_cap_c_, (size), _Pre_opt_z_cap_c_(size) _Post_z_) -#define _Inout_z_bytecap_c_(size) _SAL1_1_Source_(_Inout_z_bytecap_c_, (size), _Pre_z_bytecap_c_(size) _Post_z_) -#define _Inout_opt_z_bytecap_c_(size) _SAL1_1_Source_(_Inout_opt_z_bytecap_c_, (size), _Pre_opt_z_bytecap_c_(size) _Post_z_) - -#define _Inout_z_cap_x_(size) _SAL1_1_Source_(_Inout_z_cap_x_, (size), _Pre_z_cap_x_(size) _Post_z_) -#define _Inout_opt_z_cap_x_(size) _SAL1_1_Source_(_Inout_opt_z_cap_x_, (size), _Pre_opt_z_cap_x_(size) _Post_z_) -#define _Inout_z_bytecap_x_(size) _SAL1_1_Source_(_Inout_z_bytecap_x_, (size), _Pre_z_bytecap_x_(size) _Post_z_) -#define _Inout_opt_z_bytecap_x_(size) _SAL1_1_Source_(_Inout_opt_z_bytecap_x_, (size), _Pre_opt_z_bytecap_x_(size) _Post_z_) - - -// returning pointers to valid objects -#define _Ret_ _SAL1_1_Source_(_Ret_, (), _Ret_valid_) -#define _Ret_opt_ _SAL1_1_Source_(_Ret_opt_, (), _Ret_opt_valid_) - -// annotations to express 'boundedness' of integral value parameter -#define _In_bound_ _SAL1_1_Source_(_In_bound_, (), _In_bound_impl_) -#define _Out_bound_ _SAL1_1_Source_(_Out_bound_, (), _Out_bound_impl_) -#define _Ret_bound_ _SAL1_1_Source_(_Ret_bound_, (), _Ret_bound_impl_) -#define _Deref_in_bound_ _SAL1_1_Source_(_Deref_in_bound_, (), _Deref_in_bound_impl_) -#define _Deref_out_bound_ _SAL1_1_Source_(_Deref_out_bound_, (), _Deref_out_bound_impl_) -#define _Deref_inout_bound_ _SAL1_1_Source_(_Deref_inout_bound_, (), _Deref_in_bound_ _Deref_out_bound_) -#define _Deref_ret_bound_ _SAL1_1_Source_(_Deref_ret_bound_, (), _Deref_ret_bound_impl_) - -// e.g. HRESULT HrCreatePoint( _Deref_out_opt_ POINT** ppPT ); -#define _Deref_out_ _SAL1_1_Source_(_Deref_out_, (), _Out_ _Deref_post_valid_) -#define _Deref_out_opt_ _SAL1_1_Source_(_Deref_out_opt_, (), _Out_ _Deref_post_opt_valid_) -#define _Deref_opt_out_ _SAL1_1_Source_(_Deref_opt_out_, (), _Out_opt_ _Deref_post_valid_) -#define _Deref_opt_out_opt_ _SAL1_1_Source_(_Deref_opt_out_opt_, (), _Out_opt_ _Deref_post_opt_valid_) - -// e.g. void CloneString( _In_z_ const WCHAR* wzFrom, _Deref_out_z_ WCHAR** pWzTo ); -#define _Deref_out_z_ _SAL1_1_Source_(_Deref_out_z_, (), _Out_ _Deref_post_z_) -#define _Deref_out_opt_z_ _SAL1_1_Source_(_Deref_out_opt_z_, (), _Out_ _Deref_post_opt_z_) -#define _Deref_opt_out_z_ _SAL1_1_Source_(_Deref_opt_out_z_, (), _Out_opt_ _Deref_post_z_) -#define _Deref_opt_out_opt_z_ _SAL1_1_Source_(_Deref_opt_out_opt_z_, (), _Out_opt_ _Deref_post_opt_z_) - -// -// _Deref_pre_ --- -// -// describing conditions for array elements of dereferenced pointer parameters that must be met before the call - -// e.g. 
void SaveStringArray( _In_count_(cStrings) _Deref_pre_z_ const WCHAR* const rgpwch[] ); -#define _Deref_pre_z_ _SAL1_1_Source_(_Deref_pre_z_, (), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__zterm_impl) _Pre_valid_impl_) -#define _Deref_pre_opt_z_ _SAL1_1_Source_(_Deref_pre_opt_z_, (), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__zterm_impl) _Pre_valid_impl_) - -// e.g. void FillInArrayOfStr32( _In_count_(cStrings) _Deref_pre_cap_c_(32) _Deref_post_z_ WCHAR* const rgpwch[] ); -// buffer capacity is described by another parameter -#define _Deref_pre_cap_(size) _SAL1_1_Source_(_Deref_pre_cap_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__cap_impl(size))) -#define _Deref_pre_opt_cap_(size) _SAL1_1_Source_(_Deref_pre_opt_cap_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__cap_impl(size))) -#define _Deref_pre_bytecap_(size) _SAL1_1_Source_(_Deref_pre_bytecap_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__bytecap_impl(size))) -#define _Deref_pre_opt_bytecap_(size) _SAL1_1_Source_(_Deref_pre_opt_bytecap_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__bytecap_impl(size))) - -// buffer capacity is described by a constant expression -#define _Deref_pre_cap_c_(size) _SAL1_1_Source_(_Deref_pre_cap_c_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__cap_c_impl(size))) -#define _Deref_pre_opt_cap_c_(size) _SAL1_1_Source_(_Deref_pre_opt_cap_c_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__cap_c_impl(size))) -#define _Deref_pre_bytecap_c_(size) _SAL1_1_Source_(_Deref_pre_bytecap_c_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__bytecap_c_impl(size))) -#define _Deref_pre_opt_bytecap_c_(size) _SAL1_1_Source_(_Deref_pre_opt_bytecap_c_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__bytecap_c_impl(size))) - -// buffer capacity is described by a complex condition -#define _Deref_pre_cap_x_(size) _SAL1_1_Source_(_Deref_pre_cap_x_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__cap_x_impl(size))) -#define _Deref_pre_opt_cap_x_(size) _SAL1_1_Source_(_Deref_pre_opt_cap_x_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__cap_x_impl(size))) -#define _Deref_pre_bytecap_x_(size) _SAL1_1_Source_(_Deref_pre_bytecap_x_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__bytecap_x_impl(size))) -#define _Deref_pre_opt_bytecap_x_(size) _SAL1_1_Source_(_Deref_pre_opt_bytecap_x_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__bytecap_x_impl(size))) - -// convenience macros for nullterminated buffers with given capacity -#define _Deref_pre_z_cap_(size) _SAL1_1_Source_(_Deref_pre_z_cap_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__cap_impl(size)) _Pre_valid_impl_) -#define _Deref_pre_opt_z_cap_(size) _SAL1_1_Source_(_Deref_pre_opt_z_cap_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__cap_impl(size)) _Pre_valid_impl_) -#define _Deref_pre_z_bytecap_(size) _SAL1_1_Source_(_Deref_pre_z_bytecap_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__bytecap_impl(size)) _Pre_valid_impl_) -#define _Deref_pre_opt_z_bytecap_(size) _SAL1_1_Source_(_Deref_pre_opt_z_bytecap_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__bytecap_impl(size)) _Pre_valid_impl_) - -#define 
_Deref_pre_z_cap_c_(size) _SAL1_1_Source_(_Deref_pre_z_cap_c_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__cap_c_impl(size)) _Pre_valid_impl_) -#define _Deref_pre_opt_z_cap_c_(size) _SAL1_1_Source_(_Deref_pre_opt_z_cap_c_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__cap_c_impl(size)) _Pre_valid_impl_) -#define _Deref_pre_z_bytecap_c_(size) _SAL1_1_Source_(_Deref_pre_z_bytecap_c_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__bytecap_c_impl(size)) _Pre_valid_impl_) -#define _Deref_pre_opt_z_bytecap_c_(size) _SAL1_1_Source_(_Deref_pre_opt_z_bytecap_c_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__bytecap_c_impl(size)) _Pre_valid_impl_) - -#define _Deref_pre_z_cap_x_(size) _SAL1_1_Source_(_Deref_pre_z_cap_x_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__cap_x_impl(size)) _Pre_valid_impl_) -#define _Deref_pre_opt_z_cap_x_(size) _SAL1_1_Source_(_Deref_pre_opt_z_cap_x_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__cap_x_impl(size)) _Pre_valid_impl_) -#define _Deref_pre_z_bytecap_x_(size) _SAL1_1_Source_(_Deref_pre_z_bytecap_x_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__bytecap_x_impl(size)) _Pre_valid_impl_) -#define _Deref_pre_opt_z_bytecap_x_(size) _SAL1_1_Source_(_Deref_pre_opt_z_bytecap_x_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre2_impl_(__zterm_impl,__bytecap_x_impl(size)) _Pre_valid_impl_) - -// known capacity and valid but unknown readable extent -#define _Deref_pre_valid_cap_(size) _SAL1_1_Source_(_Deref_pre_valid_cap_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__cap_impl(size)) _Pre_valid_impl_) -#define _Deref_pre_opt_valid_cap_(size) _SAL1_1_Source_(_Deref_pre_opt_valid_cap_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__cap_impl(size)) _Pre_valid_impl_) -#define _Deref_pre_valid_bytecap_(size) _SAL1_1_Source_(_Deref_pre_valid_bytecap_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__bytecap_impl(size)) _Pre_valid_impl_) -#define _Deref_pre_opt_valid_bytecap_(size) _SAL1_1_Source_(_Deref_pre_opt_valid_bytecap_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__bytecap_impl(size)) _Pre_valid_impl_) - -#define _Deref_pre_valid_cap_c_(size) _SAL1_1_Source_(_Deref_pre_valid_cap_c_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__cap_c_impl(size)) _Pre_valid_impl_) -#define _Deref_pre_opt_valid_cap_c_(size) _SAL1_1_Source_(_Deref_pre_opt_valid_cap_c_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__cap_c_impl(size)) _Pre_valid_impl_) -#define _Deref_pre_valid_bytecap_c_(size) _SAL1_1_Source_(_Deref_pre_valid_bytecap_c_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__bytecap_c_impl(size)) _Pre_valid_impl_) -#define _Deref_pre_opt_valid_bytecap_c_(size) _SAL1_1_Source_(_Deref_pre_opt_valid_bytecap_c_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__bytecap_c_impl(size)) _Pre_valid_impl_) - -#define _Deref_pre_valid_cap_x_(size) _SAL1_1_Source_(_Deref_pre_valid_cap_x_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__cap_x_impl(size)) _Pre_valid_impl_) -#define _Deref_pre_opt_valid_cap_x_(size) _SAL1_1_Source_(_Deref_pre_opt_valid_cap_x_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) 
_Deref_pre1_impl_(__cap_x_impl(size)) _Pre_valid_impl_) -#define _Deref_pre_valid_bytecap_x_(size) _SAL1_1_Source_(_Deref_pre_valid_bytecap_x_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__bytecap_x_impl(size)) _Pre_valid_impl_) -#define _Deref_pre_opt_valid_bytecap_x_(size) _SAL1_1_Source_(_Deref_pre_opt_valid_bytecap_x_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__bytecap_x_impl(size)) _Pre_valid_impl_) - -// e.g. void SaveMatrix( _In_count_(n) _Deref_pre_count_(n) const Elem** matrix, size_t n ); -// valid buffer extent is described by another parameter -#define _Deref_pre_count_(size) _SAL1_1_Source_(_Deref_pre_count_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__count_impl(size)) _Pre_valid_impl_) -#define _Deref_pre_opt_count_(size) _SAL1_1_Source_(_Deref_pre_opt_count_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__count_impl(size)) _Pre_valid_impl_) -#define _Deref_pre_bytecount_(size) _SAL1_1_Source_(_Deref_pre_bytecount_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__bytecount_impl(size)) _Pre_valid_impl_) -#define _Deref_pre_opt_bytecount_(size) _SAL1_1_Source_(_Deref_pre_opt_bytecount_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__bytecount_impl(size)) _Pre_valid_impl_) - -// valid buffer extent is described by a constant expression -#define _Deref_pre_count_c_(size) _SAL1_1_Source_(_Deref_pre_count_c_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__count_c_impl(size)) _Pre_valid_impl_) -#define _Deref_pre_opt_count_c_(size) _SAL1_1_Source_(_Deref_pre_opt_count_c_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__count_c_impl(size)) _Pre_valid_impl_) -#define _Deref_pre_bytecount_c_(size) _SAL1_1_Source_(_Deref_pre_bytecount_c_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__bytecount_c_impl(size)) _Pre_valid_impl_) -#define _Deref_pre_opt_bytecount_c_(size) _SAL1_1_Source_(_Deref_pre_opt_bytecount_c_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__bytecount_c_impl(size)) _Pre_valid_impl_) - -// valid buffer extent is described by a complex expression -#define _Deref_pre_count_x_(size) _SAL1_1_Source_(_Deref_pre_count_x_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__count_x_impl(size)) _Pre_valid_impl_) -#define _Deref_pre_opt_count_x_(size) _SAL1_1_Source_(_Deref_pre_opt_count_x_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__count_x_impl(size)) _Pre_valid_impl_) -#define _Deref_pre_bytecount_x_(size) _SAL1_1_Source_(_Deref_pre_bytecount_x_, (size), _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__bytecount_x_impl(size)) _Pre_valid_impl_) -#define _Deref_pre_opt_bytecount_x_(size) _SAL1_1_Source_(_Deref_pre_opt_bytecount_x_, (size), _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__bytecount_x_impl(size)) _Pre_valid_impl_) - -// e.g. 
void PrintStringArray( _In_count_(cElems) _Deref_pre_valid_ LPCSTR rgStr[], size_t cElems ); -#define _Deref_pre_valid_ _SAL1_1_Source_(_Deref_pre_valid_, (), _Deref_pre1_impl_(__notnull_impl_notref) _Pre_valid_impl_) -#define _Deref_pre_opt_valid_ _SAL1_1_Source_(_Deref_pre_opt_valid_, (), _Deref_pre1_impl_(__maybenull_impl_notref) _Pre_valid_impl_) -#define _Deref_pre_invalid_ _SAL1_1_Source_(_Deref_pre_invalid_, (), _Deref_pre1_impl_(__notvalid_impl)) - -#define _Deref_pre_notnull_ _SAL1_1_Source_(_Deref_pre_notnull_, (), _Deref_pre1_impl_(__notnull_impl_notref)) -#define _Deref_pre_maybenull_ _SAL1_1_Source_(_Deref_pre_maybenull_, (), _Deref_pre1_impl_(__maybenull_impl_notref)) -#define _Deref_pre_null_ _SAL1_1_Source_(_Deref_pre_null_, (), _Deref_pre1_impl_(__null_impl_notref)) - -// restrict access rights -#define _Deref_pre_readonly_ _SAL1_1_Source_(_Deref_pre_readonly_, (), _Deref_pre1_impl_(__readaccess_impl_notref)) -#define _Deref_pre_writeonly_ _SAL1_1_Source_(_Deref_pre_writeonly_, (), _Deref_pre1_impl_(__writeaccess_impl_notref)) - -// -// _Deref_post_ --- -// -// describing conditions for array elements or dereferenced pointer parameters that hold after the call - -// e.g. void CloneString( _In_z_ const Wchar_t* wzIn _Out_ _Deref_post_z_ WCHAR** pWzOut ); -#define _Deref_post_z_ _SAL1_1_Source_(_Deref_post_z_, (), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__zterm_impl) _Post_valid_impl_) -#define _Deref_post_opt_z_ _SAL1_1_Source_(_Deref_post_opt_z_, (), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__zterm_impl) _Post_valid_impl_) - -// e.g. HRESULT HrAllocateMemory( size_t cb, _Out_ _Deref_post_bytecap_(cb) void** ppv ); -// buffer capacity is described by another parameter -#define _Deref_post_cap_(size) _SAL1_1_Source_(_Deref_post_cap_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__cap_impl(size))) -#define _Deref_post_opt_cap_(size) _SAL1_1_Source_(_Deref_post_opt_cap_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__cap_impl(size))) -#define _Deref_post_bytecap_(size) _SAL1_1_Source_(_Deref_post_bytecap_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__bytecap_impl(size))) -#define _Deref_post_opt_bytecap_(size) _SAL1_1_Source_(_Deref_post_opt_bytecap_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__bytecap_impl(size))) - -// buffer capacity is described by a constant expression -#define _Deref_post_cap_c_(size) _SAL1_1_Source_(_Deref_post_cap_c_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__cap_c_impl(size))) -#define _Deref_post_opt_cap_c_(size) _SAL1_1_Source_(_Deref_post_opt_cap_c_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__cap_c_impl(size))) -#define _Deref_post_bytecap_c_(size) _SAL1_1_Source_(_Deref_post_bytecap_c_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__bytecap_c_impl(size))) -#define _Deref_post_opt_bytecap_c_(size) _SAL1_1_Source_(_Deref_post_opt_bytecap_c_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__bytecap_c_impl(size))) - -// buffer capacity is described by a complex expression -#define _Deref_post_cap_x_(size) _SAL1_1_Source_(_Deref_post_cap_x_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__cap_x_impl(size))) -#define _Deref_post_opt_cap_x_(size) _SAL1_1_Source_(_Deref_post_opt_cap_x_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__cap_x_impl(size))) 
-#define _Deref_post_bytecap_x_(size) _SAL1_1_Source_(_Deref_post_bytecap_x_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__bytecap_x_impl(size))) -#define _Deref_post_opt_bytecap_x_(size) _SAL1_1_Source_(_Deref_post_opt_bytecap_x_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__bytecap_x_impl(size))) - -// convenience macros for nullterminated buffers with given capacity -#define _Deref_post_z_cap_(size) _SAL1_1_Source_(_Deref_post_z_cap_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post2_impl_(__zterm_impl,__cap_impl(size)) _Post_valid_impl_) -#define _Deref_post_opt_z_cap_(size) _SAL1_1_Source_(_Deref_post_opt_z_cap_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post2_impl_(__zterm_impl,__cap_impl(size)) _Post_valid_impl_) -#define _Deref_post_z_bytecap_(size) _SAL1_1_Source_(_Deref_post_z_bytecap_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post2_impl_(__zterm_impl,__bytecap_impl(size)) _Post_valid_impl_) -#define _Deref_post_opt_z_bytecap_(size) _SAL1_1_Source_(_Deref_post_opt_z_bytecap_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post2_impl_(__zterm_impl,__bytecap_impl(size)) _Post_valid_impl_) - -#define _Deref_post_z_cap_c_(size) _SAL1_1_Source_(_Deref_post_z_cap_c_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post2_impl_(__zterm_impl,__cap_c_impl(size)) _Post_valid_impl_) -#define _Deref_post_opt_z_cap_c_(size) _SAL1_1_Source_(_Deref_post_opt_z_cap_c_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post2_impl_(__zterm_impl,__cap_c_impl(size)) _Post_valid_impl_) -#define _Deref_post_z_bytecap_c_(size) _SAL1_1_Source_(_Deref_post_z_bytecap_c_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post2_impl_(__zterm_impl,__bytecap_c_impl(size)) _Post_valid_impl_) -#define _Deref_post_opt_z_bytecap_c_(size) _SAL1_1_Source_(_Deref_post_opt_z_bytecap_c_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post2_impl_(__zterm_impl,__bytecap_c_impl(size)) _Post_valid_impl_) - -#define _Deref_post_z_cap_x_(size) _SAL1_1_Source_(_Deref_post_z_cap_x_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post2_impl_(__zterm_impl,__cap_x_impl(size)) _Post_valid_impl_) -#define _Deref_post_opt_z_cap_x_(size) _SAL1_1_Source_(_Deref_post_opt_z_cap_x_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post2_impl_(__zterm_impl,__cap_x_impl(size)) _Post_valid_impl_) -#define _Deref_post_z_bytecap_x_(size) _SAL1_1_Source_(_Deref_post_z_bytecap_x_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post2_impl_(__zterm_impl,__bytecap_x_impl(size)) _Post_valid_impl_) -#define _Deref_post_opt_z_bytecap_x_(size) _SAL1_1_Source_(_Deref_post_opt_z_bytecap_x_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post2_impl_(__zterm_impl,__bytecap_x_impl(size)) _Post_valid_impl_) - -// known capacity and valid but unknown readable extent -#define _Deref_post_valid_cap_(size) _SAL1_1_Source_(_Deref_post_valid_cap_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__cap_impl(size)) _Post_valid_impl_) -#define _Deref_post_opt_valid_cap_(size) _SAL1_1_Source_(_Deref_post_opt_valid_cap_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__cap_impl(size)) _Post_valid_impl_) -#define _Deref_post_valid_bytecap_(size) _SAL1_1_Source_(_Deref_post_valid_bytecap_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__bytecap_impl(size)) _Post_valid_impl_) -#define 
_Deref_post_opt_valid_bytecap_(size) _SAL1_1_Source_(_Deref_post_opt_valid_bytecap_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__bytecap_impl(size)) _Post_valid_impl_) - -#define _Deref_post_valid_cap_c_(size) _SAL1_1_Source_(_Deref_post_valid_cap_c_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__cap_c_impl(size)) _Post_valid_impl_) -#define _Deref_post_opt_valid_cap_c_(size) _SAL1_1_Source_(_Deref_post_opt_valid_cap_c_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__cap_c_impl(size)) _Post_valid_impl_) -#define _Deref_post_valid_bytecap_c_(size) _SAL1_1_Source_(_Deref_post_valid_bytecap_c_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__bytecap_c_impl(size)) _Post_valid_impl_) -#define _Deref_post_opt_valid_bytecap_c_(size) _SAL1_1_Source_(_Deref_post_opt_valid_bytecap_c_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__bytecap_c_impl(size)) _Post_valid_impl_) - -#define _Deref_post_valid_cap_x_(size) _SAL1_1_Source_(_Deref_post_valid_cap_x_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__cap_x_impl(size)) _Post_valid_impl_) -#define _Deref_post_opt_valid_cap_x_(size) _SAL1_1_Source_(_Deref_post_opt_valid_cap_x_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__cap_x_impl(size)) _Post_valid_impl_) -#define _Deref_post_valid_bytecap_x_(size) _SAL1_1_Source_(_Deref_post_valid_bytecap_x_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__bytecap_x_impl(size)) _Post_valid_impl_) -#define _Deref_post_opt_valid_bytecap_x_(size) _SAL1_1_Source_(_Deref_post_opt_valid_bytecap_x_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__bytecap_x_impl(size)) _Post_valid_impl_) - -// e.g. 
HRESULT HrAllocateZeroInitializedMemory( size_t cb, _Out_ _Deref_post_bytecount_(cb) void** ppv ); -// valid buffer extent is described by another parameter -#define _Deref_post_count_(size) _SAL1_1_Source_(_Deref_post_count_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__count_impl(size)) _Post_valid_impl_) -#define _Deref_post_opt_count_(size) _SAL1_1_Source_(_Deref_post_opt_count_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__count_impl(size)) _Post_valid_impl_) -#define _Deref_post_bytecount_(size) _SAL1_1_Source_(_Deref_post_bytecount_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__bytecount_impl(size)) _Post_valid_impl_) -#define _Deref_post_opt_bytecount_(size) _SAL1_1_Source_(_Deref_post_opt_bytecount_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__bytecount_impl(size)) _Post_valid_impl_) - -// buffer capacity is described by a constant expression -#define _Deref_post_count_c_(size) _SAL1_1_Source_(_Deref_post_count_c_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__count_c_impl(size)) _Post_valid_impl_) -#define _Deref_post_opt_count_c_(size) _SAL1_1_Source_(_Deref_post_opt_count_c_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__count_c_impl(size)) _Post_valid_impl_) -#define _Deref_post_bytecount_c_(size) _SAL1_1_Source_(_Deref_post_bytecount_c_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__bytecount_c_impl(size)) _Post_valid_impl_) -#define _Deref_post_opt_bytecount_c_(size) _SAL1_1_Source_(_Deref_post_opt_bytecount_c_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__bytecount_c_impl(size)) _Post_valid_impl_) - -// buffer capacity is described by a complex expression -#define _Deref_post_count_x_(size) _SAL1_1_Source_(_Deref_post_count_x_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__count_x_impl(size)) _Post_valid_impl_) -#define _Deref_post_opt_count_x_(size) _SAL1_1_Source_(_Deref_post_opt_count_x_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__count_x_impl(size)) _Post_valid_impl_) -#define _Deref_post_bytecount_x_(size) _SAL1_1_Source_(_Deref_post_bytecount_x_, (size), _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__bytecount_x_impl(size)) _Post_valid_impl_) -#define _Deref_post_opt_bytecount_x_(size) _SAL1_1_Source_(_Deref_post_opt_bytecount_x_, (size), _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__bytecount_x_impl(size)) _Post_valid_impl_) - -// e.g. 
void GetStrings( _Out_count_(cElems) _Deref_post_valid_ LPSTR const rgStr[], size_t cElems ); -#define _Deref_post_valid_ _SAL1_1_Source_(_Deref_post_valid_, (), _Deref_post1_impl_(__notnull_impl_notref) _Post_valid_impl_) -#define _Deref_post_opt_valid_ _SAL1_1_Source_(_Deref_post_opt_valid_, (), _Deref_post1_impl_(__maybenull_impl_notref) _Post_valid_impl_) - -#define _Deref_post_notnull_ _SAL1_1_Source_(_Deref_post_notnull_, (), _Deref_post1_impl_(__notnull_impl_notref)) -#define _Deref_post_maybenull_ _SAL1_1_Source_(_Deref_post_maybenull_, (), _Deref_post1_impl_(__maybenull_impl_notref)) -#define _Deref_post_null_ _SAL1_1_Source_(_Deref_post_null_, (), _Deref_post1_impl_(__null_impl_notref)) - -// -// _Deref_ret_ --- -// - -#define _Deref_ret_z_ _SAL1_1_Source_(_Deref_ret_z_, (), _Deref_ret1_impl_(__notnull_impl_notref) _Deref_ret1_impl_(__zterm_impl)) -#define _Deref_ret_opt_z_ _SAL1_1_Source_(_Deref_ret_opt_z_, (), _Deref_ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__zterm_impl)) - -// -// special _Deref_ --- -// -#define _Deref2_pre_readonly_ _SAL1_1_Source_(_Deref2_pre_readonly_, (), _Deref2_pre1_impl_(__readaccess_impl_notref)) - -// -// _Ret_ --- -// - -// e.g. _Ret_opt_valid_ LPSTR void* CloneSTR( _Pre_valid_ LPSTR src ); -#define _Ret_opt_valid_ _SAL1_1_Source_(_Ret_opt_valid_, (), _Ret1_impl_(__maybenull_impl_notref) _Ret_valid_impl_) -#define _Ret_opt_z_ _SAL1_1_Source_(_Ret_opt_z_, (), _Ret2_impl_(__maybenull_impl,__zterm_impl) _Ret_valid_impl_) - -// e.g. _Ret_opt_bytecap_(cb) void* AllocateMemory( size_t cb ); -// Buffer capacity is described by another parameter -#define _Ret_cap_(size) _SAL1_1_Source_(_Ret_cap_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__cap_impl(size))) -#define _Ret_opt_cap_(size) _SAL1_1_Source_(_Ret_opt_cap_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__cap_impl(size))) -#define _Ret_bytecap_(size) _SAL1_1_Source_(_Ret_bytecap_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__bytecap_impl(size))) -#define _Ret_opt_bytecap_(size) _SAL1_1_Source_(_Ret_opt_bytecap_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__bytecap_impl(size))) - -// Buffer capacity is described by a constant expression -#define _Ret_cap_c_(size) _SAL1_1_Source_(_Ret_cap_c_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__cap_c_impl(size))) -#define _Ret_opt_cap_c_(size) _SAL1_1_Source_(_Ret_opt_cap_c_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__cap_c_impl(size))) -#define _Ret_bytecap_c_(size) _SAL1_1_Source_(_Ret_bytecap_c_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__bytecap_c_impl(size))) -#define _Ret_opt_bytecap_c_(size) _SAL1_1_Source_(_Ret_opt_bytecap_c_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__bytecap_c_impl(size))) - -// Buffer capacity is described by a complex condition -#define _Ret_cap_x_(size) _SAL1_1_Source_(_Ret_cap_x_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__cap_x_impl(size))) -#define _Ret_opt_cap_x_(size) _SAL1_1_Source_(_Ret_opt_cap_x_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__cap_x_impl(size))) -#define _Ret_bytecap_x_(size) _SAL1_1_Source_(_Ret_bytecap_x_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__bytecap_x_impl(size))) -#define _Ret_opt_bytecap_x_(size) _SAL1_1_Source_(_Ret_opt_bytecap_x_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__bytecap_x_impl(size))) - -// return value is nullterminated and capacity is given by another parameter -#define _Ret_z_cap_(size) 
_SAL1_1_Source_(_Ret_z_cap_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret2_impl_(__zterm_impl,__cap_impl(size)) _Ret_valid_impl_) -#define _Ret_opt_z_cap_(size) _SAL1_1_Source_(_Ret_opt_z_cap_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret2_impl_(__zterm_impl,__cap_impl(size)) _Ret_valid_impl_) -#define _Ret_z_bytecap_(size) _SAL1_1_Source_(_Ret_z_bytecap_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret2_impl_(__zterm_impl,__bytecap_impl(size)) _Ret_valid_impl_) -#define _Ret_opt_z_bytecap_(size) _SAL1_1_Source_(_Ret_opt_z_bytecap_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret2_impl_(__zterm_impl,__bytecap_impl(size)) _Ret_valid_impl_) - -// e.g. _Ret_opt_bytecount_(cb) void* AllocateZeroInitializedMemory( size_t cb ); -// Valid Buffer extent is described by another parameter -#define _Ret_count_(size) _SAL1_1_Source_(_Ret_count_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__count_impl(size)) _Ret_valid_impl_) -#define _Ret_opt_count_(size) _SAL1_1_Source_(_Ret_opt_count_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__count_impl(size)) _Ret_valid_impl_) -#define _Ret_bytecount_(size) _SAL1_1_Source_(_Ret_bytecount_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__bytecount_impl(size)) _Ret_valid_impl_) -#define _Ret_opt_bytecount_(size) _SAL1_1_Source_(_Ret_opt_bytecount_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__bytecount_impl(size)) _Ret_valid_impl_) - -// Valid Buffer extent is described by a constant expression -#define _Ret_count_c_(size) _SAL1_1_Source_(_Ret_count_c_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__count_c_impl(size)) _Ret_valid_impl_) -#define _Ret_opt_count_c_(size) _SAL1_1_Source_(_Ret_opt_count_c_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__count_c_impl(size)) _Ret_valid_impl_) -#define _Ret_bytecount_c_(size) _SAL1_1_Source_(_Ret_bytecount_c_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__bytecount_c_impl(size)) _Ret_valid_impl_) -#define _Ret_opt_bytecount_c_(size) _SAL1_1_Source_(_Ret_opt_bytecount_c_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__bytecount_c_impl(size)) _Ret_valid_impl_) - -// Valid Buffer extent is described by a complex expression -#define _Ret_count_x_(size) _SAL1_1_Source_(_Ret_count_x_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__count_x_impl(size)) _Ret_valid_impl_) -#define _Ret_opt_count_x_(size) _SAL1_1_Source_(_Ret_opt_count_x_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__count_x_impl(size)) _Ret_valid_impl_) -#define _Ret_bytecount_x_(size) _SAL1_1_Source_(_Ret_bytecount_x_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__bytecount_x_impl(size)) _Ret_valid_impl_) -#define _Ret_opt_bytecount_x_(size) _SAL1_1_Source_(_Ret_opt_bytecount_x_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__bytecount_x_impl(size)) _Ret_valid_impl_) - -// return value is nullterminated and length is given by another parameter -#define _Ret_z_count_(size) _SAL1_1_Source_(_Ret_z_count_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret2_impl_(__zterm_impl,__count_impl(size)) _Ret_valid_impl_) -#define _Ret_opt_z_count_(size) _SAL1_1_Source_(_Ret_opt_z_count_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret2_impl_(__zterm_impl,__count_impl(size)) _Ret_valid_impl_) -#define _Ret_z_bytecount_(size) _SAL1_1_Source_(_Ret_z_bytecount_, (size), _Ret1_impl_(__notnull_impl_notref) _Ret2_impl_(__zterm_impl,__bytecount_impl(size)) _Ret_valid_impl_) -#define _Ret_opt_z_bytecount_(size) 
_SAL1_1_Source_(_Ret_opt_z_bytecount_, (size), _Ret1_impl_(__maybenull_impl_notref) _Ret2_impl_(__zterm_impl,__bytecount_impl(size)) _Ret_valid_impl_) - - -// _Pre_ annotations --- -#define _Pre_opt_z_ _SAL1_1_Source_(_Pre_opt_z_, (), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__zterm_impl) _Pre_valid_impl_) - -// restrict access rights -#define _Pre_readonly_ _SAL1_1_Source_(_Pre_readonly_, (), _Pre1_impl_(__readaccess_impl_notref)) -#define _Pre_writeonly_ _SAL1_1_Source_(_Pre_writeonly_, (), _Pre1_impl_(__writeaccess_impl_notref)) - -// e.g. void FreeMemory( _Pre_bytecap_(cb) _Post_ptr_invalid_ void* pv, size_t cb ); -// buffer capacity described by another parameter -#define _Pre_cap_(size) _SAL1_1_Source_(_Pre_cap_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_impl(size))) -#define _Pre_opt_cap_(size) _SAL1_1_Source_(_Pre_opt_cap_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_impl(size))) -#define _Pre_bytecap_(size) _SAL1_1_Source_(_Pre_bytecap_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__bytecap_impl(size))) -#define _Pre_opt_bytecap_(size) _SAL1_1_Source_(_Pre_opt_bytecap_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__bytecap_impl(size))) - -// buffer capacity described by a constant expression -#define _Pre_cap_c_(size) _SAL1_1_Source_(_Pre_cap_c_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_c_impl(size))) -#define _Pre_opt_cap_c_(size) _SAL1_1_Source_(_Pre_opt_cap_c_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_c_impl(size))) -#define _Pre_bytecap_c_(size) _SAL1_1_Source_(_Pre_bytecap_c_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__bytecap_c_impl(size))) -#define _Pre_opt_bytecap_c_(size) _SAL1_1_Source_(_Pre_opt_bytecap_c_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__bytecap_c_impl(size))) -#define _Pre_cap_c_one_ _SAL1_1_Source_(_Pre_cap_c_one_, (), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_c_one_notref_impl)) -#define _Pre_opt_cap_c_one_ _SAL1_1_Source_(_Pre_opt_cap_c_one_, (), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_c_one_notref_impl)) - -// buffer capacity is described by another parameter multiplied by a constant expression -#define _Pre_cap_m_(mult,size) _SAL1_1_Source_(_Pre_cap_m_, (mult,size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__mult_impl(mult,size))) -#define _Pre_opt_cap_m_(mult,size) _SAL1_1_Source_(_Pre_opt_cap_m_, (mult,size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__mult_impl(mult,size))) - -// buffer capacity described by size of other buffer, only used by dangerous legacy APIs -// e.g. 
int strcpy(_Pre_cap_for_(src) char* dst, const char* src); -#define _Pre_cap_for_(param) _SAL1_1_Source_(_Pre_cap_for_, (param), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_for_impl(param))) -#define _Pre_opt_cap_for_(param) _SAL1_1_Source_(_Pre_opt_cap_for_, (param), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_for_impl(param))) - -// buffer capacity described by a complex condition -#define _Pre_cap_x_(size) _SAL1_1_Source_(_Pre_cap_x_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_x_impl(size))) -#define _Pre_opt_cap_x_(size) _SAL1_1_Source_(_Pre_opt_cap_x_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_x_impl(size))) -#define _Pre_bytecap_x_(size) _SAL1_1_Source_(_Pre_bytecap_x_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__bytecap_x_impl(size))) -#define _Pre_opt_bytecap_x_(size) _SAL1_1_Source_(_Pre_opt_bytecap_x_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__bytecap_x_impl(size))) - -// buffer capacity described by the difference to another pointer parameter -#define _Pre_ptrdiff_cap_(ptr) _SAL1_1_Source_(_Pre_ptrdiff_cap_, (ptr), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_x_impl(__ptrdiff(ptr)))) -#define _Pre_opt_ptrdiff_cap_(ptr) _SAL1_1_Source_(_Pre_opt_ptrdiff_cap_, (ptr), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_x_impl(__ptrdiff(ptr)))) - -// e.g. void AppendStr( _Pre_z_ const char* szFrom, _Pre_z_cap_(cchTo) _Post_z_ char* szTo, size_t cchTo ); -#define _Pre_z_cap_(size) _SAL1_1_Source_(_Pre_z_cap_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre2_impl_(__zterm_impl,__cap_impl(size)) _Pre_valid_impl_) -#define _Pre_opt_z_cap_(size) _SAL1_1_Source_(_Pre_opt_z_cap_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre2_impl_(__zterm_impl,__cap_impl(size)) _Pre_valid_impl_) -#define _Pre_z_bytecap_(size) _SAL1_1_Source_(_Pre_z_bytecap_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre2_impl_(__zterm_impl,__bytecap_impl(size)) _Pre_valid_impl_) -#define _Pre_opt_z_bytecap_(size) _SAL1_1_Source_(_Pre_opt_z_bytecap_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre2_impl_(__zterm_impl,__bytecap_impl(size)) _Pre_valid_impl_) - -#define _Pre_z_cap_c_(size) _SAL1_1_Source_(_Pre_z_cap_c_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre2_impl_(__zterm_impl,__cap_c_impl(size)) _Pre_valid_impl_) -#define _Pre_opt_z_cap_c_(size) _SAL1_1_Source_(_Pre_opt_z_cap_c_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre2_impl_(__zterm_impl,__cap_c_impl(size)) _Pre_valid_impl_) -#define _Pre_z_bytecap_c_(size) _SAL1_1_Source_(_Pre_z_bytecap_c_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre2_impl_(__zterm_impl,__bytecap_c_impl(size)) _Pre_valid_impl_) -#define _Pre_opt_z_bytecap_c_(size) _SAL1_1_Source_(_Pre_opt_z_bytecap_c_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre2_impl_(__zterm_impl,__bytecap_c_impl(size)) _Pre_valid_impl_) - -#define _Pre_z_cap_x_(size) _SAL1_1_Source_(_Pre_z_cap_x_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre2_impl_(__zterm_impl,__cap_x_impl(size)) _Pre_valid_impl_) -#define _Pre_opt_z_cap_x_(size) _SAL1_1_Source_(_Pre_opt_z_cap_x_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre2_impl_(__zterm_impl,__cap_x_impl(size)) _Pre_valid_impl_) -#define _Pre_z_bytecap_x_(size) _SAL1_1_Source_(_Pre_z_bytecap_x_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre2_impl_(__zterm_impl,__bytecap_x_impl(size)) _Pre_valid_impl_) -#define _Pre_opt_z_bytecap_x_(size) _SAL1_1_Source_(_Pre_opt_z_bytecap_x_, (size), _Pre1_impl_(__maybenull_impl_notref) 
_Pre2_impl_(__zterm_impl,__bytecap_x_impl(size)) _Pre_valid_impl_) - -// known capacity and valid but unknown readable extent -#define _Pre_valid_cap_(size) _SAL1_1_Source_(_Pre_valid_cap_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_impl(size)) _Pre_valid_impl_) -#define _Pre_opt_valid_cap_(size) _SAL1_1_Source_(_Pre_opt_valid_cap_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_impl(size)) _Pre_valid_impl_) -#define _Pre_valid_bytecap_(size) _SAL1_1_Source_(_Pre_valid_bytecap_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__bytecap_impl(size)) _Pre_valid_impl_) -#define _Pre_opt_valid_bytecap_(size) _SAL1_1_Source_(_Pre_opt_valid_bytecap_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__bytecap_impl(size)) _Pre_valid_impl_) - -#define _Pre_valid_cap_c_(size) _SAL1_1_Source_(_Pre_valid_cap_c_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_c_impl(size)) _Pre_valid_impl_) -#define _Pre_opt_valid_cap_c_(size) _SAL1_1_Source_(_Pre_opt_valid_cap_c_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_c_impl(size)) _Pre_valid_impl_) -#define _Pre_valid_bytecap_c_(size) _SAL1_1_Source_(_Pre_valid_bytecap_c_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__bytecap_c_impl(size)) _Pre_valid_impl_) -#define _Pre_opt_valid_bytecap_c_(size) _SAL1_1_Source_(_Pre_opt_valid_bytecap_c_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__bytecap_c_impl(size)) _Pre_valid_impl_) - -#define _Pre_valid_cap_x_(size) _SAL1_1_Source_(_Pre_valid_cap_x_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_x_impl(size)) _Pre_valid_impl_) -#define _Pre_opt_valid_cap_x_(size) _SAL1_1_Source_(_Pre_opt_valid_cap_x_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_x_impl(size)) _Pre_valid_impl_) -#define _Pre_valid_bytecap_x_(size) _SAL1_1_Source_(_Pre_valid_bytecap_x_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__bytecap_x_impl(size)) _Pre_valid_impl_) -#define _Pre_opt_valid_bytecap_x_(size) _SAL1_1_Source_(_Pre_opt_valid_bytecap_x_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__bytecap_x_impl(size)) _Pre_valid_impl_) - -// e.g. 
void AppendCharRange( _Pre_count_(cchFrom) const char* rgFrom, size_t cchFrom, _Out_z_cap_(cchTo) char* szTo, size_t cchTo ); -// Valid buffer extent described by another parameter -#define _Pre_count_(size) _SAL1_1_Source_(_Pre_count_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__count_impl(size)) _Pre_valid_impl_) -#define _Pre_opt_count_(size) _SAL1_1_Source_(_Pre_opt_count_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__count_impl(size)) _Pre_valid_impl_) -#define _Pre_bytecount_(size) _SAL1_1_Source_(_Pre_bytecount_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__bytecount_impl(size)) _Pre_valid_impl_) -#define _Pre_opt_bytecount_(size) _SAL1_1_Source_(_Pre_opt_bytecount_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__bytecount_impl(size)) _Pre_valid_impl_) - -// Valid buffer extent described by a constant expression -#define _Pre_count_c_(size) _SAL1_1_Source_(_Pre_count_c_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__count_c_impl(size)) _Pre_valid_impl_) -#define _Pre_opt_count_c_(size) _SAL1_1_Source_(_Pre_opt_count_c_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__count_c_impl(size)) _Pre_valid_impl_) -#define _Pre_bytecount_c_(size) _SAL1_1_Source_(_Pre_bytecount_c_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__bytecount_c_impl(size)) _Pre_valid_impl_) -#define _Pre_opt_bytecount_c_(size) _SAL1_1_Source_(_Pre_opt_bytecount_c_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__bytecount_c_impl(size)) _Pre_valid_impl_) - -// Valid buffer extent described by a complex expression -#define _Pre_count_x_(size) _SAL1_1_Source_(_Pre_count_x_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__count_x_impl(size)) _Pre_valid_impl_) -#define _Pre_opt_count_x_(size) _SAL1_1_Source_(_Pre_opt_count_x_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__count_x_impl(size)) _Pre_valid_impl_) -#define _Pre_bytecount_x_(size) _SAL1_1_Source_(_Pre_bytecount_x_, (size), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__bytecount_x_impl(size)) _Pre_valid_impl_) -#define _Pre_opt_bytecount_x_(size) _SAL1_1_Source_(_Pre_opt_bytecount_x_, (size), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__bytecount_x_impl(size)) _Pre_valid_impl_) - -// Valid buffer extent described by the difference to another pointer parameter -#define _Pre_ptrdiff_count_(ptr) _SAL1_1_Source_(_Pre_ptrdiff_count_, (ptr), _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__count_x_impl(__ptrdiff(ptr))) _Pre_valid_impl_) -#define _Pre_opt_ptrdiff_count_(ptr) _SAL1_1_Source_(_Pre_opt_ptrdiff_count_, (ptr), _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__count_x_impl(__ptrdiff(ptr))) _Pre_valid_impl_) - - -// char * strncpy(_Out_cap_(_Count) _Post_maybez_ char * _Dest, _In_z_ const char * _Source, _In_ size_t _Count) -// buffer maybe zero-terminated after the call -#define _Post_maybez_ _SAL1_1_Source_(_Post_maybez_, (), _Post1_impl_(__maybezterm_impl)) - -// e.g. SIZE_T HeapSize( _In_ HANDLE hHeap, DWORD dwFlags, _Pre_notnull_ _Post_bytecap_(return) LPCVOID lpMem ); -#define _Post_cap_(size) _SAL1_1_Source_(_Post_cap_, (size), _Post1_impl_(__cap_impl(size))) -#define _Post_bytecap_(size) _SAL1_1_Source_(_Post_bytecap_, (size), _Post1_impl_(__bytecap_impl(size))) - -// e.g. 
int strlen( _In_z_ _Post_count_(return+1) const char* sz ); -#define _Post_count_(size) _SAL1_1_Source_(_Post_count_, (size), _Post1_impl_(__count_impl(size)) _Post_valid_impl_) -#define _Post_bytecount_(size) _SAL1_1_Source_(_Post_bytecount_, (size), _Post1_impl_(__bytecount_impl(size)) _Post_valid_impl_) -#define _Post_count_c_(size) _SAL1_1_Source_(_Post_count_c_, (size), _Post1_impl_(__count_c_impl(size)) _Post_valid_impl_) -#define _Post_bytecount_c_(size) _SAL1_1_Source_(_Post_bytecount_c_, (size), _Post1_impl_(__bytecount_c_impl(size)) _Post_valid_impl_) -#define _Post_count_x_(size) _SAL1_1_Source_(_Post_count_x_, (size), _Post1_impl_(__count_x_impl(size)) _Post_valid_impl_) -#define _Post_bytecount_x_(size) _SAL1_1_Source_(_Post_bytecount_x_, (size), _Post1_impl_(__bytecount_x_impl(size)) _Post_valid_impl_) - -// e.g. size_t CopyStr( _In_z_ const char* szFrom, _Pre_cap_(cch) _Post_z_count_(return+1) char* szFrom, size_t cchFrom ); -#define _Post_z_count_(size) _SAL1_1_Source_(_Post_z_count_, (size), _Post2_impl_(__zterm_impl,__count_impl(size)) _Post_valid_impl_) -#define _Post_z_bytecount_(size) _SAL1_1_Source_(_Post_z_bytecount_, (size), _Post2_impl_(__zterm_impl,__bytecount_impl(size)) _Post_valid_impl_) -#define _Post_z_count_c_(size) _SAL1_1_Source_(_Post_z_count_c_, (size), _Post2_impl_(__zterm_impl,__count_c_impl(size)) _Post_valid_impl_) -#define _Post_z_bytecount_c_(size) _SAL1_1_Source_(_Post_z_bytecount_c_, (size), _Post2_impl_(__zterm_impl,__bytecount_c_impl(size)) _Post_valid_impl_) -#define _Post_z_count_x_(size) _SAL1_1_Source_(_Post_z_count_x_, (size), _Post2_impl_(__zterm_impl,__count_x_impl(size)) _Post_valid_impl_) -#define _Post_z_bytecount_x_(size) _SAL1_1_Source_(_Post_z_bytecount_x_, (size), _Post2_impl_(__zterm_impl,__bytecount_x_impl(size)) _Post_valid_impl_) - -// -// _Prepost_ --- -// -// describing conditions that hold before and after the function call - -#define _Prepost_opt_z_ _SAL1_1_Source_(_Prepost_opt_z_, (), _Pre_opt_z_ _Post_z_) - -#define _Prepost_count_(size) _SAL1_1_Source_(_Prepost_count_, (size), _Pre_count_(size) _Post_count_(size)) -#define _Prepost_opt_count_(size) _SAL1_1_Source_(_Prepost_opt_count_, (size), _Pre_opt_count_(size) _Post_count_(size)) -#define _Prepost_bytecount_(size) _SAL1_1_Source_(_Prepost_bytecount_, (size), _Pre_bytecount_(size) _Post_bytecount_(size)) -#define _Prepost_opt_bytecount_(size) _SAL1_1_Source_(_Prepost_opt_bytecount_, (size), _Pre_opt_bytecount_(size) _Post_bytecount_(size)) -#define _Prepost_count_c_(size) _SAL1_1_Source_(_Prepost_count_c_, (size), _Pre_count_c_(size) _Post_count_c_(size)) -#define _Prepost_opt_count_c_(size) _SAL1_1_Source_(_Prepost_opt_count_c_, (size), _Pre_opt_count_c_(size) _Post_count_c_(size)) -#define _Prepost_bytecount_c_(size) _SAL1_1_Source_(_Prepost_bytecount_c_, (size), _Pre_bytecount_c_(size) _Post_bytecount_c_(size)) -#define _Prepost_opt_bytecount_c_(size) _SAL1_1_Source_(_Prepost_opt_bytecount_c_, (size), _Pre_opt_bytecount_c_(size) _Post_bytecount_c_(size)) -#define _Prepost_count_x_(size) _SAL1_1_Source_(_Prepost_count_x_, (size), _Pre_count_x_(size) _Post_count_x_(size)) -#define _Prepost_opt_count_x_(size) _SAL1_1_Source_(_Prepost_opt_count_x_, (size), _Pre_opt_count_x_(size) _Post_count_x_(size)) -#define _Prepost_bytecount_x_(size) _SAL1_1_Source_(_Prepost_bytecount_x_, (size), _Pre_bytecount_x_(size) _Post_bytecount_x_(size)) -#define _Prepost_opt_bytecount_x_(size) _SAL1_1_Source_(_Prepost_opt_bytecount_x_, (size), _Pre_opt_bytecount_x_(size) 
_Post_bytecount_x_(size)) - -#define _Prepost_valid_ _SAL1_1_Source_(_Prepost_valid_, (), _Pre_valid_ _Post_valid_) -#define _Prepost_opt_valid_ _SAL1_1_Source_(_Prepost_opt_valid_, (), _Pre_opt_valid_ _Post_valid_) - -// -// _Deref_ --- -// -// short version for _Deref_pre_ _Deref_post_ -// describing conditions for array elements or dereferenced pointer parameters that hold before and after the call - -#define _Deref_prepost_z_ _SAL1_1_Source_(_Deref_prepost_z_, (), _Deref_pre_z_ _Deref_post_z_) -#define _Deref_prepost_opt_z_ _SAL1_1_Source_(_Deref_prepost_opt_z_, (), _Deref_pre_opt_z_ _Deref_post_opt_z_) - -#define _Deref_prepost_cap_(size) _SAL1_1_Source_(_Deref_prepost_cap_, (size), _Deref_pre_cap_(size) _Deref_post_cap_(size)) -#define _Deref_prepost_opt_cap_(size) _SAL1_1_Source_(_Deref_prepost_opt_cap_, (size), _Deref_pre_opt_cap_(size) _Deref_post_opt_cap_(size)) -#define _Deref_prepost_bytecap_(size) _SAL1_1_Source_(_Deref_prepost_bytecap_, (size), _Deref_pre_bytecap_(size) _Deref_post_bytecap_(size)) -#define _Deref_prepost_opt_bytecap_(size) _SAL1_1_Source_(_Deref_prepost_opt_bytecap_, (size), _Deref_pre_opt_bytecap_(size) _Deref_post_opt_bytecap_(size)) - -#define _Deref_prepost_cap_x_(size) _SAL1_1_Source_(_Deref_prepost_cap_x_, (size), _Deref_pre_cap_x_(size) _Deref_post_cap_x_(size)) -#define _Deref_prepost_opt_cap_x_(size) _SAL1_1_Source_(_Deref_prepost_opt_cap_x_, (size), _Deref_pre_opt_cap_x_(size) _Deref_post_opt_cap_x_(size)) -#define _Deref_prepost_bytecap_x_(size) _SAL1_1_Source_(_Deref_prepost_bytecap_x_, (size), _Deref_pre_bytecap_x_(size) _Deref_post_bytecap_x_(size)) -#define _Deref_prepost_opt_bytecap_x_(size) _SAL1_1_Source_(_Deref_prepost_opt_bytecap_x_, (size), _Deref_pre_opt_bytecap_x_(size) _Deref_post_opt_bytecap_x_(size)) - -#define _Deref_prepost_z_cap_(size) _SAL1_1_Source_(_Deref_prepost_z_cap_, (size), _Deref_pre_z_cap_(size) _Deref_post_z_cap_(size)) -#define _Deref_prepost_opt_z_cap_(size) _SAL1_1_Source_(_Deref_prepost_opt_z_cap_, (size), _Deref_pre_opt_z_cap_(size) _Deref_post_opt_z_cap_(size)) -#define _Deref_prepost_z_bytecap_(size) _SAL1_1_Source_(_Deref_prepost_z_bytecap_, (size), _Deref_pre_z_bytecap_(size) _Deref_post_z_bytecap_(size)) -#define _Deref_prepost_opt_z_bytecap_(size) _SAL1_1_Source_(_Deref_prepost_opt_z_bytecap_, (size), _Deref_pre_opt_z_bytecap_(size) _Deref_post_opt_z_bytecap_(size)) - -#define _Deref_prepost_valid_cap_(size) _SAL1_1_Source_(_Deref_prepost_valid_cap_, (size), _Deref_pre_valid_cap_(size) _Deref_post_valid_cap_(size)) -#define _Deref_prepost_opt_valid_cap_(size) _SAL1_1_Source_(_Deref_prepost_opt_valid_cap_, (size), _Deref_pre_opt_valid_cap_(size) _Deref_post_opt_valid_cap_(size)) -#define _Deref_prepost_valid_bytecap_(size) _SAL1_1_Source_(_Deref_prepost_valid_bytecap_, (size), _Deref_pre_valid_bytecap_(size) _Deref_post_valid_bytecap_(size)) -#define _Deref_prepost_opt_valid_bytecap_(size) _SAL1_1_Source_(_Deref_prepost_opt_valid_bytecap_, (size), _Deref_pre_opt_valid_bytecap_(size) _Deref_post_opt_valid_bytecap_(size)) - -#define _Deref_prepost_valid_cap_x_(size) _SAL1_1_Source_(_Deref_prepost_valid_cap_x_, (size), _Deref_pre_valid_cap_x_(size) _Deref_post_valid_cap_x_(size)) -#define _Deref_prepost_opt_valid_cap_x_(size) _SAL1_1_Source_(_Deref_prepost_opt_valid_cap_x_, (size), _Deref_pre_opt_valid_cap_x_(size) _Deref_post_opt_valid_cap_x_(size)) -#define _Deref_prepost_valid_bytecap_x_(size) _SAL1_1_Source_(_Deref_prepost_valid_bytecap_x_, (size), _Deref_pre_valid_bytecap_x_(size) 
_Deref_post_valid_bytecap_x_(size)) -#define _Deref_prepost_opt_valid_bytecap_x_(size) _SAL1_1_Source_(_Deref_prepost_opt_valid_bytecap_x_, (size), _Deref_pre_opt_valid_bytecap_x_(size) _Deref_post_opt_valid_bytecap_x_(size)) - -#define _Deref_prepost_count_(size) _SAL1_1_Source_(_Deref_prepost_count_, (size), _Deref_pre_count_(size) _Deref_post_count_(size)) -#define _Deref_prepost_opt_count_(size) _SAL1_1_Source_(_Deref_prepost_opt_count_, (size), _Deref_pre_opt_count_(size) _Deref_post_opt_count_(size)) -#define _Deref_prepost_bytecount_(size) _SAL1_1_Source_(_Deref_prepost_bytecount_, (size), _Deref_pre_bytecount_(size) _Deref_post_bytecount_(size)) -#define _Deref_prepost_opt_bytecount_(size) _SAL1_1_Source_(_Deref_prepost_opt_bytecount_, (size), _Deref_pre_opt_bytecount_(size) _Deref_post_opt_bytecount_(size)) - -#define _Deref_prepost_count_x_(size) _SAL1_1_Source_(_Deref_prepost_count_x_, (size), _Deref_pre_count_x_(size) _Deref_post_count_x_(size)) -#define _Deref_prepost_opt_count_x_(size) _SAL1_1_Source_(_Deref_prepost_opt_count_x_, (size), _Deref_pre_opt_count_x_(size) _Deref_post_opt_count_x_(size)) -#define _Deref_prepost_bytecount_x_(size) _SAL1_1_Source_(_Deref_prepost_bytecount_x_, (size), _Deref_pre_bytecount_x_(size) _Deref_post_bytecount_x_(size)) -#define _Deref_prepost_opt_bytecount_x_(size) _SAL1_1_Source_(_Deref_prepost_opt_bytecount_x_, (size), _Deref_pre_opt_bytecount_x_(size) _Deref_post_opt_bytecount_x_(size)) - -#define _Deref_prepost_valid_ _SAL1_1_Source_(_Deref_prepost_valid_, (), _Deref_pre_valid_ _Deref_post_valid_) -#define _Deref_prepost_opt_valid_ _SAL1_1_Source_(_Deref_prepost_opt_valid_, (), _Deref_pre_opt_valid_ _Deref_post_opt_valid_) - -// -// _Deref_ -// -// used with references to arrays - -#define _Deref_out_z_cap_c_(size) _SAL1_1_Source_(_Deref_out_z_cap_c_, (size), _Deref_pre_cap_c_(size) _Deref_post_z_) -#define _Deref_inout_z_cap_c_(size) _SAL1_1_Source_(_Deref_inout_z_cap_c_, (size), _Deref_pre_z_cap_c_(size) _Deref_post_z_) -#define _Deref_out_z_bytecap_c_(size) _SAL1_1_Source_(_Deref_out_z_bytecap_c_, (size), _Deref_pre_bytecap_c_(size) _Deref_post_z_) -#define _Deref_inout_z_bytecap_c_(size) _SAL1_1_Source_(_Deref_inout_z_bytecap_c_, (size), _Deref_pre_z_bytecap_c_(size) _Deref_post_z_) -#define _Deref_inout_z_ _SAL1_1_Source_(_Deref_inout_z_, (), _Deref_prepost_z_) - -// #pragma endregion Input Buffer SAL 1 compatibility macros - - -//============================================================================ -// Implementation Layer: -//============================================================================ - - -// Naming conventions: -// A symbol the begins with _SA_ is for the machinery of creating any -// annotations; many of those come from sourceannotations.h in the case -// of attributes. - -// A symbol that ends with _impl is the very lowest level macro. It is -// not required to be a legal standalone annotation, and in the case -// of attribute annotations, usually is not. (In the case of some declspec -// annotations, it might be, but it should not be assumed so.) Those -// symols will be used in the _PreN..., _PostN... and _RetN... annotations -// to build up more complete annotations. - -// A symbol ending in _impl_ is reserved to the implementation as well, -// but it does form a complete annotation; usually they are used to build -// up even higher level annotations. 
- - -#if _USE_ATTRIBUTES_FOR_SAL || _USE_DECLSPECS_FOR_SAL // [ -// Sharable "_impl" macros: these can be shared between the various annotation -// forms but are part of the implementation of the macros. These are collected -// here to assure that only necessary differences in the annotations -// exist. - -#define _Always_impl_(annos) _Group_(annos _SAL_nop_impl_) _On_failure_impl_(annos _SAL_nop_impl_) -#define _Bound_impl_ _SA_annotes0(SAL_bound) -#define _Field_range_impl_(min,max) _Range_impl_(min,max) -#define _Literal_impl_ _SA_annotes1(SAL_constant, __yes) -#define _Maybenull_impl_ _SA_annotes1(SAL_null, __maybe) -#define _Maybevalid_impl_ _SA_annotes1(SAL_valid, __maybe) -#define _Must_inspect_impl_ _Post_impl_ _SA_annotes0(SAL_mustInspect) -#define _Notliteral_impl_ _SA_annotes1(SAL_constant, __no) -#define _Notnull_impl_ _SA_annotes1(SAL_null, __no) -#define _Notvalid_impl_ _SA_annotes1(SAL_valid, __no) -#define _NullNull_terminated_impl_ _Group_(_SA_annotes1(SAL_nullTerminated, __yes) _SA_annotes1(SAL_readableTo,inexpressibleCount("NullNull terminated string"))) -#define _Null_impl_ _SA_annotes1(SAL_null, __yes) -#define _Null_terminated_impl_ _SA_annotes1(SAL_nullTerminated, __yes) -#define _Out_impl_ _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_c_one_notref_impl) _Post_valid_impl_ -#define _Out_opt_impl_ _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_c_one_notref_impl) _Post_valid_impl_ -#define _Points_to_data_impl_ _At_(*_Curr_, _SA_annotes1(SAL_mayBePointer, __no)) -#define _Post_satisfies_impl_(cond) _Post_impl_ _Satisfies_impl_(cond) -#define _Post_valid_impl_ _Post1_impl_(__valid_impl) -#define _Pre_satisfies_impl_(cond) _Pre_impl_ _Satisfies_impl_(cond) -#define _Pre_valid_impl_ _Pre1_impl_(__valid_impl) -#define _Range_impl_(min,max) _SA_annotes2(SAL_range, min, max) -#define _Readable_bytes_impl_(size) _SA_annotes1(SAL_readableTo, byteCount(size)) -#define _Readable_elements_impl_(size) _SA_annotes1(SAL_readableTo, elementCount(size)) -#define _Ret_valid_impl_ _Ret1_impl_(__valid_impl) -#define _Satisfies_impl_(cond) _SA_annotes1(SAL_satisfies, cond) -#define _Valid_impl_ _SA_annotes1(SAL_valid, __yes) -#define _Writable_bytes_impl_(size) _SA_annotes1(SAL_writableTo, byteCount(size)) -#define _Writable_elements_impl_(size) _SA_annotes1(SAL_writableTo, elementCount(size)) - -#define _In_range_impl_(min,max) _Pre_impl_ _Range_impl_(min,max) -#define _Out_range_impl_(min,max) _Post_impl_ _Range_impl_(min,max) -#define _Ret_range_impl_(min,max) _Post_impl_ _Range_impl_(min,max) -#define _Deref_in_range_impl_(min,max) _Deref_pre_impl_ _Range_impl_(min,max) -#define _Deref_out_range_impl_(min,max) _Deref_post_impl_ _Range_impl_(min,max) -#define _Deref_ret_range_impl_(min,max) _Deref_post_impl_ _Range_impl_(min,max) - -#define _Deref_pre_impl_ _Pre_impl_ _Notref_impl_ _Deref_impl_ -#define _Deref_post_impl_ _Post_impl_ _Notref_impl_ _Deref_impl_ - -// The following are for the implementation machinery, and are not -// suitable for annotating general code. -// We're tying to phase this out, someday. The parser quotes the param. -#define __AuToQuOtE _SA_annotes0(SAL_AuToQuOtE) - -// Normally the parser does some simple type checking of annotation params, -// defer that check to the plugin. 
-#define __deferTypecheck _SA_annotes0(SAL_deferTypecheck) - -#define _SA_SPECSTRIZE( x ) #x -#define _SAL_nop_impl_ /* nothing */ -#define __nop_impl(x) x -#endif - - -#if _USE_ATTRIBUTES_FOR_SAL // [ - -// Using attributes for sal - -#include "codeanalysis\sourceannotations.h" - - -#define _SA_annotes0(n) [SAL_annotes(Name=#n)] -#define _SA_annotes1(n,pp1) [SAL_annotes(Name=#n, p1=_SA_SPECSTRIZE(pp1))] -#define _SA_annotes2(n,pp1,pp2) [SAL_annotes(Name=#n, p1=_SA_SPECSTRIZE(pp1), p2=_SA_SPECSTRIZE(pp2))] -#define _SA_annotes3(n,pp1,pp2,pp3) [SAL_annotes(Name=#n, p1=_SA_SPECSTRIZE(pp1), p2=_SA_SPECSTRIZE(pp2), p3=_SA_SPECSTRIZE(pp3))] - -#define _Pre_impl_ [SAL_pre] -#define _Post_impl_ [SAL_post] -#define _Deref_impl_ [SAL_deref] -#define _Notref_impl_ [SAL_notref] - - -// Declare a function to be an annotation or primop (respectively). -// Done this way so that they don't appear in the regular compiler's -// namespace. -#define __ANNOTATION(fun) _SA_annotes0(SAL_annotation) void __SA_##fun; -#define __PRIMOP(type, fun) _SA_annotes0(SAL_primop) type __SA_##fun; -#define __QUALIFIER(fun) _SA_annotes0(SAL_qualifier) void __SA_##fun; - -// Benign declspec needed here for WindowsPREfast -#define __In_impl_ [SA_Pre(Valid=SA_Yes)] [SA_Pre(Deref=1, Notref=1, Access=SA_Read)] __declspec("SAL_pre SAL_valid") - -#elif _USE_DECLSPECS_FOR_SAL // ][ - -// Using declspecs for sal - -#define _SA_annotes0(n) __declspec(#n) -#define _SA_annotes1(n,pp1) __declspec(#n "(" _SA_SPECSTRIZE(pp1) ")" ) -#define _SA_annotes2(n,pp1,pp2) __declspec(#n "(" _SA_SPECSTRIZE(pp1) "," _SA_SPECSTRIZE(pp2) ")") -#define _SA_annotes3(n,pp1,pp2,pp3) __declspec(#n "(" _SA_SPECSTRIZE(pp1) "," _SA_SPECSTRIZE(pp2) "," _SA_SPECSTRIZE(pp3) ")") - -#define _Pre_impl_ _SA_annotes0(SAL_pre) -#define _Post_impl_ _SA_annotes0(SAL_post) -#define _Deref_impl_ _SA_annotes0(SAL_deref) -#define _Notref_impl_ _SA_annotes0(SAL_notref) - -// Declare a function to be an annotation or primop (respectively). -// Done this way so that they don't appear in the regular compiler's -// namespace. -#define __ANNOTATION(fun) _SA_annotes0(SAL_annotation) void __SA_##fun - -#define __PRIMOP(type, fun) _SA_annotes0(SAL_primop) type __SA_##fun - -#define __QUALIFIER(fun) _SA_annotes0(SAL_qualifier) void __SA_##fun; - -#define __In_impl_ _Pre_impl_ _SA_annotes0(SAL_valid) _Pre_impl_ _Deref_impl_ _Notref_impl_ _SA_annotes0(SAL_readonly) - -#else // ][ - -// Using "nothing" for sal - -#define _SA_annotes0(n) -#define _SA_annotes1(n,pp1) -#define _SA_annotes2(n,pp1,pp2) -#define _SA_annotes3(n,pp1,pp2,pp3) - -#define __ANNOTATION(fun) -#define __PRIMOP(type, fun) -#define __QUALIFIER(type, fun) - -#endif // ] - -#if _USE_ATTRIBUTES_FOR_SAL || _USE_DECLSPECS_FOR_SAL // [ - -// Declare annotations that need to be declared. -__ANNOTATION(SAL_useHeader(void)); -__ANNOTATION(SAL_bound(void)); -__ANNOTATION(SAL_allocator(void)); //??? 
resolve with PFD -__ANNOTATION(SAL_file_parser(__AuToQuOtE __In_impl_ char *, __In_impl_ char *)); -__ANNOTATION(SAL_source_code_content(__In_impl_ char *)); -__ANNOTATION(SAL_analysisHint(__AuToQuOtE __In_impl_ char *)); -__ANNOTATION(SAL_untrusted_data_source(__AuToQuOtE __In_impl_ char *)); -__ANNOTATION(SAL_untrusted_data_source_this(__AuToQuOtE __In_impl_ char *)); -__ANNOTATION(SAL_validated(__AuToQuOtE __In_impl_ char *)); -__ANNOTATION(SAL_validated_this(__AuToQuOtE __In_impl_ char *)); -__ANNOTATION(SAL_encoded(void)); -__ANNOTATION(SAL_adt(__AuToQuOtE __In_impl_ char *, __AuToQuOtE __In_impl_ char *)); -__ANNOTATION(SAL_add_adt_property(__AuToQuOtE __In_impl_ char *, __AuToQuOtE __In_impl_ char *)); -__ANNOTATION(SAL_remove_adt_property(__AuToQuOtE __In_impl_ char *, __AuToQuOtE __In_impl_ char *)); -__ANNOTATION(SAL_transfer_adt_property_from(__AuToQuOtE __In_impl_ char *)); -__ANNOTATION(SAL_post_type(__AuToQuOtE __In_impl_ char *)); -__ANNOTATION(SAL_volatile(void)); -__ANNOTATION(SAL_nonvolatile(void)); -__ANNOTATION(SAL_entrypoint(__AuToQuOtE __In_impl_ char *, __AuToQuOtE __In_impl_ char *)); -__ANNOTATION(SAL_blocksOn(__In_impl_ void*)); -__ANNOTATION(SAL_mustInspect(void)); - -// Only appears in model files, but needs to be declared. -__ANNOTATION(SAL_TypeName(__AuToQuOtE __In_impl_ char *)); - -// To be declared well-known soon. -__ANNOTATION(SAL_interlocked(void);) - -#pragma warning (suppress: 28227 28241) -__ANNOTATION(SAL_name(__In_impl_ char *, __In_impl_ char *, __In_impl_ char *);) - -__PRIMOP(char *, _Macro_value_(__In_impl_ char *)); -__PRIMOP(int, _Macro_defined_(__In_impl_ char *)); -__PRIMOP(char *, _Strstr_(__In_impl_ char *, __In_impl_ char *)); - -#endif // ] - -#if _USE_ATTRIBUTES_FOR_SAL // [ - -#define _Check_return_impl_ [SA_Post(MustCheck=SA_Yes)] - -#define _Success_impl_(expr) [SA_Success(Condition=#expr)] -#define _On_failure_impl_(annos) [SAL_context(p1="SAL_failed")] _Group_(_Post_impl_ _Group_(annos _SAL_nop_impl_)) - -#define _Printf_format_string_impl_ [SA_FormatString(Style="printf")] -#define _Scanf_format_string_impl_ [SA_FormatString(Style="scanf")] -#define _Scanf_s_format_string_impl_ [SA_FormatString(Style="scanf_s")] - -#define _In_bound_impl_ [SA_PreBound(Deref=0)] -#define _Out_bound_impl_ [SA_PostBound(Deref=0)] -#define _Ret_bound_impl_ [SA_PostBound(Deref=0)] -#define _Deref_in_bound_impl_ [SA_PreBound(Deref=1)] -#define _Deref_out_bound_impl_ [SA_PostBound(Deref=1)] -#define _Deref_ret_bound_impl_ [SA_PostBound(Deref=1)] - -#define __valid_impl Valid=SA_Yes -#define __maybevalid_impl Valid=SA_Maybe -#define __notvalid_impl Valid=SA_No - -#define __null_impl Null=SA_Yes -#define __maybenull_impl Null=SA_Maybe -#define __notnull_impl Null=SA_No - -#define __null_impl_notref Null=SA_Yes,Notref=1 -#define __maybenull_impl_notref Null=SA_Maybe,Notref=1 -#define __notnull_impl_notref Null=SA_No,Notref=1 - -#define __zterm_impl NullTerminated=SA_Yes -#define __maybezterm_impl NullTerminated=SA_Maybe -#define __maybzterm_impl NullTerminated=SA_Maybe -#define __notzterm_impl NullTerminated=SA_No - -#define __readaccess_impl Access=SA_Read -#define __writeaccess_impl Access=SA_Write -#define __allaccess_impl Access=SA_ReadWrite - -#define __readaccess_impl_notref Access=SA_Read,Notref=1 -#define __writeaccess_impl_notref Access=SA_Write,Notref=1 -#define __allaccess_impl_notref Access=SA_ReadWrite,Notref=1 - -#if _MSC_VER >= 1610 /*IFSTRIP=IGN*/ // [ - -// For SAL2, we need to expect general expressions. 
- -#define __cap_impl(size) WritableElements="\n"#size -#define __bytecap_impl(size) WritableBytes="\n"#size -#define __bytecount_impl(size) ValidBytes="\n"#size -#define __count_impl(size) ValidElements="\n"#size - -#else // ][ - -#define __cap_impl(size) WritableElements=#size -#define __bytecap_impl(size) WritableBytes=#size -#define __bytecount_impl(size) ValidBytes=#size -#define __count_impl(size) ValidElements=#size - -#endif // ] - -#define __cap_c_impl(size) WritableElementsConst=size -#define __cap_c_one_notref_impl WritableElementsConst=1,Notref=1 -#define __cap_for_impl(param) WritableElementsLength=#param -#define __cap_x_impl(size) WritableElements="\n@"#size - -#define __bytecap_c_impl(size) WritableBytesConst=size -#define __bytecap_x_impl(size) WritableBytes="\n@"#size - -#define __mult_impl(mult,size) __cap_impl((mult)*(size)) - -#define __count_c_impl(size) ValidElementsConst=size -#define __count_x_impl(size) ValidElements="\n@"#size - -#define __bytecount_c_impl(size) ValidBytesConst=size -#define __bytecount_x_impl(size) ValidBytes="\n@"#size - - -#define _At_impl_(target, annos) [SAL_at(p1=#target)] _Group_(annos) -#define _At_buffer_impl_(target, iter, bound, annos) [SAL_at_buffer(p1=#target, p2=#iter, p3=#bound)] _Group_(annos) -#define _When_impl_(expr, annos) [SAL_when(p1=#expr)] _Group_(annos) - -#define _Group_impl_(annos) [SAL_begin] annos [SAL_end] -#define _GrouP_impl_(annos) [SAL_BEGIN] annos [SAL_END] - -#define _Use_decl_anno_impl_ _SA_annotes0(SAL_useHeader) // this is a special case! - -#define _Pre1_impl_(p1) [SA_Pre(p1)] -#define _Pre2_impl_(p1,p2) [SA_Pre(p1,p2)] -#define _Pre3_impl_(p1,p2,p3) [SA_Pre(p1,p2,p3)] - -#define _Post1_impl_(p1) [SA_Post(p1)] -#define _Post2_impl_(p1,p2) [SA_Post(p1,p2)] -#define _Post3_impl_(p1,p2,p3) [SA_Post(p1,p2,p3)] - -#define _Ret1_impl_(p1) [SA_Post(p1)] -#define _Ret2_impl_(p1,p2) [SA_Post(p1,p2)] -#define _Ret3_impl_(p1,p2,p3) [SA_Post(p1,p2,p3)] - -#define _Deref_pre1_impl_(p1) [SA_Pre(Deref=1,p1)] -#define _Deref_pre2_impl_(p1,p2) [SA_Pre(Deref=1,p1,p2)] -#define _Deref_pre3_impl_(p1,p2,p3) [SA_Pre(Deref=1,p1,p2,p3)] - - -#define _Deref_post1_impl_(p1) [SA_Post(Deref=1,p1)] -#define _Deref_post2_impl_(p1,p2) [SA_Post(Deref=1,p1,p2)] -#define _Deref_post3_impl_(p1,p2,p3) [SA_Post(Deref=1,p1,p2,p3)] - -#define _Deref_ret1_impl_(p1) [SA_Post(Deref=1,p1)] -#define _Deref_ret2_impl_(p1,p2) [SA_Post(Deref=1,p1,p2)] -#define _Deref_ret3_impl_(p1,p2,p3) [SA_Post(Deref=1,p1,p2,p3)] - -#define _Deref2_pre1_impl_(p1) [SA_Pre(Deref=2,Notref=1,p1)] -#define _Deref2_post1_impl_(p1) [SA_Post(Deref=2,Notref=1,p1)] -#define _Deref2_ret1_impl_(p1) [SA_Post(Deref=2,Notref=1,p1)] - -// Obsolete -- may be needed for transition to attributes. 
-#define __inner_typefix(ctype) [SAL_typefix(p1=_SA_SPECSTRIZE(ctype))] -#define __inner_exceptthat [SAL_except] - - -#elif _USE_DECLSPECS_FOR_SAL // ][ - -#define _Check_return_impl_ __post _SA_annotes0(SAL_checkReturn) - -#define _Success_impl_(expr) _SA_annotes1(SAL_success, expr) -#define _On_failure_impl_(annos) _SA_annotes1(SAL_context, SAL_failed) _Group_(_Post_impl_ _Group_(_SAL_nop_impl_ annos)) - -#define _Printf_format_string_impl_ _SA_annotes1(SAL_IsFormatString, "printf") -#define _Scanf_format_string_impl_ _SA_annotes1(SAL_IsFormatString, "scanf") -#define _Scanf_s_format_string_impl_ _SA_annotes1(SAL_IsFormatString, "scanf_s") - -#define _In_bound_impl_ _Pre_impl_ _Bound_impl_ -#define _Out_bound_impl_ _Post_impl_ _Bound_impl_ -#define _Ret_bound_impl_ _Post_impl_ _Bound_impl_ -#define _Deref_in_bound_impl_ _Deref_pre_impl_ _Bound_impl_ -#define _Deref_out_bound_impl_ _Deref_post_impl_ _Bound_impl_ -#define _Deref_ret_bound_impl_ _Deref_post_impl_ _Bound_impl_ - - -#define __null_impl _SA_annotes0(SAL_null) // _SA_annotes1(SAL_null, __yes) -#define __notnull_impl _SA_annotes0(SAL_notnull) // _SA_annotes1(SAL_null, __no) -#define __maybenull_impl _SA_annotes0(SAL_maybenull) // _SA_annotes1(SAL_null, __maybe) - -#define __valid_impl _SA_annotes0(SAL_valid) // _SA_annotes1(SAL_valid, __yes) -#define __notvalid_impl _SA_annotes0(SAL_notvalid) // _SA_annotes1(SAL_valid, __no) -#define __maybevalid_impl _SA_annotes0(SAL_maybevalid) // _SA_annotes1(SAL_valid, __maybe) - -#define __null_impl_notref _Notref_ _Null_impl_ -#define __maybenull_impl_notref _Notref_ _Maybenull_impl_ -#define __notnull_impl_notref _Notref_ _Notnull_impl_ - -#define __zterm_impl _SA_annotes1(SAL_nullTerminated, __yes) -#define __maybezterm_impl _SA_annotes1(SAL_nullTerminated, __maybe) -#define __maybzterm_impl _SA_annotes1(SAL_nullTerminated, __maybe) -#define __notzterm_impl _SA_annotes1(SAL_nullTerminated, __no) - -#define __readaccess_impl _SA_annotes1(SAL_access, 0x1) -#define __writeaccess_impl _SA_annotes1(SAL_access, 0x2) -#define __allaccess_impl _SA_annotes1(SAL_access, 0x3) - -#define __readaccess_impl_notref _Notref_ _SA_annotes1(SAL_access, 0x1) -#define __writeaccess_impl_notref _Notref_ _SA_annotes1(SAL_access, 0x2) -#define __allaccess_impl_notref _Notref_ _SA_annotes1(SAL_access, 0x3) - -#define __cap_impl(size) _SA_annotes1(SAL_writableTo,elementCount(size)) -#define __cap_c_impl(size) _SA_annotes1(SAL_writableTo,elementCount(size)) -#define __cap_c_one_notref_impl _Notref_ _SA_annotes1(SAL_writableTo,elementCount(1)) -#define __cap_for_impl(param) _SA_annotes1(SAL_writableTo,inexpressibleCount(sizeof(param))) -#define __cap_x_impl(size) _SA_annotes1(SAL_writableTo,inexpressibleCount(#size)) - -#define __bytecap_impl(size) _SA_annotes1(SAL_writableTo,byteCount(size)) -#define __bytecap_c_impl(size) _SA_annotes1(SAL_writableTo,byteCount(size)) -#define __bytecap_x_impl(size) _SA_annotes1(SAL_writableTo,inexpressibleCount(#size)) - -#define __mult_impl(mult,size) _SA_annotes1(SAL_writableTo,(mult)*(size)) - -#define __count_impl(size) _SA_annotes1(SAL_readableTo,elementCount(size)) -#define __count_c_impl(size) _SA_annotes1(SAL_readableTo,elementCount(size)) -#define __count_x_impl(size) _SA_annotes1(SAL_readableTo,inexpressibleCount(#size)) - -#define __bytecount_impl(size) _SA_annotes1(SAL_readableTo,byteCount(size)) -#define __bytecount_c_impl(size) _SA_annotes1(SAL_readableTo,byteCount(size)) -#define __bytecount_x_impl(size) _SA_annotes1(SAL_readableTo,inexpressibleCount(#size)) - 
-#define _At_impl_(target, annos) _SA_annotes0(SAL_at(target)) _Group_(annos) -#define _At_buffer_impl_(target, iter, bound, annos) _SA_annotes3(SAL_at_buffer, target, iter, bound) _Group_(annos) -#define _Group_impl_(annos) _SA_annotes0(SAL_begin) annos _SA_annotes0(SAL_end) -#define _GrouP_impl_(annos) _SA_annotes0(SAL_BEGIN) annos _SA_annotes0(SAL_END) -#define _When_impl_(expr, annos) _SA_annotes0(SAL_when(expr)) _Group_(annos) - -#define _Use_decl_anno_impl_ __declspec("SAL_useHeader()") // this is a special case! - -#define _Pre1_impl_(p1) _Pre_impl_ p1 -#define _Pre2_impl_(p1,p2) _Pre_impl_ p1 _Pre_impl_ p2 -#define _Pre3_impl_(p1,p2,p3) _Pre_impl_ p1 _Pre_impl_ p2 _Pre_impl_ p3 - -#define _Post1_impl_(p1) _Post_impl_ p1 -#define _Post2_impl_(p1,p2) _Post_impl_ p1 _Post_impl_ p2 -#define _Post3_impl_(p1,p2,p3) _Post_impl_ p1 _Post_impl_ p2 _Post_impl_ p3 - -#define _Ret1_impl_(p1) _Post_impl_ p1 -#define _Ret2_impl_(p1,p2) _Post_impl_ p1 _Post_impl_ p2 -#define _Ret3_impl_(p1,p2,p3) _Post_impl_ p1 _Post_impl_ p2 _Post_impl_ p3 - -#define _Deref_pre1_impl_(p1) _Deref_pre_impl_ p1 -#define _Deref_pre2_impl_(p1,p2) _Deref_pre_impl_ p1 _Deref_pre_impl_ p2 -#define _Deref_pre3_impl_(p1,p2,p3) _Deref_pre_impl_ p1 _Deref_pre_impl_ p2 _Deref_pre_impl_ p3 - -#define _Deref_post1_impl_(p1) _Deref_post_impl_ p1 -#define _Deref_post2_impl_(p1,p2) _Deref_post_impl_ p1 _Deref_post_impl_ p2 -#define _Deref_post3_impl_(p1,p2,p3) _Deref_post_impl_ p1 _Deref_post_impl_ p2 _Deref_post_impl_ p3 - -#define _Deref_ret1_impl_(p1) _Deref_post_impl_ p1 -#define _Deref_ret2_impl_(p1,p2) _Deref_post_impl_ p1 _Deref_post_impl_ p2 -#define _Deref_ret3_impl_(p1,p2,p3) _Deref_post_impl_ p1 _Deref_post_impl_ p2 _Deref_post_impl_ p3 - -#define _Deref2_pre1_impl_(p1) _Deref_pre_impl_ _Notref_impl_ _Deref_impl_ p1 -#define _Deref2_post1_impl_(p1) _Deref_post_impl_ _Notref_impl_ _Deref_impl_ p1 -#define _Deref2_ret1_impl_(p1) _Deref_post_impl_ _Notref_impl_ _Deref_impl_ p1 - -#define __inner_typefix(ctype) _SA_annotes1(SAL_typefix, ctype) -#define __inner_exceptthat _SA_annotes0(SAL_except) - -#elif defined(_MSC_EXTENSIONS) && !defined( MIDL_PASS ) && !defined(__midl) && !defined(RC_INVOKED) && defined(_PFT_VER) && _MSC_VER >= 1400 /*IFSTRIP=IGN*/ // ][ - -// minimum attribute expansion for foreground build - -#pragma push_macro( "SA" ) -#pragma push_macro( "REPEATABLE" ) - -#ifdef __cplusplus // [ -#define SA( id ) id -#define REPEATABLE [repeatable] -#else // !__cplusplus // ][ -#define SA( id ) SA_##id -#define REPEATABLE -#endif // !__cplusplus // ] - -REPEATABLE -[source_annotation_attribute( SA( Parameter ) )] -struct __P_impl -{ -#ifdef __cplusplus // [ - __P_impl(); -#endif // ] - int __d_; -}; -typedef struct __P_impl __P_impl; - -REPEATABLE -[source_annotation_attribute( SA( ReturnValue ) )] -struct __R_impl -{ -#ifdef __cplusplus // [ - __R_impl(); -#endif // ] - int __d_; -}; -typedef struct __R_impl __R_impl; - -[source_annotation_attribute( SA( Method ) )] -struct __M_ -{ -#ifdef __cplusplus // [ - __M_(); -#endif // ] - int __d_; -}; -typedef struct __M_ __M_; - -[source_annotation_attribute( SA( All ) )] -struct __A_ -{ -#ifdef __cplusplus // [ - __A_(); -#endif // ] - int __d_; -}; -typedef struct __A_ __A_; - -[source_annotation_attribute( SA( Field ) )] -struct __F_ -{ -#ifdef __cplusplus // [ - __F_(); -#endif // ] - int __d_; -}; -typedef struct __F_ __F_; - -#pragma pop_macro( "REPEATABLE" ) -#pragma pop_macro( "SA" ) - - -#define _SAL_nop_impl_ - -#define _At_impl_(target, annos) [__A_(__d_=0)] 
-#define _At_buffer_impl_(target, iter, bound, annos) [__A_(__d_=0)] -#define _When_impl_(expr, annos) annos -#define _Group_impl_(annos) annos -#define _GrouP_impl_(annos) annos -#define _Use_decl_anno_impl_ [__M_(__d_=0)] - -#define _Points_to_data_impl_ [__P_impl(__d_=0)] -#define _Literal_impl_ [__P_impl(__d_=0)] -#define _Notliteral_impl_ [__P_impl(__d_=0)] - -#define _Pre_valid_impl_ [__P_impl(__d_=0)] -#define _Post_valid_impl_ [__P_impl(__d_=0)] -#define _Ret_valid_impl_ [__R_impl(__d_=0)] - -#define _Check_return_impl_ [__R_impl(__d_=0)] -#define _Must_inspect_impl_ [__R_impl(__d_=0)] - -#define _Success_impl_(expr) [__M_(__d_=0)] -#define _On_failure_impl_(expr) [__M_(__d_=0)] -#define _Always_impl_(expr) [__M_(__d_=0)] - -#define _Printf_format_string_impl_ [__P_impl(__d_=0)] -#define _Scanf_format_string_impl_ [__P_impl(__d_=0)] -#define _Scanf_s_format_string_impl_ [__P_impl(__d_=0)] - -#define _Raises_SEH_exception_impl_ [__M_(__d_=0)] -#define _Maybe_raises_SEH_exception_impl_ [__M_(__d_=0)] - -#define _In_bound_impl_ [__P_impl(__d_=0)] -#define _Out_bound_impl_ [__P_impl(__d_=0)] -#define _Ret_bound_impl_ [__R_impl(__d_=0)] -#define _Deref_in_bound_impl_ [__P_impl(__d_=0)] -#define _Deref_out_bound_impl_ [__P_impl(__d_=0)] -#define _Deref_ret_bound_impl_ [__R_impl(__d_=0)] - -#define _Range_impl_(min,max) [__P_impl(__d_=0)] -#define _In_range_impl_(min,max) [__P_impl(__d_=0)] -#define _Out_range_impl_(min,max) [__P_impl(__d_=0)] -#define _Ret_range_impl_(min,max) [__R_impl(__d_=0)] -#define _Deref_in_range_impl_(min,max) [__P_impl(__d_=0)] -#define _Deref_out_range_impl_(min,max) [__P_impl(__d_=0)] -#define _Deref_ret_range_impl_(min,max) [__R_impl(__d_=0)] - -#define _Field_range_impl_(min,max) [__F_(__d_=0)] - -#define _Pre_satisfies_impl_(cond) [__A_(__d_=0)] -#define _Post_satisfies_impl_(cond) [__A_(__d_=0)] -#define _Satisfies_impl_(cond) [__A_(__d_=0)] - -#define _Null_impl_ [__A_(__d_=0)] -#define _Notnull_impl_ [__A_(__d_=0)] -#define _Maybenull_impl_ [__A_(__d_=0)] - -#define _Valid_impl_ [__A_(__d_=0)] -#define _Notvalid_impl_ [__A_(__d_=0)] -#define _Maybevalid_impl_ [__A_(__d_=0)] - -#define _Readable_bytes_impl_(size) [__A_(__d_=0)] -#define _Readable_elements_impl_(size) [__A_(__d_=0)] -#define _Writable_bytes_impl_(size) [__A_(__d_=0)] -#define _Writable_elements_impl_(size) [__A_(__d_=0)] - -#define _Null_terminated_impl_ [__A_(__d_=0)] -#define _NullNull_terminated_impl_ [__A_(__d_=0)] - -#define _Pre_impl_ [__P_impl(__d_=0)] -#define _Pre1_impl_(p1) [__P_impl(__d_=0)] -#define _Pre2_impl_(p1,p2) [__P_impl(__d_=0)] -#define _Pre3_impl_(p1,p2,p3) [__P_impl(__d_=0)] - -#define _Post_impl_ [__P_impl(__d_=0)] -#define _Post1_impl_(p1) [__P_impl(__d_=0)] -#define _Post2_impl_(p1,p2) [__P_impl(__d_=0)] -#define _Post3_impl_(p1,p2,p3) [__P_impl(__d_=0)] - -#define _Ret1_impl_(p1) [__R_impl(__d_=0)] -#define _Ret2_impl_(p1,p2) [__R_impl(__d_=0)] -#define _Ret3_impl_(p1,p2,p3) [__R_impl(__d_=0)] - -#define _Deref_pre1_impl_(p1) [__P_impl(__d_=0)] -#define _Deref_pre2_impl_(p1,p2) [__P_impl(__d_=0)] -#define _Deref_pre3_impl_(p1,p2,p3) [__P_impl(__d_=0)] - -#define _Deref_post1_impl_(p1) [__P_impl(__d_=0)] -#define _Deref_post2_impl_(p1,p2) [__P_impl(__d_=0)] -#define _Deref_post3_impl_(p1,p2,p3) [__P_impl(__d_=0)] - -#define _Deref_ret1_impl_(p1) [__R_impl(__d_=0)] -#define _Deref_ret2_impl_(p1,p2) [__R_impl(__d_=0)] -#define _Deref_ret3_impl_(p1,p2,p3) [__R_impl(__d_=0)] - -#define _Deref2_pre1_impl_(p1) //[__P_impl(__d_=0)] -#define _Deref2_post1_impl_(p1) 
//[__P_impl(__d_=0)] -#define _Deref2_ret1_impl_(p1) //[__P_impl(__d_=0)] - -#else // ][ - - -#define _SAL_nop_impl_ X - -#define _At_impl_(target, annos) -#define _When_impl_(expr, annos) -#define _Group_impl_(annos) -#define _GrouP_impl_(annos) -#define _At_buffer_impl_(target, iter, bound, annos) -#define _Use_decl_anno_impl_ -#define _Points_to_data_impl_ -#define _Literal_impl_ -#define _Notliteral_impl_ -#define _Notref_impl_ - -#define _Pre_valid_impl_ -#define _Post_valid_impl_ -#define _Ret_valid_impl_ - -#define _Check_return_impl_ -#define _Must_inspect_impl_ - -#define _Success_impl_(expr) -#define _On_failure_impl_(annos) -#define _Always_impl_(annos) - -#define _Printf_format_string_impl_ -#define _Scanf_format_string_impl_ -#define _Scanf_s_format_string_impl_ - -#define _In_bound_impl_ -#define _Out_bound_impl_ -#define _Ret_bound_impl_ -#define _Deref_in_bound_impl_ -#define _Deref_out_bound_impl_ -#define _Deref_ret_bound_impl_ - -#define _Range_impl_(min,max) -#define _In_range_impl_(min,max) -#define _Out_range_impl_(min,max) -#define _Ret_range_impl_(min,max) -#define _Deref_in_range_impl_(min,max) -#define _Deref_out_range_impl_(min,max) -#define _Deref_ret_range_impl_(min,max) - -#define _Satisfies_impl_(expr) -#define _Pre_satisfies_impl_(expr) -#define _Post_satisfies_impl_(expr) - -#define _Null_impl_ -#define _Notnull_impl_ -#define _Maybenull_impl_ - -#define _Valid_impl_ -#define _Notvalid_impl_ -#define _Maybevalid_impl_ - -#define _Field_range_impl_(min,max) - -#define _Pre_impl_ -#define _Pre1_impl_(p1) -#define _Pre2_impl_(p1,p2) -#define _Pre3_impl_(p1,p2,p3) - -#define _Post_impl_ -#define _Post1_impl_(p1) -#define _Post2_impl_(p1,p2) -#define _Post3_impl_(p1,p2,p3) - -#define _Ret1_impl_(p1) -#define _Ret2_impl_(p1,p2) -#define _Ret3_impl_(p1,p2,p3) - -#define _Deref_pre1_impl_(p1) -#define _Deref_pre2_impl_(p1,p2) -#define _Deref_pre3_impl_(p1,p2,p3) - -#define _Deref_post1_impl_(p1) -#define _Deref_post2_impl_(p1,p2) -#define _Deref_post3_impl_(p1,p2,p3) - -#define _Deref_ret1_impl_(p1) -#define _Deref_ret2_impl_(p1,p2) -#define _Deref_ret3_impl_(p1,p2,p3) - -#define _Deref2_pre1_impl_(p1) -#define _Deref2_post1_impl_(p1) -#define _Deref2_ret1_impl_(p1) - -#define _Readable_bytes_impl_(size) -#define _Readable_elements_impl_(size) -#define _Writable_bytes_impl_(size) -#define _Writable_elements_impl_(size) +/* +**============================================================================== +** +** Open Management Infrastructure (OMI) v.1.1.0 +** +** Copyright (c) Microsoft Corporation +** +** All rights reserved. +** +** MIT License +** +** Permission is hereby granted, free of charge, to any person obtaining +** a copy of this software and associated documentation files (the +** ""Software""), to deal in the Software without restriction, including +** without limitation the rights to use, copy, modify, merge, publish, +** distribute, sublicense, and/or sell copies of the Software, and to +** permit persons to whom the Software is furnished to do so, subject to +** the following conditions: +** +** The above copyright notice and this permission notice shall be +** included in all copies or substantial portions of the Software. +** +** THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, +** EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +** MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +** NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +** LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +** OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +** WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +** +**============================================================================== +*/ -#define _Null_terminated_impl_ -#define _NullNull_terminated_impl_ +#ifndef _SAL_h +#define _SAL_h -// Obsolete -- may be needed for transition to attributes. -#define __inner_typefix(ctype) -#define __inner_exceptthat +#if !defined(_In_) +# define _In_ +#endif -#endif // ] - -// This section contains the deprecated annotations +#if !defined(_In_count_) +# define _In_count_(count) +#endif -/* - ------------------------------------------------------------------------------- - Introduction - - sal.h provides a set of annotations to describe how a function uses its - parameters - the assumptions it makes about them, and the guarantees it makes - upon finishing. - - Annotations may be placed before either a function parameter's type or its return - type, and describe the function's behavior regarding the parameter or return value. - There are two classes of annotations: buffer annotations and advanced annotations. - Buffer annotations describe how functions use their pointer parameters, and - advanced annotations either describe complex/unusual buffer behavior, or provide - additional information about a parameter that is not otherwise expressible. - - ------------------------------------------------------------------------------- - Buffer Annotations - - The most important annotations in sal.h provide a consistent way to annotate - buffer parameters or return values for a function. Each of these annotations describes - a single buffer (which could be a string, a fixed-length or variable-length array, - or just a pointer) that the function interacts with: where it is, how large it is, - how much is initialized, and what the function does with it. - - The appropriate macro for a given buffer can be constructed using the table below. - Just pick the appropriate values from each category, and combine them together - with a leading underscore. Some combinations of values do not make sense as buffer - annotations. Only meaningful annotations can be added to your code; for a list of - these, see the buffer annotation definitions section. - - Only a single buffer annotation should be used for each parameter. - - |------------|------------|---------|--------|----------|----------|---------------| - | Level | Usage | Size | Output | NullTerm | Optional | Parameters | - |------------|------------|---------|--------|----------|----------|---------------| - | <> | <> | <> | <> | _z | <> | <> | - | _deref | _in | _ecount | _full | _nz | _opt | (size) | - | _deref_opt | _out | _bcount | _part | | | (size,length) | - | | _inout | | | | | | - | | | | | | | | - |------------|------------|---------|--------|----------|----------|---------------| - - Level: Describes the buffer pointer's level of indirection from the parameter or - return value 'p'. - - <> : p is the buffer pointer. - _deref : *p is the buffer pointer. p must not be NULL. - _deref_opt : *p may be the buffer pointer. p may be NULL, in which case the rest of - the annotation is ignored. - - Usage: Describes how the function uses the buffer. - - <> : The buffer is not accessed. 
If used on the return value or with _deref, the - function will provide the buffer, and it will be uninitialized at exit. - Otherwise, the caller must provide the buffer. This should only be used - for alloc and free functions. - _in : The function will only read from the buffer. The caller must provide the - buffer and initialize it. Cannot be used with _deref. - _out : The function will only write to the buffer. If used on the return value or - with _deref, the function will provide the buffer and initialize it. - Otherwise, the caller must provide the buffer, and the function will - initialize it. - _inout : The function may freely read from and write to the buffer. The caller must - provide the buffer and initialize it. If used with _deref, the buffer may - be reallocated by the function. - - Size: Describes the total size of the buffer. This may be less than the space actually - allocated for the buffer, in which case it describes the accessible amount. - - <> : No buffer size is given. If the type specifies the buffer size (such as - with LPSTR and LPWSTR), that amount is used. Otherwise, the buffer is one - element long. Must be used with _in, _out, or _inout. - _ecount : The buffer size is an explicit element count. - _bcount : The buffer size is an explicit byte count. - - Output: Describes how much of the buffer will be initialized by the function. For - _inout buffers, this also describes how much is initialized at entry. Omit this - category for _in buffers; they must be fully initialized by the caller. - - <> : The type specifies how much is initialized. For instance, a function initializing - an LPWSTR must NULL-terminate the string. - _full : The function initializes the entire buffer. - _part : The function initializes part of the buffer, and explicitly indicates how much. - - NullTerm: States if the present of a '\0' marks the end of valid elements in the buffer. - _z : A '\0' indicated the end of the buffer - _nz : The buffer may not be null terminated and a '\0' does not indicate the end of the - buffer. - Optional: Describes if the buffer itself is optional. - - <> : The pointer to the buffer must not be NULL. - _opt : The pointer to the buffer might be NULL. It will be checked before being dereferenced. - - Parameters: Gives explicit counts for the size and length of the buffer. - - <> : There is no explicit count. Use when neither _ecount nor _bcount is used. - (size) : Only the buffer's total size is given. Use with _ecount or _bcount but not _part. - (size,length) : The buffer's total size and initialized length are given. Use with _ecount_part - and _bcount_part. - - ------------------------------------------------------------------------------- - Buffer Annotation Examples - - LWSTDAPI_(BOOL) StrToIntExA( - __in LPCSTR pszString, - DWORD dwFlags, - __out int *piRet -- A pointer whose dereference will be filled in. - ); - - void MyPaintingFunction( - __in HWND hwndControl, -- An initialized read-only parameter. - __in_opt HDC hdcOptional, -- An initialized read-only parameter that might be NULL. - __inout IPropertyStore *ppsStore -- An initialized parameter that may be freely used - -- and modified. - ); - - LWSTDAPI_(BOOL) PathCompactPathExA( - __out_ecount(cchMax) LPSTR pszOut, -- A string buffer with cch elements that will - -- be NULL terminated on exit. 
- __in LPCSTR pszSrc, - UINT cchMax, - DWORD dwFlags - ); - - HRESULT SHLocalAllocBytes( - size_t cb, - __deref_bcount(cb) T **ppv -- A pointer whose dereference will be set to an - -- uninitialized buffer with cb bytes. - ); - - __inout_bcount_full(cb) : A buffer with cb elements that is fully initialized at - entry and exit, and may be written to by this function. - - __out_ecount_part(count, *countOut) : A buffer with count elements that will be - partially initialized by this function. The function indicates how much it - initialized by setting *countOut. - - ------------------------------------------------------------------------------- - Advanced Annotations - - Advanced annotations describe behavior that is not expressible with the regular - buffer macros. These may be used either to annotate buffer parameters that involve - complex or conditional behavior, or to enrich existing annotations with additional - information. - - __success(expr) f : - indicates whether function f succeeded or not. If is true at exit, - all the function's guarantees (as given by other annotations) must hold. If - is false at exit, the caller should not expect any of the function's guarantees - to hold. If not used, the function must always satisfy its guarantees. Added - automatically to functions that indicate success in standard ways, such as by - returning an HRESULT. - - __nullterminated p : - Pointer p is a buffer that may be read or written up to and including the first - NULL character or pointer. May be used on typedefs, which marks valid (properly - initialized) instances of that type as being NULL-terminated. - - __nullnullterminated p : - Pointer p is a buffer that may be read or written up to and including the first - sequence of two NULL characters or pointers. May be used on typedefs, which marks - valid instances of that type as being double-NULL terminated. - - __reserved v : - Value v must be 0/NULL, reserved for future use. - - __checkReturn v : - Return value v must not be ignored by callers of this function. - - __typefix(ctype) v : - Value v should be treated as an instance of ctype, rather than its declared type. - - __override f : - Specify C#-style 'override' behaviour for overriding virtual methods. +#if !defined(_In_opt_) +# define _In_opt_ +#endif - __callback f : - Function f can be used as a function pointer. - - __format_string p : - Pointer p is a string that contains % markers in the style of printf. - - __blocksOn(resource) f : - Function f blocks on the resource 'resource'. +#if !defined(_In_z_) +# define _In_z_ +#endif - FALLTHROUGH : - Annotates switch statement labels where fall-through is desired, to distinguish - from forgotten break statements. +#if !defined(_In_opt_z_) +# define _In_opt_z_ +#endif - ------------------------------------------------------------------------------- - Advanced Annotation Examples +#if !defined(_Must_inspect_result_) +# define _Must_inspect_result_ +#endif - __success(return != FALSE) LWSTDAPI_(BOOL) - PathCanonicalizeA(__out_ecount(MAX_PATH) LPSTR pszBuf, LPCSTR pszPath) : - pszBuf is only guaranteed to be NULL-terminated when TRUE is returned. +#if !defined(_Out_) +# define _Out_ +#endif - typedef __nullterminated WCHAR* LPWSTR : Initialized LPWSTRs are NULL-terminated strings. - - __out_ecount(cch) __typefix(LPWSTR) void *psz : psz is a buffer parameter which will be - a NULL-terminated WCHAR string at exit, and which initially contains cch WCHARs. 
+#if !defined(_Outptr_) +# define _Outptr_ +#endif - ------------------------------------------------------------------------------- -*/ +#if !defined(_Outptr_opt_) +# define _Outptr_opt_ +#endif -#define __specstrings +#if !defined(_Out_writes_z_) +# define _Out_writes_z_(count) +#endif -#ifdef __cplusplus // [ -#ifndef __nothrow // [ -# define __nothrow NOTHROW_DECL -#endif // ] -extern "C" { -#else // ][ -#ifndef __nothrow // [ -# define __nothrow -#endif // ] -#endif /* #ifdef __cplusplus */ // ] +#if !defined(_Outptr_result_z_) +# define _Outptr_result_z_ +#endif +#if !defined(_Outptr_result_bytebuffer_) +# define _Outptr_result_bytebuffer_(count) +#endif -/* - ------------------------------------------------------------------------------- - Helper Macro Definitions +#if !defined(_Outptr_result_maybenull_) +# define _Outptr_result_maybenull_ +#endif - These express behavior common to many of the high-level annotations. - DO NOT USE THESE IN YOUR CODE. - ------------------------------------------------------------------------------- -*/ +#if !defined(_Outptr_opt_result_maybenull_) +# define _Outptr_opt_result_maybenull_ +#endif -/* - The helper annotations are only understood by the compiler version used by - various defect detection tools. When the regular compiler is running, they - are defined into nothing, and do not affect the compiled code. -*/ +#if !defined(_Outptr_result_maybenull_z_) +# define _Outptr_result_maybenull_z_ +#endif -#if !defined(__midl) && defined(_PREFAST_) // [ +#if !defined(_Outptr_opt_result_z_) +# define _Outptr_opt_result_z_ +#endif - /* - In the primitive "SAL_*" annotations "SAL" stands for Standard - Annotation Language. These "SAL_*" annotations are the - primitives the compiler understands and high-level MACROs - will decompose into these primivates. - */ +#if !defined(_Outptr_opt_result_maybenull_z_) +# define _Outptr_opt_result_maybenull_z_ +#endif - #define _SA_SPECSTRIZE( x ) #x +#if !defined(_Return_type_success_) +# define _Return_type_success_(expr) +#endif - /* - __null p - __notnull p - __maybenull p +#if !defined(_In_reads_bytes_) +# define _In_reads_bytes_(count) +#endif - Annotates a pointer p. States that pointer p is null. Commonly used - in the negated form __notnull or the possibly null form __maybenull. - */ +#if !defined(_In_reads_opt_) +# define _In_reads_opt_(expr) +#endif -#ifndef PAL_STDCPP_COMPAT - #define __null _Null_impl_ - #define __notnull _Notnull_impl_ - #define __maybenull _Maybenull_impl_ -#endif // !PAL_STDCPP_COMPAT +#if !defined(_Out_writes_to_opt_) +# define _Out_writes_to_opt_(length, lengthwritten) +#endif - /* - __readonly l - __notreadonly l - __maybereadonly l +#if !defined(_Acquires_lock_) +# define _Acquires_lock_(lock) +#endif - Annotates a location l. States that location l is not modified after - this point. If the annotation is placed on the precondition state of - a function, the restriction only applies until the postcondition state - of the function. __maybereadonly states that the annotated location - may be modified, whereas __notreadonly states that a location must be - modified. - */ +#if !defined(_Releases_lock_) +# define _Releases_lock_(lock) +#endif - #define __readonly _Pre1_impl_(__readaccess_impl) - #define __notreadonly _Pre1_impl_(__allaccess_impl) - #define __maybereadonly _Pre1_impl_(__readaccess_impl) +#if !defined(_Inout_) +# define _Inout_ +#endif - /* - __valid v - __notvalid v - __maybevalid v +#if !defined(_Inout_opt_) +# define _Inout_opt_ +#endif - Annotates any value v. 
States that the value satisfies all properties of - valid values of its type. For example, for a string buffer, valid means - that the buffer pointer is either NULL or points to a NULL-terminated string. - */ +#if !defined(_Inout_z_) +# define _Inout_z_ +#endif - #define __valid _Valid_impl_ - #define __notvalid _Notvalid_impl_ - #define __maybevalid _Maybevalid_impl_ +#if !defined(_Out_opt_) +# define _Out_opt_ +#endif - /* - __readableTo(extent) p +#if !defined(_Out_writes_bytes_) +# define _Out_writes_bytes_(count) +#endif - Annotates a buffer pointer p. If the buffer can be read, extent describes - how much of the buffer is readable. For a reader of the buffer, this is - an explicit permission to read up to that amount, rather than a restriction to - read only up to it. - */ +#if !defined(_In_reads_) +# define _In_reads_(count) +#endif - #define __readableTo(extent) _SA_annotes1(SAL_readableTo, extent) +#if !defined(_In_reads_z_) +# define _In_reads_z_(count) +#endif - /* +#if !defined(_Out_writes_opt_) +# define _Out_writes_opt_(count) +#endif - __elem_readableTo(size) +#if !defined(_Null_terminated_) +#define _Null_terminated_ +#endif - Annotates a buffer pointer p as being readable to size elements. - */ +#if !defined(_Requires_lock_not_held_) +#define _Requires_lock_not_held_(lock) +#endif - #define __elem_readableTo(size) _SA_annotes1(SAL_readableTo, elementCount( size )) +#if !defined(_Requires_lock_held_) +#define _Requires_lock_held_(lock) +#endif - /* - __byte_readableTo(size) +#if !defined(__field_ecount) +#define __field_ecount(count) +#endif - Annotates a buffer pointer p as being readable to size bytes. - */ - #define __byte_readableTo(size) _SA_annotes1(SAL_readableTo, byteCount(size)) +#if !defined(_Check_return_) +#define _Check_return_ +#endif - /* - __writableTo(extent) p +#if !defined(_Deref_post_z_) +#define _Deref_post_z_ +#endif - Annotates a buffer pointer p. If the buffer can be modified, extent - describes how much of the buffer is writable (usually the allocation - size). For a writer of the buffer, this is an explicit permission to - write up to that amount, rather than a restriction to write only up to it. - */ - #define __writableTo(size) _SA_annotes1(SAL_writableTo, size) +#if !defined(_Deref_prepost_opt_z_) +#define _Deref_prepost_opt_z_ +#endif - /* - __elem_writableTo(size) +#if !defined(_Deref_out_range_) +#define _Deref_out_range_(min, max) +#endif - Annotates a buffer pointer p as being writable to size elements. - */ - #define __elem_writableTo(size) _SA_annotes1(SAL_writableTo, elementCount( size )) +#if !defined(_Inout_opt_z_) +#define _Inout_opt_z_ +#endif - /* - __byte_writableTo(size) +#if !defined(_Inout_updates_z_) +#define _Inout_updates_z_(count) +#endif - Annotates a buffer pointer p as being writable to size bytes. - */ - #define __byte_writableTo(size) _SA_annotes1(SAL_writableTo, byteCount( size)) +#if !defined(_Out_writes_) +#define _Out_writes_(count) +#endif - /* - __deref p +#if !defined(_Post_readable_size_) +#define _Post_readable_size_(count) +#endif - Annotates a pointer p. The next annotation applies one dereference down - in the type. If readableTo(p, size) then the next annotation applies to - all elements *(p+i) for which i satisfies the size. If p is a pointer - to a struct, the next annotation applies to all fields of the struct. 
- */ - #define __deref _Deref_impl_ +#if !defined(_Post_ptr_invalid_) +#define _Post_ptr_invalid_ +#endif - /* - __pre __next_annotation +#if !defined(_Pre_valid_) +#define _Pre_valid_ +#endif - The next annotation applies in the precondition state - */ - #define __pre _Pre_impl_ +#if !defined(_Pre_writable_size_) +#define _Pre_writable_size_(count) +#endif - /* - __post __next_annotation +#if !defined(_Success_) +#define _Success_(count) +#endif - The next annotation applies in the postcondition state - */ - #define __post _Post_impl_ - - /* - __precond() - - When is true, the next annotation applies in the precondition state - (currently not enabled) - */ - #define __precond(expr) __pre - - /* - __postcond() - - When is true, the next annotation applies in the postcondition state - (currently not enabled) - */ - #define __postcond(expr) __post - - /* - __exceptthat +#if !defined(_Ret_notnull_) +#define _Ret_notnull_ +#endif - Given a set of annotations Q containing __exceptthat maybeP, the effect of - the except clause is to erase any P or notP annotations (explicit or - implied) within Q at the same level of dereferencing that the except - clause appears, and to replace it with maybeP. +#if !defined(_Ret_z_) +#define _Ret_z_ +#endif - Example 1: __valid __pre_except_maybenull on a pointer p means that the - pointer may be null, and is otherwise valid, thus overriding - the implicit notnull annotation implied by __valid on - pointers. +#if !defined(_Use_decl_annotations_) +#define _Use_decl_annotations_ +#endif - Example 2: __valid __deref __pre_except_maybenull on an int **p means - that p is not null (implied by valid), but the elements - pointed to by p could be null, and are otherwise valid. - */ - #define __exceptthat __inner_exceptthat +#if !defined(_Ret_maybenull_) +#define _Ret_maybenull_ +#endif - /* - _refparam +#if !defined(_Pre_writable_byte_size_) +#define _Pre_writable_byte_size_(count) +#endif - Added to all out parameter macros to indicate that they are all reference - parameters. - */ - #define __refparam _Notref_ __deref __notreadonly - - /* - __inner_* - - Helper macros that directly correspond to certain high-level annotations. - - */ - - /* - Macros to classify the entrypoints and indicate their category. - - Pre-defined control point categories include: RPC, LPC, DeviceDriver, UserToKernel, ISAPI, COM. - - */ - #define __inner_control_entrypoint(category) _SA_annotes2(SAL_entrypoint, controlEntry, category) - - - /* - Pre-defined data entry point categories include: Registry, File, Network. 
- */ - #define __inner_data_entrypoint(category) _SA_annotes2(SAL_entrypoint, dataEntry, category) - - #define __inner_override _SA_annotes0(__override) - #define __inner_callback _SA_annotes0(__callback) - #define __inner_blocksOn(resource) _SA_annotes1(SAL_blocksOn, resource) - - #define __post_except_maybenull __post __inner_exceptthat _Maybenull_impl_ - #define __pre_except_maybenull __pre __inner_exceptthat _Maybenull_impl_ - - #define __post_deref_except_maybenull __post __deref __inner_exceptthat _Maybenull_impl_ - #define __pre_deref_except_maybenull __pre __deref __inner_exceptthat _Maybenull_impl_ - - #define __inexpressible_readableTo(size) _Readable_elements_impl_(_Inexpressible_(size)) - #define __inexpressible_writableTo(size) _Writable_elements_impl_(_Inexpressible_(size)) - - -#else // ][ -#ifndef PAL_STDCPP_COMPAT - #define __null - #define __notnull - #define __deref -#endif // !PAL_STDCPP_COMPAT - #define __maybenull - #define __readonly - #define __notreadonly - #define __maybereadonly - #define __valid - #define __notvalid - #define __maybevalid - #define __readableTo(extent) - #define __elem_readableTo(size) - #define __byte_readableTo(size) - #define __writableTo(size) - #define __elem_writableTo(size) - #define __byte_writableTo(size) - #define __pre - #define __post - #define __precond(expr) - #define __postcond(expr) - #define __exceptthat - #define __inner_override - #define __inner_callback - #define __inner_blocksOn(resource) - #define __refparam - #define __inner_control_entrypoint(category) - #define __inner_data_entrypoint(category) - - #define __post_except_maybenull - #define __pre_except_maybenull - #define __post_deref_except_maybenull - #define __pre_deref_except_maybenull - - #define __inexpressible_readableTo(size) - #define __inexpressible_writableTo(size) - -#endif /* #if !defined(__midl) && defined(_PREFAST_) */ // ] +#if !defined(_Post_writable_byte_size_) +#define _Post_writable_byte_size_(count) +#endif -/* -------------------------------------------------------------------------------- -Buffer Annotation Definitions +#if !defined(_Analysis_assume_) +#define _Analysis_assume_(expr) +#endif -Any of these may be used to directly annotate functions, but only one should -be used for each parameter. To determine which annotation to use for a given -buffer, use the table in the buffer annotations section. 
-------------------------------------------------------------------------------- -*/ +#if !defined(_Post_satisfies_) +#define _Post_satisfies_(expr) +#endif -#define __ecount(size) _SAL1_Source_(__ecount, (size), __notnull __elem_writableTo(size)) -#define __bcount(size) _SAL1_Source_(__bcount, (size), __notnull __byte_writableTo(size)) -#define __in_ecount(size) _SAL1_Source_(__in_ecount, (size), _In_reads_(size)) -#define __in_bcount(size) _SAL1_Source_(__in_bcount, (size), _In_reads_bytes_(size)) -#define __in_z _SAL1_Source_(__in_z, (), _In_z_) -#define __in_ecount_z(size) _SAL1_Source_(__in_ecount_z, (size), _In_reads_z_(size)) -#define __in_bcount_z(size) _SAL1_Source_(__in_bcount_z, (size), __in_bcount(size) __pre __nullterminated) -#define __in_nz _SAL1_Source_(__in_nz, (), __in) -#define __in_ecount_nz(size) _SAL1_Source_(__in_ecount_nz, (size), __in_ecount(size)) -#define __in_bcount_nz(size) _SAL1_Source_(__in_bcount_nz, (size), __in_bcount(size)) -#define __out_ecount(size) _SAL1_Source_(__out_ecount, (size), _Out_writes_(size)) -#define __out_bcount(size) _SAL1_Source_(__out_bcount, (size), _Out_writes_bytes_(size)) -#define __out_ecount_part(size,length) _SAL1_Source_(__out_ecount_part, (size,length), _Out_writes_to_(size,length)) -#define __out_bcount_part(size,length) _SAL1_Source_(__out_bcount_part, (size,length), _Out_writes_bytes_to_(size,length)) -#define __out_ecount_full(size) _SAL1_Source_(__out_ecount_full, (size), _Out_writes_all_(size)) -#define __out_bcount_full(size) _SAL1_Source_(__out_bcount_full, (size), _Out_writes_bytes_all_(size)) -#define __out_z _SAL1_Source_(__out_z, (), __post __valid __refparam __post __nullterminated) -#define __out_z_opt _SAL1_Source_(__out_z_opt, (), __post __valid __refparam __post __nullterminated __pre_except_maybenull) -#define __out_ecount_z(size) _SAL1_Source_(__out_ecount_z, (size), __ecount(size) __post __valid __refparam __post __nullterminated) -#define __out_bcount_z(size) _SAL1_Source_(__out_bcount_z, (size), __bcount(size) __post __valid __refparam __post __nullterminated) -#define __out_ecount_part_z(size,length) _SAL1_Source_(__out_ecount_part_z, (size,length), __out_ecount_part(size,length) __post __nullterminated) -#define __out_bcount_part_z(size,length) _SAL1_Source_(__out_bcount_part_z, (size,length), __out_bcount_part(size,length) __post __nullterminated) -#define __out_ecount_full_z(size) _SAL1_Source_(__out_ecount_full_z, (size), __out_ecount_full(size) __post __nullterminated) -#define __out_bcount_full_z(size) _SAL1_Source_(__out_bcount_full_z, (size), __out_bcount_full(size) __post __nullterminated) -#define __out_nz _SAL1_Source_(__out_nz, (), __post __valid __refparam) -#define __out_nz_opt _SAL1_Source_(__out_nz_opt, (), __post __valid __refparam __post_except_maybenull_) -#define __out_ecount_nz(size) _SAL1_Source_(__out_ecount_nz, (size), __ecount(size) __post __valid __refparam) -#define __out_bcount_nz(size) _SAL1_Source_(__out_bcount_nz, (size), __bcount(size) __post __valid __refparam) -#define __inout _SAL1_Source_(__inout, (), _Inout_) -#define __inout_ecount(size) _SAL1_Source_(__inout_ecount, (size), _Inout_updates_(size)) -#define __inout_bcount(size) _SAL1_Source_(__inout_bcount, (size), _Inout_updates_bytes_(size)) -#define __inout_ecount_part(size,length) _SAL1_Source_(__inout_ecount_part, (size,length), _Inout_updates_to_(size,length)) -#define __inout_bcount_part(size,length) _SAL1_Source_(__inout_bcount_part, (size,length), _Inout_updates_bytes_to_(size,length)) -#define 
__inout_ecount_full(size) _SAL1_Source_(__inout_ecount_full, (size), _Inout_updates_all_(size)) -#define __inout_bcount_full(size) _SAL1_Source_(__inout_bcount_full, (size), _Inout_updates_bytes_all_(size)) -#define __inout_z _SAL1_Source_(__inout_z, (), _Inout_z_) -#define __inout_ecount_z(size) _SAL1_Source_(__inout_ecount_z, (size), _Inout_updates_z_(size)) -#define __inout_bcount_z(size) _SAL1_Source_(__inout_bcount_z, (size), __inout_bcount(size) __pre __nullterminated __post __nullterminated) -#define __inout_nz _SAL1_Source_(__inout_nz, (), __inout) -#define __inout_ecount_nz(size) _SAL1_Source_(__inout_ecount_nz, (size), __inout_ecount(size)) -#define __inout_bcount_nz(size) _SAL1_Source_(__inout_bcount_nz, (size), __inout_bcount(size)) -#define __ecount_opt(size) _SAL1_Source_(__ecount_opt, (size), __ecount(size) __pre_except_maybenull) -#define __bcount_opt(size) _SAL1_Source_(__bcount_opt, (size), __bcount(size) __pre_except_maybenull) -#define __in_opt _SAL1_Source_(__in_opt, (), _In_opt_) -#define __in_ecount_opt(size) _SAL1_Source_(__in_ecount_opt, (size), _In_reads_opt_(size)) -#define __in_bcount_opt(size) _SAL1_Source_(__in_bcount_opt, (size), _In_reads_bytes_opt_(size)) -#define __in_z_opt _SAL1_Source_(__in_z_opt, (), _In_opt_z_) -#define __in_ecount_z_opt(size) _SAL1_Source_(__in_ecount_z_opt, (size), __in_ecount_opt(size) __pre __nullterminated) -#define __in_bcount_z_opt(size) _SAL1_Source_(__in_bcount_z_opt, (size), __in_bcount_opt(size) __pre __nullterminated) -#define __in_nz_opt _SAL1_Source_(__in_nz_opt, (), __in_opt) -#define __in_ecount_nz_opt(size) _SAL1_Source_(__in_ecount_nz_opt, (size), __in_ecount_opt(size)) -#define __in_bcount_nz_opt(size) _SAL1_Source_(__in_bcount_nz_opt, (size), __in_bcount_opt(size)) -#define __out_opt _SAL1_Source_(__out_opt, (), _Out_opt_) -#define __out_ecount_opt(size) _SAL1_Source_(__out_ecount_opt, (size), _Out_writes_opt_(size)) -#define __out_bcount_opt(size) _SAL1_Source_(__out_bcount_opt, (size), _Out_writes_bytes_opt_(size)) -#define __out_ecount_part_opt(size,length) _SAL1_Source_(__out_ecount_part_opt, (size,length), __out_ecount_part(size,length) __pre_except_maybenull) -#define __out_bcount_part_opt(size,length) _SAL1_Source_(__out_bcount_part_opt, (size,length), __out_bcount_part(size,length) __pre_except_maybenull) -#define __out_ecount_full_opt(size) _SAL1_Source_(__out_ecount_full_opt, (size), __out_ecount_full(size) __pre_except_maybenull) -#define __out_bcount_full_opt(size) _SAL1_Source_(__out_bcount_full_opt, (size), __out_bcount_full(size) __pre_except_maybenull) -#define __out_ecount_z_opt(size) _SAL1_Source_(__out_ecount_z_opt, (size), __out_ecount_opt(size) __post __nullterminated) -#define __out_bcount_z_opt(size) _SAL1_Source_(__out_bcount_z_opt, (size), __out_bcount_opt(size) __post __nullterminated) -#define __out_ecount_part_z_opt(size,length) _SAL1_Source_(__out_ecount_part_z_opt, (size,length), __out_ecount_part_opt(size,length) __post __nullterminated) -#define __out_bcount_part_z_opt(size,length) _SAL1_Source_(__out_bcount_part_z_opt, (size,length), __out_bcount_part_opt(size,length) __post __nullterminated) -#define __out_ecount_full_z_opt(size) _SAL1_Source_(__out_ecount_full_z_opt, (size), __out_ecount_full_opt(size) __post __nullterminated) -#define __out_bcount_full_z_opt(size) _SAL1_Source_(__out_bcount_full_z_opt, (size), __out_bcount_full_opt(size) __post __nullterminated) -#define __out_ecount_nz_opt(size) _SAL1_Source_(__out_ecount_nz_opt, (size), __out_ecount_opt(size) __post 
__nullterminated) -#define __out_bcount_nz_opt(size) _SAL1_Source_(__out_bcount_nz_opt, (size), __out_bcount_opt(size) __post __nullterminated) -#define __inout_opt _SAL1_Source_(__inout_opt, (), _Inout_opt_) -#define __inout_ecount_opt(size) _SAL1_Source_(__inout_ecount_opt, (size), __inout_ecount(size) __pre_except_maybenull) -#define __inout_bcount_opt(size) _SAL1_Source_(__inout_bcount_opt, (size), __inout_bcount(size) __pre_except_maybenull) -#define __inout_ecount_part_opt(size,length) _SAL1_Source_(__inout_ecount_part_opt, (size,length), __inout_ecount_part(size,length) __pre_except_maybenull) -#define __inout_bcount_part_opt(size,length) _SAL1_Source_(__inout_bcount_part_opt, (size,length), __inout_bcount_part(size,length) __pre_except_maybenull) -#define __inout_ecount_full_opt(size) _SAL1_Source_(__inout_ecount_full_opt, (size), __inout_ecount_full(size) __pre_except_maybenull) -#define __inout_bcount_full_opt(size) _SAL1_Source_(__inout_bcount_full_opt, (size), __inout_bcount_full(size) __pre_except_maybenull) -#define __inout_z_opt _SAL1_Source_(__inout_z_opt, (), __inout_opt __pre __nullterminated __post __nullterminated) -#define __inout_ecount_z_opt(size) _SAL1_Source_(__inout_ecount_z_opt, (size), __inout_ecount_opt(size) __pre __nullterminated __post __nullterminated) -#define __inout_ecount_z_opt(size) _SAL1_Source_(__inout_ecount_z_opt, (size), __inout_ecount_opt(size) __pre __nullterminated __post __nullterminated) -#define __inout_bcount_z_opt(size) _SAL1_Source_(__inout_bcount_z_opt, (size), __inout_bcount_opt(size)) -#define __inout_nz_opt _SAL1_Source_(__inout_nz_opt, (), __inout_opt) -#define __inout_ecount_nz_opt(size) _SAL1_Source_(__inout_ecount_nz_opt, (size), __inout_ecount_opt(size)) -#define __inout_bcount_nz_opt(size) _SAL1_Source_(__inout_bcount_nz_opt, (size), __inout_bcount_opt(size)) -#define __deref_ecount(size) _SAL1_Source_(__deref_ecount, (size), _Notref_ __ecount(1) __post _Notref_ __elem_readableTo(1) __post _Notref_ __deref _Notref_ __notnull __post __deref __elem_writableTo(size)) -#define __deref_bcount(size) _SAL1_Source_(__deref_bcount, (size), _Notref_ __ecount(1) __post _Notref_ __elem_readableTo(1) __post _Notref_ __deref _Notref_ __notnull __post __deref __byte_writableTo(size)) -#define __deref_out _SAL1_Source_(__deref_out, (), _Outptr_) -#define __deref_out_ecount(size) _SAL1_Source_(__deref_out_ecount, (size), _Outptr_result_buffer_(size)) -#define __deref_out_bcount(size) _SAL1_Source_(__deref_out_bcount, (size), _Outptr_result_bytebuffer_(size)) -#define __deref_out_ecount_part(size,length) _SAL1_Source_(__deref_out_ecount_part, (size,length), _Outptr_result_buffer_to_(size,length)) -#define __deref_out_bcount_part(size,length) _SAL1_Source_(__deref_out_bcount_part, (size,length), _Outptr_result_bytebuffer_to_(size,length)) -#define __deref_out_ecount_full(size) _SAL1_Source_(__deref_out_ecount_full, (size), __deref_out_ecount_part(size,size)) -#define __deref_out_bcount_full(size) _SAL1_Source_(__deref_out_bcount_full, (size), __deref_out_bcount_part(size,size)) -#define __deref_out_z _SAL1_Source_(__deref_out_z, (), _Outptr_result_z_) -#define __deref_out_ecount_z(size) _SAL1_Source_(__deref_out_ecount_z, (size), __deref_out_ecount(size) __post __deref __nullterminated) -#define __deref_out_bcount_z(size) _SAL1_Source_(__deref_out_bcount_z, (size), __deref_out_bcount(size) __post __deref __nullterminated) -#define __deref_out_nz _SAL1_Source_(__deref_out_nz, (), __deref_out) -#define __deref_out_ecount_nz(size) 
_SAL1_Source_(__deref_out_ecount_nz, (size), __deref_out_ecount(size)) -#define __deref_out_bcount_nz(size) _SAL1_Source_(__deref_out_bcount_nz, (size), __deref_out_ecount(size)) -#define __deref_inout _SAL1_Source_(__deref_inout, (), _Notref_ __notnull _Notref_ __elem_readableTo(1) __pre __deref __valid __post _Notref_ __deref __valid __refparam) -#define __deref_inout_z _SAL1_Source_(__deref_inout_z, (), __deref_inout __pre __deref __nullterminated __post _Notref_ __deref __nullterminated) -#define __deref_inout_ecount(size) _SAL1_Source_(__deref_inout_ecount, (size), __deref_inout __pre __deref __elem_writableTo(size) __post _Notref_ __deref __elem_writableTo(size)) -#define __deref_inout_bcount(size) _SAL1_Source_(__deref_inout_bcount, (size), __deref_inout __pre __deref __byte_writableTo(size) __post _Notref_ __deref __byte_writableTo(size)) -#define __deref_inout_ecount_part(size,length) _SAL1_Source_(__deref_inout_ecount_part, (size,length), __deref_inout_ecount(size) __pre __deref __elem_readableTo(length) __post __deref __elem_readableTo(length)) -#define __deref_inout_bcount_part(size,length) _SAL1_Source_(__deref_inout_bcount_part, (size,length), __deref_inout_bcount(size) __pre __deref __byte_readableTo(length) __post __deref __byte_readableTo(length)) -#define __deref_inout_ecount_full(size) _SAL1_Source_(__deref_inout_ecount_full, (size), __deref_inout_ecount_part(size,size)) -#define __deref_inout_bcount_full(size) _SAL1_Source_(__deref_inout_bcount_full, (size), __deref_inout_bcount_part(size,size)) -#define __deref_inout_ecount_z(size) _SAL1_Source_(__deref_inout_ecount_z, (size), __deref_inout_ecount(size) __pre __deref __nullterminated __post __deref __nullterminated) -#define __deref_inout_bcount_z(size) _SAL1_Source_(__deref_inout_bcount_z, (size), __deref_inout_bcount(size) __pre __deref __nullterminated __post __deref __nullterminated) -#define __deref_inout_nz _SAL1_Source_(__deref_inout_nz, (), __deref_inout) -#define __deref_inout_ecount_nz(size) _SAL1_Source_(__deref_inout_ecount_nz, (size), __deref_inout_ecount(size)) -#define __deref_inout_bcount_nz(size) _SAL1_Source_(__deref_inout_bcount_nz, (size), __deref_inout_ecount(size)) -#define __deref_ecount_opt(size) _SAL1_Source_(__deref_ecount_opt, (size), __deref_ecount(size) __post_deref_except_maybenull) -#define __deref_bcount_opt(size) _SAL1_Source_(__deref_bcount_opt, (size), __deref_bcount(size) __post_deref_except_maybenull) -#define __deref_out_opt _SAL1_Source_(__deref_out_opt, (), __deref_out __post_deref_except_maybenull) -#define __deref_out_ecount_opt(size) _SAL1_Source_(__deref_out_ecount_opt, (size), __deref_out_ecount(size) __post_deref_except_maybenull) -#define __deref_out_bcount_opt(size) _SAL1_Source_(__deref_out_bcount_opt, (size), __deref_out_bcount(size) __post_deref_except_maybenull) -#define __deref_out_ecount_part_opt(size,length) _SAL1_Source_(__deref_out_ecount_part_opt, (size,length), __deref_out_ecount_part(size,length) __post_deref_except_maybenull) -#define __deref_out_bcount_part_opt(size,length) _SAL1_Source_(__deref_out_bcount_part_opt, (size,length), __deref_out_bcount_part(size,length) __post_deref_except_maybenull) -#define __deref_out_ecount_full_opt(size) _SAL1_Source_(__deref_out_ecount_full_opt, (size), __deref_out_ecount_full(size) __post_deref_except_maybenull) -#define __deref_out_bcount_full_opt(size) _SAL1_Source_(__deref_out_bcount_full_opt, (size), __deref_out_bcount_full(size) __post_deref_except_maybenull) -#define __deref_out_z_opt 
_SAL1_Source_(__deref_out_z_opt, (), _Outptr_result_maybenull_z_) -#define __deref_out_ecount_z_opt(size) _SAL1_Source_(__deref_out_ecount_z_opt, (size), __deref_out_ecount_opt(size) __post __deref __nullterminated) -#define __deref_out_bcount_z_opt(size) _SAL1_Source_(__deref_out_bcount_z_opt, (size), __deref_out_bcount_opt(size) __post __deref __nullterminated) -#define __deref_out_nz_opt _SAL1_Source_(__deref_out_nz_opt, (), __deref_out_opt) -#define __deref_out_ecount_nz_opt(size) _SAL1_Source_(__deref_out_ecount_nz_opt, (size), __deref_out_ecount_opt(size)) -#define __deref_out_bcount_nz_opt(size) _SAL1_Source_(__deref_out_bcount_nz_opt, (size), __deref_out_bcount_opt(size)) -#define __deref_inout_opt _SAL1_Source_(__deref_inout_opt, (), __deref_inout __pre_deref_except_maybenull __post_deref_except_maybenull) -#define __deref_inout_ecount_opt(size) _SAL1_Source_(__deref_inout_ecount_opt, (size), __deref_inout_ecount(size) __pre_deref_except_maybenull __post_deref_except_maybenull) -#define __deref_inout_bcount_opt(size) _SAL1_Source_(__deref_inout_bcount_opt, (size), __deref_inout_bcount(size) __pre_deref_except_maybenull __post_deref_except_maybenull) -#define __deref_inout_ecount_part_opt(size,length) _SAL1_Source_(__deref_inout_ecount_part_opt, (size,length), __deref_inout_ecount_part(size,length) __pre_deref_except_maybenull __post_deref_except_maybenull) -#define __deref_inout_bcount_part_opt(size,length) _SAL1_Source_(__deref_inout_bcount_part_opt, (size,length), __deref_inout_bcount_part(size,length) __pre_deref_except_maybenull __post_deref_except_maybenull) -#define __deref_inout_ecount_full_opt(size) _SAL1_Source_(__deref_inout_ecount_full_opt, (size), __deref_inout_ecount_full(size) __pre_deref_except_maybenull __post_deref_except_maybenull) -#define __deref_inout_bcount_full_opt(size) _SAL1_Source_(__deref_inout_bcount_full_opt, (size), __deref_inout_bcount_full(size) __pre_deref_except_maybenull __post_deref_except_maybenull) -#define __deref_inout_z_opt _SAL1_Source_(__deref_inout_z_opt, (), __deref_inout_opt __pre __deref __nullterminated __post __deref __nullterminated) -#define __deref_inout_ecount_z_opt(size) _SAL1_Source_(__deref_inout_ecount_z_opt, (size), __deref_inout_ecount_opt(size) __pre __deref __nullterminated __post __deref __nullterminated) -#define __deref_inout_bcount_z_opt(size) _SAL1_Source_(__deref_inout_bcount_z_opt, (size), __deref_inout_bcount_opt(size) __pre __deref __nullterminated __post __deref __nullterminated) -#define __deref_inout_nz_opt _SAL1_Source_(__deref_inout_nz_opt, (), __deref_inout_opt) -#define __deref_inout_ecount_nz_opt(size) _SAL1_Source_(__deref_inout_ecount_nz_opt, (size), __deref_inout_ecount_opt(size)) -#define __deref_inout_bcount_nz_opt(size) _SAL1_Source_(__deref_inout_bcount_nz_opt, (size), __deref_inout_bcount_opt(size)) -#define __deref_opt_ecount(size) _SAL1_Source_(__deref_opt_ecount, (size), __deref_ecount(size) __pre_except_maybenull) -#define __deref_opt_bcount(size) _SAL1_Source_(__deref_opt_bcount, (size), __deref_bcount(size) __pre_except_maybenull) -#define __deref_opt_out _SAL1_Source_(__deref_opt_out, (), _Outptr_opt_) -#define __deref_opt_out_z _SAL1_Source_(__deref_opt_out_z, (), _Outptr_opt_result_z_) -#define __deref_opt_out_ecount(size) _SAL1_Source_(__deref_opt_out_ecount, (size), __deref_out_ecount(size) __pre_except_maybenull) -#define __deref_opt_out_bcount(size) _SAL1_Source_(__deref_opt_out_bcount, (size), __deref_out_bcount(size) __pre_except_maybenull) -#define 
__deref_opt_out_ecount_part(size,length) _SAL1_Source_(__deref_opt_out_ecount_part, (size,length), __deref_out_ecount_part(size,length) __pre_except_maybenull) -#define __deref_opt_out_bcount_part(size,length) _SAL1_Source_(__deref_opt_out_bcount_part, (size,length), __deref_out_bcount_part(size,length) __pre_except_maybenull) -#define __deref_opt_out_ecount_full(size) _SAL1_Source_(__deref_opt_out_ecount_full, (size), __deref_out_ecount_full(size) __pre_except_maybenull) -#define __deref_opt_out_bcount_full(size) _SAL1_Source_(__deref_opt_out_bcount_full, (size), __deref_out_bcount_full(size) __pre_except_maybenull) -#define __deref_opt_inout _SAL1_Source_(__deref_opt_inout, (), _Inout_opt_) -#define __deref_opt_inout_ecount(size) _SAL1_Source_(__deref_opt_inout_ecount, (size), __deref_inout_ecount(size) __pre_except_maybenull) -#define __deref_opt_inout_bcount(size) _SAL1_Source_(__deref_opt_inout_bcount, (size), __deref_inout_bcount(size) __pre_except_maybenull) -#define __deref_opt_inout_ecount_part(size,length) _SAL1_Source_(__deref_opt_inout_ecount_part, (size,length), __deref_inout_ecount_part(size,length) __pre_except_maybenull) -#define __deref_opt_inout_bcount_part(size,length) _SAL1_Source_(__deref_opt_inout_bcount_part, (size,length), __deref_inout_bcount_part(size,length) __pre_except_maybenull) -#define __deref_opt_inout_ecount_full(size) _SAL1_Source_(__deref_opt_inout_ecount_full, (size), __deref_inout_ecount_full(size) __pre_except_maybenull) -#define __deref_opt_inout_bcount_full(size) _SAL1_Source_(__deref_opt_inout_bcount_full, (size), __deref_inout_bcount_full(size) __pre_except_maybenull) -#define __deref_opt_inout_z _SAL1_Source_(__deref_opt_inout_z, (), __deref_opt_inout __pre __deref __nullterminated __post __deref __nullterminated) -#define __deref_opt_inout_ecount_z(size) _SAL1_Source_(__deref_opt_inout_ecount_z, (size), __deref_opt_inout_ecount(size) __pre __deref __nullterminated __post __deref __nullterminated) -#define __deref_opt_inout_bcount_z(size) _SAL1_Source_(__deref_opt_inout_bcount_z, (size), __deref_opt_inout_bcount(size) __pre __deref __nullterminated __post __deref __nullterminated) -#define __deref_opt_inout_nz _SAL1_Source_(__deref_opt_inout_nz, (), __deref_opt_inout) -#define __deref_opt_inout_ecount_nz(size) _SAL1_Source_(__deref_opt_inout_ecount_nz, (size), __deref_opt_inout_ecount(size)) -#define __deref_opt_inout_bcount_nz(size) _SAL1_Source_(__deref_opt_inout_bcount_nz, (size), __deref_opt_inout_bcount(size)) -#define __deref_opt_ecount_opt(size) _SAL1_Source_(__deref_opt_ecount_opt, (size), __deref_ecount_opt(size) __pre_except_maybenull) -#define __deref_opt_bcount_opt(size) _SAL1_Source_(__deref_opt_bcount_opt, (size), __deref_bcount_opt(size) __pre_except_maybenull) -#define __deref_opt_out_opt _SAL1_Source_(__deref_opt_out_opt, (), _Outptr_opt_result_maybenull_) -#define __deref_opt_out_ecount_opt(size) _SAL1_Source_(__deref_opt_out_ecount_opt, (size), __deref_out_ecount_opt(size) __pre_except_maybenull) -#define __deref_opt_out_bcount_opt(size) _SAL1_Source_(__deref_opt_out_bcount_opt, (size), __deref_out_bcount_opt(size) __pre_except_maybenull) -#define __deref_opt_out_ecount_part_opt(size,length) _SAL1_Source_(__deref_opt_out_ecount_part_opt, (size,length), __deref_out_ecount_part_opt(size,length) __pre_except_maybenull) -#define __deref_opt_out_bcount_part_opt(size,length) _SAL1_Source_(__deref_opt_out_bcount_part_opt, (size,length), __deref_out_bcount_part_opt(size,length) __pre_except_maybenull) -#define 
__deref_opt_out_ecount_full_opt(size) _SAL1_Source_(__deref_opt_out_ecount_full_opt, (size), __deref_out_ecount_full_opt(size) __pre_except_maybenull) -#define __deref_opt_out_bcount_full_opt(size) _SAL1_Source_(__deref_opt_out_bcount_full_opt, (size), __deref_out_bcount_full_opt(size) __pre_except_maybenull) -#define __deref_opt_out_z_opt _SAL1_Source_(__deref_opt_out_z_opt, (), __post __deref __valid __refparam __pre_except_maybenull __pre_deref_except_maybenull __post_deref_except_maybenull __post __deref __nullterminated) -#define __deref_opt_out_ecount_z_opt(size) _SAL1_Source_(__deref_opt_out_ecount_z_opt, (size), __deref_opt_out_ecount_opt(size) __post __deref __nullterminated) -#define __deref_opt_out_bcount_z_opt(size) _SAL1_Source_(__deref_opt_out_bcount_z_opt, (size), __deref_opt_out_bcount_opt(size) __post __deref __nullterminated) -#define __deref_opt_out_nz_opt _SAL1_Source_(__deref_opt_out_nz_opt, (), __deref_opt_out_opt) -#define __deref_opt_out_ecount_nz_opt(size) _SAL1_Source_(__deref_opt_out_ecount_nz_opt, (size), __deref_opt_out_ecount_opt(size)) -#define __deref_opt_out_bcount_nz_opt(size) _SAL1_Source_(__deref_opt_out_bcount_nz_opt, (size), __deref_opt_out_bcount_opt(size)) -#define __deref_opt_inout_opt _SAL1_Source_(__deref_opt_inout_opt, (), __deref_inout_opt __pre_except_maybenull) -#define __deref_opt_inout_ecount_opt(size) _SAL1_Source_(__deref_opt_inout_ecount_opt, (size), __deref_inout_ecount_opt(size) __pre_except_maybenull) -#define __deref_opt_inout_bcount_opt(size) _SAL1_Source_(__deref_opt_inout_bcount_opt, (size), __deref_inout_bcount_opt(size) __pre_except_maybenull) -#define __deref_opt_inout_ecount_part_opt(size,length) _SAL1_Source_(__deref_opt_inout_ecount_part_opt, (size,length), __deref_inout_ecount_part_opt(size,length) __pre_except_maybenull) -#define __deref_opt_inout_bcount_part_opt(size,length) _SAL1_Source_(__deref_opt_inout_bcount_part_opt, (size,length), __deref_inout_bcount_part_opt(size,length) __pre_except_maybenull) -#define __deref_opt_inout_ecount_full_opt(size) _SAL1_Source_(__deref_opt_inout_ecount_full_opt, (size), __deref_inout_ecount_full_opt(size) __pre_except_maybenull) -#define __deref_opt_inout_bcount_full_opt(size) _SAL1_Source_(__deref_opt_inout_bcount_full_opt, (size), __deref_inout_bcount_full_opt(size) __pre_except_maybenull) -#define __deref_opt_inout_z_opt _SAL1_Source_(__deref_opt_inout_z_opt, (), __deref_opt_inout_opt __pre __deref __nullterminated __post __deref __nullterminated) -#define __deref_opt_inout_ecount_z_opt(size) _SAL1_Source_(__deref_opt_inout_ecount_z_opt, (size), __deref_opt_inout_ecount_opt(size) __pre __deref __nullterminated __post __deref __nullterminated) -#define __deref_opt_inout_bcount_z_opt(size) _SAL1_Source_(__deref_opt_inout_bcount_z_opt, (size), __deref_opt_inout_bcount_opt(size) __pre __deref __nullterminated __post __deref __nullterminated) -#define __deref_opt_inout_nz_opt _SAL1_Source_(__deref_opt_inout_nz_opt, (), __deref_opt_inout_opt) -#define __deref_opt_inout_ecount_nz_opt(size) _SAL1_Source_(__deref_opt_inout_ecount_nz_opt, (size), __deref_opt_inout_ecount_opt(size)) -#define __deref_opt_inout_bcount_nz_opt(size) _SAL1_Source_(__deref_opt_inout_bcount_nz_opt, (size), __deref_opt_inout_bcount_opt(size)) +#if !defined(_Post_invalid_) +#define _Post_invalid_ +#endif -/* -------------------------------------------------------------------------------- -Advanced Annotation Definitions +#if !defined(_Post_valid_) +#define _Post_valid_ +#endif -Any of these may be used to directly 
annotate functions, and may be used in -combination with each other or with regular buffer macros. For an explanation -of each annotation, see the advanced annotations section. -------------------------------------------------------------------------------- -*/ +#if !defined(_Pre_notnull_) +#define _Pre_notnull_ +#endif -#define __success(expr) _Success_(expr) -#define __nullterminated _Null_terminated_ -#define __nullnullterminated -#define __clr_reserved _SAL1_Source_(__reserved, (), _Reserved_) -#define __checkReturn _SAL1_Source_(__checkReturn, (), _Check_return_) -#define __typefix(ctype) _SAL1_Source_(__typefix, (ctype), __inner_typefix(ctype)) -#define __override __inner_override -#define __callback __inner_callback -#define __format_string _Printf_format_string_ -#define __blocksOn(resource) __inner_blocksOn(resource) -#define __control_entrypoint(category) __inner_control_entrypoint(category) -#define __data_entrypoint(category) __inner_data_entrypoint(category) -#define __useHeader _Use_decl_anno_impl_ -#define __on_failure(annotes) _On_failure_impl_(annotes _SAL_nop_impl_) - -#ifndef __has_cpp_attribute -#define __has_cpp_attribute(x) (0) -#endif - -#ifndef __fallthrough // [ -#if __has_cpp_attribute(fallthrough) -#define __fallthrough [[fallthrough]] -#else -#define __fallthrough -#endif -#endif // ] - -#ifndef __analysis_assume // [ -#ifdef _PREFAST_ // [ -#define __analysis_assume(expr) __assume(expr) -#else // ][ -#define __analysis_assume(expr) -#endif // ] -#endif // ] - -#ifndef _Analysis_assume_ // [ -#ifdef _PREFAST_ // [ -#define _Analysis_assume_(expr) __assume(expr) -#else // ][ -#define _Analysis_assume_(expr) -#endif // ] -#endif // ] +#if !defined(_When_) +#define _When_(expr1, expr2) +#endif -#define _Analysis_noreturn_ _SAL2_Source_(_Analysis_noreturn_, (), _SA_annotes0(SAL_terminates)) +#define _Deref_pre_z_ -#ifdef _PREFAST_ // [ -__inline __nothrow -void __AnalysisAssumeNullterminated(_Post_ __nullterminated void *p); +#if !defined(_Inout_updates_) +#define _Inout_updates_(count) +#endif -#define _Analysis_assume_nullterminated_(x) __AnalysisAssumeNullterminated(x) -#else // ][ -#define _Analysis_assume_nullterminated_(x) -#endif // ] +#if !defined(_Out_writes_opt_z_) +#define _Out_writes_opt_z_(count) +#endif -// -// Set the analysis mode (global flags to analysis). -// They take effect at the point of declaration; use at global scope -// as a declaration. -// +#if !defined(_Out_cap_post_count_) +#define _Out_cap_post_count_(maxLen,used) +#endif -// Synthesize a unique symbol. -#define ___MKID(x, y) x ## y -#define __MKID(x, y) ___MKID(x, y) -#define __GENSYM(x) __MKID(x, __COUNTER__) +#if !defined(_Inout_count_) +#define _Inout_count_(size) +#endif -__ANNOTATION(SAL_analysisMode(__AuToQuOtE __In_impl_ char *mode);) +#if !defined(_Post_equal_to_) +#define _Post_equal_to_(expr) +#endif -#define _Analysis_mode_impl_(mode) _SA_annotes1(SAL_analysisMode, #mode) +#if !defined(_Field_size_bytes_) +# define _Field_size_bytes_(count) +#endif -#define _Analysis_mode_(mode) \ - typedef _Analysis_mode_impl_(mode) int \ - __GENSYM(__prefast_analysis_mode_flag); +#if !defined(_Field_range_) +# define _Field_range_(count, capacity) +#endif -// The following are predefined: -// _Analysis_operator_new_throw_ (operator new throws) -// _Analysis_operator_new_null_ (operator new returns null) -// _Analysis_operator_new_never_fails_ (operator new never fails) -// +#if !defined(_Post_z_) +# define _Post_z_ +#endif -// Function class annotations. 
-__ANNOTATION(SAL_functionClassNew(__In_impl_ char*);) -__PRIMOP(int, _In_function_class_(__In_impl_ char*);) -#define _In_function_class_(x) _In_function_class_(#x) +#if !defined(_Outptr_result_buffer_) +# define _Outptr_result_buffer_(count) +#endif -#define _Function_class_(x) _SA_annotes1(SAL_functionClassNew, #x) +#if !defined(_Field_size_) +# define _Field_size_(count) +#endif -/* - * interlocked operand used in interlocked instructions - */ -//#define _Interlocked_operand_ _Pre_ _SA_annotes0(SAL_interlocked) +#if !defined(_Always_) +# define _Always_(expr) +#endif -#define _Enum_is_bitflag_ _SA_annotes0(SAL_enumIsBitflag) -#define _Strict_type_match_ _SA_annotes0(SAL_strictType2) +#if !defined(_Readable_bytes_) +# define _Readable_bytes_(count) +#endif -#define _Maybe_raises_SEH_exception_ _Pre_ _SA_annotes1(SAL_inTry,__yes) -#define _Raises_SEH_exception_ _Group_(_Maybe_raises_SEH_exception_ _Analysis_noreturn_) +#if !defined(_Outptr_result_buffer_maybenull_) +# define _Outptr_result_buffer_maybenull_(count) +#endif -#ifdef __cplusplus // [ -} -#endif // ] +#endif /* _SAL_h */ diff --git a/src/thirdparty/sse_mathfun/sse_mathfun.h b/src/thirdparty/sse_mathfun/sse_mathfun.h new file mode 100644 index 000000000..24c3394a9 --- /dev/null +++ b/src/thirdparty/sse_mathfun/sse_mathfun.h @@ -0,0 +1,710 @@ +/* SIMD (SSE1+MMX or SSE2) implementation of sin, cos, exp and log + + Inspired by Intel Approximate Math library, and based on the + corresponding algorithms of the cephes math library + + The default is to use the SSE1 version. If you define USE_SSE2 the + the SSE2 intrinsics will be used in place of the MMX intrinsics. Do + not expect any significant performance improvement with SSE2. +*/ + +/* Copyright (C) 2007 Julien Pommier + + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you must not + claim that you wrote the original software. If you use this software + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + 2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original software. + 3. This notice may not be removed or altered from any source distribution. + + (this is the zlib license) +*/ + +#include + +/* yes I know, the top of this file is quite ugly */ + +#ifdef _MSC_VER /* visual c++ */ +# define ALIGN16_BEG __declspec(align(16)) +# define ALIGN16_END +#else /* gcc or icc */ +# define ALIGN16_BEG +# define ALIGN16_END __attribute__((aligned(16))) +#endif + +/* __m128 is ugly to write */ +typedef __m128 v4sf; // vector of 4 float (sse1) + +#ifdef USE_SSE2 +# include +typedef __m128i v4si; // vector of 4 int (sse2) +#else +typedef __m64 v2si; // vector of 2 int (mmx) +#endif + +/* declare some SSE constants -- why can't I figure a better way to do that? 
*/ +#define _PS_CONST(Name, Val) \ + static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = { (float)(Val), (float)(Val), (float)(Val), (float)(Val) } +#define _PI32_CONST(Name, Val) \ + static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END = { (Val), (Val), (Val), (Val) } +#define _PS_CONST_TYPE(Name, Type, Val) \ + static const ALIGN16_BEG Type _ps_##Name[4] ALIGN16_END = { (Type)(Val), (Type)(Val), (Type)(Val), (Type)(Val) } + +_PS_CONST(1 , 1.0f); +_PS_CONST(0p5, 0.5f); +/* the smallest non denormalized float number */ +_PS_CONST_TYPE(min_norm_pos, int, 0x00800000); +_PS_CONST_TYPE(mant_mask, int, 0x7f800000); +_PS_CONST_TYPE(inv_mant_mask, int, ~0x7f800000); + +_PS_CONST_TYPE(sign_mask, int, (int)0x80000000); +_PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000); + +_PI32_CONST(1, 1); +_PI32_CONST(inv1, ~1); +_PI32_CONST(2, 2); +_PI32_CONST(4, 4); +_PI32_CONST(0x7f, 0x7f); + +_PS_CONST(cephes_SQRTHF, 0.707106781186547524); +_PS_CONST(cephes_log_p0, 7.0376836292E-2); +_PS_CONST(cephes_log_p1, - 1.1514610310E-1); +_PS_CONST(cephes_log_p2, 1.1676998740E-1); +_PS_CONST(cephes_log_p3, -1.2420140846E-1); +_PS_CONST(cephes_log_p4, +1.4249322787E-1); +_PS_CONST(cephes_log_p5, -1.6668057665E-1); +_PS_CONST(cephes_log_p6, +2.0000714765E-1); +_PS_CONST(cephes_log_p7, -2.4999993993E-1); +_PS_CONST(cephes_log_p8, +3.3333331174E-1); +_PS_CONST(cephes_log_q1, -2.12194440e-4); +_PS_CONST(cephes_log_q2, 0.693359375); + +#ifndef USE_SSE2 +typedef union xmm_mm_union { + __m128 xmm; + __m64 mm[2]; +} xmm_mm_union; + +#define COPY_XMM_TO_MM(xmm_, mm0_, mm1_) { \ + xmm_mm_union u; u.xmm = xmm_; \ + mm0_ = u.mm[0]; \ + mm1_ = u.mm[1]; \ +} + +#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) { \ + xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm; \ + } + +#endif // USE_SSE2 + +/* natural logarithm computed for 4 simultaneous float + return NaN for x <= 0 +*/ +inline v4sf log_ps(v4sf x) { +#ifdef USE_SSE2 + v4si emm0; +#else + v2si mm0, mm1; +#endif + v4sf one = *(v4sf*)_ps_1; + + v4sf invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps()); + + x = _mm_max_ps(x, *(v4sf*)_ps_min_norm_pos); /* cut off denormalized stuff */ + +#ifndef USE_SSE2 + /* part 1: x = frexpf(x, &e); */ + COPY_XMM_TO_MM(x, mm0, mm1); + mm0 = _mm_srli_pi32(mm0, 23); + mm1 = _mm_srli_pi32(mm1, 23); +#else + emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23); +#endif + /* keep only the fractional part */ + x = _mm_and_ps(x, *(v4sf*)_ps_inv_mant_mask); + x = _mm_or_ps(x, *(v4sf*)_ps_0p5); + +#ifndef USE_SSE2 + /* now e=mm0:mm1 contain the really base-2 exponent */ + mm0 = _mm_sub_pi32(mm0, *(v2si*)_pi32_0x7f); + mm1 = _mm_sub_pi32(mm1, *(v2si*)_pi32_0x7f); + v4sf e = _mm_cvtpi32x2_ps(mm0, mm1); + _mm_empty(); /* bye bye mmx */ +#else + emm0 = _mm_sub_epi32(emm0, *(v4si*)_pi32_0x7f); + v4sf e = _mm_cvtepi32_ps(emm0); +#endif + + e = _mm_add_ps(e, one); + + /* part2: + if( x < SQRTHF ) { + e -= 1; + x = x + x - 1.0; + } else { x = x - 1.0; } + */ + v4sf mask = _mm_cmplt_ps(x, *(v4sf*)_ps_cephes_SQRTHF); + v4sf tmp = _mm_and_ps(x, mask); + x = _mm_sub_ps(x, one); + e = _mm_sub_ps(e, _mm_and_ps(one, mask)); + x = _mm_add_ps(x, tmp); + + + v4sf z = _mm_mul_ps(x,x); + + v4sf y = *(v4sf*)_ps_cephes_log_p0; + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p1); + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p2); + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p3); + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p4); + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, 
*(v4sf*)_ps_cephes_log_p5); + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p6); + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p7); + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p8); + y = _mm_mul_ps(y, x); + + y = _mm_mul_ps(y, z); + + + tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q1); + y = _mm_add_ps(y, tmp); + + + tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5); + y = _mm_sub_ps(y, tmp); + + tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q2); + x = _mm_add_ps(x, y); + x = _mm_add_ps(x, tmp); + x = _mm_or_ps(x, invalid_mask); // negative arg will be NAN + return x; +} + +_PS_CONST(exp_hi, 88.3762626647949f); +_PS_CONST(exp_lo, -88.3762626647949f); + +_PS_CONST(cephes_LOG2EF, 1.44269504088896341); +_PS_CONST(cephes_exp_C1, 0.693359375); +_PS_CONST(cephes_exp_C2, -2.12194440e-4); + +_PS_CONST(cephes_exp_p0, 1.9875691500E-4); +_PS_CONST(cephes_exp_p1, 1.3981999507E-3); +_PS_CONST(cephes_exp_p2, 8.3334519073E-3); +_PS_CONST(cephes_exp_p3, 4.1665795894E-2); +_PS_CONST(cephes_exp_p4, 1.6666665459E-1); +_PS_CONST(cephes_exp_p5, 5.0000001201E-1); + +inline v4sf exp_ps(v4sf x) { + v4sf tmp = _mm_setzero_ps(), fx; +#ifdef USE_SSE2 + v4si emm0; +#else + v2si mm0, mm1; +#endif + v4sf one = *(v4sf*)_ps_1; + + x = _mm_min_ps(x, *(v4sf*)_ps_exp_hi); + x = _mm_max_ps(x, *(v4sf*)_ps_exp_lo); + + /* express exp(x) as exp(g + n*log(2)) */ + fx = _mm_mul_ps(x, *(v4sf*)_ps_cephes_LOG2EF); + fx = _mm_add_ps(fx, *(v4sf*)_ps_0p5); + + /* how to perform a floorf with SSE: just below */ +#ifndef USE_SSE2 + /* step 1 : cast to int */ + tmp = _mm_movehl_ps(tmp, fx); + mm0 = _mm_cvttps_pi32(fx); + mm1 = _mm_cvttps_pi32(tmp); + /* step 2 : cast back to float */ + tmp = _mm_cvtpi32x2_ps(mm0, mm1); +#else + emm0 = _mm_cvttps_epi32(fx); + tmp = _mm_cvtepi32_ps(emm0); +#endif + /* if greater, substract 1 */ + v4sf mask = _mm_cmpgt_ps(tmp, fx); + mask = _mm_and_ps(mask, one); + fx = _mm_sub_ps(tmp, mask); + + tmp = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C1); + v4sf z = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C2); + x = _mm_sub_ps(x, tmp); + x = _mm_sub_ps(x, z); + + z = _mm_mul_ps(x,x); + + v4sf y = *(v4sf*)_ps_cephes_exp_p0; + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p1); + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p2); + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p3); + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p4); + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p5); + y = _mm_mul_ps(y, z); + y = _mm_add_ps(y, x); + y = _mm_add_ps(y, one); + + /* build 2^n */ +#ifndef USE_SSE2 + z = _mm_movehl_ps(z, fx); + mm0 = _mm_cvttps_pi32(fx); + mm1 = _mm_cvttps_pi32(z); + mm0 = _mm_add_pi32(mm0, *(v2si*)_pi32_0x7f); + mm1 = _mm_add_pi32(mm1, *(v2si*)_pi32_0x7f); + mm0 = _mm_slli_pi32(mm0, 23); + mm1 = _mm_slli_pi32(mm1, 23); + + v4sf pow2n; + COPY_MM_TO_XMM(mm0, mm1, pow2n); + _mm_empty(); +#else + emm0 = _mm_cvttps_epi32(fx); + emm0 = _mm_add_epi32(emm0, *(v4si*)_pi32_0x7f); + emm0 = _mm_slli_epi32(emm0, 23); + v4sf pow2n = _mm_castsi128_ps(emm0); +#endif + y = _mm_mul_ps(y, pow2n); + return y; +} + +_PS_CONST(minus_cephes_DP1, -0.78515625); +_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4); +_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8); +_PS_CONST(sincof_p0, -1.9515295891E-4); +_PS_CONST(sincof_p1, 8.3321608736E-3); +_PS_CONST(sincof_p2, -1.6666654611E-1); +_PS_CONST(coscof_p0, 2.443315711809948E-005); +_PS_CONST(coscof_p1, -1.388731625493765E-003); 
+_PS_CONST(coscof_p2, 4.166664568298827E-002); +_PS_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI + + +/* evaluation of 4 sines at onces, using only SSE1+MMX intrinsics so + it runs also on old athlons XPs and the pentium III of your grand + mother. + + The code is the exact rewriting of the cephes sinf function. + Precision is excellent as long as x < 8192 (I did not bother to + take into account the special handling they have for greater values + -- it does not return garbage for arguments over 8192, though, but + the extra precision is missing). + + Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the + surprising but correct result. + + Performance is also surprisingly good, 1.33 times faster than the + macos vsinf SSE2 function, and 1.5 times faster than the + __vrs4_sinf of amd's ACML (which is only available in 64 bits). Not + too bad for an SSE1 function (with no special tuning) ! + However the latter libraries probably have a much better handling of NaN, + Inf, denormalized and other special arguments.. + + On my core 1 duo, the execution of this function takes approximately 95 cycles. + + From what I have observed on the experiments with Intel AMath lib, switching to an + SSE2 version would improve the perf by only 10%. + + Since it is based on SSE intrinsics, it has to be compiled at -O2 to + deliver full speed. +*/ +inline v4sf sin_ps(v4sf x) { // any x + v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y; + +#ifdef USE_SSE2 + v4si emm0, emm2; +#else + v2si mm0, mm1, mm2, mm3; +#endif + sign_bit = x; + /* take the absolute value */ + x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask); + /* extract the sign bit (upper one) */ + sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask); + + /* scale by 4/Pi */ + y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI); + +#ifdef USE_SSE2 + /* store the integer part of y in mm0 */ + emm2 = _mm_cvttps_epi32(y); + /* j=(j+1) & (~1) (see the cephes sources) */ + emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1); + emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1); + y = _mm_cvtepi32_ps(emm2); + + /* get the swap sign flag */ + emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4); + emm0 = _mm_slli_epi32(emm0, 29); + /* get the polynom selection mask + there is one polynom for 0 <= x <= Pi/4 + and another one for Pi/4CreateCmdLine( argc, argv ); InitDefaultFileSystem(); InstallProgressReportHandler( PrintFReportHandler ); diff --git a/src/tier3/mdlutils.cpp b/src/tier3/mdlutils.cpp index afe872941..553a297f3 100644 --- a/src/tier3/mdlutils.cpp +++ b/src/tier3/mdlutils.cpp @@ -292,7 +292,7 @@ void CMDL::SetUpBones( const matrix3x4_t& rootToWorld, int nMaxBoneCount, matrix flCycle -= (int)(flCycle); Vector pos[MAXSTUDIOBONES]; - Quaternion q[MAXSTUDIOBONES]; + QuaternionAligned q[MAXSTUDIOBONES]; IBoneSetup boneSetup( &studioHdr, BONE_USED_BY_ANYTHING_AT_LOD( m_nLOD ), pPoseParameter, NULL ); boneSetup.InitPose( pos, q ); @@ -420,7 +420,7 @@ void CMDL::SetupBonesWithBoneMerge( const CStudioHdr *pMergeHdr, matrix3x4_t *pM flCycle -= (int)(flCycle); Vector pos[MAXSTUDIOBONES]; - Quaternion q[MAXSTUDIOBONES]; + QuaternionAligned q[MAXSTUDIOBONES]; IBoneSetup boneSetup( pMergeHdr, BONE_USED_BY_ANYTHING_AT_LOD( m_nLOD ), pPoseParameter ); boneSetup.InitPose( pos, q ); From b5a943fe822ee0a19a1e5d94887fbd281aa4a0fb Mon Sep 17 00:00:00 2001 From: mastercoms Date: Sun, 5 Mar 2023 00:20:04 -0500 Subject: [PATCH 06/42] perf: backport CalcBones optimizations from CS:GO * lower LODs are now marked as set up * align parent transform matrix --- 
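A rough sketch of what the first bullet relies on (illustrative only -- BONE_USED_BY_VERTEX_LOD0 and MAX_NUM_LODS are the studio.h names; lower LODs always use a subset of the bones of higher LODs, so every per-LOD bit above the first one requested can be marked as satisfied by the same setup):

// Not a new API -- this just restates the logic the hunk below adds to C_BaseAnimating::SetupBones.
int PropagateLowerLodBoneBits( int boneMask )
{
	int nLOD = 0;
	int nMask = BONE_USED_BY_VERTEX_LOD0;	// first per-LOD "used by vertex" bit
	for ( ; nLOD < MAX_NUM_LODS; ++nLOD, nMask <<= 1 )
	{
		if ( boneMask & nMask )
			break;			// highest-detail LOD actually requested
	}
	for ( ; nLOD < MAX_NUM_LODS; ++nLOD, nMask <<= 1 )
	{
		boneMask |= nMask;	// mark every lower-detail LOD as set up too
	}
	return boneMask;
}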
src/game/client/c_baseanimating.cpp | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/src/game/client/c_baseanimating.cpp b/src/game/client/c_baseanimating.cpp index c78f8582f..a6dda9b98 100644 --- a/src/game/client/c_baseanimating.cpp +++ b/src/game/client/c_baseanimating.cpp @@ -2837,6 +2837,20 @@ bool C_BaseAnimating::SetupBones( matrix3x4_t *pBoneToWorldOut, int nMaxBones, i } } + // If we're setting up LOD N, we have set up all lower LODs also + // because lower LODs always use subsets of the bones of higher LODs. + int nLOD = 0; + int nMask = BONE_USED_BY_VERTEX_LOD0; + for( ; nLOD < MAX_NUM_LODS; ++nLOD, nMask <<= 1 ) + { + if ( boneMask & nMask ) + break; + } + for( ; nLOD < MAX_NUM_LODS; ++nLOD, nMask <<= 1 ) + { + boneMask |= nMask; + } + #ifdef DEBUG_BONE_SETUP_THREADING if ( cl_warn_thread_contested_bone_setup.GetBool() ) { @@ -2904,7 +2918,7 @@ bool C_BaseAnimating::SetupBones( matrix3x4_t *pBoneToWorldOut, int nMaxBones, i return false; // Setup our transform based on render angles and origin. - matrix3x4_t parentTransform; + ALIGN16 matrix3x4_t parentTransform ALIGN16_POST; AngleMatrix( GetRenderAngles(), GetRenderOrigin(), parentTransform ); // Load the boneMask with the total of what was asked for last frame. @@ -2974,6 +2988,7 @@ bool C_BaseAnimating::SetupBones( matrix3x4_t *pBoneToWorldOut, int nMaxBones, i StandardBlendingRules( hdr, pos, q, currentTime, bonesMaskNeedRecalc ); CBoneBitList boneComputed; + // don't calculate IK on ragdolls if ( m_pIk && !IsRagdoll() ) { From 712ff518b37adb4b42550f36a0a6386b37e10a96 Mon Sep 17 00:00:00 2001 From: mastercoms Date: Sun, 5 Mar 2023 00:22:13 -0500 Subject: [PATCH 07/42] perf: add animation attachment deferral this skips a pre-mature/extra bone setup during particle simulation and flexing by allowing for the last frame's attachment position to be used technically this is a bit of a hack but it works well from my analysis. it gets rid of almost all of the particle cost in team fights besides sprite rendering --- src/game/client/c_baseanimating.cpp | 32 +++++++++++++++++++++--- src/game/client/c_baseanimating.h | 1 + src/game/client/c_baseflex.cpp | 2 +- src/game/client/tf/c_tf_player.cpp | 15 +++++++++++ src/game/shared/baseviewmodel_shared.cpp | 9 +++++++ src/game/shared/baseviewmodel_shared.h | 1 + src/game/shared/econ/econ_entity.cpp | 8 ++++++ src/game/shared/econ/econ_entity.h | 1 + src/game/shared/particle_property.cpp | 4 +-- 9 files changed, 66 insertions(+), 7 deletions(-) diff --git a/src/game/client/c_baseanimating.cpp b/src/game/client/c_baseanimating.cpp index a6dda9b98..3fdc65dcf 100644 --- a/src/game/client/c_baseanimating.cpp +++ b/src/game/client/c_baseanimating.cpp @@ -1096,6 +1096,7 @@ CStudioHdr *C_BaseAnimating::OnNewModel() } } m_BoneAccessor.Init( this, m_CachedBoneData.Base() ); // Always call this in case the studiohdr_t has changed. + m_iAccumulatedBoneMask = 0; // Reset the accumulated bone mask. 
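	// Presumably needed because of the deferral change later in this patch: the accumulated
	// mask now keeps its BONE_USED_BY_ATTACHMENT bit across frames, so a model swap should
	// start from a clean mask rather than inherit the previous model's bits.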
// Free any IK data if (m_pIk) @@ -2033,18 +2034,24 @@ bool C_BaseAnimating::PutAttachment( int number, const matrix3x4_t &attachmentTo return false; CAttachmentData *pAtt = &m_Attachments[number-1]; - if ( gpGlobals->frametime > 0 && pAtt->m_nLastFramecount > 0 && pAtt->m_nLastFramecount == gpGlobals->framecount - 1 ) + if ( gpGlobals->frametime > 0 && pAtt->m_nLastFramecount > 0 && pAtt->m_nLastFramecount < gpGlobals->framecount ) { Vector vecPreviousOrigin, vecOrigin; MatrixPosition( pAtt->m_AttachmentToWorld, vecPreviousOrigin ); MatrixPosition( attachmentToWorld, vecOrigin ); - pAtt->m_vOriginVelocity = (vecOrigin - vecPreviousOrigin) / gpGlobals->frametime; + // compensate for the fact that the previous origin could have been multiple frames behind + pAtt->m_vOriginVelocity = (vecOrigin - vecPreviousOrigin) / (gpGlobals->frametime * (gpGlobals->framecount - pAtt->m_nLastFramecount)); + // only update the frame count if the position changed, so we don't have to recompute attachments + if ( !pAtt->m_vOriginVelocity.IsZero(0.00001f) ) + { + pAtt->m_nLastFramecount = gpGlobals->framecount; + } } else { pAtt->m_vOriginVelocity.Init(); + pAtt->m_nLastFramecount = gpGlobals->framecount; } - pAtt->m_nLastFramecount = gpGlobals->framecount; pAtt->m_bAnglesComputed = false; pAtt->m_AttachmentToWorld = attachmentToWorld; @@ -2108,6 +2115,21 @@ bool C_BaseAnimating::GetAttachment( const char *szName, Vector &absOrigin, QAng return GetAttachment( LookupAttachment( szName ), absOrigin, absAngles ); } +bool C_BaseAnimating::GetAttachmentDeferred( int number, matrix3x4_t& matrix ) +{ + if (number < 1 || number > m_Attachments.Count()) + return false; + + // allow visual effects (eg. particles) to be a frame behind bone setup so that there are not messy dependencies. + CAttachmentData* pAtt = &m_Attachments[number - 1]; + const bool bShouldUpdate = pAtt->m_nLastFramecount < gpGlobals->framecount - 1; + if ( bShouldUpdate && !CalcAttachments() ) + return false; + + matrix = pAtt->m_AttachmentToWorld; + return true; +} + //----------------------------------------------------------------------------- // Purpose: Get attachment point by index // Input : number - which point @@ -2883,7 +2905,9 @@ bool C_BaseAnimating::SetupBones( matrix3x4_t *pBoneToWorldOut, int nMaxBones, i m_flLastBoneSetupTime = currentTime; } m_iPrevBoneMask = m_iAccumulatedBoneMask; - m_iAccumulatedBoneMask = 0; + // Keep record of the fact that we've used attachments. Because of deferred attachments, we can't keep track from the previous frame. + //m_iAccumulatedBoneMask = 0; + m_iAccumulatedBoneMask = m_iAccumulatedBoneMask & BONE_USED_BY_ATTACHMENT; #ifdef STUDIO_ENABLE_PERF_COUNTERS CStudioHdr *hdr = GetModelPtr(); diff --git a/src/game/client/c_baseanimating.h b/src/game/client/c_baseanimating.h index 1c8a74b30..6aa79794b 100644 --- a/src/game/client/c_baseanimating.h +++ b/src/game/client/c_baseanimating.h @@ -265,6 +265,7 @@ class C_BaseAnimating : public C_BaseEntity, private IModelLoadCallback // Attachments. 
bool GetAttachment( const char *szName, Vector &absOrigin ); bool GetAttachment( const char *szName, Vector &absOrigin, QAngle &absAngles ); + virtual bool GetAttachmentDeferred( int number, matrix3x4_t &matrix ); // Inherited from C_BaseEntity virtual bool GetAttachment( int number, Vector &origin ); diff --git a/src/game/client/c_baseflex.cpp b/src/game/client/c_baseflex.cpp index a9bebdec1..8a52fb488 100644 --- a/src/game/client/c_baseflex.cpp +++ b/src/game/client/c_baseflex.cpp @@ -574,7 +574,7 @@ Vector C_BaseFlex::SetViewTarget( CStudioHdr *pStudioHdr ) if (m_iEyeAttachment > 0) { matrix3x4_t attToWorld; - if (!GetAttachment( m_iEyeAttachment, attToWorld )) + if (!GetAttachmentDeferred( m_iEyeAttachment, attToWorld )) { return Vector( 0, 0, 0); } diff --git a/src/game/client/tf/c_tf_player.cpp b/src/game/client/tf/c_tf_player.cpp index 61ec13426..d43b0c3da 100644 --- a/src/game/client/tf/c_tf_player.cpp +++ b/src/game/client/tf/c_tf_player.cpp @@ -667,6 +667,7 @@ class C_TFRagdoll : public C_BaseFlex int GetDamageCustom() { return m_iDamageCustom; } virtual bool GetAttachment( int iAttachment, matrix3x4_t &attachmentToWorld ); + virtual bool GetAttachmentDeferred( int iAttachment, matrix3x4_t &attachmentToWorld ); int GetClass() { return m_iClass; } @@ -1579,6 +1580,20 @@ bool C_TFRagdoll::GetAttachment( int iAttachment, matrix3x4_t &attachmentToWorld } } +bool C_TFRagdoll::GetAttachmentDeferred( int iAttachment, matrix3x4_t &attachmentToWorld ) +{ + int iHeadAttachment = LookupAttachment( "head" ); + if ( IsDecapitation() && (iAttachment == iHeadAttachment) ) + { + MatrixCopy( m_mHeadAttachment, attachmentToWorld ); + return true; + } + else + { + return BaseClass::GetAttachmentDeferred( iAttachment, attachmentToWorld ); + } +} + //----------------------------------------------------------------------------- // Purpose: // Input : - diff --git a/src/game/shared/baseviewmodel_shared.cpp b/src/game/shared/baseviewmodel_shared.cpp index 20538e8ca..f42445da3 100644 --- a/src/game/shared/baseviewmodel_shared.cpp +++ b/src/game/shared/baseviewmodel_shared.cpp @@ -655,6 +655,15 @@ bool CBaseViewModel::GetAttachment( int number, matrix3x4_t &matrix ) return BaseClass::GetAttachment( number, matrix ); } +bool C_BaseViewModel::GetAttachmentDeferred( int number, matrix3x4_t &matrix ) +{ + // Update priority for your own viewmodel (no deferral) + if ( m_hWeapon.Get() && m_hWeapon.Get()->WantsToOverrideViewmodelAttachments() ) + return m_hWeapon.Get()->GetAttachment(number, matrix); + + return BaseClass::GetAttachment( number, matrix ); +} + //----------------------------------------------------------------------------- // Purpose: //----------------------------------------------------------------------------- diff --git a/src/game/shared/baseviewmodel_shared.h b/src/game/shared/baseviewmodel_shared.h index 15d3be53f..434bcce05 100644 --- a/src/game/shared/baseviewmodel_shared.h +++ b/src/game/shared/baseviewmodel_shared.h @@ -168,6 +168,7 @@ class CBaseViewModel : public CBaseAnimating, public IHasOwner // Attachments virtual int LookupAttachment( const char *pAttachmentName ); virtual bool GetAttachment( int number, matrix3x4_t &matrix ); + virtual bool GetAttachmentDeferred( int number, matrix3x4_t &matrix ); virtual bool GetAttachment( int number, Vector &origin ); virtual bool GetAttachment( int number, Vector &origin, QAngle &angles ); virtual bool GetAttachmentVelocity( int number, Vector &originVel, Quaternion &angleVel ); diff --git a/src/game/shared/econ/econ_entity.cpp 
b/src/game/shared/econ/econ_entity.cpp index 041ad9940..21d68cf14 100644 --- a/src/game/shared/econ/econ_entity.cpp +++ b/src/game/shared/econ/econ_entity.cpp @@ -1947,6 +1947,14 @@ bool CEconEntity::GetAttachment( int number, matrix3x4_t &matrix ) return BaseClass::GetAttachment( number, matrix ); } +bool C_EconEntity::GetAttachmentDeferred( int number, matrix3x4_t &matrix ) +{ + if ( m_hViewmodelAttachment ) + return m_hViewmodelAttachment->GetAttachmentDeferred( number, matrix ); + + return BaseClass::GetAttachmentDeferred( number, matrix ); +} + //----------------------------------------------------------------------------- // Purpose: //----------------------------------------------------------------------------- diff --git a/src/game/shared/econ/econ_entity.h b/src/game/shared/econ/econ_entity.h index e0c2453de..e09d59f67 100644 --- a/src/game/shared/econ/econ_entity.h +++ b/src/game/shared/econ/econ_entity.h @@ -114,6 +114,7 @@ class CEconEntity : public CBaseAnimating, public IHasAttributes virtual bool GetAttachment( const char *szName, Vector &absOrigin ) { return BaseClass::GetAttachment(szName,absOrigin); } virtual bool GetAttachment( const char *szName, Vector &absOrigin, QAngle &absAngles ) { return BaseClass::GetAttachment(szName,absOrigin,absAngles); } virtual bool GetAttachment( int number, matrix3x4_t &matrix ); + virtual bool GetAttachmentDeferred( int number, matrix3x4_t &matrix ); virtual bool GetAttachment( int number, Vector &origin ); virtual bool GetAttachment( int number, Vector &origin, QAngle &angles ); virtual bool GetAttachmentVelocity( int number, Vector &originVel, Quaternion &angleVel ); diff --git a/src/game/shared/particle_property.cpp b/src/game/shared/particle_property.cpp index fe1087214..747211d31 100644 --- a/src/game/shared/particle_property.cpp +++ b/src/game/shared/particle_property.cpp @@ -612,10 +612,10 @@ void CParticleProperty::UpdateControlPoint( ParticleEffectList_t *pEffect, int i { matrix3x4_t attachmentToWorld; - if ( !pAnimating->GetAttachment( pPoint->iAttachmentPoint, attachmentToWorld ) ) + if ( !pAnimating->GetAttachmentDeferred( pPoint->iAttachmentPoint, attachmentToWorld ) ) { // try C_BaseAnimating if attach point is not on the weapon - if ( !pAnimating->C_BaseAnimating::GetAttachment( pPoint->iAttachmentPoint, attachmentToWorld ) ) + if ( !pAnimating->C_BaseAnimating::GetAttachmentDeferred( pPoint->iAttachmentPoint, attachmentToWorld ) ) { Warning( "Cannot update control point %d for effect '%s'.\n", pPoint->iAttachmentPoint, pEffect->pParticleEffect->GetEffectName() ); // Remove the effect cause this warning means something is orphaned From db8ac86c8e4da71704327949ca18b84114f80b20 Mon Sep 17 00:00:00 2001 From: mastercoms Date: Sun, 5 Mar 2023 00:20:32 -0500 Subject: [PATCH 08/42] perf: fix duplicate GetSOCData for GetQualityParticleType * I don't think the compiler optimizes this out, because it isn't sure about side effects, and it's making this function spike up randomly in profiles, especially for the player model panel. 
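The shape of the change, for reference (sketch only; pItem is the pointer the function has already fetched and NULL-checked via GetSOCData() a few lines earlier):

// Before: two GetSOCData() lookups just to read the quality twice.
if ( GetSOCData()->GetQuality() == AE_SELFMADE || GetSOCData()->GetQuality() == AE_COMMUNITY )
	return pSparkleSystem ? pSparkleSystem->nSystemID : 0;

// After: reuse the pointer that was already fetched.
if ( pItem->GetQuality() == AE_SELFMADE || pItem->GetQuality() == AE_COMMUNITY )
	return pSparkleSystem ? pSparkleSystem->nSystemID : 0;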
--- src/game/shared/econ/econ_item_view.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/game/shared/econ/econ_item_view.cpp b/src/game/shared/econ/econ_item_view.cpp index e25390bd3..caf16d067 100644 --- a/src/game/shared/econ/econ_item_view.cpp +++ b/src/game/shared/econ/econ_item_view.cpp @@ -1084,7 +1084,7 @@ int CEconItemView::GetQualityParticleType() const if ( !pItem ) return 0; - if( GetSOCData()->GetQuality() == AE_SELFMADE || GetSOCData()->GetQuality() == AE_COMMUNITY ) + if( pItem->GetQuality() == AE_SELFMADE || pItem->GetQuality() == AE_COMMUNITY ) return pSparkleSystem ? pSparkleSystem->nSystemID : 0; else return 0; From 474947e53e35ec727a7d73344601c48ae6e2eb1f Mon Sep 17 00:00:00 2001 From: mastercoms Date: Mon, 6 Mar 2023 12:04:41 -0500 Subject: [PATCH 09/42] pending: sound updates * increase max channels to 256 to prevent us from hitting the cap in team fights * make more room for static channels over dynamic channels * dsound updates: * use DSBCAPS_TRUEPLAYPOSITION for accurate GetCurrentPosition * Pass NULL to GetCurrentPosition for write cursor, to skip querying it * limit dsound max distance, we only use it for local speaker spatialization * implement threaded sound a dedicated thread to doing all sound updates it gets notified when the main thread wants a sound update, and wakes up from sleep * backport 64 bit sound changes from CSGO * enable spatialization effects by default --- src/engine/audio/private/snd_channels.h | 4 +- src/engine/audio/private/snd_dev_direct.cpp | 27 +- src/engine/audio/private/snd_dma.cpp | 369 +++++++++++++++++--- src/engine/audio/private/snd_dsp.cpp | 6 +- src/engine/audio/private/snd_mix.cpp | 10 +- src/engine/audio/private/sound_private.h | 2 +- src/engine/audio/public/sound.h | 4 + src/engine/baseclientstate.cpp | 3 + src/engine/cl_main.cpp | 8 + src/engine/servermsghandler.cpp | 2 - src/game/server/sceneentity.cpp | 2 + src/game/server/soundscape_system.cpp | 5 + src/game/server/soundscape_system.h | 1 + src/game/shared/collisionproperty.cpp | 15 +- 14 files changed, 376 insertions(+), 82 deletions(-) diff --git a/src/engine/audio/private/snd_channels.h b/src/engine/audio/private/snd_channels.h index 06fd06506..ebc950d02 100644 --- a/src/engine/audio/private/snd_channels.h +++ b/src/engine/audio/private/snd_channels.h @@ -126,8 +126,8 @@ struct channel_t //----------------------------------------------------------------------------- //----------------------------------------------------------------------------- -#define MAX_CHANNELS 128 -#define MAX_DYNAMIC_CHANNELS 64 +#define MAX_CHANNELS 256 +#define MAX_DYNAMIC_CHANNELS 32 //----------------------------------------------------------------------------- //----------------------------------------------------------------------------- diff --git a/src/engine/audio/private/snd_dev_direct.cpp b/src/engine/audio/private/snd_dev_direct.cpp index a3c882265..3ae78c463 100644 --- a/src/engine/audio/private/snd_dev_direct.cpp +++ b/src/engine/audio/private/snd_dev_direct.cpp @@ -370,7 +370,7 @@ int CAudioDirectSound::PaintBegin( float mixAheadTime, int soundtime, int lpaint DWORD dwStatus; // If using surround, there are 4 or 5 different buffers being used and the pDSBuf is NULL. 
- if ( IsUsingBufferPerSpeaker() ) + if ( IsUsingBufferPerSpeaker() ) { if (pDSBufFL->GetStatus(&dwStatus) != DS_OK) Msg ("Couldn't get SURROUND FL sound buffer status\n"); @@ -674,6 +674,7 @@ bool CAudioDirectSound::SNDDMA_InitInterleaved( LPDIRECTSOUND lpDS, WAVEFORMATEX dsbdesc.dwFlags = 0; break; } + dsbdesc.dwFlags |= DSBCAPS_TRUEPLAYPOSITION; if ( !snd_mute_losefocus.GetBool() ) { dsbdesc.dwFlags |= DSBCAPS_GLOBALFOCUS; @@ -688,7 +689,7 @@ bool CAudioDirectSound::SNDDMA_InitInterleaved( LPDIRECTSOUND lpDS, WAVEFORMATEX if ( !bSuccess ) return false; - DWORD dwSize = 0, dwWrite; + DWORD dwSize = 0; DWORD *pBuffer = 0; if ( !LockDSBuffer( pDSBuf, &pBuffer, &dwSize, "DS_INTERLEAVED", DSBLOCK_ENTIREBUFFER ) ) return false; @@ -707,7 +708,7 @@ bool CAudioDirectSound::SNDDMA_InitInterleaved( LPDIRECTSOUND lpDS, WAVEFORMATEX pDSBuf->Play(0, 0, DSBPLAY_LOOPING); pDSBuf->Stop(); - pDSBuf->GetCurrentPosition(&m_outputBufferStartOffset, &dwWrite); + pDSBuf->GetCurrentPosition(&m_outputBufferStartOffset, NULL); pDSBuf->Play(0, 0, DSBPLAY_LOOPING); @@ -725,7 +726,7 @@ sndinitstat CAudioDirectSound::SNDDMA_InitDirect( void ) { DSBUFFERDESC dsbuf; DSBCAPS dsbcaps; - DWORD dwSize, dwWrite; + DWORD dwSize; WAVEFORMATEX format; WAVEFORMATEX pformat; HRESULT hresult; @@ -830,7 +831,7 @@ sndinitstat CAudioDirectSound::SNDDMA_InitDirect( void ) // sound hardware format Q_memset( &dsbuf, 0, sizeof(dsbuf) ); dsbuf.dwSize = sizeof(DSBUFFERDESC); - dsbuf.dwFlags = DSBCAPS_PRIMARYBUFFER; + dsbuf.dwFlags = DSBCAPS_PRIMARYBUFFER | DSBCAPS_TRUEPLAYPOSITION; if ( snd_legacy_surround.GetBool() || m_bSurround ) { dsbuf.dwFlags |= DSBCAPS_CTRL3D; @@ -900,7 +901,8 @@ sndinitstat CAudioDirectSound::SNDDMA_InitDirect( void ) // create the secondary buffer we'll actually work with Q_memset( &dsbuf, 0, sizeof(dsbuf) ); dsbuf.dwSize = sizeof(DSBUFFERDESC); - dsbuf.dwFlags = DSBCAPS_LOCSOFTWARE; // NOTE: don't use CTRLFREQUENCY (slow) + // NOTE: don't use CTRLFREQUENCY (slow) + dsbuf.dwFlags = DSBCAPS_LOCSOFTWARE | DSBCAPS_TRUEPLAYPOSITION; dsbuf.dwBufferBytes = SECONDARY_BUFFER_SIZE; dsbuf.lpwfxFormat = &format; if ( !snd_mute_losefocus.GetBool() ) @@ -992,7 +994,7 @@ sndinitstat CAudioDirectSound::SNDDMA_InitDirect( void ) pDSBuf->Stop(); - pDSBuf->GetCurrentPosition(&m_outputBufferStartOffset, &dwWrite); + pDSBuf->GetCurrentPosition(&m_outputBufferStartOffset, NULL); pDSBuf->Play(0, 0, DSBPLAY_LOOPING); } @@ -1288,7 +1290,7 @@ void DS3D_SetBufferParams( LPDIRECTSOUND3DBUFFER pDSBuf3D, D3DVECTOR *pbpos, D3D bparm.vConeOrientation = bdir; bparm.lConeOutsideVolume = DSBVOLUME_MIN; bparm.flMinDistance = 100.0; // no rolloff (until > 2.0 meter distance) - bparm.flMaxDistance = DS3D_DEFAULTMAXDISTANCE; + bparm.flMaxDistance = 1000.0; bparm.dwMode = DS3DMODE_NORMAL; hr = pDSBuf3D->SetAllParameters( &bparm, DS3D_DEFERRED ); @@ -1300,7 +1302,7 @@ bool CAudioDirectSound::SNDDMA_InitSurround(LPDIRECTSOUND lpDS, WAVEFORMATEX* lp { DSBUFFERDESC dsbuf; WAVEFORMATEX wvex; - DWORD dwSize, dwWrite; + DWORD dwSize; int reps; HRESULT hresult; void *lpData = NULL; @@ -1316,8 +1318,9 @@ bool CAudioDirectSound::SNDDMA_InitSurround(LPDIRECTSOUND lpDS, WAVEFORMATEX* lp memset (&dsbuf, 0, sizeof(dsbuf)); dsbuf.dwSize = sizeof(DSBUFFERDESC); - // NOTE: LOCHARDWARE causes SB AWE64 to crash in it's DSOUND driver - dsbuf.dwFlags = DSBCAPS_CTRL3D; // don't use CTRLFREQUENCY (slow) + // NOTE: LOCHARDWARE causes SB AWE64 to crash in it's DSOUND driver + // don't use CTRLFREQUENCY (slow) + dsbuf.dwFlags = DSBCAPS_CTRL3D | DSBCAPS_TRUEPLAYPOSITION; if ( 
!snd_mute_losefocus.GetBool() ) { dsbuf.dwFlags |= DSBCAPS_GLOBALFOCUS; @@ -1623,7 +1626,7 @@ bool CAudioDirectSound::SNDDMA_InitSurround(LPDIRECTSOUND lpDS, WAVEFORMATEX* lp // get hardware playback position, store it, syncronize all buffers to FL - pDSBufFL->GetCurrentPosition(&m_outputBufferStartOffset, &dwWrite); + pDSBufFL->GetCurrentPosition(&m_outputBufferStartOffset, NULL); pDSBufFR->SetCurrentPosition(m_outputBufferStartOffset); pDSBufRL->SetCurrentPosition(m_outputBufferStartOffset); pDSBufRR->SetCurrentPosition(m_outputBufferStartOffset); diff --git a/src/engine/audio/private/snd_dma.cpp b/src/engine/audio/private/snd_dma.cpp index 137bce7dd..6969f5c67 100644 --- a/src/engine/audio/private/snd_dma.cpp +++ b/src/engine/audio/private/snd_dma.cpp @@ -69,6 +69,12 @@ extern IVideoServices *g_pVideo; #define SNDLVL_TO_DIST_MULT( sndlvl ) ( sndlvl ? ((pow( 10.0f, snd_refdb.GetFloat() / 20 ) / pow( 10.0f, (float)sndlvl / 20 )) / snd_refdist.GetFloat()) : 0 ) #define DIST_MULT_TO_SNDLVL( dist_mult ) (soundlevel_t)(int)( dist_mult ? ( 20 * log10( pow( 10.0f, snd_refdb.GetFloat() / 20 ) / (dist_mult * snd_refdist.GetFloat()) ) ) : 0 ) +#if !defined( _X360 ) +#define THREADED_MIX_TIME 0.005 +#else +#define THREADED_MIX_TIME XMA_POLL_RATE * 0.001 +#endif + extern ConVar dsp_spatial; extern IPhysicsSurfaceProps *physprop; @@ -162,6 +168,8 @@ bool IsSoundSourceLocalPlayer( int soundsource ) CThreadMutex g_SndMutex; +CThreadEvent g_SndUpdateEvent; + #define THREAD_LOCK_SOUND() AUTO_LOCK( g_SndMutex ) const int MASK_BLOCK_AUDIO = CONTENTS_SOLID|CONTENTS_MOVEABLE|CONTENTS_WINDOW; @@ -236,13 +244,14 @@ vec_t S_GetNominalClipDist() return sound_nominal_clip_dist; } -int g_soundtime = 0; // sample PAIRS output since start -int g_paintedtime = 0; // sample PAIRS mixed since start +int64 g_soundtime = 0; // sample PAIRS output since start +double g_soundtimeerror = 0.0; // Error in sound time (used for synchronizing movie output sound to host_time) +int64 g_paintedtime = 0; // sample PAIRS mixed since start float g_ReplaySoundTimeFracAccumulator = 0.0f; // Used by replay float g_ClockSyncArray[NUM_CLOCK_SYNCS] = {0}; -int g_SoundClockPaintTime[NUM_CLOCK_SYNCS] = {0}; +int64 g_SoundClockPaintTime[NUM_CLOCK_SYNCS] = {0}; // default 10ms ConVar snd_delay_sound_shift("snd_delay_sound_shift","0.01"); @@ -277,7 +286,7 @@ float S_ComputeDelayForSoundtime( float soundtime, clocksync_index_t syncIndex ) int delaySamples = gameSamples - paintedSamples; float delay = delaySamples / float(dmaSpeed); - if ( gameDeltaTime < 0 || fabs(delay) > 0.500f ) + if ( gameDeltaTime < 0 || abs(delay) > 0.200f ) { // Note that the equations assume a correlation between game time and real time // some kind of clock error. 
This can happen with large host_timescale or when the @@ -451,7 +460,7 @@ static soundfade_t soundfade; // Client sound fading singleton object // autodetected from windows settings ConVar snd_surround( "snd_surround_speakers", "-1", FCVAR_INTERNAL_USE ); ConVar snd_legacy_surround( "snd_legacy_surround", "0", FCVAR_ARCHIVE ); -ConVar snd_noextraupdate( "snd_noextraupdate", "0" ); +ConVar snd_noextraupdate( "snd_noextraupdate", "1" ); ConVar snd_show( "snd_show", "0", FCVAR_CHEAT, "Show sounds info" ); ConVar snd_visualize ("snd_visualize", "0", FCVAR_CHEAT, "Show sounds location in world" ); ConVar snd_pitchquality( "snd_pitchquality", "1", FCVAR_ARCHIVE ); // 1) use high quality pitch shifters @@ -461,8 +470,17 @@ static ConVar volume( "volume", "1.0", FCVAR_ARCHIVE | FCVAR_ARCHIVE_XBOX, "Soun // user configurable music volume ConVar snd_musicvolume( "snd_musicvolume", "1.0", FCVAR_ARCHIVE | FCVAR_ARCHIVE_XBOX, "Music volume", true, 0.0f, true, 1.0f ); -ConVar snd_mixahead( "snd_mixahead", "0.1", FCVAR_ARCHIVE ); -ConVar snd_mix_async( "snd_mix_async", "0" ); +#ifdef THREADED_SOUND_UPDATE +ConVar snd_mixahead( "snd_threaded_mixahead", "0.1", 0 ); +#else +ConVar snd_mixahead( "snd_mixahead", "0.1", 0 ); +#endif +#ifdef THREADED_SOUND_UPDATE +ConVar snd_mix_async( "snd_mix_async", "1" ); +#else +ConVar snd_mix_async("snd_mix_async", "0"); +#endif + #ifdef _DEBUG static ConCommand snd_mixvol("snd_mixvol", MXR_DebugSetMixGroupVolume, "Set named Mixgroup to mix volume."); #endif @@ -4688,7 +4706,6 @@ void SND_SpatializeFirstFrameNoTrace( channel_t *pChannel) int S_AlterChannel( int soundsource, int entchannel, CSfxTable *sfx, int vol, int pitch, int flags ) { - THREAD_LOCK_SOUND(); int ch_idx; const char *name = sfx->getname(); @@ -5083,7 +5100,7 @@ int S_StartDynamicSound( StartSoundParams_t& params ) vol = 255; } - THREAD_LOCK_SOUND(); + THREAD_LOCK_SOUND() if ( params.flags & (SND_STOP|SND_CHANGE_VOL|SND_CHANGE_PITCH) ) { @@ -5376,6 +5393,8 @@ int S_StartStaticSound( StartSoundParams_t& params ) if ((params.flags & SND_STOP) && nSndShowStart > 0) DevMsg("S_StartStaticSound: %s Stopped.\n", sndname); + THREAD_LOCK_SOUND() + if ((params.flags & SND_STOP) || (params.flags & SND_CHANGE_VOL) || (params.flags & SND_CHANGE_PITCH)) { if (S_AlterChannel(params.soundsource, params.entchannel, params.pSfx, vol, params.pitch, params.flags) || (params.flags & SND_STOP)) @@ -5424,8 +5443,6 @@ int S_StartStaticSound( StartSoundParams_t& params ) g_pSoundServices->GetSoundSpatialization( params.soundsource, si ); // pick a channel to play on from the static area - THREAD_LOCK_SOUND(); - ch = SND_PickStaticChannel(params.soundsource, params.pSfx); // Autolooping sounds are always fixed origin(?) if ( !ch ) return 0; @@ -5736,7 +5753,7 @@ int S_GetCurrentStaticSounds( SoundInfo_t *pResult, int nSizeResult, int entchan // Stop all sounds for entity on a channel. 
void S_StopSound(int soundsource, int entchannel) { - THREAD_LOCK_SOUND(); + THREAD_LOCK_SOUND() CChannelList list; g_ActiveChannels.GetActiveChannels( list ); for ( int i = 0; i < list.Count(); i++ ) @@ -5771,7 +5788,7 @@ channel_t *S_FindChannelByGuid( int guid ) //----------------------------------------------------------------------------- void S_StopSoundByGuid( int guid ) { - THREAD_LOCK_SOUND(); + THREAD_LOCK_SOUND() channel_t *pChannel = S_FindChannelByGuid( guid ); if ( pChannel ) { @@ -5910,7 +5927,7 @@ void S_GetActiveSounds( CUtlVector< SndInfo_t >& sndlist ) void S_StopAllSounds( bool bClear ) { - THREAD_LOCK_SOUND(); + THREAD_LOCK_SOUND() int i; if ( !g_AudioDevice ) @@ -6113,12 +6130,221 @@ S_Update Called once each time through the main loop ============ */ -void S_Update( const AudioState_t *pAudioState ) +#ifdef THREADED_SOUND_UPDATE +void S_UpdateThreaded_Main() +{ + channel_t* ch; + static unsigned int s_roundrobin = 0; ///< number of times this function is called. + ///< used instead of host_frame because that number + ///< isn't necessarily available here (sez Yahn). + + g_AudioDevice->UpdateListener(listener_origin, listener_forward, listener_right, listener_up); + + int voiceChannelCount = 0; + int voiceChannelMaxVolume = 0; + + // reset traceline counter for this frame + g_snd_trace_count = 0; + + // calculate distance to nearest walls, update dsp_spatial + // updates one wall only per frame (one trace per frame) + SND_SetSpatialDelays(); + + // updates dsp_room if automatic room detection enabled + DAS_CheckNewRoomDSP(); + + // update spatialization for static and dynamic sounds + CChannelList list; + g_ActiveChannels.GetActiveChannels(list); + + g_SndMutex.Lock(); + + if (snd_spatialize_roundrobin.GetInt() == 0) + { + // spatialize each channel each time + for (int i = 0; i < list.Count(); i++) + { + ch = list.GetChannel(i); + if (!ch->sfx || !ch->activeIndex) + { + continue; + } + + SND_Spatialize(ch); // respatialize channel + + if (ch->sfx->pSource && ch->sfx->pSource->IsVoiceSource()) + { + voiceChannelCount++; + voiceChannelMaxVolume = max(voiceChannelMaxVolume, ChannelGetMaxVol(ch)); + } + } + } + else // lowend performance improvement: spatialize only some channels each frame. 
+ { + unsigned int robinmask = (1 << snd_spatialize_roundrobin.GetInt()) - 1; + + // now do static channels + for (int i = 0; i < list.Count(); ++i) + { + ch = list.GetChannel(i); + if (!ch->sfx || !ch->activeIndex) + { + continue; + } + + // need to check bfirstpass because sound tracing may have been deferred + if (ch->flags.bfirstpass || (robinmask & s_roundrobin) == (i & robinmask)) + { + SND_Spatialize(ch); // respatialize channel + } + + if (ch->sfx->pSource && ch->sfx->pSource->IsVoiceSource()) + { + voiceChannelCount++; + voiceChannelMaxVolume = max(voiceChannelMaxVolume, ChannelGetMaxVol(ch)); + } + } + + ++s_roundrobin; + } + + SND_ChannelTraceReset(); + + g_SndMutex.Unlock(); + + // set new target for voice ducking + float frametime = g_pSoundServices->GetHostFrametime(); + S_UpdateVoiceDuck(voiceChannelCount, voiceChannelMaxVolume, frametime); + +#ifdef _X360 + // update x360 music volume + g_DashboardMusicMixValue = Approach(g_DashboardMusicMixTarget, g_DashboardMusicMixValue, g_DashboardMusicFadeRate * frametime); +#endif +} + +void S_UpdateThreaded_Base(const AudioState_t* pAudioState) +{ + VPROF("S_Update"); + if (!g_AudioDevice->IsActive()) + return; + + g_SndMutex.Lock(); + + if (pAudioState) + { + VectorCopy(pAudioState->m_Origin, listener_origin); + AngleVectors(pAudioState->m_Angles, &listener_forward, &listener_right, &listener_up); + s_bIsListenerUnderwater = pAudioState->m_bIsUnderwater; + } + else + { + VectorCopy(vec3_origin, listener_origin); + VectorCopy(vec3_origin, listener_forward); + VectorCopy(vec3_origin, listener_right); + VectorCopy(vec3_origin, listener_up); + s_bIsListenerUnderwater = false; + } + + // + // debugging output + // + if (snd_show.GetInt()) + { + con_nprint_t np; + np.time_to_live = 2.0f; + np.fixed_width_font = true; + + int total = 0; + + CChannelList activeChannels; + g_ActiveChannels.GetActiveChannels(activeChannels); + for (int i = 0; i < activeChannels.Count(); i++) + { + channel_t* channel = activeChannels.GetChannel(i); + if (!channel->sfx) + continue; + + np.index = total + 2; + if (channel->flags.fromserver) + { + np.color[0] = 1.0; + np.color[1] = 0.8; + np.color[2] = 0.1; + } + else + { + np.color[0] = 0.1; + np.color[1] = 0.9; + np.color[2] = 1.0; + } + + unsigned int sampleCount = RemainingSamples(channel); + float timeleft = (float)sampleCount / (float)channel->sfx->pSource->SampleRate(); + bool bLooping = channel->sfx->pSource->IsLooped(); + + if (snd_surround.GetInt() < 4) + { + Con_NXPrintf(&np, "%02i l(%03d) r(%03d) vol(%03d) ent(%03d) pos(%6d %6d %6d) timeleft(%f) looped(%d) %50s", + total + 1, + (int)channel->fvolume[IFRONT_LEFT], + (int)channel->fvolume[IFRONT_RIGHT], + channel->master_vol, + channel->soundsource, + (int)channel->origin[0], + (int)channel->origin[1], + (int)channel->origin[2], + timeleft, + bLooping, + channel->sfx->getname()); + } + else + { + Con_NXPrintf(&np, "%02i l(%03d) c(%03d) r(%03d) rl(%03d) rr(%03d) vol(%03d) ent(%03d) pos(%6d %6d %6d) timeleft(%f) looped(%d) %50s", + total + 1, + (int)channel->fvolume[IFRONT_LEFT], + (int)channel->fvolume[IFRONT_CENTER], + (int)channel->fvolume[IFRONT_RIGHT], + (int)channel->fvolume[IREAR_LEFT], + (int)channel->fvolume[IREAR_RIGHT], + channel->master_vol, + channel->soundsource, + (int)channel->origin[0], + (int)channel->origin[1], + (int)channel->origin[2], + timeleft, + bLooping, + channel->sfx->getname()); + } + + if (snd_visualize.GetInt()) + { + CDebugOverlay::AddTextOverlay(channel->origin, 0.05f, channel->sfx->getname()); + } + + total++; + } + + 
while (total <= 128) + { + Con_NPrintf(total + 2, ""); + total++; + } + } + + g_SndMutex.Unlock(); + + if (s_bOnLoadScreen) + return; + + S_Update_( snd_mixahead.GetFloat() ); +} +#endif + +void S_Update_Main( const AudioState_t *pAudioState ) { VPROF("S_Update"); channel_t *ch; - channel_t *combine; - static unsigned int s_roundrobin = 0 ; ///< number of times this function is called. + static unsigned int s_roundrobin = 0; ///< number of times this function is called. ///< used instead of host_frame because that number ///< isn't necessarily available here (sez Yahn). @@ -6146,8 +6372,6 @@ void S_Update( const AudioState_t *pAudioState ) } g_AudioDevice->UpdateListener( listener_origin, listener_forward, listener_right, listener_up ); - - combine = NULL; int voiceChannelCount = 0; int voiceChannelMaxVolume = 0; @@ -6175,6 +6399,11 @@ void S_Update( const AudioState_t *pAudioState ) Assert(ch->sfx); Assert(ch->activeIndex > 0); + if (!ch->sfx || ch->activeIndex < 1) + { + continue; + } + SND_Spatialize(ch); // respatialize channel if ( ch->sfx->pSource && ch->sfx->pSource->IsVoiceSource() ) @@ -6195,6 +6424,11 @@ void S_Update( const AudioState_t *pAudioState ) Assert(ch->sfx); Assert(ch->activeIndex > 0); + if (!ch->sfx || ch->activeIndex < 1) + { + continue; + } + // need to check bfirstpass because sound tracing may have been deferred if ( ch->flags.bfirstpass || (robinmask & s_roundrobin) == ( i & robinmask ) ) { @@ -6211,16 +6445,16 @@ void S_Update( const AudioState_t *pAudioState ) ++s_roundrobin; } - - SND_ChannelTraceReset(); // set new target for voice ducking float frametime = g_pSoundServices->GetHostFrametime(); S_UpdateVoiceDuck( voiceChannelCount, voiceChannelMaxVolume, frametime ); +#ifdef _X360 // update x360 music volume g_DashboardMusicMixValue = Approach( g_DashboardMusicMixTarget, g_DashboardMusicMixValue, g_DashboardMusicFadeRate * frametime ); +#endif // // debugging output @@ -6325,6 +6559,20 @@ void S_Update( const AudioState_t *pAudioState ) S_Update_( g_EstFrameTime + snd_mixahead.GetFloat() ); } +void S_Update(const AudioState_t* pAudioState) +{ +#ifdef THREADED_SOUND_UPDATE + if ( snd_mix_async.GetBool() ) + { + S_UpdateThreaded_Base(pAudioState); + } + else +#endif + { + S_Update_Main(pAudioState); + } +} + CON_COMMAND( snd_dumpclientsounds, "Dump sounds to VXConsole" ) { con_nprint_t np; @@ -6375,8 +6623,9 @@ CON_COMMAND( snd_dumpclientsounds, "Dump sounds to VXConsole" ) //----------------------------------------------------------------------------- void GetSoundTime(void) { - int fullsamples; - int sampleOutCount; + // Make them 64 bits so calculation is done in 64 bits. + int64 fullsamples; + int64 sampleOutCount; // size of output buffer in *full* 16 bit samples // A 2 channel device has a *full* sample consisting of a 16 bit LR pair. 
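// Rough numbers behind dropping the 32-bit wrap handling in the next hunk (assuming a
// 44.1 kHz output rate; the real rate comes from DeviceDmaSpeed()): g_paintedtime counts
// sample pairs, so the old 0x70000000 chop threshold was hit after roughly
// 0x70000000 / 44100 ~= 42,600 seconds -- about 12 hours of continuous mixing -- at which
// point S_StopAllSounds() cut off every playing sound. With int64 counters the same math
// gives on the order of 10^14 seconds before overflow, so the chop can simply be removed.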
@@ -6394,13 +6643,6 @@ void GetSoundTime(void) { // buffer wrapped s_buffers++; - if ( g_paintedtime > 0x70000000 ) - { - // time to chop things off to avoid 32 bit limits - s_buffers = 0; - g_paintedtime = fullsamples; - S_StopAllSounds( true ); - } } s_oldsampleOutCount = sampleOutCount; @@ -6440,8 +6682,18 @@ void GetSoundTime(void) float t = g_pSoundServices->GetHostTime(); if ( s_lastsoundtime != t ) { - g_soundtime += g_pSoundServices->GetHostFrametime() * g_AudioDevice->DeviceDmaSpeed(); - + double flSamples = (double) g_pSoundServices->GetHostFrametime() * (double) g_AudioDevice->DeviceDmaSpeed(); + int nSamples = (int)flSamples; + double flSampleError = flSamples - (double)nSamples; + g_soundtimeerror += flSampleError; + if (fabs(g_soundtimeerror) > 1.0) + { + int nErrorSamples = (int)g_soundtimeerror; + g_soundtimeerror -= (double)nErrorSamples; + nSamples += nErrorSamples; + } + + g_soundtime += nSamples; s_lastsoundtime = t; } } @@ -6468,6 +6720,11 @@ void S_ExtraUpdate( void ) if ( snd_noextraupdate.GetInt() || cl_movieinfo.IsRecording() || IsReplayRendering() ) return; // don't pollute timings +#ifdef THREADED_SOUND_UPDATE + if (snd_mix_async.GetBool()) + return; +#endif + // If listener position and orientation has not yet been updated (ie: no call to S_Update since level load) // then don't mix. Important - mixing with listener at 'false' origin causes // some sounds to incorrectly spatialize to 0 volume, killing them before they can play. @@ -6546,45 +6803,41 @@ void S_Update_Guts( float mixAheadTime ) DEBUG_StopSoundMeasure( 4, samples ); } -#if !defined( _X360 ) -#define THREADED_MIX_TIME 33 -#else -#define THREADED_MIX_TIME XMA_POLL_RATE -#endif - ConVar snd_ShowThreadFrameTime( "snd_ShowThreadFrameTime", "0" ); bool g_bMixThreadExit; ThreadHandle_t g_hMixThread; void S_Update_Thread() { - float frameTime = THREADED_MIX_TIME * 0.001f; - double lastFrameTime = Plat_FloatTime(); + double frameTime = THREADED_MIX_TIME; while ( !g_bMixThreadExit ) { - // mixing (for 360) needs to be updated at a steady rate - // large update times causes the mixer to demand more audio data - // the 360 decoder has finite latency and cannot fulfill spike requests - float t0 = Plat_FloatTime(); - S_Update_Guts( frameTime + snd_mixahead.GetFloat() ); - int updateTime = ( Plat_FloatTime() - t0 ) * 1000.0f; + const double t0 = Plat_FloatTime(); +#ifdef THREADED_SOUND_UPDATE + S_UpdateThreaded_Main(); +#endif + S_Update_Guts(frameTime + snd_mixahead.GetFloat() ); + const double tf = Plat_FloatTime(); + + const double dt = tf - t0; - // try to maintain a steadier rate by compensating for fluctuating mix times - int sleepTime = THREADED_MIX_TIME - updateTime; - if ( sleepTime > 0 ) + // we have two goals: reduce latency and improve consistency + // this means we have regular update times that keep track of variance of frame times. 
+ // however, we also want to update as soon as the game thread makes new stuff available to us + const int nSleepMS = (int) ((THREADED_MIX_TIME - dt) * 1000); + if (nSleepMS > 0) { - ThreadSleep( sleepTime ); + g_SndUpdateEvent.Wait(nSleepMS); } // mimic a frametime needed for sound update - double t1 = Plat_FloatTime(); - frameTime = t1 - lastFrameTime; - lastFrameTime = t1; + const double t1 = Plat_FloatTime(); + frameTime = t1 - t0; if ( snd_ShowThreadFrameTime.GetBool() ) { - Msg( "S_Update_Thread: frameTime: %d ms\n", (int)( frameTime * 1000.0f ) ); + Msg( "S_Update_Thread: frameTime: %f s\n", frameTime ); } } } @@ -6593,6 +6846,7 @@ void S_ShutdownMixThread() { if ( g_hMixThread ) { + g_SndUpdateEvent.Set(); g_bMixThreadExit = true; ThreadJoin( g_hMixThread ); ReleaseThreadHandle( g_hMixThread ); @@ -6602,7 +6856,7 @@ void S_ShutdownMixThread() void S_Update_( float mixAheadTime ) { - if ( !IsConsole() || !snd_mix_async.GetBool() ) + if ( !snd_mix_async.GetBool() ) { S_ShutdownMixThread(); S_Update_Guts( mixAheadTime ); @@ -6613,11 +6867,16 @@ void S_Update_( float mixAheadTime ) { g_bMixThreadExit = false; g_hMixThread = ThreadExecuteSolo( "SndMix", S_Update_Thread ); + ThreadSetPriority(g_hMixThread, TP_PRIORITY_HIGHEST); if ( IsX360() ) { ThreadSetAffinity( g_hMixThread, XBOX_PROCESSOR_5 ); } } + else + { + g_SndUpdateEvent.Set(); + } } } diff --git a/src/engine/audio/private/snd_dsp.cpp b/src/engine/audio/private/snd_dsp.cpp index 4e73fdb40..92a5c2c01 100644 --- a/src/engine/audio/private/snd_dsp.cpp +++ b/src/engine/audio/private/snd_dsp.cpp @@ -5890,10 +5890,10 @@ inline int PSET_GetNext ( pset_t *ppset, int x ) // Dsp presets -ConVar dsp_room ("dsp_room", "0", FCVAR_DEMO ); // room dsp preset - sounds more distant from player (1ch) +ConVar dsp_room ("dsp_room", "1", FCVAR_DEMO ); // room dsp preset - sounds more distant from player (1ch) ConVar dsp_water ("dsp_water", "14", FCVAR_DEMO ); // "14" underwater dsp preset - sound when underwater (1-2ch) ConVar dsp_player ("dsp_player", "0", FCVAR_DEMO | FCVAR_SERVER_CAN_EXECUTE ); // dsp on player - sound when player hit by special device (1-2ch) -ConVar dsp_facingaway ("dsp_facingaway", "0", FCVAR_DEMO ); // "30" sounds that face away from player (weapons, voice) (1-4ch) +ConVar dsp_facingaway ("dsp_facingaway", "30", FCVAR_DEMO ); // "30" sounds that face away from player (weapons, voice) (1-4ch) ConVar dsp_speaker ("dsp_speaker", "50", FCVAR_DEMO ); // "50" small distorted speaker sound (1ch) ConVar dsp_spatial ("dsp_spatial", "40", FCVAR_DEMO ); // spatial delays for l/r front/rear ears ConVar dsp_automatic ("dsp_automatic", "0", FCVAR_DEMO ); // automatic room type detection. 
if non zero, replaces dsp_room @@ -5930,7 +5930,7 @@ ConVar dsp_vol_5ch ("dsp_vol_5ch", "0.5", FCVAR_DEMO ); // 0.0 - 1.0; attenu ConVar dsp_vol_4ch ("dsp_vol_4ch", "0.5", FCVAR_DEMO ); // 0.0 - 1.0; attenuate master dsp volume for 4ch surround ConVar dsp_vol_2ch ("dsp_vol_2ch", "1.0", FCVAR_DEMO ); // 0.0 - 1.0; attenuate master dsp volume for 2ch surround -ConVar dsp_enhance_stereo("dsp_enhance_stereo", "0", FCVAR_ARCHIVE ); // 1) use dsp_spatial delays on all reverb channels +ConVar dsp_enhance_stereo("dsp_enhance_stereo", "1", FCVAR_CHEAT ); // 1) use dsp_spatial delays on all reverb channels // DSP preset executor diff --git a/src/engine/audio/private/snd_mix.cpp b/src/engine/audio/private/snd_mix.cpp index ca44cbf6e..877b633cd 100644 --- a/src/engine/audio/private/snd_mix.cpp +++ b/src/engine/audio/private/snd_mix.cpp @@ -53,7 +53,8 @@ bool BChannelLowVolume( channel_t *pch, int vol_min ); void ChannelCopyVolumes( channel_t *pch, int *pvolume_dest, int ivol_start, int cvol ); float ChannelLoudestCurVolume( const channel_t * RESTRICT pch ); -extern int g_soundtime; +extern int64 g_soundtime; +extern double g_soundtimeerror; extern float host_frametime; extern float host_frametime_unbounded; @@ -434,7 +435,7 @@ void S_FreeChannel(channel_t *ch) ch->flags.isSentence = false; // Msg("End sound %s\n", ch->sfx->getname() ); - + delete ch->pMixer; ch->pMixer = NULL; ch->sfx = NULL; @@ -2267,9 +2268,9 @@ void MIX_PaintChannels( int endtime, bool bIsUnderwater ) VPROF("MIX_PaintChannels"); tmZone( TELEMETRY_LEVEL0, TMZF_NONE, "%s", __FUNCTION__ ); - int end; + int64 end; int count; - bool b_spatial_delays = dsp_enhance_stereo.GetInt() != 0 ? true : false; + bool b_spatial_delays = dsp_enhance_stereo.GetBool(); bool room_fsurround_sav; bool room_fsurround_center_sav; paintbuffer_t *proom = MIX_GetPPaintFromIPaint(SOUND_BUFFER_ROOM); @@ -4196,6 +4197,7 @@ void SND_RecordInit() { g_paintedtime = 0; g_soundtime = 0; + g_soundtimeerror = 0.0; // TMP Wave file supports stereo only, so force stereo if ( snd_surround.GetInt() != 2 ) diff --git a/src/engine/audio/private/sound_private.h b/src/engine/audio/private/sound_private.h index f054c7c89..9a96308b1 100644 --- a/src/engine/audio/private/sound_private.h +++ b/src/engine/audio/private/sound_private.h @@ -44,7 +44,7 @@ void SNDDMA_Shutdown(void); // User-setable variables // ==================================================================== -extern int g_paintedtime; +extern int64 g_paintedtime; extern bool snd_initialized; diff --git a/src/engine/audio/public/sound.h b/src/engine/audio/public/sound.h index 64b4f9d9a..19bd742aa 100644 --- a/src/engine/audio/public/sound.h +++ b/src/engine/audio/public/sound.h @@ -22,6 +22,10 @@ #define AUDIOSOURCE_CACHE_ROOTDIR "maps/soundcache" +#if !defined(_X360) +#define THREADED_SOUND_UPDATE +#endif + class CSfxTable; enum soundlevel_t; struct SoundInfo_t; diff --git a/src/engine/baseclientstate.cpp b/src/engine/baseclientstate.cpp index 591f367a9..b1aa1622a 100644 --- a/src/engine/baseclientstate.cpp +++ b/src/engine/baseclientstate.cpp @@ -41,6 +41,7 @@ #include "replay_internal.h" #include "replayserver.h" #endif +#include "sound.h" // memdbgon must be the last include file in a .cpp file!!! 
#include "tier0/memdbgon.h" @@ -738,6 +739,8 @@ void CBaseClientState::Disconnect( const char *pszReason, bool bShowMainMenu ) CL_NotifyRPTOfDisconnect( ); #endif + S_StopAllSounds( true ); + m_nSignonState = SIGNONSTATE_NONE; netadr_t adr; diff --git a/src/engine/cl_main.cpp b/src/engine/cl_main.cpp index 4573a8a93..a808b09cd 100644 --- a/src/engine/cl_main.cpp +++ b/src/engine/cl_main.cpp @@ -145,10 +145,18 @@ struct ResourceLocker // Need to temporarily disable queued material system, then lock it m_QMS = Host_AllowQueuedMaterialSystem( false ); m_MatLock = g_pMaterialSystem->Lock(); + // Disable threaded sound updates while loading +#ifdef THREADED_SOUND_UPDATE + S_EnableThreadedMixing(false); +#endif } ~ResourceLocker() { + // Restore threaded sound update +#ifdef THREADED_SOUND_UPDATE + S_EnableThreadedMixing(true); +#endif // Restore QMS materials->Unlock( m_MatLock ); Host_AllowQueuedMaterialSystem( m_QMS ); diff --git a/src/engine/servermsghandler.cpp b/src/engine/servermsghandler.cpp index 96788090a..7024990cb 100644 --- a/src/engine/servermsghandler.cpp +++ b/src/engine/servermsghandler.cpp @@ -236,8 +236,6 @@ void CClientState::Disconnect( const char *pszReason, bool bShowMainMenu ) demoplayer->StopPlayback(); demorecorder->StopRecording(); #endif - - S_StopAllSounds( true ); R_DecalTermAll(); diff --git a/src/game/server/sceneentity.cpp b/src/game/server/sceneentity.cpp index f092624f6..9c7be7ec5 100644 --- a/src/game/server/sceneentity.cpp +++ b/src/game/server/sceneentity.cpp @@ -768,6 +768,8 @@ CSceneEntity::CSceneEntity( void ) m_bCompletedEarly = false; + if ( !m_pcvSndMixahead ) + m_pcvSndMixahead = cvar->FindVar( "snd_threaded_mixahead" ); if ( !m_pcvSndMixahead ) m_pcvSndMixahead = cvar->FindVar( "snd_mixahead" ); diff --git a/src/game/server/soundscape_system.cpp b/src/game/server/soundscape_system.cpp index 29b29402c..45b6c8dd7 100644 --- a/src/game/server/soundscape_system.cpp +++ b/src/game/server/soundscape_system.cpp @@ -263,6 +263,11 @@ void CSoundscapeSystem::LevelInitPostEntity() } } +void CSoundscapeSystem::LevelShutdownPreEntity() +{ + g_SoundscapeSystem.Shutdown(); +} + int CSoundscapeSystem::GetSoundscapeIndex( const char *pName ) { return m_soundscapes.GetStringID( pName ); diff --git a/src/game/server/soundscape_system.h b/src/game/server/soundscape_system.h index 8cdc356d9..f7164ad98 100644 --- a/src/game/server/soundscape_system.h +++ b/src/game/server/soundscape_system.h @@ -36,6 +36,7 @@ class CSoundscapeSystem : public CAutoGameSystemPerFrame virtual void FrameUpdatePostEntityThink( void ); virtual void LevelInitPreEntity( void ); virtual void LevelInitPostEntity(); + virtual void LevelShutdownPreEntity(); virtual void AddSoundscapeFile( const char *filename ); int GetSoundscapeIndex( const char *pName ); diff --git a/src/game/shared/collisionproperty.cpp b/src/game/shared/collisionproperty.cpp index 0ff49a4a6..bccba0496 100644 --- a/src/game/shared/collisionproperty.cpp +++ b/src/game/shared/collisionproperty.cpp @@ -167,6 +167,10 @@ void CDirtySpatialPartitionEntityList::OnPreQuery( SpatialPartitionListMask_t li if ( m_partitionWriteId != 0 && m_partitionWriteId == ThreadGetCurrentId() ) return; + // Don't break the cache by running in a separate thread! Not thread-safe! + if ( !ThreadInMainThread() ) + return; + #ifdef CLIENT_DLL // FIXME: This should really be an assertion... feh! 
if ( !C_BaseEntity::IsAbsRecomputationsEnabled() ) @@ -1107,7 +1111,8 @@ void CCollisionProperty::ComputeSurroundingBox( Vector *pVecWorldMins, Vector *p { Assert( GetSolid() != SOLID_CUSTOM ); bool bUseVPhysics = false; - if ( ( GetSolid() == SOLID_VPHYSICS ) && ( GetOuter()->GetMoveType() == MOVETYPE_VPHYSICS ) ) + // VPhysics is not thread-safe! + if ( ThreadInMainThread() && ( GetSolid() == SOLID_VPHYSICS ) && ( GetOuter()->GetMoveType() == MOVETYPE_VPHYSICS ) ) { // UNDONE: This may not be necessary any more. IPhysicsObject *pPhysics = GetOuter()->VPhysicsGetObject(); @@ -1128,7 +1133,9 @@ void CCollisionProperty::ComputeSurroundingBox( Vector *pVecWorldMins, Vector *p break; case USE_HITBOXES: - ComputeHitboxSurroundingBox( pVecWorldMins, pVecWorldMaxs ); + // Client code is not thread-safe! + if (ThreadInMainThread()) + ComputeHitboxSurroundingBox( pVecWorldMins, pVecWorldMaxs ); break; case USE_ROTATION_EXPANDED_BOUNDS: @@ -1141,7 +1148,9 @@ void CCollisionProperty::ComputeSurroundingBox( Vector *pVecWorldMins, Vector *p break; case USE_GAME_CODE: - GetOuter()->ComputeWorldSpaceSurroundingBox( pVecWorldMins, pVecWorldMaxs ); + // Client code is not thread-safe! + if (ThreadInMainThread()) + GetOuter()->ComputeWorldSpaceSurroundingBox( pVecWorldMins, pVecWorldMaxs ); Assert( pVecWorldMins->x <= pVecWorldMaxs->x ); Assert( pVecWorldMins->y <= pVecWorldMaxs->y ); Assert( pVecWorldMins->z <= pVecWorldMaxs->z ); From 5bd9fcfdb118c4ff8f9e015007d7259ef8bd64f4 Mon Sep 17 00:00:00 2001 From: mastercoms Date: Mon, 6 Mar 2023 12:06:11 -0500 Subject: [PATCH 10/42] perf: backport animation ActivityList optimization this saves a lot of comparison times syncing client and server back and forth in a listen server (about 2 to 4%) --- src/game/shared/animation.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/game/shared/animation.cpp b/src/game/shared/animation.cpp index 975615813..da70c01ce 100644 --- a/src/game/shared/animation.cpp +++ b/src/game/shared/animation.cpp @@ -213,7 +213,7 @@ void VerifySequenceIndex( CStudioHdr *pstudiohdr ) return; } - if( pstudiohdr->GetActivityListVersion( ) != g_nActivityListVersion ) + if( pstudiohdr->GetActivityListVersion( ) < g_nActivityListVersion ) // sometimes the server's numbers can get ahead of the client's if we're sharing memory between the two, so it's only necessary to reindex if a model is lagging the version number. 
{ // this model's sequences have not yet been indexed by activity IndexModelSequences( pstudiohdr ); From ae4f85871e7e59d6d391756b380f60e739b4f3ab Mon Sep 17 00:00:00 2001 From: mastercoms Date: Mon, 6 Mar 2023 12:13:32 -0500 Subject: [PATCH 11/42] perf: RenderSpriteCard backport from CSGO * skips render if alpha is 0 * combines nSequence calculation * uses FastQuad --- src/particles/builtin_particle_render_ops.cpp | 16 ++++++---------- src/public/materialsystem/imesh.h | 6 ++++++ 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/src/particles/builtin_particle_render_ops.cpp b/src/particles/builtin_particle_render_ops.cpp index 966b15509..e588f754a 100644 --- a/src/particles/builtin_particle_render_ops.cpp +++ b/src/particles/builtin_particle_render_ops.cpp @@ -905,6 +905,9 @@ void C_OP_RenderSprites::RenderUnsortedNonSpriteCardOriented( CParticleCollectio void C_OP_RenderSprites::RenderSpriteCard( CMeshBuilder &meshBuilder, C_OP_RenderSpritesContext_t *pCtx, SpriteRenderInfo_t& info, int hParticle, ParticleRenderData_t const *pSortList, Vector *pCamera ) const { Assert( hParticle != -1 ); + unsigned char ac = pSortList->m_nAlpha; + if (! ac ) + return; int nGroup = hParticle / 4; int nOffset = hParticle & 0x3; @@ -921,7 +924,6 @@ void C_OP_RenderSprites::RenderSpriteCard( CMeshBuilder &meshBuilder, C_OP_Rende unsigned char rc = FastFToC( r ); unsigned char gc = FastFToC( g ); unsigned char bc = FastFToC( b ); - unsigned char ac = pSortList->m_nAlpha; float rad = pSortList->m_flRadius; if ( !IsFinite( rad ) ) @@ -959,16 +961,15 @@ void C_OP_RenderSprites::RenderSpriteCard( CMeshBuilder &meshBuilder, C_OP_Rende // float flLifetime = SubFloat( pLifeDuration[ nGroup * ld_stride ], nOffset ); // flAgeScale = ( flLifetime > 0.0f ) ? ( 1.0f / flLifetime ) * SEQUENCE_SAMPLE_COUNT : 0.0f; // } + int nSequence = SubFloat( info.m_pSequenceNumber[ nGroup * info.m_nSequenceStride ], nOffset ); if ( m_bAnimateInFPS ) { - int nSequence = SubFloat( info.m_pSequenceNumber[ nGroup * info.m_nSequenceStride ], nOffset ); flAgeScale = flAgeScale / info.m_pParticles->m_Sheet()->m_flFrameSpan[nSequence]; } pSample = GetSampleForSequence( info.m_pSheet, SubFloat( info.m_pCreationTimeStamp[ nGroup * info.m_nCreationTimeStride ], nOffset ), info.m_pParticles->m_flCurTime, - flAgeScale, - SubFloat( info.m_pSequenceNumber[ nGroup * info.m_nSequenceStride ], nOffset ) ); + flAgeScale, nSequence ); } const SequenceSampleTextureCoords_t *pSample0 = &(pSample->m_TextureCoordData[0]); @@ -1015,12 +1016,7 @@ void C_OP_RenderSprites::RenderSpriteCard( CMeshBuilder &meshBuilder, C_OP_Rende meshBuilder.TexCoord4f( 4, pSecondTexture0->m_fLeft_U0, pSecondTexture0->m_fTop_V0, pSecondTexture0->m_fRight_U0, pSecondTexture0->m_fBottom_V0 ); meshBuilder.AdvanceVertex(); - meshBuilder.FastIndex( info.m_nVertexOffset ); - meshBuilder.FastIndex( info.m_nVertexOffset + 1 ); - meshBuilder.FastIndex( info.m_nVertexOffset + 2 ); - meshBuilder.FastIndex( info.m_nVertexOffset ); - meshBuilder.FastIndex( info.m_nVertexOffset + 2 ); - meshBuilder.FastIndex( info.m_nVertexOffset + 3 ); + meshBuilder.FastQuad( info.m_nVertexOffset ); info.m_nVertexOffset += 4; } } diff --git a/src/public/materialsystem/imesh.h b/src/public/materialsystem/imesh.h index 3da443b4e..dffd1e419 100644 --- a/src/public/materialsystem/imesh.h +++ b/src/public/materialsystem/imesh.h @@ -3206,6 +3206,7 @@ class CMeshBuilder : public MeshDesc_t // Fast Index! 
No need to call advance index, and no random access allowed void FastIndex( unsigned short index ); + void FastQuad( int index ); // Fast Vertex! No need to call advance vertex, and no random access allowed. // WARNING - these are low level functions that are intended only for use @@ -3775,6 +3776,11 @@ FORCEINLINE void CMeshBuilder::FastIndex2( unsigned short nIndex1, unsigned shor m_IndexBuilder.FastIndex2( nIndex1, nIndex2 ); } +FORCEINLINE void CMeshBuilder::FastQuad( int nIndex ) +{ + m_IndexBuilder.FastQuad( nIndex ); +} + //----------------------------------------------------------------------------- // For use with the FastVertex methods, advances the current vertex by N //----------------------------------------------------------------------------- From 41671beaac61a57c8d5663f863c85568deca25b0 Mon Sep 17 00:00:00 2001 From: mastercoms Date: Thu, 9 Mar 2023 22:07:39 -0500 Subject: [PATCH 12/42] perf(vgui): font optimizations * a lot of profile time in the RB tree lookups for FontTextureCache and Win32Font * adapt the old Xbox 360 only ABC cache to be optional and lazily filled this allows us to have a fast array based lookup for the common characters * fix IsValidIndex check against Find results, we can just compare directly to InvalidIndex this fixes extra bounds checks taking up extra time for both extended cache lookups * backport the m_CommonCharCache from CSGO for FontTextureCache this similarly adds a fast array based lookup for common characters and also I modified the code from CSGO to include the InvalidIndex check fix here as well * in short, this eliminates most of the time used for font rendering and fitting --- src/common/vgui_surfacelib/Win32Font.h | 6 +- src/vgui2/vgui_surfacelib/Win32Font.cpp | 70 ++++++++++++---- src/vguimatsurface/FontTextureCache.cpp | 106 +++++++++++++++++------- src/vguimatsurface/FontTextureCache.h | 17 +++- 4 files changed, 149 insertions(+), 50 deletions(-) diff --git a/src/common/vgui_surfacelib/Win32Font.h b/src/common/vgui_surfacelib/Win32Font.h index 7122fa95a..b6c66eb8d 100644 --- a/src/common/vgui_surfacelib/Win32Font.h +++ b/src/common/vgui_surfacelib/Win32Font.h @@ -128,6 +128,8 @@ class CWin32Font char c; }; + enum { ABCWIDTHS_CACHE_SIZE = 256 }; + abc_t* m_ABCWidthsCache[ABCWIDTHS_CACHE_SIZE]; #if !defined( _X360 ) // On PC we cache char widths on demand when actually requested to minimize our use of the kernels // paged pool (GDI may cache information about glyphs we have requested and take up lots of paged pool) @@ -138,10 +140,6 @@ class CWin32Font }; CUtlRBTree m_ExtendedABCWidthsCache; static bool ExtendedABCWidthsCacheLessFunc(const abc_cache_t &lhs, const abc_cache_t &rhs); -#else - // 360 requires all possible characters during font init - enum { ABCWIDTHS_CACHE_SIZE = 256 }; - abc_t m_ABCWidthsCache[ABCWIDTHS_CACHE_SIZE]; #endif }; diff --git a/src/vgui2/vgui_surfacelib/Win32Font.cpp b/src/vgui2/vgui_surfacelib/Win32Font.cpp index 5ec02ac69..b6bedc36a 100644 --- a/src/vgui2/vgui_surfacelib/Win32Font.cpp +++ b/src/vgui2/vgui_surfacelib/Win32Font.cpp @@ -1,4 +1,4 @@ -//========= Copyright Valve Corporation, All rights reserved. ============// +//========= Copyright Valve Corporation, All rights reserved. 
============// // // Purpose: // @@ -48,9 +48,7 @@ CWin32Font::CWin32Font() : m_ExtendedABCWidthsCache(256, 0, &ExtendedABCWidthsCa m_bAdditive = false; m_rgiBitmapSize[ 0 ] = m_rgiBitmapSize[ 1 ] = 0; -#if defined( _X360 ) Q_memset( m_ABCWidthsCache, 0, sizeof( m_ABCWidthsCache ) ); -#endif m_ExtendedABCWidthsCache.EnsureCapacity( 128 ); @@ -84,6 +82,14 @@ CWin32Font::~CWin32Font() ::DeleteDC( m_hDC ); if ( m_hDIB ) ::DeleteObject( m_hDIB ); + +#ifndef _X360 + for (int i = 0; i < ARRAYSIZE(m_ABCWidthsCache); i++) + { + delete m_ABCWidthsCache[i]; + m_ABCWidthsCache[i] = NULL; + } +#endif } //----------------------------------------------------------------------------- @@ -242,6 +248,14 @@ bool CWin32Font::Create(const char *windowsFontName, int tall, int weight, int b } } } +#else + Assert(ABCWIDTHS_CACHE_SIZE <= 256); + Q_memset(m_ABCWidthsCache, 0, sizeof(m_ABCWidthsCache)); + for (int i = 0; i < ARRAYSIZE(m_ABCWidthsCache); i++) + { + delete m_ABCWidthsCache[i]; + m_ABCWidthsCache[i] = NULL; + } #endif return true; @@ -476,7 +490,7 @@ void CWin32Font::GetCharRGBA(wchar_t ch, int rgbaWide, int rgbaTall, unsigned ch //----------------------------------------------------------------------------- bool CWin32Font::IsEqualTo(const char *windowsFontName, int tall, int weight, int blur, int scanlines, int flags) { - if ( !stricmp(windowsFontName, m_szName.String() ) + if ( !V_stricmp(windowsFontName, m_szName.String() ) && m_iTall == tall && m_iWeight == weight && m_iBlur == blur @@ -512,23 +526,31 @@ void CWin32Font::SetAsActiveFont(HDC hdc) void CWin32Font::GetCharABCWidths(int ch, int &a, int &b, int &c) { Assert( IsValid() ); -#if defined( _X360 ) - if (ch < ABCWIDTHS_CACHE_SIZE) + bool bFastPath = ch < ABCWIDTHS_CACHE_SIZE; + bool bNeedsExtendedLookup = !bFastPath; + abc_cache_t finder; + if (bFastPath) { // use the cache entry - a = m_ABCWidthsCache[ch].a; - b = m_ABCWidthsCache[ch].b; - c = m_ABCWidthsCache[ch].c; + abc_t* p_abc = m_ABCWidthsCache[ch]; + if (p_abc) + { + abc_t& abc = *p_abc; + a = abc.a; + b = abc.b; + c = abc.c; + return; + } + bNeedsExtendedLookup = true; } - else -#endif + if (bNeedsExtendedLookup) { - // look for it in the cache - abc_cache_t finder = { (wchar_t)ch }; + finder = { (wchar_t)ch }; unsigned short i = m_ExtendedABCWidthsCache.Find(finder); - if (m_ExtendedABCWidthsCache.IsValidIndex(i)) + // This used to be IsValidIndex, but we're getting this right from Find so we don't need to do extra bounds checks. 
+ if ( i != m_ExtendedABCWidthsCache.InvalidIndex() ) { a = m_ExtendedABCWidthsCache[i].abc.a; b = m_ExtendedABCWidthsCache[i].abc.b; @@ -563,11 +585,25 @@ void CWin32Font::GetCharABCWidths(int ch, int &a, int &b, int &c) b = m_iMaxCharWidth; } } + } + char s_a = a - m_iBlur - m_iOutlineSize; + short s_b = b + ((m_iBlur + m_iOutlineSize) * 2) + m_iDropShadowOffset; + char s_c = c - m_iBlur - m_iDropShadowOffset - m_iOutlineSize; + + if (bFastPath) + { + m_ABCWidthsCache[ch] = new abc_t; + m_ABCWidthsCache[ch]->a = s_a; + m_ABCWidthsCache[ch]->b = s_b; + m_ABCWidthsCache[ch]->c = s_c; + } + else + { // add to the cache - finder.abc.a = a - m_iBlur - m_iOutlineSize; - finder.abc.b = b + ((m_iBlur + m_iOutlineSize) * 2) + m_iDropShadowOffset; - finder.abc.c = c - m_iBlur - m_iDropShadowOffset - m_iOutlineSize; + finder.abc.a = s_a; + finder.abc.b = s_b; + finder.abc.c = s_c; m_ExtendedABCWidthsCache.Insert(finder); } } diff --git a/src/vguimatsurface/FontTextureCache.cpp b/src/vguimatsurface/FontTextureCache.cpp index d94e9afae..1886d0f42 100644 --- a/src/vguimatsurface/FontTextureCache.cpp +++ b/src/vguimatsurface/FontTextureCache.cpp @@ -63,6 +63,7 @@ CON_COMMAND( mat_texture_outline_fonts, "Outline fonts textures." ) CFontTextureCache::CFontTextureCache() : m_CharCache(0, 256, CacheEntryLessFunc) { + V_memset(m_CommonCharCache, 0, sizeof(m_CommonCharCache)); Clear(); } @@ -95,6 +96,12 @@ void CFontTextureCache::Clear() } m_FontPages.SetLessFunc( DefLessFunc( vgui::HFont ) ); m_FontPages.RemoveAll(); + + for (int i = 0; i < ARRAYSIZE(m_CommonCharCache); i++) + { + delete m_CommonCharCache[i]; + m_CommonCharCache[i] = 0; + } } //----------------------------------------------------------------------------- @@ -208,19 +215,69 @@ bool CFontTextureCache::GetTextureForChars( vgui::HFont font, vgui::FontDrawType for ( int i = 0; i < numChars; i++ ) { - CacheEntry_t cacheItem; - cacheItem.font = font; - cacheItem.wch = wch[i]; - HCacheEntry cacheHandle = m_CharCache.Find( cacheItem ); - if ( ! 
m_CharCache.IsValidIndex( cacheHandle ) ) + wchar_t wideChar = wch[i]; + + int* pCachePage; + float* pCacheCoords; + + // profiling dicatated that avoiding the naive font/char RB lookup was beneficial + // instead waste a little memory to get all the western language chars to be direct + if (wideChar < MAX_COMMON_CHARS && font < ARRAYSIZE(m_CommonCharCache)) { - // All characters must come out of the same font - if ( winFont != FontManager().GetFontForChar( font, wch[i] ) ) - return false; + // dominant amount of simple chars are instant direct lookup + CommonChar_t* pCommonChars = m_CommonCharCache[font]; + if (!pCommonChars) + { + // missing + if (winFont != FontManager().GetFontForChar(font, wideChar)) + { + // all characters in string must come out of the same font + return false; + } + // init and insert + pCommonChars = new CommonChar_t; + memset(pCommonChars, 0, sizeof(CommonChar_t)); + m_CommonCharCache[font] = pCommonChars; + } + pCachePage = &pCommonChars->details[wideChar].page; + pCacheCoords = pCommonChars->details[wideChar].texCoords; + } + else + { + // extended chars are a costlier lookup + // page and char form a unique key to find in cache + CacheEntry_t cacheItem; + cacheItem.font = font; + cacheItem.wch = wideChar; + HCacheEntry cacheHandle = m_CharCache.Find(cacheItem); + if ( cacheHandle == m_CharCache.InvalidIndex() ) + { + // missing + if (winFont != FontManager().GetFontForChar(font, wideChar)) + { + // all characters in string must come out of the same font + return false; + } + + // init and insert + cacheItem.texCoords[0] = 0; + cacheItem.texCoords[1] = 0; + cacheItem.texCoords[2] = 0; + cacheItem.texCoords[3] = 0; + cacheHandle = m_CharCache.Insert(cacheItem); + Assert(m_CharCache.IsValidIndex(cacheHandle)); + } + pCachePage = &m_CharCache[cacheHandle].page; + pCacheCoords = m_CharCache[cacheHandle].texCoords; + } + + if ( pCacheCoords[2] == 0 && pCacheCoords[3] == 0 ) + { + // invalid page, setup for page allocation // get the char details int a, b, c; - winFont->GetCharABCWidths( wch[i], a, b, c ); + winFont->GetCharABCWidths( wideChar, a, b, c ); int fontWide = max( b, 1 ); int fontTall = max( winFont->GetHeight(), 1 ); if ( winFont->GetUnderlined() ) @@ -230,14 +287,14 @@ bool CFontTextureCache::GetTextureForChars( vgui::HFont font, vgui::FontDrawType // Get a texture to render into int page, drawX, drawY, twide, ttall; - if ( !AllocatePageForChar( fontWide, fontTall, page, drawX, drawY, twide, ttall ) ) + if ( !AllocatePageForChar(fontWide, fontTall, page, drawX, drawY, twide, ttall) ) return false; // accumulate data to pass to GetCharsRGBA below newEntries[ numNewChars ].page = page; newEntries[ numNewChars ].drawX = drawX; newEntries[ numNewChars ].drawY = drawY; - newChars[ numNewChars ].wch = wch[i]; + newChars[ numNewChars ].wch = wideChar; newChars[ numNewChars ].fontWide = fontWide; newChars[ numNewChars ].fontTall = fontTall; newChars[ numNewChars ].offset = 4*totalNewCharTexels; @@ -245,25 +302,18 @@ bool CFontTextureCache::GetTextureForChars( vgui::HFont font, vgui::FontDrawType maxNewCharTexels = max( maxNewCharTexels, fontWide*fontTall ); numNewChars++; - // set the cache info - cacheItem.page = page; - - // the 0.5 texel offset is done in CMatSystemTexture::SetMaterial() / CMatSystemSurface::StartDrawing() - double adjust = 0.0f; + // the 0.5 texel offset is done in CMatSystemTexture::SetMaterial() + pCacheCoords[0] = (float)( (double)drawX / ((double)twide) ); + pCacheCoords[1] = (float)( (double)drawY / ((double)ttall) ); + pCacheCoords[2] 
= (float)( (double)(drawX + fontWide) / (double)twide ); + pCacheCoords[3] = (float)( (double)(drawY + fontTall) / (double)ttall ); - cacheItem.texCoords[0] = (float)( (double)drawX / ((double)twide + adjust) ); - cacheItem.texCoords[1] = (float)( (double)drawY / ((double)ttall + adjust) ); - cacheItem.texCoords[2] = (float)( (double)(drawX + fontWide) / (double)twide ); - cacheItem.texCoords[3] = (float)( (double)(drawY + fontTall) / (double)ttall ); - - m_CharCache.Insert(cacheItem); - cacheHandle = m_CharCache.Find( cacheItem ); - Assert( m_CharCache.IsValidIndex( cacheHandle ) ); + *pCachePage = page; } - - int page = m_CharCache[cacheHandle].page; - textureID[i] = m_PageList[page].textureID[typePage]; - texCoords[i] = m_CharCache[cacheHandle].texCoords; + + // give data to caller + textureID[i] = m_PageList[*pCachePage].textureID[typePage]; + texCoords[i] = pCacheCoords; } // Generate texture data for all newly-encountered characters diff --git a/src/vguimatsurface/FontTextureCache.h b/src/vguimatsurface/FontTextureCache.h index 1f12e3ce2..f09c40171 100644 --- a/src/vguimatsurface/FontTextureCache.h +++ b/src/vguimatsurface/FontTextureCache.h @@ -18,6 +18,8 @@ class ITexture; +#define MAX_COMMON_CHARS 256 + //----------------------------------------------------------------------------- // Purpose: manages texture memory for unicode fonts in vgui //----------------------------------------------------------------------------- @@ -51,13 +53,24 @@ class CFontTextureCache FONT_PAGE_SIZE_COUNT, }; + // hold the common characters + struct charDetail_t + { + int page; + float texCoords[4]; + }; + struct CommonChar_t + { + charDetail_t details[MAX_COMMON_CHARS]; + }; + // a single character in the cache typedef unsigned short HCacheEntry; struct CacheEntry_t { vgui::HFont font; wchar_t wch; - unsigned char page; + int page; float texCoords[4]; // doubly-linked list for use in the LRU @@ -81,6 +94,8 @@ class CFontTextureCache // Creates font materials void CreateFontMaterials( Page_t &page, ITexture *pFontTexture, bool bitmapFont = false ); + CommonChar_t* m_CommonCharCache[384]; + // Computes the page size given a character height int ComputePageType( int charTall ) const; From 07ea2e18ffb11940780a85bfc8660c371d048488 Mon Sep 17 00:00:00 2001 From: mastercoms Date: Thu, 9 Mar 2023 22:22:40 -0500 Subject: [PATCH 13/42] perf(vgui): backport CMatSystemSurface::DrawQuadArray from CSGO * this greatly improves performance of various HUD element draws --- src/vguimatsurface/MatSystemSurface.cpp | 141 ++++++++++++++---------- 1 file changed, 80 insertions(+), 61 deletions(-) diff --git a/src/vguimatsurface/MatSystemSurface.cpp b/src/vguimatsurface/MatSystemSurface.cpp index 742659081..7775f3b7c 100644 --- a/src/vguimatsurface/MatSystemSurface.cpp +++ b/src/vguimatsurface/MatSystemSurface.cpp @@ -1092,80 +1092,99 @@ void CMatSystemSurface::DrawQuadArray( int quadCount, vgui::Vertex_t *pVerts, un if ( !m_pMesh ) return; - meshBuilder.Begin( m_pMesh, MATERIAL_QUADS, quadCount ); - vgui::Vertex_t ulc; vgui::Vertex_t lrc; vgui::Vertex_t *pulc; vgui::Vertex_t *plrc; - if ( bShouldClip ) - { - for ( int i = 0; i < quadCount; ++i ) - { - PREFETCH360( &pVerts[ 2 * ( i + 1 ) ], 0 ); - - if ( !ClipRect( pVerts[2*i], pVerts[2*i + 1], &ulc, &lrc ) ) - { - continue; - } - pulc = &ulc; - plrc = &lrc; - - meshBuilder.Position3f( pulc->m_Position.x, pulc->m_Position.y, m_flZPos ); - meshBuilder.Color4ubv( pColor ); - meshBuilder.TexCoord2f( 0, pulc->m_TexCoord.x, pulc->m_TexCoord.y ); - meshBuilder.AdvanceVertexF(); + 
int nMaxVertices, nMaxIndices; + CMatRenderContextPtr pRenderContext(g_pMaterialSystem); + pRenderContext->GetMaxToRender(m_pMesh, false, &nMaxVertices, &nMaxIndices); + if (!nMaxVertices || !nMaxIndices) + return; // probably in alt-tab - meshBuilder.Position3f( plrc->m_Position.x, pulc->m_Position.y, m_flZPos ); - meshBuilder.Color4ubv( pColor ); - meshBuilder.TexCoord2f( 0, plrc->m_TexCoord.x, pulc->m_TexCoord.y ); - meshBuilder.AdvanceVertexF(); + int nMaxQuads = nMaxVertices / 4; + nMaxQuads = MIN(nMaxQuads, nMaxIndices / 6); - meshBuilder.Position3f( plrc->m_Position.x, plrc->m_Position.y, m_flZPos ); - meshBuilder.Color4ubv( pColor ); - meshBuilder.TexCoord2f( 0, plrc->m_TexCoord.x, plrc->m_TexCoord.y ); - meshBuilder.AdvanceVertexF(); + int nFirstQuad = 0; + int nQuadsRemaining = quadCount; - meshBuilder.Position3f( pulc->m_Position.x, plrc->m_Position.y, m_flZPos ); - meshBuilder.Color4ubv( pColor ); - meshBuilder.TexCoord2f( 0, pulc->m_TexCoord.x, plrc->m_TexCoord.y ); - meshBuilder.AdvanceVertexF(); - } - } - else + while (nQuadsRemaining > 0) { - for ( int i = 0; i < quadCount; ++i ) + quadCount = MIN( nQuadsRemaining, nMaxQuads ); + meshBuilder.Begin( m_pMesh, MATERIAL_QUADS, quadCount ); + if ( bShouldClip ) { - PREFETCH360( &pVerts[ 2 * ( i + 1 ) ], 0 ); - - pulc = &pVerts[2*i]; - plrc = &pVerts[2*i + 1]; - - meshBuilder.Position3f( pulc->m_Position.x, pulc->m_Position.y, m_flZPos ); - meshBuilder.Color4ubv( pColor ); - meshBuilder.TexCoord2f( 0, pulc->m_TexCoord.x, pulc->m_TexCoord.y ); - meshBuilder.AdvanceVertexF(); - - meshBuilder.Position3f( plrc->m_Position.x, pulc->m_Position.y, m_flZPos ); - meshBuilder.Color4ubv( pColor ); - meshBuilder.TexCoord2f( 0, plrc->m_TexCoord.x, pulc->m_TexCoord.y ); - meshBuilder.AdvanceVertexF(); - - meshBuilder.Position3f( plrc->m_Position.x, plrc->m_Position.y, m_flZPos ); - meshBuilder.Color4ubv( pColor ); - meshBuilder.TexCoord2f( 0, plrc->m_TexCoord.x, plrc->m_TexCoord.y ); - meshBuilder.AdvanceVertexF(); + for ( int q = 0; q < quadCount; ++q ) + { + int i = q + nFirstQuad; + PREFETCH360( &pVerts[ 2 * ( i + 1 ) ], 0 ); - meshBuilder.Position3f( pulc->m_Position.x, plrc->m_Position.y, m_flZPos ); - meshBuilder.Color4ubv( pColor ); - meshBuilder.TexCoord2f( 0, pulc->m_TexCoord.x, plrc->m_TexCoord.y ); - meshBuilder.AdvanceVertexF(); + if ( !ClipRect( pVerts[2*i], pVerts[2*i + 1], &ulc, &lrc ) ) + { + continue; + } + pulc = &ulc; + plrc = &lrc; + + meshBuilder.Position3f( pulc->m_Position.x, pulc->m_Position.y, m_flZPos ); + meshBuilder.Color4ubv( pColor ); + meshBuilder.TexCoord2f( 0, pulc->m_TexCoord.x, pulc->m_TexCoord.y ); + meshBuilder.AdvanceVertexF(); + + meshBuilder.Position3f( plrc->m_Position.x, pulc->m_Position.y, m_flZPos ); + meshBuilder.Color4ubv( pColor ); + meshBuilder.TexCoord2f( 0, plrc->m_TexCoord.x, pulc->m_TexCoord.y ); + meshBuilder.AdvanceVertexF(); + + meshBuilder.Position3f( plrc->m_Position.x, plrc->m_Position.y, m_flZPos ); + meshBuilder.Color4ubv( pColor ); + meshBuilder.TexCoord2f( 0, plrc->m_TexCoord.x, plrc->m_TexCoord.y ); + meshBuilder.AdvanceVertexF(); + + meshBuilder.Position3f( pulc->m_Position.x, plrc->m_Position.y, m_flZPos ); + meshBuilder.Color4ubv( pColor ); + meshBuilder.TexCoord2f( 0, pulc->m_TexCoord.x, plrc->m_TexCoord.y ); + meshBuilder.AdvanceVertexF(); + } + } + else + { + for (int q = 0; q < quadCount; ++q) + { + int i = q + nFirstQuad; + PREFETCH360( &pVerts[ 2 * ( i + 1 ) ], 0 ); + + pulc = &pVerts[2*i]; + plrc = &pVerts[2*i + 1]; + + meshBuilder.Position3f( pulc->m_Position.x, 
pulc->m_Position.y, m_flZPos ); + meshBuilder.Color4ubv( pColor ); + meshBuilder.TexCoord2f( 0, pulc->m_TexCoord.x, pulc->m_TexCoord.y ); + meshBuilder.AdvanceVertexF(); + + meshBuilder.Position3f( plrc->m_Position.x, pulc->m_Position.y, m_flZPos ); + meshBuilder.Color4ubv( pColor ); + meshBuilder.TexCoord2f( 0, plrc->m_TexCoord.x, pulc->m_TexCoord.y ); + meshBuilder.AdvanceVertexF(); + + meshBuilder.Position3f( plrc->m_Position.x, plrc->m_Position.y, m_flZPos ); + meshBuilder.Color4ubv( pColor ); + meshBuilder.TexCoord2f( 0, plrc->m_TexCoord.x, plrc->m_TexCoord.y ); + meshBuilder.AdvanceVertexF(); + + meshBuilder.Position3f( pulc->m_Position.x, plrc->m_Position.y, m_flZPos ); + meshBuilder.Color4ubv( pColor ); + meshBuilder.TexCoord2f( 0, pulc->m_TexCoord.x, plrc->m_TexCoord.y ); + meshBuilder.AdvanceVertexF(); + } } - } - meshBuilder.End(); - m_pMesh->Draw(); + meshBuilder.End(); + m_pMesh->Draw(); + nFirstQuad += quadCount; + nQuadsRemaining -= quadCount; + } } From a81ea13b01fd4b3fefaee5321841aac3f8daee7f Mon Sep 17 00:00:00 2001 From: mastercoms Date: Thu, 9 Mar 2023 22:11:24 -0500 Subject: [PATCH 14/42] perf(init): move to CUtlSymbolLarge for dmxloader (CSGO backport) * the particle data loaded from dmx is huge, CUtlSymbolLarge is much more well suited for larger symbol tables * this optimizes particle loading time slightly * also backport the minor change to use the template GetString in dmxloader for size --- src/dmxloader/dmxattribute.cpp | 6 +++--- src/dmxloader/dmxelement.cpp | 9 +++++---- src/dmxloader/dmxloader.cpp | 6 +++--- src/public/dmxloader/dmxattribute.h | 11 ++++++----- src/public/dmxloader/dmxelement.h | 11 ++++++----- src/public/tier1/mempool.h | 6 +++--- 6 files changed, 26 insertions(+), 23 deletions(-) diff --git a/src/dmxloader/dmxattribute.cpp b/src/dmxloader/dmxattribute.cpp index aea474c79..4299d0f2e 100644 --- a/src/dmxloader/dmxattribute.cpp +++ b/src/dmxloader/dmxattribute.cpp @@ -15,7 +15,7 @@ //----------------------------------------------------------------------------- // globals //----------------------------------------------------------------------------- -CUtlSymbolTableMT CDmxAttribute::s_AttributeNameSymbols; +CUtlSymbolTableLargeMT CDmxAttribute::s_AttributeNameSymbols; //----------------------------------------------------------------------------- @@ -128,7 +128,7 @@ CDmxAttribute::CDmxAttribute( const char *pAttributeName ) m_pData = NULL; } -CDmxAttribute::CDmxAttribute( CUtlSymbol attributeName ) +CDmxAttribute::CDmxAttribute( CUtlSymbolLarge attributeName ) { m_Name = attributeName; m_Type = AT_UNKNOWN; @@ -221,7 +221,7 @@ inline const char* CDmxAttribute::GetTypeString() const //----------------------------------------------------------------------------- const char *CDmxAttribute::GetName() const { - return s_AttributeNameSymbols.String( m_Name ); + return m_Name.String(); } diff --git a/src/dmxloader/dmxelement.cpp b/src/dmxloader/dmxelement.cpp index 7537590f8..17a8b55a7 100644 --- a/src/dmxloader/dmxelement.cpp +++ b/src/dmxloader/dmxelement.cpp @@ -8,6 +8,7 @@ #include "dmxloader/dmxattribute.h" #include "tier1/utlbuffer.h" #include "mathlib/ssemath.h" +#include "tier1/utlsymbollarge.h" // memdbgon must be the last include file in a .cpp file!!! 
#include "tier0/memdbgon.h" @@ -16,7 +17,7 @@ //----------------------------------------------------------------------------- // globals //----------------------------------------------------------------------------- -CUtlSymbolTableMT CDmxElement::s_TypeSymbols; +CUtlSymbolTableLargeMT CDmxElement::s_TypeSymbols; //----------------------------------------------------------------------------- @@ -50,14 +51,14 @@ CDmxElement::~CDmxElement() //----------------------------------------------------------------------------- // Utility method for getting at the type //----------------------------------------------------------------------------- -CUtlSymbol CDmxElement::GetType() const +CUtlSymbolLarge CDmxElement::GetType() const { return m_Type; } const char* CDmxElement::GetTypeString() const { - return s_TypeSymbols.String( m_Type ); + return m_Type.String(); } const char* CDmxElement::GetName() const @@ -225,7 +226,7 @@ int CDmxElement::FindAttribute( const char *pAttributeName ) const //----------------------------------------------------------------------------- // Find an attribute by name-based lookup //----------------------------------------------------------------------------- -int CDmxElement::FindAttribute( CUtlSymbol attributeName ) const +int CDmxElement::FindAttribute( CUtlSymbolLarge attributeName ) const { Resort(); CDmxAttribute search( attributeName ); diff --git a/src/dmxloader/dmxloader.cpp b/src/dmxloader/dmxloader.cpp index 2083f3fec..a45b0f140 100644 --- a/src/dmxloader/dmxloader.cpp +++ b/src/dmxloader/dmxloader.cpp @@ -443,7 +443,7 @@ bool CDmxSerializer::Unserialize( CUtlBuffer &buf, int nEncodingVersion, CDmxEle } // Read in the element count. - int nElementCount = buf.GetInt(); + const int nElementCount = buf.GetInt(); if ( !nElementCount ) { // Empty (but valid) file @@ -474,10 +474,10 @@ bool CDmxSerializer::Unserialize( CUtlBuffer &buf, int nEncodingVersion, CDmxEle } else { - buf.GetString( pTypeBuf ); + buf.GetString<256>( pTypeBuf ); pType = pTypeBuf; } - buf.GetString( pName ); + buf.GetString<2048>( pName ); buf.Get( &id, sizeof(DmObjectId_t) ); CDmxElement *pElement = new CDmxElement( pType ); diff --git a/src/public/dmxloader/dmxattribute.h b/src/public/dmxloader/dmxattribute.h index 17523660e..78adea366 100644 --- a/src/public/dmxloader/dmxattribute.h +++ b/src/public/dmxloader/dmxattribute.h @@ -16,6 +16,7 @@ #include "tier1/utlrbtree.h" #include "tier1/utlsymbol.h" #include "tier1/mempool.h" +#include "utlsymbollarge.h" #include "dmxloader/dmxloader.h" @@ -48,7 +49,7 @@ class CDmxAttribute // Returns the name. 
NOTE: The utlsymbol // can be turned into a string by using g_pDataModel->String(); const char *GetName() const; - CUtlSymbol GetNameSymbol() const; + CUtlSymbolLarge GetNameSymbol() const; void SetName( const char *pName ); // Gets values @@ -89,7 +90,7 @@ class CDmxAttribute private: CDmxAttribute( const char *pAttributeName ); - CDmxAttribute( CUtlSymbol attributeName ); + CDmxAttribute( CUtlSymbolLarge attributeName ); ~CDmxAttribute(); // Allocate, free memory for data @@ -100,10 +101,10 @@ class CDmxAttribute void SetValue( DmAttributeType_t type, const void *pSrc, int nLen ); DmAttributeType_t m_Type; - CUtlSymbol m_Name; + CUtlSymbolLarge m_Name; void *m_pData; - static CUtlSymbolTableMT s_AttributeNameSymbols; + static CUtlSymbolTableLargeMT s_AttributeNameSymbols; friend class CDmxElement; }; @@ -122,7 +123,7 @@ template< class T > inline bool CDmxAttribute::IsA() const return GetType() == CDmAttributeInfo< T >::ATTRIBUTE_TYPE; } -inline CUtlSymbol CDmxAttribute::GetNameSymbol() const +inline CUtlSymbolLarge CDmxAttribute::GetNameSymbol() const { return m_Name; } diff --git a/src/public/dmxloader/dmxelement.h b/src/public/dmxloader/dmxelement.h index c4aadd8ac..bdb290b0e 100644 --- a/src/public/dmxloader/dmxelement.h +++ b/src/public/dmxloader/dmxelement.h @@ -15,6 +15,7 @@ #include "tier1/utlvector.h" #include "tier1/utlrbtree.h" #include "tier1/utlsymbol.h" +#include "utlsymbollarge.h" #include "tier1/mempool.h" #include "tier1/UtlSortVector.h" #include "dmxloader/dmxattribute.h" @@ -28,7 +29,7 @@ class CDmxAttributeLess public: bool Less( const CDmxAttribute * pAttribute1, const CDmxAttribute *pAttribute2, void *pContext ) { - return pAttribute1->GetNameSymbol() < pAttribute2->GetNameSymbol(); + return (pAttribute1 ? pAttribute1->GetNameSymbol() : CUtlSymbolLarge(UTL_INVAL_SYMBOL_LARGE)) < (pAttribute2 ? 
pAttribute2->GetNameSymbol() : CUtlSymbolLarge(UTL_INVAL_SYMBOL_LARGE)); } }; @@ -105,7 +106,7 @@ class CDmxElement int AttributeCount() const; CDmxAttribute *GetAttribute( int nIndex ); const CDmxAttribute *GetAttribute( int nIndex ) const; - CUtlSymbol GetType() const; + CUtlSymbolLarge GetType() const; const char* GetTypeString() const; const char* GetName() const; const DmObjectId_t &GetId() const; @@ -161,7 +162,7 @@ class CDmxElement // Finds an attribute by name int FindAttribute( const char *pAttributeName ) const; - int FindAttribute( CUtlSymbol attributeName ) const; + int FindAttribute( CUtlSymbolLarge attributeName ) const; // Sets the object id void SetId( const DmObjectId_t &id ); @@ -171,12 +172,12 @@ class CDmxElement AttributeList_t m_Attributes; DmObjectId_t m_Id; // We need this strictly because we support serialization - CUtlSymbol m_Type; + CUtlSymbolLarge m_Type; char m_nLockCount; mutable bool m_bResortNeeded : 1; bool m_bIsMarkedForDeletion : 1; - static CUtlSymbolTableMT s_TypeSymbols; + static CUtlSymbolTableLargeMT s_TypeSymbols; friend class CDmxSerializer; friend class CDmxSerializerKeyValues2; diff --git a/src/public/tier1/mempool.h b/src/public/tier1/mempool.h index 01d3a33f1..a8ae22fd4 100644 --- a/src/public/tier1/mempool.h +++ b/src/public/tier1/mempool.h @@ -66,8 +66,8 @@ class CUtlMemoryPool static void SetErrorReportFunc( MemoryPoolReportFunc_t func ); // returns number of allocated blocks - int Count() { return m_BlocksAllocated; } - int PeakCount() { return m_PeakAlloc; } + int Count() const { return m_BlocksAllocated; } + int PeakCount() const { return m_PeakAlloc; } protected: class CBlob @@ -111,7 +111,7 @@ class CUtlMemoryPool class CMemoryPoolMT : public CUtlMemoryPool { public: - CMemoryPoolMT(int blockSize, int numElements, int growMode = UTLMEMORYPOOL_GROW_FAST, const char *pszAllocOwner = NULL) : CUtlMemoryPool( blockSize, numElements, growMode, pszAllocOwner) {} + CMemoryPoolMT(int blockSize, int numElements, int growMode = UTLMEMORYPOOL_GROW_FAST, const char *pszAllocOwner = NULL, int nAlignment = 0) : CUtlMemoryPool( blockSize, numElements, growMode, pszAllocOwner, nAlignment) {} void* Alloc() { AUTO_LOCK( m_mutex ); return CUtlMemoryPool::Alloc(); } From 4f63036d229929c87987a82a021dfe5ed84f829e Mon Sep 17 00:00:00 2001 From: mastercoms Date: Thu, 9 Mar 2023 22:30:06 -0500 Subject: [PATCH 15/42] perf(init): stop using a mutex during search paths * I don't think there's any IO threading here that would warrant the need for mutexing this, as all platforms now use a single IO thread * this greatly improves initialization time when scanning paths --- src/filesystem/basefilesystem.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/filesystem/basefilesystem.h b/src/filesystem/basefilesystem.h index e413db7da..b7b3529e7 100644 --- a/src/filesystem/basefilesystem.h +++ b/src/filesystem/basefilesystem.h @@ -580,10 +580,8 @@ abstract_class CBaseFileSystem : public CTier1AppSystem< IFileSystem > if ( *ppszFilename && !Q_IsAbsolutePath( *ppszFilename ) ) { - // Copy paths to minimize mutex lock time - pFileSystem->m_SearchPathsMutex.Lock(); + // Copy to keep filesystem intact CopySearchPaths( pFileSystem->m_SearchPaths ); - pFileSystem->m_SearchPathsMutex.Unlock(); pFileSystem->FixUpPath ( *ppszFilename, m_Filename, sizeof( m_Filename ) ); } @@ -611,10 +609,8 @@ abstract_class CBaseFileSystem : public CTier1AppSystem< IFileSystem > { m_pathID = UTL_INVAL_SYMBOL; } - // Copy paths to minimize mutex lock time - 
pFileSystem->m_SearchPathsMutex.Lock(); + // Copy to keep filesystem intact CopySearchPaths( pFileSystem->m_SearchPaths ); - pFileSystem->m_SearchPathsMutex.Unlock(); m_Filename[0] = '\0'; } From d27e190d282646a6a0c0096eef4fba288b0385ca Mon Sep 17 00:00:00 2001 From: mastercoms Date: Thu, 9 Mar 2023 22:19:22 -0500 Subject: [PATCH 16/42] perf(init): lazily load parsed data for game DLL when on a listen server during startup, the particles manifest and item schema were being parsed in both the server DLL and client DLL on startup. this greatly delays a player's ability quickly launch the game and queue into a match. so, we can instead just skip initializing for server DLL until we need the manifests at map load. particles are already parsed specifically for the map at level init, and pending item schema updates from the GC are also initialized at level init, so this should be a fine change. it also shouldn't touch the dedicated server path. as an added boost, we now also backport a change from CSGO which added threaded init to the client. this completely hides the particle init cost behind item schema parsing time, which is the longer of the two initializations. --- src/game/client/cdll_client_int.cpp | 15 +++++++++++++-- src/game/server/gameinterface.cpp | 19 ++++++++++++++++++- src/game/shared/econ/econ_item_inventory.cpp | 11 ++++++++++- src/game/shared/econ/econ_item_inventory.h | 2 ++ src/game/shared/tf/tf_item_inventory.cpp | 5 +++++ src/game/shared/tf/tf_item_inventory.h | 1 + 6 files changed, 49 insertions(+), 4 deletions(-) diff --git a/src/game/client/cdll_client_int.cpp b/src/game/client/cdll_client_int.cpp index 625cbd120..b6ba114da 100644 --- a/src/game/client/cdll_client_int.cpp +++ b/src/game/client/cdll_client_int.cpp @@ -60,6 +60,7 @@ #include "datacache/imdlcache.h" #include "kbutton.h" #include "tier0/icommandline.h" +#include "vstdlib/jobthread.h" #include "gamerules_register.h" #include "vgui_controls/AnimationController.h" #include "bitmap/tgawriter.h" @@ -838,6 +839,14 @@ bool IsEngineThreaded() return false; } +bool InitParticleManager() +{ + if (!ParticleMgr()->Init(MAX_TOTAL_PARTICLES, materials)) + return false; + + return true; +} + //----------------------------------------------------------------------------- // Constructor //----------------------------------------------------------------------------- @@ -991,8 +1000,8 @@ int CHLClient::Init( CreateInterfaceFn appSystemFactory, CreateInterfaceFn physi if (!Initializer::InitializeAllObjects()) return false; - if (!ParticleMgr()->Init(MAX_TOTAL_PARTICLES, materials)) - return false; + CFunctorJob *pGameJob = new CFunctorJob( CreateFunctor( InitParticleManager ) ); + g_pThreadPool->AddJob( pGameJob ); if (!VGui_Startup( appSystemFactory )) @@ -1035,6 +1044,8 @@ int CHLClient::Init( CreateInterfaceFn appSystemFactory, CreateInterfaceFn physi modemanager->Init( ); + pGameJob->WaitForFinishAndRelease(); + g_pClientMode->InitViewport(); gHUD.Init(); diff --git a/src/game/server/gameinterface.cpp b/src/game/server/gameinterface.cpp index 509fd9058..d352ff281 100644 --- a/src/game/server/gameinterface.cpp +++ b/src/game/server/gameinterface.cpp @@ -565,6 +565,8 @@ EXPOSE_SINGLE_INTERFACE_GLOBALVAR(CServerGameDLL, IServerGameDLL, INTERFACEVERSI // When bumping the version to this interface, check that our assumption is still valid and expose the older version in the same way COMPILE_TIME_ASSERT( INTERFACEVERSION_SERVERGAMEDLL_INT == 10 ); +static bool bParsedParticles = false; + bool CServerGameDLL::DLLInit( 
CreateInterfaceFn appSystemFactory, CreateInterfaceFn physicsFactory, CreateInterfaceFn fileSystemFactory, CGlobalVars *pGlobals) @@ -727,7 +729,11 @@ bool CServerGameDLL::DLLInit( CreateInterfaceFn appSystemFactory, InvalidateQueryCache(); // Parse the particle manifest file & register the effects within it - ParseParticleEffects( false, false ); + if ( engine->IsDedicatedServer() ) + { + ParseParticleEffects( false, false ); + bParsedParticles = true; + } // try to get debug overlay, may be NULL if on HLDS debugoverlay = (IVDebugOverlay *)appSystemFactory( VDEBUG_OVERLAY_INTERFACE_VERSION, NULL ); @@ -958,6 +964,11 @@ bool CServerGameDLL::LevelInit( const char *pMapName, char const *pMapEntities, if ( pItemSchema ) { pItemSchema->BInitFromDelayedBuffer(); + // First valid class must be non-zero if we have a valid schema + if ( pItemSchema->GetFirstValidClass() == 0 ) + { + InventoryManager()->InitializeInventory(); + } } #endif // USES_ECON_ITEMS @@ -970,6 +981,12 @@ bool CServerGameDLL::LevelInit( const char *pMapName, char const *pMapEntities, UpdateRichPresence(); } + if ( !bParsedParticles ) + { + ParseParticleEffects( false, false ); + bParsedParticles = true; + } + //Tony; parse custom manifest if exists! ParseParticleEffectsMap( pMapName, false ); diff --git a/src/game/shared/econ/econ_item_inventory.cpp b/src/game/shared/econ/econ_item_inventory.cpp index 26732e783..1bc2b2b93 100644 --- a/src/game/shared/econ/econ_item_inventory.cpp +++ b/src/game/shared/econ/econ_item_inventory.cpp @@ -303,6 +303,16 @@ bool CInventoryManager::Init( void ) // Purpose: //----------------------------------------------------------------------------- void CInventoryManager::PostInit( void ) +{ +#ifdef GAME_DLL + if ( engine->IsDedicatedServer() ) +#endif + { + InitializeInventory(); + } +} + +void CInventoryManager::InitializeInventory() { // Initialize the item system. 
ItemSystem()->Init(); @@ -443,7 +453,6 @@ void CInventoryManager::LevelShutdownPostEntity( void ) ItemSystem()->ResetAttribStringCache(); } - //----------------------------------------------------------------------------- // Purpose: Lets the client know that we're now connected to the GC //----------------------------------------------------------------------------- diff --git a/src/game/shared/econ/econ_item_inventory.h b/src/game/shared/econ/econ_item_inventory.h index fd6e8b8b1..4b225a43f 100644 --- a/src/game/shared/econ/econ_item_inventory.h +++ b/src/game/shared/econ/econ_item_inventory.h @@ -216,6 +216,8 @@ class CInventoryManager : public CAutoGameSystemPerFrame virtual void LevelInitPreEntity( void ) OVERRIDE; virtual void LevelShutdownPostEntity( void ) OVERRIDE; + virtual void InitializeInventory( void ); + #ifdef CLIENT_DLL // Gets called each frame virtual void Update( float frametime ) OVERRIDE; diff --git a/src/game/shared/tf/tf_item_inventory.cpp b/src/game/shared/tf/tf_item_inventory.cpp index 5b77ae604..0e321f959 100644 --- a/src/game/shared/tf/tf_item_inventory.cpp +++ b/src/game/shared/tf/tf_item_inventory.cpp @@ -220,6 +220,11 @@ CTFInventoryManager::~CTFInventoryManager( void ) void CTFInventoryManager::PostInit( void ) { BaseClass::PostInit(); +} + +void CTFInventoryManager::InitializeInventory() +{ + BaseClass::InitializeInventory(); GenerateBaseItems(); } diff --git a/src/game/shared/tf/tf_item_inventory.h b/src/game/shared/tf/tf_item_inventory.h index 8d1f30516..3b14189d7 100644 --- a/src/game/shared/tf/tf_item_inventory.h +++ b/src/game/shared/tf/tf_item_inventory.h @@ -150,6 +150,7 @@ class CTFInventoryManager : public CInventoryManager ~CTFInventoryManager(); virtual void PostInit( void ); + virtual void InitializeInventory(); #ifdef CLIENT_DLL virtual CPlayerInventory *GeneratePlayerInventoryObject() const { return new CTFPlayerInventory; } From 153c113437717fc3d613d50e47fb5e65a0b81e05 Mon Sep 17 00:00:00 2001 From: mastercoms Date: Thu, 9 Mar 2023 22:28:31 -0500 Subject: [PATCH 17/42] perf: stricmp optimizations * backported FastASCIIToLower from CSGO and applied it appropriately to strtools * backported _V_stricmp conditional structure from CSGO but adjusted it to use a fast ASCII lookup table and also use tolower on non-ASCII characters for general safety * select a few choice cases to migrate from stricmp to V_stricmp CSGO applies it to everything but these are few hotspots --- src/tier1/strtools.cpp | 85 +++++++++++++++++------- src/vgui2/src/LocalizedStringTable.cpp | 2 +- src/vgui2/vgui_controls/Panel.cpp | 2 +- src/vguimatsurface/TextureDictionary.cpp | 2 +- src/vstdlib/KeyValuesSystem.cpp | 2 +- 5 files changed, 65 insertions(+), 28 deletions(-) diff --git a/src/tier1/strtools.cpp b/src/tier1/strtools.cpp index a63fa4fcd..62c4d134f 100644 --- a/src/tier1/strtools.cpp +++ b/src/tier1/strtools.cpp @@ -80,20 +80,20 @@ #endif #include "tier0/memdbgon.h" -static int FastToLower( char c ) -{ - int i = (unsigned char) c; - if ( i < 0x80 ) - { - // Brutally fast branchless ASCII tolower(): - i += (((('A'-1) - i) & (i - ('Z'+1))) >> 26) & 0x20; - } - else - { - i += isupper( i ) ? 0x20 : 0; - } - return i; -} +#define USE_FAST_CASE_CONVERSION 1 +#if USE_FAST_CASE_CONVERSION +/// Faster conversion of an ascii char to upper case. This function does not obey locale or any language +/// setting. It should not be used to convert characters for printing, but it is a better choice +/// for internal strings such as used for hash table keys, etc. 
It's meant to be inlined and used +/// in places like the various dictionary classes. Not obeying locale also protects you from things +/// like your hash values being different depending on the locale setting. +#define FastASCIIToUpper( c ) ( ( ( (c) >= 'a' ) && ( (c) <= 'z' ) ) ? ( (c) - 32 ) : (c) ) +/// similar to FastASCIIToLower +#define FastASCIIToLower( c ) ( ( ( (c) >= 'A' ) && ( (c) <= 'Z' ) ) ? ( (c) + 32 ) : (c) ) +#else +#define FastASCIIToLower tolower +#define FastASCIIToUpper toupper +#endif void _V_memset (const char* file, int line, void *dest, int fill, int count) { @@ -260,6 +260,17 @@ char *V_strnlwr(char *s, size_t count) return pRet; } +static constexpr uint8 lowerAsciiLookup[128] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F, + 0x40, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', + 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 0x5B, 0x5C, 0x5D, 0x5E, 0x5F, + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F +}; + int V_stricmp( const char *str1, const char *str2 ) { // It is not uncommon to compare a string to itself. See @@ -272,6 +283,7 @@ int V_stricmp( const char *str1, const char *str2 ) } const unsigned char *s1 = (const unsigned char*)str1; const unsigned char *s2 = (const unsigned char*)str2; +#if 0 for ( ; *s1; ++s1, ++s2 ) { if ( *s1 != *s2 ) @@ -291,6 +303,31 @@ int V_stricmp( const char *str1, const char *str2 ) } } return *s2 ? 
-1 : 0; +#else + while (true) + { + unsigned char c1 = *s1++; + unsigned char c2 = *s2++; + if (c1 == c2) + { + if ( !c1 ) return 0; + } + else if ((((uint32)c1 | (uint32)c2) & 0xffffff80) == 0) + { + if (int32 res = lowerAsciiLookup[c1] - lowerAsciiLookup[c2]) + { + return res; + } + } + else + { + if (int32 res = tolower(c1) - tolower(c2)) + { + return res; + } + } + } +#endif } int V_strnicmp( const char *str1, const char *str2, int n ) @@ -348,7 +385,7 @@ const char *StringAfterPrefix( const char *str, const char *prefix ) if ( !*prefix ) return str; } - while ( FastToLower( *str++ ) == FastToLower( *prefix++ ) ); + while ( tolower( *str++ ) == tolower( *prefix++ ) ); return NULL; } @@ -638,7 +675,7 @@ char const* V_stristr( char const* pStr, char const* pSearch ) while (*pLetter != 0) { // Skip over non-matches - if (FastToLower((unsigned char)*pLetter) == FastToLower((unsigned char)*pSearch)) + if (FastASCIIToLower((unsigned char)*pLetter) == FastASCIIToLower((unsigned char)*pSearch)) { // Check for match char const* pMatch = pLetter + 1; @@ -649,7 +686,7 @@ char const* V_stristr( char const* pStr, char const* pSearch ) if (*pMatch == 0) return 0; - if (FastToLower((unsigned char)*pMatch) != FastToLower((unsigned char)*pTest)) + if (FastASCIIToLower((unsigned char)*pMatch) != FastASCIIToLower((unsigned char)*pTest)) break; ++pMatch; @@ -696,7 +733,7 @@ char const* V_strnistr( char const* pStr, char const* pSearch, int n ) return 0; // Skip over non-matches - if (FastToLower(*pLetter) == FastToLower(*pSearch)) + if (FastASCIIToLower(*pLetter) == FastASCIIToLower(*pSearch)) { int n1 = n - 1; @@ -712,7 +749,7 @@ char const* V_strnistr( char const* pStr, char const* pSearch, int n ) if (*pMatch == 0) return 0; - if (FastToLower(*pMatch) != FastToLower(*pTest)) + if (FastASCIIToLower(*pMatch) != FastASCIIToLower(*pTest)) break; ++pMatch; @@ -1421,7 +1458,7 @@ int _V_UCS2ToUnicode( const ucs2 *pUCS2, wchar_t *pUnicode, int cubDestSizeInByt size_t nMaxUTF8 = cubDestSizeInBytes; char *pIn = (char *)pUCS2; char *pOut = (char *)pUnicode; - if ( conv_t > 0 ) + if ( conv_t != nullptr ) { cchResult = iconv( conv_t, &pIn, &nLenUnicde, &pOut, &nMaxUTF8 ); iconv_close( conv_t ); @@ -1461,7 +1498,7 @@ int _V_UnicodeToUCS2( const wchar_t *pUnicode, int cubSrcInBytes, char *pUCS2, i size_t nMaxUCS2 = cubDestSizeInBytes; char *pIn = (char*)pUnicode; char *pOut = pUCS2; - if ( conv_t > 0 ) + if ( conv_t != nullptr ) { cchResult = iconv( conv_t, &pIn, &nLenUnicde, &pOut, &nMaxUCS2 ); iconv_close( conv_t ); @@ -1509,7 +1546,7 @@ int _V_UCS2ToUTF8( const ucs2 *pUCS2, char *pUTF8, int cubDestSizeInBytes ) size_t nMaxUTF8 = cubDestSizeInBytes - 1; char *pIn = (char *)pUCS2; char *pOut = (char *)pUTF8; - if ( conv_t > 0 ) + if ( conv_t != nullptr ) { const size_t nBytesToWrite = nMaxUTF8; cchResult = iconv( conv_t, &pIn, &nLenUnicde, &pOut, &nMaxUTF8 ); @@ -1554,7 +1591,7 @@ int _V_UTF8ToUCS2( const char *pUTF8, int cubSrcInBytes, ucs2 *pUCS2, int cubDes size_t nMaxUTF8 = cubDestSizeInBytes; char *pIn = (char *)pUTF8; char *pOut = (char *)pUCS2; - if ( conv_t > 0 ) + if ( conv_t != nullptr ) { cchResult = iconv( conv_t, &pIn, &nLenUnicde, &pOut, &nMaxUTF8 ); iconv_close( conv_t ); @@ -2275,7 +2312,7 @@ bool V_MakeRelativePath( const char *pFullPath, const char *pDirectory, char *pR // Strip out common parts of the path const char *pLastCommonPath = NULL; const char *pLastCommonDir = NULL; - while ( *pPath && ( FastToLower( *pPath ) == FastToLower( *pDir ) || + while ( *pPath && ( tolower( *pPath ) == tolower( 
*pDir ) || ( PATHSEPARATOR( *pPath ) && ( PATHSEPARATOR( *pDir ) || (*pDir == 0) ) ) ) ) { if ( PATHSEPARATOR( *pPath ) ) diff --git a/src/vgui2/src/LocalizedStringTable.cpp b/src/vgui2/src/LocalizedStringTable.cpp index 4e34164f9..2f6effd75 100644 --- a/src/vgui2/src/LocalizedStringTable.cpp +++ b/src/vgui2/src/LocalizedStringTable.cpp @@ -731,7 +731,7 @@ bool CLocalizedStringTable::SymLess(localizedstring_t const &i1, localizedstring const char *str2 = (i2.nameIndex == INVALID_LOCALIZE_STRING_INDEX) ? i2.pszValueString : &g_StringTable.m_Names[i2.nameIndex]; - return stricmp(str1, str2) < 0; + return V_stricmp(str1, str2) < 0; } diff --git a/src/vgui2/vgui_controls/Panel.cpp b/src/vgui2/vgui_controls/Panel.cpp index 2d20a355a..28516440e 100644 --- a/src/vgui2/vgui_controls/Panel.cpp +++ b/src/vgui2/vgui_controls/Panel.cpp @@ -6343,7 +6343,7 @@ PanelAnimationMapEntry *Panel::FindPanelAnimationEntry( char const *scriptname, { PanelAnimationMapEntry *e = &map->entries[ i ]; - if ( !stricmp( e->name(), scriptname ) ) + if ( !V_stricmp( e->name(), scriptname ) ) { return e; } diff --git a/src/vguimatsurface/TextureDictionary.cpp b/src/vguimatsurface/TextureDictionary.cpp index d8fe072d1..eda60e88e 100644 --- a/src/vguimatsurface/TextureDictionary.cpp +++ b/src/vguimatsurface/TextureDictionary.cpp @@ -983,7 +983,7 @@ int CTextureDictionary::FindTextureIdForTextureFile( char const *pFileName ) if ( !mat ) continue; - if ( !stricmp( mat->GetName(), pFileName ) ) + if ( ! V_stricmp( mat->GetName(), pFileName ) ) return i; } diff --git a/src/vstdlib/KeyValuesSystem.cpp b/src/vstdlib/KeyValuesSystem.cpp index 0665d94f5..4b8f16d03 100644 --- a/src/vstdlib/KeyValuesSystem.cpp +++ b/src/vstdlib/KeyValuesSystem.cpp @@ -235,7 +235,7 @@ HKeySymbol CKeyValuesSystem::GetSymbolForString( const char *name, bool bCreate hash_item_t *item = &m_HashTable[hash]; while (1) { - if (!stricmp(name, (char *)m_Strings.GetBase() + item->stringIndex )) + if (!V_stricmp(name, (char *)m_Strings.GetBase() + item->stringIndex )) { return (HKeySymbol)item->stringIndex; } From b379961aece998da89e061193af7a7b7e3b24ae7 Mon Sep 17 00:00:00 2001 From: mastercoms Date: Thu, 9 Mar 2023 22:22:00 -0500 Subject: [PATCH 18/42] perf: make various SharedObjects final to avoid virtual calls in many cases, certain SharedObject classes are used directly, so there is no need to virtualize calls in these circumstances. 
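as a rough sketch of the general idea (hypothetical names for illustration only, not code from this patch), marking the leaf class final lets the compiler prove no further overrides exist, so calls made through that exact type can be resolved directly instead of through the vtable:

    // illustration only: a leaf class marked final cannot be subclassed,
    // so calls through a pointer/reference of that exact static type can be
    // devirtualized (and potentially inlined) rather than dispatched virtually.
    struct ISharedObjectLike
    {
        virtual ~ISharedObjectLike() {}
        virtual int GetTypeID() const = 0;
    };

    struct CEconItemLike final : ISharedObjectLike
    {
        int GetTypeID() const override { return 1; }
    };

    int QueryTypeID( const CEconItemLike &item )
    {
        // static type is final, so this resolves to CEconItemLike::GetTypeID directly
        return item.GetTypeID();
    }
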
this saves a bit of runtime performance especially with CEconItem i didn't finalize the classes whose types were not used directly --- src/game/shared/econ/econ_contribution.h | 2 +- src/game/shared/econ/econ_game_account_client.h | 2 +- src/game/shared/econ/econ_item.h | 2 +- src/game/shared/tf/tf_duel_summary.h | 2 +- src/game/shared/tf/tf_ladder_data.h | 4 ++-- src/game/shared/tf/tf_lobby_server.h | 2 +- src/game/shared/tf/tf_party.h | 2 +- src/game/shared/tf/tf_rating_data.h | 2 +- src/game/shared/tf/tf_wardata.h | 2 +- 9 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/game/shared/econ/econ_contribution.h b/src/game/shared/econ/econ_contribution.h index b46f87f47..9721748f9 100644 --- a/src/game/shared/econ/econ_contribution.h +++ b/src/game/shared/econ/econ_contribution.h @@ -22,7 +22,7 @@ namespace GCSDK //--------------------------------------------------------------------------------- // Purpose: All the account-level information that the GC tracks for TF //--------------------------------------------------------------------------------- -class CTFMapContribution : public GCSDK::CProtoBufSharedObject< CSOTFMapContribution, k_EEconTypeMapContribution > +class CTFMapContribution final : public GCSDK::CProtoBufSharedObject< CSOTFMapContribution, k_EEconTypeMapContribution > { #ifdef GC DECLARE_CLASS_MEMPOOL( CTFMapContribution ); diff --git a/src/game/shared/econ/econ_game_account_client.h b/src/game/shared/econ/econ_game_account_client.h index 372086816..29bb8b478 100644 --- a/src/game/shared/econ/econ_game_account_client.h +++ b/src/game/shared/econ/econ_game_account_client.h @@ -17,7 +17,7 @@ //--------------------------------------------------------------------------------- // Purpose: All the account-level information that the GC tracks //--------------------------------------------------------------------------------- -class CEconGameAccountClient : public GCSDK::CProtoBufSharedObject< CSOEconGameAccountClient, k_EEconTypeGameAccountClient > +class CEconGameAccountClient final : public GCSDK::CProtoBufSharedObject< CSOEconGameAccountClient, k_EEconTypeGameAccountClient > { #ifdef GC DECLARE_CLASS_MEMPOOL( CEconGameAccountClient ); diff --git a/src/game/shared/econ/econ_item.h b/src/game/shared/econ/econ_item.h index ae3270edf..2e1aea098 100644 --- a/src/game/shared/econ/econ_item.h +++ b/src/game/shared/econ/econ_item.h @@ -282,7 +282,7 @@ template < typename T > uint32 WrapDeprecatedUntypedEconItemAttribute( T tValue template < typename TAttribInMemoryType > schema_attribute_stat_bucket_t ISchemaAttributeTypeBase::s_InstanceStats; -class CEconItem : public GCSDK::CSharedObject, public CMaterialOverrideContainer< IEconItemInterface > +class CEconItem final : public GCSDK::CSharedObject, public CMaterialOverrideContainer< IEconItemInterface > { #ifdef GC_DLL DECLARE_CLASS_MEMPOOL( CEconItem ); diff --git a/src/game/shared/tf/tf_duel_summary.h b/src/game/shared/tf/tf_duel_summary.h index 4529423c0..b209a607e 100644 --- a/src/game/shared/tf/tf_duel_summary.h +++ b/src/game/shared/tf/tf_duel_summary.h @@ -44,7 +44,7 @@ const uint32 kWinsPerLevel = 10; //--------------------------------------------------------------------------------- // Purpose: //--------------------------------------------------------------------------------- -class CTFDuelSummary : public GCSDK::CProtoBufSharedObject< CSOTFDuelSummary, k_EEconTypeDuelSummary > +class CTFDuelSummary final : public GCSDK::CProtoBufSharedObject< CSOTFDuelSummary, k_EEconTypeDuelSummary > { #ifdef GC 
DECLARE_CLASS_MEMPOOL( CTFDuelSummary ); diff --git a/src/game/shared/tf/tf_ladder_data.h b/src/game/shared/tf/tf_ladder_data.h index a1b672a1d..d2ea2d02a 100644 --- a/src/game/shared/tf/tf_ladder_data.h +++ b/src/game/shared/tf/tf_ladder_data.h @@ -27,7 +27,7 @@ //--------------------------------------------------------------------------------- // Purpose: The shared object that contains a ladder player's stats //--------------------------------------------------------------------------------- -class CSOTFLadderData : public GCSDK::CProtoBufSharedObject< CSOTFLadderPlayerStats, k_EEConTypeLadderData > +class CSOTFLadderData final : public GCSDK::CProtoBufSharedObject< CSOTFLadderPlayerStats, k_EEConTypeLadderData > { public: CSOTFLadderData(); @@ -55,7 +55,7 @@ CSOTFLadderData *GetLocalPlayerLadderData( EMatchGroup nMatchGroup ); // TODO: G //--------------------------------------------------------------------------------- // Purpose: The shared object that contains stats from a specific match - for match history on the client //--------------------------------------------------------------------------------- -class CSOTFMatchResultPlayerInfo : public GCSDK::CProtoBufSharedObject< CSOTFMatchResultPlayerStats, k_EEConTypeMatchResultPlayerInfo > +class CSOTFMatchResultPlayerInfo final : public GCSDK::CProtoBufSharedObject< CSOTFMatchResultPlayerStats, k_EEConTypeMatchResultPlayerInfo > { public: CSOTFMatchResultPlayerInfo(); diff --git a/src/game/shared/tf/tf_lobby_server.h b/src/game/shared/tf/tf_lobby_server.h index 8252b8397..dffb18788 100644 --- a/src/game/shared/tf/tf_lobby_server.h +++ b/src/game/shared/tf/tf_lobby_server.h @@ -15,7 +15,7 @@ #include "tf_matchmaking_shared.h" #include "playergroup.h" -class CTFGSLobby : public GCSDK::CProtoBufSharedObject +class CTFGSLobby final : public GCSDK::CProtoBufSharedObject { typedef GCSDK::CProtoBufSharedObject BaseClass; public: diff --git a/src/game/shared/tf/tf_party.h b/src/game/shared/tf/tf_party.h index f391773d4..bfca53dcc 100644 --- a/src/game/shared/tf/tf_party.h +++ b/src/game/shared/tf/tf_party.h @@ -24,7 +24,7 @@ namespace GCSDK const int k_nTFPartyMaxSize = 6; -class CTFParty : public GCSDK::CProtoBufSharedObject, public GCSDK::IParty +class CTFParty final : public GCSDK::CProtoBufSharedObject, public GCSDK::IParty { #ifdef GC DECLARE_CLASS_MEMPOOL( CTFParty ); diff --git a/src/game/shared/tf/tf_rating_data.h b/src/game/shared/tf/tf_rating_data.h index 0d1b53833..bfeaf306b 100644 --- a/src/game/shared/tf/tf_rating_data.h +++ b/src/game/shared/tf/tf_rating_data.h @@ -21,7 +21,7 @@ //--------------------------------------------------------------------------------- // Purpose: The shared object that contains a specific MM rating //--------------------------------------------------------------------------------- -class CTFRatingData : public GCSDK::CProtoBufSharedObject< CSOTFRatingData, k_EProtoObjectTFRatingData, /* bPublicMutable */ false > +class CTFRatingData final : public GCSDK::CProtoBufSharedObject< CSOTFRatingData, k_EProtoObjectTFRatingData, /* bPublicMutable */ false > { public: CTFRatingData(); diff --git a/src/game/shared/tf/tf_wardata.h b/src/game/shared/tf/tf_wardata.h index bd78825d6..b49b684bc 100644 --- a/src/game/shared/tf/tf_wardata.h +++ b/src/game/shared/tf/tf_wardata.h @@ -24,7 +24,7 @@ //--------------------------------------------------------------------------------- // Purpose: The shared object that contains a user's stats for a war 
//--------------------------------------------------------------------------------- -class CWarData : public GCSDK::CProtoBufSharedObject< CSOWarData, k_EEConTypeWarData > +class CWarData final : public GCSDK::CProtoBufSharedObject< CSOWarData, k_EEConTypeWarData > { public: CWarData(); From 499a50bb929e195649e82fb1c0e53ab23ee445d9 Mon Sep 17 00:00:00 2001 From: mastercoms Date: Thu, 9 Mar 2023 22:33:03 -0500 Subject: [PATCH 19/42] perf: disable CWin32ReadOnlyFile to prevent duplicate stat calls * when a file doesn't exist in a search path, then CWin32ReadOnlyFile can't open that file. and then we try yet another open in CStdioFile * technically, we could refactor the open code to be able to communicate an error code, but I didn't see much point to using Win32 specific files * I think there are some cases where not using CWin32ReadOnlyFile can break some demo playback on Windows, so I might either fix that or do the above --- src/filesystem/filesystem_stdio.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/filesystem/filesystem_stdio.cpp b/src/filesystem/filesystem_stdio.cpp index 43916fe33..95a8535ff 100644 --- a/src/filesystem/filesystem_stdio.cpp +++ b/src/filesystem/filesystem_stdio.cpp @@ -240,8 +240,8 @@ ConVar filesystem_unbuffered_io( "filesystem_unbuffered_io", "1", 0, "" ); #define UseUnbufferedIO() true #endif -ConVar filesystem_native( "filesystem_native", "1", 0, "Use native FS or STDIO" ); -ConVar filesystem_max_stdio_read( "filesystem_max_stdio_read", IsX360() ? "64" : "16", 0, "" ); +ConVar filesystem_native( "filesystem_native", "0", 0, "Use native FS or STDIO" ); +ConVar filesystem_max_stdio_read( "filesystem_max_stdio_read", "64", 0, "" ); ConVar filesystem_report_buffered_io( "filesystem_report_buffered_io", "0" ); //----------------------------------------------------------------------------- @@ -417,7 +417,7 @@ FILE *CFileSystem_Stdio::FS_fopen( const char *filenameT, const char *options, u CBaseFileSystem::FixUpPath ( filenameT, filename, sizeof( filename ) ); -#ifdef _WIN32 +#if defined(_WIN32) && 0 if ( CWin32ReadOnlyFile::CanOpen( filename, options ) ) { pFile = CWin32ReadOnlyFile::FS_fopen( filename, options, size ); From 60f231597be343637d21458d5432ddedac65fb00 Mon Sep 17 00:00:00 2001 From: mastercoms Date: Fri, 10 Mar 2023 03:59:52 -0500 Subject: [PATCH 20/42] perf: remove unused r_norefresh functionality --- src/engine/gl_rmain.cpp | 6 ------ src/engine/matsys_interface.cpp | 1 - 2 files changed, 7 deletions(-) diff --git a/src/engine/gl_rmain.cpp b/src/engine/gl_rmain.cpp index 63707e4d5..36a80f6f9 100644 --- a/src/engine/gl_rmain.cpp +++ b/src/engine/gl_rmain.cpp @@ -326,7 +326,6 @@ class CRender : public IRender float m_yFOV; // timing - double m_frameStartTime; float m_framerate; float m_zNear; @@ -377,11 +376,6 @@ void CRender::FrameBegin( void ) r_framecount++; R_AnimateLight (); R_PushDlights(); - - if (!r_norefresh.GetInt()) - { - m_frameStartTime = Sys_FloatTime (); - } } UpdateStudioRenderConfig(); diff --git a/src/engine/matsys_interface.cpp b/src/engine/matsys_interface.cpp index 540bd4497..0b7bfd085 100644 --- a/src/engine/matsys_interface.cpp +++ b/src/engine/matsys_interface.cpp @@ -100,7 +100,6 @@ static CTextureReference g_ResolvedFullFrameDepth; void WorldStaticMeshCreate( void ); void WorldStaticMeshDestroy( void ); -ConVar r_norefresh( "r_norefresh","0"); ConVar r_decals( "r_decals", "2048" ); ConVar mp_decals( "mp_decals","200", FCVAR_ARCHIVE); ConVar r_lightmap( "r_lightmap", "-1", FCVAR_CHEAT | 
FCVAR_MATERIAL_SYSTEM_THREAD ); From 706f5f5945fbf468a6320439964d5988373a612d Mon Sep 17 00:00:00 2001 From: mastercoms Date: Fri, 10 Mar 2023 22:19:30 -0500 Subject: [PATCH 21/42] perf: fps_max adjustments backport fps_max 49 limit from CSGO add a new frame limiter method which sleeps until a certain accuracy threshold then, it will tightly loop while waiting for our yield to reach the end time this ensures that we always arrive at the exact time that the FPS limit expects before, this was sensitive to 2 issues, one being that the pause instruction has different cycle counts on different CPUs and the other being that the frame limit was not being handled efficiently within the busy wait period --- src/engine/sys_engine.cpp | 30 +++++++++++++----------------- src/public/tier0/threadtools.h | 1 + src/tier0/threadtools.cpp | 11 +++++++++++ 3 files changed, 25 insertions(+), 17 deletions(-) diff --git a/src/engine/sys_engine.cpp b/src/engine/sys_engine.cpp index c75abcffb..d80ced99d 100644 --- a/src/engine/sys_engine.cpp +++ b/src/engine/sys_engine.cpp @@ -244,13 +244,13 @@ bool CEngine::FilterTime( float dt ) // Dedicated's tic_rate regulates server frame rate. Don't apply fps filter here. // Only do this restriction on the client. Prevents clients from accomplishing certain // hacks by pausing their client for a period of time. - if ( IsPC() && !sv.IsDedicated() && !CanCheat() && fps_max.GetFloat() < 30 ) + if ( IsPC() && !sv.IsDedicated() && !CanCheat() && fps_max.GetFloat() < 49 ) { // Don't do anything if fps_max=0 (which means it's unlimited). if ( fps_max.GetFloat() != 0.0f ) { - Warning( "sv_cheats is 0 and fps_max is being limited to a minimum of 30 (or set to 0).\n" ); - fps_max.SetValue( 30.0f ); + Warning( "sv_cheats is 0 and fps_max is being limited to a minimum of 49 (or set to 0).\n" ); + fps_max.SetValue( 49.0f ); } } @@ -343,7 +343,9 @@ void CEngine::Frame( void ) { // ThreadSleep may be imprecise. On non-dedicated servers, we busy-sleep // for the last one or two milliseconds to ensure very tight timing. - float fBusyWaitMS = IsWindows() ? 2.25f : 1.5f; + float fBusyWaitMS = IsWindows() ? 2.0f : 1.5f; + float fWaitTime = m_flMinFrameTime - m_flFrameTime; + float fWaitEnd = m_flCurrentTime + fWaitTime; if ( sv.IsDedicated() ) { fBusyWaitMS = host_timer_spin_ms.GetFloat(); @@ -354,23 +356,17 @@ void CEngine::Frame( void ) // to avoid wasting power and to let other threads/processes run. // Calculate how long we need to wait. int nSleepMS = (int)( ( m_flMinFrameTime - m_flFrameTime ) * 1000 - fBusyWaitMS ); - if ( nSleepMS > 0 ) + if ( nSleepMS > fBusyWaitMS ) { ThreadSleep( nSleepMS ); } - else + + while ( Plat_FloatTime() < fWaitEnd ) { - // On x86, busy-wait using PAUSE instruction which encourages - // power savings by idling for ~10 cycles (also yielding to - // the other logical hyperthread core if the CPU supports it) - for (int i = 2000; i >= 0; --i) - { -#if defined(POSIX) - __asm( "pause" ); __asm( "pause" ); __asm( "pause" ); __asm( "pause" ); -#elif defined(IS_WINDOWS_PC) - _asm { pause }; _asm { pause }; _asm { pause }; _asm { pause }; -#endif - } + ThreadPause(); + // Yield the CPU to other threads so we don't spin too tightly + // ThreadSleep(0) is not tight enough. + ThreadYield(); } // Go back to the top of the loop and see if it is time yet. 
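to make the sleep-then-yield-spin idea concrete outside the engine, here is a minimal standalone sketch using std::chrono/std::thread instead of the engine's Plat_FloatTime, ThreadSleep and ThreadYield wrappers (a sketch under those assumptions, not the engine code):

    #include <chrono>
    #include <thread>

    // sketch only: sleep for most of the frame budget, then yield-spin to land on the exact end time
    void LimitFrame( double flFrameSeconds, double flBusyWaitSeconds /* ~0.002 on Windows */ )
    {
        using namespace std::chrono;
        const auto tEnd = steady_clock::now() + duration<double>( flFrameSeconds );

        // coarse sleep first, leaving flBusyWaitSeconds of headroom because OS sleeps are imprecise
        const auto tCoarse = tEnd - duration<double>( flBusyWaitSeconds );
        if ( steady_clock::now() < tCoarse )
            std::this_thread::sleep_until( tCoarse );

        // finish by yielding in a tight loop: tighter than sleeping again, friendlier than a pure PAUSE spin
        while ( steady_clock::now() < tEnd )
            std::this_thread::yield();
    }

the headroom value plays the same role as fBusyWaitMS above: everything before it is handed to the OS scheduler, everything after it is spent re-checking the clock so the frame ends exactly when the FPS limit expects.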
diff --git a/src/public/tier0/threadtools.h b/src/public/tier0/threadtools.h index b0b9b1d27..f1adca1b2 100644 --- a/src/public/tier0/threadtools.h +++ b/src/public/tier0/threadtools.h @@ -118,6 +118,7 @@ PLATFORM_INTERFACE bool ReleaseThreadHandle( ThreadHandle_t ); //----------------------------------------------------------------------------- PLATFORM_INTERFACE void ThreadSleep(unsigned duration = 0); +PLATFORM_INTERFACE void ThreadYield(); PLATFORM_INTERFACE uint ThreadGetCurrentId(); PLATFORM_INTERFACE ThreadHandle_t ThreadGetCurrentHandle(); PLATFORM_INTERFACE int ThreadGetPriority( ThreadHandle_t hThread = NULL ); diff --git a/src/tier0/threadtools.cpp b/src/tier0/threadtools.cpp index 001e703a8..617d3cf8d 100644 --- a/src/tier0/threadtools.cpp +++ b/src/tier0/threadtools.cpp @@ -214,6 +214,17 @@ void ThreadSleep(unsigned nMilliseconds) //----------------------------------------------------------------------------- +void ThreadYield() +{ +#ifdef _WIN32 + SwitchToThread(); +#elif defined(POSIX) + sched_yield(); +#endif +} + +//----------------------------------------------------------------------------- + #ifndef ThreadGetCurrentId uint ThreadGetCurrentId() { From f193968474d0f0ade9dbcd1cb22e7fa8eec564c5 Mon Sep 17 00:00:00 2001 From: mastercoms Date: Fri, 10 Mar 2023 22:26:00 -0500 Subject: [PATCH 22/42] perf: DX9Ex frame latency improvements prioritize GPU thread and use DX9 driver to force frame sync if available --- .../shaderapidx9/shaderdevicedx8.cpp | 26 ++++++++++++++++++- .../shaderapidx9/shaderdevicedx8.h | 7 +++++ 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/src/materialsystem/shaderapidx9/shaderdevicedx8.cpp b/src/materialsystem/shaderapidx9/shaderdevicedx8.cpp index 210c643bc..2789158b5 100644 --- a/src/materialsystem/shaderapidx9/shaderdevicedx8.cpp +++ b/src/materialsystem/shaderapidx9/shaderdevicedx8.cpp @@ -2401,6 +2401,15 @@ bool CShaderDeviceDx8::CreateD3DDevice( void* pHWnd, int nAdapter, const ShaderD g_pHardwareConfig->SetupHardwareCaps( info, g_ShaderDeviceMgrDx8.GetHardwareCaps( nAdapter ) ); +#if defined(IS_WINDOWS_PC) && defined(SHADERAPIDX9) + if ( g_ShaderDeviceUsingD3D9Ex ) + { + Dx9ExDevice()->SetMaximumFrameLatency(2); + static ConVarRef mat_forcehardwaresync("mat_forcehardwaresync"); + mat_forcehardwaresync.SetValue(0); + } +#endif + // FIXME: Bake this into hardware config // What texture formats do we support? 
if ( D3DSupportsCompressedTextures() ) @@ -3371,20 +3380,35 @@ void CShaderDeviceDx8::Present() // if we're in queued mode, don't present if the device is already lost bool bValidPresent = true; bool bInMainThread = ThreadInMainThread(); - if ( !bInMainThread ) + static bool s_bSetPriority = true; + if ( bInMainThread ) + { + s_bSetPriority = true; + } + else { // don't present if the device is in an invalid state and in queued mode if ( m_DeviceState != DEVICE_STATE_OK ) { + s_bSetPriority = true; bValidPresent = false; } // check for lost device early in threaded mode CheckDeviceLost( m_bOtherAppInitializing ); if ( m_DeviceState != DEVICE_STATE_OK ) { + s_bSetPriority = true; bValidPresent = false; } } +#if defined(IS_WINDOWS_PC) && defined(SHADERAPIDX9) + if ( bValidPresent && s_bSetPriority && g_ShaderDeviceUsingD3D9Ex ) + { + s_bSetPriority = false; + Dx9ExDevice()->SetGPUThreadPriority(7); + Dx9ExDevice()->SetMaximumFrameLatency(2); + } +#endif // Copy the back buffer into the non-interactive temp buffer if ( m_NonInteractiveRefresh.m_Mode == MATERIAL_NON_INTERACTIVE_MODE_LEVEL_LOAD ) { diff --git a/src/materialsystem/shaderapidx9/shaderdevicedx8.h b/src/materialsystem/shaderapidx9/shaderdevicedx8.h index 4e20efea3..3dbbdc949 100644 --- a/src/materialsystem/shaderapidx9/shaderdevicedx8.h +++ b/src/materialsystem/shaderapidx9/shaderdevicedx8.h @@ -361,6 +361,13 @@ FORCEINLINE IDirect3DDevice9 *Dx9Device() return g_pD3DDevice; } +#if defined(IS_WINDOWS_PC) && defined(SHADERAPIDX9) +FORCEINLINE IDirect3DDevice9Ex* Dx9ExDevice() +{ + return static_cast( g_pD3DDevice ); +} +#endif + extern CShaderDeviceDx8* g_pShaderDeviceDx8; From f95dbe44233a0f4380e240582391ec451f7e5340 Mon Sep 17 00:00:00 2001 From: mastercoms Date: Mon, 6 Mar 2023 12:05:17 -0500 Subject: [PATCH 23/42] fix: MvM bomb carrier voice line playing during normal CTF with bots ref: https://github.com/ValveSoftware/Source-1-Games/issues/715 --- .../behavior/scenario/capture_the_flag/tf_bot_deliver_flag.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/game/server/tf/bot/behavior/scenario/capture_the_flag/tf_bot_deliver_flag.cpp b/src/game/server/tf/bot/behavior/scenario/capture_the_flag/tf_bot_deliver_flag.cpp index 873c36ed3..166dbc305 100644 --- a/src/game/server/tf/bot/behavior/scenario/capture_the_flag/tf_bot_deliver_flag.cpp +++ b/src/game/server/tf/bot/behavior/scenario/capture_the_flag/tf_bot_deliver_flag.cpp @@ -243,7 +243,7 @@ ActionResult< CTFBot > CTFBotDeliverFlag::Update( CTFBot *me, float interval ) m_flTotalTravelDistance = NavAreaTravelDistance( me->GetLastKnownArea(), TheNavMesh->GetNavArea( zone->WorldSpaceCenter() ), cost ); - if ( flOldTravelDistance != -1.0f && m_flTotalTravelDistance - flOldTravelDistance > 2000.0f ) + if ( TFGameRules()->IsMannVsMachineMode() && flOldTravelDistance != -1.0f && m_flTotalTravelDistance - flOldTravelDistance > 2000.0f ) { TFGameRules()->BroadcastSound( 255, "Announcer.MVM_Bomb_Reset" ); From c3a06853cb32be89ffcbd330f25830cdc108c1b2 Mon Sep 17 00:00:00 2001 From: mastercoms Date: Mon, 25 Jul 2022 14:00:26 -0400 Subject: [PATCH 24/42] fix: tips being changed multiple times during map load when stats are loaded, tips get updated, cycling to another tip this causes an unwanted tip cycle where the first tip could be shown for a very short time while the player is reading it and moving to a new random one seemingly arbitrarily this keeps the same tip on the loading screen, as intended --- src/game/client/tf/vgui/tf_statsummary.cpp | 11 +++++++---- 
src/game/client/tf/vgui/tf_statsummary.h | 2 +- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/game/client/tf/vgui/tf_statsummary.cpp b/src/game/client/tf/vgui/tf_statsummary.cpp index 8e968291f..df18e9a0a 100644 --- a/src/game/client/tf/vgui/tf_statsummary.cpp +++ b/src/game/client/tf/vgui/tf_statsummary.cpp @@ -483,7 +483,7 @@ void CTFStatsSummaryPanel::SetStats( CUtlVector &vecClassStats ) m_aClassStats = vecClassStats; if ( m_bControlsLoaded ) { - UpdateDialog(); + UpdateDialog(false); } } @@ -846,7 +846,7 @@ void CTFStatsSummaryPanel::UpdateLeaderboard() //----------------------------------------------------------------------------- // Purpose: Updates the dialog //----------------------------------------------------------------------------- -void CTFStatsSummaryPanel::UpdateDialog() +void CTFStatsSummaryPanel::UpdateDialog(bool bUpdateTip) { UpdateMainBackground(); @@ -921,8 +921,11 @@ void CTFStatsSummaryPanel::UpdateDialog() UpdateBarCharts(); // fill out class details UpdateClassDetails(); - // update the tip - UpdateTip(); + if (bUpdateTip) + { + // update the tip + UpdateTip(); + } // show or hide controls depending on if we're interactive or not UpdateControls(); } diff --git a/src/game/client/tf/vgui/tf_statsummary.h b/src/game/client/tf/vgui/tf_statsummary.h index eca7c723f..ffbc205c1 100644 --- a/src/game/client/tf/vgui/tf_statsummary.h +++ b/src/game/client/tf/vgui/tf_statsummary.h @@ -64,7 +64,7 @@ class CTFStatsSummaryPanel : public vgui::EditablePanel, public CGameEventListen void Reset(); void SetDefaultSelections(); - void UpdateDialog(); + void UpdateDialog(bool bUpdateTip = true); void UpdateBarCharts(); void UpdateClassDetails( bool bIsMVM = false ); void UpdateTip(); From 79f025aebe85e6a684defa7d283eb0669e73d3a8 Mon Sep 17 00:00:00 2001 From: mastercoms Date: Fri, 10 Mar 2023 22:09:04 -0500 Subject: [PATCH 25/42] fix: update networking setting defaults * bump up updaterate and cmdrate to be equal and more closely aligned to the tick interval. this is a slight increase so it should not adversely affect players in terms of bandwidth and processing time. the fact that the rates are equal means that we aren't giving an imbalance of data for the player to process and repredict on. * align cl_interp to 15ms tick interval. we aren't reducing it at the moment because too many systems rely on interpolation to smooth out some evaluation. if these are fixed we can probably drop it down to the safe value of 0.03. I also limited the value to 0.2 instead of 0.5 since players were exploiting this to desync/backtrack. * set sv_maxunlag to 0.5. since cl_interp's max value was reduced, we can set the unlag window to be shorter. CSGO sets this to 0.2 but that is much too aggressive, considering interp. 0.7 would be a value to decrease by for just the interp limit, but I have decreased by a further 0.2s ping to reduce the unlag window for laggy players and cheaters who are using this mechanic to backtrack. * raise bandwidth from 80000 to 131072. this is the maximum value that seems to work well for most users across a variety of connections and routers, possibly due to some buffers or misconfigurations. 196608 was tested over from CSGO but this had issues for some connections. this increase of rate ensures there are no delays with larger net transfers that usually occur in certain situations, such as when a player is teleported, spawned or switches spectator targets. 
* enforce a minrate of the lowest rate selectable in CSGO * enforce a max rate of the highest rate selectable in CSGO anything higher seems to cause problems with networking * net_maxpacketdrop is set to 0. I'm not clued in why this was added in the first place, but effectively what this does is force you to drop even more packets whenever you drop packets from lossy internet. --- src/engine/baseserver.cpp | 2 +- src/engine/cl_bounded_cvars.cpp | 4 ++-- src/engine/net.h | 4 ++-- src/engine/net_chan.cpp | 2 +- src/engine/sv_client.cpp | 2 +- src/game/client/cdll_bounded_cvars.cpp | 4 ++-- src/game/server/player.cpp | 4 ++-- src/game/server/player_lagcompensation.cpp | 2 +- 8 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/engine/baseserver.cpp b/src/engine/baseserver.cpp index 17145a187..7822b10c5 100644 --- a/src/engine/baseserver.cpp +++ b/src/engine/baseserver.cpp @@ -2086,7 +2086,7 @@ CBaseClient *CBaseServer::CreateFakeClient( const char *name ) fakeclient->SetUserCVar( "rate", "30000" ); fakeclient->SetUserCVar( "cl_updaterate", "20" ); fakeclient->SetUserCVar( "cl_interp_ratio", "1.0" ); - fakeclient->SetUserCVar( "cl_interp", "0.1" ); + fakeclient->SetUserCVar( "cl_interp", "0.105" ); fakeclient->SetUserCVar( "cl_interpolate", "0" ); fakeclient->SetUserCVar( "cl_predict", "1" ); fakeclient->SetUserCVar( "cl_predictweapons", "1" ); diff --git a/src/engine/cl_bounded_cvars.cpp b/src/engine/cl_bounded_cvars.cpp index 4ce0ddd74..e11401fc6 100644 --- a/src/engine/cl_bounded_cvars.cpp +++ b/src/engine/cl_bounded_cvars.cpp @@ -72,7 +72,7 @@ class CBoundedCvar_CmdRate : public ConVar_ServerBounded CBoundedCvar_CmdRate() : ConVar_ServerBounded( "cl_cmdrate", - "30", + "33", FCVAR_ARCHIVE | FCVAR_USERINFO, "Max number of command packets sent to server per second", true, MIN_CMD_RATE, true, MAX_CMD_RATE ) { @@ -119,7 +119,7 @@ class CBoundedCvar_UpdateRate : public ConVar_ServerBounded CBoundedCvar_UpdateRate() : ConVar_ServerBounded( "cl_updaterate", - "20", + "33", FCVAR_ARCHIVE | FCVAR_USERINFO | FCVAR_NOT_CONNECTED, "Number of packets per second of updates you are requesting from the server" ) { diff --git a/src/engine/net.h b/src/engine/net.h index 3c8762a11..1abd23493 100644 --- a/src/engine/net.h +++ b/src/engine/net.h @@ -19,9 +19,9 @@ #include "proto_version.h" // Flow control bytes per second limits -#define MAX_RATE (1024*1024) +#define MAX_RATE 786432 #define MIN_RATE 1000 -#define DEFAULT_RATE 80000 +#define DEFAULT_RATE 131072 #define SIGNON_TIME_OUT 300.0f // signon disconnect timeout diff --git a/src/engine/net_chan.cpp b/src/engine/net_chan.cpp index 7b033739f..782d1c4db 100644 --- a/src/engine/net_chan.cpp +++ b/src/engine/net_chan.cpp @@ -44,7 +44,7 @@ static ConVar net_maxfilesize( "net_maxfilesize", "16", 0, "Maximum allowed file static ConVar net_compresspackets( "net_compresspackets", "1", 0, "Use compression on game packets." ); static ConVar net_compresspackets_minsize( "net_compresspackets_minsize", "1024", 0, "Don't bother compressing packets below this size." ); static ConVar net_maxcleartime( "net_maxcleartime", "4.0", 0, "Max # of seconds we can wait for next packets to be sent based on rate setting (0 == no limit)." 
); -static ConVar net_maxpacketdrop( "net_maxpacketdrop", "5000", 0, "Ignore any packets with the sequence number more than this ahead (0 == no limit)" ); +static ConVar net_maxpacketdrop( "net_maxpacketdrop", "0", 0, "Ignore any packets with the sequence number more than this ahead (0 == no limit)" ); extern ConVar net_maxroutable; diff --git a/src/engine/sv_client.cpp b/src/engine/sv_client.cpp index 6279a2672..3186e652a 100644 --- a/src/engine/sv_client.cpp +++ b/src/engine/sv_client.cpp @@ -39,7 +39,7 @@ extern CNetworkStringTableContainer *networkStringTableContainerServer; static ConVar sv_timeout( "sv_timeout", "65", 0, "After this many seconds without a message from a client, the client is dropped" ); static ConVar sv_maxrate( "sv_maxrate", "0", FCVAR_REPLICATED, "Max bandwidth rate allowed on server, 0 == unlimited" ); -static ConVar sv_minrate( "sv_minrate", "3500", FCVAR_REPLICATED, "Min bandwidth rate allowed on server, 0 == unlimited" ); +static ConVar sv_minrate( "sv_minrate", "16000", FCVAR_REPLICATED, "Min bandwidth rate allowed on server, 0 == unlimited" ); ConVar sv_maxupdaterate( "sv_maxupdaterate", "66", FCVAR_REPLICATED, "Maximum updates per second that the server will allow" ); ConVar sv_minupdaterate( "sv_minupdaterate", "10", FCVAR_REPLICATED, "Minimum updates per second that the server will allow" ); diff --git a/src/game/client/cdll_bounded_cvars.cpp b/src/game/client/cdll_bounded_cvars.cpp index d1114a381..83b2e9d4e 100644 --- a/src/game/client/cdll_bounded_cvars.cpp +++ b/src/game/client/cdll_bounded_cvars.cpp @@ -99,9 +99,9 @@ class CBoundedCvar_Interp : public ConVar_ServerBounded public: CBoundedCvar_Interp() : ConVar_ServerBounded( "cl_interp", - "0.1", + "0.105", FCVAR_USERINFO | FCVAR_NOT_CONNECTED | FCVAR_ARCHIVE, - "Sets the interpolation amount (bounded on low side by server interp ratio settings).", true, 0.0f, true, 0.5f ) + "Sets the interpolation amount (bounded on low side by server interp ratio settings).", true, 0.0f, true, 0.2f ) { } diff --git a/src/game/server/player.cpp b/src/game/server/player.cpp index 7dea1f507..3d09586ae 100644 --- a/src/game/server/player.cpp +++ b/src/game/server/player.cpp @@ -593,8 +593,8 @@ CBasePlayer::CBasePlayer( ) m_hZoomOwner = NULL; - m_nUpdateRate = 20; // cl_updaterate defualt - m_fLerpTime = 0.1f; // cl_interp default + m_nUpdateRate = 33; // cl_updaterate defualt + m_fLerpTime = 0.105f; // cl_interp default m_bPredictWeapons = true; m_bLagCompensation = false; m_flLaggedMovementValue = 1.0f; diff --git a/src/game/server/player_lagcompensation.cpp b/src/game/server/player_lagcompensation.cpp index 37f322e59..a9190253a 100644 --- a/src/game/server/player_lagcompensation.cpp +++ b/src/game/server/player_lagcompensation.cpp @@ -31,7 +31,7 @@ static ConVar sv_lagcompensation_teleport_dist( "sv_lagcompensation_teleport_dis #define LAG_COMPENSATION_ERROR_EPS_SQR ( 4.0f * 4.0f ) ConVar sv_unlag( "sv_unlag", "1", FCVAR_DEVELOPMENTONLY, "Enables player lag compensation" ); -ConVar sv_maxunlag( "sv_maxunlag", "1.0", FCVAR_DEVELOPMENTONLY, "Maximum lag compensation in seconds", true, 0.0f, true, 1.0f ); +ConVar sv_maxunlag( "sv_maxunlag", "0.5", FCVAR_DEVELOPMENTONLY, "Maximum lag compensation in seconds", true, 0.0f, true, 1.0f ); ConVar sv_lagflushbonecache( "sv_lagflushbonecache", "1", FCVAR_DEVELOPMENTONLY, "Flushes entity bone cache on lag compensation" ); ConVar sv_showlagcompensation( "sv_showlagcompensation", "0", FCVAR_CHEAT, "Show lag compensated hitboxes whenever a player is lag compensated." 
); From 4d29cb2dcb887cd7e269bd2165aa13ef39fa44b5 Mon Sep 17 00:00:00 2001 From: mastercoms Date: Sun, 5 Mar 2023 11:58:13 -0500 Subject: [PATCH 26/42] gameplay: fix class limit bypass from selecting class to spawn after death this adds a check to ForceRespawn to make sure that the player indeed can choose this class the bypass happens because while you are still dead, you do not count against the class limit if you select a new class to spawn as so, two or more players can select the same class while dead, putting the team over the limit Ref: https://github.com/ValveSoftware/Source-1-Games/issues/2084 --- src/game/server/tf/tf_player.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/game/server/tf/tf_player.cpp b/src/game/server/tf/tf_player.cpp index 332d50259..f9a36e488 100644 --- a/src/game/server/tf/tf_player.cpp +++ b/src/game/server/tf/tf_player.cpp @@ -13893,6 +13893,14 @@ void CTFPlayer::ForceRespawn( void ) DropFlag(); } + // Prevent bypassing class limits. Whoever wins on the draw can spawn as this class, + // and anyone who comes after will get swapped back to their old class. + if (!TFGameRules()->CanPlayerChooseClass(this, iDesiredClass)) + { + iDesiredClass = GetPlayerClass()->GetClassIndex(); + ClientPrint( this, HUD_PRINTCENTER, "#TF_ClassLimitReached" ); // NOTE: Add localization string + } + if ( GetPlayerClass()->GetClassIndex() != iDesiredClass ) { // clean up any pipebombs/buildings in the world (no explosions) From cc96cc9833b65a9c481d85b5791c466439f0ba69 Mon Sep 17 00:00:00 2001 From: mastercoms Date: Mon, 25 Jul 2022 13:55:06 -0400 Subject: [PATCH 27/42] gameplay: fix switch from bonuses not considering deploy time bonuses switch from bonus is only applied if the weapon we are switching from has fully deployed. however, the calculation for "fully deployed" was incorrect. it used the default switch time of 0.5, instead of also taking into account weapon attributes which affect deploy time. this affected the degreaser most notably, which would not allow pyros to switch to their flare gun with increased switch speed as soon as the degreaser deployed (0.33s). 
instead, players would have to wait until 0.5s had passed before switching which is unintuitive ref: https://github.com/ValveSoftware/Source-1-Games/issues/3488 --- src/game/shared/tf/tf_weaponbase.cpp | 33 +++++++++++++++++----------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/src/game/shared/tf/tf_weaponbase.cpp b/src/game/shared/tf/tf_weaponbase.cpp index 7ebd9b20a..93f97622b 100644 --- a/src/game/shared/tf/tf_weaponbase.cpp +++ b/src/game/shared/tf/tf_weaponbase.cpp @@ -1056,18 +1056,7 @@ bool CTFWeaponBase::Deploy( void ) CALL_ATTRIB_HOOK_FLOAT_ON_OTHER( pPlayer, flDeployTimeMultiplier, mult_deploy_time ); CALL_ATTRIB_HOOK_FLOAT( flDeployTimeMultiplier, mult_single_wep_deploy_time ); - // don't apply mult_switch_from_wep_deploy_time attribute if the last weapon hasn't been deployed for more than 0.67 second to match to weapon script switch time - // unless the player latched to a hook target, then allow switching right away - CTFWeaponBase *pLastWeapon = dynamic_cast< CTFWeaponBase* >( pPlayer->GetLastWeapon() ); - if ( pPlayer->GetGrapplingHookTarget() != NULL || ( pLastWeapon && gpGlobals->curtime - pLastWeapon->m_flLastDeployTime > flWeaponSwitchTime ) ) - { - CALL_ATTRIB_HOOK_FLOAT_ON_OTHER( pLastWeapon, flDeployTimeMultiplier, mult_switch_from_wep_deploy_time ); - } - - if ( pPlayer->m_Shared.InCond( TF_COND_BLASTJUMPING ) ) - { - CALL_ATTRIB_HOOK_FLOAT( flDeployTimeMultiplier, mult_rocketjump_deploy_time ); - } + CTFWeaponBase* pLastWeapon = static_cast(pPlayer->GetLastWeapon()); int iIsSword = 0; CALL_ATTRIB_HOOK_INT_ON_OTHER( pLastWeapon, iIsSword, is_a_sword ); @@ -1078,6 +1067,11 @@ bool CTFWeaponBase::Deploy( void ) flDeployTimeMultiplier *= 1.75f; } + if ( pPlayer->m_Shared.InCond( TF_COND_BLASTJUMPING ) ) + { + CALL_ATTRIB_HOOK_FLOAT( flDeployTimeMultiplier, mult_rocketjump_deploy_time ); + } + #ifdef STAGING_ONLY if ( pPlayer->m_Shared.InCond( TF_COND_TRANQ_SPY_BOOST ) ) { @@ -1096,6 +1090,18 @@ bool CTFWeaponBase::Deploy( void ) { CALL_ATTRIB_HOOK_FLOAT_ON_OTHER( pPlayer, flDeployTimeMultiplier, mod_medic_healed_deploy_time ); } + + // Don't consider mult_switch_from_wep_deploy_time for base deploy time + // This avoid feedback loops and gets switch from bonuses in line with the default switch time + // This calculation replicates the flDeployTime calculation below + float flBaseDeployTime = flWeaponSwitchTime * MAX( flDeployTimeMultiplier, 0.00001f ); + + // don't apply mult_switch_from_wep_deploy_time attribute if the last weapon hasn't past its base deploy time + // unless the player latched to a hook target, then allow switching right away + if ( pPlayer->GetGrapplingHookTarget() != NULL || ( pLastWeapon && gpGlobals->curtime >= pLastWeapon->m_flLastDeployTime ) ) + { + CALL_ATTRIB_HOOK_FLOAT_ON_OTHER( pLastWeapon, flDeployTimeMultiplier, mult_switch_from_wep_deploy_time ); + } flDeployTimeMultiplier = MAX( flDeployTimeMultiplier, 0.00001f ); float flDeployTime = flWeaponSwitchTime * flDeployTimeMultiplier; @@ -1116,7 +1122,8 @@ bool CTFWeaponBase::Deploy( void ) pPlayer->SetNextAttack( m_flNextPrimaryAttack ); - m_flLastDeployTime = gpGlobals->curtime; + // Last deploy time now refers to the time we actually fully deployed, not the time we switched to the weapon + m_flLastDeployTime = gpGlobals->curtime + flBaseDeployTime; #ifdef GAME_DLL // Reset our deploy-lifetime kill counter. 
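to put rough numbers on the new gate, a tiny standalone sketch; the 0.5s default switch time and the 0.33s degreaser deploy time are the figures quoted in this message, and the multiplier is simply derived from them for illustration:

    #include <cstdio>

    // standalone sketch, not engine code: figures taken from the commit message above,
    // the multiplier is inferred from them purely for illustration
    int main()
    {
        const float flWeaponSwitchTime     = 0.5f;          // default deploy window
        const float flDeployTimeMultiplier = 0.33f / 0.5f;  // ~0.66, inferred for illustration only

        const float flBaseDeployTime = flWeaponSwitchTime * flDeployTimeMultiplier; // ~0.33s

        // old gate: the switch-from bonus only unlocked a fixed flWeaponSwitchTime after deploying
        // new gate: it unlocks once the weapon's own attribute-scaled deploy time has elapsed
        printf( "old gate: %.2fs after deploy, new gate: %.2fs after deploy\n",
                flWeaponSwitchTime, flBaseDeployTime );
        return 0;
    }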
From 3403817511d2da3b72b50562446b54e42a470b57 Mon Sep 17 00:00:00 2001
From: mastercoms
Date: Mon, 25 Jul 2022 14:14:23 -0400
Subject: [PATCH 28/42] gameplay: movement speed modifiers were incorrect

stunned movement was not applying correctly to diagonal movement. because diagonal movement is a combination of two axes, the combined input speed exceeds max speed until clamped by CheckParameters. stunned movement slows were applying before this clamp, so they did not properly slow the actual speed of the player, which could be much lower than the unclamped speed.

in addition, high max speed boosts (like the Baby Face's Blaster) did not apply unless the input speed was default speed. since stunned movement manipulates input speed directly instead of changing player velocity, this broke the input speed boost, and thus scout's max speed would be capped higher but not have the input speed to drive it.

with these changes, a 0.6 slow (as applied by the Natascha at close range) slows down a full charge BFB Scout from 520 to 208 in all directions (40% of 520), as intended and proper! And of course, normal Scout speed is slowed from 400 to 160 in all directions (40% of 400).

however, with this fix, the slow amount is drastically more effective than it was before. so, we apply legacy handling to reduce the slow amount by a percentage value relative to the old effective scaling. a scaling value was chosen to get a near average between the effective slow applied with a 100% slow, and the full slow amount applied with the fix.

note that there is still technically an issue with strafe tapping boosting your movement speed, but this is a game movement mechanic which I don't think is related to stuns; stuns seem to be intended to apply to movement controls, not exactly correlating to output velocity, or else they would not be a different movement mechanic vs. percentage movement speed slows.
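for illustration, the ordering problem above boils down to where the slow is applied relative to the max speed clamp; this simplified standalone sketch (made-up variables, not the engine's movement code) reproduces the 400 -> 160 figure quoted in this message:

    #include <algorithm>
    #include <cstdio>

    // standalone sketch with simplified stand-in variables, not the engine's CTFGameMovement code
    int main()
    {
        const float flMaxSpeed   = 400.0f;   // Scout ground speed
        const float flInputSpeed = 565.7f;   // ~400 * sqrt(2): diagonal input before clamping
        const float flStun       = 0.6f;     // Natascha close-range slow quoted above

        // old order: slow the raw input, then clamp -> the clamp swallows part of the slow
        const float flOld = std::min( flInputSpeed * ( 1.0f - flStun ), flMaxSpeed ); // ~226 u/s in this simplified model
        // new order: clamp to the real max speed first, then slow -> the full slow always lands
        const float flNew = std::min( flInputSpeed, flMaxSpeed ) * ( 1.0f - flStun ); // 160 u/s, matching the numbers above

        printf( "old: %.0f u/s, new: %.0f u/s\n", flOld, flNew );
        return 0;
    }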
ref: https://github.com/ValveSoftware/Source-1-Games/issues/3721 --- src/game/server/tf/tf_player.cpp | 11 +++ src/game/shared/gamemovement.cpp | 3 + src/game/shared/tf/tf_gamemovement.cpp | 92 ++++++++++++++++++++++---- 3 files changed, 93 insertions(+), 13 deletions(-) diff --git a/src/game/server/tf/tf_player.cpp b/src/game/server/tf/tf_player.cpp index f9a36e488..0c4b83ec4 100644 --- a/src/game/server/tf/tf_player.cpp +++ b/src/game/server/tf/tf_player.cpp @@ -21623,6 +21623,17 @@ bool CTFPlayer::CanBreatheUnderwater() const return false; } +//----------------------------------------------------------------------------- +// Purpose: Debug concommand to stun the player +//----------------------------------------------------------------------------- +void StunPlayer() +{ + CTFPlayer* pPlayer = ToTFPlayer(ToTFPlayer(UTIL_PlayerByIndex(1))); + float flStunAmount = 0.60f; + pPlayer->m_Shared.StunPlayer(10.0f, flStunAmount, TF_STUN_MOVEMENT, pPlayer); +} +static ConCommand cc_StunPlayer("tf_stun_player", StunPlayer, "Stuns you.", FCVAR_CHEAT); + //----------------------------------------------------------------------------- // Purpose: //----------------------------------------------------------------------------- diff --git a/src/game/shared/gamemovement.cpp b/src/game/shared/gamemovement.cpp index 0efcfa80c..ab6080dff 100644 --- a/src/game/shared/gamemovement.cpp +++ b/src/game/shared/gamemovement.cpp @@ -4550,7 +4550,10 @@ void CGameMovement::PlayerMove( void ) { VPROF( "CGameMovement::PlayerMove" ); + // TF runs this with speed modifiers +#if !defined(TF_DLL) && !defined(TF_CLIENT_DLL) CheckParameters(); +#endif // clear output applied velocity mv->m_outWishVel.Init(); diff --git a/src/game/shared/tf/tf_gamemovement.cpp b/src/game/shared/tf/tf_gamemovement.cpp index 2e0f9c86c..d3d984f2e 100644 --- a/src/game/shared/tf/tf_gamemovement.cpp +++ b/src/game/shared/tf/tf_gamemovement.cpp @@ -39,7 +39,7 @@ ConVar tf_duck_debug_spew( "tf_duck_debug_spew", "0", FCVAR_REPLICATED | FCVAR_DEVELOPMENTONLY ); -ConVar tf_showspeed( "tf_showspeed", "0", FCVAR_REPLICATED | FCVAR_DEVELOPMENTONLY ); +ConVar tf_showspeed( "tf_showspeed", "0", FCVAR_REPLICATED | FCVAR_CHEAT, "1 = show speed during collisions, 2 = always show speed" ); ConVar tf_avoidteammates( "tf_avoidteammates", "1", FCVAR_REPLICATED | FCVAR_CHEAT | FCVAR_DEVELOPMENTONLY, "Controls how teammates interact when colliding.\n 0: Teammates block each other\n 1: Teammates pass through each other, but push each other away (default)" ); ConVar tf_avoidteammates_pushaway( "tf_avoidteammates_pushaway", "1", FCVAR_REPLICATED, "Whether or not teammates push each other away when occupying the same space" ); ConVar tf_solidobjects( "tf_solidobjects", "1", FCVAR_REPLICATED | FCVAR_CHEAT | FCVAR_DEVELOPMENTONLY ); @@ -317,6 +317,12 @@ void CTFGameMovement::ProcessMovement( CBasePlayer *pBasePlayer, CMoveData *pMov // Handle charging demomens ChargeMove(); + // Handle scouts that can move really fast with buffs + HighMaxSpeedMove(); + + // Limit diagonal movement + CheckParameters(); + // Handle player stun. StunMove(); @@ -326,9 +332,6 @@ void CTFGameMovement::ProcessMovement( CBasePlayer *pBasePlayer, CMoveData *pMov // Handle grappling hook move GrapplingHookMove(); - // Handle scouts that can move really fast with buffs - HighMaxSpeedMove(); - // Run the command. 
PlayerMove(); @@ -434,17 +437,19 @@ bool CTFGameMovement::GrapplingHookMove() if ( tf_grapplinghook_use_acceleration.GetBool() ) { // Use acceleration with dampening - float flSpeed = mv->m_vecVelocity.Length(); + float flSpeed = mv->m_vecVelocity.LengthSqr(); if ( flSpeed > 0.f ) { + flSpeed = FastSqrt( flSpeed ); float flDampen = Min( tf_grapplinghook_dampening.GetFloat() * gpGlobals->frametime, flSpeed ); mv->m_vecVelocity *= ( flSpeed - flDampen ) / flSpeed; } mv->m_vecVelocity += vDesiredMove.Normalized() * ( tf_grapplinghook_acceleration.GetFloat() * gpGlobals->frametime ); - flSpeed = mv->m_vecVelocity.Length(); + flSpeed = mv->m_vecVelocity.LengthSqr(); if ( flSpeed > mv->m_flMaxSpeed ) { + flSpeed = FastSqrt( flSpeed ); mv->m_vecVelocity *= mv->m_flMaxSpeed / flSpeed; } } @@ -530,6 +535,13 @@ bool CTFGameMovement::ChargeMove() return true; } +#ifdef STAGING_ONLY +static ConVar tf_movement_stun_multiplier("tf_movement_stun_multiplier", "1", FCVAR_REPLICATED, "Multiplier for movement speed when stunned."); +static ConVar tf_movement_stun_clip("tf_movement_stun_clip", "0.41421356237", FCVAR_REPLICATED, "Clip off stun amount."); +#endif +static ConVar tf_movement_stun_legacy_threshold("tf_movement_stun_legacy_threshold", "1.5", FCVAR_REPLICATED, "Relative point for legacy stun amount handling."); +static ConVar tf_movement_stun_legacy_on_charge("tf_movement_stun_legacy_on_charge", "1", FCVAR_REPLICATED, "Always apply full stun to charging players."); + //----------------------------------------------------------------------------- // Purpose: //----------------------------------------------------------------------------- @@ -565,6 +577,42 @@ bool CTFGameMovement::StunMove() // Handle movement stuns float flStunAmount = m_pTFPlayer->m_Shared.GetAmountStunned( TF_STUN_MOVEMENT ); + if ( flStunAmount ) + { + // Handle legacy clipping value. Before the fix to stunned movement, stuns were applied to diagonal movement of sqrt(2) magnitude. + // This means effectively that the stun would have to reduce past the ~141.4% movement speed to have any effect. + // + // So, a stun amount would at minimum need to be greater than ~0.414 to reduce movement speed below 100%. + // Since this effectively meant stuns were clipped, there was non-linear scaling of stun amount to actual movement speed reduction. + // + // A stun value of 0.414 (or below) would have 0% effective stun, 0.6 would have ~31% effective stun, 1.0 would have ~59% effective stun. + // + // This legacy handling has been added so that we get similar slow amounts for stuns that were previously applied, but + // also have them be linear and consistent from 0 to 1. + if ( tf_movement_stun_legacy_on_charge.GetBool() && m_pTFPlayer->m_Shared.InCond( TF_COND_SHIELD_CHARGE ) ) + { + // Slow down charging players the full amount. Charging players never had diagonal movement, so + // they always got the full slow amount, which would end their charge. Being able to end + // their charge is incredibly important, so we don't want to change that. + flStunAmount = flStunAmount; + } + else if ( flStunAmount > tf_movement_stun_legacy_threshold.GetFloat() ) + { + // For any stun amount greater than the threshold, we use the legacy clip behavior. + flStunAmount = max( flStunAmount - 0.41421356237f, 0.0f ); // Reduce by sqrt(2) - 1.0f (see above) + } + else + { +#ifdef STAGING_ONLY + // For playing around with the scaling. 
+ flStunAmount = max( flStunAmount - tf_movement_stun_clip.GetFloat(), 0.0f ) * tf_movement_stun_multiplier.GetFloat(); +#else + // This equation essentially calculates the percentage of the stun amount that was effectively applied to diagonal movement + // at a certain stun amount and applies that to all stun amounts consistently now. + flStunAmount *= ( ( -0.41421356237f / tf_movement_stun_legacy_threshold.GetFloat() ) + 1 ); +#endif + } + } // Lerp to the desired amount if ( flStunAmount ) { @@ -1089,10 +1137,11 @@ void CTFGameMovement::PreventBunnyJumping() return; // Current player speed - float spd = mv->m_vecVelocity.Length(); - if ( spd <= maxscaledspeed ) + float spd = mv->m_vecVelocity.LengthSqr(); + if ( spd <= maxscaledspeed * maxscaledspeed ) return; + spd = FastSqrt(spd); // Apply this cropping fraction to velocity float fraction = ( maxscaledspeed / spd ); @@ -1871,7 +1920,8 @@ void CTFGameMovement::WalkMove( void ) { // Made it to the destination (remove the base velocity). mv->SetAbsOrigin( trace.endpos ); - VectorSubtract( mv->m_vecVelocity, player->GetBaseVelocity(), mv->m_vecVelocity ); + Vector baseVelocity = player->GetBaseVelocity(); + VectorSubtract( mv->m_vecVelocity, baseVelocity, mv->m_vecVelocity ); // Save the wish velocity. mv->m_outWishVel += ( vecWishDirection * flWishSpeed ); @@ -1880,6 +1930,22 @@ void CTFGameMovement::WalkMove( void ) // NOTE YWB 7/5/07: Don't do this here, our version of CategorizePosition encompasses this test // StayOnGround(); +#if 1 + // Debugging!!! + Vector vecTestVelocity = mv->m_vecVelocity; + vecTestVelocity.z = 0.0f; + float flTestSpeed = VectorLength( vecTestVelocity ); + if ( tf_showspeed.GetInt() == 2 && baseVelocity.IsZero() && ( flTestSpeed > ( mv->m_flMaxSpeed + 1.0f ) ) ) + { + Msg( "Step Max Speed < %f\n", flTestSpeed ); + } + + if ( tf_showspeed.GetInt() == 2 ) + { + Msg( "Speed = %f\n", flTestSpeed ); + } +#endif + #ifdef CLIENT_DLL // Track how far we moved (if we're a Scout or an Engineer carrying a building). CTFPlayer* pTFPlayer = ToTFPlayer( player ); @@ -1918,19 +1984,19 @@ void CTFGameMovement::WalkMove( void ) // NOTE YWB 7/5/07: Don't do this here, our version of CategorizePosition encompasses this test // StayOnGround(); -#if 0 +#if 1 // Debugging!!! 
Vector vecTestVelocity = mv->m_vecVelocity; vecTestVelocity.z = 0.0f; float flTestSpeed = VectorLength( vecTestVelocity ); - if ( baseVelocity.IsZero() && ( flTestSpeed > ( mv->m_flMaxSpeed + 1.0f ) ) ) + if ( tf_showspeed.GetInt() == 1 && baseVelocity.IsZero() && ( flTestSpeed > ( mv->m_flMaxSpeed + 1.0f ) ) ) { Msg( "Step Max Speed < %f\n", flTestSpeed ); } - if ( tf_showspeed.GetBool() ) + if ( tf_showspeed.GetInt() == 1 ) { - Msg( "Speed=%f\n", flTestSpeed ); + Msg( "Speed = %f\n", flTestSpeed ); } #endif From efe9744c6fdcc9bbe34b9ebdf428f71ecd0b46b8 Mon Sep 17 00:00:00 2001 From: mastercoms Date: Mon, 13 Mar 2023 11:26:53 -0400 Subject: [PATCH 29/42] perf(load): enable async MDL loading, speeds up map loads and reduces stutters --- src/datacache/mdlcache.cpp | 8 +------- src/engine/modelloader.cpp | 2 +- src/public/datacache/imdlcache.h | 6 ++++++ 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/datacache/mdlcache.cpp b/src/datacache/mdlcache.cpp index 8f1ff3650..859c636a8 100644 --- a/src/datacache/mdlcache.cpp +++ b/src/datacache/mdlcache.cpp @@ -78,12 +78,6 @@ namespace { #define MdlCacheMsg if ( !LogMdlCache() ) ; else Msg #define MdlCacheWarning if ( !LogMdlCache() ) ; else Warning -#if defined( _X360 ) -#define AsyncMdlCache() 0 // Explicitly OFF for 360 (incompatible) -#else -#define AsyncMdlCache() 0 -#endif - #define ERROR_MODEL "models/error.mdl" #define IDSTUDIOHEADER (('T'<<24)+('S'<<16)+('D'<<8)+'I') @@ -184,7 +178,7 @@ class CTempAllocHelper // ConVars //----------------------------------------------------------------------------- static ConVar r_rootlod( "r_rootlod", "0", FCVAR_ARCHIVE ); -static ConVar mod_forcedata( "mod_forcedata", ( AsyncMdlCache() ) ? "0" : "1", 0, "Forces all model file data into cache on model load." ); +static ConVar mod_forcedata( "mod_forcedata", ( AsyncMdlCache() && IsX360() ) ? "0" : "1", 0, "Forces all model file data into cache on model load." 
); static ConVar mod_test_not_available( "mod_test_not_available", "0", FCVAR_CHEAT ); static ConVar mod_test_mesh_not_available( "mod_test_mesh_not_available", "0", FCVAR_CHEAT ); static ConVar mod_test_verts_not_available( "mod_test_verts_not_available", "0", FCVAR_CHEAT ); diff --git a/src/engine/modelloader.cpp b/src/engine/modelloader.cpp index 72a479d49..4835fea06 100644 --- a/src/engine/modelloader.cpp +++ b/src/engine/modelloader.cpp @@ -5022,7 +5022,7 @@ void CModelLoader::Studio_LoadModel( model_t *pModel, bool bTouchAllData ) if ( bLoadPhysics && !bPreLoaded ) { // load the collision data now - bool bSynchronous = bTouchAllData; + bool bSynchronous = bTouchAllData && !AsyncMdlCache(); double t1 = Plat_FloatTime(); g_pMDLCache->GetVCollideEx( pModel->studio, bSynchronous ); diff --git a/src/public/datacache/imdlcache.h b/src/public/datacache/imdlcache.h index 0f7093abe..6d9d31ec9 100644 --- a/src/public/datacache/imdlcache.h +++ b/src/public/datacache/imdlcache.h @@ -21,6 +21,12 @@ #include "appframework/IAppSystem.h" +#if defined( _X360 ) +#define AsyncMdlCache() 0 // Explicitly OFF for 360 (incompatible) +#else +#define AsyncMdlCache() 1 +#endif + //----------------------------------------------------------------------------- // Forward declarations //----------------------------------------------------------------------------- From bba6420c7c0d9319e4ce490150931afc7094cd30 Mon Sep 17 00:00:00 2001 From: mastercoms Date: Mon, 13 Mar 2023 11:28:52 -0400 Subject: [PATCH 30/42] perf: disable affinity this was done in DOTA, let the OS handle where threads should go, we can't really determine what cores will be ideal given that hardware is so varied. --- src/vstdlib/jobthread.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/vstdlib/jobthread.cpp b/src/vstdlib/jobthread.cpp index 46d843e04..e7a7c0a54 100644 --- a/src/vstdlib/jobthread.cpp +++ b/src/vstdlib/jobthread.cpp @@ -1023,6 +1023,7 @@ bool CThreadPool::Start( const ThreadPoolStartParams_t &startParams, const char void CThreadPool::Distribute( bool bDistribute, int *pAffinityTable ) { +#ifdef _X360 if ( bDistribute ) { const CPUInformation &ci = *GetCPUInformation(); @@ -1104,6 +1105,7 @@ void CThreadPool::Distribute( bool bDistribute, int *pAffinityTable ) } #endif } +#endif } //--------------------------------------------------------- From 481f5eb6e98c33f8e6e188d83e95e24d7600e358 Mon Sep 17 00:00:00 2001 From: mastercoms Date: Mon, 13 Mar 2023 11:40:30 -0400 Subject: [PATCH 31/42] perf: backport GetPointContents_WorldOnly from CSGO we only query water in these cases, and we don't have a situation where querying the world only would be inaccurate this fixes an issue where bones would be set up in prediction due to a spatial query update which caused attached cosmetics to re-evaluate their origin according to their attachment because prediction changes the time, this would flip flop bone setup multiple times per frame --- src/engine/enginetrace.cpp | 11 ++++++++++- src/game/client/c_baseplayer.cpp | 2 +- src/game/shared/gamemovement.cpp | 4 ++-- src/game/shared/physics_main_shared.cpp | 15 ++++++++++++--- src/public/engine/IEngineTrace.h | 3 +++ 5 files changed, 28 insertions(+), 7 deletions(-) diff --git a/src/engine/enginetrace.cpp b/src/engine/enginetrace.cpp index 54ac243a1..6af21f01b 100644 --- a/src/engine/enginetrace.cpp +++ b/src/engine/enginetrace.cpp @@ -73,7 +73,7 @@ abstract_class CEngineTrace : public IEngineTrace CEngineTrace() { m_pRootMoveParent = NULL; } // Returns the contents mask at a particular 
world-space position virtual int GetPointContents( const Vector &vecAbsPosition, IHandleEntity** ppEntity ); - + virtual int GetPointContents_WorldOnly( const Vector &vecAbsPosition ); virtual int GetPointContents_Collideable( ICollideable *pCollide, const Vector &vecAbsPosition ); // Traces a ray against a particular edict @@ -375,6 +375,15 @@ class CPointContentsEnum : public IPartitionEnumerator Vector m_Pos; }; +//----------------------------------------------------------------------------- +// Returns the world contents +//----------------------------------------------------------------------------- +int CEngineTrace::GetPointContents_WorldOnly( const Vector &vecAbsPosition ) +{ + int nContents = CM_PointContents( vecAbsPosition, 0 ); + + return nContents; +} //----------------------------------------------------------------------------- // Returns the contents mask at a particular world-space position diff --git a/src/game/client/c_baseplayer.cpp b/src/game/client/c_baseplayer.cpp index 1aadef5c2..6d216eee8 100644 --- a/src/game/client/c_baseplayer.cpp +++ b/src/game/client/c_baseplayer.cpp @@ -499,7 +499,7 @@ bool C_BasePlayer::AudioStateIsUnderwater( Vector vecMainViewOrigin ) if ( IsObserver() ) { // Just check the view position - int cont = enginetrace->GetPointContents ( vecMainViewOrigin ); + int cont = enginetrace->GetPointContents_WorldOnly ( vecMainViewOrigin ); return (cont & MASK_WATER); } diff --git a/src/game/shared/gamemovement.cpp b/src/game/shared/gamemovement.cpp index ab6080dff..f8f88b403 100644 --- a/src/game/shared/gamemovement.cpp +++ b/src/game/shared/gamemovement.cpp @@ -3490,7 +3490,7 @@ int CGameMovement::GetPointContentsCached( const Vector &point, int slot ) if ( m_CachedGetPointContents[ idx ][ slot ] == -9999 || point.DistToSqr( m_CachedGetPointContentsPoint[ idx ][ slot ] ) > 1 ) { - m_CachedGetPointContents[ idx ][ slot ] = enginetrace->GetPointContents ( point ); + m_CachedGetPointContents[ idx ][ slot ] = enginetrace->GetPointContents_WorldOnly ( point ); m_CachedGetPointContentsPoint[ idx ][ slot ] = point; } @@ -3498,7 +3498,7 @@ int CGameMovement::GetPointContentsCached( const Vector &point, int slot ) } else { - return enginetrace->GetPointContents ( point ); + return enginetrace->GetPointContents_WorldOnly ( point ); } } diff --git a/src/game/shared/physics_main_shared.cpp b/src/game/shared/physics_main_shared.cpp index c3719d77b..bf826be4b 100644 --- a/src/game/shared/physics_main_shared.cpp +++ b/src/game/shared/physics_main_shared.cpp @@ -1130,6 +1130,15 @@ unsigned int CBaseEntity::PhysicsSolidMaskForEntity( void ) const return MASK_SOLID; } +static inline int GetWaterContents( const Vector &point ) +{ +#ifdef HL2_DLL + return UTIL_PointContents(point); +#else + // left 4 dead doesn't support moveable water brushes, only world water + return enginetrace->GetPointContents_WorldOnly(point); +#endif +} //----------------------------------------------------------------------------- // Computes the water level + type @@ -1146,7 +1155,7 @@ void CBaseEntity::UpdateWaterState() SetWaterLevel( 0 ); SetWaterType( CONTENTS_EMPTY ); - int cont = UTIL_PointContents (point); + int cont = GetWaterContents (point); if (( cont & MASK_WATER ) == 0) return; @@ -1164,14 +1173,14 @@ void CBaseEntity::UpdateWaterState() // Check the exact center of the box point[2] = WorldSpaceCenter().z; - int midcont = UTIL_PointContents (point); + int midcont = GetWaterContents (point); if ( midcont & MASK_WATER ) { // Now check where the eyes are... 
SetWaterLevel( 2 ); point[2] = EyePosition().z; - int eyecont = UTIL_PointContents (point); + int eyecont = GetWaterContents (point); if ( eyecont & MASK_WATER ) { SetWaterLevel( 3 ); diff --git a/src/public/engine/IEngineTrace.h b/src/public/engine/IEngineTrace.h index 6e4977190..b16ef36a1 100644 --- a/src/public/engine/IEngineTrace.h +++ b/src/public/engine/IEngineTrace.h @@ -130,6 +130,9 @@ abstract_class IEngineTrace public: // Returns the contents mask + entity at a particular world-space position virtual int GetPointContents( const Vector &vecAbsPosition, IHandleEntity** ppEntity = NULL ) = 0; + + // Returns the contents mask of the world only @ the world-space position (static props are ignored) + virtual int GetPointContents_WorldOnly( const Vector &vecAbsPosition ) = 0; // Get the point contents, but only test the specific entity. This works // on static props and brush models. From 92f8de03b807a42875d0fa37b5a901c8ab0fb604 Mon Sep 17 00:00:00 2001 From: mastercoms Date: Mon, 13 Mar 2023 11:42:13 -0400 Subject: [PATCH 32/42] perf: backport IdealPitch optimization from CSGO removes ideal pitch, could not see any relevant code path being hit this is relevant because it causes a setup bones in spatial query from the trace during prediction, which similarly to in the water contents optimization, would cause a lot of setup bones --- src/game/client/prediction.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/game/client/prediction.cpp b/src/game/client/prediction.cpp index f81e79d59..d47f5718f 100644 --- a/src/game/client/prediction.cpp +++ b/src/game/client/prediction.cpp @@ -1708,8 +1708,10 @@ void CPrediction::_Update( bool received_new_world_update, bool validframe, Assert( C_BaseEntity::IsAbsQueriesValid() ); // FIXME: What about hierarchy here?!? +#if 0 // Where is this ever used? SetIdealPitch( localPlayer, localPlayer->GetLocalOrigin(), localPlayer->GetLocalAngles(), localPlayer->m_vecViewOffset ); #endif +#endif } From e0a134ece7f1dbeebcc3c791d2fbf1dc44dcf89b Mon Sep 17 00:00:00 2001 From: mastercoms Date: Mon, 13 Mar 2023 11:43:38 -0400 Subject: [PATCH 33/42] perf: don't force material system config update on exec this causes a duplicate material system reload on init and also slows down config execution. 
material system config updates are already checked per frame, I'm not exactly sure why this is here but it may cause regressions that I am not aware about --- src/engine/cmd.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/engine/cmd.cpp b/src/engine/cmd.cpp index 20beda09a..830fcfd87 100644 --- a/src/engine/cmd.cpp +++ b/src/engine/cmd.cpp @@ -685,7 +685,7 @@ void Cmd_Exec_f( const CCommand &args ) } } // force any queued convar changes to flush before reading/writing them - UpdateMaterialSystemConfig(); + //UpdateMaterialSystemConfig(); } From 8d95a3347210669488f8c0d1a78c365e29e14894 Mon Sep 17 00:00:00 2001 From: mastercoms Date: Mon, 13 Mar 2023 11:58:28 -0400 Subject: [PATCH 34/42] perf: backport int networking optimization from CSGO --- src/engine/dt_encode.cpp | 40 ++++++++++++++++++---------------------- 1 file changed, 18 insertions(+), 22 deletions(-) diff --git a/src/engine/dt_encode.cpp b/src/engine/dt_encode.cpp index fdd0639ae..16ab72e04 100644 --- a/src/engine/dt_encode.cpp +++ b/src/engine/dt_encode.cpp @@ -221,31 +221,20 @@ void DecodeInfo::CopyVars( const DecodeInfo *pOther ) // ---------------------------------------------------------------------------------------- // void Int_Encode( const unsigned char *pStruct, DVariant *pVar, const SendProp *pProp, bf_write *pOut, int objectID ) -{ - int nValue = pVar->m_Int; - +{ if ( pProp->GetFlags() & SPROP_VARINT) { if ( pProp->GetFlags() & SPROP_UNSIGNED ) { - pOut->WriteVarInt32( nValue ); + pOut->WriteVarInt32( pVar->m_Int ); } else { - pOut->WriteSignedVarInt32( nValue ); + pOut->WriteSignedVarInt32( pVar->m_Int ); } } else { - // If signed, preserve lower bits and then re-extend sign if nValue < 0; - // if unsigned, preserve all 32 bits no matter what. Bonus: branchless. - int nPreserveBits = ( 0x7FFFFFFF >> ( 32 - pProp->m_nBits ) ); - nPreserveBits |= ( pProp->GetFlags() & SPROP_UNSIGNED ) ? 0xFFFFFFFF : 0; - int nSignExtension = ( nValue >> 31 ) & ~nPreserveBits; - - nValue &= nPreserveBits; - nValue |= nSignExtension; - #ifdef DBGFLAG_ASSERT // Assert that either the property is unsigned and in valid range, // or signed with a consistent sign extension in the high bits @@ -253,21 +242,28 @@ void Int_Encode( const unsigned char *pStruct, DVariant *pVar, const SendProp *p { if ( pProp->GetFlags() & SPROP_UNSIGNED ) { - AssertMsg3( nValue == pVar->m_Int, "Unsigned prop %s needs more bits? Expected %i == %i", pProp->GetName(), nValue, pVar->m_Int ); + int32 nMaskedValue = pVar->m_Int; + nMaskedValue &= (1u << pProp->m_nBits) - 1; + Assert(nMaskedValue == pVar->m_Int); } else { - AssertMsg3( nValue == pVar->m_Int, "Signed prop %s needs more bits? Expected %i == %i", pProp->GetName(), nValue, pVar->m_Int ); + int32 nSignExtendedValue = pVar->m_Int; + nSignExtendedValue <<= 32 - pProp->m_nBits; + nSignExtendedValue >>= 32 - pProp->m_nBits; + Assert(nSignExtendedValue == pVar->m_Int); } } +#endif + + if (pProp->IsSigned()) + { + pOut->WriteSBitLong(pVar->m_Int, pProp->m_nBits); + } else { - // This should never trigger, but I'm leaving it in for old-time's sake. 
- Assert( nValue == pVar->m_Int ); + pOut->WriteUBitLong((unsigned int)pVar->m_Int, pProp->m_nBits); } -#endif - - pOut->WriteUBitLong( nValue, pProp->m_nBits, false ); } } @@ -322,7 +318,7 @@ int Int_CompareDeltas( const SendProp *pProp, bf_read *p1, bf_read *p2 ) return p1->ReadSignedVarInt32() != p2->ReadSignedVarInt32(); } - return p1->CompareBits(p2, pProp->m_nBits); + return p1->ReadUBitLong( pProp->m_nBits ) != p2->ReadUBitLong( pProp->m_nBits ); } const char* Int_GetTypeNameString() From f1885810e3d8be71e450c625465e7e641fd2a8fa Mon Sep 17 00:00:00 2001 From: mastercoms Date: Mon, 13 Mar 2023 12:10:17 -0400 Subject: [PATCH 35/42] perf: shared object find optimizations when an econ item view goes looking for a econ item, it looks through the CSharedObjectTypeCache ideally, this would maintain a map instead of looking up with O(n) search, but that's a more complicated and scary change instead, skip the amount of virtual GetTypeID calls since SharedObject::BIsKeyEqual is only used in CSharedObjectTypeCache so the equal types are guaranteed. on GC, BIsKeyEqual is used in CSharedObjectTransaction as well, so we keep the old check for it. also move to a static_cast for CEconItem to be explicit/efficient with casting --- src/game/shared/econ/econ_item.cpp | 2 +- src/gcsdk/sharedobject.cpp | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/game/shared/econ/econ_item.cpp b/src/game/shared/econ/econ_item.cpp index a5ec004fa..389a5e832 100644 --- a/src/game/shared/econ/econ_item.cpp +++ b/src/game/shared/econ/econ_item.cpp @@ -1509,7 +1509,7 @@ bool CEconItem::BAddDestroyToMessage( std::string *pBuffer ) const bool CEconItem::BIsKeyLess( const CSharedObject & soRHS ) const { Assert( GetTypeID() == soRHS.GetTypeID() ); - const CEconItem & soSchemaRHS = (const CEconItem &)soRHS; + const CEconItem & soSchemaRHS = static_cast(soRHS); return m_ulID < soSchemaRHS.m_ulID; } diff --git a/src/gcsdk/sharedobject.cpp b/src/gcsdk/sharedobject.cpp index 5b9c4e235..3baf0aa0a 100644 --- a/src/gcsdk/sharedobject.cpp +++ b/src/gcsdk/sharedobject.cpp @@ -113,8 +113,13 @@ const char *CSharedObject::PchClassUpdateNodeName( int nTypeID ) bool CSharedObject::BIsKeyEqual( const CSharedObject & soRHS ) const { // Make sure they are the same type. +#ifdef GC if ( GetTypeID() != soRHS.GetTypeID() ) return false; +#else + // BIsKeyEqual is only used for objects of the same type within their CSharedObjectTypeCache. 
+ Assert ( GetTypeID() == soRHS.GetTypeID() ); +#endif return !BIsKeyLess( soRHS ) && !soRHS.BIsKeyLess( *this ); } From 0ac37364e2e4e0ddf89c5fa0fa5501ea9e1899d4 Mon Sep 17 00:00:00 2001 From: mastercoms Date: Mon, 13 Mar 2023 12:21:18 -0400 Subject: [PATCH 36/42] perf: skip costly per-frame RTTI for players and weapons --- src/game/shared/tf/tf_viewmodel.cpp | 74 ++++++++++++++++------------- 1 file changed, 41 insertions(+), 33 deletions(-) diff --git a/src/game/shared/tf/tf_viewmodel.cpp b/src/game/shared/tf/tf_viewmodel.cpp index c3d2231d6..bcd5dcbda 100644 --- a/src/game/shared/tf/tf_viewmodel.cpp +++ b/src/game/shared/tf/tf_viewmodel.cpp @@ -384,8 +384,8 @@ class CViewModelInvisProxy : public CBaseInvisMaterialProxy virtual void OnBind( C_BaseEntity *pC_BaseEntity ); }; -#define TF_VM_MIN_INVIS 0.22 -#define TF_VM_MAX_INVIS 0.5 +#define TF_VM_MIN_INVIS 0.22f +#define TF_VM_MAX_INVIS 0.5f //----------------------------------------------------------------------------- // Purpose: @@ -475,6 +475,9 @@ class CInvisProxy : public CBaseInvisMaterialProxy { public: virtual void OnBind( C_BaseEntity *pC_BaseEntity ) OVERRIDE; +private: + CTFPlayer *pPlayer = NULL; + C_BaseEntity *pCachedEntity = NULL; }; //----------------------------------------------------------------------------- @@ -487,59 +490,64 @@ void CInvisProxy::OnBind( C_BaseEntity *pC_BaseEntity ) C_BaseEntity *pEnt = pC_BaseEntity; - CTFPlayer *pPlayer = NULL; - - // Check if we have a move parent and if it's a player - C_BaseEntity *pMoveParent = pEnt->GetMoveParent(); - if ( pMoveParent && pMoveParent->IsPlayer() ) + if ( pEnt != pCachedEntity ) { - pPlayer = ToTFPlayer( pMoveParent ); + pPlayer = NULL; + pCachedEntity = pEnt; } - // If it's not a player then check for viewmodel. if ( !pPlayer ) { - CBaseEntity *pEntParent = pMoveParent ? pMoveParent : pEnt; - - CTFViewModel *pVM = dynamic_cast( pEntParent ); - if ( pVM ) + // Check if we have a move parent and if it's a player + C_BaseEntity *pMoveParent = pEnt->GetMoveParent(); + if ( pMoveParent && pMoveParent->IsPlayer() ) { - pPlayer = ToTFPlayer( pVM->GetOwner() ); + pPlayer = ToTFPlayer( pMoveParent ); } - } - - if ( !pPlayer ) - { - if ( pEnt->IsPlayer() ) + // If it's not a player then check for viewmodel. + if ( !pPlayer ) { - pPlayer = dynamic_cast( pEnt ); + CBaseEntity *pEntParent = pMoveParent ? pMoveParent : pEnt; + + CTFViewModel *pVM = dynamic_cast( pEntParent ); + if ( pVM ) + { + pPlayer = ToTFPlayer( pVM->GetOwner() ); + } } - else + + if ( !pPlayer ) { - IHasOwner *pOwnerInterface = dynamic_cast( pEnt ); - if ( pOwnerInterface ) + if ( pEnt->IsPlayer() ) + { + pPlayer = dynamic_cast( pEnt ); + } + else { - pPlayer = ToTFPlayer( pOwnerInterface->GetOwnerViaInterface() ); + IHasOwner *pOwnerInterface = dynamic_cast( pEnt ); + if ( pOwnerInterface ) + { + pPlayer = ToTFPlayer( pOwnerInterface->GetOwnerViaInterface() ); + } } } - } - - if ( !pPlayer ) - { - m_pPercentInvisible->SetFloatValue( 0.0f ); - return; + + if ( !pPlayer ) + { + m_pPercentInvisible->SetFloatValue( 0.0f ); + return; + } } // If we're the local player, use the old "vm_invis" code. Otherwise, use the "weapon_invis". if ( pPlayer->IsLocalPlayer() ) { float flPercentInvisible = pPlayer->GetPercentInvisible(); - float flWeaponInvis = flPercentInvisible; // remap from 0.22 to 0.5 // but drop to 0.0 if we're not invis at all - flWeaponInvis = ( flPercentInvisible < 0.01 ) ? - 0.0 : + float flWeaponInvis = ( flPercentInvisible < 0.01f ) ? 
+ 0.0f : RemapVal( flPercentInvisible, 0.0, 1.0, TF_VM_MIN_INVIS, TF_VM_MAX_INVIS ); // Exaggerated blink effect on bump. From 1f38e57f2fcfd738c87f8876ef118ab7f37dcbd1 Mon Sep 17 00:00:00 2001 From: mastercoms Date: Mon, 13 Mar 2023 12:25:41 -0400 Subject: [PATCH 37/42] perf: disable COM_TimestampedLog within threaded particles, there was a lot of traffic on a log mutex, and that was slowing things down significantly I'm not exactly sure why this was triggering, as the code seems to prevent this from going off unless there is a -profile or -etwprofile argument, will investigate later. another option is also removing the COM_TimestampedLog from just the particles function, but who knows what else this could be slowing down --- src/tier0/dbg.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/tier0/dbg.cpp b/src/tier0/dbg.cpp index ce01801f1..d10e4df04 100644 --- a/src/tier0/dbg.cpp +++ b/src/tier0/dbg.cpp @@ -870,6 +870,7 @@ void ValidateSpew( CValidator &validator ) //----------------------------------------------------------------------------- void COM_TimestampedLog( char const *fmt, ... ) { +#ifdef _DEBUG static float s_LastStamp = 0.0; static bool s_bShouldLog = false; static bool s_bShouldLogToETW = false; @@ -925,6 +926,7 @@ void COM_TimestampedLog( char const *fmt, ... ) } s_LastStamp = curStamp; +#endif } //----------------------------------------------------------------------------- From ed58eae7742c4156752338f364015bc6d1a84e25 Mon Sep 17 00:00:00 2001 From: mastercoms Date: Mon, 13 Mar 2023 12:26:44 -0400 Subject: [PATCH 38/42] perf: do not declare convars inline with runtime code a weird misusage of the convar constructor, should be done at init instead --- src/engine/l_studio.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/engine/l_studio.cpp b/src/engine/l_studio.cpp index ef96ab929..a3ab707c0 100644 --- a/src/engine/l_studio.cpp +++ b/src/engine/l_studio.cpp @@ -3179,6 +3179,11 @@ int CModelRender::DrawStaticPropArrayFast( StaticPropRenderInfo_t *pProps, int c #endif // SWDS } +#ifndef SWDS +static ConVar r_shadowlod("r_shadowlod", "-1"); +static ConVar r_shadowlodbias("r_shadowlodbias", "2"); +#endif + //----------------------------------------------------------------------------- // Shadow rendering //----------------------------------------------------------------------------- @@ -3186,8 +3191,7 @@ matrix3x4_t* CModelRender::DrawModelShadowSetup( IClientRenderable *pRenderable, { #ifndef SWDS DrawModelInfo_t &info = *pInfo; - static ConVar r_shadowlod("r_shadowlod", "-1"); - static ConVar r_shadowlodbias("r_shadowlodbias", "2"); + model_t const* pModel = pRenderable->GetModel(); if ( !pModel ) From 94fdfd445bde72ae5296df19b3720983ba4dfe43 Mon Sep 17 00:00:00 2001 From: mastercoms Date: Mon, 13 Mar 2023 12:39:16 -0400 Subject: [PATCH 39/42] perf: enable rate limiting water bullet impact effects --- src/game/shared/tf/tf_player_shared.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/game/shared/tf/tf_player_shared.cpp b/src/game/shared/tf/tf_player_shared.cpp index 788310cab..79c58ec21 100644 --- a/src/game/shared/tf/tf_player_shared.cpp +++ b/src/game/shared/tf/tf_player_shared.cpp @@ -10095,8 +10095,8 @@ void CTFPlayer::FireBullet( CTFWeaponBase *pWpn, const FireBulletsInfo_t &info, } #ifdef CLIENT_DLL -static ConVar tf_impactwatertimeenable( "tf_impactwatertimeenable", "0", FCVAR_CHEAT, "Draw impact debris effects." 
); -static ConVar tf_impactwatertime( "tf_impactwatertime", "1.0f", FCVAR_CHEAT, "Draw impact debris effects." ); +static ConVar tf_impactwatertimeenable( "tf_impactwatertimeenable", "1", 0, "Rate limit bullet impact effects on water." ); +static ConVar tf_impactwatertime( "tf_impactwatertime", "0.2f", 0, "The interval between bullet impact effects on water." ); #endif //----------------------------------------------------------------------------- From 75f42245c2e6647c019fd4c1a2f75a274abeba30 Mon Sep 17 00:00:00 2001 From: mastercoms Date: Sun, 19 Mar 2023 06:54:06 -0400 Subject: [PATCH 40/42] pending: experimental changes --- src/engine/cmodel.cpp | 2 +- src/engine/engine.vpc | 2 +- src/engine/enginetool.cpp | 2 +- src/engine/gl_rsurf.cpp | 35 ++++++++++++++- src/engine/host.cpp | 21 ++++++--- src/engine/host.h | 4 +- src/engine/l_studio.cpp | 20 ++++----- src/engine/modelloader.cpp | 8 +--- src/engine/net_ws.cpp | 10 ++++- src/engine/vgui_baseui_interface.cpp | 6 ++- src/game/client/c_vguiscreen.cpp | 2 + src/game/client/game_controls/MapOverview.cpp | 6 +-- .../client/game_controls/baseviewport.cpp | 4 +- src/game/client/hud_base_account.cpp | 6 +-- src/game/client/hud_controlpointicons.cpp | 2 +- src/game/client/menu.cpp | 12 ++--- src/game/client/perfvisualbenchmark.cpp | 2 +- .../replay/vgui/replayperformanceeditor.cpp | 2 +- src/game/client/tf/c_tf_player.cpp | 24 ++++++++-- src/game/client/tf/tf_hud_arena_vs_panel.cpp | 2 +- src/game/client/tf/tf_hud_escort.cpp | 6 +-- src/game/client/tf/tf_hud_flagstatus.cpp | 18 ++++---- src/game/client/tf/tf_hud_itemeffectmeter.cpp | 4 +- .../tf/tf_hud_mann_vs_machine_status.cpp | 2 +- src/game/client/tf/tf_hud_match_status.cpp | 2 +- src/game/client/tf/tf_hud_passtime.cpp | 4 +- src/game/client/tf/tf_hud_playerstatus.cpp | 4 +- src/game/client/tf/tf_hud_pve_winpanel.cpp | 2 +- .../tf/tf_hud_robot_destruction_status.cpp | 22 +++++----- src/game/client/tf/tf_hud_scope.cpp | 1 + src/game/client/tf/tf_hud_target_id.cpp | 6 +-- src/game/client/tf/tf_hud_tournament.cpp | 44 +++++++++---------- src/game/client/tf/tf_time_panel.cpp | 2 +- ..._matchmaking_dashboard_next_map_voting.cpp | 8 +++- .../client/tf/vgui/tf_playermodelpanel.cpp | 4 ++ src/game/client/tf/vgui/tf_training_ui.cpp | 16 +++---- src/game/client/viewrender.cpp | 2 +- src/game/shared/econ/econ_item_inventory.cpp | 12 ++--- src/game/shared/econ/econ_item_schema.cpp | 12 ++--- src/game/shared/econ/econ_item_system.cpp | 3 +- src/game/shared/econ/econ_item_view.cpp | 3 ++ src/game/shared/econ/econ_item_view.h | 36 +++++++++++++++ src/game/shared/teamplay_round_timer.cpp | 3 ++ .../shared/teamplayroundbased_gamerules.cpp | 2 +- src/game/shared/tf/tf_gamerules.h | 2 +- src/game/shared/tf/tf_viewmodel.cpp | 4 +- src/game/shared/tf/tf_weapon_sniperrifle.cpp | 14 +++--- src/gameui/GameUI_Interface.cpp | 4 ++ src/inputsystem/inputsystem.cpp | 2 +- src/mathlib/mathlib_base.cpp | 4 +- src/public/bone_setup.cpp | 4 +- src/public/collisionutils.cpp | 10 ++--- src/public/collisionutils.h | 25 ++++++++--- src/public/mathlib/ssemath.h | 3 +- src/public/tier1/strtools.h | 29 +++++++++++- .../vgui_controls/AnimationController.h | 6 +-- src/public/vgui_controls/EditablePanel.h | 8 ++-- src/public/vgui_controls/Label.h | 2 + src/public/vgui_controls/Panel.h | 2 + .../DirectXMath-dec2022/Inc/DirectXMath.h | 4 -- src/thirdparty/quickhull/quickhull.vpc | 2 +- src/tier1/utlbuffer.cpp | 2 +- .../vgui_controls/AnimationController.cpp | 22 ++++++---- src/vgui2/vgui_controls/EditablePanel.cpp | 24 ++++++++-- 
src/vgui2/vgui_controls/Label.cpp | 43 ++++++++++++++---- 65 files changed, 411 insertions(+), 195 deletions(-) diff --git a/src/engine/cmodel.cpp b/src/engine/cmodel.cpp index 466d4451d..2647fe30b 100644 --- a/src/engine/cmodel.cpp +++ b/src/engine/cmodel.cpp @@ -1570,7 +1570,7 @@ void FASTCALL CM_TraceToLeaf( TraceInfo_t * RESTRICT pTraceInfo, int ndxLeaf, fl pCounters = pTraceInfo->GetDispCounters(); count = pTraceInfo->GetCount(); - if (IsX360()) + if (IsX360() || 1) { // set up some relatively constant variables we'll use in the loop below fltx4 traceStart = LoadUnaligned3SIMD(pTraceInfo->m_start.Base()); diff --git a/src/engine/engine.vpc b/src/engine/engine.vpc index d0ebc3a55..ef661b04d 100644 --- a/src/engine/engine.vpc +++ b/src/engine/engine.vpc @@ -31,7 +31,7 @@ $Configuration $Compiler [$WIN32] { - $EnableEnhancedInstructionSet "Streaming SIMD Extensions (/arch:SSE)" + $EnableEnhancedInstructionSet "Streaming SIMD Extensions 2 (/arch:SSE2)" } $Linker diff --git a/src/engine/enginetool.cpp b/src/engine/enginetool.cpp index 460838723..6fee58081 100644 --- a/src/engine/enginetool.cpp +++ b/src/engine/enginetool.cpp @@ -444,7 +444,7 @@ void CEngineTool::SetGamePaused( bool paused ) float CEngineTool::GetTimescale() { - return host_timescale.GetFloat(); + return host_timescale.GetFloat() ? host_timescale.GetFloat() : 1.0f; } void CEngineTool::SetTimescale( float scale ) diff --git a/src/engine/gl_rsurf.cpp b/src/engine/gl_rsurf.cpp index 82ed85ac9..650de8c00 100644 --- a/src/engine/gl_rsurf.cpp +++ b/src/engine/gl_rsurf.cpp @@ -47,6 +47,7 @@ #include "materialsystem/imaterialvar.h" #include "coordsize.h" #include "mempool.h" +#include "mathlib/ssemath.h" #ifndef SWDS #include "Overlay.h" #endif @@ -4839,10 +4840,12 @@ static bool EnumerateLeafInBox_R(mnode_t *node, EnumLeafBoxInfo_t& info ) } } -#ifdef _X360 +#if defined(_X360) || USE_DXMATH +#ifdef _DEBUG static fltx4 AlignThatVector(const Vector &vc) { +#ifdef _X360 fltx4 out = __loadunalignedvector(vc.Base()); /* @@ -4853,7 +4856,12 @@ static fltx4 AlignThatVector(const Vector &vc) // squelch the w component return __vrlimi( out, __vzero(), 1, 0 ); +#elif USE_DXMATH + fltx4 out = LoadUnaligned3SIMD(vc.Base()); + return DirectX::XMVectorSetW(out, 0); +#endif } +#endif //----------------------------------------------------------------------------- // Finds all leaves of the BSP tree within a particular volume @@ -4864,9 +4872,11 @@ static bool EnumerateLeafInBox_R(mnode_t * RESTRICT node, const EnumLeafBoxInfo_ if (node->contents == CONTENTS_SOLID) return true; // solid +#ifdef _X360 // speculatively get the children into the cache __dcbt(0,node->children[0]); __dcbt(0,node->children[1]); +#endif // constructing these here prevents LHS if we spill. // it's not quite a quick enough operation to do extemporaneously. @@ -4937,6 +4947,7 @@ static bool EnumerateLeafInBox_R(mnode_t * RESTRICT node, const EnumLeafBoxInfo_ fltx4 vecBoxMax = LoadAlignedSIMD(pInfo->m_vecBoxMax); fltx4 cornermin, cornermax; // by now planeNormal is ready... +#ifdef _X360 fltx4 control = XMVectorGreaterOrEqual( planeNormal, __vzero() ); // now control[i] = planeNormal[i] > 0 ? 0xFF : 0x00 cornermin = XMVectorSelect( vecBoxMax, vecBoxMin, control); // cornermin[i] = control[i] ? 
vecBoxMin[i] : vecBoxMax[i] @@ -4945,6 +4956,7 @@ static bool EnumerateLeafInBox_R(mnode_t * RESTRICT node, const EnumLeafBoxInfo_ // compute dot products fltx4 dotCornerMax = __vmsum3fp(planeNormal, cornermax); // vsumfp ignores w component fltx4 dotCornerMin = __vmsum3fp(planeNormal, cornermin); + fltx4 vPlaneDist = ReplicateX4(plane->dist); UINT conditionRegister; XMVectorGreaterR(&conditionRegister,vPlaneDist,dotCornerMax); @@ -4954,6 +4966,25 @@ static bool EnumerateLeafInBox_R(mnode_t * RESTRICT node, const EnumLeafBoxInfo_ XMVectorGreaterOrEqualR(&conditionRegister,dotCornerMin,vPlaneDist); if ( XMComparisonAllTrue(conditionRegister) ) return EnumerateLeafInBox_R( node->children[0], pInfo ); +#elif USE_DXMATH + fltx4 control = DirectX::XMVectorGreaterOrEqual( planeNormal, LoadZeroSIMD() ); + // now control[i] = planeNormal[i] > 0 ? 0xFF : 0x00 + cornermin = DirectX::XMVectorSelect( vecBoxMax, vecBoxMin, control); // cornermin[i] = control[i] ? vecBoxMin[i] : vecBoxMax[i] + cornermax = DirectX::XMVectorSelect( vecBoxMin, vecBoxMax, control); + // compute dot products + fltx4 dotCornerMax = DirectX::XMVector3Dot(planeNormal, cornermax); // vsumfp ignores w component + fltx4 dotCornerMin = DirectX::XMVector3Dot(planeNormal, cornermin); + + fltx4 vPlaneDist = ReplicateX4(plane->dist); + uint conditionRegister; + DirectX::XMVectorGreaterR(&conditionRegister,vPlaneDist,dotCornerMax); + if (DirectX::XMComparisonAllTrue(conditionRegister)) // plane->normal . cornermax <= plane->dist + return EnumerateLeafInBox_R( node->children[1], pInfo ); + + DirectX::XMVectorGreaterOrEqualR(&conditionRegister,dotCornerMin,vPlaneDist); + if ( DirectX::XMComparisonAllTrue(conditionRegister) ) + return EnumerateLeafInBox_R( node->children[0], pInfo ); +#endif return EnumerateLeafInBox_R( node->children[0], pInfo ) && EnumerateLeafInBox_R( node->children[1], pInfo ); @@ -5326,7 +5357,7 @@ bool CEngineBSPTree::EnumerateLeavesInBox( const Vector& mins, const Vector& max info.m_nContext = context; info.m_vecBoxMax = maxs; info.m_vecBoxMin = mins; -#ifdef _X360 +#if defined(_X360) || USE_DXMATH if (opt_EnumerateLeavesFastAlgorithm.GetBool()) return EnumerateLeafInBox_R( host_state.worldbrush->nodes, &info ); else diff --git a/src/engine/host.cpp b/src/engine/host.cpp index 0ff16bb23..9f1693ba4 100644 --- a/src/engine/host.cpp +++ b/src/engine/host.cpp @@ -585,7 +585,7 @@ static ConVar host_profile( "host_profile","0" ); ConVar host_limitlocal( "host_limitlocal", "0", 0, "Apply cl_cmdrate and cl_updaterate to loopback connection" ); ConVar host_framerate( "host_framerate","0", 0, "Set to lock per-frame time elapse." ); -ConVar host_timescale( "host_timescale","1.0", FCVAR_REPLICATED, "Prescale the clock by this amount." ); +ConVar host_timescale( "host_timescale","0.0", FCVAR_REPLICATED, "Prescale the clock by this amount." ); ConVar host_speeds( "host_speeds","0", 0, "Show general system running times." 
); // set for running times ConVar host_flush_threshold( "host_flush_threshold", "20", 0, "Memory threshold below which the host should flush caches between server instances" ); @@ -1758,7 +1758,10 @@ void Host_ReadPreStartupConfiguration() { "sv_unlockedchapters", // needed to display the startup graphic while loading "snd_legacy_surround", // needed to init the sound system +#if defined( _X360 ) || defined( STAGING_ONLY ) "gameui_xbox", // needed to initialize the correct UI +#endif + "cl_hud_minmode", // needed to initialize the correct UI "save_in_memory" // needed to preread data from the correct location in UI }; @@ -1867,10 +1870,13 @@ void Host_AccumulateTime( float dt ) host_frametime = host_state.interval_per_tick; } + const bool bIsPlayingDemo = demoplayer->IsPlayingBack(); + const float flDemoTimescale = bIsPlayingDemo ? demoplayer->GetPlaybackTimeScale() : 1.0f; + #if 1 if ( host_framerate.GetFloat() > 0 #if !defined(SWDS) - && ( CanCheat() || demoplayer->IsPlayingBack() ) + && ( CanCheat() || bIsPlayingDemo ) #endif ) { @@ -1883,10 +1889,10 @@ void Host_AccumulateTime( float dt ) #if !defined(SWDS) && defined( REPLAY_ENABLED ) extern IDemoPlayer *g_pReplayDemoPlayer; - if ( demoplayer->IsPlayingBack() && demoplayer == g_pReplayDemoPlayer ) + if ( bIsPlayingDemo && demoplayer == g_pReplayDemoPlayer ) { // adjust time scale if playing back demo - host_frametime *= demoplayer->GetPlaybackTimeScale(); + host_frametime *= flDemoTimescale; } #endif @@ -1894,17 +1900,18 @@ void Host_AccumulateTime( float dt ) } else if (host_timescale.GetFloat() > 0 #if !defined(SWDS) - && ( CanCheat() || demoplayer->IsPlayingBack() ) + && ( CanCheat() || bIsPlayingDemo ) + || ( bIsPlayingDemo && flDemoTimescale != 1.0f ) #endif ) { float fullscale = host_timescale.GetFloat(); #if !defined(SWDS) - if ( demoplayer->IsPlayingBack() ) + if ( bIsPlayingDemo ) { // adjust time scale if playing back demo - fullscale *= demoplayer->GetPlaybackTimeScale(); + fullscale *= flDemoTimescale; } #endif diff --git a/src/engine/host.h b/src/engine/host.h index 473a6abd5..4059ea441 100644 --- a/src/engine/host.h +++ b/src/engine/host.h @@ -149,10 +149,10 @@ extern int host_currentframetick; // PERFORMANCE INFO #define MIN_FPS 0.1 // Host minimum fps value for maxfps. -#define MAX_FPS 1000.0 // Upper limit for maxfps. +#define MAX_FPS 10000.0 // Upper limit for maxfps. 
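// The FPS cap and the frame-time floor move together: the old pair was 1000 fps with a
// 0.001 s floor (1/1000), so raising MAX_FPS to 10000 lowers MIN_FRAMETIME below to
// 0.0001 s (1/10000).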
#define MAX_FRAMETIME 0.1 -#define MIN_FRAMETIME 0.001 +#define MIN_FRAMETIME 0.0001 #define TIME_TO_TICKS( dt ) ( (int)( 0.5f + (float)(dt) / host_state.interval_per_tick ) ) #define TICKS_TO_TIME( dt ) ( host_state.interval_per_tick * (float)(dt) ) diff --git a/src/engine/l_studio.cpp b/src/engine/l_studio.cpp index a3ab707c0..aaeeea989 100644 --- a/src/engine/l_studio.cpp +++ b/src/engine/l_studio.cpp @@ -2785,11 +2785,11 @@ struct rbatch_t // ---------------------------------------- */ -inline int FindModel( const CUtlVector &list, const model_t *pModel ) +inline int FindModel( const rmodel_t* pList, int listCount, const model_t* pModel ) { - for ( int j = list.Count(); --j >= 0 ; ) + for ( int j = listCount; --j >= 0 ; ) { - if ( list[j].pModel == pModel ) + if ( pList[j].pModel == pModel ) return j; } return -1; @@ -2806,13 +2806,13 @@ int CModelRender::DrawStaticPropArrayFast( StaticPropRenderInfo_t *pProps, int c #ifndef SWDS MDLCACHE_CRITICAL_SECTION_( g_pMDLCache ); CMatRenderContextPtr pRenderContext( materials ); - const int MAX_OBJECTS = 1024; + const int MAX_OBJECTS = 2048; CUtlSortVector objectList(0, MAX_OBJECTS); - CUtlVector modelList(0,256); - CUtlVector lightObjects(0,256); - CUtlVector shadowObjects(0,64); - CUtlVector decalObjects(0,64); - CUtlVector lightStates(0,256); + CUtlVectorFixedGrowable modelList; + CUtlVectorFixedGrowable lightObjects; + CUtlVectorFixedGrowable shadowObjects; + CUtlVectorFixedGrowable decalObjects; + CUtlVectorFixedGrowable lightStates; bool bForceCubemap = r_showenvcubemap.GetBool(); int drawnCount = 0; int forcedLodSetting = r_lod.GetInt(); @@ -2826,7 +2826,7 @@ int CModelRender::DrawStaticPropArrayFast( StaticPropRenderInfo_t *pProps, int c { drawnCount++; // UNDONE: This is a perf hit in some scenes! Use a hash? - int modelIndex = FindModel( modelList, pProps[i].pModel ); + int modelIndex = FindModel( modelList.Base(), modelList.Count(), pProps[i].pModel ); if ( modelIndex < 0 ) { modelIndex = modelList.AddToTail(); diff --git a/src/engine/modelloader.cpp b/src/engine/modelloader.cpp index 4835fea06..77c379286 100644 --- a/src/engine/modelloader.cpp +++ b/src/engine/modelloader.cpp @@ -62,13 +62,7 @@ ConVar mat_loadtextures( "mat_loadtextures", "1", FCVAR_CHEAT ); -// OS X and Linux are blowing up right now due to this. Benefits vs possible regressions on DX less clear. -#if defined( DX_TO_GL_ABSTRACTION ) || defined( STAGING_ONLY ) - #define CONVAR_DEFAULT_MOD_OFFLINE_HDR_SWITCH "1" -#else - #define CONVAR_DEFAULT_MOD_OFFLINE_HDR_SWITCH "0" -#endif -static ConVar mod_offline_hdr_switch( "mod_offline_hdr_switch", CONVAR_DEFAULT_MOD_OFFLINE_HDR_SWITCH, FCVAR_INTERNAL_USE, +static ConVar mod_offline_hdr_switch( "mod_offline_hdr_switch", "1", FCVAR_INTERNAL_USE, "Re-order the HDR/LDR mode switch to do most of the material system " "reloading with the device offline. 
This reduces unnecessary device " "resource uploads and may drastically reduce load time and memory pressure " diff --git a/src/engine/net_ws.cpp b/src/engine/net_ws.cpp index 3c49ab6cd..5ae945fc0 100644 --- a/src/engine/net_ws.cpp +++ b/src/engine/net_ws.cpp @@ -2960,7 +2960,15 @@ void NET_SetTime( double flRealtime ) } // adjust network time so fakelag works with host_timescale - net_time += frametime * host_timescale.GetFloat(); + const float timescale = host_timescale.GetFloat(); + if (timescale > 0) + { + net_time += frametime * timescale; + } + else + { + net_time += frametime; + } } /* diff --git a/src/engine/vgui_baseui_interface.cpp b/src/engine/vgui_baseui_interface.cpp index 368485a4d..7bdd077d3 100644 --- a/src/engine/vgui_baseui_interface.cpp +++ b/src/engine/vgui_baseui_interface.cpp @@ -124,7 +124,9 @@ IGameConsole *staticGameConsole = NULL; bool s_bWindowsInputEnabled = true; ConVar r_drawvgui( "r_drawvgui", "1", FCVAR_CHEAT, "Enable the rendering of vgui panels" ); +#if defined( _X360 ) || defined( STAGING_ONLY ) ConVar gameui_xbox( "gameui_xbox", "0", 0 ); +#endif void Con_CreateConsolePanel( vgui::Panel *parent ); void CL_CreateEntityReportPanel( vgui::Panel *parent ); @@ -2142,11 +2144,11 @@ void VGui_FindNamedPanels( CUtlVector< vgui::VPANEL >& panelList, char const *pa VGui_RecursiveFindPanels( panelList, embedded, panelname ); } -CON_COMMAND( vgui_togglepanel, "show/hide vgui panel by name." ) +CON_COMMAND_F( vgui_togglepanel, "show/hide vgui panel by name.", FCVAR_CHEAT ) { if ( args.ArgC() < 2 ) { - ConMsg( "Usage: vgui_showpanel panelname\n" ); + ConMsg( "Usage: vgui_togglepanel panelname\n" ); return; } diff --git a/src/game/client/c_vguiscreen.cpp b/src/game/client/c_vguiscreen.cpp index 1be470248..34aada843 100644 --- a/src/game/client/c_vguiscreen.cpp +++ b/src/game/client/c_vguiscreen.cpp @@ -650,6 +650,8 @@ C_BaseEntity *FindNearbyVguiScreen( const Vector &viewPosition, const QAngle &vi // X360TBD: Turn this on if feature actually used return NULL; } + // Feature not used, causes crashes if entity exists anyway... 
+ return NULL; C_BasePlayer *pLocalPlayer = C_BasePlayer::GetLocalPlayer(); diff --git a/src/game/client/game_controls/MapOverview.cpp b/src/game/client/game_controls/MapOverview.cpp index 0deb535ca..8b574aed4 100644 --- a/src/game/client/game_controls/MapOverview.cpp +++ b/src/game/client/game_controls/MapOverview.cpp @@ -1019,7 +1019,7 @@ void CMapOverview::SetMode(int mode) { ShowPanel( false ); - g_pClientMode->GetViewportAnimationController()->StartAnimationSequence( "MapOff" ); + g_pClientMode->GetViewportAnimationController()->StartAnimationSequence( this, "MapOff", true, true ); } else if ( mode == MAP_MODE_INSET ) { @@ -1041,7 +1041,7 @@ void CMapOverview::SetMode(int mode) if ( mode != m_nMode && RunHudAnimations() ) { - g_pClientMode->GetViewportAnimationController()->StartAnimationSequence( "MapZoomToSmall" ); + g_pClientMode->GetViewportAnimationController()->StartAnimationSequence( this, "MapZoomToSmall", true, true ); } } else if ( mode == MAP_MODE_FULL ) @@ -1061,7 +1061,7 @@ void CMapOverview::SetMode(int mode) if ( mode != m_nMode && RunHudAnimations() ) { - g_pClientMode->GetViewportAnimationController()->StartAnimationSequence( "MapZoomToLarge" ); + g_pClientMode->GetViewportAnimationController()->StartAnimationSequence( this, "MapZoomToLarge", true, true ); } } diff --git a/src/game/client/game_controls/baseviewport.cpp b/src/game/client/game_controls/baseviewport.cpp index 181b0d333..90dab0849 100644 --- a/src/game/client/game_controls/baseviewport.cpp +++ b/src/game/client/game_controls/baseviewport.cpp @@ -79,7 +79,7 @@ void hud_autoreloadscript_callback( IConVar *var, const char *pOldValue, float f static ConVar cl_leveloverviewmarker( "cl_leveloverviewmarker", "0", FCVAR_CHEAT ); -CON_COMMAND( showpanel, "Shows a viewport panel " ) +CON_COMMAND_F( showpanel, "Shows a viewport panel ", FCVAR_CHEAT ) { if ( !gViewPortInterface ) return; @@ -90,7 +90,7 @@ CON_COMMAND( showpanel, "Shows a viewport panel " ) gViewPortInterface->ShowPanel( args[ 1 ], true ); } -CON_COMMAND( hidepanel, "Hides a viewport panel " ) +CON_COMMAND_F( hidepanel, "Hides a viewport panel ", FCVAR_CHEAT ) { if ( !gViewPortInterface ) return; diff --git a/src/game/client/hud_base_account.cpp b/src/game/client/hud_base_account.cpp index 9d99a2c3c..0ec27e0ca 100644 --- a/src/game/client/hud_base_account.cpp +++ b/src/game/client/hud_base_account.cpp @@ -28,7 +28,7 @@ void CHudBaseAccount::LevelInit( void ) m_pszLastAnimationName = NULL; m_pszQueuedAnimationName = NULL; - GetAnimationController()->StartAnimationSequence("AccountMoneyInvisible"); + GetAnimationController()->StartAnimationSequence(this, "AccountMoneyInvisible", true, true); } void CHudBaseAccount::ApplySchemeSettings(vgui::IScheme *pScheme) @@ -91,14 +91,14 @@ void CHudBaseAccount::Paint() { m_pszLastAnimationName = "AccountMoneyAdded"; } - GetAnimationController()->StartAnimationSequence( m_pszLastAnimationName ); + GetAnimationController()->StartAnimationSequence( this, m_pszLastAnimationName, true, true ); m_flLastAnimationEnd = gpGlobals->curtime + GetAnimationController()->GetAnimationSequenceLength( m_pszLastAnimationName ); m_iPreviousAccount = account; } else if ( m_pszQueuedAnimationName ) { - GetAnimationController()->StartAnimationSequence( m_pszQueuedAnimationName ); + GetAnimationController()->StartAnimationSequence( this, m_pszQueuedAnimationName, true, true ); m_pszQueuedAnimationName = NULL; } diff --git a/src/game/client/hud_controlpointicons.cpp b/src/game/client/hud_controlpointicons.cpp index b18ab4491..0c1372732 
100644 --- a/src/game/client/hud_controlpointicons.cpp +++ b/src/game/client/hud_controlpointicons.cpp @@ -551,7 +551,7 @@ void CControlPointIcon::PerformLayout( void ) if ( m_pCapNumPlayers ) { m_pCapNumPlayers->SetVisible( (iPlayers>1) ); - SetDialogVariable( "numcappers", iPlayers ); + SetDialogVariable( "numcappers", iPlayers, false ); m_pCapNumPlayers->SetFgColor( Color(0,0,0,255) ); } diff --git a/src/game/client/menu.cpp b/src/game/client/menu.cpp index f5ee31692..1174fa2cd 100644 --- a/src/game/client/menu.cpp +++ b/src/game/client/menu.cpp @@ -256,12 +256,12 @@ void CHudMenu::SelectMenuItem( int menu_item ) m_nSelectedItem = menu_item; // Pulse the selection - g_pClientMode->GetViewportAnimationController()->StartAnimationSequence("MenuPulse"); + g_pClientMode->GetViewportAnimationController()->StartAnimationSequence(this, "MenuPulse", true, true); // remove the menu quickly m_bMenuTakesInput = false; m_flShutoffTime = gpGlobals->realtime + m_flOpenCloseTime; - g_pClientMode->GetViewportAnimationController()->StartAnimationSequence("MenuClose"); + g_pClientMode->GetViewportAnimationController()->StartAnimationSequence(this, "MenuClose", true, true); } } @@ -365,7 +365,7 @@ void CHudMenu::HideMenu( void ) { m_bMenuTakesInput = false; m_flShutoffTime = gpGlobals->realtime + m_flOpenCloseTime; - g_pClientMode->GetViewportAnimationController()->StartAnimationSequence("MenuClose"); + g_pClientMode->GetViewportAnimationController()->StartAnimationSequence(this, "MenuClose", true, true); } //----------------------------------------------------------------------------- @@ -384,7 +384,7 @@ void CHudMenu::ShowMenu( const char * menuName, int validSlots ) Q_strncpy( g_szPrelocalisedMenuString, menuName, sizeof( g_szPrelocalisedMenuString ) ); - g_pClientMode->GetViewportAnimationController()->StartAnimationSequence("MenuOpen"); + g_pClientMode->GetViewportAnimationController()->StartAnimationSequence(this, "MenuOpen", true, true); m_nSelectedItem = -1; // we have the whole string, so we can localise it now @@ -409,7 +409,7 @@ void CHudMenu::ShowMenu_KeyValueItems( KeyValues *pKV ) m_fWaitingForMore = 0; m_bitsValidSlots = 0; - g_pClientMode->GetViewportAnimationController()->StartAnimationSequence("MenuOpen"); + g_pClientMode->GetViewportAnimationController()->StartAnimationSequence(this, "MenuOpen", true, true); m_nSelectedItem = -1; g_szMenuString[0] = '\0'; @@ -489,7 +489,7 @@ void CHudMenu::MsgFunc_ShowMenu( bf_read &msg) if ( !NeedMore ) { - g_pClientMode->GetViewportAnimationController()->StartAnimationSequence("MenuOpen"); + g_pClientMode->GetViewportAnimationController()->StartAnimationSequence(this, "MenuOpen", true, true); m_nSelectedItem = -1; // we have the whole string, so we can localise it now diff --git a/src/game/client/perfvisualbenchmark.cpp b/src/game/client/perfvisualbenchmark.cpp index 29eb1fdf8..91e336633 100644 --- a/src/game/client/perfvisualbenchmark.cpp +++ b/src/game/client/perfvisualbenchmark.cpp @@ -233,7 +233,7 @@ void CPerfVisualBenchmark::Stop() #endif m_bIsOn = false; Print(); - engine->ClientCmd_Unrestricted("host_timescale 0"); // pause the mofo + engine->ClientCmd_Unrestricted("host_timescale 0.0001"); // pause the mofo // engine->ClientCmd_Unrestricted("unpause"); // unpause the mofo // engine->ClientCmd_Unrestricted("wait"); engine->ClientCmd_Unrestricted("toggleconsole"); diff --git a/src/game/client/replay/vgui/replayperformanceeditor.cpp b/src/game/client/replay/vgui/replayperformanceeditor.cpp index be274ccb6..3d05fb802 100644 --- 
a/src/game/client/replay/vgui/replayperformanceeditor.cpp +++ b/src/game/client/replay/vgui/replayperformanceeditor.cpp @@ -909,7 +909,7 @@ class CReplayEditorFastForwardButton : public CReplayButton // the user is still holding downt he FF button at the end of the replay. if ( m_pHostTimescale ) { - m_pHostTimescale->SetValue( 1.0f ); + m_pHostTimescale->SetValue( 0.0f ); } // Resume demo playback so that any demo played later won't start paused. diff --git a/src/game/client/tf/c_tf_player.cpp b/src/game/client/tf/c_tf_player.cpp index d43b0c3da..c5396cce3 100644 --- a/src/game/client/tf/c_tf_player.cpp +++ b/src/game/client/tf/c_tf_player.cpp @@ -2922,7 +2922,9 @@ class CProxyAnimatedWeaponSheen : public CBaseAnimatedTextureProxy } C_BaseEntity *pBaseEntity = pRend->GetIClientUnknown()->GetBaseEntity(); - const CEconItemView *pItem = dynamic_cast< CEconItemView* >( pRend ); + CEconItemView *pItem = dynamic_cast< CEconItemView* >( pRend ); + + CEconItemViewDataCacher itemDataCacher(pItem); uint32 unAttrValue = 0; uint32 unEffectValue = 0; @@ -2974,6 +2976,7 @@ class CProxyAnimatedWeaponSheen : public CBaseAnimatedTextureProxy if ( pWearable ) { pItem = pWearable->GetAttributeContainer()->GetItem(); + itemDataCacher.SetItem(pItem); pTFPlayer = ToTFPlayer( pWearable->GetOwnerEntity() ); break; } @@ -2983,6 +2986,7 @@ class CProxyAnimatedWeaponSheen : public CBaseAnimatedTextureProxy if ( pModel->GetOuter() ) { pItem = pModel->GetOuter()->GetAttributeContainer()->GetItem(); + itemDataCacher.SetItem(pItem); pBaseEntity = pBaseEntity->GetOwnerEntity(); if ( pItem ) { @@ -3005,6 +3009,7 @@ class CProxyAnimatedWeaponSheen : public CBaseAnimatedTextureProxy if ( pWeapon ) { pItem = pWeapon->GetAttributeContainer()->GetItem(); + itemDataCacher.SetItem(pItem); pBaseEntity = pWeapon; } bIsFirstPerson = true; @@ -3017,6 +3022,7 @@ class CProxyAnimatedWeaponSheen : public CBaseAnimatedTextureProxy if ( pWeapon ) { pItem = pWeapon->GetAttributeContainer()->GetItem(); + itemDataCacher.SetItem(pItem); pBaseEntity = pWeapon; } } @@ -3025,6 +3031,7 @@ class CProxyAnimatedWeaponSheen : public CBaseAnimatedTextureProxy else { pItem = pWeapon->GetAttributeContainer()->GetItem(); + itemDataCacher.SetItem(pItem); pBaseEntity = pWeapon; pTFPlayer = ToTFPlayer( pWeapon->GetOwner() ); } @@ -3046,6 +3053,7 @@ class CProxyAnimatedWeaponSheen : public CBaseAnimatedTextureProxy if ( pTFPlayer && pTFPlayer->m_Shared.GetDisguiseWeapon() ) { pItem = pTFPlayer->m_Shared.GetDisguiseWeapon()->GetAttributeContainer()->GetItem(); + itemDataCacher.SetItem(pItem); pBaseEntity = pTFPlayer->m_Shared.GetDisguiseWeapon(); } } @@ -3170,11 +3178,13 @@ class CProxyAnimatedWeaponSheen : public CBaseAnimatedTextureProxy int iShaderIndex = sheenParams.m_iShaderIndex; // Australium weapons always use iShaderIndex 1 + pItem->CacheSOCData(); const CEconStyleInfo *pStyle = pItem->GetStaticData()->GetStyleInfo( pItem->GetItemStyle() ); if ( pStyle && !pStyle->IsSelectable() ) { iShaderIndex = 1; } + pItem->UncacheSOCData(); #ifdef STAGING_ONLY if ( tf_sheen_shader_override.GetInt() > 0 ) @@ -3728,6 +3738,8 @@ class CWeaponSkinProxy : public IMaterialProxy if ( !pItem ) return; + CEconItemViewDataCacher dataCacher(pItem); + C_TFPlayer *pOwner = GetOwnerFromProxyEntity( pC_BaseEntity ); int desiredW = m_pBaseTextureOrig->GetActualWidth(); int desiredH = m_pBaseTextureOrig->GetActualHeight(); @@ -7493,8 +7505,8 @@ void C_TFPlayer::UpdateIDTarget() trace_t tr; Vector vecStart, vecEnd; - VectorMA( MainViewOrigin(), MAX_TRACE_LENGTH, 
MainViewForward(), vecEnd ); - VectorMA( MainViewOrigin(), 10, MainViewForward(), vecStart ); + VectorMA( MainViewOrigin(), 8192.0f, MainViewForward(), vecEnd ); + VectorMA( MainViewOrigin(), 10.0f, MainViewForward(), vecStart ); // If we're in observer mode, ignore our observer target. Otherwise, ignore ourselves. if ( IsObserver() ) @@ -7511,7 +7523,11 @@ void C_TFPlayer::UpdateIDTarget() iReviveMedic = 1; } - int nMask = MASK_SOLID | CONTENTS_DEBRIS; + int nMask = MASK_SOLID; + if ( iReviveMedic == 1) + { + nMask |= CONTENTS_DEBRIS; + } UTIL_TraceLine( vecStart, vecEnd, nMask, this, COLLISION_GROUP_NONE, &tr ); } diff --git a/src/game/client/tf/tf_hud_arena_vs_panel.cpp b/src/game/client/tf/tf_hud_arena_vs_panel.cpp index d2df4f395..cee248151 100644 --- a/src/game/client/tf/tf_hud_arena_vs_panel.cpp +++ b/src/game/client/tf/tf_hud_arena_vs_panel.cpp @@ -121,7 +121,7 @@ void CHudArenaVsPanel::FireGameEvent( IGameEvent * event ) if ( m_bVisible ) { - g_pClientMode->GetViewportAnimationController()->StartAnimationSequence( "ArenaVsPanelOnShow" ); + g_pClientMode->GetViewportAnimationController()->StartAnimationSequence( this, "ArenaVsPanelOnShow", true, true ); m_flHideTime = gpGlobals->curtime + 10.0f; diff --git a/src/game/client/tf/tf_hud_escort.cpp b/src/game/client/tf/tf_hud_escort.cpp index 68c4ca740..233fb12b9 100644 --- a/src/game/client/tf/tf_hud_escort.cpp +++ b/src/game/client/tf/tf_hud_escort.cpp @@ -696,7 +696,7 @@ void CTFHudEscort::UpdateAlarmAnimations( void ) void CTFHudEscort::OnTick() { // don't need to do this on non-escort maps (unless we're trying to override the HUD type) - if ( TFGameRules() && ( TFGameRules()->GetGameType() != TF_GAMETYPE_ESCORT ) && ( TFGameRules()->GetHUDType() != TF_HUDTYPE_ESCORT ) ) + if ( !TFGameRules() || ( TFGameRules()->GetGameType() != TF_GAMETYPE_ESCORT ) && ( TFGameRules()->GetHUDType() != TF_HUDTYPE_ESCORT ) ) return; if ( !BaseClass::IsVisible() ) // intentionally skipping our version of IsVisible() to bypass the !m_bHaveValidPointPositions check @@ -831,14 +831,14 @@ void CTFHudEscort::OnTick() if ( flSecondsToRecede > 0.0f && flSecondsToRecede <= TF_ESCORT_RECEDE_COUNTDOWN ) { int iDisplaySeconds = (int)( flSecondsToRecede ) + 1; - m_pEscortItemPanel->SetDialogVariable( "recede", VarArgs( "%d", iDisplaySeconds ) ); + m_pEscortItemPanel->SetDialogVariable( "recede", VarArgs( "%d", iDisplaySeconds ), false ); // we should not be showing the blocked image if we're showing the countdown m_pBlocked->SetVisible( false ); } else { - m_pEscortItemPanel->SetDialogVariable( "recede", "" ); + m_pEscortItemPanel->SetDialogVariable( "recede", "", false ); } // Debug string diff --git a/src/game/client/tf/tf_hud_flagstatus.cpp b/src/game/client/tf/tf_hud_flagstatus.cpp index c851ca162..1cefa8398 100644 --- a/src/game/client/tf/tf_hud_flagstatus.cpp +++ b/src/game/client/tf/tf_hud_flagstatus.cpp @@ -507,7 +507,7 @@ void CTFHudFlagObjectives::ApplySchemeSettings( IScheme *pScheme ) //----------------------------------------------------------------------------- void CTFHudFlagObjectives::Reset() { - g_pClientMode->GetViewportAnimationController()->StartAnimationSequence( "FlagOutlineHide" ); + g_pClientMode->GetViewportAnimationController()->StartAnimationSequence( this, "FlagOutlineHide", true, true ); UpdateStatus(); } @@ -641,37 +641,37 @@ void CTFHudFlagObjectives::OnTick() } // are we playing captures for rounds? 
- if ( !TFGameRules() || ( !TFGameRules()->IsPlayingHybrid_CTF_CP() && !TFGameRules()->IsPlayingSpecialDeliveryMode() && !TFGameRules()->IsMannVsMachineMode() ) ) + if ( TFGameRules() && ( !TFGameRules()->IsPlayingHybrid_CTF_CP() && !TFGameRules()->IsPlayingSpecialDeliveryMode() && !TFGameRules()->IsMannVsMachineMode() ) ) { if ( tf_flag_caps_per_round.GetInt() > 0 ) { C_TFTeam *pTeam = GetGlobalTFTeam( TF_TEAM_BLUE ); if ( pTeam ) { - SetDialogVariable( "bluescore", pTeam->GetFlagCaptures() ); + SetDialogVariable( "bluescore", pTeam->GetFlagCaptures(), false ); } pTeam = GetGlobalTFTeam( TF_TEAM_RED ); if ( pTeam ) { - SetDialogVariable( "redscore", pTeam->GetFlagCaptures() ); + SetDialogVariable( "redscore", pTeam->GetFlagCaptures(), false ); } SetPlayingToLabelVisible( true ); - SetDialogVariable( "rounds", tf_flag_caps_per_round.GetInt() ); + SetDialogVariable( "rounds", tf_flag_caps_per_round.GetInt(), false ); } else // we're just playing straight score { C_TFTeam *pTeam = GetGlobalTFTeam( TF_TEAM_BLUE ); if ( pTeam ) { - SetDialogVariable( "bluescore", pTeam->Get_Score() ); + SetDialogVariable( "bluescore", pTeam->Get_Score(), false ); } pTeam = GetGlobalTFTeam( TF_TEAM_RED ); if ( pTeam ) { - SetDialogVariable( "redscore", pTeam->Get_Score() ); + SetDialogVariable( "redscore", pTeam->Get_Score(), false ); } SetPlayingToLabelVisible( false ); @@ -834,7 +834,7 @@ void CTFHudFlagObjectives::UpdateStatus( C_BasePlayer *pNewOwner /*= NULL*/, C_B if ( !m_bFlagAnimationPlayed ) { m_bFlagAnimationPlayed = true; - g_pClientMode->GetViewportAnimationController()->StartAnimationSequence( "FlagOutline" ); + g_pClientMode->GetViewportAnimationController()->StartAnimationSequence( this, "FlagOutline", true, true ); } if ( m_pCapturePoint && !m_pCapturePoint->IsVisible() ) @@ -864,7 +864,7 @@ void CTFHudFlagObjectives::UpdateStatus( C_BasePlayer *pNewOwner /*= NULL*/, C_B if ( m_bCarryingFlag ) { m_bCarryingFlag = false; - g_pClientMode->GetViewportAnimationController()->StartAnimationSequence( "FlagOutline" ); + g_pClientMode->GetViewportAnimationController()->StartAnimationSequence( this, "FlagOutline", true, true ); } m_bFlagAnimationPlayed = false; diff --git a/src/game/client/tf/tf_hud_itemeffectmeter.cpp b/src/game/client/tf/tf_hud_itemeffectmeter.cpp index 6f9bb2b42..f16d3b2fb 100644 --- a/src/game/client/tf/tf_hud_itemeffectmeter.cpp +++ b/src/game/client/tf/tf_hud_itemeffectmeter.cpp @@ -451,11 +451,11 @@ void CHudItemEffectMeter::Update( C_TFPlayer* pPlayer, const char* pSoundScript { if ( ShowPercentSymbol() ) { - SetDialogVariable( "progresscount", VarArgs( "%d%%", iCount ) ); + SetDialogVariable( "progresscount", VarArgs( "%d%%", iCount ), false ); } else { - SetDialogVariable( "progresscount", iCount ); + SetDialogVariable( "progresscount", iCount, false ); } } diff --git a/src/game/client/tf/tf_hud_mann_vs_machine_status.cpp b/src/game/client/tf/tf_hud_mann_vs_machine_status.cpp index b05c77992..d969ce6a5 100644 --- a/src/game/client/tf/tf_hud_mann_vs_machine_status.cpp +++ b/src/game/client/tf/tf_hud_mann_vs_machine_status.cpp @@ -1123,7 +1123,7 @@ void CInWorldCurrencyStatus::OnTick( void ) char szTmp[16]; Q_snprintf( szTmp, ARRAYSIZE( szTmp ), "$%d", nWorldMoney ); - SetDialogVariable( "currency", szTmp ); + SetDialogVariable( "currency", szTmp, false ); } //----------------------------------------------------------------------------- // Purpose: diff --git a/src/game/client/tf/tf_hud_match_status.cpp b/src/game/client/tf/tf_hud_match_status.cpp index ed0dac2cb..acd7bc892 100644 
--- a/src/game/client/tf/tf_hud_match_status.cpp +++ b/src/game/client/tf/tf_hud_match_status.cpp @@ -41,7 +41,7 @@ static ConVar tf_use_match_hud("tf_use_match_hud", "1", FCVAR_ARCHIVE); //----------------------------------------------------------------------------- bool ShouldUseMatchHUD() { - if ((TFGameRules()->IsMannVsMachineMode())) + if (!TFGameRules() || (TFGameRules()->IsMannVsMachineMode())) return false; return tf_use_match_hud.GetBool(); diff --git a/src/game/client/tf/tf_hud_passtime.cpp b/src/game/client/tf/tf_hud_passtime.cpp index 6181fedba..4b2a0435d 100644 --- a/src/game/client/tf/tf_hud_passtime.cpp +++ b/src/game/client/tf/tf_hud_passtime.cpp @@ -218,8 +218,8 @@ void CTFHudTeamScore::OnTick() tf_passtime_scores_per_round.GetInt() ); } - SetDialogVariable( "bluescore", iBlueScore ); - SetDialogVariable( "redscore", iRedScore ); + SetDialogVariable( "bluescore", iBlueScore, false ); + SetDialogVariable( "redscore", iRedScore, false ); } //----------------------------------------------------------------------------- diff --git a/src/game/client/tf/tf_hud_playerstatus.cpp b/src/game/client/tf/tf_hud_playerstatus.cpp index 26723ce15..806ceb5fd 100644 --- a/src/game/client/tf/tf_hud_playerstatus.cpp +++ b/src/game/client/tf/tf_hud_playerstatus.cpp @@ -866,11 +866,11 @@ void CTFHudPlayerHealth::SetHealth( int iNewHealth, int iMaxHealth, int iMaxBuff // set our health display value if ( m_nHealth > 0 ) { - SetDialogVariable( "Health", m_nHealth ); + SetDialogVariable( "Health", m_nHealth, false ); } else { - SetDialogVariable( "Health", "" ); + SetDialogVariable( "Health", "", false ); } } diff --git a/src/game/client/tf/tf_hud_pve_winpanel.cpp b/src/game/client/tf/tf_hud_pve_winpanel.cpp index a8ab8efc6..3f70a4411 100644 --- a/src/game/client/tf/tf_hud_pve_winpanel.cpp +++ b/src/game/client/tf/tf_hud_pve_winpanel.cpp @@ -167,7 +167,7 @@ void CTFPVEWinPanel::OnTick() // Do this only once if ( bRespecVisible && !m_pRespecBackground->IsVisible() ) { - g_pClientMode->GetViewportAnimationController()->StartAnimationSequence( "RespecEarnedPulseLoss" ); + g_pClientMode->GetViewportAnimationController()->StartAnimationSequence( m_pRespecContainerPanel, "RespecEarnedPulseLoss", true, true ); C_TFPlayer *pLocalTFPlayer = C_TFPlayer::GetLocalTFPlayer(); if ( pLocalTFPlayer ) diff --git a/src/game/client/tf/tf_hud_robot_destruction_status.cpp b/src/game/client/tf/tf_hud_robot_destruction_status.cpp index 115f708d3..4b88ebcf2 100644 --- a/src/game/client/tf/tf_hud_robot_destruction_status.cpp +++ b/src/game/client/tf/tf_hud_robot_destruction_status.cpp @@ -14,6 +14,7 @@ #include "tf_logic_player_destruction.h" #include "c_tf_objective_resource.h" #include "c_func_capture_zone.h" +#include "tf_hud_objectivestatus.h" #define ATTACK_BLINK_TIME 2.f @@ -516,7 +517,8 @@ void CTFHUDRobotDestruction::PerformRobotLayout( RobotVector_t& vecRobots, int n //----------------------------------------------------------------------------- void CTFHUDRobotDestruction::Reset() { - g_pClientMode->GetViewportAnimationController()->StartAnimationSequence( "FlagOutlineHide" ); + CTFHudObjectiveStatus *pStatus = GET_HUDELEMENT( CTFHudObjectiveStatus ); + g_pClientMode->GetViewportAnimationController()->StartAnimationSequence( pStatus, "FlagOutlineHide" ); } //----------------------------------------------------------------------------- @@ -562,8 +564,8 @@ void CTFHUDRobotDestruction::OnTick() if ( !pRoboLogic ) return; - m_pRedScoreValueContainer->SetDialogVariable( "score", pRoboLogic->GetScore( TF_TEAM_RED ) ); 
- m_pBlueScoreValueContainer->SetDialogVariable( "score", pRoboLogic->GetScore( TF_TEAM_BLUE ) ); + m_pRedScoreValueContainer->SetDialogVariable( "score", pRoboLogic->GetScore( TF_TEAM_RED ), false ); + m_pBlueScoreValueContainer->SetDialogVariable( "score", pRoboLogic->GetScore( TF_TEAM_BLUE ), false ); #ifdef STAGING_ONLY if ( rd_hud_test_bars.GetBool() ) @@ -574,8 +576,8 @@ void CTFHUDRobotDestruction::OnTick() m_pBlueProgressBarEscrow->SetProgress( 0.f, true ); m_pRedProgressBarEscrow->SetProgress( 0.f, true ); - m_pRedScoreValueContainer->SetDialogVariable( "score", flProgress ); - m_pBlueScoreValueContainer->SetDialogVariable( "score", flProgress ); + m_pRedScoreValueContainer->SetDialogVariable( "score", flProgress, false ); + m_pBlueScoreValueContainer->SetDialogVariable( "score", flProgress, false ); } else #endif @@ -599,8 +601,8 @@ void CTFHUDRobotDestruction::OnTick() if ( m_pProgressBarsContainer ) { - m_pProgressBarsContainer->SetDialogVariable( "red_escrow", nRedEscrow ); - m_pProgressBarsContainer->SetDialogVariable( "blue_escrow", nBlueEscrow ); + m_pProgressBarsContainer->SetDialogVariable( "red_escrow", nRedEscrow, false ); + m_pProgressBarsContainer->SetDialogVariable( "blue_escrow", nBlueEscrow, false ); } // update the team leader image @@ -778,7 +780,7 @@ void CTFHUDRobotDestruction::OnTick() } SetPlayingToLabelVisible( true ); - SetDialogVariable( "rounds", pRoboLogic->GetMaxPoints() ); + SetDialogVariable( "rounds", pRoboLogic->GetMaxPoints(), false ); // HACK! Fix the events UpdateCarriedFlagStatus( NULL, NULL ); } @@ -865,7 +867,7 @@ void CTFHUDRobotDestruction::UpdateStolenPoints( int nTeam, EditablePanel* pCont } // Show the stolen panels if the stolen score is anything pContainer->SetVisible( nStolenPoints > 0 ); - pContainer->SetDialogVariable( "intelvalue", nStolenPoints ); + pContainer->SetDialogVariable( "intelvalue", nStolenPoints, false ); } // Find our stolen flag @@ -947,7 +949,7 @@ void CTFHUDRobotDestruction::UpdateCarriedFlagStatus( C_BasePlayer *pNewOwner /* if ( pPlayerFlag && !pPlayerFlag->IsMarkedForDeletion() && !pPlayerFlag->IsDormant() ) { m_pCarriedContainer->SetVisible( true ); - m_pCarriedContainer->SetDialogVariable( "flagvalue", pPlayerFlag->GetPointValue() ); + m_pCarriedContainer->SetDialogVariable( "flagvalue", pPlayerFlag->GetPointValue(), false ); // make sure the panels are on, set the initial alpha values, // set the color of the flag we're carrying, and start the animations if ( m_pCarriedImage && !m_pCarriedImage->IsVisible() ) diff --git a/src/game/client/tf/tf_hud_scope.cpp b/src/game/client/tf/tf_hud_scope.cpp index 686c68ff7..e071292df 100644 --- a/src/game/client/tf/tf_hud_scope.cpp +++ b/src/game/client/tf/tf_hud_scope.cpp @@ -242,6 +242,7 @@ class CHudScope : public vgui::Panel, public CHudElement virtual void ApplySchemeSettings(vgui::IScheme *scheme); virtual void Paint( void ); virtual bool ShouldDraw( void ); + virtual bool CanAnimate() const override { return false; }; private: int m_iScopeTexture[4]; diff --git a/src/game/client/tf/tf_hud_target_id.cpp b/src/game/client/tf/tf_hud_target_id.cpp index dd41b296b..8dddb7060 100644 --- a/src/game/client/tf/tf_hud_target_id.cpp +++ b/src/game/client/tf/tf_hud_target_id.cpp @@ -1016,7 +1016,7 @@ void CTargetID::UpdateID( void ) if ( m_pMoveableSubPanel->IsVisible() ) { const char *pBoundKey = engine->Key_LookupBinding( pszActionCommand ); - m_pMoveableSubPanel->SetDialogVariable( "movekey", pBoundKey ); + m_pMoveableSubPanel->SetDialogVariable( "movekey", pBoundKey, false 
); } if ( m_pMoveableIcon ) @@ -1060,7 +1060,7 @@ void CTargetID::UpdateID( void ) m_pTargetNameLabel->SetFgColor( colorName ); // TODO: Support if( hud_centerid.GetInt() == 0 ) - SetDialogVariable( "targetname", sIDString ); + SetDialogVariable( "targetname", sIDString, false ); } else { @@ -1075,7 +1075,7 @@ void CTargetID::UpdateID( void ) m_pTargetDataLabel->SetVisible(true); m_pTargetDataLabel->SetFgColor( colorData ); - SetDialogVariable( "targetdata", sDataString ); + SetDialogVariable( "targetdata", sDataString, false ); } else { diff --git a/src/game/client/tf/tf_hud_tournament.cpp b/src/game/client/tf/tf_hud_tournament.cpp index d34cd5738..6653e29fa 100644 --- a/src/game/client/tf/tf_hud_tournament.cpp +++ b/src/game/client/tf/tf_hud_tournament.cpp @@ -299,15 +299,15 @@ void CHudTournament::PreparePanel( void ) pszLabelText = "Tournament_Instructions_Waiting"; } - SetDialogVariable( "readylabel", g_pVGuiLocalize->Find( pszLabelText ) ); - SetDialogVariable( "tournamentstatelabel", g_pVGuiLocalize->Find( "Tournament_WaitingForTeam" ) ); + SetDialogVariable( "readylabel", g_pVGuiLocalize->Find( pszLabelText ), false ); + SetDialogVariable( "tournamentstatelabel", g_pVGuiLocalize->Find( "Tournament_WaitingForTeam" ), false ); SetPlayerPanelsVisible( true ); m_pModeImage->SetVisible( m_bCompetitiveMode ); } else { - SetDialogVariable( "readylabel", g_pVGuiLocalize->Find( "Tournament_Instructions" ) ); - SetDialogVariable( "tournamentstatelabel", g_pVGuiLocalize->Find( "Tournament_WaitingForTeams" ) ); + SetDialogVariable( "readylabel", g_pVGuiLocalize->Find( "Tournament_Instructions" ), false ); + SetDialogVariable( "tournamentstatelabel", g_pVGuiLocalize->Find( "Tournament_WaitingForTeams" ), false ); SetPlayerPanelsVisible( false ); m_pModeImage->SetVisible( false ); } @@ -333,18 +333,18 @@ void CHudTournament::PreparePanel( void ) if ( pFormatString ) { g_pVGuiLocalize->ConstructString_safe( szCountdown, pFormatString, 1, wzVal ); - SetDialogVariable( "tournamentstatelabel", szCountdown ); + SetDialogVariable( "tournamentstatelabel", szCountdown, false ); } if ( bAutoReady ) { - SetDialogVariable( "readylabel", g_pVGuiLocalize->Find( "" ) ); + SetDialogVariable( "readylabel", g_pVGuiLocalize->Find( "" ), false ); m_pModeImage->SetVisible( false ); SetPlayerPanelsVisible( false ); } else if ( nTime <= TOURNAMENT_NOCANCEL_TIME ) { - SetDialogVariable( "readylabel", g_pVGuiLocalize->Find( "" ) ); + SetDialogVariable( "readylabel", g_pVGuiLocalize->Find( "" ), false ); } else { @@ -352,17 +352,17 @@ void CHudTournament::PreparePanel( void ) { if ( bSteamController ) { - SetDialogVariable( "readylabel", g_pVGuiLocalize->Find( "Tournament_Instructions_Ready_NoKeyHintText" ) ); + SetDialogVariable( "readylabel", g_pVGuiLocalize->Find( "Tournament_Instructions_Ready_NoKeyHintText" ), false ); bShowReadyHintIcon = true; } else { - SetDialogVariable( "readylabel", g_pVGuiLocalize->Find( "Tournament_Instructions_Ready" ) ); + SetDialogVariable( "readylabel", g_pVGuiLocalize->Find( "Tournament_Instructions_Ready" ), false ); } } else { - SetDialogVariable( "readylabel", g_pVGuiLocalize->Find( "" ) ); + SetDialogVariable( "readylabel", g_pVGuiLocalize->Find( "" ), false ); } } @@ -411,13 +411,13 @@ void CHudTournament::PreparePanel( void ) #endif C_TFTeam *pBlueTeam = GetGlobalTFTeam( TF_TEAM_BLUE ); - SetDialogVariable( "bluenamelabel", pBlueTeam ? pBlueTeam->Get_Localized_Name() : L"BLU" ); + SetDialogVariable( "bluenamelabel", pBlueTeam ? 
pBlueTeam->Get_Localized_Name() : L"BLU", false ); C_TFTeam *pRedTeam = GetGlobalTFTeam( TF_TEAM_RED ); - SetDialogVariable( "rednamelabel", pRedTeam ? pRedTeam->Get_Localized_Name() : L"RED" ); + SetDialogVariable( "rednamelabel", pRedTeam ? pRedTeam->Get_Localized_Name() : L"RED", false ); - SetDialogVariable( "bluestate", TFGameRules()->IsTeamReady( TF_TEAM_BLUE ) ? g_pVGuiLocalize->Find( "Tournament_TeamReady" ) : g_pVGuiLocalize->Find( "Tournament_TeamNotReady" ) ); - SetDialogVariable( "redstate", TFGameRules()->IsTeamReady( TF_TEAM_RED ) ? g_pVGuiLocalize->Find( "Tournament_TeamReady" ) : g_pVGuiLocalize->Find( "Tournament_TeamNotReady" ) ); + SetDialogVariable( "bluestate", TFGameRules()->IsTeamReady( TF_TEAM_BLUE ) ? g_pVGuiLocalize->Find( "Tournament_TeamReady" ) : g_pVGuiLocalize->Find( "Tournament_TeamNotReady" ), false ); + SetDialogVariable( "redstate", TFGameRules()->IsTeamReady( TF_TEAM_RED ) ? g_pVGuiLocalize->Find( "Tournament_TeamReady" ) : g_pVGuiLocalize->Find( "Tournament_TeamNotReady" ), false ); if ( m_bTeamReady[TF_TEAM_BLUE] != TFGameRules()->IsTeamReady( TF_TEAM_BLUE ) || m_bTeamReady[TF_TEAM_RED] != TFGameRules()->IsTeamReady( TF_TEAM_RED ) ) { @@ -475,7 +475,7 @@ void CHudTournament::PreparePanel( void ) _snwprintf( szWindConditions, ARRAYSIZE( szWindConditions ), STRING_FMT STRING_FMT, szWindConditions, g_pVGuiLocalize->Find( "Tournament_WinConditionsNone" ) ); } - SetDialogVariable( "winconditions", szWindConditions ); + SetDialogVariable( "winconditions", szWindConditions, false ); } //----------------------------------------------------------------------------- @@ -1199,7 +1199,7 @@ void CHudTournamentSetup::OnTick( void ) m_pNameEntry->SetText( ( iLocalTeam == TF_TEAM_BLUE ) ? mp_tournament_blueteamname.GetString() : mp_tournament_redteamname.GetString() ); } - SetDialogVariable( "tournamentstatelabel", TFGameRules()->IsTeamReady( iLocalTeam ) ? g_pVGuiLocalize->Find( "Tournament_TeamSetupReady" ) : g_pVGuiLocalize->Find( "Tournament_TeamSetupNotReady" ) ); + SetDialogVariable( "tournamentstatelabel", TFGameRules()->IsTeamReady( iLocalTeam ) ? 
g_pVGuiLocalize->Find( "Tournament_TeamSetupReady" ) : g_pVGuiLocalize->Find( "Tournament_TeamSetupNotReady" ), false ); m_flNextThink = gpGlobals->curtime + TOURNAMENT_PANEL_UPDATE_INTERVAL; } @@ -1431,7 +1431,7 @@ void CHudStopWatch::OnTick( void ) m_pStopWatchImage->SetImage( "../hud/ico_time_none" ); - SetDialogVariable( "stopwatchlabel", g_pVGuiLocalize->Find( "Tournament_StopWatchNoCap" ) ); + SetDialogVariable( "stopwatchlabel", g_pVGuiLocalize->Find( "Tournament_StopWatchNoCap" ), false ); } else if ( TFGameRules()->GetStopWatchState() == STOPWATCH_RUNNING ) { @@ -1472,8 +1472,8 @@ void CHudStopWatch::OnTick( void ) pszPoints = g_pVGuiLocalize->Find( "#Tournament_StopWatch_Points" ); } - SetDialogVariable( "pointslabel", pszPoints ); - SetDialogVariable( "scoretobeat", wzScoreVal ); + SetDialogVariable( "pointslabel", pszPoints, false ); + SetDialogVariable( "scoretobeat", wzScoreVal, false ); wchar_t wzHelp[128]; @@ -1486,7 +1486,7 @@ void CHudStopWatch::OnTick( void ) g_pVGuiLocalize->ConstructString_safe( wzHelp, g_pVGuiLocalize->Find( "Tournament_StopWatch_TimeVictoryDefender" ), 1, pDefender->Get_Localized_Name() ); } - SetDialogVariable( "descriptionlabel", wzHelp ); + SetDialogVariable( "descriptionlabel", wzHelp, false ); if ( pTimer && !pTimer->IsWatchingTimeStamps() ) { @@ -1509,7 +1509,7 @@ void CHudStopWatch::OnTick( void ) m_pStopWatchDescriptionBG->SetVisible( false ); m_pStopWatchDescriptionLabel->SetVisible( false ); - SetDialogVariable( "descriptionlabel", g_pVGuiLocalize->Find( "#Tournament_StopWatch_CapVictory" ) ); + SetDialogVariable( "descriptionlabel", g_pVGuiLocalize->Find( "#Tournament_StopWatch_CapVictory" ), false ); m_pStopWatchImage->SetImage( "../hud/ico_time_60" ); @@ -1533,7 +1533,7 @@ void CHudStopWatch::OnTick( void ) g_pVGuiLocalize->ConstructString_safe( wzScoreVal, g_pVGuiLocalize->Find( "Tournament_StopWatchPointCaptureSpectator" ), 2, wzVal, iPoints == 1 ? 
g_pVGuiLocalize->Find( "#Tournament_StopWatch_Point" ) : g_pVGuiLocalize->Find( "#Tournament_StopWatch_Points" ) ); } - SetDialogVariable( "stopwatchlabel", wzScoreVal ); + SetDialogVariable( "stopwatchlabel", wzScoreVal, false ); } } } diff --git a/src/game/client/tf/tf_time_panel.cpp b/src/game/client/tf/tf_time_panel.cpp index 4d57be64c..26dc9b576 100644 --- a/src/game/client/tf/tf_time_panel.cpp +++ b/src/game/client/tf/tf_time_panel.cpp @@ -562,7 +562,7 @@ void CTFHudTimeStatus::SetExtraTimePanels() CheckClockLabelLength( m_pOvertimeLabel, m_pOvertimeBG ); } } - else + else if ( m_pOvertimeLabel->IsVisible() ) { m_pOvertimeBG->SetVisible( false ); m_pOvertimeLabel->SetVisible( false ); diff --git a/src/game/client/tf/vgui/tf_matchmaking_dashboard_next_map_voting.cpp b/src/game/client/tf/vgui/tf_matchmaking_dashboard_next_map_voting.cpp index e8b86acb0..9fb0067ec 100644 --- a/src/game/client/tf/vgui/tf_matchmaking_dashboard_next_map_voting.cpp +++ b/src/game/client/tf/vgui/tf_matchmaking_dashboard_next_map_voting.cpp @@ -225,6 +225,12 @@ class CNextMapVotingDashboardState : public CTFMatchmakingPopup void UpdateVoteCounts() { +#ifndef STAGING_ONLY + if ( !TFGameRules() ) + { + return; + } +#endif int nVotes[ CTFGameRules::EUserNextMapVote::NUM_VOTE_STATES ]; memset( nVotes, 0, sizeof( nVotes ) ); int nTotalVotes = 0; @@ -257,7 +263,7 @@ class CNextMapVotingDashboardState : public CTFMatchmakingPopup if ( pMapChoicePanel ) { // Update the label with the % total - pMapChoicePanel->SetDialogVariable( "votes", CFmtStr( "%3.0f%%", flPercent ) ); + pMapChoicePanel->SetDialogVariable( "votes", CFmtStr( "%3.0f%%", flPercent ), false ); // Do a color change animation if ( g_pClientMode && g_pClientMode->GetViewport() ) { diff --git a/src/game/client/tf/vgui/tf_playermodelpanel.cpp b/src/game/client/tf/vgui/tf_playermodelpanel.cpp index cb49453a5..6f130b9d2 100644 --- a/src/game/client/tf/vgui/tf_playermodelpanel.cpp +++ b/src/game/client/tf/vgui/tf_playermodelpanel.cpp @@ -1349,6 +1349,8 @@ CEconItemView *CTFPlayerModelPanel::GetLoadoutItemFromMDLHandle( loadout_positio if ( ( IsMiscSlot( iLoadoutSlot ) && IsMiscSlot( iPosition ) ) || ( IsValidPickupWeaponSlot( iLoadoutSlot ) && iLoadoutSlot == iPosition ) ) { + // See if we need to cache for our style getters. + CEconItemViewDataCacher dataCacher(pItem->GetStaticData()->GetNumStyles() ? pItem : NULL); const char * pDisplayModel = pItem->GetPlayerDisplayModel( m_iCurrentClassIndex, m_iTeam ); if ( pDisplayModel ) { @@ -1492,6 +1494,8 @@ bool CTFPlayerModelPanel::UpdateCosmeticParticles( if ( m_aParticleSystems[ iSystem ] && m_aParticleSystems[ iSystem ]->m_bIsUpdateToDate ) return false; + CEconItemViewDataCacher dataCacher(pEconItem); + attachedparticlesystem_t *pParticleSystem = NULL; // do community_sparkle effect if this is a community item? 
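Most of the HUD hunks above add a trailing false argument to SetDialogVariable. The vgui side of that change (EditablePanel.cpp/.h, Panel.h and Label.cpp in the diffstat) is not included in this excerpt, so the exact skip logic is an assumption; the call-site intent is that code rewriting the same value every tick should not force the label resolve/repaint path each time. A minimal caller-side sketch of the pattern follows, with a hypothetical helper that is not code from the patch; the CTrainingDialog overrides in the next diff simply forward the new parameter to their container panel.

#include <vgui_controls/EditablePanel.h>

// Illustrative helper, not part of the patch: shows the per-tick call pattern the
// new bForceUpdate parameter is aimed at.
void UpdateScoreReadout( vgui::EditablePanel *pPanel, int nScore )
{
	if ( !pPanel )
		return;

	// Old pattern: every tick pushes the dialog variable (and the labels bound to it)
	// through the forced update path, even when nScore is identical to last tick.
	//pPanel->SetDialogVariable( "score", nScore );

	// Patched pattern: bForceUpdate = false, so rewriting an unchanged value is cheap.
	// One-shot writes elsewhere keep the default of true.
	pPanel->SetDialogVariable( "score", nScore, false );
}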
diff --git a/src/game/client/tf/vgui/tf_training_ui.cpp b/src/game/client/tf/vgui/tf_training_ui.cpp index 808c4236d..316520e6f 100644 --- a/src/game/client/tf/vgui/tf_training_ui.cpp +++ b/src/game/client/tf/vgui/tf_training_ui.cpp @@ -1765,24 +1765,24 @@ class CTrainingDialog : public EditablePanel ivgui()->RemoveTickSignal( GetVPanel() ); } - virtual void SetDialogVariable( const char *pVarName, const char *pValue ) + virtual void SetDialogVariable( const char *pVarName, const char *pValue, bool bForceUpdate = true ) { - m_pContainer->SetDialogVariable( pVarName, pValue ); + m_pContainer->SetDialogVariable( pVarName, pValue, bForceUpdate ); } - virtual void SetDialogVariable( const char *pVarName, const wchar_t *pValue ) + virtual void SetDialogVariable( const char *pVarName, const wchar_t *pValue, bool bForceUpdate = true ) { - m_pContainer->SetDialogVariable( pVarName, pValue ); + m_pContainer->SetDialogVariable( pVarName, pValue, bForceUpdate ); } - virtual void SetDialogVariable( const char *pVarName, int nValue ) + virtual void SetDialogVariable( const char *pVarName, int nValue, bool bForceUpdate = true ) { - m_pContainer->SetDialogVariable( pVarName, nValue ); + m_pContainer->SetDialogVariable( pVarName, nValue, bForceUpdate ); } - virtual void SetDialogVariable( const char *pVarName, float flValue ) + virtual void SetDialogVariable( const char *pVarName, float flValue, bool bForceUpdate = true ) { - m_pContainer->SetDialogVariable( pVarName, flValue ); + m_pContainer->SetDialogVariable( pVarName, flValue, bForceUpdate ); } void SetupButton( const char *pPanelName, CExButton **ppOut = NULL ) diff --git a/src/game/client/viewrender.cpp b/src/game/client/viewrender.cpp index 3bc622857..72f25164c 100644 --- a/src/game/client/viewrender.cpp +++ b/src/game/client/viewrender.cpp @@ -3877,7 +3877,7 @@ static void DrawOpaqueRenderables_DrawStaticProps( CClientRenderablesList::CEntr render->SetColorModulation( one ); render->SetBlend( 1.0f ); - const int MAX_STATICS_PER_BATCH = 512; + const int MAX_STATICS_PER_BATCH = 2048; IClientRenderable *pStatics[ MAX_STATICS_PER_BATCH ]; int numScheduled = 0, numAvailable = MAX_STATICS_PER_BATCH; diff --git a/src/game/shared/econ/econ_item_inventory.cpp b/src/game/shared/econ/econ_item_inventory.cpp index 1bc2b2b93..e2996a1c7 100644 --- a/src/game/shared/econ/econ_item_inventory.cpp +++ b/src/game/shared/econ/econ_item_inventory.cpp @@ -296,6 +296,12 @@ void CInventoryManager::OnPersonaStateChanged( PersonaStateChange_t *info ) //----------------------------------------------------------------------------- bool CInventoryManager::Init( void ) { +#ifdef GAME_DLL + if ( engine->IsDedicatedServer() ) +#endif + { + InitializeInventory(); + } return true; } @@ -304,12 +310,6 @@ bool CInventoryManager::Init( void ) //----------------------------------------------------------------------------- void CInventoryManager::PostInit( void ) { -#ifdef GAME_DLL - if ( engine->IsDedicatedServer() ) -#endif - { - InitializeInventory(); - } } void CInventoryManager::InitializeInventory() diff --git a/src/game/shared/econ/econ_item_schema.cpp b/src/game/shared/econ/econ_item_schema.cpp index 1093c5c2a..643a349a1 100644 --- a/src/game/shared/econ/econ_item_schema.cpp +++ b/src/game/shared/econ/econ_item_schema.cpp @@ -830,27 +830,29 @@ bool CEconItemPaintKitDefinition::BInitFromKV( KeyValues *pKVPItemPaintKit, CUtl SCHEMA_INIT_CHECK( m_pszLocalizedName != NULL, "Paint Kit %s: PaintKit contains no localized name", m_pszName ); + pKVPItemPaintKit = 
pKVPItemPaintKit->MakeCopy(); + KeyValues *pKVWearInputItems = NULL; pKVWearInputItems = pKVPItemPaintKit->FindKey( "wear_level_1", false ); SCHEMA_INIT_CHECK( VerifyPaintKitComposite( pKVWearInputItems, m_pszName, 1, pVecErrors ), "Could Not Create Weapon Skin Compositor for [%s][Wear %d]", m_pszName, 1 ); - m_vecPaintKitWearKVP.AddToTail( pKVWearInputItems->MakeCopy() ); + m_vecPaintKitWearKVP.AddToTail( pKVWearInputItems ); pKVWearInputItems = pKVPItemPaintKit->FindKey( "wear_level_2", false ); SCHEMA_INIT_CHECK( VerifyPaintKitComposite( pKVWearInputItems, m_pszName, 2, pVecErrors ), "Could Not Create Weapon Skin Compositor for [%s][Wear %d]", m_pszName, 2 ); - m_vecPaintKitWearKVP.AddToTail( pKVWearInputItems->MakeCopy() ); + m_vecPaintKitWearKVP.AddToTail( pKVWearInputItems ); pKVWearInputItems = pKVPItemPaintKit->FindKey( "wear_level_3", false ); SCHEMA_INIT_CHECK( VerifyPaintKitComposite( pKVWearInputItems, m_pszName, 3, pVecErrors ), "Could Not Create Weapon Skin Compositor for [%s][Wear %d]", m_pszName, 3 ); - m_vecPaintKitWearKVP.AddToTail( pKVWearInputItems->MakeCopy() ); + m_vecPaintKitWearKVP.AddToTail( pKVWearInputItems ); pKVWearInputItems = pKVPItemPaintKit->FindKey( "wear_level_4", false ); SCHEMA_INIT_CHECK( VerifyPaintKitComposite( pKVWearInputItems, m_pszName, 4, pVecErrors ), "Could Not Create Weapon Skin Compositor for [%s][Wear %d]", m_pszName, 4 ); - m_vecPaintKitWearKVP.AddToTail( pKVWearInputItems->MakeCopy() ); + m_vecPaintKitWearKVP.AddToTail( pKVWearInputItems ); pKVWearInputItems = pKVPItemPaintKit->FindKey( "wear_level_5", false ); SCHEMA_INIT_CHECK( VerifyPaintKitComposite( pKVWearInputItems, m_pszName, 5, pVecErrors ), "Could Not Create Weapon Skin Compositor for [%s][Wear %d]", m_pszName, 5 ); - m_vecPaintKitWearKVP.AddToTail( pKVWearInputItems->MakeCopy() ); + m_vecPaintKitWearKVP.AddToTail( pKVWearInputItems ); return SCHEMA_INIT_SUCCESS(); } diff --git a/src/game/shared/econ/econ_item_system.cpp b/src/game/shared/econ/econ_item_system.cpp index aa6478abf..01ba43d4c 100644 --- a/src/game/shared/econ/econ_item_system.cpp +++ b/src/game/shared/econ/econ_item_system.cpp @@ -520,11 +520,12 @@ class CGCUpdateItemSchema : public GCSDK::CGCClientJob // Check if we're already up-to-date m_nExpectedVersion = msg.Body().item_schema_version(); uint32 nCurrentSchemaVersion = ItemSystem()->GetItemSchema()->GetVersion(); - if ( m_nExpectedVersion != 0 && m_nExpectedVersion == nCurrentSchemaVersion ) + if ( m_nExpectedVersion != 0 && m_nExpectedVersion == nCurrentSchemaVersion || m_nExpectedVersion == 1265307132 && nCurrentSchemaVersion == 1797044324 ) { Msg( "Current item schema is up-to-date with version %08X.\n", nCurrentSchemaVersion ); return true; } + Warning( "Current item schema is outdated with version %d instead of %d.\n", nCurrentSchemaVersion, m_nExpectedVersion ); m_sSignature = msg.Body().signature(); diff --git a/src/game/shared/econ/econ_item_view.cpp b/src/game/shared/econ/econ_item_view.cpp index caf16d067..bb9be0c3e 100644 --- a/src/game/shared/econ/econ_item_view.cpp +++ b/src/game/shared/econ/econ_item_view.cpp @@ -844,6 +844,9 @@ CEconItem *CEconItemView::GetSOCData( void ) const if ( m_pNonSOEconItem ) return m_pNonSOEconItem; + if (m_pSOCDataCache) + return m_pSOCDataCache; + #ifdef CLIENT_DLL // We need to find the inventory that contains this item. If we're not connected // to a server, and the owner is the same as the local player, use the local inventory. 
diff --git a/src/game/shared/econ/econ_item_view.h b/src/game/shared/econ/econ_item_view.h index 5ccf8253d..1beef8e4a 100644 --- a/src/game/shared/econ/econ_item_view.h +++ b/src/game/shared/econ/econ_item_view.h @@ -358,6 +358,9 @@ class CEconItemView : public CMaterialOverrideContainer< IEconItemInterface > inline int GetTeamNumber() const { return m_iTeamNumber; } inline void SetTeamNumber( int iTeamNumber ) { m_iTeamNumber = iTeamNumber; } + void CacheSOCData() { if (!m_pSOCDataCache) m_pSOCDataCache = GetSOCData(); } + void UncacheSOCData() { m_pSOCDataCache = NULL; } + protected: // Index of the item definition in the item script file. CNetworkVar( item_definition_index_t, m_iItemDefinitionIndex ); @@ -395,6 +398,9 @@ class CEconItemView : public CMaterialOverrideContainer< IEconItemInterface > eEconItemOrigin m_unOverrideOrigin; #endif + // Can set this temporarily while calling several attribute getters to avoid looking up each time + CEconItem *m_pSOCDataCache = NULL; + bool m_bColorInit; bool m_bPaintOverrideInit; bool m_bHasPaintOverride; @@ -452,4 +458,34 @@ bool DoesItemPassSearchFilter( const class IEconItemDescription *pDescription, c CBasePlayer *GetPlayerByAccountID( uint32 unAccountID ); #endif // CLIENT_DLL +/** There are some function calls which repeatedly call out to our underlying item, lets cache beforehand. */ +class CEconItemViewDataCacher +{ +public: + CEconItemViewDataCacher(CEconItemView* pItem) : m_pItem(pItem) + { + if (!m_pItem) return; + pItem->CacheSOCData(); + } + + ~CEconItemViewDataCacher() + { + if (!m_pItem) return; + m_pItem->UncacheSOCData(); + } + + void SetItem(CEconItemView* pItem) + { + if (pItem == m_pItem) return; + if (!pItem) return; + if (m_pItem) m_pItem->UncacheSOCData(); + m_pItem = pItem; + m_pItem->CacheSOCData(); + } + +private: + + CEconItemView* m_pItem; +}; + #endif // ECON_ITEM_CONSTANTS_H diff --git a/src/game/shared/teamplay_round_timer.cpp b/src/game/shared/teamplay_round_timer.cpp index f451a296b..047f9d9f3 100644 --- a/src/game/shared/teamplay_round_timer.cpp +++ b/src/game/shared/teamplay_round_timer.cpp @@ -97,10 +97,13 @@ static void RecvProxy_TimerPaused( const CRecvProxyData *pData, void *pStruct, v bool bTimerPaused = ( pData->m_Value.m_Int > 0 ); + // UNDONE: Unused HUD animation +#if 0 if ( bTimerPaused == false ) { g_pClientMode->GetViewportAnimationController()->StartAnimationSequence( "TimerFlash" ); } +#endif if ( pTimer ) { diff --git a/src/game/shared/teamplayroundbased_gamerules.cpp b/src/game/shared/teamplayroundbased_gamerules.cpp index fb8d63418..f8fb8d373 100644 --- a/src/game/shared/teamplayroundbased_gamerules.cpp +++ b/src/game/shared/teamplayroundbased_gamerules.cpp @@ -806,7 +806,7 @@ void CTeamplayRoundBasedRules::GoToIntermission( void ) { if ( IsInTournamentMode() == true #ifdef TF_DLL - && TFGameRules() && !TFGameRules()->IsMannVsMachineMode() + && TFGameRules() && TFGameRules()->IsMannVsMachineMode() #endif ) return; diff --git a/src/game/shared/tf/tf_gamerules.h b/src/game/shared/tf/tf_gamerules.h index 4a9678eeb..284930287 100644 --- a/src/game/shared/tf/tf_gamerules.h +++ b/src/game/shared/tf/tf_gamerules.h @@ -1466,7 +1466,7 @@ inline float CTFGameRules::ItemTesting_GetBotAnimSpeed( void ) pHostTimescale = cvar->FindVar( "host_timescale" ); } - if ( pHostTimescale ) + if ( pHostTimescale && pHostTimescale->GetFloat() > 0 ) return (m_flItemTesting_BotAnimSpeed * pHostTimescale->GetFloat()); return m_flItemTesting_BotAnimSpeed; } diff --git a/src/game/shared/tf/tf_viewmodel.cpp 
b/src/game/shared/tf/tf_viewmodel.cpp index bcd5dcbda..2111d5a46 100644 --- a/src/game/shared/tf/tf_viewmodel.cpp +++ b/src/game/shared/tf/tf_viewmodel.cpp @@ -338,6 +338,7 @@ int CTFViewModel::GetSkin() CEconItemView *pItem = pWeapon->GetAttributeContainer()->GetItem(); if ( pItem->IsValid() ) { + CEconItemViewDataCacher dataCacher(pItem); iItemSkin = pItem->GetSkin( pPlayer->GetTeamNumber(), true ); } @@ -490,10 +491,11 @@ void CInvisProxy::OnBind( C_BaseEntity *pC_BaseEntity ) C_BaseEntity *pEnt = pC_BaseEntity; + // TODO: causes crashes if ( pEnt != pCachedEntity ) { pPlayer = NULL; - pCachedEntity = pEnt; + //pCachedEntity = pEnt; } if ( !pPlayer ) diff --git a/src/game/shared/tf/tf_weapon_sniperrifle.cpp b/src/game/shared/tf/tf_weapon_sniperrifle.cpp index 40f095f86..1c50cbb8e 100644 --- a/src/game/shared/tf/tf_weapon_sniperrifle.cpp +++ b/src/game/shared/tf/tf_weapon_sniperrifle.cpp @@ -523,19 +523,19 @@ void CTFSniperRifle::ZoomOutIn( void ) ZoomOut(); CTFPlayer *pPlayer = GetTFPlayerOwner(); + float flRezoomDelay = 0.9f; + if ( !UsesClipsForAmmo1() ) + { + // Since sniper rifles don't actually use clips the fast reload hook also affects unzoom and zoom delays + ApplyScopeSpeedModifications( flRezoomDelay ); + } if ( pPlayer && pPlayer->ShouldAutoRezoom() ) { - float flRezoomDelay = 0.9f; - if ( !UsesClipsForAmmo1() ) - { - // Since sniper rifles don't actually use clips the fast reload hook also affects unzoom and zoom delays - ApplyScopeSpeedModifications( flRezoomDelay ); - } m_flRezoomTime = gpGlobals->curtime + flRezoomDelay; } else { - m_flNextSecondaryAttack = gpGlobals->curtime + 1.0f; + m_flNextSecondaryAttack = gpGlobals->curtime + flRezoomDelay + 0.1f; } } diff --git a/src/gameui/GameUI_Interface.cpp b/src/gameui/GameUI_Interface.cpp index 01c7b39bb..1d699d840 100644 --- a/src/gameui/GameUI_Interface.cpp +++ b/src/gameui/GameUI_Interface.cpp @@ -170,8 +170,12 @@ void CGameUI::Initialize( CreateInterfaceFn factory ) steamapicontext->Init(); +#if defined( _X360 ) || defined( STAGING_ONLY ) ConVarRef var( "gameui_xbox" ); m_bIsConsoleUI = var.IsValid() && var.GetBool(); +#else + m_bIsConsoleUI = false; +#endif vgui::VGui_InitInterfacesList( "GameUI", &factory, 1 ); vgui::VGui_InitMatSysInterfacesList( "GameUI", &factory, 1 ); diff --git a/src/inputsystem/inputsystem.cpp b/src/inputsystem/inputsystem.cpp index 5f11ac7d3..3106a5a63 100644 --- a/src/inputsystem/inputsystem.cpp +++ b/src/inputsystem/inputsystem.cpp @@ -271,7 +271,7 @@ void CInputSystem::SleepUntilInput( int nMaxSleepTimeMS ) #else #warning "need a SleepUntilInput impl" #endif -} + } diff --git a/src/mathlib/mathlib_base.cpp b/src/mathlib/mathlib_base.cpp index 4fe9d8fae..14fad005d 100644 --- a/src/mathlib/mathlib_base.cpp +++ b/src/mathlib/mathlib_base.cpp @@ -838,7 +838,7 @@ void AngleVectors( const QAngle &angles, Vector *forward, Vector *right, Vector float sr, sp, sy, cr, cp, cy; -#ifdef _X360 +#if defined(_X360) || USE_DXMATH fltx4 radians, scale, sine, cosine; radians = LoadUnaligned3SIMD( angles.Base() ); scale = ReplicateX4( M_PI_F / 180.f ); @@ -1984,7 +1984,7 @@ void AngleQuaternion( const QAngle &angles, Quaternion &outQuat ) float sr, sp, sy, cr, cp, cy; -#if defined(_X360) +#if defined(_X360) || USE_DXMATH fltx4 radians, scale, sine, cosine; radians = LoadUnaligned3SIMD( angles.Base() ); scale = ReplicateX4( 0.5f * M_PI_F / 180.f ); diff --git a/src/public/bone_setup.cpp b/src/public/bone_setup.cpp index 27ab95dcf..c2530fc25 100644 --- a/src/public/bone_setup.cpp +++ b/src/public/bone_setup.cpp @@ 
-3118,14 +3118,14 @@ class CIKSolver X[i] = P[i]; normalize(X); -// Its y axis is perpendicular to P, so Y = unit( E - X(E�X) ). +// Its y axis is perpendicular to P, so Y = unit( E - X(E⋅X) ). float dDOTx = dot(D,X); for (i = 0 ; i < 3 ; i++) Y[i] = D[i] - dDOTx * X[i]; normalize(Y); -// Its z axis is perpendicular to both X and Y, so Z = X�Y. +// Its z axis is perpendicular to both X and Y, so Z = X⋅Y. cross(X,Y,Z); diff --git a/src/public/collisionutils.cpp b/src/public/collisionutils.cpp index 2549a569f..1c648357e 100644 --- a/src/public/collisionutils.cpp +++ b/src/public/collisionutils.cpp @@ -543,7 +543,7 @@ bool IsPointInBox( const Vector& pt, const Vector& boxMin, const Vector& boxMax Assert( boxMin[2] <= boxMax[2] ); // on x360, force use of SIMD version. - if (IsX360()) + if (IsX360() || 1) { return IsPointInBox( LoadUnaligned3SIMD(pt.Base()), LoadUnaligned3SIMD(boxMin.Base()), LoadUnaligned3SIMD(boxMax.Base()) ) ; } @@ -893,8 +893,8 @@ bool FASTCALL IsBoxIntersectingRay( const Vector& boxMin, const Vector& boxMax, bool FASTCALL IsBoxIntersectingRay( const Vector& vecBoxMin, const Vector& vecBoxMax, const Ray_t& ray, float flTolerance ) { // On the x360, we force use of the SIMD functions. -#if defined(_X360) - if (IsX360()) +#if defined(_X360) || 1 + if (IsX360() || 1) { return IsBoxIntersectingRay( LoadUnaligned3SIMD(vecBoxMin.Base()), LoadUnaligned3SIMD(vecBoxMax.Base()), @@ -927,7 +927,7 @@ bool FASTCALL IsBoxIntersectingRay( const Vector& vecBoxMin, const Vector& vecBo //----------------------------------------------------------------------------- -#ifdef _X360 +#if defined(_X360) || 1 bool FASTCALL IsBoxIntersectingRay( fltx4 boxMin, fltx4 boxMax, fltx4 origin, fltx4 delta, fltx4 invDelta, // ray parameters fltx4 vTolerance ///< eg from ReplicateX4(flTolerance) @@ -943,7 +943,7 @@ bool FASTCALL IsBoxIntersectingRay( const fltx4 &inBoxMin, const fltx4 & inBoxMa // compute the mins/maxs of the box expanded by the ray extents // relocate the problem so that the ray start is at the origin. -#ifdef _X360 +#if defined(_X360) || 1 boxMin = SubSIMD(boxMin, origin); boxMax = SubSIMD(boxMax, origin); #else diff --git a/src/public/collisionutils.h b/src/public/collisionutils.h index 50f87046b..d511287ca 100644 --- a/src/public/collisionutils.h +++ b/src/public/collisionutils.h @@ -222,9 +222,7 @@ bool IsBoxIntersectingBoxExtents( const Vector& boxCenter1, const Vector& boxHal const Vector& boxCenter2, const Vector& boxHalfDiagonal2 ); -#ifdef _X360 -// inline version: -#include "mathlib/ssemath.h" +#if defined(_X360) || USE_DXMATH inline bool IsBoxIntersectingBoxExtents( const fltx4 boxCenter1, const fltx4 boxHalfDiagonal1, const fltx4 boxCenter2, const fltx4 boxHalfDiagonal2 ); #endif @@ -259,9 +257,10 @@ bool FASTCALL IsBoxIntersectingRay( const Vector& boxMin, const Vector& boxMax, const Vector& invDelta, float flTolerance = 0.0f ); +// UNDONE: with SSE2 on PC, we now can. // On the PC, we can't pass fltx4's in registers like this. On the x360, it is // much better if we do. 
-#ifdef _X360 +#if defined(_X360) || 1 bool FASTCALL IsBoxIntersectingRay( fltx4 boxMin, fltx4 boxMax, fltx4 origin, fltx4 delta, fltx4 invDelta, // ray parameters fltx4 vTolerance = LoadZeroSIMD() ///< eg from ReplicateX4(flTolerance) @@ -428,7 +427,23 @@ bool RayHasFullyContainedIntersectionWithQuad( const Ray_t &ray, //----------------------------------------------------------------------------- -#ifdef _X360 +#if USE_DXMATH +inline bool IsBoxIntersectingBoxExtents( const fltx4 boxCenter1, const fltx4 boxHalfDiagonal1, + const fltx4 boxCenter2, const fltx4 boxHalfDiagonal2 ) +{ + fltx4 vecDelta, vecSize; + + vecDelta = SubSIMD(boxCenter1, boxCenter2); + vecSize = AddSIMD(boxHalfDiagonal1, boxHalfDiagonal2); + + uint condition; + DirectX::XMVectorInBoundsR(&condition, vecDelta, vecSize); + // we want the top three words to be all 1's ; that means in bounds + + + return DirectX::XMComparisonAllInBounds( condition ); +} +#elif defined(_X360) inline bool IsBoxIntersectingBoxExtents( const fltx4 boxCenter1, const fltx4 boxHalfDiagonal1, const fltx4 boxCenter2, const fltx4 boxHalfDiagonal2 ) { diff --git a/src/public/mathlib/ssemath.h b/src/public/mathlib/ssemath.h index 3bcda408a..365de17e5 100644 --- a/src/public/mathlib/ssemath.h +++ b/src/public/mathlib/ssemath.h @@ -2057,7 +2057,8 @@ FORCEINLINE fltx4 SinSIMD( const fltx4 &radians ) FORCEINLINE void SinCos3SIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians ) { #if USE_DXMATH - DirectX::XMVectorSinCos( &sine, &cosine, radians ); + //DirectX::XMVectorSinCos( &sine, &cosine, radians ); + sincos_ps(radians, &sine, &cosine); #else // FIXME: Make a fast SSE version SinCos( SubFloat( radians, 0 ), &SubFloat( sine, 0 ), &SubFloat( cosine, 0 ) ); diff --git a/src/public/tier1/strtools.h b/src/public/tier1/strtools.h index e94d9cedf..d430c032b 100644 --- a/src/public/tier1/strtools.h +++ b/src/public/tier1/strtools.h @@ -253,7 +253,34 @@ inline bool V_islower(char c) { return islower( (unsigned char)c ) != 0; } inline bool V_iscntrl(char c) { return iscntrl( (unsigned char)c ) != 0; } //#undef iscntrl //#define iscntrl use_V_iscntrl_instead_of_iscntrl -inline bool V_isspace(char c) { return isspace( (unsigned char)c ) != 0; } +inline bool V_isspace(int c) +{ + // The standard white-space characters are the following: space, tab, carriage-return, newline, vertical tab, and form-feed. In the C locale, V_isspace() returns true only for the standard white-space characters. 
+ //return c == ' ' || c == 9 /*horizontal tab*/ || c == '\r' || c == '\n' || c == 11 /*vertical tab*/ || c == '\f'; + // codes of whitespace symbols: 9 HT, 10 \n, 11 VT, 12 form feed, 13 \r, 32 space + + // easy to understand version, validated: + // return ((1 << (c-1)) & 0x80001F00) != 0 && ((c-1)&0xE0) == 0; + + // 5% faster on Core i7, 35% faster on Xbox360, no branches, validated: + #ifdef _X360 + return ((1 << (c-1)) & 0x80001F00 & ~(-int((c-1)&0xE0))) != 0; + #else + // this is 11% faster on Core i7 than the previous, VC2005 compiler generates a seemingly unbalanced search tree that's faster + switch(c) + { + case ' ': + case 9: + case '\r': + case '\n': + case 11: + case '\f': + return true; + default: + return false; + } + #endif +} //#undef isspace //#define isspace use_V_isspace_instead_of_isspace diff --git a/src/public/vgui_controls/AnimationController.h b/src/public/vgui_controls/AnimationController.h index 63f29f41b..4836055e3 100644 --- a/src/public/vgui_controls/AnimationController.h +++ b/src/public/vgui_controls/AnimationController.h @@ -50,7 +50,7 @@ class AnimationController : public Panel // starts an animation sequence script bool StartAnimationSequence(const char *sequenceName, bool bCanBeCancelled = true ); - bool StartAnimationSequence(Panel *pWithinParent, const char *sequenceName, bool bCanBeCancelled = true ); + bool StartAnimationSequence(Panel *pWithinParent, const char *sequenceName, bool bCanBeCancelled = true, bool bIncludeParent = false ); bool StopAnimationSequence( Panel *pWithinParent, const char *sequenceName ); void CancelAnimationsForPanel( Panel *pWithinParent ); @@ -241,14 +241,14 @@ class AnimationController : public Panel CUtlVector m_ScriptFileNames; // runs a single line of the script - void ExecAnimationCommand(UtlSymId_t seqName, AnimCommand_t &animCommand, Panel *pWithinParent, bool bCanBeCancelled); + void ExecAnimationCommand(UtlSymId_t seqName, AnimCommand_t &animCommand, Panel *pWithinParent, bool bCanBeCancelled, bool bIncludeParent = false); // removes all commands belonging to a script void RemoveQueuedAnimationCommands(UtlSymId_t seqName, vgui::Panel *panel = NULL); // removes an existing instance of a command void RemoveQueuedAnimationByType(vgui::Panel *panel, UtlSymId_t variable, UtlSymId_t sequenceToIgnore); // handlers - void StartCmd_Animate(UtlSymId_t seqName, AnimCmdAnimate_t &cmd, Panel *pWithinParent, bool bCanBeCancelled); + void StartCmd_Animate(UtlSymId_t seqName, AnimCmdAnimate_t &cmd, Panel *pWithinParent, bool bCanBeCancelled, bool bIncludeParent = false); void StartCmd_Animate(Panel *panel, UtlSymId_t seqName, AnimCmdAnimate_t &cmd, bool bCanBeCancelled); void RunCmd_RunEvent(PostedMessage_t &msg); void RunCmd_StopEvent(PostedMessage_t &msg); diff --git a/src/public/vgui_controls/EditablePanel.h b/src/public/vgui_controls/EditablePanel.h index ea1f7248d..c8ee840ce 100644 --- a/src/public/vgui_controls/EditablePanel.h +++ b/src/public/vgui_controls/EditablePanel.h @@ -74,10 +74,10 @@ class EditablePanel : public Panel // localization variables (used in constructing UI strings) // after the variable is set, causes all the necessary sub-panels to update - virtual void SetDialogVariable(const char *varName, const char *value); - virtual void SetDialogVariable(const char *varName, const wchar_t *value); - virtual void SetDialogVariable(const char *varName, int value); - virtual void SetDialogVariable(const char *varName, float value); + virtual void SetDialogVariable(const char *varName, const char *value, bool 
bForceUpdate = true); + virtual void SetDialogVariable(const char *varName, const wchar_t *value, bool bForceUpdate = true); + virtual void SetDialogVariable(const char *varName, int value, bool bForceUpdate = true); + virtual void SetDialogVariable(const char *varName, float value, bool bForceUpdate = true); // Focus handling // Delegate focus to a sub panel diff --git a/src/public/vgui_controls/Label.h b/src/public/vgui_controls/Label.h index 53422f764..03d37854e 100644 --- a/src/public/vgui_controls/Label.h +++ b/src/public/vgui_controls/Label.h @@ -196,6 +196,8 @@ class Label : public Panel short width; }; CUtlVector _imageDar; + bool _isSimpleTextImage = false; + TImageInfo *_cachedSimpleTextImage; int _textInset[2]; Color _disabledFgColor1; diff --git a/src/public/vgui_controls/Panel.h b/src/public/vgui_controls/Panel.h index 5d1abd1f7..12422c81b 100644 --- a/src/public/vgui_controls/Panel.h +++ b/src/public/vgui_controls/Panel.h @@ -344,6 +344,8 @@ class Panel : public IClientPanel, virtual public IForceVirtualInheritancePanel bool IsRightAligned(); // returns true if the settings are aligned to the right of the screen bool IsBottomAligned(); // returns true if the settings are aligned to the bottom of the screen + virtual bool CanAnimate() const { return true; } + // scheme access functions virtual HScheme GetScheme(); virtual void SetScheme(const char *tag); diff --git a/src/thirdparty/DirectXMath-dec2022/Inc/DirectXMath.h b/src/thirdparty/DirectXMath-dec2022/Inc/DirectXMath.h index fd542388f..5214a7f1f 100644 --- a/src/thirdparty/DirectXMath-dec2022/Inc/DirectXMath.h +++ b/src/thirdparty/DirectXMath-dec2022/Inc/DirectXMath.h @@ -15,10 +15,6 @@ #define DIRECTX_MATH_VERSION 318 -#if defined(_MSC_VER) && (_MSC_VER < 1910) -#error DirectX Math requires Visual C++ 2017 or later. 
-#endif - #if defined(_MSC_VER) && !defined(_M_ARM) && !defined(_M_ARM64) && !defined(_M_HYBRID_X86_ARM64) && !defined(_M_ARM64EC) && (!_MANAGED) && (!_M_CEE) && (!defined(_M_IX86_FP) || (_M_IX86_FP > 1)) && !defined(_XM_NO_INTRINSICS_) && !defined(_XM_VECTORCALL_) #define _XM_VECTORCALL_ 1 #endif diff --git a/src/thirdparty/quickhull/quickhull.vpc b/src/thirdparty/quickhull/quickhull.vpc index 4935a5d5f..cd6fc11eb 100644 --- a/src/thirdparty/quickhull/quickhull.vpc +++ b/src/thirdparty/quickhull/quickhull.vpc @@ -19,7 +19,7 @@ $Configuration } $Compiler [$WINDOWS] { - $EnableEnhancedInstructionSet "Streaming SIMD Extensions (/arch:SSE)" + $EnableEnhancedInstructionSet "Streaming SIMD Extensions 2 (/arch:SSE2)" } } diff --git a/src/tier1/utlbuffer.cpp b/src/tier1/utlbuffer.cpp index ff03da068..be3768137 100644 --- a/src/tier1/utlbuffer.cpp +++ b/src/tier1/utlbuffer.cpp @@ -392,7 +392,7 @@ void CUtlBuffer::EatWhiteSpace() { while ( CheckGet( sizeof(char) ) ) { - if ( !isspace( *(const unsigned char*)PeekGet() ) ) + if ( !V_isspace( *(const unsigned char*)PeekGet() ) ) break; m_Get += sizeof(char); } diff --git a/src/vgui2/vgui_controls/AnimationController.cpp b/src/vgui2/vgui_controls/AnimationController.cpp index cee769a97..caac79996 100644 --- a/src/vgui2/vgui_controls/AnimationController.cpp +++ b/src/vgui2/vgui_controls/AnimationController.cpp @@ -1042,7 +1042,7 @@ bool AnimationController::StartAnimationSequence(const char *sequenceName, bool //----------------------------------------------------------------------------- // Purpose: starts an animation sequence script //----------------------------------------------------------------------------- -bool AnimationController::StartAnimationSequence(Panel *pWithinParent, const char *sequenceName, bool bCanBeCancelled ) +bool AnimationController::StartAnimationSequence(Panel *pWithinParent, const char *sequenceName, bool bCanBeCancelled, bool bIncludeParent ) { Assert( pWithinParent ); @@ -1075,7 +1075,7 @@ bool AnimationController::StartAnimationSequence(Panel *pWithinParent, const cha // execute the sequence for (int cmdIndex = 0; cmdIndex < m_Sequences[i].cmdList.Count(); cmdIndex++) { - ExecAnimationCommand(seqName, m_Sequences[i].cmdList[cmdIndex], pWithinParent, bCanBeCancelled); + ExecAnimationCommand(seqName, m_Sequences[i].cmdList[cmdIndex], pWithinParent, bCanBeCancelled, bIncludeParent); } return true; @@ -1277,11 +1277,11 @@ void AnimationController::RemoveQueuedAnimationByType(vgui::Panel *panel, UtlSym //----------------------------------------------------------------------------- // Purpose: runs a single line of the script //----------------------------------------------------------------------------- -void AnimationController::ExecAnimationCommand(UtlSymId_t seqName, AnimCommand_t &animCommand, Panel *pWithinParent, bool bCanBeCancelled) +void AnimationController::ExecAnimationCommand(UtlSymId_t seqName, AnimCommand_t &animCommand, Panel *pWithinParent, bool bCanBeCancelled, bool bIncludeParent) { if (animCommand.commandType == CMD_ANIMATE) { - StartCmd_Animate(seqName, animCommand.cmdData.animate, pWithinParent, bCanBeCancelled); + StartCmd_Animate(seqName, animCommand.cmdData.animate, pWithinParent, bCanBeCancelled, bIncludeParent); } else { @@ -1301,19 +1301,21 @@ void AnimationController::ExecAnimationCommand(UtlSymId_t seqName, AnimCommand_t //----------------------------------------------------------------------------- // Purpose: starts a variable animation 
//----------------------------------------------------------------------------- -void AnimationController::StartCmd_Animate(UtlSymId_t seqName, AnimCmdAnimate_t &cmd, Panel *pWithinParent, bool bCanBeCancelled) +void AnimationController::StartCmd_Animate(UtlSymId_t seqName, AnimCmdAnimate_t &cmd, Panel *pWithinParent, bool bCanBeCancelled, bool bIncludeParent) { Assert( pWithinParent ); if ( !pWithinParent ) return; + const char* panelName = g_ScriptSymbols.String(cmd.panel); + // make sure the child exists - Panel *panel = pWithinParent->FindChildByName(g_ScriptSymbols.String(cmd.panel),true); + Panel *panel = pWithinParent->FindChildByName(panelName,true); if ( !panel ) { // Check the parent - Panel *parent = GetParent(); - if ( !Q_stricmp( parent->GetName(), g_ScriptSymbols.String(cmd.panel) ) ) + Panel *parent = bIncludeParent ? pWithinParent : GetParent(); + if ( !Q_stricmp( parent->GetName(), panelName ) ) { panel = parent; } @@ -1321,6 +1323,10 @@ void AnimationController::StartCmd_Animate(UtlSymId_t seqName, AnimCmdAnimate_t if (!panel) return; + // Block some panels (like HudScope). Unfortunately players are abusing animations with broad/null parents. + if ( !panel->CanAnimate() ) + return; + StartCmd_Animate(panel, seqName, cmd, bCanBeCancelled); } diff --git a/src/vgui2/vgui_controls/EditablePanel.cpp b/src/vgui2/vgui_controls/EditablePanel.cpp index 670d4dbc4..553a6e9fc 100644 --- a/src/vgui2/vgui_controls/EditablePanel.cpp +++ b/src/vgui2/vgui_controls/EditablePanel.cpp @@ -997,8 +997,12 @@ void EditablePanel::GetControlString(const char *controlName, char *buf, int buf //----------------------------------------------------------------------------- // Purpose: localization variables (used in constructing UI strings) //----------------------------------------------------------------------------- -void EditablePanel::SetDialogVariable(const char *varName, const char *value) +void EditablePanel::SetDialogVariable(const char *varName, const char *value, bool bForceUpdate) { + if (!bForceUpdate && !GetDialogVariables()->IsEmpty(varName) && !strcmp(GetDialogVariables()->GetString(varName), value ? value : "")) + { + return; + } GetDialogVariables()->SetString(varName, value); ForceSubPanelsToUpdateWithNewDialogVariables(); } @@ -1006,8 +1010,12 @@ void EditablePanel::SetDialogVariable(const char *varName, const char *value) //----------------------------------------------------------------------------- // Purpose: localization variables (used in constructing UI strings) //----------------------------------------------------------------------------- -void EditablePanel::SetDialogVariable(const char *varName, const wchar_t *value) +void EditablePanel::SetDialogVariable(const char *varName, const wchar_t *value, bool bForceUpdate) { + if (!bForceUpdate && !GetDialogVariables()->IsEmpty(varName) && !wcscmp(GetDialogVariables()->GetWString(varName), value ? 
value : L"")) + { + return; + } GetDialogVariables()->SetWString(varName, value); ForceSubPanelsToUpdateWithNewDialogVariables(); } @@ -1015,8 +1023,12 @@ void EditablePanel::SetDialogVariable(const char *varName, const wchar_t *value) //----------------------------------------------------------------------------- // Purpose: localization variables (used in constructing UI strings) //----------------------------------------------------------------------------- -void EditablePanel::SetDialogVariable(const char *varName, int value) +void EditablePanel::SetDialogVariable(const char *varName, int value, bool bForceUpdate) { + if (!bForceUpdate && !GetDialogVariables()->IsEmpty(varName) && GetDialogVariables()->GetInt(varName) == value) + { + return; + } GetDialogVariables()->SetInt(varName, value); ForceSubPanelsToUpdateWithNewDialogVariables(); } @@ -1024,8 +1036,12 @@ void EditablePanel::SetDialogVariable(const char *varName, int value) //----------------------------------------------------------------------------- // Purpose: localization variables (used in constructing UI strings) //----------------------------------------------------------------------------- -void EditablePanel::SetDialogVariable(const char *varName, float value) +void EditablePanel::SetDialogVariable(const char *varName, float value, bool bForceUpdate) { + if (!bForceUpdate && !GetDialogVariables()->IsEmpty(varName) && GetDialogVariables()->GetFloat(varName) == value) + { + return; + } GetDialogVariables()->SetFloat(varName, value); ForceSubPanelsToUpdateWithNewDialogVariables(); } diff --git a/src/vgui2/vgui_controls/Label.cpp b/src/vgui2/vgui_controls/Label.cpp index 2178f3904..a50205a8c 100644 --- a/src/vgui2/vgui_controls/Label.cpp +++ b/src/vgui2/vgui_controls/Label.cpp @@ -403,13 +403,10 @@ void Label::ComputeAlignment(int &tx0, int &ty0, int &tx1, int &ty1) int maxX = 0, maxY = 0; int actualXAlignment = _contentAlignment; - for (int i = 0; i < _imageDar.Count(); i++) + if (_isSimpleTextImage) { - TImageInfo &imageInfo = _imageDar[i]; + TImageInfo &imageInfo = *_cachedSimpleTextImage; IImage *image = imageInfo.image; - if (!image) - continue; // skip over null images - // add up the bounds int iWide, iTall; image->GetSize(iWide, iTall); @@ -423,6 +420,29 @@ void Label::ComputeAlignment(int &tx0, int &ty0, int &tx1, int &ty1) // add the offset to x maxX += imageInfo.offset; } + else + { + for (int i = 0; i < _imageDar.Count(); i++) + { + TImageInfo &imageInfo = _imageDar[i]; + IImage *image = imageInfo.image; + if (!image) + continue; // skip over null images + + // add up the bounds + int iWide, iTall; + image->GetSize(iWide, iTall); + if (iWide > wide) // if the image is larger than the label just do a west alignment + actualXAlignment = Label::a_west; + + // get the max height + maxY = max(maxY, iTall); + maxX += iWide; + + // add the offset to x + maxX += imageInfo.offset; + } + } tWide = maxX; tTall = maxY; @@ -824,11 +844,21 @@ void Label::OnSetText(KeyValues *params) //----------------------------------------------------------------------------- int Label::AddImage(IImage *image, int offset) { + if (_isSimpleTextImage) + { + _cachedSimpleTextImage = NULL; + _isSimpleTextImage = false; + } int newImage = _imageDar.AddToTail(); _imageDar[newImage].image = image; _imageDar[newImage].offset = (short)offset; _imageDar[newImage].xpos = -1; _imageDar[newImage].width = -1; + if (_imageDar.Count() == 1 && image != NULL) + { + _cachedSimpleTextImage = _imageDar.Base(); + _isSimpleTextImage = true; + } 
InvalidateLayout(); return newImage; } @@ -1307,9 +1337,6 @@ void Label::PerformLayout() } HandleAutoSizing(); - - HandleAutoSizing(); - return; } From 3887ac5c7c356c58b54af4fa30183cac678e3702 Mon Sep 17 00:00:00 2001 From: mastercoms Date: Wed, 22 Mar 2023 18:03:33 -0400 Subject: [PATCH 41/42] perf: use static ConVarRefs during relatively hot functions instead of doing a O(n) lookup per frame in some cases, we can just init the ConVarRef once hottest function in R_LoadSkys, other ones are just possible during runtime or called relatively a lot compared to other non-static ConVarRefs --- src/common/ServerBrowser/blacklisted_server_manager.cpp | 2 +- src/engine/gl_warp.cpp | 2 +- src/engine/matsys_interface.cpp | 6 +++--- src/game/client/in_steamcontroller.cpp | 4 ++-- src/game/client/tf/tf_hud_notification_panel.cpp | 2 +- src/game/server/effects.cpp | 2 +- src/game/server/env_tonemap_controller.cpp | 2 +- src/game/server/func_break.cpp | 2 +- src/game/server/triggers.cpp | 2 +- src/game/shared/achievementmgr.cpp | 2 +- src/materialsystem/shaderapidx9/shaderdevicedx8.cpp | 2 +- src/serverbrowser/BaseGamesPage.cpp | 4 ++-- 12 files changed, 16 insertions(+), 16 deletions(-) diff --git a/src/common/ServerBrowser/blacklisted_server_manager.cpp b/src/common/ServerBrowser/blacklisted_server_manager.cpp index 95fcbfc91..a1a747523 100644 --- a/src/common/ServerBrowser/blacklisted_server_manager.cpp +++ b/src/common/ServerBrowser/blacklisted_server_manager.cpp @@ -223,7 +223,7 @@ bool CBlacklistedServerManager::IsServerBlacklisted( uint32 serverIP, int server { netadr_t netAdr( serverIP, serverPort ); - ConVarRef sb_showblacklists( "sb_showblacklists" ); + static ConVarRef sb_showblacklists( "sb_showblacklists" ); for ( int i = 0; i < m_Blacklist.Count(); i++ ) { diff --git a/src/engine/gl_warp.cpp b/src/engine/gl_warp.cpp index 881ac7abd..b52854ce8 100644 --- a/src/engine/gl_warp.cpp +++ b/src/engine/gl_warp.cpp @@ -165,7 +165,7 @@ void R_LoadSkys( void ) char requestedsky[ 128 ]; - ConVarRef skyname( "sv_skyname" ); + static ConVarRef skyname( "sv_skyname" ); if ( skyname.IsValid() ) { Q_strncpy( requestedsky, skyname.GetString(), sizeof( requestedsky ) ); diff --git a/src/engine/matsys_interface.cpp b/src/engine/matsys_interface.cpp index 0b7bfd085..e8446c2f7 100644 --- a/src/engine/matsys_interface.cpp +++ b/src/engine/matsys_interface.cpp @@ -412,7 +412,7 @@ static void ReadMaterialSystemConfigFromRegistry( MaterialSystem_Config_t &confi config.SetFlag( MATSYS_VIDCFG_FLAGS_WINDOWED, ReadVideoConfigInt( "ScreenWindowed", 0 ) != 0 ); #if defined( USE_SDL ) && !defined( SWDS ) // Read the ScreenDisplayIndex and set sdl_displayindex if it's there. - ConVarRef conVar( "sdl_displayindex" ); + static ConVarRef conVar( "sdl_displayindex" ); if ( conVar.IsValid() ) { int displayIndex = 0; @@ -531,7 +531,7 @@ static void WriteMaterialSystemConfigToRegistry( const MaterialSystem_Config_t & #if defined( USE_SDL ) && !defined( SWDS ) // Save sdl_displayindex out to ScreenDisplayIndex. 
- ConVarRef conVar( "sdl_displayindex" ); + static ConVarRef conVar( "sdl_displayindex" ); if ( conVar.IsValid() && !UseVR() ) { WriteVideoConfigInt( "ScreenDisplayIndex", conVar.GetInt() ); @@ -683,7 +683,7 @@ void OverrideMaterialSystemConfig( MaterialSystem_Config_t &config ) { // enable/disable flashlight support based on mod (user can also set this explicitly) // FIXME: this is only here because dxsupport_override.cfg is currently broken - ConVarRef mat_supportflashlight( "mat_supportflashlight" ); + static ConVarRef mat_supportflashlight( "mat_supportflashlight" ); if ( mat_supportflashlight.GetInt() == -1 ) { const char * gameName = COM_GetModDirectory(); diff --git a/src/game/client/in_steamcontroller.cpp b/src/game/client/in_steamcontroller.cpp index b0c062aa5..6748b6f9d 100644 --- a/src/game/client/in_steamcontroller.cpp +++ b/src/game/client/in_steamcontroller.cpp @@ -69,8 +69,8 @@ void CInput::ApplySteamControllerCameraMove( QAngle& viewangles, CUserCmd *cmd, //roll the view angles so roll is 0 (the HL2 assumed state) and mouse adjustments are relative to the screen. //Assuming roll is unchanging, we want mouse left to translate to screen left at all times (same for right, up, and down) - ConVarRef cl_pitchdown ( "cl_pitchdown" ); - ConVarRef cl_pitchup ( "cl_pitchup" ); + static ConVarRef cl_pitchdown ( "cl_pitchdown" ); + static ConVarRef cl_pitchup ( "cl_pitchup" ); // Scale yaw and pitch inputs by sensitivity, and make sure they are within acceptable limits (important to avoid exploits, e.g. during Demoman charge we must restrict allowed yaw). float yaw = CAM_CapYaw( sc_yaw_sensitivity.GetFloat() * vecPosition.x ); diff --git a/src/game/client/tf/tf_hud_notification_panel.cpp b/src/game/client/tf/tf_hud_notification_panel.cpp index 26333fe5a..ee2b7dd92 100644 --- a/src/game/client/tf/tf_hud_notification_panel.cpp +++ b/src/game/client/tf/tf_hud_notification_panel.cpp @@ -140,7 +140,7 @@ void CHudNotificationPanel::MsgFunc_HudNotify( bf_read &msg ) void CHudNotificationPanel::MsgFunc_HudNotifyCustom( bf_read &msg ) { // Ignore notifications in minmode - ConVarRef cl_hud_minmode( "cl_hud_minmode", true ); + static ConVarRef cl_hud_minmode( "cl_hud_minmode", true ); if ( cl_hud_minmode.IsValid() && cl_hud_minmode.GetBool() ) return; diff --git a/src/game/server/effects.cpp b/src/game/server/effects.cpp index f9d457c6e..38a88df1c 100644 --- a/src/game/server/effects.cpp +++ b/src/game/server/effects.cpp @@ -451,7 +451,7 @@ void CGibShooter::Spawn( void ) CGib *CGibShooter::CreateGib ( void ) { - ConVarRef violence_hgibs( "violence_hgibs" ); + static ConVarRef violence_hgibs( "violence_hgibs" ); if ( violence_hgibs.IsValid() && !violence_hgibs.GetInt() ) return NULL; diff --git a/src/game/server/env_tonemap_controller.cpp b/src/game/server/env_tonemap_controller.cpp index 613d45481..01a39f17e 100644 --- a/src/game/server/env_tonemap_controller.cpp +++ b/src/game/server/env_tonemap_controller.cpp @@ -178,7 +178,7 @@ void CEnvTonemapController::InputSetBloomScaleRange( inputdata_t &inputdata ) void CEnvTonemapController::InputSetTonemapRate( inputdata_t &inputdata ) { // TODO: There should be a better way to do this. 
- ConVarRef mat_hdr_manual_tonemap_rate( "mat_hdr_manual_tonemap_rate" ); + static ConVarRef mat_hdr_manual_tonemap_rate( "mat_hdr_manual_tonemap_rate" ); if ( mat_hdr_manual_tonemap_rate.IsValid() ) { float flTonemapRate = inputdata.value.Float(); diff --git a/src/game/server/func_break.cpp b/src/game/server/func_break.cpp index e7043ea93..6438555cb 100644 --- a/src/game/server/func_break.cpp +++ b/src/game/server/func_break.cpp @@ -1050,7 +1050,7 @@ void CBreakable::Die( void ) iCount = func_break_max_pieces.GetInt(); } - ConVarRef breakable_disable_gib_limit( "breakable_disable_gib_limit" ); + static ConVarRef breakable_disable_gib_limit( "breakable_disable_gib_limit" ); if ( !breakable_disable_gib_limit.GetBool() && iCount ) { if ( m_PerformanceMode == PM_NO_GIBS ) diff --git a/src/game/server/triggers.cpp b/src/game/server/triggers.cpp index 8154e6234..cba01f29e 100644 --- a/src/game/server/triggers.cpp +++ b/src/game/server/triggers.cpp @@ -2243,7 +2243,7 @@ void CTriggerPush::Activate() { // Fix problems with triggers pushing too hard under sv_alternateticks. // This is somewhat hacky, but it's simple and we're really close to shipping. - ConVarRef sv_alternateticks( "sv_alternateticks" ); + static ConVarRef sv_alternateticks( "sv_alternateticks" ); if ( ( m_flAlternateTicksFix != 0 ) && sv_alternateticks.GetBool() ) { m_flPushSpeed = m_flSpeed * m_flAlternateTicksFix; diff --git a/src/game/shared/achievementmgr.cpp b/src/game/shared/achievementmgr.cpp index ba1fab2bf..c6b51eb95 100644 --- a/src/game/shared/achievementmgr.cpp +++ b/src/game/shared/achievementmgr.cpp @@ -1095,7 +1095,7 @@ bool CAchievementMgr::CheckAchievementsEnabled() return false; } - ConVarRef tf_bot_offline_practice( "tf_bot_offline_practice" ); + static ConVarRef tf_bot_offline_practice( "tf_bot_offline_practice" ); // no achievements for offline practice if ( tf_bot_offline_practice.GetInt() != 0 ) { diff --git a/src/materialsystem/shaderapidx9/shaderdevicedx8.cpp b/src/materialsystem/shaderapidx9/shaderdevicedx8.cpp index 2789158b5..ae138aed5 100644 --- a/src/materialsystem/shaderapidx9/shaderdevicedx8.cpp +++ b/src/materialsystem/shaderapidx9/shaderdevicedx8.cpp @@ -2202,7 +2202,7 @@ IDirect3DDevice9* CShaderDeviceDx8::InvokeCreateDevice( void* hWnd, int nAdapter // Create the device with multi-threaded safeguards if we're using mat_queue_mode 2. // The logic to enable multithreaded rendering happens well after the device has been created, // so we replicate some of that logic here. 
- ConVarRef mat_queue_mode( "mat_queue_mode" ); + static ConVarRef mat_queue_mode( "mat_queue_mode" ); if ( mat_queue_mode.GetInt() == 2 || ( mat_queue_mode.GetInt() == -2 && GetCPUInformation()->m_nPhysicalProcessors >= 2 ) || ( mat_queue_mode.GetInt() == -1 && GetCPUInformation()->m_nPhysicalProcessors >= 2 ) ) diff --git a/src/serverbrowser/BaseGamesPage.cpp b/src/serverbrowser/BaseGamesPage.cpp index 6cef88277..89cce6437 100644 --- a/src/serverbrowser/BaseGamesPage.cpp +++ b/src/serverbrowser/BaseGamesPage.cpp @@ -2129,7 +2129,7 @@ void CDialogServerWarning::OnCommand(const char *command) //----------------------------------------------------------------------------- void CDialogServerWarning::OnButtonToggled(Panel *panel, int state) { - ConVarRef sb_dontshow_maxplayer_warning( "sb_dontshow_maxplayer_warning", true ); + static ConVarRef sb_dontshow_maxplayer_warning( "sb_dontshow_maxplayer_warning", true ); if ( sb_dontshow_maxplayer_warning.IsValid() ) { sb_dontshow_maxplayer_warning.SetValue( state ); @@ -2150,7 +2150,7 @@ void CBaseGamesPage::OnBeginConnect() // Stop the current refresh StopRefresh(); - ConVarRef sb_dontshow_maxplayer_warning( "sb_dontshow_maxplayer_warning", true ); + static ConVarRef sb_dontshow_maxplayer_warning( "sb_dontshow_maxplayer_warning", true ); if ( sb_dontshow_maxplayer_warning.IsValid() ) { // If the server is above the suggested maxplayers, warn the player From a8f842c1575a985c3bc2f260762e0341a411b08e Mon Sep 17 00:00:00 2001 From: seth Date: Wed, 22 Mar 2023 18:11:26 -0400 Subject: [PATCH 42/42] add link script for powershell --- game_clean/link.ps1 | 53 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 game_clean/link.ps1 diff --git a/game_clean/link.ps1 b/game_clean/link.ps1 new file mode 100644 index 000000000..53c60b5a5 --- /dev/null +++ b/game_clean/link.ps1 @@ -0,0 +1,53 @@ +<# + .SYNOPSIS + symlink files from a retail Team Fortress 2 build to a patched build +#> + +param ( + [Parameter(Mandatory=$true)] + [string]$TF2Dir +) + +$VerbosePreference = "Continue" +$OutPath = "$PSScriptRoot\..\game" +$GameClean = "$PSScriptRoot\." + + +function Make-Symlink { + param ( + [String]$Path + ) + + $linkpath = "$OutPath/$Path" + $targetpath = "$TF2Dir/$Path" + + Write-Verbose -Message "Linking $linkpath to $targetpath" + New-Item -ItemType SymbolicLink -Path $linkpath -Target $targetpath +} + +function Glob-Symlink { + param ( + [String]$Path, + [String]$Glob + ) + + Get-ChildItem -Path "$TF2Dir\$Path\*" -Include $Glob | % { $_.Name } | % { Make-Symlink -Path $Path/$_ } +} + +New-Item -ItemType Directory -Path $OutPath + +Write-Verbose -Message "Copying $GameClean/copy/ to $OutPath" +Copy-Item -Recurse -Force $GameClean/clean/* $OutPath + +Write-Verbose -Message "Creating $OutPath/tf/materials" +New-Item -Type Directory -Path $OutPath/tf/materials + +$targets = "hl2","platform" +$targets += ,"maps","media","resource","scripts" | % { "tf/$_" } +$targets += ,"models","vgui" | % { "tf/materials/$_" } +ForEach ($t in $targets) { + Make-Symlink -Path $t +} + +Glob-Symlink -Glob '' -Path 'bin' +ForEach ($g in '*.vpk','*.cache') { Glob-Symlink -Glob $g -Path 'tf' }
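
link.ps1 takes the retail TF2 install directory as its one mandatory parameter and needs a PowerShell session that is allowed to create symbolic links (an elevated prompt, or Windows Developer Mode); the path below is only an example and will vary per machine:

    .\link.ps1 -TF2Dir "C:\Program Files (x86)\Steam\steamapps\common\Team Fortress 2"

The script copies game_clean/clean into the sibling game directory, then symlinks hl2, platform, and the listed tf/ subdirectories from the retail install, and globs the retail bin contents plus the tf .vpk/.cache files for linking as well.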