REVIEWED: SIMD intrinsics must be explicitly enabled by developer, only SSE supported at the moment #5316

raysan5 · raysan5 · commit c124f2552bbd · 2025-12-31T11:22:26.000+01:00
diff --git a/src/raymath.h b/src/raymath.h
@@ -19,17 +19,22 @@
 *
 *   CONFIGURATION:
 *       #define RAYMATH_IMPLEMENTATION
-*           Generates the implementation of the library into the included file.
+*           Generates the implementation of the library into the included file
 *           If not defined, the library is in header only mode and can be included in other headers
-*           or source files without problems. But only ONE file should hold the implementation.
+*           or source files without problems. But only ONE file should hold the implementation
 *
 *       #define RAYMATH_STATIC_INLINE
-*           Define static inline functions code, so #include header suffices for use.
-*           This may use up lots of memory.
+*           Define static inline functions code, so #include header suffices for use
+*           This may use up lots of memory
 *
 *       #define RAYMATH_DISABLE_CPP_OPERATORS
 *           Disables C++ operator overloads for raymath types.
 *
+*       #define RAYMATH_USE_SIMD_INTRINSICS
+*           Try to enable SIMD intrinsics for MatrixMultiply()
+*           Note that users enabling it must make sure the target platform where the application
+*           will run supports the selected SIMD intrinsics; for now, only SSE is supported
+*
 *   LICENSE: zlib/libpng
 *
 *   Copyright (c) 2015-2025 Ramon Santamaria (@raysan5)
@@ -79,7 +84,6 @@
     #endif
 #endif
 
-
 //----------------------------------------------------------------------------------
 // Defines and Macros
 //----------------------------------------------------------------------------------
@@ -170,9 +174,35 @@ typedef struct float16 {
 
 #include <math.h>       // Required for: sinf(), cosf(), tan(), atan2f(), sqrtf(), floor(), fminf(), fmaxf(), fabsf()
 
-#if defined(__SSE__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1)
-    #include <xmmintrin.h>
-    #define RAYMATH_SSE_ENABLED
+#if defined(RAYMATH_USE_SIMD_INTRINSICS)
+    // SIMD is used on the most costly raymath function MatrixMultiply()
+    // NOTE: Only SSE intrinsics support implemented
+    // TODO: Consider support for other SIMD intrinsics
+    /*
+    #if defined(__SSE4_2__)
+        #define SW_HAS_SSE42
+        #include <nmmintrin.h>
+    #elif defined(__SSE4_1__)
+        #define SW_HAS_SSE41
+        #include <smmintrin.h>
+    #elif defined(__SSSE3__)
+        #define SW_HAS_SSSE3
+        #include <tmmintrin.h>
+    #elif defined(__SSE3__)
+        #define SW_HAS_SSE3
+        #include <pmmintrin.h>
+    #elif defined(__SSE2__) || (defined(_M_AMD64) || defined(_M_X64)) // SSE2 x64
+        #define SW_HAS_SSE2
+        #include <emmintrin.h>
+    #elif defined(__SSE__)
+        #define SW_HAS_SSE
+        #include <xmmintrin.h>
+    #endif
+    */
+    #if defined(__SSE__) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 1))
+        #include <xmmintrin.h>
+        #define RAYMATH_SSE_ENABLED
+    #endif
 #endif
 
 //----------------------------------------------------------------------------------
@@ -1652,18 +1682,20 @@ RMAPI Matrix MatrixSubtract(Matrix left, Matrix right)
 RMAPI Matrix MatrixMultiply(Matrix left, Matrix right)
 {
     Matrix result = { 0 };
-#ifdef RAYMATH_SSE_ENABLED
-    // Load left side and right side.
+    
+#if defined(RAYMATH_SSE_ENABLED)
+    // Load left side and right side
     __m128 c0 = _mm_set_ps(right.m12, right.m8,  right.m4,  right.m0);
     __m128 c1 = _mm_set_ps(right.m13, right.m9,  right.m5,  right.m1);
     __m128 c2 = _mm_set_ps(right.m14, right.m10, right.m6,  right.m2);
     __m128 c3 = _mm_set_ps(right.m15, right.m11, right.m7,  right.m3);
-    // Transpose so c0..c3 become *rows* of the right matrix in semantic order.
+    
+    // Transpose so c0..c3 become *rows* of the right matrix in semantic order
     _MM_TRANSPOSE4_PS(c0, c1, c2, c3);
 
+    float tmp[4] = { 0 };
     __m128 row;
-    float tmp[4];
-
+    
     // Row 0 of result: [m0, m1, m2, m3]
     row  = _mm_mul_ps(_mm_set1_ps(left.m0),  c0);
     row  = _mm_add_ps(row, _mm_mul_ps(_mm_set1_ps(left.m1),  c1));
@@ -1707,7 +1739,6 @@ RMAPI Matrix MatrixMultiply(Matrix left, Matrix right)
     result.m13 = tmp[1];
     result.m14 = tmp[2];
     result.m15 = tmp[3];
-
 #else
     result.m0 = left.m0*right.m0 + left.m1*right.m4 + left.m2*right.m8 + left.m3*right.m12;
     result.m1 = left.m0*right.m1 + left.m1*right.m5 + left.m2*right.m9 + left.m3*right.m13;
@@ -1726,6 +1757,7 @@ RMAPI Matrix MatrixMultiply(Matrix left, Matrix right)
     result.m14 = left.m12*right.m2 + left.m13*right.m6 + left.m14*right.m10 + left.m15*right.m14;
     result.m15 = left.m12*right.m3 + left.m13*right.m7 + left.m14*right.m11 + left.m15*right.m15;
 #endif
+
     return result;
 }