Skip to content

Commit 25ce646

Browse files
authored
Added SSE to MatrixMultiply. (#5427)
1 parent 4af95a3 commit 25ce646

File tree

1 file changed

+62
-1
lines changed

1 file changed

+62
-1
lines changed

src/raymath.h

Lines changed: 62 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,11 @@ typedef struct float16 {
170170

171171
#include <math.h> // Required for: sinf(), cosf(), tan(), atan2f(), sqrtf(), floor(), fminf(), fmaxf(), fabsf()
172172

173+
#if defined(__SSE__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1)
174+
#include <xmmintrin.h>
175+
#define RAYMATH_SSE_ENABLED
176+
#endif
177+
173178
//----------------------------------------------------------------------------------
174179
// Module Functions Definition - Utils math
175180
//----------------------------------------------------------------------------------
@@ -1647,7 +1652,63 @@ RMAPI Matrix MatrixSubtract(Matrix left, Matrix right)
16471652
RMAPI Matrix MatrixMultiply(Matrix left, Matrix right)
16481653
{
16491654
Matrix result = { 0 };
1655+
#ifdef RAYMATH_SSE_ENABLED
1656+
// Load the right matrix only; elements of the left matrix are broadcast one at a time below.
1657+
__m128 c0 = _mm_set_ps(right.m12, right.m8, right.m4, right.m0);
1658+
__m128 c1 = _mm_set_ps(right.m13, right.m9, right.m5, right.m1);
1659+
__m128 c2 = _mm_set_ps(right.m14, right.m10, right.m6, right.m2);
1660+
__m128 c3 = _mm_set_ps(right.m15, right.m11, right.m7, right.m3);
1661+
// Transpose so that, for the right matrix, c0 = [m0, m1, m2, m3], c1 = [m4, m5, m6, m7], c2 = [m8, m9, m10, m11], c3 = [m12, m13, m14, m15].
1662+
_MM_TRANSPOSE4_PS(c0, c1, c2, c3);
1663+
1664+
__m128 row;
1665+
float tmp[4];
1666+
1667+
// Row 0 of result: [m0, m1, m2, m3]
1668+
row = _mm_mul_ps(_mm_set1_ps(left.m0), c0);
1669+
row = _mm_add_ps(row, _mm_mul_ps(_mm_set1_ps(left.m1), c1));
1670+
row = _mm_add_ps(row, _mm_mul_ps(_mm_set1_ps(left.m2), c2));
1671+
row = _mm_add_ps(row, _mm_mul_ps(_mm_set1_ps(left.m3), c3));
1672+
_mm_storeu_ps(tmp, row);
1673+
result.m0 = tmp[0];
1674+
result.m1 = tmp[1];
1675+
result.m2 = tmp[2];
1676+
result.m3 = tmp[3];
1677+
1678+
// Row 1 of result: [m4, m5, m6, m7]
1679+
row = _mm_mul_ps(_mm_set1_ps(left.m4), c0);
1680+
row = _mm_add_ps(row, _mm_mul_ps(_mm_set1_ps(left.m5), c1));
1681+
row = _mm_add_ps(row, _mm_mul_ps(_mm_set1_ps(left.m6), c2));
1682+
row = _mm_add_ps(row, _mm_mul_ps(_mm_set1_ps(left.m7), c3));
1683+
_mm_storeu_ps(tmp, row);
1684+
result.m4 = tmp[0];
1685+
result.m5 = tmp[1];
1686+
result.m6 = tmp[2];
1687+
result.m7 = tmp[3];
1688+
1689+
// Row 2 of result: [m8, m9, m10, m11]
1690+
row = _mm_mul_ps(_mm_set1_ps(left.m8), c0);
1691+
row = _mm_add_ps(row, _mm_mul_ps(_mm_set1_ps(left.m9), c1));
1692+
row = _mm_add_ps(row, _mm_mul_ps(_mm_set1_ps(left.m10), c2));
1693+
row = _mm_add_ps(row, _mm_mul_ps(_mm_set1_ps(left.m11), c3));
1694+
_mm_storeu_ps(tmp, row);
1695+
result.m8 = tmp[0];
1696+
result.m9 = tmp[1];
1697+
result.m10 = tmp[2];
1698+
result.m11 = tmp[3];
1699+
1700+
// Row 3 of result: [m12, m13, m14, m15]
1701+
row = _mm_mul_ps(_mm_set1_ps(left.m12), c0);
1702+
row = _mm_add_ps(row, _mm_mul_ps(_mm_set1_ps(left.m13), c1));
1703+
row = _mm_add_ps(row, _mm_mul_ps(_mm_set1_ps(left.m14), c2));
1704+
row = _mm_add_ps(row, _mm_mul_ps(_mm_set1_ps(left.m15), c3));
1705+
_mm_storeu_ps(tmp, row);
1706+
result.m12 = tmp[0];
1707+
result.m13 = tmp[1];
1708+
result.m14 = tmp[2];
1709+
result.m15 = tmp[3];
16501710

1711+
#else
16511712
result.m0 = left.m0*right.m0 + left.m1*right.m4 + left.m2*right.m8 + left.m3*right.m12;
16521713
result.m1 = left.m0*right.m1 + left.m1*right.m5 + left.m2*right.m9 + left.m3*right.m13;
16531714
result.m2 = left.m0*right.m2 + left.m1*right.m6 + left.m2*right.m10 + left.m3*right.m14;
@@ -1664,7 +1725,7 @@ RMAPI Matrix MatrixMultiply(Matrix left, Matrix right)
16641725
result.m13 = left.m12*right.m1 + left.m13*right.m5 + left.m14*right.m9 + left.m15*right.m13;
16651726
result.m14 = left.m12*right.m2 + left.m13*right.m6 + left.m14*right.m10 + left.m15*right.m14;
16661727
result.m15 = left.m12*right.m3 + left.m13*right.m7 + left.m14*right.m11 + left.m15*right.m15;
1667-
1728+
#endif
16681729
return result;
16691730
}
16701731

0 commit comments

Comments
 (0)