From 94d3de33705863b7795a0f03f7c7de15c24227ac Mon Sep 17 00:00:00 2001
From: Fanchen Kong <fanchen.kong@intel.com>
Date: Mon, 15 Jun 2026 13:49:16 +0800
Subject: [PATCH] Add support for x86 FMA intrinsics

Implement all 32 FMA intrinsics in a new <fmaintrin.h>, wired into
<immintrin.h> and gated on __FMA__. Pass -msimd128 -mfma to enable.

Each intrinsic has a dual implementation:
- With -mrelaxed-simd, it lowers to Wasm relaxed SIMD fused multiply-add
  (relaxed_madd / relaxed_nmadd).
- With -msimd128 only, it is emulated with separate multiply and
  add/subtract.

The 256-bit AVX variants are emulated by applying the 128-bit operation
to each half of the vector.

Compiler wiring adds -mfma to the SIMD Intel feature tower and defines
__FMA__ accordingly. Documentation is added to the SIMD porting guide
and ChangeLog.

Tests:
- test/sse/test_fma.cpp: self-contained correctness checks covering all
  32 intrinsics, run with the emulated path.
- test/sse/test_fma_relaxed.cpp: exhaustive comparison against native x86,
  run with relaxed SIMD.
---
 ChangeLog.md                      |   4 +
 site/source/docs/porting/simd.rst |  56 +++-
 system/include/compat/fmaintrin.h | 445 ++++++++++++++++++++++++++
 system/include/compat/immintrin.h |   4 +
 test/sse/test_fma.cpp             | 498 ++++++++++++++++++++++++++++++
 test/sse/test_fma_relaxed.cpp     | 103 ++++++
 test/test_core.py                 |  24 ++
 test/test_other.py                |   6 +-
 tools/cmdline.py                  |   2 +-
 tools/compile.py                  |   3 +
 10 files changed, 1139 insertions(+), 6 deletions(-)
 create mode 100644 system/include/compat/fmaintrin.h
 create mode 100644 test/sse/test_fma.cpp
 create mode 100644 test/sse/test_fma_relaxed.cpp

diff --git a/ChangeLog.md b/ChangeLog.md
index ac78148d3d2bf..3daa3fa8ae64a 100644
--- a/ChangeLog.md
+++ b/ChangeLog.md
@@ -28,6 +28,10 @@ See docs/process.md for more on how version tagging works.
 
 6.0.1 - 06/22/26
 ----------------
+- Added support for compiling FMA intrinsics. All 32 FMA intrinsics are
+  supported, with 256-bit variants emulated via two 128-bit operations. Pass
+  ``-msimd128 -mfma`` to enable. With ``-mrelaxed-simd -mfma``, Wasm relaxed
+  SIMD FMA is used.
 - The ability to redirect JS compiler stderr using `EMCC_STDERR_FILE` was
   removed.  These days you can use `EMCC_DEBUG` and/or `EMCC_DEBUG_SAVE` to
   preserve all the intermediate JS compiler files. (#27101)
diff --git a/site/source/docs/porting/simd.rst b/site/source/docs/porting/simd.rst
index cf1241a6ba942..94b4c18f78ff9 100644
--- a/site/source/docs/porting/simd.rst
+++ b/site/source/docs/porting/simd.rst
@@ -12,7 +12,7 @@ Emscripten supports the `WebAssembly SIMD <https://github.com/webassembly/simd/>
 1. Enable LLVM/Clang SIMD autovectorizer to automatically target WebAssembly SIMD, without requiring changes to C/C++ source code.
 2. Write SIMD code using the GCC/Clang SIMD Vector Extensions (``__attribute__((vector_size(16)))``)
 3. Write SIMD code using the WebAssembly SIMD intrinsics (``#include <wasm_simd128.h>``)
-4. Compile existing SIMD code that uses the x86 SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX or AVX2 intrinsics (``#include <*mmintrin.h>``)
+4. Compile existing SIMD code that uses the x86 SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2, or FMA intrinsics (``#include <*mmintrin.h>``)
 5. Compile existing SIMD code that uses the ARM NEON intrinsics (``#include <arm_neon.h>``)
 
 These techniques can be freely combined in a single program.
@@ -154,8 +154,9 @@ Emscripten supports compiling existing codebases that use x86 SSE instructions b
 * **SSE4.2**: pass ``-msse4.2`` and ``#include <nmmintrin.h>``. Use ``#ifdef __SSE4_2__`` to gate code.
 * **AVX**: pass ``-mavx`` and ``#include <immintrin.h>``. Use ``#ifdef __AVX__`` to gate code.
 * **AVX2**: pass ``-mavx2`` and ``#include <immintrin.h>``. Use ``#ifdef __AVX2__`` to gate code.
+* **FMA**: pass ``-mfma`` and ``#include <immintrin.h>``. Use ``#ifdef __FMA__`` to gate code. Also pass ``-mrelaxed-simd`` to enable Wasm relaxed SIMD FMA.
 
-Currently only the SSE1, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, and AVX instruction sets are supported. Each of these instruction sets add on top of the previous ones, so e.g. when targeting SSE3, the instruction sets SSE1 and SSE2 are also available.
+Currently the SSE1, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2, and FMA instruction sets are supported. Each of these instruction sets add on top of the previous ones, so e.g. when targeting SSE3, the instruction sets SSE1 and SSE2 are also available.
 
 The following tables highlight the availability and expected performance of different SSE* intrinsics. This can be useful for understanding the performance limitations that the Wasm SIMD specification has when running on x86 hardware.
 
@@ -1231,6 +1232,57 @@ All the 128-bit wide instructions from AVX2 instruction set are listed.
 Only a small part of the 256-bit AVX2 instruction set are listed, most of the
 256-bit wide AVX2 instructions are emulated by two 128-bit wide instructions.
 
+The following table highlights the availability and expected performance of different FMA intrinsics. Refer to `Intel Intrinsics Guide on FMA <https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avxnewtechs=FMA>`_.
+
+.. list-table:: x86 FMA intrinsics available via #include <immintrin.h> and -mfma
+   :widths: 20 30
+   :header-rows: 1
+
+   * - Intrinsic name
+     - WebAssembly SIMD support
+   * - _mm_fmadd_ps
+     - ✅ wasm relaxed SIMD fma / 💡 emulated with mul+add
+   * - _mm_fmadd_pd
+     - ✅ wasm relaxed SIMD fma / 💡 emulated with mul+add
+   * - _mm_fmadd_ss
+     - ✅ wasm relaxed SIMD fma / 💡 emulated with mul+add
+   * - _mm_fmadd_sd
+     - ✅ wasm relaxed SIMD fma / 💡 emulated with mul+add
+   * - _mm_fmsub_ps
+     - ✅ wasm relaxed SIMD fma / 💡 emulated with mul+sub
+   * - _mm_fmsub_pd
+     - ✅ wasm relaxed SIMD fma / 💡 emulated with mul+sub
+   * - _mm_fmsub_ss
+     - ✅ wasm relaxed SIMD fma / 💡 emulated with mul+sub
+   * - _mm_fmsub_sd
+     - ✅ wasm relaxed SIMD fma / 💡 emulated with mul+sub
+   * - _mm_fnmadd_ps
+     - ✅ wasm relaxed SIMD fma / 💡 emulated with mul+sub
+   * - _mm_fnmadd_pd
+     - ✅ wasm relaxed SIMD fma / 💡 emulated with mul+sub
+   * - _mm_fnmadd_ss
+     - ✅ wasm relaxed SIMD fma / 💡 emulated with mul+sub
+   * - _mm_fnmadd_sd
+     - ✅ wasm relaxed SIMD fma / 💡 emulated with mul+sub
+   * - _mm_fnmsub_ps
+     - ✅ wasm relaxed SIMD fma / 💡 emulated with mul+sub
+   * - _mm_fnmsub_pd
+     - ✅ wasm relaxed SIMD fma / 💡 emulated with mul+sub
+   * - _mm_fnmsub_ss
+     - ✅ wasm relaxed SIMD fma / 💡 emulated with mul+sub
+   * - _mm_fnmsub_sd
+     - ✅ wasm relaxed SIMD fma / 💡 emulated with mul+sub
+   * - _mm_fmaddsub_ps
+     - ✅ wasm relaxed SIMD fma / 💡 emulated with mul+add+blend
+   * - _mm_fmaddsub_pd
+     - ✅ wasm relaxed SIMD fma / 💡 emulated with mul+add+blend
+   * - _mm_fmsubadd_ps
+     - ✅ wasm relaxed SIMD fma / 💡 emulated with mul+add+blend
+   * - _mm_fmsubadd_pd
+     - ✅ wasm relaxed SIMD fma / 💡 emulated with mul+add+blend
+
+All 128-bit FMA intrinsics are listed above. The 256-bit FMA variants (``_mm256_fmadd_ps``, ``_mm256_fmadd_pd``, etc.) are emulated by applying the 128-bit operation to each half of the 256-bit vector. With ``-mrelaxed-simd``, the 128-bit operations use Wasm relaxed SIMD FMA; with ``-msimd128`` only, they use separate multiply and add/subtract.
+
 
 Compiling SIMD code targeting ARM NEON instruction set
 ======================================================
diff --git a/system/include/compat/fmaintrin.h b/system/include/compat/fmaintrin.h
new file mode 100644
index 0000000000000..24fbe49062d0d
--- /dev/null
+++ b/system/include/compat/fmaintrin.h
@@ -0,0 +1,445 @@
+/*
+ * Copyright 2026 The Emscripten Authors.  All rights reserved.
+ * Emscripten is available under two separate licenses, the MIT license and the
+ * University of Illinois/NCSA Open Source License.  Both these licenses can be
+ * found in the LICENSE file.
+ *
+ * FMA intrinsics implementation for Emscripten.
+ * Emulates x86 FMA (Fused Multiply-Add) operations using Wasm SIMD.
+ *
+ * With -mrelaxed-simd: uses Wasm relaxed SIMD FMA, which the host engine may
+ * lower to a hardware fused multiply-add (single rounding step) where
+ * available, e.g. on x86/ARM with FMA support. The relaxed SIMD spec leaves
+ * fusion implementation-defined, so on hosts without hardware FMA the result
+ * may instead be a separate multiply and add (two rounding steps).
+ * With -msimd128 only: emulates FMA with separate multiply and add/sub
+ * (two rounding steps).
+ */
+
+#ifndef __emscripten_immintrin_h__
+#error "Never use <fmaintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __emscripten_fmaintrin_h__
+#define __emscripten_fmaintrin_h__
+
+#ifndef __FMA__
+#error "FMA instruction set not enabled"
+#endif
+
+#ifdef __wasm_relaxed_simd__
+#include <wasm_simd128.h>
+#endif
+
+/* ============================================================
+ * 128-bit packed float (ps) — 4x float
+ * ============================================================ */
+
+/* a * b + c */
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_fmadd_ps(__m128 __A, __m128 __B, __m128 __C) {
+#ifdef __wasm_relaxed_simd__
+  return (__m128)wasm_f32x4_relaxed_madd(
+    (__f32x4)__A, (__f32x4)__B, (__f32x4)__C);
+#else
+  return _mm_add_ps(_mm_mul_ps(__A, __B), __C);
+#endif
+}
+
+/* a * b - c */
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C) {
+#ifdef __wasm_relaxed_simd__
+  return (__m128)wasm_f32x4_relaxed_madd(
+    (__f32x4)__A, (__f32x4)__B, (__f32x4)_mm_xor_ps(__C, _mm_set1_ps(-0.0f)));
+#else
+  return _mm_sub_ps(_mm_mul_ps(__A, __B), __C);
+#endif
+}
+
+/* -(a * b) + c */
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C) {
+#ifdef __wasm_relaxed_simd__
+  return (__m128)wasm_f32x4_relaxed_nmadd(
+    (__f32x4)__A, (__f32x4)__B, (__f32x4)__C);
+#else
+  return _mm_sub_ps(__C, _mm_mul_ps(__A, __B));
+#endif
+}
+
+/* -(a * b) - c */
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C) {
+#ifdef __wasm_relaxed_simd__
+  return (__m128)wasm_f32x4_relaxed_nmadd(
+    (__f32x4)__A, (__f32x4)__B, (__f32x4)_mm_xor_ps(__C, _mm_set1_ps(-0.0f)));
+#else
+  __m128 neg_ab = _mm_sub_ps(_mm_setzero_ps(), _mm_mul_ps(__A, __B));
+  return _mm_sub_ps(neg_ab, __C);
+#endif
+}
+
+/* even elements: a*b - c, odd elements: a*b + c */
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C) {
+#ifdef __wasm_relaxed_simd__
+  __m128 neg_c =
+    _mm_xor_ps(__C, (__m128)_mm_set_epi32(0, 0x80000000, 0, 0x80000000));
+  return (__m128)wasm_f32x4_relaxed_madd(
+    (__f32x4)__A, (__f32x4)__B, (__f32x4)neg_c);
+#else
+  __m128 add = _mm_add_ps(_mm_mul_ps(__A, __B), __C);
+  __m128 sub = _mm_sub_ps(_mm_mul_ps(__A, __B), __C);
+  return _mm_blend_ps(sub, add, 0xA); /* 0xA = 1010b: elements 1,3 from add */
+#endif
+}
+
+/* even elements: a*b + c, odd elements: a*b - c */
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C) {
+#ifdef __wasm_relaxed_simd__
+  __m128 neg_c =
+    _mm_xor_ps(__C, (__m128)_mm_set_epi32(0x80000000, 0, 0x80000000, 0));
+  return (__m128)wasm_f32x4_relaxed_madd(
+    (__f32x4)__A, (__f32x4)__B, (__f32x4)neg_c);
+#else
+  __m128 add = _mm_add_ps(_mm_mul_ps(__A, __B), __C);
+  __m128 sub = _mm_sub_ps(_mm_mul_ps(__A, __B), __C);
+  return _mm_blend_ps(add, sub, 0xA); /* 0xA = 1010b: elements 1,3 from sub */
+#endif
+}
+
+/* ============================================================
+ * 128-bit packed double (pd) — 2x double
+ * ============================================================ */
+
+/* a * b + c */
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C) {
+#ifdef __wasm_relaxed_simd__
+  return (__m128d)wasm_f64x2_relaxed_madd(
+    (__f64x2)__A, (__f64x2)__B, (__f64x2)__C);
+#else
+  return _mm_add_pd(_mm_mul_pd(__A, __B), __C);
+#endif
+}
+
+/* a * b - c */
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C) {
+#ifdef __wasm_relaxed_simd__
+  return (__m128d)wasm_f64x2_relaxed_madd(
+    (__f64x2)__A, (__f64x2)__B, (__f64x2)_mm_xor_pd(__C, _mm_set1_pd(-0.0)));
+#else
+  return _mm_sub_pd(_mm_mul_pd(__A, __B), __C);
+#endif
+}
+
+/* -(a * b) + c */
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C) {
+#ifdef __wasm_relaxed_simd__
+  return (__m128d)wasm_f64x2_relaxed_nmadd(
+    (__f64x2)__A, (__f64x2)__B, (__f64x2)__C);
+#else
+  return _mm_sub_pd(__C, _mm_mul_pd(__A, __B));
+#endif
+}
+
+/* -(a * b) - c */
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C) {
+#ifdef __wasm_relaxed_simd__
+  return (__m128d)wasm_f64x2_relaxed_nmadd(
+    (__f64x2)__A, (__f64x2)__B, (__f64x2)_mm_xor_pd(__C, _mm_set1_pd(-0.0)));
+#else
+  __m128d neg_ab = _mm_sub_pd(_mm_setzero_pd(), _mm_mul_pd(__A, __B));
+  return _mm_sub_pd(neg_ab, __C);
+#endif
+}
+
+/* even elements: a*b - c, odd elements: a*b + c */
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C) {
+#ifdef __wasm_relaxed_simd__
+  __m128d neg_c =
+    _mm_xor_pd(__C, (__m128d)_mm_set_epi64x(0, 0x8000000000000000LL));
+  return (__m128d)wasm_f64x2_relaxed_madd(
+    (__f64x2)__A, (__f64x2)__B, (__f64x2)neg_c);
+#else
+  __m128d add = _mm_add_pd(_mm_mul_pd(__A, __B), __C);
+  __m128d sub = _mm_sub_pd(_mm_mul_pd(__A, __B), __C);
+  return _mm_blend_pd(sub, add, 0x2); /* 0x2 = 10b: element 1 from add */
+#endif
+}
+
+/* even elements: a*b + c, odd elements: a*b - c */
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C) {
+#ifdef __wasm_relaxed_simd__
+  __m128d neg_c =
+    _mm_xor_pd(__C, (__m128d)_mm_set_epi64x(0x8000000000000000LL, 0));
+  return (__m128d)wasm_f64x2_relaxed_madd(
+    (__f64x2)__A, (__f64x2)__B, (__f64x2)neg_c);
+#else
+  __m128d add = _mm_add_pd(_mm_mul_pd(__A, __B), __C);
+  __m128d sub = _mm_sub_pd(_mm_mul_pd(__A, __B), __C);
+  return _mm_blend_pd(add, sub, 0x2); /* 0x2 = 10b: element 1 from sub */
+#endif
+}
+
+/* ============================================================
+ * Scalar float (ss) — lowest element only, upper from first operand
+ * ============================================================ */
+
+/* a[0] * b[0] + c[0], a[1..3] pass through */
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C) {
+#ifdef __wasm_relaxed_simd__
+  return _mm_move_ss(
+    __A,
+    (__m128)wasm_f32x4_relaxed_madd((__f32x4)__A, (__f32x4)__B, (__f32x4)__C));
+#else
+  return _mm_move_ss(__A, _mm_add_ss(_mm_mul_ss(__A, __B), __C));
+#endif
+}
+
+/* a[0] * b[0] - c[0], a[1..3] pass through */
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C) {
+#ifdef __wasm_relaxed_simd__
+  __m128 neg_c = _mm_xor_ps(__C, _mm_set1_ps(-0.0f));
+  return _mm_move_ss(__A,
+                     (__m128)wasm_f32x4_relaxed_madd(
+                       (__f32x4)__A, (__f32x4)__B, (__f32x4)neg_c));
+#else
+  return _mm_move_ss(__A, _mm_sub_ss(_mm_mul_ss(__A, __B), __C));
+#endif
+}
+
+/* -(a[0] * b[0]) + c[0], a[1..3] pass through */
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C) {
+#ifdef __wasm_relaxed_simd__
+  return _mm_move_ss(
+    __A,
+    (__m128)wasm_f32x4_relaxed_nmadd((__f32x4)__A, (__f32x4)__B, (__f32x4)__C));
+#else
+  return _mm_move_ss(__A, _mm_sub_ss(__C, _mm_mul_ss(__A, __B)));
+#endif
+}
+
+/* -(a[0] * b[0]) - c[0], a[1..3] pass through */
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C) {
+#ifdef __wasm_relaxed_simd__
+  __m128 neg_c = _mm_xor_ps(__C, _mm_set1_ps(-0.0f));
+  return _mm_move_ss(__A,
+                     (__m128)wasm_f32x4_relaxed_nmadd(
+                       (__f32x4)__A, (__f32x4)__B, (__f32x4)neg_c));
+#else
+  __m128 neg_ab = _mm_sub_ss(_mm_setzero_ps(), _mm_mul_ss(__A, __B));
+  return _mm_move_ss(__A, _mm_sub_ss(neg_ab, __C));
+#endif
+}
+
+/* ============================================================
+ * Scalar double (sd) — lowest element only, upper from first operand
+ * ============================================================ */
+
+/* a[0] * b[0] + c[0], a[1] pass through */
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C) {
+#ifdef __wasm_relaxed_simd__
+  return _mm_move_sd(
+    __A,
+    (__m128d)wasm_f64x2_relaxed_madd((__f64x2)__A, (__f64x2)__B, (__f64x2)__C));
+#else
+  return _mm_move_sd(__A, _mm_add_sd(_mm_mul_sd(__A, __B), __C));
+#endif
+}
+
+/* a[0] * b[0] - c[0], a[1] pass through */
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C) {
+#ifdef __wasm_relaxed_simd__
+  __m128d neg_c = _mm_xor_pd(__C, _mm_set1_pd(-0.0));
+  return _mm_move_sd(__A,
+                     (__m128d)wasm_f64x2_relaxed_madd(
+                       (__f64x2)__A, (__f64x2)__B, (__f64x2)neg_c));
+#else
+  return _mm_move_sd(__A, _mm_sub_sd(_mm_mul_sd(__A, __B), __C));
+#endif
+}
+
+/* -(a[0] * b[0]) + c[0], a[1] pass through */
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C) {
+#ifdef __wasm_relaxed_simd__
+  return _mm_move_sd(__A,
+                     (__m128d)wasm_f64x2_relaxed_nmadd(
+                       (__f64x2)__A, (__f64x2)__B, (__f64x2)__C));
+#else
+  return _mm_move_sd(__A, _mm_sub_sd(__C, _mm_mul_sd(__A, __B)));
+#endif
+}
+
+/* -(a[0] * b[0]) - c[0], a[1] pass through */
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C) {
+#ifdef __wasm_relaxed_simd__
+  __m128d neg_c = _mm_xor_pd(__C, _mm_set1_pd(-0.0));
+  return _mm_move_sd(__A,
+                     (__m128d)wasm_f64x2_relaxed_nmadd(
+                       (__f64x2)__A, (__f64x2)__B, (__f64x2)neg_c));
+#else
+  __m128d neg_ab = _mm_sub_sd(_mm_setzero_pd(), _mm_mul_sd(__A, __B));
+  return _mm_move_sd(__A, _mm_sub_sd(neg_ab, __C));
+#endif
+}
+
+#ifdef __AVX__
+/* ============================================================
+ * 256-bit packed float (ps) — 8x float
+ * ============================================================ */
+
+static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_fmadd_ps(__m256 __A, __m256 __B, __m256 __C) {
+  __m256_internal a = __m256_to_internal(__A);
+  __m256_internal b = __m256_to_internal(__B);
+  __m256_internal c = __m256_to_internal(__C);
+  __m256_internal ret;
+  ret.v0 = _mm_fmadd_ps(a.v0, b.v0, c.v0);
+  ret.v1 = _mm_fmadd_ps(a.v1, b.v1, c.v1);
+  return __m256_from_internal(ret);
+}
+
+static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C) {
+  __m256_internal a = __m256_to_internal(__A);
+  __m256_internal b = __m256_to_internal(__B);
+  __m256_internal c = __m256_to_internal(__C);
+  __m256_internal ret;
+  ret.v0 = _mm_fmsub_ps(a.v0, b.v0, c.v0);
+  ret.v1 = _mm_fmsub_ps(a.v1, b.v1, c.v1);
+  return __m256_from_internal(ret);
+}
+
+static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C) {
+  __m256_internal a = __m256_to_internal(__A);
+  __m256_internal b = __m256_to_internal(__B);
+  __m256_internal c = __m256_to_internal(__C);
+  __m256_internal ret;
+  ret.v0 = _mm_fnmadd_ps(a.v0, b.v0, c.v0);
+  ret.v1 = _mm_fnmadd_ps(a.v1, b.v1, c.v1);
+  return __m256_from_internal(ret);
+}
+
+static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C) {
+  __m256_internal a = __m256_to_internal(__A);
+  __m256_internal b = __m256_to_internal(__B);
+  __m256_internal c = __m256_to_internal(__C);
+  __m256_internal ret;
+  ret.v0 = _mm_fnmsub_ps(a.v0, b.v0, c.v0);
+  ret.v1 = _mm_fnmsub_ps(a.v1, b.v1, c.v1);
+  return __m256_from_internal(ret);
+}
+
+static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C) {
+  __m256_internal a = __m256_to_internal(__A);
+  __m256_internal b = __m256_to_internal(__B);
+  __m256_internal c = __m256_to_internal(__C);
+  __m256_internal ret;
+  ret.v0 = _mm_fmaddsub_ps(a.v0, b.v0, c.v0);
+  ret.v1 = _mm_fmaddsub_ps(a.v1, b.v1, c.v1);
+  return __m256_from_internal(ret);
+}
+
+static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C) {
+  __m256_internal a = __m256_to_internal(__A);
+  __m256_internal b = __m256_to_internal(__B);
+  __m256_internal c = __m256_to_internal(__C);
+  __m256_internal ret;
+  ret.v0 = _mm_fmsubadd_ps(a.v0, b.v0, c.v0);
+  ret.v1 = _mm_fmsubadd_ps(a.v1, b.v1, c.v1);
+  return __m256_from_internal(ret);
+}
+
+/* ============================================================
+ * 256-bit packed double (pd) — 4x double
+ * ============================================================ */
+
+static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C) {
+  __m256d_internal a = __m256d_to_internal(__A);
+  __m256d_internal b = __m256d_to_internal(__B);
+  __m256d_internal c = __m256d_to_internal(__C);
+  __m256d_internal ret;
+  ret.v0 = _mm_fmadd_pd(a.v0, b.v0, c.v0);
+  ret.v1 = _mm_fmadd_pd(a.v1, b.v1, c.v1);
+  return __m256d_from_internal(ret);
+}
+
+static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C) {
+  __m256d_internal a = __m256d_to_internal(__A);
+  __m256d_internal b = __m256d_to_internal(__B);
+  __m256d_internal c = __m256d_to_internal(__C);
+  __m256d_internal ret;
+  ret.v0 = _mm_fmsub_pd(a.v0, b.v0, c.v0);
+  ret.v1 = _mm_fmsub_pd(a.v1, b.v1, c.v1);
+  return __m256d_from_internal(ret);
+}
+
+static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C) {
+  __m256d_internal a = __m256d_to_internal(__A);
+  __m256d_internal b = __m256d_to_internal(__B);
+  __m256d_internal c = __m256d_to_internal(__C);
+  __m256d_internal ret;
+  ret.v0 = _mm_fnmadd_pd(a.v0, b.v0, c.v0);
+  ret.v1 = _mm_fnmadd_pd(a.v1, b.v1, c.v1);
+  return __m256d_from_internal(ret);
+}
+
+static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C) {
+  __m256d_internal a = __m256d_to_internal(__A);
+  __m256d_internal b = __m256d_to_internal(__B);
+  __m256d_internal c = __m256d_to_internal(__C);
+  __m256d_internal ret;
+  ret.v0 = _mm_fnmsub_pd(a.v0, b.v0, c.v0);
+  ret.v1 = _mm_fnmsub_pd(a.v1, b.v1, c.v1);
+  return __m256d_from_internal(ret);
+}
+
+static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C) {
+  __m256d_internal a = __m256d_to_internal(__A);
+  __m256d_internal b = __m256d_to_internal(__B);
+  __m256d_internal c = __m256d_to_internal(__C);
+  __m256d_internal ret;
+  ret.v0 = _mm_fmaddsub_pd(a.v0, b.v0, c.v0);
+  ret.v1 = _mm_fmaddsub_pd(a.v1, b.v1, c.v1);
+  return __m256d_from_internal(ret);
+}
+
+static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C) {
+  __m256d_internal a = __m256d_to_internal(__A);
+  __m256d_internal b = __m256d_to_internal(__B);
+  __m256d_internal c = __m256d_to_internal(__C);
+  __m256d_internal ret;
+  ret.v0 = _mm_fmsubadd_pd(a.v0, b.v0, c.v0);
+  ret.v1 = _mm_fmsubadd_pd(a.v1, b.v1, c.v1);
+  return __m256d_from_internal(ret);
+}
+
+#endif /* __AVX__ */
+
+#endif /* __emscripten_fmaintrin_h__ */
diff --git a/system/include/compat/immintrin.h b/system/include/compat/immintrin.h
index c0ef3e73e528a..ac200ddc83a52 100644
--- a/system/include/compat/immintrin.h
+++ b/system/include/compat/immintrin.h
@@ -39,4 +39,8 @@
 #include <avx2intrin.h>
 #endif
 
+#ifdef __FMA__
+#include <fmaintrin.h>
+#endif
+
 #endif /* __emscripten_immintrin_h__ */
diff --git a/test/sse/test_fma.cpp b/test/sse/test_fma.cpp
new file mode 100644
index 0000000000000..cfd4c95ad1706
--- /dev/null
+++ b/test/sse/test_fma.cpp
@@ -0,0 +1,498 @@
+/*
+ * Copyright 2026 The Emscripten Authors.  All rights reserved.
+ * Emscripten is available under two separate licenses, the MIT license and the
+ * University of Illinois/NCSA Open Source License.  Both these licenses can be
+ * found in the LICENSE file.
+ *
+ * Self-contained FMA intrinsics test. Unlike other SSE tests, this does NOT
+ * compare against native x86 output because FMA emulation (separate mul+add)
+ * produces different results from true FMA (single rounding) for edge cases
+ * near overflow/underflow boundaries. Instead, we test with well-behaved values
+ * where emulated and true FMA agree exactly.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+#include <math.h>
+#include <stdio.h>
+#include <string.h>
+
+static int tests_passed = 0;
+static int tests_failed = 0;
+
+static void check_f(const char* name, float got, float expected) {
+  if (got == expected || (isnan(got) && isnan(expected))) {
+    tests_passed++;
+  } else {
+    printf("FAIL %s: got %a, expected %a\n", name, got, expected);
+    tests_failed++;
+  }
+}
+
+static void check_d(const char* name, double got, double expected) {
+  if (got == expected || (isnan(got) && isnan(expected))) {
+    tests_passed++;
+  } else {
+    printf("FAIL %s: got %la, expected %la\n", name, got, expected);
+    tests_failed++;
+  }
+}
+
+/* Bit-exact comparison. Unlike check_f/check_d (which use ==), this
+   distinguishes +0.0 from -0.0, so it can verify sign-of-zero results. */
+static void check_f_bits(const char* name, float got, float expected) {
+  if (memcmp(&got, &expected, sizeof(float)) == 0) {
+    tests_passed++;
+  } else {
+    printf("FAIL %s: got %a, expected %a\n", name, got, expected);
+    tests_failed++;
+  }
+}
+
+static void check_d_bits(const char* name, double got, double expected) {
+  if (memcmp(&got, &expected, sizeof(double)) == 0) {
+    tests_passed++;
+  } else {
+    printf("FAIL %s: got %la, expected %la\n", name, got, expected);
+    tests_failed++;
+  }
+}
+
+/* Helper to extract float lanes */
+static void storeu_ps(float* out, __m128 v) { _mm_storeu_ps(out, v); }
+static void storeu_pd(double* out, __m128d v) { _mm_storeu_pd(out, v); }
+
+/* ============================================================
+ * 128-bit packed float tests
+ * ============================================================ */
+void test_128_ps(void) {
+  __m128 a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
+  __m128 b = _mm_set_ps(8.0f, 7.0f, 6.0f, 5.0f);
+  __m128 c = _mm_set_ps(12.0f, 11.0f, 10.0f, 9.0f);
+  float out[4];
+
+  /* fmadd: a*b + c = {1*5+9, 2*6+10, 3*7+11, 4*8+12} = {14, 22, 32, 44} */
+  storeu_ps(out, _mm_fmadd_ps(a, b, c));
+  check_f("fmadd_ps[0]", out[0], 14.0f);
+  check_f("fmadd_ps[1]", out[1], 22.0f);
+  check_f("fmadd_ps[2]", out[2], 32.0f);
+  check_f("fmadd_ps[3]", out[3], 44.0f);
+
+  /* fmsub: a*b - c = {-4, 2, 10, 20} */
+  storeu_ps(out, _mm_fmsub_ps(a, b, c));
+  check_f("fmsub_ps[0]", out[0], -4.0f);
+  check_f("fmsub_ps[1]", out[1], 2.0f);
+  check_f("fmsub_ps[2]", out[2], 10.0f);
+  check_f("fmsub_ps[3]", out[3], 20.0f);
+
+  /* fnmadd: -(a*b) + c = {4, -2, -10, -20} */
+  storeu_ps(out, _mm_fnmadd_ps(a, b, c));
+  check_f("fnmadd_ps[0]", out[0], 4.0f);
+  check_f("fnmadd_ps[1]", out[1], -2.0f);
+  check_f("fnmadd_ps[2]", out[2], -10.0f);
+  check_f("fnmadd_ps[3]", out[3], -20.0f);
+
+  /* fnmsub: -(a*b) - c = {-14, -22, -32, -44} */
+  storeu_ps(out, _mm_fnmsub_ps(a, b, c));
+  check_f("fnmsub_ps[0]", out[0], -14.0f);
+  check_f("fnmsub_ps[1]", out[1], -22.0f);
+  check_f("fnmsub_ps[2]", out[2], -32.0f);
+  check_f("fnmsub_ps[3]", out[3], -44.0f);
+
+  /* fmaddsub: even=a*b-c, odd=a*b+c = {-4, 22, 10, 44} */
+  storeu_ps(out, _mm_fmaddsub_ps(a, b, c));
+  check_f("fmaddsub_ps[0]", out[0], -4.0f);
+  check_f("fmaddsub_ps[1]", out[1], 22.0f);
+  check_f("fmaddsub_ps[2]", out[2], 10.0f);
+  check_f("fmaddsub_ps[3]", out[3], 44.0f);
+
+  /* fmsubadd: even=a*b+c, odd=a*b-c = {14, 2, 32, 20} */
+  storeu_ps(out, _mm_fmsubadd_ps(a, b, c));
+  check_f("fmsubadd_ps[0]", out[0], 14.0f);
+  check_f("fmsubadd_ps[1]", out[1], 2.0f);
+  check_f("fmsubadd_ps[2]", out[2], 32.0f);
+  check_f("fmsubadd_ps[3]", out[3], 20.0f);
+}
+
+/* ============================================================
+ * 128-bit packed double tests
+ * ============================================================ */
+void test_128_pd(void) {
+  __m128d a = _mm_set_pd(2.0, 1.0);
+  __m128d b = _mm_set_pd(4.0, 3.0);
+  __m128d c = _mm_set_pd(6.0, 5.0);
+  double out[2];
+
+  /* fmadd: {1*3+5, 2*4+6} = {8, 14} */
+  storeu_pd(out, _mm_fmadd_pd(a, b, c));
+  check_d("fmadd_pd[0]", out[0], 8.0);
+  check_d("fmadd_pd[1]", out[1], 14.0);
+
+  /* fmsub: {1*3-5, 2*4-6} = {-2, 2} */
+  storeu_pd(out, _mm_fmsub_pd(a, b, c));
+  check_d("fmsub_pd[0]", out[0], -2.0);
+  check_d("fmsub_pd[1]", out[1], 2.0);
+
+  /* fnmadd: {-(1*3)+5, -(2*4)+6} = {2, -2} */
+  storeu_pd(out, _mm_fnmadd_pd(a, b, c));
+  check_d("fnmadd_pd[0]", out[0], 2.0);
+  check_d("fnmadd_pd[1]", out[1], -2.0);
+
+  /* fnmsub: {-(1*3)-5, -(2*4)-6} = {-8, -14} */
+  storeu_pd(out, _mm_fnmsub_pd(a, b, c));
+  check_d("fnmsub_pd[0]", out[0], -8.0);
+  check_d("fnmsub_pd[1]", out[1], -14.0);
+
+  /* fmaddsub: {1*3-5, 2*4+6} = {-2, 14} */
+  storeu_pd(out, _mm_fmaddsub_pd(a, b, c));
+  check_d("fmaddsub_pd[0]", out[0], -2.0);
+  check_d("fmaddsub_pd[1]", out[1], 14.0);
+
+  /* fmsubadd: {1*3+5, 2*4-6} = {8, 2} */
+  storeu_pd(out, _mm_fmsubadd_pd(a, b, c));
+  check_d("fmsubadd_pd[0]", out[0], 8.0);
+  check_d("fmsubadd_pd[1]", out[1], 2.0);
+}
+
+/* ============================================================
+ * Scalar float tests — only element 0, upper from a
+ * ============================================================ */
+void test_scalar_ss(void) {
+  __m128 a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
+  __m128 b = _mm_set_ps(8.0f, 7.0f, 6.0f, 5.0f);
+  __m128 c = _mm_set_ps(12.0f, 11.0f, 10.0f, 9.0f);
+  float out[4];
+
+  /* fmadd_ss: elem0 = 1*5+9 = 14, upper = {2, 3, 4} from a */
+  storeu_ps(out, _mm_fmadd_ss(a, b, c));
+  check_f("fmadd_ss[0]", out[0], 14.0f);
+  check_f("fmadd_ss[1]", out[1], 2.0f);
+  check_f("fmadd_ss[2]", out[2], 3.0f);
+  check_f("fmadd_ss[3]", out[3], 4.0f);
+
+  /* fmsub_ss: elem0 = 1*5-9 = -4 */
+  storeu_ps(out, _mm_fmsub_ss(a, b, c));
+  check_f("fmsub_ss[0]", out[0], -4.0f);
+  check_f("fmsub_ss[1]", out[1], 2.0f);
+
+  /* fnmadd_ss: elem0 = -(1*5)+9 = 4 */
+  storeu_ps(out, _mm_fnmadd_ss(a, b, c));
+  check_f("fnmadd_ss[0]", out[0], 4.0f);
+  check_f("fnmadd_ss[1]", out[1], 2.0f);
+
+  /* fnmsub_ss: elem0 = -(1*5)-9 = -14 */
+  storeu_ps(out, _mm_fnmsub_ss(a, b, c));
+  check_f("fnmsub_ss[0]", out[0], -14.0f);
+  check_f("fnmsub_ss[1]", out[1], 2.0f);
+}
+
+/* ============================================================
+ * Scalar double tests — only element 0, upper from a
+ * ============================================================ */
+void test_scalar_sd(void) {
+  __m128d a = _mm_set_pd(2.0, 1.0);
+  __m128d b = _mm_set_pd(4.0, 3.0);
+  __m128d c = _mm_set_pd(6.0, 5.0);
+  double out[2];
+
+  /* fmadd_sd: elem0 = 1*3+5 = 8, elem1 = 2 from a */
+  storeu_pd(out, _mm_fmadd_sd(a, b, c));
+  check_d("fmadd_sd[0]", out[0], 8.0);
+  check_d("fmadd_sd[1]", out[1], 2.0);
+
+  /* fmsub_sd: elem0 = 1*3-5 = -2 */
+  storeu_pd(out, _mm_fmsub_sd(a, b, c));
+  check_d("fmsub_sd[0]", out[0], -2.0);
+  check_d("fmsub_sd[1]", out[1], 2.0);
+
+  /* fnmadd_sd: elem0 = -(1*3)+5 = 2 */
+  storeu_pd(out, _mm_fnmadd_sd(a, b, c));
+  check_d("fnmadd_sd[0]", out[0], 2.0);
+  check_d("fnmadd_sd[1]", out[1], 2.0);
+
+  /* fnmsub_sd: elem0 = -(1*3)-5 = -8 */
+  storeu_pd(out, _mm_fnmsub_sd(a, b, c));
+  check_d("fnmsub_sd[0]", out[0], -8.0);
+  check_d("fnmsub_sd[1]", out[1], 2.0);
+}
+
+/* ============================================================
+ * Special values: infinities, NaN, signed zero.
+ *
+ * Every expected value here is identical whether computed by true fused
+ * FMA (single rounding) or by the emulated mul+add path (two roundings),
+ * so this test is valid in both -mrelaxed-simd and -msimd128-only builds.
+ *
+ * To keep that property:
+ *  - Infinities come from actual inf inputs (not from overflow), so the
+ *    product is genuinely infinite in both paths.
+ *  - inf - inf = NaN cases use an exact infinite product.
+ *  - Signed-zero results are produced via multiply sign propagation
+ *    (e.g. -0.0 * 1.0), never via product negation, which the emulated
+ *    path implements as 0 - x and would round differently.
+ * ============================================================ */
+void test_special_ps(void) {
+  const float inf = INFINITY;
+  float out[4];
+
+  /* fmadd: a*b + c
+     lane0: inf*2 + 1        = +inf
+     lane1: 2*3 + (-inf)     = -inf
+     lane2: nan*1 + 5        = nan
+     lane3: inf*1 + (-inf)   = nan  (inf - inf) */
+  __m128 a = _mm_set_ps(inf, NAN, 2.0f, inf);
+  __m128 b = _mm_set_ps(1.0f, 1.0f, 3.0f, 2.0f);
+  __m128 c = _mm_set_ps(-inf, 5.0f, -inf, 1.0f);
+  storeu_ps(out, _mm_fmadd_ps(a, b, c));
+  check_f("special_fmadd_ps[0]", out[0], inf);
+  check_f("special_fmadd_ps[1]", out[1], -inf);
+  check_f("special_fmadd_ps[2]", out[2], NAN);
+  check_f("special_fmadd_ps[3]", out[3], NAN);
+
+  /* fmsub: a*b - c
+     lane0: inf*1 - inf      = nan
+     lane1: 2*3 - 1          = 5
+     lane2: -0.0*1.0 - 0.0   = -0.0  (signed zero)
+     lane3: inf*2 - 5        = +inf */
+  a = _mm_set_ps(inf, -0.0f, 2.0f, inf);
+  b = _mm_set_ps(2.0f, 1.0f, 3.0f, 1.0f);
+  c = _mm_set_ps(5.0f, 0.0f, 1.0f, inf);
+  storeu_ps(out, _mm_fmsub_ps(a, b, c));
+  check_f("special_fmsub_ps[0]", out[0], NAN);
+  check_f("special_fmsub_ps[1]", out[1], 5.0f);
+  check_f_bits("special_fmsub_ps[2]", out[2], -0.0f);
+  check_f("special_fmsub_ps[3]", out[3], inf);
+
+  /* fnmadd: -(a*b) + c
+     lane0: -(inf*1) + inf   = nan
+     lane1: -(2*3) + 1       = -5
+     lane2: -(inf*1) + 0     = -inf
+     lane3: -(2*3) + inf     = +inf */
+  a = _mm_set_ps(2.0f, inf, 2.0f, inf);
+  b = _mm_set_ps(3.0f, 1.0f, 3.0f, 1.0f);
+  c = _mm_set_ps(inf, 0.0f, 1.0f, inf);
+  storeu_ps(out, _mm_fnmadd_ps(a, b, c));
+  check_f("special_fnmadd_ps[0]", out[0], NAN);
+  check_f("special_fnmadd_ps[1]", out[1], -5.0f);
+  check_f("special_fnmadd_ps[2]", out[2], -inf);
+  check_f("special_fnmadd_ps[3]", out[3], inf);
+
+  /* fnmsub: -(a*b) - c
+     lane0: -(2*3) - 1       = -7
+     lane1: -(inf*1) - 1     = -inf
+     lane2: -(inf*1) - inf   = -inf
+     lane3: -(nan*1) - 1     = nan */
+  a = _mm_set_ps(NAN, inf, inf, 2.0f);
+  b = _mm_set_ps(1.0f, 1.0f, 1.0f, 3.0f);
+  c = _mm_set_ps(1.0f, inf, 1.0f, 1.0f);
+  storeu_ps(out, _mm_fnmsub_ps(a, b, c));
+  check_f("special_fnmsub_ps[0]", out[0], -7.0f);
+  check_f("special_fnmsub_ps[1]", out[1], -inf);
+  check_f("special_fnmsub_ps[2]", out[2], -inf);
+  check_f("special_fnmsub_ps[3]", out[3], NAN);
+}
+
+void test_special_pd(void) {
+  const double inf = INFINITY;
+  double out[2];
+
+  /* fmadd: lane0: -0.0*1.0 + (-0.0) = -0.0 (signed zero)
+            lane1: inf*1 + (-inf)    = nan */
+  __m128d a = _mm_set_pd(inf, -0.0);
+  __m128d b = _mm_set_pd(1.0, 1.0);
+  __m128d c = _mm_set_pd(-inf, -0.0);
+  storeu_pd(out, _mm_fmadd_pd(a, b, c));
+  check_d_bits("special_fmadd_pd[0]", out[0], -0.0);
+  check_d("special_fmadd_pd[1]", out[1], NAN);
+
+  /* fnmadd: lane0: -(inf*1) + 0 = -inf
+             lane1: -(2*3) + 1   = -5 */
+  a = _mm_set_pd(2.0, inf);
+  b = _mm_set_pd(3.0, 1.0);
+  c = _mm_set_pd(1.0, 0.0);
+  storeu_pd(out, _mm_fnmadd_pd(a, b, c));
+  check_d("special_fnmadd_pd[0]", out[0], -inf);
+  check_d("special_fnmadd_pd[1]", out[1], -5.0);
+}
+
+void test_special_scalar(void) {
+  const float inf = INFINITY;
+  float out[4];
+
+  /* fmadd_ss: lane0 inf*1 + (-inf) = nan; upper lanes pass through from a */
+  __m128 a = _mm_set_ps(4.0f, 3.0f, 2.0f, inf);
+  __m128 b = _mm_set_ps(0.0f, 0.0f, 0.0f, 1.0f);
+  __m128 c = _mm_set_ps(0.0f, 0.0f, 0.0f, -inf);
+  storeu_ps(out, _mm_fmadd_ss(a, b, c));
+  check_f("special_fmadd_ss[0]", out[0], NAN);
+  check_f("special_fmadd_ss[1]", out[1], 2.0f);
+  check_f("special_fmadd_ss[2]", out[2], 3.0f);
+  check_f("special_fmadd_ss[3]", out[3], 4.0f);
+
+  const double dinf = INFINITY;
+  double outd[2];
+
+  /* fnmadd_sd: lane0 -(inf*1) + 0 = -inf; lane1 passes through from a */
+  __m128d ad = _mm_set_pd(7.0, dinf);
+  __m128d bd = _mm_set_pd(0.0, 1.0);
+  __m128d cd = _mm_set_pd(0.0, 0.0);
+  storeu_pd(outd, _mm_fnmadd_sd(ad, bd, cd));
+  check_d("special_fnmadd_sd[0]", outd[0], -dinf);
+  check_d("special_fnmadd_sd[1]", outd[1], 7.0);
+}
+
+/* ============================================================
+ * 256-bit tests
+ * ============================================================ */
+#ifdef __AVX__
+void test_256_ps(void) {
+  __m256 a = _mm256_set_ps(8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f);
+  __m256 b =
+    _mm256_set_ps(16.0f, 15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f);
+  __m256 c =
+    _mm256_set_ps(24.0f, 23.0f, 22.0f, 21.0f, 20.0f, 19.0f, 18.0f, 17.0f);
+  float out[8];
+
+  /* fmadd: a*b+c = {1*9+17, 2*10+18, 3*11+19, 4*12+20, 5*13+21, 6*14+22,
+     7*15+23, 8*16+24} = {26, 38, 52, 68, 86, 106, 128, 152} */
+  _mm256_storeu_ps(out, _mm256_fmadd_ps(a, b, c));
+  check_f("256_fmadd_ps[0]", out[0], 26.0f);
+  check_f("256_fmadd_ps[1]", out[1], 38.0f);
+  check_f("256_fmadd_ps[2]", out[2], 52.0f);
+  check_f("256_fmadd_ps[3]", out[3], 68.0f);
+  check_f("256_fmadd_ps[4]", out[4], 86.0f);
+  check_f("256_fmadd_ps[5]", out[5], 106.0f);
+  check_f("256_fmadd_ps[6]", out[6], 128.0f);
+  check_f("256_fmadd_ps[7]", out[7], 152.0f);
+
+  /* fmsub: a*b-c */
+  _mm256_storeu_ps(out, _mm256_fmsub_ps(a, b, c));
+  check_f("256_fmsub_ps[0]", out[0], -8.0f);  /* 1*9-17 */
+  check_f("256_fmsub_ps[7]", out[7], 104.0f); /* 8*16-24 */
+
+  /* fnmadd: -(a*b)+c */
+  _mm256_storeu_ps(out, _mm256_fnmadd_ps(a, b, c));
+  check_f("256_fnmadd_ps[0]", out[0], 8.0f);    /* -(1*9)+17 */
+  check_f("256_fnmadd_ps[7]", out[7], -104.0f); /* -(8*16)+24 */
+
+  /* fnmsub: -(a*b)-c */
+  _mm256_storeu_ps(out, _mm256_fnmsub_ps(a, b, c));
+  check_f("256_fnmsub_ps[0]", out[0], -26.0f);  /* -(1*9)-17 */
+  check_f("256_fnmsub_ps[7]", out[7], -152.0f); /* -(8*16)-24 */
+
+  /* fmaddsub: even=a*b-c, odd=a*b+c */
+  _mm256_storeu_ps(out, _mm256_fmaddsub_ps(a, b, c));
+  check_f("256_fmaddsub_ps[0]", out[0], -8.0f); /* even: sub */
+  check_f("256_fmaddsub_ps[1]", out[1], 38.0f); /* odd: add */
+  check_f("256_fmaddsub_ps[2]", out[2], 14.0f); /* even: sub */
+  check_f("256_fmaddsub_ps[3]", out[3], 68.0f); /* odd: add */
+
+  /* fmsubadd: even=a*b+c, odd=a*b-c */
+  _mm256_storeu_ps(out, _mm256_fmsubadd_ps(a, b, c));
+  check_f("256_fmsubadd_ps[0]", out[0], 26.0f); /* even: add */
+  check_f("256_fmsubadd_ps[1]", out[1], 2.0f);  /* odd: sub */
+}
+
+void test_256_pd(void) {
+  __m256d a = _mm256_set_pd(4.0, 3.0, 2.0, 1.0);
+  __m256d b = _mm256_set_pd(8.0, 7.0, 6.0, 5.0);
+  __m256d c = _mm256_set_pd(12.0, 11.0, 10.0, 9.0);
+  double out[4];
+
+  /* fmadd: {1*5+9, 2*6+10, 3*7+11, 4*8+12} = {14, 22, 32, 44} */
+  _mm256_storeu_pd(out, _mm256_fmadd_pd(a, b, c));
+  check_d("256_fmadd_pd[0]", out[0], 14.0);
+  check_d("256_fmadd_pd[1]", out[1], 22.0);
+  check_d("256_fmadd_pd[2]", out[2], 32.0);
+  check_d("256_fmadd_pd[3]", out[3], 44.0);
+
+  /* fmsub */
+  _mm256_storeu_pd(out, _mm256_fmsub_pd(a, b, c));
+  check_d("256_fmsub_pd[0]", out[0], -4.0);
+  check_d("256_fmsub_pd[3]", out[3], 20.0);
+
+  /* fnmadd */
+  _mm256_storeu_pd(out, _mm256_fnmadd_pd(a, b, c));
+  check_d("256_fnmadd_pd[0]", out[0], 4.0);
+
+  /* fnmsub */
+  _mm256_storeu_pd(out, _mm256_fnmsub_pd(a, b, c));
+  check_d("256_fnmsub_pd[0]", out[0], -14.0);
+
+  /* fmaddsub: even=a*b-c, odd=a*b+c */
+  _mm256_storeu_pd(out, _mm256_fmaddsub_pd(a, b, c));
+  check_d("256_fmaddsub_pd[0]", out[0], -4.0); /* even: sub */
+  check_d("256_fmaddsub_pd[1]", out[1], 22.0); /* odd: add */
+
+  /* fmsubadd: even=a*b+c, odd=a*b-c */
+  _mm256_storeu_pd(out, _mm256_fmsubadd_pd(a, b, c));
+  check_d("256_fmsubadd_pd[0]", out[0], 14.0); /* even: add */
+  check_d("256_fmsubadd_pd[1]", out[1], 2.0);  /* odd: sub */
+}
+#endif
+
+/* ============================================================
+ * Additional edge case tests with fractional values
+ * ============================================================ */
+void test_fractional(void) {
+  /* Use values that exercise non-trivial math but stay in exact float range */
+  __m128 a = _mm_set_ps(0.5f, -1.5f, 2.25f, -0.75f);
+  __m128 b = _mm_set_ps(4.0f, -2.0f, 0.5f, 8.0f);
+  __m128 c = _mm_set_ps(1.0f, 3.0f, -1.0f, 0.5f);
+  float out[4];
+
+  /* fmadd: {-0.75*8+0.5, 2.25*0.5-1, -1.5*-2+3, 0.5*4+1}
+           = {-5.5, 0.125, 6, 3} */
+  storeu_ps(out, _mm_fmadd_ps(a, b, c));
+  check_f("frac_fmadd[0]", out[0], -5.5f);
+  check_f("frac_fmadd[1]", out[1], 0.125f);
+  check_f("frac_fmadd[2]", out[2], 6.0f);
+  check_f("frac_fmadd[3]", out[3], 3.0f);
+
+  __m128d ad = _mm_set_pd(-0.125, 3.5);
+  __m128d bd = _mm_set_pd(16.0, 2.0);
+  __m128d cd = _mm_set_pd(1.0, -3.0);
+  double outd[2];
+
+  /* fmadd: {3.5*2-3, -0.125*16+1} = {4, -1} */
+  storeu_pd(outd, _mm_fmadd_pd(ad, bd, cd));
+  check_d("frac_fmadd_pd[0]", outd[0], 4.0);
+  check_d("frac_fmadd_pd[1]", outd[1], -1.0);
+}
+
+int main() {
+  printf("Testing 128-bit packed float (ps)...\n");
+  test_128_ps();
+
+  printf("Testing 128-bit packed double (pd)...\n");
+  test_128_pd();
+
+  printf("Testing scalar float (ss)...\n");
+  test_scalar_ss();
+
+  printf("Testing scalar double (sd)...\n");
+  test_scalar_sd();
+
+#ifdef __AVX__
+  printf("Testing 256-bit packed float (ps)...\n");
+  test_256_ps();
+
+  printf("Testing 256-bit packed double (pd)...\n");
+  test_256_pd();
+#endif
+
+  printf("Testing fractional values...\n");
+  test_fractional();
+
+  printf("Testing special values (inf, nan, signed zero)...\n");
+  test_special_ps();
+  test_special_pd();
+  test_special_scalar();
+
+  printf("\nResults: %d passed, %d failed\n", tests_passed, tests_failed);
+  if (tests_failed == 0) {
+    printf("All FMA tests PASSED\n");
+  }
+  assert(tests_failed == 0);
+  return 0;
+}
diff --git a/test/sse/test_fma_relaxed.cpp b/test/sse/test_fma_relaxed.cpp
new file mode 100644
index 0000000000000..2717ed76d6865
--- /dev/null
+++ b/test/sse/test_fma_relaxed.cpp
@@ -0,0 +1,103 @@
+/*
+ * Copyright 2026 The Emscripten Authors.  All rights reserved.
+ * Emscripten is available under two separate licenses, the MIT license and the
+ * University of Illinois/NCSA Open Source License.  Both these licenses can be
+ * found in the LICENSE file.
+ */
+// Exhaustive FMA test — compare against native x86 output.
+// This test should be run with -mrelaxed-simd, and on an x86 host, so that
+// Wasm relaxed SIMD FMA lowers to a hardware fused multiply-add matching
+// native x86 FMA behavior bit-for-bit.
+
+// immintrin.h must be included before test_sse.h
+// clang-format off
+#include <immintrin.h>
+#include "test_sse.h"
+// clang-format on
+
+bool testNaNBits = false;
+
+float* interesting_floats = get_interesting_floats();
+int numInterestingFloats =
+  sizeof(interesting_floats_) / sizeof(interesting_floats_[0]);
+uint32_t* interesting_ints = get_interesting_ints();
+int numInterestingInts =
+  sizeof(interesting_ints_) / sizeof(interesting_ints_[0]);
+double* interesting_doubles = get_interesting_doubles();
+int numInterestingDoubles =
+  sizeof(interesting_doubles_) / sizeof(interesting_doubles_[0]);
+
+void test_fmadd(void) {
+  Ret_M128_M128_M128(__m128, _mm_fmadd_ps);
+  Ret_M128d_M128d_M128d(__m128d, _mm_fmadd_pd);
+  Ret_M128_M128_M128(__m128, _mm_fmadd_ss);
+  Ret_M128d_M128d_M128d(__m128d, _mm_fmadd_sd);
+#ifdef __AVX__
+  Ret_M256_M256_M256(__m256, _mm256_fmadd_ps);
+  Ret_M256d_M256d_M256d(__m256d, _mm256_fmadd_pd);
+#endif
+}
+
+void test_fmsub(void) {
+  Ret_M128_M128_M128(__m128, _mm_fmsub_ps);
+  Ret_M128d_M128d_M128d(__m128d, _mm_fmsub_pd);
+  Ret_M128_M128_M128(__m128, _mm_fmsub_ss);
+  Ret_M128d_M128d_M128d(__m128d, _mm_fmsub_sd);
+#ifdef __AVX__
+  Ret_M256_M256_M256(__m256, _mm256_fmsub_ps);
+  Ret_M256d_M256d_M256d(__m256d, _mm256_fmsub_pd);
+#endif
+}
+
+void test_fnmadd(void) {
+  Ret_M128_M128_M128(__m128, _mm_fnmadd_ps);
+  Ret_M128d_M128d_M128d(__m128d, _mm_fnmadd_pd);
+  Ret_M128_M128_M128(__m128, _mm_fnmadd_ss);
+  Ret_M128d_M128d_M128d(__m128d, _mm_fnmadd_sd);
+#ifdef __AVX__
+  Ret_M256_M256_M256(__m256, _mm256_fnmadd_ps);
+  Ret_M256d_M256d_M256d(__m256d, _mm256_fnmadd_pd);
+#endif
+}
+
+void test_fnmsub(void) {
+  Ret_M128_M128_M128(__m128, _mm_fnmsub_ps);
+  Ret_M128d_M128d_M128d(__m128d, _mm_fnmsub_pd);
+  Ret_M128_M128_M128(__m128, _mm_fnmsub_ss);
+  Ret_M128d_M128d_M128d(__m128d, _mm_fnmsub_sd);
+#ifdef __AVX__
+  Ret_M256_M256_M256(__m256, _mm256_fnmsub_ps);
+  Ret_M256d_M256d_M256d(__m256d, _mm256_fnmsub_pd);
+#endif
+}
+
+void test_fmaddsub(void) {
+  Ret_M128_M128_M128(__m128, _mm_fmaddsub_ps);
+  Ret_M128d_M128d_M128d(__m128d, _mm_fmaddsub_pd);
+#ifdef __AVX__
+  Ret_M256_M256_M256(__m256, _mm256_fmaddsub_ps);
+  Ret_M256d_M256d_M256d(__m256d, _mm256_fmaddsub_pd);
+#endif
+}
+
+void test_fmsubadd(void) {
+  Ret_M128_M128_M128(__m128, _mm_fmsubadd_ps);
+  Ret_M128d_M128d_M128d(__m128d, _mm_fmsubadd_pd);
+#ifdef __AVX__
+  Ret_M256_M256_M256(__m256, _mm256_fmsubadd_ps);
+  Ret_M256d_M256d_M256d(__m256d, _mm256_fmsubadd_pd);
+#endif
+}
+
+int main() {
+  assert(numInterestingFloats % 8 == 0);
+  assert(numInterestingInts % 8 == 0);
+  assert(numInterestingDoubles % 4 == 0);
+
+  test_fmadd();
+  test_fmsub();
+  test_fnmadd();
+  test_fnmsub();
+  test_fmaddsub();
+  test_fmsubadd();
+}
diff --git a/test/test_core.py b/test/test_core.py
index 27f3423460df7..0986590836852 100644
--- a/test/test_core.py
+++ b/test/test_core.py
@@ -6763,6 +6763,30 @@ def test_avx2(self, args):
     self.maybe_closure()
     self.do_runf(src, native_result)
 
+  @wasm_simd
+  @no_big_endian('SIMD support is currently not compatible with big endian')
+  def test_fma(self):
+    src = test_file('sse/test_fma.cpp')
+    self.cflags += ['-mavx', '-mfma', '-sSTACK_SIZE=1MB']
+    self.do_runf(src, 'All FMA tests PASSED')
+
+  # Exhaustive FMA test with relaxed SIMD — compared against native x86.
+  # With -mrelaxed-simd on an x86 host, Wasm relaxed SIMD FMA lowers to a
+  # hardware fused operation that matches native x86 FMA bit-for-bit. This
+  # test is gated to x64 CPUs (@requires_x64_cpu) to guarantee that fusion.
+  @wasm_relaxed_simd
+  @requires_native_clang
+  @requires_x64_cpu
+  @is_slow_test
+  @no_big_endian('SIMD support is currently not compatible with big endian')
+  def test_fma_relaxed(self):
+    src = test_file('sse/test_fma_relaxed.cpp')
+    self.run_process([shared.CLANG_CXX, src, '-mfma', '-mavx', '-o', 'test_fma_relaxed', '-D_CRT_SECURE_NO_WARNINGS=1'] + clang_native.get_clang_native_args(), stdout=PIPE)
+    native_result = self.run_process('./test_fma_relaxed', stdout=PIPE).stdout
+
+    self.cflags += ['-I' + test_file('sse'), '-mavx', '-mfma', '-sSTACK_SIZE=1MB']
+    self.do_runf(src, native_result)
+
   @wasm_simd
   def test_sse_diagnostics(self):
     self.cflags.remove('-Werror')
diff --git a/test/test_other.py b/test/test_other.py
index cdaa5ef5b5aee..92ec66048ced2 100644
--- a/test/test_other.py
+++ b/test/test_other.py
@@ -9050,8 +9050,8 @@ def test_standalone_system_headers(self, prefix):
         print('header: ' + header)
         # These headers cannot be included in isolation.
         # e.g: error: unknown type name 'EGLDisplay'
-        # Don't include avxintrin.h and avx2inrin.h directly, include immintrin.h instead
-        if header in {'eglext.h', 'SDL_config_macosx.h', 'glext.h', 'gl2ext.h', 'avxintrin.h', 'avx2intrin.h'}:
+        # Don't include avxintrin.h, avx2intrin.h, fmaintrin.h directly, include immintrin.h instead
+        if header in {'eglext.h', 'SDL_config_macosx.h', 'glext.h', 'gl2ext.h', 'avxintrin.h', 'avx2intrin.h', 'fmaintrin.h'}:
           continue
         # These headers are C++ only and cannot be included from C code.
         # But we still want to check they can be included on there own without
@@ -9066,7 +9066,7 @@ def test_standalone_system_headers(self, prefix):
         inc = f'#include <{header}>\n__attribute__((weak)) int foo;\n'
         cflags = ['-Werror', '-Wall', '-pedantic', '-msimd128', '-msse4']
         if header == 'immintrin.h':
-          cflags.append('-mavx2')
+          cflags += ['-mavx2', '-mfma']
         if cxx_only:
           create_file('a.cxx', inc)
           create_file('b.cxx', inc)
diff --git a/tools/cmdline.py b/tools/cmdline.py
index 4b38e18ea0081..4b59683136c31 100644
--- a/tools/cmdline.py
+++ b/tools/cmdline.py
@@ -26,7 +26,7 @@
 from tools.toolchain_profiler import ToolchainProfiler
 from tools.utils import exit_with_error, read_file
 
-SIMD_INTEL_FEATURE_TOWER = ['-msse', '-msse2', '-msse3', '-mssse3', '-msse4.1', '-msse4.2', '-msse4', '-mavx', '-mavx2']
+SIMD_INTEL_FEATURE_TOWER = ['-msse', '-msse2', '-msse3', '-mssse3', '-msse4.1', '-msse4.2', '-msse4', '-mavx', '-mavx2', '-mfma']
 SIMD_NEON_FLAGS = ['-mfpu=neon']
 CLANG_FLAGS_WITH_ARGS = {
     '-MT', '-MF', '-MJ', '-MQ', '-D', '-U', '-o', '-x',
diff --git a/tools/compile.py b/tools/compile.py
index 75329d22de15a..257ef3aabf03e 100644
--- a/tools/compile.py
+++ b/tools/compile.py
@@ -130,6 +130,9 @@ def array_contains_any_of(hay, needles):
   if array_contains_any_of(user_args, SIMD_INTEL_FEATURE_TOWER[8:]):
     cflags += ['-D__AVX2__=1']
 
+  if array_contains_any_of(user_args, SIMD_INTEL_FEATURE_TOWER[9:]):
+    cflags += ['-D__FMA__=1']
+
   if array_contains_any_of(user_args, SIMD_NEON_FLAGS):
     cflags += ['-D__ARM_NEON__=1']