diff --git a/ChangeLog.md b/ChangeLog.md index b8616eece19ef..979f7bfc33e3a 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -20,6 +20,9 @@ See docs/process.md for more on how version tagging works. 3.1.68 (in development) ----------------------- +- Added support for compiling 256-bit wide AVX intrinsics, emulated on top + of the 128-bit Wasm SIMD instruction set (#22430). Pass `-msimd128 -mavx` to + enable targeting AVX. - Pthread-based programs no longer generates `.worker.js` file. This file was made redundant back in 3.1.58 and now is completely removed. (#22598) - The freetype port was updated from v2.6 to v2.13.3. (#22585) diff --git a/site/source/docs/porting/simd.rst b/site/source/docs/porting/simd.rst index 7c67059f8ff8e..f5c12ff597509 100644 --- a/site/source/docs/porting/simd.rst +++ b/site/source/docs/porting/simd.rst @@ -12,7 +12,7 @@ Emscripten supports the `WebAssembly SIMD 1. Enable LLVM/Clang SIMD autovectorizer to automatically target WebAssembly SIMD, without requiring changes to C/C++ source code. 2. Write SIMD code using the GCC/Clang SIMD Vector Extensions (``__attribute__((vector_size(16)))``) 3. Write SIMD code using the WebAssembly SIMD intrinsics (``#include <wasm_simd128.h>``) -4. Compile existing SIMD code that uses the x86 SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2 or 128-bit subset of the AVX intrinsics (``#include <*mmintrin.h>``) +4. Compile existing SIMD code that uses the x86 SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2 or AVX intrinsics (``#include <*mmintrin.h>``) 5. Compile existing SIMD code that uses the ARM NEON intrinsics (``#include <arm_neon.h>``) These techniques can be freely combined in a single program. @@ -153,7 +153,7 @@ Emscripten supports compiling existing codebases that use x86 SSE instructions b * **SSE4.2**: pass ``-msse4.2`` and ``#include <nmmintrin.h>``. Use ``#ifdef __SSE4_2__`` to gate code. * **AVX**: pass ``-mavx`` and ``#include <immintrin.h>``. Use ``#ifdef __AVX__`` to gate code. -Currently only the SSE1, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, and 128-bit AVX instruction sets are supported. Each of these instruction sets add on top of the previous ones, so e.g. when targeting SSE3, the instruction sets SSE1 and SSE2 are also available. +Currently only the SSE1, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, and AVX instruction sets are supported. Each of these instruction sets adds on top of the previous ones, so e.g. when targeting SSE3, the instruction sets SSE1 and SSE2 are also available. The following tables highlight the availability and expected performance of different SSE* intrinsics. This can be useful for understanding the performance limitations that the Wasm SIMD specification has when running on x86 hardware. @@ -1136,7 +1136,7 @@ The following table highlights the availability and expected performance of diff * - _mm_testz_ps - 💣 emulated with complex SIMD+scalar sequence -Only the 128-bit wide instructions from AVX instruction set are available. 256-bit wide AVX instructions are not provided. +Only the 128-bit wide instructions from the AVX instruction set are listed. The 256-bit wide AVX instructions are emulated with pairs of 128-bit wide instructions.
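For reference, a minimal usage sketch of the new 256-bit emulation path (the file name and full build line are illustrative; only `-msimd128 -mavx` comes from the ChangeLog entry above). Every `_mm256_*` intrinsic used here is provided by the updated `avxintrin.h` in the diff below and lowers to a pair of 128-bit Wasm SIMD operations on the `v0`/`v1` halves of the emulated 256-bit type:

    // avx_demo.c - illustrative only, not part of this patch.
    // Assumed build line: emcc -O2 -msimd128 -mavx avx_demo.c -o avx_demo.js
    #include <immintrin.h>
    #include <stdio.h>

    int main(void) {
    #ifdef __AVX__
      __m256d a = _mm256_set_pd(4.0, 3.0, 2.0, 1.0); // arguments run highest to lowest lane
      __m256d b = _mm256_set1_pd(10.0);
      __m256d sum = _mm256_add_pd(a, b);             // emulated as two 128-bit f64x2 adds
      double out[4];
      _mm256_storeu_pd(out, sum);
      printf("%f %f %f %f\n", out[0], out[1], out[2], out[3]); // 11 12 13 14
    #endif
      return 0;
    }
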
====================================================== diff --git a/system/include/compat/avxintrin.h b/system/include/compat/avxintrin.h index 50e2e7d130abb..09c9a3e8bf6a0 100644 --- a/system/include/compat/avxintrin.h +++ b/system/include/compat/avxintrin.h @@ -11,14 +11,500 @@ #error "AVX instruction set not enabled" #endif +#include #include +#include +#include +#include +#include + +typedef struct { + __m128d v0; + __m128d v1; +} __m256d; + +typedef struct { + __m128 v0; + __m128 v1; +} __m256; + +typedef struct { + __m128i v0; + __m128i v1; +} __m256i; + +typedef int64_t __m128i_u __attribute__((__vector_size__(16), __aligned__(1))); + +typedef struct { + __m128i_u v0; + __m128i_u v1; +} __m256i_u; + +union m256_data { + __m256i int_view; + __m256d double_view; + __m256 float_view; + __m128i_u int_u_view; +}; + +static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) +_mm256_add_pd(__m256d __a, __m256d __b) { + __m256d ret; + ret.v0 = _mm_add_pd(__a.v0, __b.v0); + ret.v1 = _mm_add_pd(__a.v1, __b.v1); + return ret; +} + +static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) +_mm256_add_ps(__m256 __a, __m256 __b) { + __m256 ret; + ret.v0 = _mm_add_ps(__a.v0, __b.v0); + ret.v1 = _mm_add_ps(__a.v1, __b.v1); + return ret; +} + +static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) +_mm256_sub_pd(__m256d __a, __m256d __b) { + __m256d ret; + ret.v0 = _mm_sub_pd(__a.v0, __b.v0); + ret.v1 = _mm_sub_pd(__a.v1, __b.v1); + return ret; +} + +static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) +_mm256_sub_ps(__m256 __a, __m256 __b) { + __m256 ret; + ret.v0 = _mm_sub_ps(__a.v0, __b.v0); + ret.v1 = _mm_sub_ps(__a.v1, __b.v1); + return ret; +} + +static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) +_mm256_addsub_pd(__m256d __a, __m256d __b) { + __m256d ret; + ret.v0 = _mm_addsub_pd(__a.v0, __b.v0); + ret.v1 = _mm_addsub_pd(__a.v1, __b.v1); + return ret; +} + +static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) +_mm256_addsub_ps(__m256 __a, __m256 __b) { + __m256 ret; + ret.v0 = _mm_addsub_ps(__a.v0, __b.v0); + ret.v1 = _mm_addsub_ps(__a.v1, __b.v1); + return ret; +} + +static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) +_mm256_div_pd(__m256d __a, __m256d __b) { + __m256d ret; + ret.v0 = _mm_div_pd(__a.v0, __b.v0); + ret.v1 = _mm_div_pd(__a.v1, __b.v1); + return ret; +} + +static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) +_mm256_div_ps(__m256 __a, __m256 __b) { + __m256 ret; + ret.v0 = _mm_div_ps(__a.v0, __b.v0); + ret.v1 = _mm_div_ps(__a.v1, __b.v1); + return ret; +} + +static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) +_mm256_max_pd(__m256d __a, __m256d __b) { + __m256d ret; + ret.v0 = _mm_max_pd(__a.v0, __b.v0); + ret.v1 = _mm_max_pd(__a.v1, __b.v1); + return ret; +} + +static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) +_mm256_max_ps(__m256 __a, __m256 __b) { + __m256 ret; + ret.v0 = _mm_max_ps(__a.v0, __b.v0); + ret.v1 = _mm_max_ps(__a.v1, __b.v1); + return ret; +} + +static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) +_mm256_min_pd(__m256d __a, __m256d __b) { + __m256d ret; + ret.v0 = _mm_min_pd(__a.v0, __b.v0); + ret.v1 = _mm_min_pd(__a.v1, __b.v1); + return ret; +} + +static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) +_mm256_min_ps(__m256 __a, __m256 __b) { + __m256 ret; + ret.v0 = _mm_min_ps(__a.v0, __b.v0); + ret.v1 = _mm_min_ps(__a.v1, 
__b.v1); + return ret; +} + +static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) +_mm256_mul_pd(__m256d __a, __m256d __b) { + __m256d ret; + ret.v0 = _mm_mul_pd(__a.v0, __b.v0); + ret.v1 = _mm_mul_pd(__a.v1, __b.v1); + return ret; +} + +static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) +_mm256_mul_ps(__m256 __a, __m256 __b) { + __m256 ret; + ret.v0 = _mm_mul_ps(__a.v0, __b.v0); + ret.v1 = _mm_mul_ps(__a.v1, __b.v1); + return ret; +} + +static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) +_mm256_sqrt_pd(__m256d __a) { + __m256d ret; + ret.v0 = _mm_sqrt_pd(__a.v0); + ret.v1 = _mm_sqrt_pd(__a.v1); + return ret; +} + +static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) +_mm256_sqrt_ps(__m256 __a) { + __m256 ret; + ret.v0 = _mm_sqrt_ps(__a.v0); + ret.v1 = _mm_sqrt_ps(__a.v1); + return ret; +} + +static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) +_mm256_rsqrt_ps(__m256 __a) { + __m256 ret; + ret.v0 = _mm_rsqrt_ps(__a.v0); + ret.v1 = _mm_rsqrt_ps(__a.v1); + return ret; +} + +static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) +_mm256_rcp_ps(__m256 __a) { + __m256 ret; + ret.v0 = _mm_rcp_ps(__a.v0); + ret.v1 = _mm_rcp_ps(__a.v1); + return ret; +} + +static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) +_mm256_round_pd(__m256d __a, int __rounding) { + __m256d ret; + ret.v0 = _mm_round_pd(__a.v0, __rounding); + ret.v1 = _mm_round_pd(__a.v1, __rounding); + return ret; +} + +static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) +_mm256_round_ps(__m256 __a, int __rounding) { + __m256 ret; + ret.v0 = _mm_round_ps(__a.v0, __rounding); + ret.v1 = _mm_round_ps(__a.v1, __rounding); + return ret; +} + +#define _mm256_ceil_pd(V) _mm256_round_pd((V), _MM_FROUND_CEIL) +#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR) +#define _mm256_ceil_ps(V) _mm256_round_ps((V), _MM_FROUND_CEIL) +#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR) + +static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) +_mm256_and_pd(__m256d __a, __m256d __b) { + __m256d ret; + ret.v0 = _mm_and_pd(__a.v0, __b.v0); + ret.v1 = _mm_and_pd(__a.v1, __b.v1); + return ret; +} + +static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) +_mm256_and_ps(__m256 __a, __m256 __b) { + __m256 ret; + ret.v0 = _mm_and_ps(__a.v0, __b.v0); + ret.v1 = _mm_and_ps(__a.v1, __b.v1); + return ret; +} + +static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) +_mm256_andnot_pd(__m256d __a, __m256d __b) { + __m256d ret; + ret.v0 = _mm_andnot_pd(__a.v0, __b.v0); + ret.v1 = _mm_andnot_pd(__a.v1, __b.v1); + return ret; +} + +static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) +_mm256_andnot_ps(__m256 __a, __m256 __b) { + __m256 ret; + ret.v0 = _mm_andnot_ps(__a.v0, __b.v0); + ret.v1 = _mm_andnot_ps(__a.v1, __b.v1); + return ret; +} + +static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) +_mm256_or_pd(__m256d __a, __m256d __b) { + __m256d ret; + ret.v0 = _mm_or_pd(__a.v0, __b.v0); + ret.v1 = _mm_or_pd(__a.v1, __b.v1); + return ret; +} + +static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) +_mm256_or_ps(__m256 __a, __m256 __b) { + __m256 ret; + ret.v0 = _mm_or_ps(__a.v0, __b.v0); + ret.v1 = _mm_or_ps(__a.v1, __b.v1); + return ret; +} + +static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) +_mm256_xor_pd(__m256d __a, __m256d __b) { + 
__m256d ret; + ret.v0 = _mm_xor_pd(__a.v0, __b.v0); + ret.v1 = _mm_xor_pd(__a.v1, __b.v1); + return ret; +} + +static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) +_mm256_xor_ps(__m256 __a, __m256 __b) { + __m256 ret; + ret.v0 = _mm_xor_ps(__a.v0, __b.v0); + ret.v1 = _mm_xor_ps(__a.v1, __b.v1); + return ret; +} + +static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) +_mm256_hadd_pd(__m256d __a, __m256d __b) { + __m256d ret; + ret.v0 = _mm_hadd_pd(__a.v0, __b.v0); + ret.v1 = _mm_hadd_pd(__a.v1, __b.v1); + return ret; +} + +static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) +_mm256_hadd_ps(__m256 __a, __m256 __b) { + __m256 ret; + ret.v0 = _mm_hadd_ps(__a.v0, __b.v0); + ret.v1 = _mm_hadd_ps(__a.v1, __b.v1); + return ret; +} + +static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) +_mm256_hsub_pd(__m256d __a, __m256d __b) { + __m256d ret; + ret.v0 = _mm_hsub_pd(__a.v0, __b.v0); + ret.v1 = _mm_hsub_pd(__a.v1, __b.v1); + return ret; +} + +static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) +_mm256_hsub_ps(__m256 __a, __m256 __b) { + __m256 ret; + ret.v0 = _mm_hsub_ps(__a.v0, __b.v0); + ret.v1 = _mm_hsub_ps(__a.v1, __b.v1); + return ret; +} + +static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) +_mm_permutevar_pd(__m128d __a, __m128i __c) { + return (__m128d)wasm_f64x2_make( + ((__f64x2)__a)[(wasm_i64x2_extract_lane(__c, 0) >> 1) & 1], + ((__f64x2)__a)[(wasm_i64x2_extract_lane(__c, 1) >> 1) & 1]); +} + +static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) +_mm256_permutevar_pd(__m256d __a, __m256i __c) { + __m256d ret; + ret.v0 = _mm_permutevar_pd(__a.v0, __c.v0); + ret.v1 = _mm_permutevar_pd(__a.v1, __c.v1); + return ret; +} static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) -_mm_broadcast_ss(const float *__mem_addr) -{ - return (__m128)wasm_v32x4_load_splat(__mem_addr); +_mm_permutevar_ps(__m128 __a, __m128i __c) { + return (__m128)wasm_f32x4_make( + ((__f32x4)__a)[wasm_i32x4_extract_lane(__c, 0) & 3], + ((__f32x4)__a)[wasm_i32x4_extract_lane(__c, 1) & 3], + ((__f32x4)__a)[wasm_i32x4_extract_lane(__c, 2) & 3], + ((__f32x4)__a)[wasm_i32x4_extract_lane(__c, 3) & 3]); +} + +static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) +_mm256_permutevar_ps(__m256 __a, __m256i __c) { + __m256 ret; + ret.v0 = _mm_permutevar_ps(__a.v0, __c.v0); + ret.v1 = _mm_permutevar_ps(__a.v1, __c.v1); + return ret; +} + +#define _mm_permute_pd(__a, __imm) \ + ((__m128d)wasm_i64x2_shuffle( \ + (__m128d)(__a), (__m128d)(__a), ((__imm) & 1), (((__imm) >> 1) & 1))) + +#define _mm256_permute_pd(__A, __imm) \ + __extension__({ \ + __m256d __a = (__A); \ + _mm256_set_m128d(_mm_permute_pd(__a.v1, (__imm) >> 2), \ + _mm_permute_pd(__a.v0, (__imm))); \ + }) + +#define _mm_permute_ps(__a, __imm) \ + ((__m128)wasm_i32x4_shuffle((__m128)(__a), \ + (__m128)(__a), \ + ((__imm) & 3), \ + (((__imm) >> 2) & 3), \ + (((__imm) >> 4) & 3), \ + (((__imm) >> 6) & 3))) + +#define _mm256_permute_ps(__A, __imm) \ + __extension__({ \ + __m256 __a = (__A); \ + _mm256_set_m128(_mm_permute_ps(__a.v1, (__imm)), \ + _mm_permute_ps(__a.v0, (__imm))); \ + }) + +static __inline__ __m128d +__avx_select4d(__m256d __a, __m256d __b, const int imm8) { + switch (imm8 & 0xF) { + case 0: + case 4: + return __a.v0; + case 1: + case 5: + return __a.v1; + case 2: + case 6: + return __b.v0; + case 3: + case 7: + return __b.v1; + default: + return 
(__m128d)wasm_i64x2_const_splat(0); + } +} + +static __inline__ __m128 __avx_select4(__m256 __a, __m256 __b, const int imm8) { + switch (imm8 & 0xF) { + case 0: + case 4: + return __a.v0; + case 1: + case 5: + return __a.v1; + case 2: + case 6: + return __b.v0; + case 3: + case 7: + return __b.v1; + default: + return (__m128)wasm_i64x2_const_splat(0); + } +} + +static __inline__ __m128i +__avx_select4i(__m256i __a, __m256i __b, const int imm8) { + switch (imm8 & 0xF) { + case 0: + case 4: + return __a.v0; + case 1: + case 5: + return __a.v1; + case 2: + case 6: + return __b.v0; + case 3: + case 7: + return __b.v1; + default: + return wasm_i64x2_const_splat(0); + } +} + +static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) +_mm256_permute2f128_pd(__m256d __a, __m256d __b, const int imm8) { + __m256d ret; + ret.v0 = __avx_select4d(__a, __b, imm8); + ret.v1 = __avx_select4d(__a, __b, imm8 >> 4); + return ret; +} + +static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) +_mm256_permute2f128_ps(__m256 __a, __m256 __b, const int imm8) { + __m256 ret; + ret.v0 = __avx_select4(__a, __b, imm8); + ret.v1 = __avx_select4(__a, __b, imm8 >> 4); + return ret; +} + +static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) +_mm256_permute2f128_si256(__m256i __a, __m256i __b, const int imm8) { + __m256i ret; + ret.v0 = __avx_select4i(__a, __b, imm8); + ret.v1 = __avx_select4i(__a, __b, imm8 >> 4); + return ret; } +#define _mm256_blend_pd(__A, __B, imm8) \ + __extension__({ \ + __m256d __a = (__A); \ + __m256d __b = (__B); \ + _mm256_set_m128d(_mm_blend_pd(__a.v1, __b.v1, (imm8) >> 2), \ + _mm_blend_pd(__a.v0, __b.v0, (imm8))); \ + }) + +#define _mm256_blend_ps(__A, __B, imm) \ + __extension__({ \ + __m256 __a = (__A); \ + __m256 __b = (__B); \ + _mm256_set_m128(_mm_blend_ps(__a.v1, __b.v1, (imm) >> 4), \ + _mm_blend_ps(__a.v0, __b.v0, (imm))); \ + }) + +static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) +_mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c) { + __m256d ret; + ret.v0 = _mm_blendv_pd(__a.v0, __b.v0, __c.v0); + ret.v1 = _mm_blendv_pd(__a.v1, __b.v1, __c.v1); + return ret; +} + +static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) +_mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) { + __m256 ret; + ret.v0 = _mm_blendv_ps(__a.v0, __b.v0, __c.v0); + ret.v1 = _mm_blendv_ps(__a.v1, __b.v1, __c.v1); + return ret; +} + +#define _mm256_dp_ps(__A, __B, imm) \ + __extension__({ \ + __m256 __a = (__A); \ + __m256 __b = (__B); \ + _mm256_set_m128(_mm_dp_ps(__a.v1, __b.v1, (imm)), \ + _mm_dp_ps(__a.v0, __b.v0, (imm))); \ + }) + +#define _mm256_shuffle_ps(__A, __B, mask) \ + __extension__({ \ + __m256 __a = (__A); \ + __m256 __b = (__B); \ + _mm256_set_m128(_mm_shuffle_ps(__a.v1, __b.v1, (mask)), \ + _mm_shuffle_ps(__a.v0, __b.v0, (mask))); \ + }) + +#define _mm256_shuffle_pd(__A, __B, mask) \ + __extension__({ \ + __m256d __a = (__A); \ + __m256d __b = (__B); \ + _mm256_set_m128d(_mm_shuffle_pd(__a.v1, __b.v1, (mask) >> 2), \ + _mm_shuffle_pd(__a.v0, __b.v0, (mask))); \ + }) + #define _CMP_EQ_OQ 0 #define _CMP_LT_OS 1 #define _CMP_LE_OS 2 @@ -44,215 +530,1481 @@ _mm_broadcast_ss(const float *__mem_addr) #define _CMP_NLE_UQ 22 #define _CMP_ORD_S 23 #define _CMP_EQ_US 24 -#define _CMP_NGE_UQ 25 -#define _CMP_NGT_UQ 26 -#define _CMP_FALSE_OS 27 -#define _CMP_NEQ_OS 28 +#define _CMP_NGE_UQ 25 +#define _CMP_NGT_UQ 26 +#define _CMP_FALSE_OS 27 +#define _CMP_NEQ_OS 28 #define _CMP_GE_OQ 29 #define _CMP_GT_OQ 
30 #define _CMP_TRUE_US 31 -#define _mm_cmp_pd(__a, __b, __imm) __extension__ ({ \ - __m128d __ret; \ - if ((__imm) == _CMP_EQ_OQ || (__imm) == _CMP_EQ_OS) __ret = _mm_cmpeq_pd((__a), (__b)); \ - if ((__imm) == _CMP_EQ_UQ || (__imm) == _CMP_EQ_US) __ret = _mm_or_pd(_mm_cmpeq_pd((__a), (__b)), _mm_cmpunord_pd((__a), (__b))); \ - if ((__imm) == _CMP_LT_OS || (__imm) == _CMP_LT_OQ) __ret = _mm_cmplt_pd((__a), (__b)); \ - if ((__imm) == _CMP_LE_OS || (__imm) == _CMP_LE_OQ) __ret = _mm_cmple_pd((__a), (__b)); \ - if ((__imm) == _CMP_UNORD_Q || (__imm) == _CMP_UNORD_S) __ret = _mm_cmpunord_pd((__a), (__b)); \ - if ((__imm) == _CMP_NEQ_UQ || (__imm) == _CMP_NEQ_US) __ret = _mm_cmpneq_pd((__a), (__b)); \ - if ((__imm) == _CMP_NEQ_OQ || (__imm) == _CMP_NEQ_OS) __ret = _mm_andnot_pd(_mm_cmpunord_pd((__a), (__b)), _mm_cmpneq_pd((__a), (__b))); \ - if ((__imm) == _CMP_NLT_US || (__imm) == _CMP_NLT_UQ) __ret = _mm_cmpnlt_pd((__a), (__b)); \ - if ((__imm) == _CMP_ORD_Q || (__imm) == _CMP_ORD_S) __ret = _mm_cmpord_pd((__a), (__b)); \ - if ((__imm) == _CMP_NGE_US || (__imm) == _CMP_NGE_UQ) __ret = _mm_cmpnge_pd((__a), (__b)); \ - if ((__imm) == _CMP_NGT_US || (__imm) == _CMP_NGT_UQ) __ret = _mm_cmpngt_pd((__a), (__b)); \ - if ((__imm) == _CMP_FALSE_OQ || (__imm) == _CMP_FALSE_OS) __ret = _mm_setzero_pd(); \ - if ((__imm) == _CMP_GE_OS || (__imm) == _CMP_GE_OQ) __ret = _mm_cmpge_pd((__a), (__b)); \ - if ((__imm) == _CMP_GT_OS || (__imm) == _CMP_GT_OQ) __ret = _mm_cmpgt_pd((__a), (__b)); \ - if ((__imm) == _CMP_TRUE_UQ || (__imm) == _CMP_TRUE_US) __ret = (__m128d)wasm_i8x16_splat(0xFF); \ - if ((__imm) == _CMP_NLE_US || (__imm) == _CMP_NLE_UQ) __ret = _mm_cmpnle_pd((__a), (__b)); \ - __ret; }) - -#define _mm_cmp_ps(__a, __b, __imm) __extension__ ({ \ - __m128 __ret; \ - if ((__imm) == _CMP_EQ_OQ || (__imm) == _CMP_EQ_OS) __ret = _mm_cmpeq_ps((__a), (__b)); \ - if ((__imm) == _CMP_EQ_UQ || (__imm) == _CMP_EQ_US) __ret = _mm_or_ps(_mm_cmpeq_ps((__a), (__b)), _mm_cmpunord_ps((__a), (__b))); \ - if ((__imm) == _CMP_LT_OS || (__imm) == _CMP_LT_OQ) __ret = _mm_cmplt_ps((__a), (__b)); \ - if ((__imm) == _CMP_LE_OS || (__imm) == _CMP_LE_OQ) __ret = _mm_cmple_ps((__a), (__b)); \ - if ((__imm) == _CMP_UNORD_Q || (__imm) == _CMP_UNORD_S) __ret = _mm_cmpunord_ps((__a), (__b)); \ - if ((__imm) == _CMP_NEQ_UQ || (__imm) == _CMP_NEQ_US) __ret = _mm_cmpneq_ps((__a), (__b)); \ - if ((__imm) == _CMP_NEQ_OQ || (__imm) == _CMP_NEQ_OS) __ret = _mm_andnot_ps(_mm_cmpunord_ps((__a), (__b)), _mm_cmpneq_ps((__a), (__b))); \ - if ((__imm) == _CMP_NLT_US || (__imm) == _CMP_NLT_UQ) __ret = _mm_cmpnlt_ps((__a), (__b)); \ - if ((__imm) == _CMP_ORD_Q || (__imm) == _CMP_ORD_S) __ret = _mm_cmpord_ps((__a), (__b)); \ - if ((__imm) == _CMP_NGE_US || (__imm) == _CMP_NGE_UQ) __ret = _mm_cmpnge_ps((__a), (__b)); \ - if ((__imm) == _CMP_NGT_US || (__imm) == _CMP_NGT_UQ) __ret = _mm_cmpngt_ps((__a), (__b)); \ - if ((__imm) == _CMP_FALSE_OQ || (__imm) == _CMP_FALSE_OS) __ret = _mm_setzero_ps(); \ - if ((__imm) == _CMP_GE_OS || (__imm) == _CMP_GE_OQ) __ret = _mm_cmpge_ps((__a), (__b)); \ - if ((__imm) == _CMP_GT_OS || (__imm) == _CMP_GT_OQ) __ret = _mm_cmpgt_ps((__a), (__b)); \ - if ((__imm) == _CMP_TRUE_UQ || (__imm) == _CMP_TRUE_US) __ret = (__m128)wasm_i8x16_splat(0xFF); \ - if ((__imm) == _CMP_NLE_US || (__imm) == _CMP_NLE_UQ) __ret = _mm_cmpnle_ps((__a), (__b)); \ - __ret; }) - -#define _mm_cmp_sd(__a, __b, __imm) __extension__ ({ \ - __m128d __ret; \ - if ((__imm) == _CMP_EQ_OQ || (__imm) == _CMP_EQ_OS) __ret = _mm_cmpeq_sd((__a), (__b)); \ - 
if ((__imm) == _CMP_EQ_UQ || (__imm) == _CMP_EQ_US) __ret = _mm_move_sd((__a), _mm_or_pd(_mm_cmpeq_sd((__a), (__b)), _mm_cmpunord_sd((__a), (__b)))); \ - if ((__imm) == _CMP_LT_OS || (__imm) == _CMP_LT_OQ) __ret = _mm_cmplt_sd((__a), (__b)); \ - if ((__imm) == _CMP_LE_OS || (__imm) == _CMP_LE_OQ) __ret = _mm_cmple_sd((__a), (__b)); \ - if ((__imm) == _CMP_UNORD_Q || (__imm) == _CMP_UNORD_S) __ret = _mm_cmpunord_sd((__a), (__b)); \ - if ((__imm) == _CMP_NEQ_UQ || (__imm) == _CMP_NEQ_US) __ret = _mm_cmpneq_sd((__a), (__b)); \ - if ((__imm) == _CMP_NEQ_OQ || (__imm) == _CMP_NEQ_OS) __ret = _mm_move_sd((__a), _mm_andnot_pd(_mm_cmpunord_sd((__a), (__b)), _mm_cmpneq_sd((__a), (__b)))); \ - if ((__imm) == _CMP_NLT_US || (__imm) == _CMP_NLT_UQ) __ret = _mm_cmpnlt_sd((__a), (__b)); \ - if ((__imm) == _CMP_ORD_Q || (__imm) == _CMP_ORD_S) __ret = _mm_cmpord_sd((__a), (__b)); \ - if ((__imm) == _CMP_NGE_US || (__imm) == _CMP_NGE_UQ) __ret = _mm_cmpnge_sd((__a), (__b)); \ - if ((__imm) == _CMP_NGT_US || (__imm) == _CMP_NGT_UQ) __ret = _mm_cmpngt_sd((__a), (__b)); \ - if ((__imm) == _CMP_FALSE_OQ || (__imm) == _CMP_FALSE_OS) __ret = _mm_move_sd((__a), _mm_setzero_pd()); \ - if ((__imm) == _CMP_GE_OS || (__imm) == _CMP_GE_OQ) __ret = _mm_cmpge_sd((__a), (__b)); \ - if ((__imm) == _CMP_GT_OS || (__imm) == _CMP_GT_OQ) __ret = _mm_cmpgt_sd((__a), (__b)); \ - if ((__imm) == _CMP_TRUE_UQ || (__imm) == _CMP_TRUE_US) __ret = _mm_move_sd((__a), (__m128d)wasm_i8x16_splat(0xFF)); \ - if ((__imm) == _CMP_NLE_US || (__imm) == _CMP_NLE_UQ) __ret = _mm_cmpnle_sd((__a), (__b)); \ - __ret; }) - -#define _mm_cmp_ss(__a, __b, __imm) __extension__ ({ \ - __m128 __ret; \ - if ((__imm) == _CMP_EQ_OQ || (__imm) == _CMP_EQ_OS) __ret = _mm_cmpeq_ss((__a), (__b)); \ - if ((__imm) == _CMP_EQ_UQ || (__imm) == _CMP_EQ_US) __ret = _mm_move_ss((__a), _mm_or_ps(_mm_cmpeq_ss((__a), (__b)), _mm_cmpunord_ss((__a), (__b)))); \ - if ((__imm) == _CMP_LT_OS || (__imm) == _CMP_LT_OQ) __ret = _mm_cmplt_ss((__a), (__b)); \ - if ((__imm) == _CMP_LE_OS || (__imm) == _CMP_LE_OQ) __ret = _mm_cmple_ss((__a), (__b)); \ - if ((__imm) == _CMP_UNORD_Q || (__imm) == _CMP_UNORD_S) __ret = _mm_cmpunord_ss((__a), (__b)); \ - if ((__imm) == _CMP_NEQ_UQ || (__imm) == _CMP_NEQ_US) __ret = _mm_cmpneq_ss((__a), (__b)); \ - if ((__imm) == _CMP_NEQ_OQ || (__imm) == _CMP_NEQ_OS) __ret = _mm_move_ss((__a), _mm_andnot_ps(_mm_cmpunord_ss((__a), (__b)), _mm_cmpneq_ss((__a), (__b)))); \ - if ((__imm) == _CMP_NLT_US || (__imm) == _CMP_NLT_UQ) __ret = _mm_cmpnlt_ss((__a), (__b)); \ - if ((__imm) == _CMP_ORD_Q || (__imm) == _CMP_ORD_S) __ret = _mm_cmpord_ss((__a), (__b)); \ - if ((__imm) == _CMP_NGE_US || (__imm) == _CMP_NGE_UQ) __ret = _mm_cmpnge_ss((__a), (__b)); \ - if ((__imm) == _CMP_NGT_US || (__imm) == _CMP_NGT_UQ) __ret = _mm_cmpngt_ss((__a), (__b)); \ - if ((__imm) == _CMP_FALSE_OQ || (__imm) == _CMP_FALSE_OS) __ret = _mm_move_ss((__a), _mm_setzero_ps()); \ - if ((__imm) == _CMP_GE_OS || (__imm) == _CMP_GE_OQ) __ret = _mm_cmpge_ss((__a), (__b)); \ - if ((__imm) == _CMP_GT_OS || (__imm) == _CMP_GT_OQ) __ret = _mm_cmpgt_ss((__a), (__b)); \ - if ((__imm) == _CMP_TRUE_UQ || (__imm) == _CMP_TRUE_US) __ret = _mm_move_ss((__a), (__m128)wasm_i8x16_splat(0xFF)); \ - if ((__imm) == _CMP_NLE_US || (__imm) == _CMP_NLE_UQ) __ret = _mm_cmpnle_ss((__a), (__b)); \ - __ret; }) +#define _mm_cmp_pd(__a, __b, __imm) \ + __extension__({ \ + __m128d __ret; \ + switch ((__imm)) { \ + case _CMP_EQ_OQ: \ + case _CMP_EQ_OS: \ + __ret = _mm_cmpeq_pd((__a), (__b)); \ + break; \ + case 
_CMP_EQ_UQ: \ + case _CMP_EQ_US: \ + __ret = _mm_or_pd(_mm_cmpeq_pd((__a), (__b)), \ + _mm_cmpunord_pd((__a), (__b))); \ + break; \ + case _CMP_LT_OS: \ + case _CMP_LT_OQ: \ + __ret = _mm_cmplt_pd((__a), (__b)); \ + break; \ + case _CMP_LE_OS: \ + case _CMP_LE_OQ: \ + __ret = _mm_cmple_pd((__a), (__b)); \ + break; \ + case _CMP_UNORD_Q: \ + case _CMP_UNORD_S: \ + __ret = _mm_cmpunord_pd((__a), (__b)); \ + break; \ + case _CMP_NEQ_UQ: \ + case _CMP_NEQ_US: \ + __ret = _mm_cmpneq_pd((__a), (__b)); \ + break; \ + case _CMP_NEQ_OQ: \ + case _CMP_NEQ_OS: \ + __ret = _mm_andnot_pd(_mm_cmpunord_pd((__a), (__b)), \ + _mm_cmpneq_pd((__a), (__b))); \ + break; \ + case _CMP_NLT_US: \ + case _CMP_NLT_UQ: \ + __ret = _mm_cmpnlt_pd((__a), (__b)); \ + break; \ + case _CMP_ORD_Q: \ + case _CMP_ORD_S: \ + __ret = _mm_cmpord_pd((__a), (__b)); \ + break; \ + case _CMP_NGE_US: \ + case _CMP_NGE_UQ: \ + __ret = _mm_cmpnge_pd((__a), (__b)); \ + break; \ + case _CMP_NGT_US: \ + case _CMP_NGT_UQ: \ + __ret = _mm_cmpngt_pd((__a), (__b)); \ + break; \ + case _CMP_FALSE_OQ: \ + case _CMP_FALSE_OS: \ + __ret = _mm_setzero_pd(); \ + break; \ + case _CMP_GE_OS: \ + case _CMP_GE_OQ: \ + __ret = _mm_cmpge_pd((__a), (__b)); \ + break; \ + case _CMP_GT_OS: \ + case _CMP_GT_OQ: \ + __ret = _mm_cmpgt_pd((__a), (__b)); \ + break; \ + case _CMP_TRUE_UQ: \ + case _CMP_TRUE_US: \ + __ret = (__m128d)wasm_i8x16_splat(0xFF); \ + break; \ + case _CMP_NLE_US: \ + case _CMP_NLE_UQ: \ + __ret = _mm_cmpnle_pd((__a), (__b)); \ + break; \ + } \ + __ret; \ + }) -static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) -_mm_maskload_pd(const double *__mem_addr, __m128i __mask) -{ - // This may cause an out-of-bounds memory load since we first load and - // then mask, but since there are no segmentation faults in Wasm memory - // accesses, that is ok (as long as we are within the heap bounds - - // a negligible limitation in practice) - return _mm_and_pd(_mm_load_pd(__mem_addr), (__m128d)wasm_i64x2_shr(__mask, 63)); +#define _mm_cmp_ps(__a, __b, __imm) \ + __extension__({ \ + __m128 __ret; \ + switch ((__imm)) { \ + case _CMP_EQ_OQ: \ + case _CMP_EQ_OS: \ + __ret = _mm_cmpeq_ps((__a), (__b)); \ + break; \ + case _CMP_EQ_UQ: \ + case _CMP_EQ_US: \ + __ret = _mm_or_ps(_mm_cmpeq_ps((__a), (__b)), \ + _mm_cmpunord_ps((__a), (__b))); \ + break; \ + case _CMP_LT_OS: \ + case _CMP_LT_OQ: \ + __ret = _mm_cmplt_ps((__a), (__b)); \ + break; \ + case _CMP_LE_OS: \ + case _CMP_LE_OQ: \ + __ret = _mm_cmple_ps((__a), (__b)); \ + break; \ + case _CMP_UNORD_Q: \ + case _CMP_UNORD_S: \ + __ret = _mm_cmpunord_ps((__a), (__b)); \ + break; \ + case _CMP_NEQ_UQ: \ + case _CMP_NEQ_US: \ + __ret = _mm_cmpneq_ps((__a), (__b)); \ + break; \ + case _CMP_NEQ_OQ: \ + case _CMP_NEQ_OS: \ + __ret = _mm_andnot_ps(_mm_cmpunord_ps((__a), (__b)), \ + _mm_cmpneq_ps((__a), (__b))); \ + break; \ + case _CMP_NLT_US: \ + case _CMP_NLT_UQ: \ + __ret = _mm_cmpnlt_ps((__a), (__b)); \ + break; \ + case _CMP_ORD_Q: \ + case _CMP_ORD_S: \ + __ret = _mm_cmpord_ps((__a), (__b)); \ + break; \ + case _CMP_NGE_US: \ + case _CMP_NGE_UQ: \ + __ret = _mm_cmpnge_ps((__a), (__b)); \ + break; \ + case _CMP_NGT_US: \ + case _CMP_NGT_UQ: \ + __ret = _mm_cmpngt_ps((__a), (__b)); \ + break; \ + case _CMP_FALSE_OQ: \ + case _CMP_FALSE_OS: \ + __ret = _mm_setzero_ps(); \ + break; \ + case _CMP_GE_OS: \ + case _CMP_GE_OQ: \ + __ret = _mm_cmpge_ps((__a), (__b)); \ + break; \ + case _CMP_GT_OS: \ + case _CMP_GT_OQ: \ + __ret = _mm_cmpgt_ps((__a), (__b)); \ + break; \ + case _CMP_TRUE_UQ: 
\ + case _CMP_TRUE_US: \ + __ret = (__m128)wasm_i8x16_splat(0xFF); \ + break; \ + case _CMP_NLE_US: \ + case _CMP_NLE_UQ: \ + __ret = _mm_cmpnle_ps((__a), (__b)); \ + break; \ + } \ + __ret; \ + }) + +#define _mm_cmp_sd(__a, __b, __imm) \ + __extension__({ \ + __m128d __ret; \ + switch ((__imm)) { \ + case _CMP_EQ_OQ: \ + case _CMP_EQ_OS: \ + __ret = _mm_cmpeq_sd((__a), (__b)); \ + break; \ + case _CMP_EQ_UQ: \ + case _CMP_EQ_US: \ + __ret = _mm_move_sd((__a), \ + _mm_or_pd(_mm_cmpeq_sd((__a), (__b)), \ + _mm_cmpunord_sd((__a), (__b)))); \ + break; \ + case _CMP_LT_OS: \ + case _CMP_LT_OQ: \ + __ret = _mm_cmplt_sd((__a), (__b)); \ + break; \ + case _CMP_LE_OS: \ + case _CMP_LE_OQ: \ + __ret = _mm_cmple_sd((__a), (__b)); \ + break; \ + case _CMP_UNORD_Q: \ + case _CMP_UNORD_S: \ + __ret = _mm_cmpunord_sd((__a), (__b)); \ + break; \ + case _CMP_NEQ_UQ: \ + case _CMP_NEQ_US: \ + __ret = _mm_cmpneq_sd((__a), (__b)); \ + break; \ + case _CMP_NEQ_OQ: \ + case _CMP_NEQ_OS: \ + __ret = _mm_move_sd((__a), \ + _mm_andnot_pd(_mm_cmpunord_sd((__a), (__b)), \ + _mm_cmpneq_sd((__a), (__b)))); \ + break; \ + case _CMP_NLT_US: \ + case _CMP_NLT_UQ: \ + __ret = _mm_cmpnlt_sd((__a), (__b)); \ + break; \ + case _CMP_ORD_Q: \ + case _CMP_ORD_S: \ + __ret = _mm_cmpord_sd((__a), (__b)); \ + break; \ + case _CMP_NGE_US: \ + case _CMP_NGE_UQ: \ + __ret = _mm_cmpnge_sd((__a), (__b)); \ + break; \ + case _CMP_NGT_US: \ + case _CMP_NGT_UQ: \ + __ret = _mm_cmpngt_sd((__a), (__b)); \ + break; \ + case _CMP_FALSE_OQ: \ + case _CMP_FALSE_OS: \ + __ret = _mm_move_sd((__a), _mm_setzero_pd()); \ + break; \ + case _CMP_GE_OS: \ + case _CMP_GE_OQ: \ + __ret = _mm_cmpge_sd((__a), (__b)); \ + break; \ + case _CMP_GT_OS: \ + case _CMP_GT_OQ: \ + __ret = _mm_cmpgt_sd((__a), (__b)); \ + break; \ + case _CMP_TRUE_UQ: \ + case _CMP_TRUE_US: \ + __ret = _mm_move_sd((__a), (__m128d)wasm_i8x16_splat(0xFF)); \ + break; \ + case _CMP_NLE_US: \ + case _CMP_NLE_UQ: \ + __ret = _mm_cmpnle_sd((__a), (__b)); \ + break; \ + } \ + __ret; \ + }) + +#define _mm_cmp_ss(__a, __b, __imm) \ + __extension__({ \ + __m128 __ret; \ + switch ((__imm)) { \ + case _CMP_EQ_OQ: \ + case _CMP_EQ_OS: \ + __ret = _mm_cmpeq_ss((__a), (__b)); \ + break; \ + case _CMP_EQ_UQ: \ + case _CMP_EQ_US: \ + __ret = _mm_move_ss((__a), \ + _mm_or_ps(_mm_cmpeq_ss((__a), (__b)), \ + _mm_cmpunord_ss((__a), (__b)))); \ + break; \ + case _CMP_LT_OS: \ + case _CMP_LT_OQ: \ + __ret = _mm_cmplt_ss((__a), (__b)); \ + break; \ + case _CMP_LE_OS: \ + case _CMP_LE_OQ: \ + __ret = _mm_cmple_ss((__a), (__b)); \ + break; \ + case _CMP_UNORD_Q: \ + case _CMP_UNORD_S: \ + __ret = _mm_cmpunord_ss((__a), (__b)); \ + break; \ + case _CMP_NEQ_UQ: \ + case _CMP_NEQ_US: \ + __ret = _mm_cmpneq_ss((__a), (__b)); \ + break; \ + case _CMP_NEQ_OQ: \ + case _CMP_NEQ_OS: \ + __ret = _mm_move_ss((__a), \ + _mm_andnot_ps(_mm_cmpunord_ss((__a), (__b)), \ + _mm_cmpneq_ss((__a), (__b)))); \ + break; \ + case _CMP_NLT_US: \ + case _CMP_NLT_UQ: \ + __ret = _mm_cmpnlt_ss((__a), (__b)); \ + break; \ + case _CMP_ORD_Q: \ + case _CMP_ORD_S: \ + __ret = _mm_cmpord_ss((__a), (__b)); \ + break; \ + case _CMP_NGE_US: \ + case _CMP_NGE_UQ: \ + __ret = _mm_cmpnge_ss((__a), (__b)); \ + break; \ + case _CMP_NGT_US: \ + case _CMP_NGT_UQ: \ + __ret = _mm_cmpngt_ss((__a), (__b)); \ + break; \ + case _CMP_FALSE_OQ: \ + case _CMP_FALSE_OS: \ + __ret = _mm_move_ss((__a), _mm_setzero_ps()); \ + break; \ + case _CMP_GE_OS: \ + case _CMP_GE_OQ: \ + __ret = _mm_cmpge_ss((__a), (__b)); \ + break; \ + case _CMP_GT_OS: \ + case 
_CMP_GT_OQ: \ + __ret = _mm_cmpgt_ss((__a), (__b)); \ + break; \ + case _CMP_TRUE_UQ: \ + case _CMP_TRUE_US: \ + __ret = _mm_move_ss((__a), (__m128)wasm_i8x16_splat(0xFF)); \ + break; \ + case _CMP_NLE_US: \ + case _CMP_NLE_UQ: \ + __ret = _mm_cmpnle_ss((__a), (__b)); \ + break; \ + } \ + __ret; \ + }) + +static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) +_mm256_cmp_pd(__m256d a, __m256d b, const int imm8) { + __m256d ret; + ret.v0 = _mm_cmp_pd(a.v0, b.v0, imm8); + ret.v1 = _mm_cmp_pd(a.v1, b.v1, imm8); + return ret; +} + +static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) +_mm256_cmp_ps(__m256 __a, __m256 __b, const int imm8) { + __m256 ret; + ret.v0 = _mm_cmp_ps(__a.v0, __b.v0, imm8); + ret.v1 = _mm_cmp_ps(__a.v1, __b.v1, imm8); + return ret; +} + +#define _mm256_extract_epi32(__A, N) \ + __extension__({ \ + __m256i __a = (__A); \ + ((N) & 0x7) < 4 ? _mm_extract_epi32(__a.v0, (N) & 0x3) \ + : _mm_extract_epi32(__a.v1, (N) & 0x3); \ + }) + +#define _mm256_extract_epi16(__A, N) \ + __extension__({ \ + __m256i __a = (__A); \ + ((N) & 0xF) < 8 ? _mm_extract_epi16(__a.v0, (N) & 0x7) \ + : _mm_extract_epi16(__a.v1, (N) & 0x7); \ + }) + +#define _mm256_extract_epi8(__A, N) \ + __extension__({ \ + __m256i __a = (__A); \ + ((N) & 0x1F) < 16 ? _mm_extract_epi8(__a.v0, (N) & 0xF) \ + : _mm_extract_epi8(__a.v1, (N) & 0xF); \ + }) + +#define _mm256_extract_epi64(__A, N) \ + __extension__({ \ + __m256i __a = (__A); \ + ((N) & 0x3) < 2 ? _mm_extract_epi64(__a.v0, (N) & 0x1) \ + : _mm_extract_epi64(__a.v1, (N) & 0x1); \ + }) + +#define _mm256_insert_epi32(__A, __I, N) \ + __extension__({ \ + __m256i __a = (__A); \ + int32_t __i = (__I); \ + ((N) & 0x7) < 4 \ + ? _mm256_set_m128i(__a.v1, _mm_insert_epi32(__a.v0, __i, (N) & 0x3)) \ + : _mm256_set_m128i(_mm_insert_epi32(__a.v1, __i, (N) & 0x3), __a.v0); \ + }) + +#define _mm256_insert_epi16(__A, __I, N) \ + __extension__({ \ + __m256i __a = (__A); \ + int16_t __i = (__I); \ + ((N) & 0xF) < 8 \ + ? _mm256_set_m128i(__a.v1, _mm_insert_epi16(__a.v0, __i, (N) & 0x7)) \ + : _mm256_set_m128i(_mm_insert_epi16(__a.v1, __i, (N) & 0x7), __a.v0); \ + }) + +#define _mm256_insert_epi8(__A, __I, N) \ + __extension__({ \ + __m256i __a = (__A); \ + int8_t __i = (__I); \ + ((N) & 0x1F) < 16 \ + ? _mm256_set_m128i(__a.v1, _mm_insert_epi8(__a.v0, __i, (N) & 0xF)) \ + : _mm256_set_m128i(_mm_insert_epi8(__a.v1, __i, (N) & 0xF), __a.v0); \ + }) + +#define _mm256_insert_epi64(__A, __I, N) \ + __extension__({ \ + __m256i __a = (__A); \ + int64_t __i = (__I); \ + ((N) & 0x3) < 2 \ + ? 
_mm256_set_m128i(__a.v1, _mm_insert_epi64(__a.v0, __i, (N) & 0x1)) \ + : _mm256_set_m128i(_mm_insert_epi64(__a.v1, __i, (N) & 0x1), __a.v0); \ + }) + +static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) +_mm256_cvtepi32_pd(__m128i __a) { + __m256d ret; + ret.v0 = _mm_cvtepi32_pd(__a); + __m128i __a1 = wasm_i32x4_shuffle(__a, __a, 2, 3, 0, 0); + ret.v1 = _mm_cvtepi32_pd(__a1); + return ret; +} + +static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) +_mm256_cvtepi32_ps(__m256i __a) { + __m256 ret; + ret.v0 = _mm_cvtepi32_ps(__a.v0); + ret.v1 = _mm_cvtepi32_ps(__a.v1); + return ret; } static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) -_mm_maskload_ps(const float *__mem_addr, __m128i __mask) -{ - // This may cause an out-of-bounds memory load since we first load and - // then mask, but since there are no segmentation faults in Wasm memory - // accesses, that is ok (as long as we are within the heap bounds - - // a negligible limitation in practice) - return _mm_and_ps(_mm_load_ps(__mem_addr), (__m128)_mm_srai_epi32(__mask, 31)); +_mm256_cvtpd_ps(__m256d __a) { + __m128 low = _mm_cvtpd_ps(__a.v0); + __m128 high = _mm_cvtpd_ps(__a.v1); + __m128 ret = (__m128)wasm_i32x4_shuffle(low, high, 0, 1, 4, 5); + return ret; } -static __inline__ void __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW)) -_mm_maskstore_pd(double *__mem_addr, __m128i __mask, __m128d __a) -{ - if ((wasm_i64x2_extract_lane(__mask, 0) & 0x8000000000000000ull) != 0) - __mem_addr[0] = wasm_f64x2_extract_lane((v128_t)__a, 0); - if ((wasm_i64x2_extract_lane(__mask, 1) & 0x8000000000000000ull) != 0) - __mem_addr[1] = wasm_f64x2_extract_lane((v128_t)__a, 1); +static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) +_mm256_cvtps_epi32(__m256 __a) { + __m256i ret; + ret.v0 = _mm_cvtps_epi32(__a.v0); + ret.v1 = _mm_cvtps_epi32(__a.v1); + return ret; } -static __inline__ void __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW)) -_mm_maskstore_ps(float *__mem_addr, __m128i __mask, __m128 __a) -{ - if ((wasm_i32x4_extract_lane(__mask, 0) & 0x80000000ull) != 0) - __mem_addr[0] = wasm_f32x4_extract_lane((v128_t)__a, 0); - if ((wasm_i32x4_extract_lane(__mask, 1) & 0x80000000ull) != 0) - __mem_addr[1] = wasm_f32x4_extract_lane((v128_t)__a, 1); - if ((wasm_i32x4_extract_lane(__mask, 2) & 0x80000000ull) != 0) - __mem_addr[2] = wasm_f32x4_extract_lane((v128_t)__a, 2); - if ((wasm_i32x4_extract_lane(__mask, 3) & 0x80000000ull) != 0) - __mem_addr[3] = wasm_f32x4_extract_lane((v128_t)__a, 3); +static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) +_mm256_cvtps_pd(__m128 __a) { + __m256d ret; + ret.v0 = _mm_cvtps_pd(__a); + __m128 __a1 = (__m128)wasm_i32x4_shuffle(__a, __a, 2, 3, 0, 0); + ret.v1 = _mm_cvtps_pd(__a1); + return ret; } -#define _mm_permute_pd(__a, __imm) __extension__ ({ \ - (__m128d)wasm_i64x2_shuffle((__m128d)(__a), (__m128d)(__a), \ - ((__imm) & 1), (((__imm) >> 1) & 1)); }) +static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) +_mm256_cvttpd_epi32(__m256d __a) { + __m128i low = _mm_cvttpd_epi32(__a.v0); + __m128i high = _mm_cvttpd_epi32(__a.v1); + __m128i ret = wasm_i32x4_shuffle(low, high, 0, 1, 4, 5); + return ret; +} -#define _mm_permute_ps(__a, __imm) __extension__ ({ \ - (__m128)wasm_i32x4_shuffle((__m128)(__a), (__m128)(__a), \ - ((__imm) & 3), (((__imm) >> 2) & 3), \ - (((__imm) >> 4) & 3), (((__imm) >> 6) & 3)); }) +static __inline__ __m128i __attribute__((__always_inline__, 
__nodebug__)) +_mm256_cvtpd_epi32(__m256d __a) { + __m128i low = _mm_cvtpd_epi32(__a.v0); + __m128i high = _mm_cvtpd_epi32(__a.v1); + __m128i ret = wasm_i32x4_shuffle(low, high, 0, 1, 4, 5); + return ret; +} -static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) -_mm_permutevar_pd(__m128d __a, __m128d __b) -{ - return (__m128d)wasm_f64x2_make( - ((__f64x2)__a)[(wasm_i64x2_extract_lane((v128_t)__b, 0) >> 1) & 1], - ((__f64x2)__a)[(wasm_i64x2_extract_lane((v128_t)__b, 1) >> 1) & 1]); +static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) +_mm256_cvttps_epi32(__m256 __a) { + __m256i ret; + ret.v0 = _mm_cvttps_epi32(__a.v0); + ret.v1 = _mm_cvttps_epi32(__a.v1); + return ret; } -static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) -_mm_permutevar_ps(__m128 __a, __m128 __b) -{ - return (__m128)wasm_f32x4_make(((__f32x4)__a)[wasm_i32x4_extract_lane((v128_t)__b, 0) & 3], - ((__f32x4)__a)[wasm_i32x4_extract_lane((v128_t)__b, 1) & 3], - ((__f32x4)__a)[wasm_i32x4_extract_lane((v128_t)__b, 2) & 3], - ((__f32x4)__a)[wasm_i32x4_extract_lane((v128_t)__b, 3) & 3]); +static __inline__ double __attribute__((__always_inline__, __nodebug__)) +_mm256_cvtsd_f64(__m256d __a) { + return _mm_cvtsd_f64(__a.v0); } static __inline__ int __attribute__((__always_inline__, __nodebug__)) -_mm_testc_pd(__m128d __a, __m128d __b) -{ - v128_t __m = wasm_u64x2_shr(wasm_v128_or(wasm_v128_not((v128_t)__b), (v128_t)__a), 63); +_mm256_cvtsi256_si32(__m256i __a) { + return _mm_cvtsi128_si32(__a.v0); +} + +static __inline__ float __attribute__((__always_inline__, __nodebug__)) +_mm256_cvtss_f32(__m256 __a) { + return _mm_cvtss_f32(__a.v0); +} + +static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) +_mm256_movehdup_ps(__m256 __a) { + __m256 ret; + ret.v0 = _mm_movehdup_ps(__a.v0); + ret.v1 = _mm_movehdup_ps(__a.v1); + return ret; +} + +static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) +_mm256_moveldup_ps(__m256 __a) { + __m256 ret; + ret.v0 = _mm_moveldup_ps(__a.v0); + ret.v1 = _mm_moveldup_ps(__a.v1); + return ret; +} + +static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) +_mm256_movedup_pd(__m256d __a) { + __m256d ret; + ret.v0 = _mm_movedup_pd(__a.v0); + ret.v1 = _mm_movedup_pd(__a.v1); + return ret; +} + +static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) +_mm256_unpackhi_pd(__m256d __a, __m256d __b) { + __m256d ret; + ret.v0 = _mm_unpackhi_pd(__a.v0, __b.v0); + ret.v1 = _mm_unpackhi_pd(__a.v1, __b.v1); + return ret; +} + +static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) +_mm256_unpacklo_pd(__m256d __a, __m256d __b) { + __m256d ret; + ret.v0 = _mm_unpacklo_pd(__a.v0, __b.v0); + ret.v1 = _mm_unpacklo_pd(__a.v1, __b.v1); + return ret; +} + +static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) +_mm256_unpackhi_ps(__m256 __a, __m256 __b) { + __m256 ret; + ret.v0 = _mm_unpackhi_ps(__a.v0, __b.v0); + ret.v1 = _mm_unpackhi_ps(__a.v1, __b.v1); + return ret; +} + +static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) +_mm256_unpacklo_ps(__m256 __a, __m256 __b) { + __m256 ret; + ret.v0 = _mm_unpacklo_ps(__a.v0, __b.v0); + ret.v1 = _mm_unpacklo_ps(__a.v1, __b.v1); + return ret; +} + +static __inline__ int __attribute__((__always_inline__, __nodebug__)) +_mm_testz_pd(__m128d __a, __m128d __b) { + v128_t __m = + wasm_u64x2_shr(wasm_v128_not(wasm_v128_and((v128_t)__a, (v128_t)__b)), 63); + return wasm_i64x2_extract_lane(__m, 0) & 
wasm_i64x2_extract_lane(__m, 1); +} + +static __inline__ int __attribute__((__always_inline__, __nodebug__)) +_mm_testc_pd(__m128d __a, __m128d __b) { + v128_t __m = + wasm_u64x2_shr(wasm_v128_or(wasm_v128_not((v128_t)__b), (v128_t)__a), 63); return wasm_i64x2_extract_lane(__m, 0) & wasm_i64x2_extract_lane(__m, 1); } static __inline__ int __attribute__((__always_inline__, __nodebug__)) -_mm_testc_ps(__m128 __a, __m128 __b) -{ - v128_t __m = wasm_u32x4_shr(wasm_v128_or(wasm_v128_not((v128_t)__b), (v128_t)__a), 31); +_mm_testnzc_pd(__m128d __a, __m128d __b) { + v128_t __m = wasm_u64x2_shr(wasm_v128_and((v128_t)__a, (v128_t)__b), 63); + v128_t __m2 = wasm_u64x2_shr(wasm_v128_andnot((v128_t)__b, (v128_t)__a), 63); + return (wasm_i64x2_extract_lane(__m, 0) | wasm_i64x2_extract_lane(__m, 1)) & + (wasm_i64x2_extract_lane(__m2, 0) | wasm_i64x2_extract_lane(__m2, 1)); +} + +static __inline__ int __attribute__((__always_inline__, __nodebug__)) +_mm_testz_ps(__m128 __a, __m128 __b) { + v128_t __m = + wasm_u32x4_shr(wasm_v128_not(wasm_v128_and((v128_t)__a, (v128_t)__b)), 31); __m = wasm_v128_and(__m, (v128_t)_mm_movehl_ps((__m128)__m, (__m128)__m)); __m = wasm_v128_and(__m, _mm_shuffle_epi32(__m, _MM_SHUFFLE(3, 2, 0, 1))); return wasm_i32x4_extract_lane(__m, 0); } static __inline__ int __attribute__((__always_inline__, __nodebug__)) -_mm_testnzc_pd(__m128d __a, __m128d __b) -{ - v128_t __m = wasm_u64x2_shr(wasm_v128_and((v128_t)__a, (v128_t)__b), 63); - v128_t __m2 = wasm_u64x2_shr(wasm_v128_andnot((v128_t)__b, (v128_t)__a), 63); - return (wasm_i64x2_extract_lane(__m, 0) | wasm_i64x2_extract_lane(__m, 1)) - & (wasm_i64x2_extract_lane(__m2, 0) | wasm_i64x2_extract_lane(__m2, 1)); +_mm_testc_ps(__m128 __a, __m128 __b) { + v128_t __m = + wasm_u32x4_shr(wasm_v128_or(wasm_v128_not((v128_t)__b), (v128_t)__a), 31); + __m = wasm_v128_and(__m, (v128_t)_mm_movehl_ps((__m128)__m, (__m128)__m)); + __m = wasm_v128_and(__m, _mm_shuffle_epi32(__m, _MM_SHUFFLE(3, 2, 0, 1))); + return wasm_i32x4_extract_lane(__m, 0); } static __inline__ int __attribute__((__always_inline__, __nodebug__)) -_mm_testnzc_ps(__m128 __a, __m128 __b) -{ - v128_t __m = wasm_u32x4_shr(wasm_v128_and((v128_t)__a, (v128_t)__b), 31); +_mm_testnzc_ps(__m128 __a, __m128 __b) { + v128_t __m = wasm_u32x4_shr(wasm_v128_and((v128_t)__a, (v128_t)__b), 31); v128_t __m2 = wasm_u32x4_shr(wasm_v128_andnot((v128_t)__b, (v128_t)__a), 31); - __m = wasm_v128_or(__m, (v128_t)_mm_movehl_ps((__m128)__m, (__m128)__m)); + __m = wasm_v128_or(__m, (v128_t)_mm_movehl_ps((__m128)__m, (__m128)__m)); __m2 = wasm_v128_or(__m2, (v128_t)_mm_movehl_ps((__m128)__m2, (__m128)__m2)); - __m = wasm_v128_or(__m, _mm_shuffle_epi32(__m, _MM_SHUFFLE(3, 2, 0, 1))); + __m = wasm_v128_or(__m, _mm_shuffle_epi32(__m, _MM_SHUFFLE(3, 2, 0, 1))); __m2 = wasm_v128_or(__m2, _mm_shuffle_epi32(__m2, _MM_SHUFFLE(3, 2, 0, 1))); return wasm_i32x4_extract_lane(__m, 0) & wasm_i32x4_extract_lane(__m2, 0); } static __inline__ int __attribute__((__always_inline__, __nodebug__)) -_mm_testz_pd(__m128d __a, __m128d __b) -{ - v128_t __m = wasm_u64x2_shr(wasm_v128_not(wasm_v128_and((v128_t)__a, (v128_t)__b)), 63); - return wasm_i64x2_extract_lane(__m, 0) & wasm_i64x2_extract_lane(__m, 1); +_mm256_testz_pd(__m256d __a, __m256d __b) { + return _mm_testz_pd(__a.v0, __b.v0) & _mm_testz_pd(__a.v1, __b.v1); } static __inline__ int __attribute__((__always_inline__, __nodebug__)) -_mm_testz_ps(__m128 __a, __m128 __b) -{ - v128_t __m = wasm_u32x4_shr(wasm_v128_not(wasm_v128_and((v128_t)__a, (v128_t)__b)), 31); - __m = 
wasm_v128_and(__m, (v128_t)_mm_movehl_ps((__m128)__m, (__m128)__m)); - __m = wasm_v128_and(__m, _mm_shuffle_epi32(__m, _MM_SHUFFLE(3, 2, 0, 1))); - return wasm_i32x4_extract_lane(__m, 0); +_mm256_testc_pd(__m256d __a, __m256d __b) { + return _mm_testc_pd(__a.v0, __b.v0) & _mm_testc_pd(__a.v1, __b.v1); +} + +static __inline__ int __attribute__((__always_inline__, __nodebug__)) +_mm256_testnzc_pd(__m256d __a, __m256d __b) { + v128_t __m = + wasm_u64x2_shr(wasm_v128_and((v128_t)__a.v0, (v128_t)__b.v0), 63); + v128_t __m1 = + wasm_u64x2_shr(wasm_v128_and((v128_t)__a.v1, (v128_t)__b.v1), 63); + v128_t __m2 = + wasm_u64x2_shr(wasm_v128_andnot((v128_t)__b.v0, (v128_t)__a.v0), 63); + v128_t __m3 = + wasm_u64x2_shr(wasm_v128_andnot((v128_t)__b.v1, (v128_t)__a.v1), 63); + return wasm_v128_any_true(wasm_v128_or(__m, __m1)) & + wasm_v128_any_true(wasm_v128_or(__m2, __m3)); +} + +static __inline__ int __attribute__((__always_inline__, __nodebug__)) +_mm256_testz_ps(__m256 __a, __m256 __b) { + return _mm_testz_ps(__a.v0, __b.v0) & _mm_testz_ps(__a.v1, __b.v1); +} + +static __inline__ int __attribute__((__always_inline__, __nodebug__)) +_mm256_testc_ps(__m256 __a, __m256 __b) { + return _mm_testc_ps(__a.v0, __b.v0) & _mm_testc_ps(__a.v1, __b.v1); +} + +static __inline__ int __attribute__((__always_inline__, __nodebug__)) +_mm256_testnzc_ps(__m256 __a, __m256 __b) { + v128_t __m = + wasm_u32x4_shr(wasm_v128_and((v128_t)__a.v0, (v128_t)__b.v0), 31); + v128_t __m1 = + wasm_u32x4_shr(wasm_v128_and((v128_t)__a.v1, (v128_t)__b.v1), 31); + v128_t __m2 = + wasm_u32x4_shr(wasm_v128_andnot((v128_t)__b.v0, (v128_t)__a.v0), 31); + v128_t __m3 = + wasm_u32x4_shr(wasm_v128_andnot((v128_t)__b.v1, (v128_t)__a.v1), 31); + + return wasm_v128_any_true(wasm_v128_or(__m, __m1)) & + wasm_v128_any_true(wasm_v128_or(__m2, __m3)); +} + +static __inline__ int __attribute__((__always_inline__, __nodebug__)) +_mm256_testz_si256(__m256i __a, __m256i __b) { + return _mm_testz_si128(__a.v0, __b.v0) & _mm_testz_si128(__a.v1, __b.v1); +} + +static __inline__ int __attribute__((__always_inline__, __nodebug__)) +_mm256_testc_si256(__m256i __a, __m256i __b) { + return _mm_testc_si128(__a.v0, __b.v0) & _mm_testc_si128(__a.v1, __b.v1); +} + +static __inline__ int __attribute__((__always_inline__, __nodebug__)) +_mm256_testnzc_si256(__m256i __a, __m256i __b) { + v128_t __m = wasm_v128_and(__a.v0, __b.v0); + v128_t __m1 = wasm_v128_and(__a.v1, __b.v1); + v128_t __m2 = wasm_v128_andnot(__b.v0, __a.v0); + v128_t __m3 = wasm_v128_andnot(__b.v1, __a.v1); + return wasm_v128_any_true(wasm_v128_or(__m, __m1)) & + wasm_v128_any_true(wasm_v128_or(__m2, __m3)); +} + +static __inline__ int __attribute__((__always_inline__, __nodebug__)) +_mm256_movemask_pd(__m256d __a) { + return _mm_movemask_pd(__a.v0) | (_mm_movemask_pd(__a.v1) << 2); +} + +static __inline__ int __attribute__((__always_inline__, __nodebug__)) +_mm256_movemask_ps(__m256 __a) { + return _mm_movemask_ps(__a.v0) | (_mm_movemask_ps(__a.v1) << 4); +} + +static __inline__ void __attribute__((__always_inline__, __nodebug__)) +_mm256_zeroall(void) { + // Do nothing + // when porting any assembly code that would have calls to these functions + // around, that assembly code in the first place will not compile. +} + +static __inline__ void __attribute__((__always_inline__, __nodebug__)) +_mm256_zeroupper(void) { + // Do nothing + // when porting any assembly code that would have calls to these functions + // around, that assembly code in the first place will not compile. 
+} + +static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) +_mm_broadcast_ss(float const* __a) { + return (__m128)wasm_v128_load32_splat(__a); +} + +static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) +_mm256_broadcast_sd(double const* __a) { + __m256d ret; + ret.v1 = ret.v0 = (__m128d)wasm_v128_load64_splat(__a); + return ret; +} + +static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) +_mm256_broadcast_ss(float const* __a) { + __m256 ret; + ret.v1 = ret.v0 = _mm_broadcast_ss(__a); + return ret; +} + +static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) +_mm256_broadcast_pd(__m128d const* __a) { + __m256d ret; + ret.v1 = ret.v0 = (__m128d)wasm_v128_load(__a); + return ret; +} + +static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) +_mm256_broadcast_ps(__m128 const* __a) { + __m256 ret; + ret.v1 = ret.v0 = (__m128)wasm_v128_load(__a); + return ret; +} + +static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) +_mm256_load_pd(double const* __p) { + __m256d ret; + ret.v0 = _mm_load_pd(__p); + ret.v1 = _mm_load_pd(__p + 2); + return ret; +} + +static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) +_mm256_load_ps(float const* __p) { + __m256 ret; + ret.v0 = _mm_load_ps(__p); + ret.v1 = _mm_load_ps(__p + 4); + return ret; +} + +static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) +_mm256_loadu_pd(double const* __p) { + __m256d ret; + ret.v0 = _mm_loadu_pd(__p); + ret.v1 = _mm_loadu_pd(__p + 2); + return ret; +} + +static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) +_mm256_loadu_ps(float const* __p) { + __m256 ret; + ret.v0 = _mm_loadu_ps(__p); + ret.v1 = _mm_loadu_ps(__p + 4); + return ret; +} + +static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) +_mm256_load_si256(__m256i const* __p) { + __m256i ret; + ret.v0 = _mm_load_si128((__m128i const*)__p); + ret.v1 = _mm_load_si128(((__m128i const*)__p) + 1); + return ret; +} + +static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) +_mm256_loadu_si256(__m256i_u const* __p) { + __m256i ret; + ret.v0 = _mm_loadu_si128((__m128i const*)__p); + ret.v1 = _mm_loadu_si128(((__m128i const*)__p) + 1); + return ret; +} + +static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) +_mm256_lddqu_si256(__m256i_u const* __p) { + __m256i ret; + ret.v0 = _mm_lddqu_si128((__m128i const*)__p); + ret.v1 = _mm_lddqu_si128(((__m128i const*)__p) + 1); + return ret; +} + +static __inline__ void __attribute__((__always_inline__, __nodebug__)) +_mm256_store_pd(double* __p, __m256d __a) { + _mm_store_pd(__p, __a.v0); + _mm_store_pd(__p + 2, __a.v1); +} + +static __inline__ void __attribute__((__always_inline__, __nodebug__)) +_mm256_store_ps(float* __p, __m256 __a) { + _mm_store_ps(__p, __a.v0); + _mm_store_ps(__p + 4, __a.v1); +} + +static __inline__ void __attribute__((__always_inline__, __nodebug__)) +_mm256_storeu_pd(double* __p, __m256d __a) { + _mm_storeu_pd(__p, __a.v0); + _mm_storeu_pd(__p + 2, __a.v1); +} + +static __inline__ void __attribute__((__always_inline__, __nodebug__)) +_mm256_storeu_ps(float* __p, __m256 __a) { + _mm_storeu_ps(__p, __a.v0); + _mm_storeu_ps(__p + 4, __a.v1); +} + +static __inline__ void __attribute__((__always_inline__, __nodebug__)) +_mm256_store_si256(__m256i* __p, __m256i __a) { + _mm_store_si128((__m128i*)__p, __a.v0); + _mm_store_si128(((__m128i*)__p) + 1, __a.v1); +} + +static __inline__ void 
__attribute__((__always_inline__, __nodebug__)) +_mm256_storeu_si256(__m256i_u* __p, __m256i __a) { + _mm_storeu_si128((__m128i*)__p, __a.v0); + _mm_storeu_si128(((__m128i*)__p) + 1, __a.v1); +} + +static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) +_mm_maskload_pd(double const* __p, __m128i __m) { + // This may cause an out-of-bounds memory load since we first load and + // then mask, but since there are no segmentation faults in Wasm memory + // accesses, that is ok (as long as we are within the heap bounds - + // a negligible limitation in practice) + return _mm_and_pd(_mm_load_pd(__p), (__m128d)wasm_i64x2_shr(__m, 63)); +} + +static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) +_mm256_maskload_pd(double const* __p, __m256i __m) { + __m256d ret; + ret.v0 = _mm_maskload_pd(__p, __m.v0); + ret.v1 = _mm_maskload_pd(__p + 2, __m.v1); + return ret; +} + +static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) +_mm_maskload_ps(float const* __p, __m128i __m) { + // This may cause an out-of-bounds memory load since we first load and + // then mask, but since there are no segmentation faults in Wasm memory + // accesses, that is ok (as long as we are within the heap bounds - + // a negligible limitation in practice) + return _mm_and_ps(_mm_load_ps(__p), (__m128)_mm_srai_epi32(__m, 31)); +} + +static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) +_mm256_maskload_ps(float const* __p, __m256i __m) { + __m256 ret; + ret.v0 = _mm_maskload_ps(__p, __m.v0); + ret.v1 = _mm_maskload_ps(__p + 4, __m.v1); + return ret; +} + +static __inline__ void + __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW)) + _mm_maskstore_ps(float* __p, __m128i __m, __m128 __a) { + if ((wasm_i32x4_extract_lane(__m, 0) & 0x80000000ull) != 0) + __p[0] = wasm_f32x4_extract_lane((v128_t)__a, 0); + if ((wasm_i32x4_extract_lane(__m, 1) & 0x80000000ull) != 0) + __p[1] = wasm_f32x4_extract_lane((v128_t)__a, 1); + if ((wasm_i32x4_extract_lane(__m, 2) & 0x80000000ull) != 0) + __p[2] = wasm_f32x4_extract_lane((v128_t)__a, 2); + if ((wasm_i32x4_extract_lane(__m, 3) & 0x80000000ull) != 0) + __p[3] = wasm_f32x4_extract_lane((v128_t)__a, 3); +} + +static __inline__ void __attribute__((__always_inline__, __nodebug__)) +_mm256_maskstore_ps(float* __p, __m256i __m, __m256 __a) { + _mm_maskstore_ps(__p, __m.v0, __a.v0); + _mm_maskstore_ps(__p + 4, __m.v1, __a.v1); +} + +static __inline__ void + __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW)) + _mm_maskstore_pd(double* __p, __m128i __m, __m128d __a) { + if ((wasm_i64x2_extract_lane(__m, 0) & 0x8000000000000000ull) != 0) + __p[0] = wasm_f64x2_extract_lane((v128_t)__a, 0); + if ((wasm_i64x2_extract_lane(__m, 1) & 0x8000000000000000ull) != 0) + __p[1] = wasm_f64x2_extract_lane((v128_t)__a, 1); +} + +static __inline__ void __attribute__((__always_inline__, __nodebug__)) +_mm256_maskstore_pd(double* __p, __m256i __m, __m256d __a) { + _mm_maskstore_pd(__p, __m.v0, __a.v0); + _mm_maskstore_pd(__p + 2, __m.v1, __a.v1); +} + +static __inline__ void __attribute__((__always_inline__, __nodebug__)) +_mm256_stream_si256(void* __a, __m256i __b) { + _mm_stream_si128((__m128i*)__a, __b.v0); + _mm_stream_si128(((__m128i*)__a) + 1, __b.v1); +} + +static __inline__ void __attribute__((__always_inline__, __nodebug__)) +_mm256_stream_pd(void* __a, __m256d __b) { + _mm_stream_pd((double*)__a, __b.v0); + _mm_stream_pd(((double*)__a) + 2, __b.v1); +} + +static __inline__ void __attribute__((__always_inline__, 
__nodebug__)) +_mm256_stream_ps(void* __p, __m256 __a) { + _mm_stream_ps((float*)__p, __a.v0); + _mm_stream_ps(((float*)__p) + 4, __a.v1); +} + +static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) +_mm256_undefined_pd(void) { + __m256d val; + return val; +} + +static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) +_mm256_undefined_ps(void) { + __m256 val; + return val; +} + +static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) +_mm256_undefined_si256(void) { + __m256i val; + return val; +} + +static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) +_mm256_set_pd(double __a, double __b, double __c, double __d) { + __m256d ret; + ret.v0 = _mm_set_pd(__c, __d); + ret.v1 = _mm_set_pd(__a, __b); + return ret; +} + +static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) +_mm256_set_ps(float __a, + float __b, + float __c, + float __d, + float __e, + float __f, + float __g, + float __h) { + __m256 ret; + ret.v0 = _mm_set_ps(__e, __f, __g, __h); + ret.v1 = _mm_set_ps(__a, __b, __c, __d); + return ret; +} + +static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) +_mm256_set_epi32(int __i0, + int __i1, + int __i2, + int __i3, + int __i4, + int __i5, + int __i6, + int __i7) { + __m256i ret; + ret.v0 = _mm_set_epi32(__i4, __i5, __i6, __i7); + ret.v1 = _mm_set_epi32(__i0, __i1, __i2, __i3); + return ret; +} + +static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) +_mm256_set_epi16(short __w15, + short __w14, + short __w13, + short __w12, + short __w11, + short __w10, + short __w09, + short __w08, + short __w07, + short __w06, + short __w05, + short __w04, + short __w03, + short __w02, + short __w01, + short __w00) { + __m256i ret; + ret.v0 = + _mm_set_epi16(__w07, __w06, __w05, __w04, __w03, __w02, __w01, __w00); + ret.v1 = + _mm_set_epi16(__w15, __w14, __w13, __w12, __w11, __w10, __w09, __w08); + return ret; +} + +static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) +_mm256_set_epi8(char __b31, + char __b30, + char __b29, + char __b28, + char __b27, + char __b26, + char __b25, + char __b24, + char __b23, + char __b22, + char __b21, + char __b20, + char __b19, + char __b18, + char __b17, + char __b16, + char __b15, + char __b14, + char __b13, + char __b12, + char __b11, + char __b10, + char __b09, + char __b08, + char __b07, + char __b06, + char __b05, + char __b04, + char __b03, + char __b02, + char __b01, + char __b00) { + __m256i ret; + ret.v0 = _mm_set_epi8(__b15, + __b14, + __b13, + __b12, + __b11, + __b10, + __b09, + __b08, + __b07, + __b06, + __b05, + __b04, + __b03, + __b02, + __b01, + __b00); + ret.v1 = _mm_set_epi8(__b31, + __b30, + __b29, + __b28, + __b27, + __b26, + __b25, + __b24, + __b23, + __b22, + __b21, + __b20, + __b19, + __b18, + __b17, + __b16); + return ret; +} + +static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) +_mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d) { + __m256i ret; + ret.v0 = _mm_set_epi64x(__c, __d); + ret.v1 = _mm_set_epi64x(__a, __b); + return ret; +} + +static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) +_mm256_setr_pd(double __a, double __b, double __c, double __d) { + return _mm256_set_pd(__d, __c, __b, __a); +} + +static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) +_mm256_setr_ps(float __a, + float __b, + float __c, + float __d, + float __e, + float __f, + float __g, + float __h) { + return _mm256_set_ps(__h, 
__g, __f, __e, __d, __c, __b, __a); +} + +static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) +_mm256_setr_epi32(int __i0, + int __i1, + int __i2, + int __i3, + int __i4, + int __i5, + int __i6, + int __i7) { + return _mm256_set_epi32(__i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0); +} + +static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) +_mm256_setr_epi16(short __w15, + short __w14, + short __w13, + short __w12, + short __w11, + short __w10, + short __w09, + short __w08, + short __w07, + short __w06, + short __w05, + short __w04, + short __w03, + short __w02, + short __w01, + short __w00) { + return _mm256_set_epi16(__w00, + __w01, + __w02, + __w03, + __w04, + __w05, + __w06, + __w07, + __w08, + __w09, + __w10, + __w11, + __w12, + __w13, + __w14, + __w15); +} + +static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) +_mm256_setr_epi8(char __b31, + char __b30, + char __b29, + char __b28, + char __b27, + char __b26, + char __b25, + char __b24, + char __b23, + char __b22, + char __b21, + char __b20, + char __b19, + char __b18, + char __b17, + char __b16, + char __b15, + char __b14, + char __b13, + char __b12, + char __b11, + char __b10, + char __b09, + char __b08, + char __b07, + char __b06, + char __b05, + char __b04, + char __b03, + char __b02, + char __b01, + char __b00) { + return _mm256_set_epi8(__b00, + __b01, + __b02, + __b03, + __b04, + __b05, + __b06, + __b07, + __b08, + __b09, + __b10, + __b11, + __b12, + __b13, + __b14, + __b15, + __b16, + __b17, + __b18, + __b19, + __b20, + __b21, + __b22, + __b23, + __b24, + __b25, + __b26, + __b27, + __b28, + __b29, + __b30, + __b31); +} + +static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) +_mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d) { + return _mm256_set_epi64x(__d, __c, __b, __a); +} + +static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) +_mm256_set1_pd(double __w) { + __m256d ret; + ret.v1 = ret.v0 = (__m128d)wasm_f64x2_splat(__w); + return ret; +} + +static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) +_mm256_set1_ps(float __w) { + __m256 ret; + ret.v1 = ret.v0 = (__m128)wasm_f32x4_splat(__w); + return ret; +} + +static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) +_mm256_set1_epi32(int __i) { + __m256i ret; + ret.v1 = ret.v0 = wasm_i32x4_splat(__i); + return ret; +} + +static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) +_mm256_set1_epi16(short __w) { + __m256i ret; + ret.v1 = ret.v0 = wasm_i16x8_splat(__w); + return ret; +} + +static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) +_mm256_set1_epi8(char __b) { + __m256i ret; + ret.v1 = ret.v0 = wasm_i8x16_splat(__b); + return ret; +} + +static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) +_mm256_set1_epi64x(long long __q) { + __m256i ret; + ret.v1 = ret.v0 = wasm_i64x2_splat(__q); + return ret; +} + +static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) +_mm256_setzero_pd(void) { + __m256d ret; + ret.v1 = ret.v0 = _mm_setzero_pd(); + return ret; +} + +static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) +_mm256_setzero_ps(void) { + __m256 ret; + ret.v1 = ret.v0 = _mm_setzero_ps(); + return ret; +} + +static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) +_mm256_setzero_si256(void) { + __m256i ret; + ret.v1 = ret.v0 = _mm_setzero_si128(); + return ret; +} + +static __inline__ __m256 
__attribute__((__always_inline__, __nodebug__)) +_mm256_castpd_ps(__m256d __a) { + m256_data ret; + ret.double_view = __a; + return ret.float_view; +} + +static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) +_mm256_castpd_si256(__m256d __a) { + m256_data ret; + ret.double_view = __a; + return ret.int_view; +} + +static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) +_mm256_castps_pd(__m256 __a) { + m256_data ret; + ret.float_view = __a; + return ret.double_view; +} + +static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) +_mm256_castps_si256(__m256 __a) { + m256_data ret; + ret.float_view = __a; + return ret.int_view; +} + +static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) +_mm256_castsi256_ps(__m256i __a) { + m256_data ret; + ret.int_view = __a; + return ret.float_view; +} + +static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) +_mm256_castsi256_pd(__m256i __a) { + m256_data ret; + ret.int_view = __a; + return ret.double_view; +} + +static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) +_mm256_castpd256_pd128(__m256d __a) { + return __a.v0; +} + +static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) +_mm256_castps256_ps128(__m256 __a) { + return __a.v0; +} + +static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) +_mm256_castsi256_si128(__m256i __a) { + return __a.v0; +} + +static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) +_mm256_castpd128_pd256(__m128d __a) { + __m256d ret; + ret.v0 = __a; + ret.v1 = _mm_setzero_pd(); + return ret; +} + +static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) +_mm256_castps128_ps256(__m128 __a) { + __m256 ret; + ret.v0 = __a; + ret.v1 = _mm_setzero_ps(); + return ret; +} + +static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) +_mm256_castsi128_si256(__m128i __a) { + __m256i ret; + ret.v0 = __a; + ret.v1 = _mm_setzero_si128(); + return ret; +} + +static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) +_mm256_zextpd128_pd256(__m128d __a) { + __m256d ret; + ret.v0 = __a; + ret.v1 = _mm_setzero_pd(); + return ret; +} + +static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) +_mm256_zextps128_ps256(__m128 __a) { + __m256 ret; + ret.v0 = __a; + ret.v1 = _mm_setzero_ps(); + return ret; +} + +static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) +_mm256_zextsi128_si256(__m128i __a) { + __m256i ret; + ret.v0 = __a; + ret.v1 = _mm_setzero_si128(); + return ret; +} + +static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) +_mm256_insertf128_ps(__m256 __a, __m128 __b, const int imm8) { + __m256 ret = __a; + if (imm8 & 0x1) { + ret.v1 = __b; + } else { + ret.v0 = __b; + } + return ret; +} + +static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) +_mm256_insertf128_pd(__m256d __a, __m128d __b, const int imm8) { + __m256d ret = __a; + if (imm8 & 0x1) { + ret.v1 = __b; + } else { + ret.v0 = __b; + } + return ret; +} + +static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) +_mm256_insertf128_si256(__m256i __a, __m128i __b, const int imm8) { + __m256i ret = __a; + if (imm8 & 0x1) { + ret.v1 = __b; + } else { + ret.v0 = __b; + } + return ret; +} + +static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) +_mm256_extractf128_ps(__m256 __a, const int imm8) { + if (imm8 & 0x1) { + return __a.v1; + } else { + return __a.v0; + } 
+} + +static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) +_mm256_extractf128_pd(__m256d __a, const int imm8) { + if (imm8 & 0x1) { + return __a.v1; + } else { + return __a.v0; + } +} + +static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) +_mm256_extractf128_si256(__m256i __a, const int imm8) { + if (imm8 & 0x1) { + return __a.v1; + } else { + return __a.v0; + } +} + +static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) +_mm256_set_m128(__m128 __hi, __m128 __lo) { + __m256 ret; + ret.v0 = __lo; + ret.v1 = __hi; + return ret; +} + +static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) +_mm256_set_m128d(__m128d __hi, __m128d __lo) { + __m256d ret; + ret.v0 = __lo; + ret.v1 = __hi; + return ret; +} + +static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) +_mm256_set_m128i(__m128i __hi, __m128i __lo) { + __m256i ret; + ret.v0 = __lo; + ret.v1 = __hi; + return ret; +} + +static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) +_mm256_setr_m128(__m128 __lo, __m128 __hi) { + return _mm256_set_m128(__hi, __lo); +} + +static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) +_mm256_setr_m128d(__m128d __lo, __m128d __hi) { + return (__m256d)_mm256_set_m128d(__hi, __lo); +} + +static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) +_mm256_setr_m128i(__m128i __lo, __m128i __hi) { + return (__m256i)_mm256_set_m128i(__hi, __lo); +} + +static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) +_mm256_loadu2_m128(float const* __addr_hi, float const* __addr_lo) { + return _mm256_set_m128(_mm_loadu_ps(__addr_hi), _mm_loadu_ps(__addr_lo)); +} + +static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) +_mm256_loadu2_m128d(double const* __addr_hi, double const* __addr_lo) { + return _mm256_set_m128d(_mm_loadu_pd(__addr_hi), _mm_loadu_pd(__addr_lo)); +} + +static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) +_mm256_loadu2_m128i(__m128i_u const* __addr_hi, __m128i_u const* __addr_lo) { + return _mm256_set_m128i(_mm_loadu_si128((__m128i const*)__addr_hi), + _mm_loadu_si128((__m128i const*)__addr_lo)); +} + +static __inline__ void __attribute__((__always_inline__, __nodebug__)) +_mm256_storeu2_m128(float* __addr_hi, float* __addr_lo, __m256 __a) { + _mm_storeu_ps(__addr_lo, __a.v0); + _mm_storeu_ps(__addr_hi, __a.v1); +} + +static __inline__ void __attribute__((__always_inline__, __nodebug__)) +_mm256_storeu2_m128d(double* __addr_hi, double* __addr_lo, __m256d __a) { + _mm_storeu_pd(__addr_lo, __a.v0); + _mm_storeu_pd(__addr_hi, __a.v1); +} + +static __inline__ void __attribute__((__always_inline__, __nodebug__)) +_mm256_storeu2_m128i(__m128i_u* __addr_hi, __m128i_u* __addr_lo, __m256i __a) { + _mm_storeu_si128((__m128i*)__addr_lo, __a.v0); + _mm_storeu_si128((__m128i*)__addr_hi, __a.v1); } #endif /* __emscripten_avxintrin_h__ */ diff --git a/test/sse/test_avx.cpp b/test/sse/test_avx.cpp index 44410d75e856b..561c1bb2a40ea 100644 --- a/test/sse/test_avx.cpp +++ b/test/sse/test_avx.cpp @@ -4,43 +4,407 @@ * University of Illinois/NCSA Open Source License. Both these licenses can be * found in the LICENSE file. */ -// This file uses AVX by calling different functions with different interesting inputs and prints the results. -// Use a diff tool to compare the results between platforms. +// This file uses AVX by calling different functions with different interesting +// inputs and prints the results. 
Use a diff tool to compare the results between +// platforms. +// immintrin.h must be included before test_sse.h +// clang-format off #include <immintrin.h> #include "test_sse.h" +// clang-format on bool testNaNBits = true; -float *interesting_floats = get_interesting_floats(); -int numInterestingFloats = sizeof(interesting_floats_)/sizeof(interesting_floats_[0]); -uint32_t *interesting_ints = get_interesting_ints(); -int numInterestingInts = sizeof(interesting_ints_)/sizeof(interesting_ints_[0]); -double *interesting_doubles = get_interesting_doubles(); -int numInterestingDoubles = sizeof(interesting_doubles_)/sizeof(interesting_doubles_[0]); +float* interesting_floats = get_interesting_floats(); +int numInterestingFloats = + sizeof(interesting_floats_) / sizeof(interesting_floats_[0]); +uint32_t* interesting_ints = get_interesting_ints(); +int numInterestingInts = + sizeof(interesting_ints_) / sizeof(interesting_ints_[0]); +double* interesting_doubles = get_interesting_doubles(); +int numInterestingDoubles = + sizeof(interesting_doubles_) / sizeof(interesting_doubles_[0]); -int main() { - assert(numInterestingFloats % 4 == 0); - assert(numInterestingInts % 4 == 0); - assert(numInterestingDoubles % 4 == 0); +void test_arithmetic(void) { + testNaNBits = false; + Ret_M256d_M256d(__m256d, _mm256_add_pd); + Ret_M256_M256(__m256, _mm256_add_ps); - Ret_FloatPtr(__m128, _mm_broadcast_ss, 1, 1); + testNaNBits = true; + Ret_M256d_M256d(__m256d, _mm256_sub_pd); + Ret_M256_M256(__m256, _mm256_sub_ps); + + testNaNBits = false; + Ret_M256d_M256d(__m256d, _mm256_addsub_pd); + Ret_M256_M256(__m256, _mm256_addsub_ps); + + testNaNBits = true; + Ret_M256d_M256d(__m256d, _mm256_div_pd); + Ret_M256_M256(__m256, _mm256_div_ps); + + testNaNBits = false; + Ret_M256d_M256d(__m256d, _mm256_mul_pd); + Ret_M256_M256(__m256, _mm256_mul_ps); + + Ret_M256d_M256d(__m256d, _mm256_hadd_pd); + Ret_M256_M256(__m256, _mm256_hadd_ps); + + testNaNBits = true; + Ret_M256d_M256d(__m256d, _mm256_hsub_pd); + Ret_M256_M256(__m256, _mm256_hsub_ps); + + testNaNBits = false; + Ret_M256_M256_Tint(__m256, _mm256_dp_ps); +} + +void test_special_math(void) { + Ret_M256d_M256d(__m256d, _mm256_max_pd); + Ret_M256_M256(__m256, _mm256_max_ps); + + Ret_M256d_M256d(__m256d, _mm256_min_pd); + Ret_M256_M256(__m256, _mm256_min_ps); + + Ret_M256d_Tint(__m256d, _mm256_round_pd); + Ret_M256_Tint(__m256, _mm256_round_ps); + + Ret_M256d(__m256d, _mm256_ceil_pd); + Ret_M256(__m256, _mm256_ceil_ps); + + Ret_M256d(__m256d, _mm256_floor_pd); + Ret_M256(__m256, _mm256_floor_ps); +} + +void test_elementary_math(void) { + Ret_M256d(__m256d, _mm256_sqrt_pd); + Ret_M256approx(__m256, _mm256_sqrt_ps); + Ret_M256approx(__m256, _mm256_rsqrt_ps); + Ret_M256approx(__m256, _mm256_rcp_ps); +} + +void test_logical(void) { + Ret_M128d_M128d(__m128d, _mm_and_pd); + Ret_M128_M128(__m128, _mm_and_ps); + + Ret_M128d_M128d(__m128d, _mm_andnot_pd); + Ret_M128_M128(__m128, _mm_andnot_ps); + + Ret_M128d_M128d(__m128d, _mm_or_pd); + Ret_M128_M128(__m128, _mm_or_ps); + + Ret_M128d_M128d(__m128d, _mm_xor_pd); + Ret_M128_M128(__m128, _mm_xor_ps); + + Ret_M128d_M128d(int, _mm_testz_pd); + Ret_M128d_M128d(int, _mm_testc_pd); + Ret_M128d_M128d(int, _mm_testnzc_pd); + + Ret_M128_M128(int, _mm_testz_ps); + Ret_M128_M128(int, _mm_testc_ps); + Ret_M128_M128(int, _mm_testnzc_ps); + + Ret_M256d_M256d(int, _mm256_testz_pd); + Ret_M256d_M256d(int, _mm256_testc_pd); + Ret_M256d_M256d(int, _mm256_testnzc_pd); + + Ret_M256_M256(int, _mm256_testz_ps); + Ret_M256_M256(int, _mm256_testc_ps); +
Ret_M256_M256(int, _mm256_testnzc_ps); + + Ret_M256i_M256i(int, _mm256_testz_si256); + Ret_M256i_M256i(int, _mm256_testc_si256); + Ret_M256i_M256i(int, _mm256_testnzc_si256); +} + +// split test_swizzle into multiple functions to avoid too many locals error +void test_swizzle_128bit() { + Ret_M128d_M128i(__m128d, _mm_permutevar_pd); + Ret_M128_M128i(__m128, _mm_permutevar_ps); + Ret_M128d_Tint(__m128d, _mm_permute_pd); + Ret_M128_Tint(__m128, _mm_permute_ps); +} + +void test_swizzle_permute2f128() { + Ret_M256d_M256i(__m256d, _mm256_permutevar_pd); + Ret_M256_M256i(__m256, _mm256_permutevar_ps); + Ret_M256d_Tint(__m256d, _mm256_permute_pd); + Ret_M256_Tint(__m256, _mm256_permute_ps); + + Ret_M256d_M256d_Tint(__m256d, _mm256_permute2f128_pd); + Ret_M256_M256_Tint(__m256, _mm256_permute2f128_ps); + Ret_M256i_M256i_Tint(__m256i, _mm256_permute2f128_si256); +} + +void test_swizzle_blend() { + Ret_M256d_M256d_Tint(__m256d, _mm256_blend_pd); + Ret_M256_M256_Tint(__m256, _mm256_blend_ps); + Ret_M256d_M256d_M256d(__m256d, _mm256_blendv_pd); + Ret_M256_M256_M256(__m256, _mm256_blendv_ps); +} + +void test_swizzle_shuffle() { + Ret_M256d_M256d_Tint(__m256d, _mm256_shuffle_pd); + Ret_M256_M256_Tint(__m256, _mm256_shuffle_ps); +} + +void test_swizzle_extract_int() { + Ret_M256i_Tint(int, _mm256_extract_epi32); + Ret_M256i_Tint(int, _mm256_extract_epi16); + Ret_M256i_Tint(int, _mm256_extract_epi8); + Ret_M256i_Tint(int64_t, _mm256_extract_epi64); +} + +void test_swizzle_insert_int() { + Ret_M256i_int_Tint(__m256i, _mm256_insert_epi32); + Ret_M256i_int_Tint(__m256i, _mm256_insert_epi16); + Ret_M256i_int_Tint(__m256i, _mm256_insert_epi8); + Ret_M256i_int_Tint(__m256i, _mm256_insert_epi64); +} + +void test_swizzle_unpack() { + Ret_M256d_M256d(__m256d, _mm256_unpackhi_pd); + Ret_M256d_M256d(__m256d, _mm256_unpacklo_pd); + Ret_M256_M256(__m256, _mm256_unpackhi_ps); + Ret_M256_M256(__m256, _mm256_unpacklo_ps); +} + +void test_swizzle_insertf128() { + Ret_M256d_M128d_Tint(__m256d, _mm256_insertf128_pd); + Ret_M256_M128_Tint(__m256, _mm256_insertf128_ps); + Ret_M256i_M128i_Tint(__m256i, _mm256_insertf128_si256); +} + +void test_swizzle_extractf128() { + Ret_M256d_Tint(__m128d, _mm256_extractf128_pd); + Ret_M256_Tint(__m128, _mm256_extractf128_ps); + Ret_M256i_Tint(__m128i, _mm256_extractf128_si256); +} + +void test_swizzle(void) { + test_swizzle_128bit(); + test_swizzle_permute2f128(); + test_swizzle_blend(); + test_swizzle_shuffle(); + test_swizzle_extract_int(); + test_swizzle_insert_int(); + test_swizzle_unpack(); + test_swizzle_insertf128(); + test_swizzle_extractf128(); +} + +void test_convert(void) { + Ret_M128i(__m256d, _mm256_cvtepi32_pd); + Ret_M256i(__m256, _mm256_cvtepi32_ps); + + Ret_M256d(__m128, _mm256_cvtpd_ps); + + Ret_M256(__m256i, _mm256_cvtps_epi32); + Ret_M128(__m256d, _mm256_cvtps_pd); + + Ret_M256d(__m128i, _mm256_cvttpd_epi32); + + Ret_M256d(__m128i, _mm256_cvtpd_epi32); + + Ret_M256(__m256i, _mm256_cvttps_epi32); + + Ret_M256d(double, _mm256_cvtsd_f64); + + Ret_M256i(int, _mm256_cvtsi256_si32); + + Ret_M256(float, _mm256_cvtss_f32); +} + +void test_move(void) { + Ret_M256(__m256, _mm256_movehdup_ps); + Ret_M256(__m256, _mm256_moveldup_ps); + Ret_M256d(__m256d, _mm256_movedup_pd); +} + +// split test_compare into multiple functions to avoid too many locals error +void test_compare_128bit() { Ret_M128d_M128d_Tint_5bits(__m128d, _mm_cmp_pd); Ret_M128_M128_Tint_5bits(__m128, _mm_cmp_ps); Ret_M128d_M128d_Tint_5bits(__m128d, _mm_cmp_sd); Ret_M128_M128_Tint_5bits(__m128, _mm_cmp_ss); +} + +void 
test_mm256_cmp_pd_tint_0_to_15() { + Ret_M256d_M256d_Tint_5bits_0_to_15(__m256d, _mm256_cmp_pd); +} + +void test_mm256_cmp_pd_tint_16_to_31() { + Ret_M256d_M256d_Tint_5bits_16_to_31(__m256d, _mm256_cmp_pd); +} + +void test_mm256_cmp_pd() { + test_mm256_cmp_pd_tint_0_to_15(); + test_mm256_cmp_pd_tint_16_to_31(); +} + +void test_mm256_cmp_ps_tint_0_to_15() { + Ret_M256_M256_Tint_5bits_0_to_15(__m256, _mm256_cmp_ps); +} + +void test_mm256_cmp_ps_tint_16_to_31() { + Ret_M256_M256_Tint_5bits_16_to_31(__m256, _mm256_cmp_ps); +} + +void test_mm256_cmp_ps() { + test_mm256_cmp_ps_tint_0_to_15(); + test_mm256_cmp_ps_tint_16_to_31(); +} + +void test_compare(void) { + test_compare_128bit(); + test_mm256_cmp_pd(); + test_mm256_cmp_ps(); +} + +void test_misc(void) { + Ret_M256d(int, _mm256_movemask_pd); + Ret_M256(int, _mm256_movemask_ps); +} + +void test_load(void) { + Ret_FloatPtr(__m128, _mm_broadcast_ss, 1, 1); + Ret_DoublePtr(__m256d, _mm256_broadcast_sd, 1, 1); + Ret_FloatPtr(__m256, _mm256_broadcast_ss, 1, 1); + + // TODO reuse Ret_DoublePtr? + Ret_M128dPtr(__m256d, _mm256_broadcast_pd); + Ret_M128Ptr(__m256, _mm256_broadcast_ps); // must aligned? not sure + + Ret_DoublePtr(__m256d, _mm256_load_pd, 4, 4); // error, input not aligned + Ret_FloatPtr(__m256, _mm256_load_ps, 8, 8); // error, align + Ret_DoublePtr(__m256d, _mm256_loadu_pd, 4, 1); + Ret_FloatPtr(__m256, _mm256_loadu_ps, 8, 1); + + Ret_IntPtr(__m256i, _mm256_load_si256, __m256i*, 8, 8); // error, align + Ret_IntPtr(__m256i, _mm256_loadu_si256, __m256i_u*, 8, 1); + Ret_IntPtr(__m256i, _mm256_lddqu_si256, __m256i_u*, 8, 1); + Ret_DoublePtr_M128i(__m128d, _mm_maskload_pd, 2, 2); + Ret_DoublePtr_M256i(__m256d, _mm256_maskload_pd, 4, 4); Ret_FloatPtr_M128i(__m128, _mm_maskload_ps, 4, 4); + Ret_FloatPtr_M256i(__m256, _mm256_maskload_ps, 8, 8); + + Ret_DoublePtr_DoublePtr(__m256d, _mm256_loadu2_m128d, 2, 2); + Ret_FloatPtr_FloatPtr(__m256, _mm256_loadu2_m128, 4, 4); + Ret_IntPtr_IntPtr(__m256i, _mm256_loadu2_m128i, __m128i_u*, 4, 4); +} + +void test_store(void) { + + void_OutDoublePtr_M256d(_mm256_store_pd, double*, 32, 32); + void_OutFloatPtr_M256(_mm256_store_ps, float*, 32, 32); + void_OutDoublePtr_M256d(_mm256_storeu_pd, double*, 32, 1); + void_OutFloatPtr_M256(_mm256_storeu_ps, float*, 32, 1); + void_OutIntPtr_M256i(_mm256_store_si256, __m256i*, 32, 32); + void_OutIntPtr_M256i(_mm256_storeu_si256, __m256i_u*, 32, 1); + void_OutDoublePtr_M128i_M128d(_mm_maskstore_pd, double*, 16, 8); + void_OutDoublePtr_M256i_M256d(_mm256_maskstore_pd, double*, 32, 8); void_OutFloatPtr_M128i_M128(_mm_maskstore_ps, float*, 16, 4); - Ret_M128d_Tint(__m128d, _mm_permute_pd); - Ret_M128_Tint(__m128, _mm_permute_ps); - Ret_M128d_M128d(__m128d, _mm_permutevar_pd); - Ret_M128_M128(__m128, _mm_permutevar_ps); - Ret_M128d_M128d(int, _mm_testc_pd); - Ret_M128_M128(int, _mm_testc_ps); - Ret_M128d_M128d(int, _mm_testnzc_pd); - Ret_M128_M128(int, _mm_testnzc_ps); - Ret_M128d_M128d(int, _mm_testz_pd); - Ret_M128_M128(int, _mm_testz_ps); + void_OutFloatPtr_M256i_M256(_mm256_maskstore_ps, float*, 32, 4); + + void_OutIntPtr_M256i(_mm256_stream_si256, __m256i*, 32, 32); + void_OutDoublePtr_M256d(_mm256_stream_pd, double*, 32, 32); + void_OutFloatPtr_M256(_mm256_stream_ps, float*, 32, 32); + + void_OutFloatPtr_OutFloatPtr_M256(_mm256_storeu2_m128, float*, 32, 1); + void_OutDoublePtr_OutDoublePtr_M256d(_mm256_storeu2_m128d, double*, 32, 1); + void_OutIntPtr_OutIntPtr_M256i(_mm256_storeu2_m128i, __m128i_u*, 32, 1) +} + +void test_undef(void) { +#ifdef __EMSCRIPTEN__ + 
_mm256_undefined_pd(); + _mm256_undefined_ps(); + _mm256_undefined_si256(); +#endif +} + +void test_set(void) { + Ret_Double4(__m256d, _mm256_set_pd, 1); + Ret_Float8(__m256, _mm256_set_ps, 1); + Ret_Int8(__m256i, _mm256_set_epi32, 1); + Ret_Short16(__m256i, _mm256_set_epi16, 2); + Ret_Char32(__m256i, _mm256_set_epi8, 4); + Ret_Longlong4(__m256i, _mm256_set_epi64x, 1); + + Ret_Double4(__m256d, _mm256_setr_pd, 1); + Ret_Float8(__m256, _mm256_setr_ps, 1); + Ret_Int8(__m256i, _mm256_setr_epi32, 1); + Ret_Short16(__m256i, _mm256_setr_epi16, 2); + Ret_Char32(__m256i, _mm256_setr_epi8, 4); + Ret_Longlong4(__m256i, _mm256_setr_epi64x, 1); + + Ret_Double(__m256d, _mm256_set1_pd, 1); + Ret_Float(__m256, _mm256_set1_ps, 1); + Ret_Int(__m256i, _mm256_set1_epi32, 1); + Ret_Int(__m256i, _mm256_set1_epi16, 1); + Ret_Int(__m256i, _mm256_set1_epi8, 1); + Ret_Int(__m256i, _mm256_set1_epi64x, 1); + + char str[256] = {}; + __m256d zerod = _mm256_setzero_pd(); + tostr(&zerod, str); + printf("_mm256_setzero_pd() = %s\n", str); + + __m256 zero = _mm256_setzero_ps(); + tostr(&zero, str); + printf("_mm256_setzero_ps() = %s\n", str); + + __m256i zeroi = _mm256_setzero_si256(); + tostr(&zeroi, str); + printf("_mm256_setzero_si256() = %s\n", str); + + Ret_M128_M128(__m256, _mm256_set_m128); + Ret_M128d_M128d(__m256d, _mm256_set_m128d); + Ret_M128i_M128i(__m256i, _mm256_set_m128i); + + Ret_M128_M128(__m256, _mm256_setr_m128); + Ret_M128d_M128d(__m256d, _mm256_setr_m128d); + Ret_M128i_M128i(__m256i, _mm256_setr_m128i); +} + +void test_cast(void) { + Ret_M256d(__m256, _mm256_castpd_ps); + Ret_M256d(__m256i, _mm256_castpd_si256); + Ret_M256(__m256d, _mm256_castps_pd); + Ret_M256(__m256i, _mm256_castps_si256); + Ret_M256i(__m256d, _mm256_castsi256_pd); + Ret_M256i(__m256, _mm256_castsi256_ps); + + Ret_M256d(__m128d, _mm256_castpd256_pd128); + Ret_M256(__m128, _mm256_castps256_ps128); + Ret_M256i(__m128i, _mm256_castsi256_si128); + Ret_M128d(__m256d, _mm256_castpd128_pd256); + Ret_M128(__m256, _mm256_castps128_ps256); + Ret_M128i(__m256i, _mm256_castsi128_si256); + + Ret_M128d(__m256d, _mm256_zextpd128_pd256); + Ret_M128(__m256, _mm256_zextps128_ps256); + Ret_M128i(__m256i, _mm256_zextsi128_si256); +} + +int main() { + assert(numInterestingFloats % 8 == 0); + assert(numInterestingInts % 8 == 0); + assert(numInterestingDoubles % 4 == 0); + + test_arithmetic(); + test_special_math(); + test_elementary_math(); + test_logical(); + test_swizzle(); + test_convert(); + test_move(); + test_compare(); + test_misc(); + test_load(); + test_store(); + test_undef(); + test_set(); + test_cast(); } diff --git a/test/sse/test_sse.h b/test/sse/test_sse.h index ca162c703fdde..f1660862a4fd1 100644 --- a/test/sse/test_sse.h +++ b/test/sse/test_sse.h @@ -36,23 +36,122 @@ double ucastd(uint64_t t) { return *(double*)&t; } // Data used in test. Store them global and access via a getter to confuse optimizer to not "solve" the whole test suite at compile-time, // so that the operation will actually be performed at runtime, and not at compile-time. 
(Testing the capacity of the compiler to perform // SIMD ops at compile-time would be interesting as well, but that's for another test) -float interesting_floats_[] = { -INFINITY, -FLT_MAX, -2.5f, -1.5f, -1.4f, -1.0f, -0.5f, -0.2f, -FLT_MIN, -0.f, 0.f, - 1.401298464e-45f, FLT_MIN, 0.3f, 0.5f, 0.8f, 1.0f, 1.5f, 2.5f, 3.5f, 3.6f, FLT_MAX, INFINITY, NAN, - ucastf(0x01020304), ucastf(0x80000000), ucastf(0x7FFFFFFF), ucastf(0xFFFFFFFF) - }; - -double interesting_doubles_[] = { -INFINITY, -FLT_MAX, -2.5, -1.5, -1.4, -1.0, -0.5, -0.2, -FLT_MIN, -0.0, 0.0, - 1.401298464e-45, FLT_MIN, 0.3, 0.5, 0.8, 1.0, 1.5, 2.5, 3.5, 3.6, FLT_MAX, INFINITY, NAN, - ucastd(0x0102030405060708ULL), ucastd(0x8000000000000000ULL), - ucastd(0x7FFFFFFFFFFFFFFFULL), ucastd(0xFFFFFFFFFFFFFFFFULL) - }; - -uint32_t interesting_ints_[] = { 0, 1, 2, 3, 0x01020304, 0x10203040, 0x7FFFFFFF, 0xFFFFFFFF, 0xFFFFFFFE, 0x12345678, 0x9ABCDEF1, 0x80000000, - 0x80808080, 0x7F7F7F7F, 0x01010101, 0x11111111, 0x20202020, 0x0F0F0F0F, 0xF0F0F0F0, - fcastu(-INFINITY), fcastu(-FLT_MAX), fcastu(-2.5f), fcastu(-1.5f), fcastu(-1.4f), fcastu(-1.0f), fcastu(-0.5f), - fcastu(-0.2f), fcastu(-FLT_MIN), 0xF9301AB9, 0x0039AB12, 0x19302BCD, - fcastu(1.401298464e-45f), fcastu(FLT_MIN), fcastu(0.3f), fcastu(0.5f), fcastu(0.8f), fcastu(1.0f), fcastu(1.5f), - fcastu(2.5f), fcastu(3.5f), fcastu(3.6f), fcastu(FLT_MAX), fcastu(INFINITY), fcastu(NAN) }; +__attribute__((aligned(32))) +float interesting_floats_[] = { + -INFINITY, + -FLT_MAX, + -2.5f, + -1.5f, + -1.4f, + -1.0f, + -0.5f, + -0.2f, + -FLT_MIN, + -0.f, + 0.f, + 1.401298464e-45f, + FLT_MIN, + 0.3f, + 0.5f, + 0.8f, + 1.0f, + 1.5f, + 2.5f, + 3.5f, + 3.6f, + FLT_MAX, + INFINITY, + NAN, + ucastf(0x01020304), + ucastf(0x80000000), + ucastf(0x7FFFFFFF), + ucastf(0xFFFFFFFF), + -2.70497e+38f, + -3.2995e-21f, + 3.40282e+38f, + 3.38211e+19f}; + +__attribute__((aligned(32))) +double interesting_doubles_[] = { + -INFINITY, + -FLT_MAX, + -2.5, + -1.5, + -1.4, + -1.0, + -0.5, + -0.2, + -FLT_MIN, + -0.0, + 0.0, + 1.401298464e-45, + FLT_MIN, + 0.3, + 0.5, + 0.8, + 1.0, + 1.5, + 2.5, + 3.5, + 3.6, + FLT_MAX, + INFINITY, + NAN, + ucastd(0x0102030405060708ULL), + ucastd(0x8000000000000000ULL), + ucastd(0x7FFFFFFFFFFFFFFFULL), + ucastd(0xFFFFFFFFFFFFFFFFULL)}; + +__attribute__((aligned(32))) +uint32_t interesting_ints_[] = { + 0, + 1, + 2, + 3, + 0x01020304, + 0x10203040, + 0x7FFFFFFF, + 0xFFFFFFFF, + 0xFFFFFFFE, + 0x12345678, + 0x9ABCDEF1, + 0x80000000, + 0x80808080, + 0x7F7F7F7F, + 0x01010101, + 0x11111111, + 0x20202020, + 0x0F0F0F0F, + 0xF0F0F0F0, + fcastu(-INFINITY), + fcastu(-FLT_MAX), + fcastu(-2.5f), + fcastu(-1.5f), + fcastu(-1.4f), + fcastu(-1.0f), + fcastu(-0.5f), + fcastu(-0.2f), + fcastu(-FLT_MIN), + 0xF9301AB9, + 0x0039AB12, + 0x19302BCD, + fcastu(1.401298464e-45f), + fcastu(FLT_MIN), + fcastu(0.3f), + fcastu(0.5f), + fcastu(0.8f), + fcastu(1.0f), + fcastu(1.5f), + fcastu(2.5f), + fcastu(3.5f), + fcastu(3.6f), + fcastu(FLT_MAX), + fcastu(INFINITY), + fcastu(NAN), + 0x000003FF, + 0xDDDDDDDD, + 0x88888888, + 0xEEEEEEEE}; bool always_true() { return time(NULL) != 0; } // This function always returns true, but the compiler should not know this. 
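Aside (illustrative only, not part of the patch): the __attribute__((aligned(32))) added to the arrays above matters because the new 256-bit tests use these arrays directly as memory operands, and the nominally aligned AVX loads and stores touch a full 32 bytes at a time. A minimal sketch of the distinction being exercised, using the array name from this file:

    __m256d a = _mm256_load_pd(interesting_doubles_);      // "aligned" form, expects a 32-byte boundary
    __m256d u = _mm256_loadu_pd(interesting_doubles_ + 1); // unaligned form, any address is fine

Under Wasm SIMD the alignment is only a hint, but the same test source is also compiled natively to produce the reference output, and on x86 a misaligned _mm256_load_pd can fault, hence the stricter alignment of the source data.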
@@ -192,17 +291,25 @@ void tostr(align1_double *m, char *outstr) { } void tostr(align1_double *m, int numElems, char *outstr) { - char s[2][64]; + assert(numElems <= 4); + char s[4][64]; for(int i = 0; i < numElems; ++i) SerializeDouble(m[i], s[i]); switch(numElems) { case 1: sprintf(outstr, "{%s}", s[0]); break; case 2: sprintf(outstr, "{%s,%s}", s[0], s[1]); break; + case 3: + sprintf(outstr, "{%s,%s,%s}", s[0], s[1], s[2]); + break; + case 4: + sprintf(outstr, "{%s,%s,%s,%s}", s[0], s[1], s[2], s[3]); + break; } } void tostr(align1_float *m, int numElems, char *outstr) { - char s[4][64]; + assert(numElems <= 8); + char s[8][64]; for(int i = 0; i < numElems; ++i) SerializeFloat(m[i], s[i]); switch(numElems) { @@ -210,22 +317,125 @@ void tostr(align1_float *m, int numElems, char *outstr) { case 2: sprintf(outstr, "{%s,%s}", s[0], s[1]); break; case 3: sprintf(outstr, "{%s,%s,%s}", s[0], s[1], s[2]); break; case 4: sprintf(outstr, "{%s,%s,%s,%s}", s[0], s[1], s[2], s[3]); break; + case 5: + sprintf(outstr, "{%s,%s,%s,%s,%s}", s[0], s[1], s[2], s[3], s[4]); + break; + case 6: + sprintf( + outstr, "{%s,%s,%s,%s,%s,%s}", s[0], s[1], s[2], s[3], s[4], s[5]); + break; + case 7: + sprintf(outstr, + "{%s,%s,%s,%s,%s,%s,%s}", + s[0], + s[1], + s[2], + s[3], + s[4], + s[5], + s[6]); + break; + case 8: + sprintf(outstr, + "{%s,%s,%s,%s,%s,%s,%s,%s}", + s[0], + s[1], + s[2], + s[3], + s[4], + s[5], + s[6], + s[7]); + break; } } void tostr(align1_int *s, int numElems, char *outstr) { + assert(numElems <= 8); switch(numElems) { case 1: sprintf(outstr, "{0x%08X}", s[0]); break; case 2: sprintf(outstr, "{0x%08X,0x%08X}", s[0], s[1]); break; case 3: sprintf(outstr, "{0x%08X,0x%08X,0x%08X}", s[0], s[1], s[2]); break; case 4: sprintf(outstr, "{0x%08X,0x%08X,0x%08X,0x%08X}", s[0], s[1], s[2], s[3]); break; + case 5: + sprintf(outstr, + "{0x%08X,0x%08X,0x%08X,0x%08X,0x%08X}", + s[0], + s[1], + s[2], + s[3], + s[4]); + break; + case 6: + sprintf(outstr, + "{0x%08X,0x%08X,0x%08X,0x%08X,0x%08X,0x%08X}", + s[0], + s[1], + s[2], + s[3], + s[4], + s[5]); + break; + case 7: + sprintf(outstr, + "{0x%08X,0x%08X,0x%08X,0x%08X,0x%08X,0x%08X,0x%08X}", + s[0], + s[1], + s[2], + s[3], + s[4], + s[5], + s[6]); + break; + case 8: + sprintf(outstr, + "{0x%08X,0x%08X,0x%08X,0x%08X,0x%08X,0x%08X,0x%08X,0x%08X}", + s[0], + s[1], + s[2], + s[3], + s[4], + s[5], + s[6], + s[7]); + break; } } void tostr(align1_int64 *m, int numElems, char *outstr) { + assert(numElems <= 4); switch(numElems) { case 1: sprintf(outstr, "{0x%08X%08X}", (int)(*m >> 32), (int)*m); break; - case 2: sprintf(outstr, "{0x%08X%08X,0x%08X%08X}", (int)(*m >> 32), (int)*m, (int)(m[1] >> 32), (int)m[1]); + case 2: + sprintf(outstr, + "{0x%08X%08X,0x%08X%08X}", + (int)(*m >> 32), + (int)*m, + (int)(m[1] >> 32), + (int)m[1]); + break; + case 3: + sprintf(outstr, + "{0x%08X%08X,0x%08X%08X,0x%08X%08X}", + (int)(*m >> 32), + (int)*m, + (int)(m[1] >> 32), + (int)m[1], + (int)(m[2] >> 32), + (int)m[2]); + break; + case 4: + sprintf(outstr, + "{0x%08X%08X,0x%08X%08X,0x%08X%08X,0x%08X%08X}", + (int)(*m >> 32), + (int)*m, + (int)(m[1] >> 32), + (int)m[1], + (int)(m[2] >> 32), + (int)m[2], + (int)(m[3] >> 32), + (int)m[3]); + break; } } @@ -593,7 +803,7 @@ __m128 ExtractIntInRandomOrder(unsigned int *arr, int i, int n, int prime) { printf("%s(%s) = %s\n", #func, str, str2); \ } -float tempOutFloatStore[16]; +float tempOutFloatStore[32]; float *getTempOutFloatStore(int alignmentBytes) { memset(tempOutFloatStore, 0, sizeof(tempOutFloatStore)); uintptr_t addr = 
(uintptr_t)tempOutFloatStore; @@ -933,3 +1143,1017 @@ double *getTempOutDoubleStore(int alignmentBytes) { return (double*)getTempOutFl char str3[256]; tostr(&ret, str3); \ printf("%s(%s, %s) = %s\n", #func, str, str2, str3); \ } + +#ifdef __AVX__ + +void tostr(__m256* m, char* outstr) { + union { + __m256 m; + float val[8]; + } u; + u.m = *m; + char s[8][32]; + for (int i = 0; i < 8; i++) { + SerializeFloat(u.val[i], s[i]); + } + sprintf(outstr, + "[%s,%s,%s,%s,%s,%s,%s,%s]", + s[7], + s[6], + s[5], + s[4], + s[3], + s[2], + s[1], + s[0]); +} + +void tostr(__m256i* m, char* outstr) { + union { + __m256i m; + uint32_t val[8]; + } u; + u.m = *m; + sprintf(outstr, + "[0x%08X,0x%08X,0x%08X,0x%08X,0x%08X,0x%08X,0x%08X,0x%08X]", + u.val[7], + u.val[6], + u.val[5], + u.val[4], + u.val[3], + u.val[2], + u.val[1], + u.val[0]); +} + +void tostr(__m256d* m, char* outstr) { + union { + __m256d m; + double val[4]; + } u; + u.m = *m; + char s[4][64]; + SerializeDouble(u.val[0], s[0]); + SerializeDouble(u.val[1], s[1]); + SerializeDouble(u.val[2], s[2]); + SerializeDouble(u.val[3], s[3]); + sprintf(outstr, "[%s,%s,%s,%s]", s[3], s[2], s[1], s[0]); +} + +void tostr_approx(__m256* m, char* outstr, bool approximate) { + union { + __m256 m; + float val[8]; + } u; + u.m = *m; + char s[8][32]; + + for (int i = 0; i < 8; i++) { + SerializeFloat(u.val[i], s[i], approximate); + } + sprintf(outstr, + "[%s,%s,%s,%s,%s,%s,%s,%s]", + s[7], + s[6], + s[5], + s[4], + s[3], + s[2], + s[1], + s[0]); +} + +#define Ret_M128_M128i(Ret_type, func) \ + for (int i = 0; i < numInterestingFloats / 4; ++i) \ + for (int k = 0; k < 4; ++k) \ + for (int j = 0; j < numInterestingInts / 4; ++j) { \ + __m128 m1 = E1(interesting_floats, i * 4 + k, numInterestingFloats); \ + __m128i m2 = \ + (__m128i)E1_Int(interesting_ints, j * 4, numInterestingInts); \ + Ret_type ret = func(m1, m2); \ + char str[256]; \ + tostr(&m1, str); \ + char str2[256]; \ + tostr(&m2, str2); \ + char str3[256]; \ + tostr(&ret, str3); \ + printf("%s(%s, %s) = %s\n", #func, str, str2, str3); \ + } + +#define Ret_M128d_M128i(Ret_type, func) \ + for (int i = 0; i < numInterestingDoubles / 2; ++i) \ + for (int k = 0; k < 2; ++k) \ + for (int j = 0; j < numInterestingInts / 4; ++j) { \ + __m128d m1 = \ + E1_Double(interesting_doubles, i * 2 + k, numInterestingDoubles); \ + __m128i m2 = \ + (__m128i)E1_Int(interesting_ints, j * 4, numInterestingInts); \ + Ret_type ret = func(m1, m2); \ + char str[256]; \ + tostr(&m1, str); \ + char str2[256]; \ + tostr(&m2, str2); \ + char str3[256]; \ + tostr(&ret, str3); \ + printf("%s(%s, %s) = %s\n", #func, str, str2, str3); \ + } + +#define Ret_M256d(Ret_type, func) \ + for (int i = 0; i < numInterestingDoubles / 2; ++i) \ + for (int k = 0; k < 2; ++k) { \ + __m128d tmp = \ + E1_Double(interesting_doubles, i * 2 + k, numInterestingDoubles); \ + __m256d m1 = _mm256_set_m128d(tmp, tmp); \ + Ret_type ret = func(m1); \ + char str[256]; \ + tostr(&m1, str); \ + char str2[256]; \ + tostr(&ret, str2); \ + printf("%s(%s) = %s\n", #func, str, str2); \ + } + +#define Ret_M256(Ret_type, func) \ + for (int i = 0; i < numInterestingFloats / 4; ++i) \ + for (int k = 0; k < 4; ++k) { \ + __m128 tmp = E1(interesting_floats, i * 4 + k, numInterestingFloats); \ + __m256 m1 = _mm256_set_m128(tmp, tmp); \ + Ret_type ret = func(m1); \ + char str[256]; \ + tostr(&m1, str); \ + char str2[256]; \ + tostr(&ret, str2); \ + printf("%s(%s) = %s\n", #func, str, str2); \ + } + +#define Ret_M256approx(Ret_type, func) \ + for (int i = 0; i < numInterestingFloats / 
4; ++i) \ + for (int k = 0; k < 4; ++k) { \ + __m128 tmp = E1(interesting_floats, i * 4 + k, numInterestingFloats); \ + __m256 m1 = _mm256_set_m128(tmp, tmp); \ + Ret_type ret = func(m1); \ + char str[256]; \ + tostr(&m1, str); \ + char str2[256]; \ + tostr_approx(&ret, str2, true /*approximate*/); \ + printf("%s(%s) = %s\n", #func, str, str2); \ + } + +#define Ret_M256d_M256d(Ret_type, func) \ + for (int i = 0; i < numInterestingDoubles / 2; ++i) \ + for (int k = 0; k < 2; ++k) \ + for (int j = 0; j < numInterestingDoubles / 2; ++j) { \ + __m128d tmp = \ + E1_Double(interesting_doubles, i * 2 + k, numInterestingDoubles); \ + __m256d m1 = _mm256_set_m128d(tmp, tmp); \ + tmp = E2_Double(interesting_doubles, j * 2, numInterestingDoubles); \ + __m256d m2 = _mm256_set_m128d(tmp, tmp); \ + Ret_type ret = func(m1, m2); \ + /* a op b */ \ + char str[256]; \ + tostr(&m1, str); \ + char str2[256]; \ + tostr(&m2, str2); \ + char str3[256]; \ + tostr(&ret, str3); \ + printf("%s(%s, %s) = %s\n", #func, str, str2, str3); \ + /* b op a */ \ + ret = func(m2, m1); \ + tostr(&m1, str); \ + tostr(&m2, str2); \ + tostr(&ret, str3); \ + printf("%s(%s, %s) = %s\n", #func, str, str2, str3); \ + } + +#define Ret_M256_M256(Ret_type, func) \ + for (int i = 0; i < numInterestingFloats / 4; ++i) \ + for (int k = 0; k < 4; ++k) \ + for (int j = 0; j < numInterestingFloats / 4; ++j) { \ + __m128 tmp = E1(interesting_floats, i * 4 + k, numInterestingFloats); \ + __m256 m1 = _mm256_set_m128(tmp, tmp); \ + tmp = E2(interesting_floats, j * 4, numInterestingFloats); \ + __m256 m2 = _mm256_set_m128(tmp, tmp); \ + Ret_type ret = func(m1, m2); \ + char str[256]; \ + tostr(&m1, str); \ + char str2[256]; \ + tostr(&m2, str2); \ + char str3[256]; \ + tostr(&ret, str3); \ + printf("%s(%s, %s) = %s\n", #func, str, str2, str3); \ + } + +#define Ret_M256i_M256i(Ret_type, func) \ + for (int i = 0; i < numInterestingInts / 4; ++i) \ + for (int k = 0; k < 4; ++k) \ + for (int j = 0; j < numInterestingInts / 4; ++j) { \ + __m128i tmp = \ + (__m128i)E1_Int(interesting_ints, i * 4 + k, numInterestingInts); \ + __m256i m1 = _mm256_set_m128i(tmp, tmp); \ + tmp = (__m128i)E2_Int(interesting_ints, j * 4, numInterestingInts); \ + __m256i m2 = _mm256_set_m128i(tmp, tmp); \ + Ret_type ret = func(m1, m2); \ + char str[256]; \ + tostr(&m1, str); \ + char str2[256]; \ + tostr(&m2, str2); \ + char str3[256]; \ + tostr(&ret, str3); \ + printf("%s(%s, %s) = %s\n", #func, str, str2, str3); \ + } + +#define Ret_M256d_M256i(Ret_type, func) \ + for (int i = 0; i < numInterestingDoubles / 2; ++i) \ + for (int k = 0; k < 2; ++k) \ + for (int j = 0; j < numInterestingInts / 4; ++j) { \ + __m128d tmp1 = \ + E1_Double(interesting_doubles, i * 2 + k, numInterestingDoubles); \ + __m256d m1 = _mm256_set_m128d(tmp1, tmp1); \ + __m128i tmp2 = \ + (__m128i)E1_Int(interesting_ints, j * 4, numInterestingInts); \ + __m256i m2 = _mm256_set_m128i(tmp2, tmp2); \ + Ret_type ret = func(m1, m2); \ + /* a op b */ \ + char str[256]; \ + tostr(&m1, str); \ + char str2[256]; \ + tostr(&m2, str2); \ + char str3[256]; \ + tostr(&ret, str3); \ + printf("%s(%s, %s) = %s\n", #func, str, str2, str3); \ + } + +#define Ret_M256_M256i(Ret_type, func) \ + for (int i = 0; i < numInterestingFloats / 4; ++i) \ + for (int k = 0; k < 4; ++k) \ + for (int j = 0; j < numInterestingInts / 4; ++j) { \ + __m128 tmp1 = E1(interesting_floats, i * 4 + k, numInterestingFloats); \ + __m256 m1 = _mm256_set_m128(tmp1, tmp1); \ + __m128i tmp2 = \ + (__m128i)E1_Int(interesting_ints, j * 4, 
numInterestingInts); \ + __m256i m2 = _mm256_set_m128i(tmp2, tmp2); \ + Ret_type ret = func(m1, m2); \ + char str[256]; \ + tostr(&m1, str); \ + char str2[256]; \ + tostr(&m2, str2); \ + char str3[256]; \ + tostr(&ret, str3); \ + printf("%s(%s, %s) = %s\n", #func, str, str2, str3); \ + } + +#define Ret_M256_M256_M256(Ret_type, func) \ + for (int i = 0; i < numInterestingFloats / 4; ++i) \ + for (int k = 0; k < 4; ++k) \ + for (int j = 0; j < numInterestingFloats / 4; ++j) \ + for (int l = 0; l < numInterestingFloats / 4; ++l) { \ + __m128 tmp = \ + E1(interesting_floats, i * 4 + k, numInterestingFloats); \ + __m256 m1 = _mm256_set_m128(tmp, tmp); \ + tmp = E2(interesting_floats, j * 4, numInterestingFloats); \ + __m256 m2 = _mm256_set_m128(tmp, tmp); \ + tmp = E1(interesting_floats, l * 4, numInterestingFloats); \ + __m256 m3 = _mm256_set_m128(tmp, tmp); \ + Ret_type ret = func(m1, m2, m3); \ + char str[256]; \ + tostr(&m1, str); \ + char str2[256]; \ + tostr(&m2, str2); \ + char str3[256]; \ + tostr(&m3, str3); \ + char str4[256]; \ + tostr(&ret, str4); \ + printf("%s(%s, %s, %s) = %s\n", #func, str, str2, str3, str4); \ + } + +#define Ret_M256d_M256d_M256d(Ret_type, func) \ + for (int i = 0; i < numInterestingDoubles / 2; ++i) \ + for (int k = 0; k < 2; ++k) \ + for (int j = 0; j < numInterestingDoubles / 2; ++j) \ + for (int l = 0; l < numInterestingDoubles / 2; ++l) { \ + __m128d tmp = \ + E1_Double(interesting_doubles, i * 2 + k, numInterestingDoubles); \ + __m256d m1 = _mm256_set_m128d(tmp, tmp); \ + tmp = E2_Double(interesting_doubles, j * 2, numInterestingDoubles); \ + __m256d m2 = _mm256_set_m128d(tmp, tmp); \ + tmp = E1_Double(interesting_doubles, l * 2, numInterestingDoubles); \ + __m256d m3 = _mm256_set_m128d(tmp, tmp); \ + Ret_type ret = func(m1, m2, m3); \ + /* a, b, c */ \ + char str[256]; \ + tostr(&m1, str); \ + char str2[256]; \ + tostr(&m2, str2); \ + char str3[256]; \ + tostr(&m3, str3); \ + char str4[256]; \ + tostr(&ret, str4); \ + printf("%s(%s, %s, %s) = %s\n", #func, str, str2, str3, str4); \ + /* b, c, a */ \ + ret = func(m2, m3, m1); \ + tostr(&m1, str); \ + tostr(&m2, str2); \ + tostr(&m3, str3); \ + tostr(&ret, str4); \ + printf("%s(%s, %s, %s) = %s\n", #func, str, str2, str3, str4); \ + /* c, a, b */ \ + ret = func(m3, m1, m2); \ + tostr(&m1, str); \ + tostr(&m2, str2); \ + tostr(&m3, str3); \ + tostr(&ret, str4); \ + printf("%s(%s, %s, %s) = %s\n", #func, str, str2, str3, str4); \ + } + +#define Ret_M256i(Ret_type, func) \ + for (int i = 0; i < numInterestingInts / 4; ++i) \ + for (int k = 0; k < 4; ++k) { \ + __m128i tmp = \ + (__m128i)E1_Int(interesting_ints, i * 4 + k, numInterestingInts); \ + __m256i m1 = _mm256_set_m128i(tmp, tmp); \ + Ret_type ret = func(m1); \ + char str[256]; \ + tostr(&m1, str); \ + char str2[256]; \ + tostr(&ret, str2); \ + printf("%s(%s) = %s\n", #func, str, str2); \ + } + +#define Ret_M128dPtr(Ret_type, func) \ + for (int i = 0; i + 2 <= numInterestingDoubles; i += 2) { \ + double* ptr = interesting_doubles + i; \ + Ret_type ret = func((__m128d*)ptr); \ + char str[256]; \ + tostr(ptr, 2, str); \ + char str2[256]; \ + tostr(&ret, str2); \ + printf("%s(%s) = %s\n", #func, str, str2); \ + } + +#define Ret_M128Ptr(Ret_type, func) \ + for (int i = 0; i + 4 <= numInterestingFloats; i += 4) { \ + float* ptr = interesting_floats + i; \ + Ret_type ret = func((__m128*)ptr); \ + char str[256]; \ + tostr(ptr, 4, str); \ + char str2[256]; \ + tostr(&ret, str2); \ + printf("%s(%s) = %s\n", #func, str, str2); \ + } + +#define 
Ret_DoublePtr_DoublePtr(Ret_type, func, numElemsAccessed, inc) \ + for (int i = 0; i + numElemsAccessed <= numInterestingDoubles; i += inc) { \ + double* ptr1 = interesting_doubles + i; \ + for (int j = 0; j + numElemsAccessed <= numInterestingDoubles; j += inc) { \ + double* ptr2 = interesting_doubles + j; \ + Ret_type ret = func(ptr1, ptr2); \ + char str1[256]; \ + tostr(ptr1, numElemsAccessed, str1); \ + char str2[256]; \ + tostr(ptr2, numElemsAccessed, str2); \ + char str3[256]; \ + tostr(&ret, str3); \ + printf("%s(%s, %s) = %s\n", #func, str1, str2, str3); \ + } \ + } + +#define Ret_FloatPtr_FloatPtr(Ret_type, func, numElemsAccessed, inc) \ + for (int i = 0; i + numElemsAccessed <= numInterestingFloats; i += inc) { \ + float* ptr1 = interesting_floats + i; \ + for (int j = 0; j + numElemsAccessed <= numInterestingFloats; j += inc) { \ + float* ptr2 = interesting_floats + j; \ + Ret_type ret = func(ptr1, ptr2); \ + char str1[256]; \ + tostr(ptr1, numElemsAccessed, str1); \ + char str2[256]; \ + tostr(ptr2, numElemsAccessed, str2); \ + char str3[256]; \ + tostr(&ret, str3); \ + printf("%s(%s,%s) = %s\n", #func, str1, str2, str3); \ + } \ + } + +#define Ret_IntPtr_IntPtr(Ret_type, func, Ptr_type, numElemsAccessed, inc) \ + for (int i = 0; i + numElemsAccessed <= numInterestingInts; i += inc) { \ + uint32_t* ptr1 = interesting_ints + i; \ + for (int j = 0; j + numElemsAccessed <= numInterestingInts; j += inc) { \ + uint32_t* ptr2 = interesting_ints + j; \ + Ret_type ret = func((Ptr_type)ptr1, (Ptr_type)ptr2); \ + char str1[256]; \ + tostr((int*)ptr1, numElemsAccessed, str1); \ + char str2[256]; \ + tostr((int*)ptr2, numElemsAccessed, str2); \ + char str3[256]; \ + tostr(&ret, str3); \ + printf("%s(%s, %s) = %s\n", #func, str1, str2, str3); \ + } \ + } + +#define Ret_DoublePtr_M256i(Ret_type, func, numElemsAccessed, inc) \ + for (int i = 0; i + numElemsAccessed <= numInterestingDoubles; i += inc) \ + for (int j = 0; j < numInterestingInts / 4; ++j) { \ + double* ptr = interesting_doubles + i; \ + __m128i tmp = \ + (__m128i)E2_Int(interesting_ints, j * 4, numInterestingInts); \ + __m256i m1 = _mm256_set_m128i(tmp, tmp); \ + Ret_type ret = func(ptr, m1); \ + char str[256]; \ + tostr(ptr, numElemsAccessed, str); \ + char str2[256]; \ + tostr(&ret, str2); \ + printf("%s(%s) = %s\n", #func, str, str2); \ + } + +#define Ret_FloatPtr_M256i(Ret_type, func, numElemsAccessed, inc) \ + for (int i = 0; i + numElemsAccessed <= numInterestingFloats; i += inc) \ + for (int j = 0; j < numInterestingInts / 4; ++j) { \ + float* ptr = interesting_floats + i; \ + __m128i tmp = \ + (__m128i)E1_Int(interesting_ints, j * 4, numInterestingInts); \ + __m256i m1 = _mm256_set_m128i(tmp, tmp); \ + Ret_type ret = func(ptr, m1); \ + char str[256]; \ + tostr(ptr, numElemsAccessed, str); \ + char str2[256]; \ + tostr(&ret, str2); \ + printf("%s(%s) = %s\n", #func, str, str2); \ + } + +#define Ret_M256d_M256d_Tint_body(Ret_type, func, Tint) \ + for (int i = 0; i < numInterestingDoubles / 2; ++i) \ + for (int k = 0; k < 2; ++k) \ + for (int j = 0; j < numInterestingDoubles / 2; ++j) { \ + __m128d tmp = \ + E1_Double(interesting_doubles, i * 2 + k, numInterestingDoubles); \ + __m256d m1 = _mm256_set_m128d(tmp, tmp); \ + tmp = E2_Double(interesting_doubles, j * 2, numInterestingDoubles); \ + __m256d m2 = _mm256_set_m128d(tmp, tmp); \ + Ret_type ret = func(m1, m2, Tint); \ + /* a op b */ \ + char str[256]; \ + tostr(&m1, str); \ + char str2[256]; \ + tostr(&m2, str2); \ + char str3[256]; \ + tostr(&ret, str3); \ + 
printf("%s(%s, %s, %d) = %s\n", #func, str, str2, Tint, str3); \ + /* b op a */ \ + ret = func(m2, m1, Tint); \ + tostr(&m1, str); \ + tostr(&m2, str2); \ + tostr(&ret, str3); \ + printf("%s(%s, %s, %d) = %s\n", #func, str, str2, Tint, str3); \ + } + +#define Ret_M256_M256_Tint_body(Ret_type, func, Tint) \ + for (int i = 0; i < numInterestingFloats / 4; ++i) \ + for (int k = 0; k < 4; ++k) \ + for (int j = 0; j < numInterestingFloats / 4; ++j) { \ + __m128 tmp = E1(interesting_floats, i * 4 + k, numInterestingFloats); \ + __m256 m1 = _mm256_set_m128(tmp, tmp); \ + tmp = E2(interesting_floats, j * 4, numInterestingFloats); \ + __m256 m2 = _mm256_set_m128(tmp, tmp); \ + Ret_type ret = func(m1, m2, Tint); \ + /* a op b */ \ + char str[256]; \ + tostr(&m1, str); \ + char str2[256]; \ + tostr(&m2, str2); \ + char str3[256]; \ + tostr(&ret, str3); \ + printf("%s(%s, %s, %d) = %s\n", #func, str, str2, Tint, str3); \ + /* b op a */ \ + ret = func(m2, m1, Tint); \ + tostr(&m1, str); \ + tostr(&m2, str2); \ + tostr(&ret, str3); \ + printf("%s(%s, %s, %d) = %s\n", #func, str, str2, Tint, str3); \ + } + +#define Ret_M256i_M256i_Tint_body(Ret_type, func, Tint) \ + for (int i = 0; i < numInterestingInts / 4; ++i) \ + for (int k = 0; k < 4; ++k) \ + for (int j = 0; j < numInterestingInts / 4; ++j) { \ + __m128i tmp = \ + (__m128i)E1_Int(interesting_ints, i * 4 + k, numInterestingInts); \ + __m256i m1 = _mm256_set_m128i(tmp, tmp); \ + tmp = (__m128i)E2_Int(interesting_ints, j * 4, numInterestingInts); \ + __m256i m2 = _mm256_set_m128i(tmp, tmp); \ + Ret_type ret = func(m1, m2, Tint); \ + /* a op b */ \ + char str[256]; \ + tostr(&m1, str); \ + char str2[256]; \ + tostr(&m2, str2); \ + char str3[256]; \ + tostr(&ret, str3); \ + printf("%s(%s, %s, %d) = %s\n", #func, str, str2, Tint, str3); \ + /* b op a */ \ + ret = func(m2, m1, Tint); \ + tostr(&m1, str); \ + tostr(&m2, str2); \ + tostr(&ret, str3); \ + printf("%s(%s, %s, %d) = %s\n", #func, str, str2, Tint, str3); \ + } + +#define Ret_M256_Tint_body(Ret_type, func, Tint) \ + for (int i = 0; i < numInterestingFloats / 4; ++i) \ + for (int k = 0; k < 4; ++k) { \ + __m128 tmp = E1(interesting_floats, i * 4 + k, numInterestingFloats); \ + __m256 m1 = _mm256_set_m128(tmp, tmp); \ + Ret_type ret = func(m1, Tint); \ + char str[256]; \ + tostr(&m1, str); \ + char str2[256]; \ + tostr(&ret, str2); \ + printf("%s(%s, %d) = %s\n", #func, str, Tint, str2); \ + } + +#define Ret_M256d_Tint_body(Ret_type, func, Tint) \ + for (int i = 0; i < numInterestingDoubles / 2; ++i) \ + for (int k = 0; k < 2; ++k) { \ + __m128d tmp = \ + E1_Double(interesting_doubles, i * 2 + k, numInterestingDoubles); \ + __m256d m1 = _mm256_set_m128d(tmp, tmp); \ + Ret_type ret = func(m1, Tint); \ + char str[256]; \ + tostr(&m1, str); \ + char str2[256]; \ + tostr(&ret, str2); \ + printf("%s(%s, %d) = %s\n", #func, str, Tint, str2); \ + } + +#define Ret_M256i_Tint_body(Ret_type, func, Tint) \ + for (int i = 0; i < numInterestingInts / 4; ++i) \ + for (int k = 0; k < 4; ++k) { \ + __m128i tmp = \ + (__m128i)E1_Int(interesting_ints, i * 4 + k, numInterestingInts); \ + __m256i m1 = _mm256_set_m128i(tmp, tmp); \ + Ret_type ret = func(m1, Tint); \ + char str[256]; \ + tostr(&m1, str); \ + char str2[256]; \ + tostr(&ret, str2); \ + printf("%s(%s, %d) = %s\n", #func, str, Tint, str2); \ + } + +#define Ret_M256i_int_Tint_body(Ret_type, func, Tint) \ + for (int i = 0; i < numInterestingInts / 4; ++i) \ + for (int j = 0; j < numInterestingInts; ++j) \ + for (int k = 0; k < 4; ++k) { \ + __m128i tmp = \ + 
(__m128i)E1_Int(interesting_ints, i * 4 + k, numInterestingInts); \ + __m256i m1 = _mm256_set_m128i(tmp, tmp); \ + Ret_type ret = func(m1, interesting_ints[j], Tint); \ + char str[256]; \ + tostr(&m1, str); \ + char str2[256]; \ + tostr(&ret, str2); \ + printf("%s(%s, 0x%08X, %d) = %s\n", \ + #func, \ + str, \ + interesting_ints[j], \ + Tint, \ + str2); \ + } + +#define Ret_M256i_M128i_Tint_body(Ret_type, func, Tint) \ + for (int i = 0; i < numInterestingInts / 4; ++i) \ + for (int k = 0; k < 4; ++k) \ + for (int j = 0; j < numInterestingInts / 4; ++j) { \ + __m128i tmp = \ + (__m128i)E1_Int(interesting_ints, i * 4 + k, numInterestingInts); \ + __m256i m1 = _mm256_set_m128i(tmp, tmp); \ + __m128i m2 = \ + (__m128i)E2_Int(interesting_ints, j * 4, numInterestingInts); \ + Ret_type ret = func(m1, m2, Tint); \ + char str[256]; \ + tostr(&m1, str); \ + char str2[256]; \ + tostr(&m2, str2); \ + char str3[256]; \ + tostr(&ret, str3); \ + printf("%s(%s, %s, %d) = %s\n", #func, str, str2, Tint, str3); \ + } + +#define Ret_M256d_M128d_Tint_body(Ret_type, func, Tint) \ + for (int i = 0; i < numInterestingDoubles / 2; ++i) \ + for (int k = 0; k < 2; ++k) \ + for (int j = 0; j < numInterestingDoubles / 2; ++j) { \ + __m128d tmp = \ + E1_Double(interesting_doubles, i * 2 + k, numInterestingDoubles); \ + __m256d m1 = _mm256_set_m128d(tmp, tmp); \ + __m128d m2 = \ + E2_Double(interesting_doubles, j * 2, numInterestingDoubles); \ + Ret_type ret = func(m1, m2, Tint); \ + char str[256]; \ + tostr(&m1, str); \ + char str2[256]; \ + tostr(&m2, str2); \ + char str3[256]; \ + tostr(&ret, str3); \ + printf("%s(%s, %s, %d) = %s\n", #func, str, str2, Tint, str3); \ + } + +#define Ret_M256_M128_Tint_body(Ret_type, func, Tint) \ + for (int i = 0; i < numInterestingFloats / 4; ++i) \ + for (int k = 0; k < 4; ++k) \ + for (int j = 0; j < numInterestingFloats / 4; ++j) { \ + __m128 tmp = E1(interesting_floats, i * 4 + k, numInterestingFloats); \ + __m256 m1 = _mm256_set_m128(tmp, tmp); \ + __m128 m2 = E2(interesting_floats, j * 4, numInterestingFloats); \ + Ret_type ret = func(m1, m2, Tint); \ + char str[256]; \ + tostr(&m1, str); \ + char str2[256]; \ + tostr(&m2, str2); \ + char str3[256]; \ + tostr(&ret, str3); \ + printf("%s(%s, %s, %d) = %s\n", #func, str, str2, Tint, str3); \ + } + +#define Ret_M256_Tint(Ret_type, func) \ + const_int8_unroll(Ret_type, Ret_M256_Tint_body, func) +#define Ret_M256d_Tint(Ret_type, func) \ + const_int8_unroll(Ret_type, Ret_M256d_Tint_body, func) + +#define Ret_M256i_M256i_Tint(Ret_type, func) \ + const_int8_unroll(Ret_type, Ret_M256i_M256i_Tint_body, func) +#define Ret_M256d_M256d_Tint(Ret_type, func) \ + const_int8_unroll(Ret_type, Ret_M256d_M256d_Tint_body, func) +#define Ret_M256_M256_Tint(Ret_type, func) \ + const_int8_unroll(Ret_type, Ret_M256_M256_Tint_body, func) +#define Ret_M256i_Tint(Ret_type, func) \ + const_int8_unroll(Ret_type, Ret_M256i_Tint_body, func) + +#define Ret_M256i_int_Tint(Ret_type, func) \ + const_int8_unroll(Ret_type, Ret_M256i_int_Tint_body, func) + +#define Ret_M256i_M128i_Tint(Ret_type, func) \ + const_int8_unroll(Ret_type, Ret_M256i_M128i_Tint_body, func) +#define Ret_M256d_M128d_Tint(Ret_type, func) \ + const_int8_unroll(Ret_type, Ret_M256d_M128d_Tint_body, func) +#define Ret_M256_M128_Tint(Ret_type, func) \ + const_int8_unroll(Ret_type, Ret_M256_M128_Tint_body, func) + +#define const_int5_unroll_0_to_15(Ret_type, F, func) \ + F(Ret_type, func, 0); \ + F(Ret_type, func, 1); \ + F(Ret_type, func, 2); \ + F(Ret_type, func, 3); \ + F(Ret_type, func, 4); \ + 
  F(Ret_type, func, 5); \
+  F(Ret_type, func, 6); \
+  F(Ret_type, func, 7); \
+  F(Ret_type, func, 8); \
+  F(Ret_type, func, 9); \
+  F(Ret_type, func, 10); \
+  F(Ret_type, func, 11); \
+  F(Ret_type, func, 12); \
+  F(Ret_type, func, 13); \
+  F(Ret_type, func, 14); \
+  F(Ret_type, func, 15);
+
+#define const_int5_unroll_16_to_31(Ret_type, F, func) \
+  F(Ret_type, func, 16); \
+  F(Ret_type, func, 17); \
+  F(Ret_type, func, 18); \
+  F(Ret_type, func, 19); \
+  F(Ret_type, func, 20); \
+  F(Ret_type, func, 21); \
+  F(Ret_type, func, 22); \
+  F(Ret_type, func, 23); \
+  F(Ret_type, func, 24); \
+  F(Ret_type, func, 25); \
+  F(Ret_type, func, 26); \
+  F(Ret_type, func, 27); \
+  F(Ret_type, func, 28); \
+  F(Ret_type, func, 29); \
+  F(Ret_type, func, 30); \
+  F(Ret_type, func, 31);
+
+#define Ret_M256d_M256d_Tint_5bits_0_to_15(Ret_type, func) \
+  const_int5_unroll_0_to_15(Ret_type, Ret_M256d_M256d_Tint_body, func)
+#define Ret_M256d_M256d_Tint_5bits_16_to_31(Ret_type, func) \
+  const_int5_unroll_16_to_31(Ret_type, Ret_M256d_M256d_Tint_body, func)
+
+#define Ret_M256_M256_Tint_5bits_0_to_15(Ret_type, func) \
+  const_int5_unroll_0_to_15(Ret_type, Ret_M256_M256_Tint_body, func)
+#define Ret_M256_M256_Tint_5bits_16_to_31(Ret_type, func) \
+  const_int5_unroll_16_to_31(Ret_type, Ret_M256_M256_Tint_body, func)
+
+#define void_OutDoublePtr_M256d( \
+  func, Ptr_type, numBytesWritten, alignmentBytes) \
+  for (int i = 0; i < numInterestingDoubles / 2; ++i) \
+    for (int offset = 0; offset < numBytesWritten; offset += alignmentBytes) \
+      for (int k = 0; k < 2; ++k) { \
+        uintptr_t base = (uintptr_t)getTempOutDoubleStore(32); \
+        __m128d tmp = \
+          E1_Double(interesting_doubles, i * 2 + k, numInterestingDoubles); \
+        __m256d m1 = _mm256_set_m128d(tmp, tmp); \
+        align1_double* out = (align1_double*)(base + offset); \
+        func((Ptr_type)out, m1); \
+        char str[256]; \
+        tostr(&m1, str); \
+        char str2[256]; \
+        tostr(out, numBytesWritten / sizeof(double), str2); \
+        printf("%s(p:align=%d, %s) = %s\n", #func, offset, str, str2); \
+      }
+
+#define void_OutFloatPtr_M256(func, Ptr_type, numBytesWritten, alignmentBytes) \
+  for (int i = 0; i < numInterestingFloats / 4; ++i) \
+    for (int offset = 0; offset < numBytesWritten; offset += alignmentBytes) \
+      for (int k = 0; k < 4; ++k) { \
+        uintptr_t base = (uintptr_t)getTempOutFloatStore(32); \
+        __m128 tmp = E1(interesting_floats, i * 4 + k, numInterestingFloats); \
+        __m256 m1 = _mm256_set_m128(tmp, tmp); \
+        align1_float* out = (align1_float*)(base + offset); \
+        func((Ptr_type)out, m1); \
+        char str[256]; \
+        tostr(&m1, str); \
+        char str2[256]; \
+        tostr(out, numBytesWritten / sizeof(float), str2); \
+        printf("%s(p:align=%d, %s) = %s\n", #func, offset, str, str2); \
+      }
+
+#define void_OutIntPtr_M256i(func, Ptr_type, numBytesWritten, alignmentBytes) \
+  for (int i = 0; i < numInterestingInts / 4; ++i) \
+    for (int offset = 0; offset < numBytesWritten; offset += alignmentBytes) \
+      for (int k = 0; k < 4; ++k) { \
+        uintptr_t base = (uintptr_t)getTempOutIntStore(32); \
+        __m128i tmp = \
+          (__m128i)E1_Int(interesting_ints, i * 4 + k, numInterestingInts); \
+        __m256i m1 = _mm256_set_m128i(tmp, tmp); \
+        align1_int* out = (align1_int*)(base + offset); \
+        func((Ptr_type)out, m1); \
+        char str[256]; \
+        tostr(&m1, str); \
+        char str2[256]; \
+        tostr(out, (numBytesWritten + sizeof(int) - 1) / sizeof(int), str2); \
+        printf("%s(p:align=%d, %s) = %s\n", #func, offset, str, str2); \
+      }
+
+#define void_OutDoublePtr_M256i_M256d( \
+  func, Ptr_type, numBytesWritten, alignmentBytes) \
+  for (int i = 0; i < numInterestingDoubles / 2; ++i) \
+    for (int j = 0; j < numInterestingInts / 4; ++j) \
+      for (int offset = 0; offset < numBytesWritten; offset += alignmentBytes) \
+        for (int k = 0; k < 2; ++k) { \
+          uintptr_t base = (uintptr_t)getTempOutDoubleStore(32); \
+          __m128i tmp1 = \
+            (__m128i)E1_Int(interesting_ints, j * 4, numInterestingInts); \
+          __m256i m1 = _mm256_set_m128i(tmp1, tmp1); \
+          __m128d tmp2 = \
+            E1_Double(interesting_doubles, i * 2 + k, numInterestingDoubles); \
+          __m256d m2 = _mm256_set_m128d(tmp2, tmp2); \
+          align1_double* out = (align1_double*)(base + offset); \
+          func((Ptr_type)out, m1, m2); \
+          char str[256]; \
+          tostr(&m1, str); \
+          char str2[256]; \
+          tostr(&m2, str2); \
+          char str3[256]; \
+          tostr(out, numBytesWritten / sizeof(double), str3); \
+          printf( \
+            "%s(p:align=%d, %s, %s) = %s\n", #func, offset, str, str2, str3); \
+        }
+
+#define void_OutFloatPtr_M256i_M256( \
+  func, Ptr_type, numBytesWritten, alignmentBytes) \
+  for (int i = 0; i < numInterestingFloats / 4; ++i) \
+    for (int j = 0; j < numInterestingInts / 4; ++j) \
+      for (int offset = 0; offset < numBytesWritten; offset += alignmentBytes) \
+        for (int k = 0; k < 4; ++k) { \
+          uintptr_t base = (uintptr_t)getTempOutFloatStore(16); \
+          __m128i tmp1 = \
+            (__m128i)E1_Int(interesting_ints, j * 4, numInterestingInts); \
+          __m256i m1 = _mm256_set_m128i(tmp1, tmp1); \
+          __m128 tmp2 = \
+            E1(interesting_floats, i * 4 + k, numInterestingFloats); \
+          __m256 m2 = _mm256_set_m128(tmp2, tmp2); \
+          align1_float* out = (align1_float*)(base + offset); \
+          func((Ptr_type)out, m1, m2); \
+          char str[256]; \
+          tostr(&m1, str); \
+          char str2[256]; \
+          tostr(&m2, str2); \
+          char str3[256]; \
+          tostr(out, numBytesWritten / sizeof(float), str3); \
+          printf( \
+            "%s(p:align=%d, %s, %s) = %s\n", #func, offset, str, str2, str3); \
+        }
+
+#define void_OutFloatPtr_OutFloatPtr_M256( \
+  func, Ptr_type, numBytesWritten, alignmentBytes) \
+  for (int i = 0; i < numInterestingFloats / 4; ++i) \
+    for (int offset = 0; offset < numBytesWritten; offset += alignmentBytes) \
+      for (int k = 0; k < 4; ++k) { \
+        uintptr_t base = (uintptr_t)getTempOutFloatStore(32); \
+        __m128 tmp = E1(interesting_floats, i * 4 + k, numInterestingFloats); \
+        __m256 m1 = _mm256_set_m128(tmp, tmp); \
+        align1_float* out1 = (align1_float*)(base + offset); \
+        align1_float* out2 = out1 + 4; \
+        func((Ptr_type)out1, (Ptr_type)out2, m1); \
+        char str[256]; \
+        tostr(&m1, str); \
+        char str2[256]; \
+        tostr(out1, numBytesWritten / 2 / sizeof(float), str2); \
+        char str3[256]; \
+        tostr(out2, numBytesWritten / 2 / sizeof(float), str3); \
+        printf( \
+          "%s(p:align=%d, %s) = %s,%s\n", #func, offset, str, str2, str3); \
+      }
+
+#define void_OutDoublePtr_OutDoublePtr_M256d( \
+  func, Ptr_type, numBytesWritten, alignmentBytes) \
+  for (int i = 0; i < numInterestingDoubles / 2; ++i) \
+    for (int offset = 0; offset < numBytesWritten; offset += alignmentBytes) \
+      for (int k = 0; k < 2; ++k) { \
+        uintptr_t base = (uintptr_t)getTempOutDoubleStore(32); \
+        __m128d tmp = \
+          E1_Double(interesting_doubles, i * 2 + k, numInterestingDoubles); \
+        __m256d m1 = _mm256_set_m128d(tmp, tmp); \
+        align1_double* out1 = (align1_double*)(base + offset); \
+        align1_double* out2 = out1 + 2; \
+        func((Ptr_type)out1, (Ptr_type)out2, m1); \
+        char str[256]; \
+        tostr(&m1, str); \
+        char str2[256]; \
+        tostr(out1, numBytesWritten / 2 / sizeof(double), str2); \
+        char str3[256]; \
+        tostr(out2, numBytesWritten / 2 / sizeof(double), str3); \
+        printf( \
+          "%s(p:align=%d, %s) = %s,%s\n", #func, offset, str, str2, str3); \
+      }
+
+#define void_OutIntPtr_OutIntPtr_M256i( \
+  func, Ptr_type, numBytesWritten, alignmentBytes) \
+  for (int i = 0; i < numInterestingInts / 4; ++i) \
+    for (int offset = 0; offset < numBytesWritten; offset += alignmentBytes) \
+      for (int k = 0; k < 4; ++k) { \
+        uintptr_t base = (uintptr_t)getTempOutIntStore(32); \
+        __m128i tmp = \
+          (__m128i)E1_Int(interesting_ints, i * 4 + k, numInterestingInts); \
+        __m256i m1 = _mm256_set_m128i(tmp, tmp); \
+        align1_int* out1 = (align1_int*)(base + offset); \
+        align1_int* out2 = out1 + 4; \
+        func((Ptr_type)out1, (Ptr_type)out2, m1); \
+        char str[256]; \
+        tostr(&m1, str); \
+        char str2[256]; \
+        tostr( \
+          out1, (numBytesWritten + sizeof(int) - 1) / 2 / sizeof(int), str2); \
+        char str3[256]; \
+        tostr( \
+          out2, (numBytesWritten + sizeof(int) - 1) / 2 / sizeof(int), str3); \
+        printf( \
+          "%s(p:align=%d, %s) = %s,%s\n", #func, offset, str, str2, str3); \
+      }
+
+#define Ret_Double2(Ret_type, func, inc) \
+  for (int i = 0; i + 2 <= numInterestingDoubles; i += inc) { \
+    double* ptr = interesting_doubles + i; \
+    Ret_type ret = func(ptr[0], ptr[1]); \
+    char str[256]; \
+    tostr(ptr, 2, str); \
+    char str2[256]; \
+    tostr(&ret, str2); \
+    printf("%s(%s) = %s\n", #func, str, str2); \
+  }
+
+#define Ret_Double4(Ret_type, func, inc) \
+  for (int i = 0; i + 4 <= numInterestingDoubles; i += inc) { \
+    double* ptr = interesting_doubles + i; \
+    Ret_type ret = func(ptr[0], ptr[1], ptr[2], ptr[3]); \
+    char str[256]; \
+    tostr(ptr, 4, str); \
+    char str2[256]; \
+    tostr(&ret, str2); \
+    printf("%s(%s) = %s\n", #func, str, str2); \
+  }
+
+#define Ret_Float8(Ret_type, func, inc) \
+  for (int i = 0; i + 8 <= numInterestingFloats; i += inc) { \
+    float* ptr = interesting_floats + i; \
+    Ret_type ret = \
+      func(ptr[0], ptr[1], ptr[2], ptr[3], ptr[4], ptr[5], ptr[6], ptr[7]); \
+    char str[256]; \
+    tostr(ptr, 8, str); \
+    char str2[256]; \
+    tostr(&ret, str2); \
+    printf("%s(%s) = %s\n", #func, str, str2); \
+  }
+
+#define Ret_Int8(Ret_type, func, inc) \
+  for (int i = 0; i + 8 <= numInterestingInts; i += inc) { \
+    int* ptr = (int*)interesting_ints + i; \
+    Ret_type ret = \
+      func(ptr[0], ptr[1], ptr[2], ptr[3], ptr[4], ptr[5], ptr[6], ptr[7]); \
+    char str[256]; \
+    tostr(ptr, 8, str); \
+    char str2[256]; \
+    tostr(&ret, str2); \
+    printf("%s(%s) = %s\n", #func, str, str2); \
+  }
+
+#define Ret_Short16(Ret_type, func, inc) \
+  for (int i = 0; i + 16 <= numInterestingInts * 2; i += inc) { \
+    short* ptr = ((short*)interesting_ints) + i; \
+    Ret_type ret = func(ptr[0], \
+                        ptr[1], \
+                        ptr[2], \
+                        ptr[3], \
+                        ptr[4], \
+                        ptr[5], \
+                        ptr[6], \
+                        ptr[7], \
+                        ptr[8], \
+                        ptr[9], \
+                        ptr[10], \
+                        ptr[11], \
+                        ptr[12], \
+                        ptr[13], \
+                        ptr[14], \
+                        ptr[15]); \
+    char str[256]; \
+    tostr((int*)ptr, 8, str); \
+    char str2[256]; \
+    tostr(&ret, str2); \
+    printf("%s(%s) = %s\n", #func, str, str2); \
+  }
+
+#define Ret_Char32(Ret_type, func, inc) \
+  for (int i = 0; i + 32 <= numInterestingInts * 4; i += inc) { \
+    char* ptr = ((char*)interesting_ints) + i; \
+    Ret_type ret = func(ptr[0], \
+                        ptr[1], \
+                        ptr[2], \
+                        ptr[3], \
+                        ptr[4], \
+                        ptr[5], \
+                        ptr[6], \
+                        ptr[7], \
+                        ptr[8], \
+                        ptr[9], \
+                        ptr[10], \
+                        ptr[11], \
+                        ptr[12], \
+                        ptr[13], \
+                        ptr[14], \
+                        ptr[15], \
+                        ptr[16], \
+                        ptr[17], \
+                        ptr[18], \
+                        ptr[19], \
+                        ptr[20], \
+                        ptr[21], \
+                        ptr[22], \
+                        ptr[23], \
+                        ptr[24], \
+                        ptr[25], \
+                        ptr[26], \
+                        ptr[27], \
+                        ptr[28], \
+                        ptr[29], \
+                        ptr[30], \
+                        ptr[31]); \
+    char str[256]; \
+    tostr((int*)ptr, 8, str); \
+    char str2[256]; \
+    tostr(&ret, str2); \
+    printf("%s(%s) = %s\n", #func, str, str2); \
+  }
+
+#define Ret_Longlong4(Ret_type, func, inc) \
+  for (int i = 0; i + 4 <= numInterestingInts / 2; i += inc) { \
+    long long* ptr = ((long long*)interesting_ints) + i; \
+    Ret_type ret = func(ptr[0], ptr[1], ptr[2], ptr[3]); \
+    char str[256]; \
+    tostr((int*)ptr, 8, str); \
+    char str2[256]; \
+    tostr(&ret, str2); \
+    printf("%s(%s) = %s\n", #func, str, str2); \
+  }
+
+#define Ret_Double(Ret_type, func, inc) \
+  for (int i = 0; i + 1 <= numInterestingDoubles; i += inc) { \
+    double* ptr = interesting_doubles + i; \
+    Ret_type ret = func(*ptr); \
+    char str[256]; \
+    tostr(ptr, 1, str); \
+    char str2[256]; \
+    tostr(&ret, str2); \
+    printf("%s(%s) = %s\n", #func, str, str2); \
+  }
+
+#define Ret_Int(Ret_type, func, inc) \
+  for (int i = 0; i + 1 <= numInterestingInts; i += inc) { \
+    int* ptr = ((int*)interesting_ints) + i; \
+    Ret_type ret = func(*ptr); \
+    char str[256]; \
+    tostr(ptr, 1, str); \
+    char str2[256]; \
+    tostr(&ret, str2); \
+    printf("%s(%s) = %s\n", #func, str, str2); \
+  }
+
+#endif
diff --git a/test/test_core.py b/test/test_core.py
index 6c38e9fede6df..b06fd63341bf3 100644
--- a/test/test_core.py
+++ b/test/test_core.py
@@ -6620,7 +6620,7 @@ def test_sse4(self, use_4_2):

   @no_asan('local count too large')
   def test_avx(self):
     src = test_file('sse/test_avx.cpp')
-    self.run_process([shared.CLANG_CXX, src, '-mavx', '-Wno-argument-outside-range', '-o', 'test_avx', '-D_CRT_SECURE_NO_WARNINGS=1'] + clang_native.get_clang_native_args(), stdout=PIPE)
+    self.run_process([shared.CLANG_CXX, src, '-mavx', '-Wno-argument-outside-range', '-Wpedantic', '-o', 'test_avx', '-D_CRT_SECURE_NO_WARNINGS=1'] + clang_native.get_clang_native_args(), stdout=PIPE)
     native_result = self.run_process('./test_avx', stdout=PIPE).stdout
     self.emcc_args += ['-I' + test_file('sse'), '-mavx', '-Wno-argument-outside-range', '-sSTACK_SIZE=1MB']
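For reference, a minimal standalone sketch of the pattern the store-style harness macros above exercise: a 256-bit vector is built by duplicating a 128-bit lane with _mm256_set_m128 and then written through a deliberately offset pointer, which is what the offset loop in void_OutFloatPtr_M256 drives for every alignment step. This file is not part of the patch; the file name and the expected output are illustrative assumptions, and it should compile with `emcc -msimd128 -mavx` (or natively with `clang -mavx`).

/* avx_store_sketch.c -- illustrative sketch, not part of the diff above. */
#include <immintrin.h>
#include <stdio.h>

int main(void) {
  float out[16] = {0};
  /* Duplicate one 128-bit lane into both halves of a __m256, the same way
     the harness macros construct their m1 test operands. */
  __m128 lo = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
  __m256 v = _mm256_set_m128(lo, lo);
  /* Unaligned 256-bit store through an offset pointer, analogous to one
     iteration of the offset loop in void_OutFloatPtr_M256. */
  _mm256_storeu_ps(out + 1, v);
  for (int i = 0; i < 9; ++i)
    printf("%g ", out[i]);
  printf("\n"); /* expected: 0 1 2 3 4 1 2 3 4 */
  return 0;
}

In test_avx.cpp itself such a store intrinsic would presumably be driven as `void_OutFloatPtr_M256(_mm256_storeu_ps, float*, 32, 4);`, sweeping 32 bytes of output across 4-byte alignment steps, though the exact invocations are not shown in this hunk.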