Speed up simd_op_check_wasm (#8780)

abadams · web-flow · commit 793a03241ccb · 2025-08-27T15:58:52.000-07:00
* Speed up simd_op_check_wasm

The test is currently timing out on some bots because running the generated
wasm is slow. This speeds it up by only checking one row and by cutting
down on the number of vector sizes tested to just two: the native vector
width and double the native width.

* Fix args when running the callable

* Run internally-threaded tests serially with respect to other tests

They're internally threaded because they're slow. If we make them
compete with other tests, that reduces them back to one thread and they
risk timing out entirely.

* Add timing debugging prints

* Use set_tests_properties, not set_target_properties

* Revert debug printing of time
diff --git a/test/correctness/CMakeLists.txt b/test/correctness/CMakeLists.txt
@@ -455,3 +455,19 @@ set_target_properties(correctness_async
                       correctness_sliding_window
                       correctness_storage_folding
                       PROPERTIES ENABLE_EXPORTS TRUE)
+
+# Tests which are internally parallelized should not be run at the
+# same time as other tests, or they may time out
+
+set_tests_properties(correctness_mul_div_mod
+                     correctness_simd_op_check_arm
+                     correctness_simd_op_check_hvx
+                     correctness_simd_op_check_powerpc
+                     correctness_simd_op_check_riscv
+                     correctness_simd_op_check_sve2
+                     correctness_simd_op_check_wasm
+                     correctness_simd_op_check_x86
+                     correctness_vector_cast
+                     correctness_vector_math
+                     correctness_vector_reductions
+                     PROPERTIES RUN_SERIAL TRUE)
diff --git a/test/correctness/mul_div_mod.cpp b/test/correctness/mul_div_mod.cpp
@@ -1,4 +1,5 @@
 #include "Halide.h"
+#include "halide_thread_pool.h"
 #include "test_sharding.h"
 
 #include <algorithm>
@@ -574,11 +575,19 @@ int main(int argc, char **argv) {
 
     using Sharder = Halide::Internal::Test::Sharder;
     Sharder sharder;
+
+    std::vector<std::future<bool>> futures;
+
+    Halide::Tools::ThreadPool<bool> pool;
     for (size_t t = 0; t < tasks.size(); t++) {
         if (!sharder.should_run(t)) continue;
         const auto &task = tasks.at(t);
-        if (!task.fn()) {
-            exit(-1);
+        futures.push_back(pool.async(task.fn));
+    }
+
+    for (auto &f : futures) {
+        if (!f.get()) {
+            return 1;
         }
     }
 
diff --git a/test/correctness/simd_op_check.h b/test/correctness/simd_op_check.h
@@ -268,7 +268,7 @@ class SimdOpCheckTest {
             p.dim(0).set_min((p.dim(0).min() / alignment) * alignment);
         }
 
-        const std::vector<Argument> arg_types(image_params.begin(), image_params.end());
+        std::vector<Argument> arg_types(image_params.begin(), image_params.end());
 
         class HookUpImageParams : public Internal::IRMutator {
             using Internal::IRMutator::visit;
@@ -343,8 +343,15 @@ class SimdOpCheckTest {
                 .vectorize(xi);
         }
 
+        // We'll check over H rows, but we won't let the pipeline know H
+        // statically, as that can trigger some simplifications that change
+        // instruction selection.
+        Param<int> rows;
+        rows.set(H);
+        arg_types.push_back(rows);
+
         // The output to the pipeline is the maximum absolute difference as a double.
-        RDom r_check(0, W, 0, H);
+        RDom r_check(0, W, 0, rows);
         Halide::Func error("error_" + name);
         error() = Halide::cast<double>(maximum(absd(f(r_check.x, r_check.y), f_scalar(r_check.x, r_check.y))));
 
@@ -357,11 +364,13 @@ class SimdOpCheckTest {
             // Make some unallocated input buffers
             std::vector<Runtime::Buffer<>> inputs(image_params.size());
 
-            std::vector<Argument> args(image_params.size());
-            for (size_t i = 0; i < args.size(); i++) {
+            std::vector<Argument> args(image_params.size() + 1);
+            for (size_t i = 0; i < image_params.size(); i++) {
                 args[i] = image_params[i];
                 inputs[i] = Runtime::Buffer<>(args[i].type, nullptr, 0);
             }
+            args.back() = rows;
+
             auto callable = error.compile_to_callable(args, run_target);
 
             Runtime::Buffer<double> output = Runtime::Buffer<double>::make_scalar();
@@ -372,7 +381,7 @@ class SimdOpCheckTest {
             (void)callable(inputs[0], inputs[1], inputs[2], inputs[3],
                            inputs[4], inputs[5], inputs[6], inputs[7],
                            inputs[8], inputs[9], inputs[10], inputs[11],
-                           output);
+                           H, output);
 
             std::mt19937 rng;
             rng.seed(rng_seed);
@@ -409,7 +418,7 @@ class SimdOpCheckTest {
             (void)callable(inputs[0], inputs[1], inputs[2], inputs[3],
                            inputs[4], inputs[5], inputs[6], inputs[7],
                            inputs[8], inputs[9], inputs[10], inputs[11],
-                           output);
+                           H, output);
 
             double e = output(0);
             // Use a very loose tolerance for floating point tests. The
diff --git a/test/correctness/simd_op_check_wasm.cpp b/test/correctness/simd_op_check_wasm.cpp
@@ -13,7 +13,7 @@ namespace {
 
 class SimdOpCheckWASM : public SimdOpCheckTest {
 public:
-    SimdOpCheckWASM(Target t, int w = 768, int h = 128)
+    SimdOpCheckWASM(Target t, int w = 768, int h = 1)
         : SimdOpCheckTest(t, w, h) {
         use_wasm_simd128 = target.has_feature(Target::WasmSimd128);
         use_wasm_sign_ext = !target.has_feature(Target::WasmMvpOnly);
@@ -74,7 +74,7 @@ class SimdOpCheckWASM : public SimdOpCheckTest {
         }
 
         if (use_wasm_simd128) {
-            for (int w = 1; w <= 4; w <<= 1) {
+            for (int w = 1; w <= 2; w <<= 1) {
                 // create arbitrary 16-byte constant
                 check("v128.const", 16 * w, u8_1 * u8(42 + x));
 
@@ -141,21 +141,19 @@ class SimdOpCheckWASM : public SimdOpCheckTest {
                 check("i64x2.neg", 2 * w, -i64_1);
 
                 // Extended (widening) integer multiplication
-                if (w > 1) {
-                    // Need a register wider than 128 bits for us to generate these
-                    check("i16x8.extmul_low_i8x16_s", 8 * w, i16(i8_1) * i8_2);
-                    check("i32x4.extmul_low_i16x8_s", 4 * w, i32(i16_1) * i16_2);
-                    check("i64x2.extmul_low_i32x4_s", 2 * w, i64(i32_1) * i32_2);
-                    check("i16x8.extmul_low_i8x16_u", 8 * w, u16(u8_1) * u8_2);
-                    check("i32x4.extmul_low_i16x8_u", 4 * w, u32(u16_1) * u16_2);
-                    check("i64x2.extmul_low_i32x4_u", 2 * w, u64(u32_1) * u32_2);
-                    check("i16x8.extmul_high_i8x16_s", 8 * w, i16(i8_1) * i8_2);
-                    check("i32x4.extmul_high_i16x8_s", 4 * w, i32(i16_1) * i16_2);
-                    check("i64x2.extmul_high_i32x4_s", 2 * w, i64(i32_1) * i32_2);
-                    check("i16x8.extmul_high_i8x16_u", 8 * w, u16(u8_1) * u8_2);
-                    check("i32x4.extmul_high_i16x8_u", 4 * w, u32(u16_1) * u16_2);
-                    check("i64x2.extmul_high_i32x4_u", 2 * w, u64(u32_1) * u32_2);
-                }
+                // Need a vector wider than 128 bits for us to generate these, so start at double the native lane count
+                check("i16x8.extmul_low_i8x16_s", 16 * w, i16(i8_1) * i8_2);
+                check("i32x4.extmul_low_i16x8_s", 8 * w, i32(i16_1) * i16_2);
+                check("i64x2.extmul_low_i32x4_s", 4 * w, i64(i32_1) * i32_2);
+                check("i16x8.extmul_low_i8x16_u", 16 * w, u16(u8_1) * u8_2);
+                check("i32x4.extmul_low_i16x8_u", 8 * w, u32(u16_1) * u16_2);
+                check("i64x2.extmul_low_i32x4_u", 4 * w, u64(u32_1) * u32_2);
+                check("i16x8.extmul_high_i8x16_s", 16 * w, i16(i8_1) * i8_2);
+                check("i32x4.extmul_high_i16x8_s", 8 * w, i32(i16_1) * i16_2);
+                check("i64x2.extmul_high_i32x4_s", 4 * w, i64(i32_1) * i32_2);
+                check("i16x8.extmul_high_i8x16_u", 16 * w, u16(u8_1) * u8_2);
+                check("i32x4.extmul_high_i16x8_u", 8 * w, u32(u16_1) * u16_2);
+                check("i64x2.extmul_high_i32x4_u", 4 * w, u64(u32_1) * u32_2);
 
                 // Extended pairwise integer addition
                 for (int f : {2, 4}) {