Skip to content

Commit 793a032

Browse files
authored
Speed up simd_op_check_wasm (#8780)
* Speed up simd_op_check_wasm
  It is currently timing out on some bots because running the generated wasm is slow. This speeds it up by only checking one row and by cutting down on the number of vector sizes tested - just native and double native.
* Fix args when running the callable
* Run internally-threaded tests serially with other tests
  They're internally threaded because they're slow. If we make them compete with other tests, that reduces them back to one thread and they risk timing out entirely.
* Add timing debugging prints
* tests_properties, not target_properties
* Revert debug printing of time
1 parent 13d1944 commit 793a032

File tree

4 files changed

+57
-25
lines changed

4 files changed

+57
-25
lines changed

test/correctness/CMakeLists.txt

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -455,3 +455,19 @@ set_target_properties(correctness_async
455455
correctness_sliding_window
456456
correctness_storage_folding
457457
PROPERTIES ENABLE_EXPORTS TRUE)
458+
459+
# Tests which are internally parallelized should not be run at the
460+
# same time as other tests, or they may time out
461+
462+
set_tests_properties(correctness_mul_div_mod
463+
correctness_simd_op_check_arm
464+
correctness_simd_op_check_hvx
465+
correctness_simd_op_check_powerpc
466+
correctness_simd_op_check_riscv
467+
correctness_simd_op_check_sve2
468+
correctness_simd_op_check_wasm
469+
correctness_simd_op_check_x86
470+
correctness_vector_cast
471+
correctness_vector_math
472+
correctness_vector_reductions
473+
PROPERTIES RUN_SERIAL TRUE)

test/correctness/mul_div_mod.cpp

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
#include "Halide.h"
2+
#include "halide_thread_pool.h"
23
#include "test_sharding.h"
34

45
#include <algorithm>
@@ -574,11 +575,19 @@ int main(int argc, char **argv) {
574575

575576
using Sharder = Halide::Internal::Test::Sharder;
576577
Sharder sharder;
578+
579+
std::vector<std::future<bool>> futures;
580+
581+
Halide::Tools::ThreadPool<bool> pool;
577582
for (size_t t = 0; t < tasks.size(); t++) {
578583
if (!sharder.should_run(t)) continue;
579584
const auto &task = tasks.at(t);
580-
if (!task.fn()) {
581-
exit(-1);
585+
futures.push_back(pool.async(task.fn));
586+
}
587+
588+
for (auto &f : futures) {
589+
if (!f.get()) {
590+
return 1;
582591
}
583592
}
584593

test/correctness/simd_op_check.h

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -268,7 +268,7 @@ class SimdOpCheckTest {
268268
p.dim(0).set_min((p.dim(0).min() / alignment) * alignment);
269269
}
270270

271-
const std::vector<Argument> arg_types(image_params.begin(), image_params.end());
271+
std::vector<Argument> arg_types(image_params.begin(), image_params.end());
272272

273273
class HookUpImageParams : public Internal::IRMutator {
274274
using Internal::IRMutator::visit;
@@ -343,8 +343,15 @@ class SimdOpCheckTest {
343343
.vectorize(xi);
344344
}
345345

346+
// We'll check over H rows, but we won't let the pipeline know H
347+
// statically, as that can trigger some simplifications that change
348+
// instruction selection.
349+
Param<int> rows;
350+
rows.set(H);
351+
arg_types.push_back(rows);
352+
346353
// The output to the pipeline is the maximum absolute difference as a double.
347-
RDom r_check(0, W, 0, H);
354+
RDom r_check(0, W, 0, rows);
348355
Halide::Func error("error_" + name);
349356
error() = Halide::cast<double>(maximum(absd(f(r_check.x, r_check.y), f_scalar(r_check.x, r_check.y))));
350357

@@ -357,11 +364,13 @@ class SimdOpCheckTest {
357364
// Make some unallocated input buffers
358365
std::vector<Runtime::Buffer<>> inputs(image_params.size());
359366

360-
std::vector<Argument> args(image_params.size());
361-
for (size_t i = 0; i < args.size(); i++) {
367+
std::vector<Argument> args(image_params.size() + 1);
368+
for (size_t i = 0; i < image_params.size(); i++) {
362369
args[i] = image_params[i];
363370
inputs[i] = Runtime::Buffer<>(args[i].type, nullptr, 0);
364371
}
372+
args.back() = rows;
373+
365374
auto callable = error.compile_to_callable(args, run_target);
366375

367376
Runtime::Buffer<double> output = Runtime::Buffer<double>::make_scalar();
@@ -372,7 +381,7 @@ class SimdOpCheckTest {
372381
(void)callable(inputs[0], inputs[1], inputs[2], inputs[3],
373382
inputs[4], inputs[5], inputs[6], inputs[7],
374383
inputs[8], inputs[9], inputs[10], inputs[11],
375-
output);
384+
H, output);
376385

377386
std::mt19937 rng;
378387
rng.seed(rng_seed);
@@ -409,7 +418,7 @@ class SimdOpCheckTest {
409418
(void)callable(inputs[0], inputs[1], inputs[2], inputs[3],
410419
inputs[4], inputs[5], inputs[6], inputs[7],
411420
inputs[8], inputs[9], inputs[10], inputs[11],
412-
output);
421+
H, output);
413422

414423
double e = output(0);
415424
// Use a very loose tolerance for floating point tests. The

test/correctness/simd_op_check_wasm.cpp

Lines changed: 15 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ namespace {
1313

1414
class SimdOpCheckWASM : public SimdOpCheckTest {
1515
public:
16-
SimdOpCheckWASM(Target t, int w = 768, int h = 128)
16+
SimdOpCheckWASM(Target t, int w = 768, int h = 1)
1717
: SimdOpCheckTest(t, w, h) {
1818
use_wasm_simd128 = target.has_feature(Target::WasmSimd128);
1919
use_wasm_sign_ext = !target.has_feature(Target::WasmMvpOnly);
@@ -74,7 +74,7 @@ class SimdOpCheckWASM : public SimdOpCheckTest {
7474
}
7575

7676
if (use_wasm_simd128) {
77-
for (int w = 1; w <= 4; w <<= 1) {
77+
for (int w = 1; w <= 2; w <<= 1) {
7878
// create arbitrary 16-byte constant
7979
check("v128.const", 16 * w, u8_1 * u8(42 + x));
8080

@@ -141,21 +141,19 @@ class SimdOpCheckWASM : public SimdOpCheckTest {
141141
check("i64x2.neg", 2 * w, -i64_1);
142142

143143
// Extended (widening) integer multiplication
144-
if (w > 1) {
145-
// Need a register wider than 128 bits for us to generate these
146-
check("i16x8.extmul_low_i8x16_s", 8 * w, i16(i8_1) * i8_2);
147-
check("i32x4.extmul_low_i16x8_s", 4 * w, i32(i16_1) * i16_2);
148-
check("i64x2.extmul_low_i32x4_s", 2 * w, i64(i32_1) * i32_2);
149-
check("i16x8.extmul_low_i8x16_u", 8 * w, u16(u8_1) * u8_2);
150-
check("i32x4.extmul_low_i16x8_u", 4 * w, u32(u16_1) * u16_2);
151-
check("i64x2.extmul_low_i32x4_u", 2 * w, u64(u32_1) * u32_2);
152-
check("i16x8.extmul_high_i8x16_s", 8 * w, i16(i8_1) * i8_2);
153-
check("i32x4.extmul_high_i16x8_s", 4 * w, i32(i16_1) * i16_2);
154-
check("i64x2.extmul_high_i32x4_s", 2 * w, i64(i32_1) * i32_2);
155-
check("i16x8.extmul_high_i8x16_u", 8 * w, u16(u8_1) * u8_2);
156-
check("i32x4.extmul_high_i16x8_u", 4 * w, u32(u16_1) * u16_2);
157-
check("i64x2.extmul_high_i32x4_u", 2 * w, u64(u32_1) * u32_2);
158-
}
144+
// Need a register wider than 128 bits for us to generate these
145+
check("i16x8.extmul_low_i8x16_s", 16 * w, i16(i8_1) * i8_2);
146+
check("i32x4.extmul_low_i16x8_s", 8 * w, i32(i16_1) * i16_2);
147+
check("i64x2.extmul_low_i32x4_s", 4 * w, i64(i32_1) * i32_2);
148+
check("i16x8.extmul_low_i8x16_u", 16 * w, u16(u8_1) * u8_2);
149+
check("i32x4.extmul_low_i16x8_u", 8 * w, u32(u16_1) * u16_2);
150+
check("i64x2.extmul_low_i32x4_u", 4 * w, u64(u32_1) * u32_2);
151+
check("i16x8.extmul_high_i8x16_s", 16 * w, i16(i8_1) * i8_2);
152+
check("i32x4.extmul_high_i16x8_s", 8 * w, i32(i16_1) * i16_2);
153+
check("i64x2.extmul_high_i32x4_s", 4 * w, i64(i32_1) * i32_2);
154+
check("i16x8.extmul_high_i8x16_u", 16 * w, u16(u8_1) * u8_2);
155+
check("i32x4.extmul_high_i16x8_u", 8 * w, u32(u16_1) * u16_2);
156+
check("i64x2.extmul_high_i32x4_u", 4 * w, u64(u32_1) * u32_2);
159157

160158
// Extended pairwise integer addition
161159
for (int f : {2, 4}) {

0 commit comments

Comments
 (0)