From 20756a1e0e54f453e446616cc41fe0191a67a810 Mon Sep 17 00:00:00 2001 From: Daljit Singh Date: Thu, 19 Mar 2026 00:52:32 +0000 Subject: [PATCH] Reduce threading scheduler contention for smoothing filter Previously, we were dispatching the filter smoothing one image line at a time via `ThreadedLoop(..., axes, 1)`. Profiling confirmed that this was causing millions of scheduler lock acquisitions for large images. To substantially improve the situation, we use two inner axes when possible to increase chunk size from single lines to small slices of the image. The inner-axis count is computed as `std::min<size_t>(2, axes.size())`, so it never exceeds `axes.size()`. The result is less scheduler contention and lower OS overhead. --- cpp/core/filter/smooth.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/core/filter/smooth.h b/cpp/core/filter/smooth.h index 32c0a81348..8d733484e5 100644 --- a/cpp/core/filter/smooth.h +++ b/cpp/core/filter/smooth.h @@ -151,7 +151,7 @@ class Smooth : public Base { } DEBUG("smoothing dimension " + str(dim) + " in place with stride order: " + str(axes)); SmoothFunctor1D smooth(in_and_output, stdev[dim], dim, extent[dim], zero_boundary); - ThreadedLoop(in_and_output, axes, 1).run(smooth, in_and_output); + ThreadedLoop(in_and_output, axes, std::min<size_t>(2, axes.size())).run(smooth, in_and_output); if (progress) ++(*progress); }