From 20756a1e0e54f453e446616cc41fe0191a67a810 Mon Sep 17 00:00:00 2001
From: Daljit Singh <daljit7991@gmail.com>
Date: Thu, 19 Mar 2026 00:52:32 +0000
Subject: [PATCH] Reduce threading scheduler contention for smoothing filter

Previously, the smoothing filter dispatched work one image line at a
time via `ThreadedLoop(..., axes, 1)`. Profiling confirmed that this
caused millions of scheduler lock acquisitions for large images.
To substantially improve the situation, we now use two inner axes when
possible (`std::min<size_t>(2, axes.size())`), increasing the chunk size
from single lines to small slices of the image. The result is less
scheduler contention and lower OS overhead.
---
 cpp/core/filter/smooth.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/cpp/core/filter/smooth.h b/cpp/core/filter/smooth.h
index 32c0a81348..8d733484e5 100644
--- a/cpp/core/filter/smooth.h
+++ b/cpp/core/filter/smooth.h
@@ -151,7 +151,7 @@ class Smooth : public Base {
         }
         DEBUG("smoothing dimension " + str(dim) + " in place with stride order: " + str(axes));
         SmoothFunctor1D<ImageType> smooth(in_and_output, stdev[dim], dim, extent[dim], zero_boundary);
-        ThreadedLoop(in_and_output, axes, 1).run(smooth, in_and_output);
+        ThreadedLoop(in_and_output, axes, std::min<size_t>(2, axes.size())).run(smooth, in_and_output);
         if (progress)
           ++(*progress);
       }