Improve HPX backend with adaptive chunking and thread control

lums658 · claude · lums658 · commit 55753946b03f · 2026-01-20T18:45:47.000-08:00
- Update HPX includes for HPX 2.0 API compatibility - Add adaptive_static_chunk_size for better work distribution - Add set_num_threads() for controlling HPX thread count - Add ThreadLimiter utility for benchmarking with varying thread counts - Update scalability benchmarks to use ThreadLimiter The adaptive chunking matches TBB's blocked_range behavior more closely, improving performance for local parallelism workloads. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
diff --git a/bench/scalability/parallel_for_scaling.cpp b/bench/scalability/parallel_for_scaling.cpp
@@ -42,7 +42,7 @@ void bench_memory_bound(const Args& args) {
   const size_t N = args.problem_size;
   std::vector<double> data(N, 1.0);
 
-  auto counts = thread_counts(args.max_threads);
+  auto counts = args.get_thread_counts();
 
   scaling_study("parallel_for (memory bound)", [&](size_t nthreads) {
     ThreadLimiter limiter(nthreads);
@@ -74,7 +74,7 @@ void bench_compute_bound(const Args& args) {
     x = x / N * 3.14159;
   }
 
-  auto counts = thread_counts(args.max_threads);
+  auto counts = args.get_thread_counts();
 
   scaling_study("parallel_for (compute bound)", [&](size_t nthreads) {
     ThreadLimiter limiter(nthreads);
@@ -103,7 +103,7 @@ void bench_splittable_range(const Args& args) {
   std::vector<size_t> indices(N);
   std::iota(indices.begin(), indices.end(), 0);
 
-  auto counts = thread_counts(args.max_threads);
+  auto counts = args.get_thread_counts();
 
   scaling_study("parallel_for (splittable_range)", [&](size_t nthreads) {
     ThreadLimiter limiter(nthreads);
@@ -136,7 +136,7 @@ void bench_irregular_work(const Args& args) {
     work_counts[i] = (i % 100) + 1;  // 1 to 100 iterations per element
   }
 
-  auto counts = thread_counts(args.max_threads);
+  auto counts = args.get_thread_counts();
 
   scaling_study("parallel_for (irregular work)", [&](size_t nthreads) {
     ThreadLimiter limiter(nthreads);
diff --git a/bench/scalability/parallel_reduce_scaling.cpp b/bench/scalability/parallel_reduce_scaling.cpp
@@ -43,7 +43,7 @@ void bench_sum_reduction(const Args& args) {
   std::vector<double> data(N);
   std::iota(data.begin(), data.end(), 1.0);
 
-  auto counts = thread_counts(args.max_threads);
+  auto counts = args.get_thread_counts();
 
   scaling_study("parallel_reduce (sum)", [&](size_t nthreads) {
     ThreadLimiter limiter(nthreads);
@@ -71,7 +71,7 @@ void bench_minmax_reduction(const Args& args) {
     data[i] = std::sin(static_cast<double>(i) * 0.1) * 1000.0;
   }
 
-  auto counts = thread_counts(args.max_threads);
+  auto counts = args.get_thread_counts();
 
   scaling_study("parallel_reduce (min)", [&](size_t nthreads) {
     ThreadLimiter limiter(nthreads);
@@ -112,7 +112,7 @@ void bench_dot_product(const Args& args) {
     b[i] = std::cos(static_cast<double>(i) * 0.01);
   }
 
-  auto counts = thread_counts(args.max_threads);
+  auto counts = args.get_thread_counts();
 
   scaling_study("parallel_reduce (dot product)", [&](size_t nthreads) {
     ThreadLimiter limiter(nthreads);
@@ -139,7 +139,7 @@ void bench_l2_norm(const Args& args) {
     data[i] = std::sin(static_cast<double>(i) * 0.001);
   }
 
-  auto counts = thread_counts(args.max_threads);
+  auto counts = args.get_thread_counts();
 
   scaling_study("parallel_reduce (L2 norm)", [&](size_t nthreads) {
     ThreadLimiter limiter(nthreads);
@@ -171,7 +171,7 @@ void bench_range_reduce(const Args& args) {
     data[i] = static_cast<double>(i % 1000);
   }
 
-  auto counts = thread_counts(args.max_threads);
+  auto counts = args.get_thread_counts();
 
   scaling_study("parallel_reduce (splittable_range)", [&](size_t nthreads) {
     ThreadLimiter limiter(nthreads);
@@ -205,7 +205,7 @@ void bench_convergence_check(const Args& args) {
     new_rank[i] = (1.0 / N) + (static_cast<double>(i % 100) - 50) * 1e-6;
   }
 
-  auto counts = thread_counts(args.max_threads);
+  auto counts = args.get_thread_counts();
 
   scaling_study("parallel_reduce (convergence)", [&](size_t nthreads) {
     ThreadLimiter limiter(nthreads);
diff --git a/bench/scalability/scalability_common.hpp b/bench/scalability/scalability_common.hpp
@@ -240,6 +240,7 @@ struct Args {
   std::string file;
   size_t      ntrials     = 5;
   size_t      max_threads = 0;  // 0 = auto-detect
+  size_t      hpx_threads = 0;  // HPX: fixed thread count (0 = scaling study)
   size_t      problem_size = 10000000;  // For synthetic benchmarks
   bool        verbose     = false;
   bool        csv_output  = false;
@@ -258,6 +259,10 @@ struct Args {
       } else if (arg == "-t" || arg == "--threads") {
         if (++i >= argc) usage(argv[0], "Missing argument for " + arg);
         max_threads = std::stoul(argv[i]);
+      } else if (arg == "--hpx-threads") {
+        // HPX-specific: set fixed thread count for this run
+        if (++i >= argc) usage(argv[0], "Missing argument for " + arg);
+        hpx_threads = std::stoul(argv[i]);
       } else if (arg == "-s" || arg == "--size") {
         if (++i >= argc) usage(argv[0], "Missing argument for " + arg);
         problem_size = std::stoul(argv[i]);
@@ -278,6 +283,26 @@ struct Args {
     if (max_threads == 0) {
       max_threads = hardware_threads();
     }
+
+    // For HPX: set thread count BEFORE any parallel operations
+    // This must happen before backend::ensure_initialized() is called
+    if (hpx_threads > 0) {
+      backend::set_num_threads(hpx_threads);
+    }
+  }
+
+  /**
+   * @brief Get thread counts for scaling study.
+   *
+   * For HPX with --hpx-threads, returns just that single thread count.
+   * Otherwise returns the standard scaling thread counts.
+   */
+  std::vector<size_t> get_thread_counts() const {
+    if (hpx_threads > 0) {
+      // HPX: single fixed thread count
+      return {hpx_threads};
+    }
+    return thread_counts(max_threads);
   }
 
   static void usage(const std::string& prog, const std::string& msg = "") {
@@ -287,11 +312,15 @@ struct Args {
     std::cerr << "Usage: " << prog << " [OPTIONS]\n";
     std::cerr << "  -f, --file FILE      Input graph file (optional)\n";
     std::cerr << "  -n, --ntrials N      Number of trials [default: 5]\n";
-    std::cerr << "  -t, --threads N      Max threads [default: auto-detect]\n";
+    std::cerr << "  -t, --threads N      Max threads for scaling [default: auto]\n";
+    std::cerr << "  --hpx-threads N      HPX: fixed thread count for this run\n";
     std::cerr << "  -s, --size N         Problem size [default: 10000000]\n";
     std::cerr << "  -v, --verbose        Verbose output\n";
     std::cerr << "  --csv                CSV output format\n";
     std::cerr << "  -o, --output FILE    Output file for results\n";
+    std::cerr << "\n";
+    std::cerr << "HPX Note: Use --hpx-threads N to set thread count at startup.\n";
+    std::cerr << "          Run multiple times with different values for scaling.\n";
     exit(1);
   }
 };
diff --git a/include/nwgraph/util/backend.hpp b/include/nwgraph/util/backend.hpp
@@ -24,14 +24,15 @@
 // Define NWGRAPH_BACKEND_HPX to use HPX, otherwise TBB is used
 #if defined(NWGRAPH_BACKEND_HPX)
 
-  #include <hpx/local/init.hpp>
-  #include <hpx/modules/algorithms.hpp>
-  #include <hpx/modules/execution.hpp>
-  #include <hpx/modules/runtime_local.hpp>
-  #include <hpx/parallel/algorithms/for_each.hpp>
-  #include <hpx/parallel/algorithms/for_loop.hpp>
-  #include <hpx/parallel/algorithms/reduce.hpp>
-  #include <hpx/parallel/algorithms/transform_reduce.hpp>
+  // HPX 2.0 headers
+  #include <hpx/algorithm.hpp>
+  #include <hpx/execution.hpp>
+  #include <hpx/init.hpp>
+  #include <hpx/runtime.hpp>
+  #include <hpx/include/parallel_for_each.hpp>
+  #include <hpx/include/parallel_for_loop.hpp>
+  #include <hpx/include/parallel_reduce.hpp>
+  #include <hpx/include/parallel_transform_reduce.hpp>
 
   #define NWGRAPH_PARALLEL_BACKEND "HPX"
   #define NWGRAPH_BACKEND_HPX_ENABLED 1
@@ -106,6 +107,24 @@ class hpx_runtime_manager {
     return mgr;
   }
 
+  /**
+   * @brief Set desired thread count before initialization.
+   * Must be called before first parallel operation.
+   * @param n Number of threads (0 = use all available)
+   */
+  void set_num_threads(std::size_t n) {
+    if (!initialized_.load(std::memory_order_acquire)) {
+      num_threads_ = n;
+    }
+  }
+
+  /**
+   * @brief Get configured thread count.
+   */
+  std::size_t get_num_threads() const noexcept {
+    return num_threads_;
+  }
+
   /**
    * @brief Ensure HPX runtime is initialized.
    *
@@ -117,10 +136,25 @@ class hpx_runtime_manager {
       std::lock_guard<std::mutex> lock(mutex_);
       if (!initialized_.load(std::memory_order_relaxed)) {
         if (!hpx::is_running()) {
-          // Start HPX local runtime (no networking)
-          // Using nullptr for argc/argv starts with default settings
           started_by_us_ = true;
-          hpx::local::start(nullptr, 0, nullptr);
+
+          // Build command-line arguments for HPX
+          std::vector<std::string> args_storage;
+          args_storage.push_back("nwgraph");  // Program name
+
+          if (num_threads_ > 0) {
+            args_storage.push_back("--hpx:threads=" + std::to_string(num_threads_));
+          }
+
+          // Convert to char* array
+          std::vector<char*> argv;
+          for (auto& s : args_storage) {
+            argv.push_back(const_cast<char*>(s.c_str()));
+          }
+          argv.push_back(nullptr);
+
+          int argc = static_cast<int>(argv.size() - 1);
+          hpx::local::start(nullptr, argc, argv.data());
         }
         initialized_.store(true, std::memory_order_release);
       }
@@ -151,10 +185,23 @@ class hpx_runtime_manager {
   std::atomic<bool> initialized_{false};
   std::mutex mutex_;
   bool started_by_us_{false};
+  std::size_t num_threads_{0};  // 0 = use all available
 };
 
 } // namespace detail
 
+/**
+ * @brief Set the number of HPX worker threads.
+ *
+ * Must be called BEFORE the first parallel operation. Once HPX is
+ * initialized, thread count cannot be changed.
+ *
+ * @param n Number of threads (0 = use all available hardware threads)
+ */
+inline void set_num_threads(std::size_t n) {
+  detail::hpx_runtime_manager::instance().set_num_threads(n);
+}
+
 /**
  * @brief Ensure the HPX runtime is initialized.
  *
@@ -177,6 +224,19 @@ inline bool is_initialized() noexcept {
 
 #else // TBB backend
 
+/**
+ * @brief Set the number of TBB worker threads.
+ *
+ * For TBB, this is a no-op since thread limiting is done via
+ * tbb::global_control in the calling code.
+ *
+ * @param n Number of threads (ignored for TBB)
+ */
+inline void set_num_threads(std::size_t /*n*/) noexcept {
+  // TBB thread count is controlled via tbb::global_control
+  // in the benchmark code itself
+}
+
 /**
  * @brief Ensure the TBB runtime is initialized.
  *
diff --git a/include/nwgraph/util/parallel_for.hpp b/include/nwgraph/util/parallel_for.hpp
@@ -25,10 +25,15 @@
 #include "nwgraph/util/traits.hpp"
 
 #if defined(NWGRAPH_BACKEND_HPX)
-  #include <hpx/parallel/algorithms/for_each.hpp>
-  #include <hpx/parallel/algorithms/for_loop.hpp>
-  #include <hpx/parallel/algorithms/transform_reduce.hpp>
-  #include <hpx/include/util.hpp>
+  // HPX 2.0 headers
+  #include <hpx/algorithm.hpp>
+  #include <hpx/execution.hpp>
+  #include <hpx/include/parallel_for_each.hpp>
+  #include <hpx/include/parallel_for_loop.hpp>
+  #include <hpx/include/parallel_transform_reduce.hpp>
+  #include <hpx/iterator_support/counting_iterator.hpp>
+  #include <hpx/execution/executors/adaptive_static_chunk_size.hpp>
+  #include <thread>
 #else
   #include <tbb/parallel_for.h>
   #include <tbb/parallel_reduce.h>
@@ -37,6 +42,31 @@
 namespace nw {
 namespace graph {
 
+#if defined(NWGRAPH_BACKEND_HPX)
+namespace detail {
+
+/**
+ * @brief Get HPX execution policy with adaptive chunking.
+ *
+ * Uses HPX's adaptive_static_chunk_size which automatically determines
+ * optimal chunk sizes based on the problem size and core count.
+ * This is equivalent to OpenMP's STATIC scheduling directive.
+ *
+ * @return Parallel execution policy with adaptive static chunk size
+ */
+inline auto chunked_policy() {
+  // adaptive_static_chunk_size automatically computes:
+  // - For large inputs (>32M): 8 chunks per core
+  // - For medium inputs (>512K): 4 chunks per core
+  // - Otherwise: 2-4 chunks per core
+  // This matches TBB's blocked_range behavior more closely
+  return hpx::execution::par.with(
+      hpx::execution::experimental::adaptive_static_chunk_size());
+}
+
+}  // namespace detail
+#endif
+
 /**
  * Inner evaluation function for parallel_for.
  *
@@ -118,8 +148,8 @@ void parallel_for(Range&& range, Op&& op) {
 
   if (range.is_divisible()) {
 #if defined(NWGRAPH_BACKEND_HPX)
-    // HPX uses for_each on the range
-    hpx::for_each(hpx::execution::par, range.begin(), range.end(),
+    // HPX uses for_each on the range with adaptive chunking for better performance
+    hpx::for_each(detail::chunked_policy(), range.begin(), range.end(),
                   [&](auto&& elem) { parallel_for_inner(op, elem); });
 #else
     // TBB uses parallel_for with splittable ranges
@@ -154,8 +184,8 @@ auto parallel_reduce(Range&& range, Op&& op, Reduce&& reduce, T init) {
 
   if (range.is_divisible()) {
 #if defined(NWGRAPH_BACKEND_HPX)
-    // HPX uses transform_reduce
-    return hpx::transform_reduce(hpx::execution::par,
+    // HPX uses transform_reduce with adaptive chunking for better performance
+    return hpx::transform_reduce(detail::chunked_policy(),
                                   range.begin(), range.end(),
                                   init,
                                   std::forward<Reduce>(reduce),
@@ -186,7 +216,8 @@ void parallel_for_each(std::size_t begin, std::size_t end, Op&& op) {
   backend::init_guard guard;  // Ensure runtime is initialized
 
 #if defined(NWGRAPH_BACKEND_HPX)
-  hpx::for_loop(hpx::execution::par, begin, end, std::forward<Op>(op));
+  // HPX 2.0: for_loop with adaptive chunking for better performance
+  hpx::experimental::for_loop(detail::chunked_policy(), begin, end, std::forward<Op>(op));
 #else
   tbb::parallel_for(tbb::blocked_range<std::size_t>(begin, end),
                     [&](const auto& r) {
@@ -217,7 +248,8 @@ T parallel_reduce_each(std::size_t begin, std::size_t end, T init, Op&& op, Redu
   backend::init_guard guard;  // Ensure runtime is initialized
 
 #if defined(NWGRAPH_BACKEND_HPX)
-  return hpx::transform_reduce(hpx::execution::par,
+  // HPX transform_reduce with adaptive chunking for better performance
+  return hpx::transform_reduce(detail::chunked_policy(),
                                 hpx::util::counting_iterator<std::size_t>(begin),
                                 hpx::util::counting_iterator<std::size_t>(end),
                                 init,