Merge pull request #28795 from Lazin/ct/read-pipeline-parallel-fetch

dotnwat · web-flow · commit 223a8e6a1236 · 2025-12-18T12:42:33.000-08:00
ct: Read scheduler component
diff --git a/src/v/cloud_topics/BUILD b/src/v/cloud_topics/BUILD
@@ -133,8 +133,10 @@ redpanda_cc_library(
         "//src/v/cloud_topics/level_zero/pipeline:read_pipeline",
         "//src/v/cloud_topics/level_zero/pipeline:write_pipeline",
         "//src/v/cloud_topics/level_zero/read_fanout",
+        "//src/v/cloud_topics/level_zero/read_request_scheduler",
         "//src/v/cloud_topics/level_zero/reader:fetch_handler",
         "//src/v/cloud_topics/level_zero/write_request_scheduler",
+        "//src/v/config",
         "//src/v/model",
         "//src/v/ssx:sharded_service_container",
         "//src/v/storage",
diff --git a/src/v/cloud_topics/data_plane_impl.cc b/src/v/cloud_topics/data_plane_impl.cc
@@ -20,8 +20,10 @@
 #include "cloud_topics/level_zero/pipeline/read_pipeline.h"
 #include "cloud_topics/level_zero/pipeline/write_pipeline.h"
 #include "cloud_topics/level_zero/read_fanout/read_fanout.h"
+#include "cloud_topics/level_zero/read_request_scheduler/read_request_scheduler.h"
 #include "cloud_topics/level_zero/reader/fetch_request_handler.h"
 #include "cloud_topics/level_zero/write_request_scheduler/write_request_scheduler.h"
+#include "config/configuration.h"
 #include "model/fundamental.h"
 #include "ssx/sharded_service_container.h"
 #include "storage/api.h"
@@ -73,6 +75,13 @@ class impl
               return _read_pipeline.local().register_read_pipeline_stage();
           }));
 
+        if (config::shard_local_cfg().cloud_topics_parallel_fetch_enabled()) {
+            co_await construct_service(
+              _read_request_scheduler, ss::sharded_parameter([this] {
+                  return _read_pipeline.local().register_read_pipeline_stage();
+              }));
+        }
+
         co_await construct_service(
           _fetch_handler,
           ss::sharded_parameter([this] {
@@ -93,6 +102,10 @@ class impl
           [](auto& s) { return s.start(); });
         co_await _batcher.invoke_on_all([](auto& s) { return s.start(); });
         co_await _read_fanout.invoke_on_all([](auto& s) { return s.start(); });
+        if (_read_request_scheduler.local_is_initialized()) {
+            co_await _read_request_scheduler.invoke_on_all(
+              [](auto& s) { return s.start(); });
+        }
         co_await _fetch_handler.invoke_on_all(
           [](auto& s) { return s.start(); });
         co_await _batch_cache.invoke_on_all([](auto& s) { return s.start(); });
@@ -160,6 +173,8 @@ class impl
     // Read path
     ss::sharded<l0::read_pipeline<>> _read_pipeline;
     ss::sharded<l0::read_fanout> _read_fanout;
+    ss::sharded<l0::read_request_scheduler> _read_request_scheduler;
+
     ss::sharded<l0::fetch_handler> _fetch_handler;
     // Batch cache
     ss::sharded<batch_cache> _batch_cache;
diff --git a/src/v/cloud_topics/level_zero/common/BUILD b/src/v/cloud_topics/level_zero/common/BUILD
@@ -11,6 +11,8 @@ package(
         "//src/v/cloud_topics/level_zero/frontend_reader:__pkg__",
         "//src/v/cloud_topics/level_zero/pipeline:__pkg__",
         "//src/v/cloud_topics/level_zero/pipeline/tests:__pkg__",
+        "//src/v/cloud_topics/level_zero/read_request_scheduler:__pkg__",
+        "//src/v/cloud_topics/level_zero/read_request_scheduler/tests:__pkg__",
         "//src/v/cloud_topics/level_zero/reader:__pkg__",
         "//src/v/cloud_topics/level_zero/reader/tests:__pkg__",
         "//src/v/cloud_topics/level_zero/stm:__pkg__",
diff --git a/src/v/cloud_topics/level_zero/pipeline/BUILD b/src/v/cloud_topics/level_zero/pipeline/BUILD
@@ -7,6 +7,8 @@ package(default_visibility = [
     "//src/v/cloud_topics/level_zero/batcher/tests:__pkg__",
     "//src/v/cloud_topics/level_zero/read_fanout:__pkg__",
     "//src/v/cloud_topics/level_zero/read_fanout/tests:__pkg__",
+    "//src/v/cloud_topics/level_zero/read_request_scheduler:__pkg__",
+    "//src/v/cloud_topics/level_zero/read_request_scheduler/tests:__pkg__",
     "//src/v/cloud_topics/level_zero/reader:__pkg__",
     "//src/v/cloud_topics/level_zero/reader/tests:__pkg__",
     "//src/v/cloud_topics/level_zero/tests:__pkg__",
diff --git a/src/v/cloud_topics/level_zero/pipeline/read_pipeline.h b/src/v/cloud_topics/level_zero/pipeline/read_pipeline.h
@@ -116,6 +116,8 @@ class read_pipeline
             _parent->_probe.register_micro_probe(p);
         }
 
+        pipeline_stage id() const noexcept { return _ps; }
+
     private:
         pipeline_stage _ps;
         read_pipeline<Clock>* _parent;
diff --git a/src/v/cloud_topics/level_zero/read_request_scheduler/BUILD b/src/v/cloud_topics/level_zero/read_request_scheduler/BUILD
@@ -0,0 +1,31 @@
+load("//bazel:build.bzl", "redpanda_cc_library")
+
+package(default_visibility = [
+    ":__subpackages__",
+    "//src/v/cloud_topics:__pkg__",
+])
+
+redpanda_cc_library(
+    name = "read_request_scheduler",
+    srcs = [
+        "read_request_scheduler.cc",
+    ],
+    hdrs = [
+        "read_request_scheduler.h",
+    ],
+    deps = [
+        "//src/v/base",
+        "//src/v/bytes",
+        "//src/v/bytes:iobuf",
+        "//src/v/cloud_topics:logger",
+        "//src/v/cloud_topics:types",
+        "//src/v/cloud_topics/level_zero/common:extent_meta",
+        "//src/v/cloud_topics/level_zero/pipeline:base_pipeline",
+        "//src/v/cloud_topics/level_zero/pipeline:event_filter",
+        "//src/v/cloud_topics/level_zero/pipeline:read_pipeline",
+        "//src/v/cloud_topics/level_zero/pipeline:read_request",
+        "//src/v/config",
+        "//src/v/ssx:future_util",
+        "@seastar",
+    ],
+)
diff --git a/src/v/cloud_topics/level_zero/read_request_scheduler/read_request_scheduler.cc b/src/v/cloud_topics/level_zero/read_request_scheduler/read_request_scheduler.cc
@@ -0,0 +1,172 @@
+/*
+ * Copyright 2025 Redpanda Data, Inc.
+ *
+ * Licensed as a Redpanda Enterprise file under the Redpanda Community
+ * License (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * https://github.com/redpanda-data/redpanda/blob/master/licenses/rcl.md
+ */
+#include "cloud_topics/level_zero/read_request_scheduler/read_request_scheduler.h"
+
+#include "cloud_topics/logger.h"
+#include "ssx/future-util.h"
+
+#include <chrono>
+
+using namespace std::chrono_literals;
+
+namespace cloud_topics::l0 {
+
+read_request_scheduler::read_request_scheduler(
+  read_pipeline<ss::lowres_clock>::stage stage)
+  : _stage(std::move(stage)) {}
+
+ss::future<> read_request_scheduler::start() {
+    vlog(cd_log.debug, "Read Request Scheduler start");
+    ssx::spawn_with_gate(_gate, [this] { return bg_loop(); });
+    co_return;
+}
+
+ss::future<> read_request_scheduler::stop() { co_await _gate.close(); }
+
+namespace {
+ss::shard_id shard_for(const read_request<ss::lowres_clock>& req) {
+    std::hash<ss::sstring> hasher;
+    // The request is generated from a placeholder batch, and a placeholder
+    // batch can't span multiple objects, so hashing the first extent's id
+    // name is enough. NOTE(review): front() assumes meta is never empty.
+    auto h = hasher(req.query.meta.front().id.name);
+    auto shard = h % ss::smp::count;
+    return static_cast<ss::shard_id>(shard);
+}
+
+std::unique_ptr<read_request<ss::lowres_clock>> make_proxy(
+  ss::shard_id target_shard,
+  const read_request<ss::lowres_clock>& req,
+  ss::lowres_clock::time_point timeout,
+  retry_chain_node* target_rtc,
+  pipeline_stage id) {
+    vassert(
+      ss::this_shard_id() == target_shard,
+      "make_proxy called on the wrong shard");
+    dataplane_query query;
+    query.output_size_estimate = req.query.output_size_estimate;
+    query.meta = req.query.meta.copy();
+    auto proxy = std::make_unique<read_request<ss::lowres_clock>>(
+      req.ntp, std::move(query), timeout, target_rtc, id);
+    return proxy;
+}
+
+} // namespace
+
+void read_request_scheduler::schedule_on(
+  read_request<ss::lowres_clock>& source_req, ss::shard_id target) {
+    if (target == ss::this_shard_id()) {
+        // Fast path: no shard hop needed, push source_req down the pipeline
+        _stage.push_next_stage(source_req);
+        return;
+    }
+
+    // Check shutdown before launching the cross-shard invoke_on call
+    if (_stage.stopped()) {
+        source_req.set_value(errc::shutting_down);
+        return;
+    }
+
+    auto proxy = container().invoke_on(
+      target, [target, &source_req](read_request_scheduler& s) {
+          return s.proxy_read_request(source_req, target);
+      });
+
+    auto ack = proxy.then(
+      [&source_req](read_request<ss::lowres_clock>::response_t resp) {
+          if (resp.has_value()) {
+              source_req.set_value(std::move(resp.value()));
+          } else {
+              source_req.set_value(resp.error());
+          }
+      });
+
+    // Note: We intentionally do NOT hold the gate here. The gate is only used
+    // for the bg_loop fiber. The detached 'ack' continuation that completes
+    // source_req must be able to run even during shutdown to avoid deadlock.
+    ssx::background = std::move(ack);
+}
+
+ss::future<read_request<ss::lowres_clock>::response_t>
+read_request_scheduler::proxy_read_request(
+  const read_request<ss::lowres_clock>& source_req, ss::shard_id target) {
+    auto now = ss::lowres_clock::now();
+    auto timeout = source_req.expiration_time;
+    if (timeout < now) {
+        co_return std::unexpected(errc::timeout);
+    }
+    // Use pipeline stage id from the _stage object and not from the request.
+    // The request belongs to another pipeline and its stage id doesn't make
+    // sense on the current shard.
+    auto proxy = make_proxy(
+      target, source_req, timeout, &_stage.get_root_rtc(), _stage.id());
+
+    // Check if the pipeline is shutting down before pushing the proxy request
+    if (_stage.stopped()) {
+        co_return std::unexpected(errc::shutting_down);
+    }
+
+    auto f = proxy->response.get_future();
+    _stage.push_next_stage(*proxy);
+    auto res = co_await ss::coroutine::as_future(std::move(f));
+    if (res.failed()) {
+        auto ex = res.get_exception();
+        // Map shutdown exceptions to shutting_down; the rest are unexpected
+        if (ssx::is_shutdown_exception(ex)) {
+            co_return std::unexpected(errc::shutting_down);
+        }
+        co_return std::unexpected(errc::unexpected_failure);
+    }
+    co_return std::move(res.get());
+}
+
+ss::future<> read_request_scheduler::bg_loop() {
+    while (!_stage.stopped()) {
+        // NOTE(1): requests are vectorized but it's not guaranteed
+        // that all extents in the request target the same object.
+        // When they differ, the scheduler still uses only the first extent
+        // to decide the target shard. This could lead to suboptimal
+        // distribution of requests across shards and some edge cases.
+        // To avoid this the caller of 'materialize' must ensure
+        // that the requests are split properly so that all extents
+        // in the request target the same object. This is not a
+        // correctness problem. The only side effect is that we may
+        // download the same objects on multiple shards in parallel in
+        // some cases.
+        //
+        // NOTE(2): cache locality is not a concern here because,
+        // unlike the write path, the read path is only used
+        // when there is a cache miss. Normally, we will not hit this
+        // code path if the cache is working well and there are no
+        // leadership transfers. The goal here is to brute-force the
+        // reconciliation of cache misses as fast as possible.
+        auto res = co_await _stage.pull_fetch_requests(10_MiB);
+        if (!res.has_value()) {
+            if (res.error() == errc::shutting_down) {
+                break;
+            }
+            vlog(
+              _stage.logger().error,
+              "Failed to pull fetch requests: {}",
+              res.error());
+            _stage.register_pipeline_error(res.error());
+            continue;
+        }
+        auto list = std::move(res.value());
+        while (!list.requests.empty()) {
+            auto front = &list.requests.front();
+            list.requests.pop_front();
+            auto target_shard = shard_for(*front);
+            schedule_on(*front, target_shard);
+        }
+    }
+}
+
+} // namespace cloud_topics::l0
diff --git a/src/v/cloud_topics/level_zero/read_request_scheduler/read_request_scheduler.h b/src/v/cloud_topics/level_zero/read_request_scheduler/read_request_scheduler.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright 2025 Redpanda Data, Inc.
+ *
+ * Licensed as a Redpanda Enterprise file under the Redpanda Community
+ * License (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * https://github.com/redpanda-data/redpanda/blob/master/licenses/rcl.md
+ */
+
+#pragma once
+
+#include "base/seastarx.h"
+#include "cloud_topics/level_zero/pipeline/read_pipeline.h"
+#include "cloud_topics/level_zero/pipeline/read_request.h"
+
+#include <seastar/core/abort_source.hh>
+#include <seastar/core/lowres_clock.hh>
+#include <seastar/core/sharded.hh>
+
+namespace cloud_topics::l0 {
+/// Read Request Scheduler
+///
+/// This is a simple fan-out scheduler for read requests.
+/// It directs read requests to different shards based on the
+/// object id. The requests that target the same object id will
+/// always go to the same shard.
+class read_request_scheduler
+  : public ss::peering_sharded_service<read_request_scheduler> {
+public:
+    explicit read_request_scheduler(
+      read_pipeline<ss::lowres_clock>::stage stage);
+
+    ss::future<> start();
+
+    ss::future<> stop();
+
+private:
+    ss::future<> bg_loop();
+
+    /// Schedules request processing on the target shard.
+    ///
+    /// The method sends the request to the target shard.
+    /// The 'target shard' is the shard that performs the processing of the
+    /// request. The 'source shard' is the shard that owns the request. In some
+    /// cases 'source shard' and 'target shard' can be the same. The source_req
+    /// request is just propagated down the pipeline in this case.
+    ///
+    /// \param source_req Request to process. The request is owned by the source
+    /// shard.
+    /// \param target Target shard (the shard that should process the request)
+    void schedule_on(
+      read_request<ss::lowres_clock>& source_req, ss::shard_id target);
+
+    ss::future<read_request<ss::lowres_clock>::response_t> proxy_read_request(
+      const read_request<ss::lowres_clock>& source_req, ss::shard_id target);
+
+    read_pipeline<ss::lowres_clock>::stage _stage;
+    ss::gate _gate;
+};
+} // namespace cloud_topics::l0
diff --git a/src/v/cloud_topics/level_zero/read_request_scheduler/tests/BUILD b/src/v/cloud_topics/level_zero/read_request_scheduler/tests/BUILD
@@ -0,0 +1,26 @@
+load("//bazel:test.bzl", "redpanda_cc_gtest")
+
+redpanda_cc_gtest(
+    name = "read_request_scheduler_test",
+    timeout = "short",
+    srcs = [
+        "read_request_scheduler_test.cc",
+    ],
+    cpu = 4,
+    deps = [
+        "//src/v/base",
+        "//src/v/cloud_topics:types",
+        "//src/v/cloud_topics/level_zero/pipeline:event_filter",
+        "//src/v/cloud_topics/level_zero/pipeline:pipeline_stage",
+        "//src/v/cloud_topics/level_zero/pipeline:read_pipeline",
+        "//src/v/cloud_topics/level_zero/pipeline:read_request",
+        "//src/v/cloud_topics/level_zero/read_request_scheduler",
+        "//src/v/config",
+        "//src/v/model",
+        "//src/v/model/tests:random",
+        "//src/v/test_utils:gtest",
+        "//src/v/utils:uuid",
+        "@googletest//:gtest",
+        "@seastar",
+    ],
+)
diff --git a/src/v/cloud_topics/level_zero/read_request_scheduler/tests/read_request_scheduler_test.cc b/src/v/cloud_topics/level_zero/read_request_scheduler/tests/read_request_scheduler_test.cc
diff --git a/src/v/config/configuration.cc b/src/v/config/configuration.cc
diff --git a/src/v/config/configuration.h b/src/v/config/configuration.h

Original file line number	Diff line number	Diff line change
`@@ -116,6 +116,8 @@ class read_pipeline`
`116`	`116`	`_parent->_probe.register_micro_probe(p);`
`117`	`117`	`}`
`118`	`118`
	`119`	`+ pipeline_stage id() const noexcept { return _ps; }`
	`120`	`+`
`119`	`121`	`private:`
`120`	`122`	`pipeline_stage _ps;`
`121`	`123`	`read_pipeline<Clock>* _parent;`