Commit 964417f

Scrape health report for cloud_topic epochs
In the real version, probably use Noah's cluster_support, or whatever it was called.
1 parent 2b6d5b3 commit 964417f
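
For orientation before the diffs: the heart of this change is a fold over per-node cluster health reports. Each requested topic/partition starts out with no epoch; every node report that mentions it contributes its cloud_topic_max_gc_eligible_epoch, and the service keeps the maximum, so a partition that no node reported stays unset and is later returned as gc_error_failed. Below is a minimal standalone sketch of that fold; it uses plain standard-library containers and a hypothetical fold_reports helper instead of the commit's chunked_hash_map-based gc_service_impl::populate_epochs, so treat every name in it as illustrative.

// Illustrative sketch only: std:: containers stand in for Redpanda's
// chunked_hash_map and the cluster health-report structures.
#include <algorithm>
#include <cstdint>
#include <map>
#include <optional>
#include <vector>

using partition_id = int32_t;
using epoch_t = int64_t;

// One node's report: partition -> the max GC-eligible epoch it knows about
// (nullopt if that node has no epoch for the partition yet).
using node_report = std::map<partition_id, std::optional<epoch_t>>;

// Requested partitions start as nullopt; after the fold, anything still
// nullopt corresponds to gc_error_failed in the real service.
std::map<partition_id, std::optional<epoch_t>> fold_reports(
  std::map<partition_id, std::optional<epoch_t>> requested,
  const std::vector<node_report>& reports) {
    for (const auto& report : reports) {
        for (const auto& [pid, maybe_epoch] : report) {
            auto it = requested.find(pid);
            if (it == requested.end() || !maybe_epoch.has_value()) {
                continue; // not requested, or nothing reported by this node
            }
            // Keep the largest epoch observed across all nodes.
            it->second = std::max(it->second, maybe_epoch);
        }
    }
    return requested;
}

In the commit itself this scan is keyed by topic and then partition (see populate_epochs in gc.cc below), and the per-node loop yields to the Seastar reactor between nodes.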

File tree

5 files changed, +134 -23 lines

src/v/redpanda/admin/services/internal/BUILD

Lines changed: 1 addition & 0 deletions
@@ -11,6 +11,7 @@ redpanda_cc_library(
     deps = [
         "//proto/redpanda/core/admin/internal/v1:debug_redpanda_proto",
         "//src/v/base",
+        "//src/v/container:chunked_hash_map",
         "//src/v/finjector",
         "//src/v/redpanda/admin/proxy:client",
         "//src/v/serde/protobuf:rpc",

src/v/redpanda/admin/services/internal/gc.cc

Lines changed: 95 additions & 7 deletions
@@ -12,10 +12,18 @@

 #include "model/fundamental.h"
 #include "model/namespace.h"
+#include "model/timeout_clock.h"
 #include "serde/protobuf/rpc.h"

 #include <seastar/core/coroutine.hh>

+#include <fmt/format.h>
+
+namespace {
+using namespace std::chrono_literals;
+constexpr auto health_report_query_timeout = 10s;
+} // namespace
+
 namespace admin {

 seastar::future<proto::admin::gc::advance_epoch_response>
@@ -64,6 +72,52 @@ gc_service_impl::advance_epoch(
     co_return response;
 }

+auto gc_service_impl::populate_epochs(topic_partition_epoch_map tp_epochs)
+  -> ss::future<topic_partition_epoch_map> {
+    auto health_report = co_await _health_monitor->local().get_cluster_health(
+      cluster::cluster_report_filter{},
+      cluster::force_refresh::yes,
+      model::timeout_clock::now() + health_report_query_timeout);
+
+    if (!health_report.has_value()) {
+        throw serde::pb::rpc::unavailable_exception(
+          fmt::format(
+            "Error retrieving cluster health report: {}",
+            health_report.error()));
+    }
+    fmt::print(std::cerr, "POPULATE EPOCHS: {}\n", tp_epochs.size());
+
+    for (const auto& node_health : health_report.value().node_reports) {
+        for (const auto& [tp_ns, partition_statuses] : node_health->topics) {
+            auto tp_it = tp_epochs.find(tp_ns.tp);
+            if (tp_it == tp_epochs.end()) {
+                fmt::print(std::cerr, "{}: NOT FOUND\n", tp_ns);
+                continue;
+            }
+            for (const auto& [pid, p_status] : partition_statuses) {
+                const auto maybe_max_gc_epoch
+                  = p_status.cloud_topic_max_gc_eligible_epoch;
+                auto p_it = tp_it->second.find(pid);
+                if (p_it == tp_it->second.end()) {
+                    fmt::print(
+                      std::cerr, "{}/{}: PARTITION NOT FOUND\n", tp_ns, pid);
+                    continue;
+                }
+                if (!maybe_max_gc_epoch.has_value()) {
+                    fmt::print(std::cerr, "{}/{}: NO EPOCH\n", tp_ns, pid);
+                    continue;
+                }
+                p_it->second = std::max(p_it->second, maybe_max_gc_epoch);
+                fmt::print(
+                  std::cerr, "{}/{}: EPOCH: {}\n", tp_ns, pid, p_it->second);
+            }
+        }
+        co_await ss::maybe_yield();
+    }
+
+    co_return std::move(tp_epochs);
+}
+
 seastar::future<proto::admin::gc::get_epoch_response>
 gc_service_impl::get_epoch(
   serde::pb::rpc::context, proto::admin::gc::get_epoch_request req) {
@@ -77,20 +131,19 @@ gc_service_impl::get_epoch(
     // 3. Get the current GC epoch for the partition
     // 4. Create result entry with success/failure status

-    // Stub: Return empty response for now
     chunked_vector<topic_partition_get_epoch_result> results;
+    topic_partition_epoch_map epochs;
     for (const auto& tp : req.get_partitions()) {
-        topic_partition_get_epoch_result result;
+        auto cfg = _topic_table->local().get_topic_cfg(
+          model::topic_namespace_view{
+            model::kafka_namespace, model::topic_view{tp.get_topic()}});

+        topic_partition_get_epoch_result result;
         proto::common::topic_partition result_tp;
         result_tp.set_topic(ss::sstring{tp.get_topic()});
         result_tp.set_partition(tp.get_partition());
         result.set_partition(std::move(result_tp));

-        auto cfg = _topic_table->local().get_topic_cfg(
-          model::topic_namespace_view{
-            model::kafka_namespace, model::topic_view{tp.get_topic()}});
-
         if (!cfg.has_value()) {
             result.set_error(error::gc_error_topic_not_found);
         } else if (auto p = tp.get_partition();
@@ -99,12 +152,47 @@ gc_service_impl::get_epoch(
         } else if (!cfg.value().is_cloud_topic()) {
             result.set_error(error::gc_error_not_cloud_topic);
         } else {
-            result.set_error(error::gc_error_failed);
+            // if the requested tp is good, stage it for a health report scan
+            epochs[model::topic_view{result.get_partition().get_topic()}]
+              .try_emplace(
+                model::partition_id{result.get_partition().get_partition()},
+                std::nullopt);
         }

         results.push_back(std::move(result));
     }

+    if (!epochs.empty()) {
+        epochs = co_await populate_epochs(std::move(epochs));
+        for (auto& r : results) {
+            auto tp_it = epochs.find(
+              model::topic_view{r.get_partition().get_topic()});
+            if (tp_it == epochs.end()) {
+                vassert(
+                  r.has_error(),
+                  "Epoch not populated for {}, expected error!",
+                  r.get_partition().get_topic());
+                continue;
+            }
+            auto p_it = tp_it->second.find(
+              model::partition_id{r.get_partition().get_partition()});
+            if (p_it == tp_it->second.end()) {
+                vassert(
+                  r.has_error(),
+                  "Epoch not populated for {}/{}, expected error!",
+                  r.get_partition().get_topic(),
+                  r.get_partition().get_partition());
+                continue;
+            }
+            if (!p_it->second.has_value()) {
+                // TODO(oren): better error code i guess
+                r.set_error(error::gc_error_failed);
+            } else {
+                r.set_epoch(p_it->second.value());
+            }
+        }
+    }
+
     response.set_partitions(std::move(results));

     co_return response;

src/v/redpanda/admin/services/internal/gc.h

Lines changed: 15 additions & 5 deletions
@@ -10,6 +10,7 @@

 #pragma once

+#include "cluster/health_monitor_frontend.h"
 #include "cluster/topic_table.h"
 #include "proto/redpanda/core/admin/internal/cloud_topics/v1/gc.proto.h"

@@ -19,19 +20,28 @@ namespace admin {

 class gc_service_impl : public proto::admin::gc::gc_service {
 public:
-    explicit gc_service_impl(ss::sharded<cluster::topic_table>* tt)
-      : _topic_table(tt) {}
+    explicit gc_service_impl(
+      ss::sharded<cluster::topic_table>* tt,
+      ss::sharded<cluster::health_monitor_frontend>* hm)
+      : _topic_table(tt)
+      , _health_monitor(hm) {}

     seastar::future<proto::admin::gc::advance_epoch_response> advance_epoch(
       serde::pb::rpc::context,
       proto::admin::gc::advance_epoch_request) override;

     seastar::future<proto::admin::gc::get_epoch_response> get_epoch(
-      serde::pb::rpc::context,
-      proto::admin::gc::get_epoch_request) override;
+      serde::pb::rpc::context, proto::admin::gc::get_epoch_request) override;

 private:
-    [[maybe_unused]] ss::sharded<cluster::topic_table>* _topic_table;
+    using partition_epoch_map
+      = chunked_hash_map<model::partition_id, std::optional<int64_t>>;
+    using topic_partition_epoch_map
+      = chunked_hash_map<model::topic, partition_epoch_map>;
+    ss::future<topic_partition_epoch_map>
+    populate_epochs(topic_partition_epoch_map);
+    ss::sharded<cluster::topic_table>* _topic_table;
+    ss::sharded<cluster::health_monitor_frontend>* _health_monitor;
 };

 } // namespace admin

src/v/redpanda/application.cc

Lines changed: 2 additions & 1 deletion
@@ -1189,7 +1189,8 @@ void application::configure_admin_server(model::node_id node_id) {
             &controller->get_topics_state()));
         s.add_service(
           std::make_unique<admin::gc_service_impl>(
-            &controller->get_topics_state()));
+            &controller->get_topics_state(),
+            &controller->get_health_monitor()));
     }
     s.add_service(
       std::make_unique<

tests/rptest/tests/cloud_topics/gc_test.py

Lines changed: 21 additions & 10 deletions
@@ -11,6 +11,7 @@
 from rptest.clients.rpk import RpkTool
 from rptest.clients.types import TopicSpec
 from rptest.services.cluster import cluster
+from rptest.services.kgo_verifier_services import KgoVerifierProducer
 from rptest.services.redpanda import (
     SISettings,
     CLOUD_TOPICS_CONFIG_STR,
@@ -45,6 +46,7 @@ def __init__(self, test_context):
             ),
         )
         self.rpk = RpkTool(self.redpanda)
+        self.test_context = test_context

     @cluster(num_nodes=3)
     def test_advance_epoch_endpoint_availability(self):
@@ -507,15 +509,15 @@ def test_advance_epoch_mixed_errors(self):
             "Successfully verified mixed error conditions are handled correctly"
         )

-    @cluster(num_nodes=3)
+    @cluster(num_nodes=4)
     def test_get_epoch_endpoint_availability(self):
         """
         Test that the get_epoch endpoint is available and returns expected responses.

         This test verifies:
         1. The endpoint is accessible via the admin API
         2. The endpoint accepts requests with topic partitions
-        3. The endpoint returns failed status for each partition (stub implementation)
+        3. The endpoint returns a nonzero epoch for each partition
         """
         admin = Admin(self.redpanda)

@@ -531,6 +533,15 @@ def test_get_epoch_endpoint_availability(self):
             },
         )

+        KgoVerifierProducer.oneshot(
+            self.test_context,
+            self.redpanda,
+            topic_name,
+            msg_size=1024,
+            msg_count=1024,
+            timeout_sec=30,
+        )
+
         # Prepare the get epoch request
         gc_client = admin.gc()

@@ -562,15 +573,15 @@
                 f"Partition {i} result: "
                 f"topic={result.partition.topic if result.HasField('partition') else 'N/A'}, "
                 f"partition={result.partition.partition if result.HasField('partition') else 'N/A'}, "
-                f"result={result.error}"
+                f"result={result.epoch}"
             )

-            assert result.error == gc_pb.GC_ERROR_FAILED, (
-                f"Expected FAILED status for partition {i}, got {result.error}"
+            assert result.epoch > 0, (
+                f"Expected nonzero epoch for partition {i}, got {result.epoch}"
             )

         self.logger.info(
-            "Successfully verified get_epoch endpoint returns failed status as expected"
+            "Successfully verified get_epoch endpoint returns valid epochs as expected"
         )

     @cluster(num_nodes=3)
@@ -849,7 +860,7 @@ def test_get_epoch_mixed_errors(self):
         # Build request with multiple partitions having different error conditions
         request = gc_pb.GetEpochRequest()

-        # 1. Valid cloud topic, valid partition (should return FAILED since stub)
+        # 1. Valid cloud topic, valid partition (should return epoch)
         tp1 = ntp_pb.TopicPartition()
         tp1.topic = topic_name
         tp1.partition = 0
@@ -891,9 +902,9 @@
         )

         # Verify each result
-        # Result 1: Valid cloud topic and partition (should return FAILED in stub)
-        assert response.partitions[0].error == gc_pb.GC_ERROR_FAILED, (
-            f"Expected FAILED for valid topic/partition, got {response.partitions[0].error}"
+        # Result 1: Valid cloud topic and partition (should return epoch)
+        assert response.partitions[0].epoch == 0, (
+            f"Expected epoch for valid topic/partition, got {response.partitions[0].epoch}"
         )

         # Result 2: Non-existent topic
