Commit 964417f

Scrape health report for cloud_topic epochs
In the real version, probably use Noah's cluster_support, or whatever it was called.
1 parent 2b6d5b3 commit 964417f
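
For orientation before the diffs: the heart of this change is a fold over per-node cluster health reports. Each requested topic/partition starts out with no epoch; every node report that mentions it contributes its cloud_topic_max_gc_eligible_epoch, and the service keeps the maximum, so a partition that no node reported stays unset and is later returned as gc_error_failed. Below is a minimal standalone sketch of that fold; it uses plain standard-library containers and a hypothetical fold_reports helper instead of the commit's chunked_hash_map-based gc_service_impl::populate_epochs, so treat every name in it as illustrative.

// Illustrative sketch only: std:: containers stand in for Redpanda's
// chunked_hash_map and the cluster health-report structures.
#include <algorithm>
#include <cstdint>
#include <map>
#include <optional>
#include <vector>

using partition_id = int32_t;
using epoch_t = int64_t;

// One node's report: partition -> the max GC-eligible epoch it knows about
// (nullopt if that node has no epoch for the partition yet).
using node_report = std::map<partition_id, std::optional<epoch_t>>;

// Requested partitions start as nullopt; after the fold, anything still
// nullopt corresponds to gc_error_failed in the real service.
std::map<partition_id, std::optional<epoch_t>> fold_reports(
  std::map<partition_id, std::optional<epoch_t>> requested,
  const std::vector<node_report>& reports) {
    for (const auto& report : reports) {
        for (const auto& [pid, maybe_epoch] : report) {
            auto it = requested.find(pid);
            if (it == requested.end() || !maybe_epoch.has_value()) {
                continue; // not requested, or nothing reported by this node
            }
            // Keep the largest epoch observed across all nodes.
            it->second = std::max(it->second, maybe_epoch);
        }
    }
    return requested;
}

In the commit itself this scan is keyed by topic and then partition (see populate_epochs in gc.cc below), and the per-node loop yields to the Seastar reactor between nodes.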

File tree

5 files changed, +134 -23 lines

src/v/redpanda/admin/services/internal/BUILD

Lines changed: 1 addition & 0 deletions
@@ -11,6 +11,7 @@ redpanda_cc_library(
     deps = [
         "//proto/redpanda/core/admin/internal/v1:debug_redpanda_proto",
         "//src/v/base",
+        "//src/v/container:chunked_hash_map",
         "//src/v/finjector",
         "//src/v/redpanda/admin/proxy:client",
         "//src/v/serde/protobuf:rpc",

src/v/redpanda/admin/services/internal/gc.cc

Lines changed: 95 additions & 7 deletions
@@ -12,10 +12,18 @@

 #include "model/fundamental.h"
 #include "model/namespace.h"
+#include "model/timeout_clock.h"
 #include "serde/protobuf/rpc.h"

 #include <seastar/core/coroutine.hh>

+#include <fmt/format.h>
+
+namespace {
+using namespace std::chrono_literals;
+constexpr auto health_report_query_timeout = 10s;
+} // namespace
+
 namespace admin {

 seastar::future<proto::admin::gc::advance_epoch_response>
@@ -64,6 +72,52 @@ gc_service_impl::advance_epoch(
     co_return response;
 }

+auto gc_service_impl::populate_epochs(topic_partition_epoch_map tp_epochs)
+  -> ss::future<topic_partition_epoch_map> {
+    auto health_report = co_await _health_monitor->local().get_cluster_health(
+      cluster::cluster_report_filter{},
+      cluster::force_refresh::yes,
+      model::timeout_clock::now() + health_report_query_timeout);
+
+    if (!health_report.has_value()) {
+        throw serde::pb::rpc::unavailable_exception(
+          fmt::format(
+            "Error retrieving cluster health report: {}",
+            health_report.error()));
+    }
+    fmt::print(std::cerr, "POPULATE EPOCHS: {}\n", tp_epochs.size());
+
+    for (const auto& node_health : health_report.value().node_reports) {
+        for (const auto& [tp_ns, partition_statuses] : node_health->topics) {
+            auto tp_it = tp_epochs.find(tp_ns.tp);
+            if (tp_it == tp_epochs.end()) {
+                fmt::print(std::cerr, "{}: NOT FOUND\n", tp_ns);
+                continue;
+            }
+            for (const auto& [pid, p_status] : partition_statuses) {
+                const auto maybe_max_gc_epoch
+                  = p_status.cloud_topic_max_gc_eligible_epoch;
+                auto p_it = tp_it->second.find(pid);
+                if (p_it == tp_it->second.end()) {
+                    fmt::print(
+                      std::cerr, "{}/{}: PARTITION NOT FOUND\n", tp_ns, pid);
+                    continue;
+                }
+                if (!maybe_max_gc_epoch.has_value()) {
+                    fmt::print(std::cerr, "{}/{}: NO EPOCH\n", tp_ns, pid);
+                    continue;
+                }
+                p_it->second = std::max(p_it->second, maybe_max_gc_epoch);
+                fmt::print(
+                  std::cerr, "{}/{}: EPOCH: {}\n", tp_ns, pid, p_it->second);
+            }
+        }
+        co_await ss::maybe_yield();
+    }
+
+    co_return std::move(tp_epochs);
+}
+
 seastar::future<proto::admin::gc::get_epoch_response>
 gc_service_impl::get_epoch(
   serde::pb::rpc::context, proto::admin::gc::get_epoch_request req) {
@@ -77,20 +131,19 @@ gc_service_impl::get_epoch(
     // 3. Get the current GC epoch for the partition
     // 4. Create result entry with success/failure status

-    // Stub: Return empty response for now
     chunked_vector<topic_partition_get_epoch_result> results;
+    topic_partition_epoch_map epochs;
     for (const auto& tp : req.get_partitions()) {
-        topic_partition_get_epoch_result result;
+        auto cfg = _topic_table->local().get_topic_cfg(
+          model::topic_namespace_view{
+            model::kafka_namespace, model::topic_view{tp.get_topic()}});

+        topic_partition_get_epoch_result result;
         proto::common::topic_partition result_tp;
         result_tp.set_topic(ss::sstring{tp.get_topic()});
         result_tp.set_partition(tp.get_partition());
         result.set_partition(std::move(result_tp));

-        auto cfg = _topic_table->local().get_topic_cfg(
-          model::topic_namespace_view{
-            model::kafka_namespace, model::topic_view{tp.get_topic()}});
-
         if (!cfg.has_value()) {
             result.set_error(error::gc_error_topic_not_found);
         } else if (auto p = tp.get_partition();
@@ -99,12 +152,47 @@ gc_service_impl::get_epoch(
         } else if (!cfg.value().is_cloud_topic()) {
             result.set_error(error::gc_error_not_cloud_topic);
         } else {
-            result.set_error(error::gc_error_failed);
+            // if the requested tp is good, stage it for a health report scan
+            epochs[model::topic_view{result.get_partition().get_topic()}]
+              .try_emplace(
+                model::partition_id{result.get_partition().get_partition()},
+                std::nullopt);
         }

         results.push_back(std::move(result));
     }

+    if (!epochs.empty()) {
+        epochs = co_await populate_epochs(std::move(epochs));
+        for (auto& r : results) {
+            auto tp_it = epochs.find(
+              model::topic_view{r.get_partition().get_topic()});
+            if (tp_it == epochs.end()) {
+                vassert(
+                  r.has_error(),
+                  "Epoch not populated for {}, expected error!",
+                  r.get_partition().get_topic());
+                continue;
+            }
+            auto p_it = tp_it->second.find(
+              model::partition_id{r.get_partition().get_partition()});
+            if (p_it == tp_it->second.end()) {
+                vassert(
+                  r.has_error(),
+                  "Epoch not populated for {}/{}, expected error!",
+                  r.get_partition().get_topic(),
+                  r.get_partition().get_partition());
+                continue;
+            }
+            if (!p_it->second.has_value()) {
+                // TODO(oren): better error code i guess
+                r.set_error(error::gc_error_failed);
+            } else {
+                r.set_epoch(p_it->second.value());
+            }
+        }
+    }
+
     response.set_partitions(std::move(results));

     co_return response;

src/v/redpanda/admin/services/internal/gc.h

Lines changed: 15 additions & 5 deletions
@@ -10,6 +10,7 @@

 #pragma once

+#include "cluster/health_monitor_frontend.h"
 #include "cluster/topic_table.h"
 #include "proto/redpanda/core/admin/internal/cloud_topics/v1/gc.proto.h"

@@ -19,19 +20,28 @@ namespace admin {

 class gc_service_impl : public proto::admin::gc::gc_service {
 public:
-    explicit gc_service_impl(ss::sharded<cluster::topic_table>* tt)
-      : _topic_table(tt) {}
+    explicit gc_service_impl(
+      ss::sharded<cluster::topic_table>* tt,
+      ss::sharded<cluster::health_monitor_frontend>* hm)
+      : _topic_table(tt)
+      , _health_monitor(hm) {}

     seastar::future<proto::admin::gc::advance_epoch_response> advance_epoch(
       serde::pb::rpc::context,
       proto::admin::gc::advance_epoch_request) override;

     seastar::future<proto::admin::gc::get_epoch_response> get_epoch(
-      serde::pb::rpc::context,
-      proto::admin::gc::get_epoch_request) override;
+      serde::pb::rpc::context, proto::admin::gc::get_epoch_request) override;

 private:
-    [[maybe_unused]] ss::sharded<cluster::topic_table>* _topic_table;
+    using partition_epoch_map
+      = chunked_hash_map<model::partition_id, std::optional<int64_t>>;
+    using topic_partition_epoch_map
+      = chunked_hash_map<model::topic, partition_epoch_map>;
+    ss::future<topic_partition_epoch_map>
+    populate_epochs(topic_partition_epoch_map);
+    ss::sharded<cluster::topic_table>* _topic_table;
+    ss::sharded<cluster::health_monitor_frontend>* _health_monitor;
 };

 } // namespace admin

src/v/redpanda/application.cc

Lines changed: 2 additions & 1 deletion
@@ -1189,7 +1189,8 @@ void application::configure_admin_server(model::node_id node_id) {
             &controller->get_topics_state()));
         s.add_service(
           std::make_unique<admin::gc_service_impl>(
-            &controller->get_topics_state()));
+            &controller->get_topics_state(),
+            &controller->get_health_monitor()));
     }
     s.add_service(
       std::make_unique<

tests/rptest/tests/cloud_topics/gc_test.py

Lines changed: 21 additions & 10 deletions
@@ -11,6 +11,7 @@
 from rptest.clients.rpk import RpkTool
 from rptest.clients.types import TopicSpec
 from rptest.services.cluster import cluster
+from rptest.services.kgo_verifier_services import KgoVerifierProducer
 from rptest.services.redpanda import (
     SISettings,
     CLOUD_TOPICS_CONFIG_STR,
@@ -45,6 +46,7 @@ def __init__(self, test_context):
             ),
         )
         self.rpk = RpkTool(self.redpanda)
+        self.test_context = test_context

     @cluster(num_nodes=3)
     def test_advance_epoch_endpoint_availability(self):
@@ -507,15 +509,15 @@ def test_advance_epoch_mixed_errors(self):
             "Successfully verified mixed error conditions are handled correctly"
         )

-    @cluster(num_nodes=3)
+    @cluster(num_nodes=4)
     def test_get_epoch_endpoint_availability(self):
         """
         Test that the get_epoch endpoint is available and returns expected responses.

         This test verifies:
         1. The endpoint is accessible via the admin API
         2. The endpoint accepts requests with topic partitions
-        3. The endpoint returns failed status for each partition (stub implementation)
+        3. The endpoint returns a nonzero epoch for each partition
         """
         admin = Admin(self.redpanda)

@@ -531,6 +533,15 @@ def test_get_epoch_endpoint_availability(self):
             },
         )

+        KgoVerifierProducer.oneshot(
+            self.test_context,
+            self.redpanda,
+            topic_name,
+            msg_size=1024,
+            msg_count=1024,
+            timeout_sec=30,
+        )
+
         # Prepare the get epoch request
         gc_client = admin.gc()

@@ -562,15 +573,15 @@
                 f"Partition {i} result: "
                 f"topic={result.partition.topic if result.HasField('partition') else 'N/A'}, "
                 f"partition={result.partition.partition if result.HasField('partition') else 'N/A'}, "
-                f"result={result.error}"
+                f"result={result.epoch}"
             )

-            assert result.error == gc_pb.GC_ERROR_FAILED, (
-                f"Expected FAILED status for partition {i}, got {result.error}"
+            assert result.epoch > 0, (
+                f"Expected nonzero epoch for partition {i}, got {result.epoch}"
             )

         self.logger.info(
-            "Successfully verified get_epoch endpoint returns failed status as expected"
+            "Successfully verified get_epoch endpoint returns valid epochs as expected"
         )

     @cluster(num_nodes=3)
@@ -849,7 +860,7 @@ def test_get_epoch_mixed_errors(self):
         # Build request with multiple partitions having different error conditions
         request = gc_pb.GetEpochRequest()

-        # 1. Valid cloud topic, valid partition (should return FAILED since stub)
+        # 1. Valid cloud topic, valid partition (should return epoch)
         tp1 = ntp_pb.TopicPartition()
         tp1.topic = topic_name
         tp1.partition = 0
@@ -891,9 +902,9 @@
         )

         # Verify each result
-        # Result 1: Valid cloud topic and partition (should return FAILED in stub)
-        assert response.partitions[0].error == gc_pb.GC_ERROR_FAILED, (
-            f"Expected FAILED for valid topic/partition, got {response.partitions[0].error}"
+        # Result 1: Valid cloud topic and partition (should return epoch)
+        assert response.partitions[0].epoch == 0, (
+            f"Expected epoch for valid topic/partition, got {response.partitions[0].epoch}"
         )

         # Result 2: Non-existent topic
