Skip to content

Commit 7ee2fcd

Browse files
authored
Merge pull request #29047 from bharathv/fix_decom_status
cluster/observability: fix empty partition in decom / reconfiguration status outputs
2 parents 2b05017 + aa11692 commit 7ee2fcd

File tree

7 files changed

+142
-48
lines changed

7 files changed

+142
-48
lines changed

src/v/cluster/controller.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -683,6 +683,7 @@ ss::future<> controller::start(
683683
std::ref(_members_table),
684684
std::ref(_partition_balancer),
685685
std::ref(_partition_manager),
686+
std::ref(_partition_leaders),
686687
std::ref(_as));
687688

688689
co_await _members_backend.invoke_on(

src/v/cluster/controller_api.cc

Lines changed: 70 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
#include "cluster/partition_balancer_backend.h"
2121
#include "cluster/partition_balancer_rpc_service.h"
2222
#include "cluster/partition_balancer_types.h"
23+
#include "cluster/partition_leaders_table.h"
2324
#include "cluster/partition_manager.h"
2425
#include "cluster/shard_table.h"
2526
#include "cluster/topic_table.h"
@@ -49,6 +50,7 @@ controller_api::controller_api(
4950
ss::sharded<members_table>& members,
5051
ss::sharded<partition_balancer_backend>& partition_balancer,
5152
ss::sharded<partition_manager>& partition_manager,
53+
ss::sharded<partition_leaders_table>& partition_leaders,
5254
ss::sharded<ss::abort_source>& as)
5355
: _self(self)
5456
, _backend(backend)
@@ -58,6 +60,7 @@ controller_api::controller_api(
5860
, _members(members)
5961
, _partition_balancer(partition_balancer)
6062
, _partition_manager(partition_manager)
63+
, _partition_leaders(partition_leaders)
6164
, _as(as) {}
6265

6366
ss::future<chunked_vector<ntp_reconciliation_state>>
@@ -283,6 +286,22 @@ controller_api::get_reconciliation_state(
283286
});
284287
}
285288

289+
ss::future<result<ntp_reconciliation_state>>
290+
controller_api::get_partition_leader_reconciliation_state(
291+
model::ntp ntp, model::timeout_clock::time_point timeout) {
292+
auto leader = _partition_leaders.local().get_leader(ntp);
293+
if (!leader) {
294+
vlog(
295+
clusterlog.debug,
296+
"can't get partition leader for ntp {} to get its reconciliation "
297+
"state",
298+
ntp);
299+
co_return result<ntp_reconciliation_state>(errc::no_leader_controller);
300+
}
301+
co_return co_await get_reconciliation_state(
302+
*leader, std::move(ntp), timeout);
303+
}
304+
286305
ss::future<result<ntp_reconciliation_state>>
287306
controller_api::get_reconciliation_state(
288307
model::node_id id, model::ntp ntp, model::timeout_clock::time_point timeout) {
@@ -333,51 +352,60 @@ ss::future<std::error_code> controller_api::wait_for_topic(
333352
}
334353

335354
ss::future<result<chunked_vector<partition_reconfiguration_state>>>
336-
controller_api::get_partitions_reconfiguration_state(
355+
controller_api::get_partitions_leader_reconfiguration_state(
337356
const chunked_vector<model::ntp>& partitions,
338-
model::timeout_clock::time_point) {
357+
model::timeout_clock::time_point timeout) {
339358
auto& updates_in_progress = _topics.local().updates_in_progress();
340-
341359
absl::node_hash_map<model::ntp, partition_reconfiguration_state> states;
342-
for (auto& ntp : partitions) {
343-
auto progress_it = updates_in_progress.find(ntp);
344-
if (progress_it == updates_in_progress.end()) {
345-
continue;
346-
}
347-
auto p_as = _topics.local().get_partition_assignment(ntp);
348-
if (!p_as) {
349-
continue;
350-
}
351-
partition_reconfiguration_state state;
352-
state.ntp = ntp;
353-
354-
state.current_assignment = std::move(p_as->replicas);
355-
state.previous_assignment = progress_it->second.get_previous_replicas();
356-
state.state = progress_it->second.get_state();
357-
state.policy = progress_it->second.get_reconfiguration_policy();
358-
359-
auto reconciliation_state = co_await get_reconciliation_state(ntp);
360-
for (auto& operation : reconciliation_state.pending_operations()) {
361-
if (operation.recovery_state) {
362-
state.current_partition_size
363-
= operation.recovery_state->local_size;
364-
for (auto& [id, recovery_state] :
365-
operation.recovery_state->replicas) {
366-
state.replicas.push_back(
367-
replica_bytes{
368-
.node = id,
369-
.bytes_left = recovery_state.bytes_left,
370-
.bytes_transferred = state.current_partition_size
371-
- recovery_state.bytes_left,
372-
.offset = recovery_state.last_offset,
373-
});
360+
co_await ss::max_concurrent_for_each(
361+
partitions,
362+
16,
363+
[this, &updates_in_progress, &states, timeout](
364+
const model::ntp& ntp) -> ss::future<> {
365+
auto progress_it = updates_in_progress.find(ntp);
366+
if (progress_it == updates_in_progress.end()) {
367+
return ss::now();
368+
}
369+
auto p_as = _topics.local().get_partition_assignment(ntp);
370+
if (!p_as) {
371+
return ss::now();
372+
}
373+
partition_reconfiguration_state state;
374+
state.ntp = ntp;
375+
376+
state.current_assignment = std::move(p_as->replicas);
377+
state.previous_assignment
378+
= progress_it->second.get_previous_replicas();
379+
state.state = progress_it->second.get_state();
380+
state.policy = progress_it->second.get_reconfiguration_policy();
381+
382+
return get_partition_leader_reconciliation_state(ntp, timeout)
383+
.then([&states, &ntp, state = std::move(state)](
384+
result<ntp_reconciliation_state> r) mutable {
385+
if (r.has_value()) {
386+
for (auto& operation : r.value().pending_operations()) {
387+
if (operation.recovery_state) {
388+
state.current_partition_size
389+
= operation.recovery_state->local_size;
390+
for (auto& [id, recovery_state] :
391+
operation.recovery_state->replicas) {
392+
state.replicas.push_back(
393+
replica_bytes{
394+
.node = id,
395+
.bytes_left = recovery_state.bytes_left,
396+
.bytes_transferred
397+
= state.current_partition_size
398+
- recovery_state.bytes_left,
399+
.offset = recovery_state.last_offset,
400+
});
401+
}
402+
}
403+
}
374404
}
375-
}
376-
}
377-
378-
states.emplace(ntp, std::move(state));
379-
}
380-
405+
states.emplace(ntp, std::move(state));
406+
return ss::now();
407+
});
408+
});
381409
chunked_vector<partition_reconfiguration_state> ret;
382410
ret.reserve(states.size());
383411
for (auto& [_, state] : states) {
@@ -499,7 +527,7 @@ controller_api::get_node_decommission_progress(
499527
// replicas that are moving from decommissioned node are still present on a
500528
// node but their metadata is update, add them explicitly
501529
ret.replicas_left += moving_from_node.size();
502-
auto states = co_await get_partitions_reconfiguration_state(
530+
auto states = co_await get_partitions_leader_reconfiguration_state(
503531
std::move(moving_from_node), timeout);
504532

505533
if (states) {

src/v/cluster/controller_api.h

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ class controller_api {
4141
ss::sharded<members_table>&,
4242
ss::sharded<partition_balancer_backend>&,
4343
ss::sharded<partition_manager>&,
44+
ss::sharded<partition_leaders_table>&,
4445
ss::sharded<ss::abort_source>&);
4546

4647
ss::future<result<chunked_vector<ntp_reconciliation_state>>>
@@ -65,12 +66,16 @@ class controller_api {
6566
ss::future<result<ntp_reconciliation_state>> get_reconciliation_state(
6667
model::node_id, model::ntp, model::timeout_clock::time_point);
6768

69+
ss::future<result<ntp_reconciliation_state>>
70+
get_partition_leader_reconciliation_state(
71+
model::ntp, model::timeout_clock::time_point);
72+
6873
// high level APIs
6974
ss::future<std::error_code> wait_for_topic(
7075
model::topic_namespace_view, model::timeout_clock::time_point);
7176

7277
ss::future<result<chunked_vector<partition_reconfiguration_state>>>
73-
get_partitions_reconfiguration_state(
78+
get_partitions_leader_reconfiguration_state(
7479
const chunked_vector<model::ntp>&, model::timeout_clock::time_point);
7580
/**
7681
* Returns state of controller backend from each node in the cluster for
@@ -106,6 +111,7 @@ class controller_api {
106111
ss::sharded<members_table>& _members;
107112
ss::sharded<partition_balancer_backend>& _partition_balancer;
108113
ss::sharded<partition_manager>& _partition_manager;
114+
ss::sharded<partition_leaders_table>& _partition_leaders;
109115
ss::sharded<ss::abort_source>& _as;
110116
};
111117
} // namespace cluster

src/v/cluster/types.cc

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -282,8 +282,9 @@ std::ostream& operator<<(std::ostream& o, const backend_operation& op) {
282282
std::ostream& operator<<(std::ostream& o, const recovery_state& r) {
283283
fmt::print(
284284
o,
285-
"{{local_last_offset: {}, replicas: {}}}",
285+
"{{local_last_offset: {}, local_size: {}, replicas: {}}}",
286286
r.local_last_offset,
287+
r.local_size,
287288
r.replicas);
288289
return o;
289290
}

src/v/cluster/types.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1559,7 +1559,7 @@ struct replica_recovery_state
15591559
};
15601560
struct recovery_state
15611561
: serde::
1562-
envelope<recovery_state, serde::version<0>, serde::compat_version<0>> {
1562+
envelope<recovery_state, serde::version<1>, serde::compat_version<0>> {
15631563
model::offset local_last_offset;
15641564
size_t local_size;
15651565

@@ -1570,7 +1570,9 @@ struct recovery_state
15701570
friend bool operator==(const recovery_state&, const recovery_state&)
15711571
= default;
15721572

1573-
auto serde_fields() { return std::tie(local_last_offset, replicas); }
1573+
auto serde_fields() {
1574+
return std::tie(local_last_offset, replicas, local_size);
1575+
}
15741576
};
15751577

15761578
struct backend_operation

src/v/redpanda/admin/partition.cc

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -216,8 +216,9 @@ admin_server::get_reconfigurations_handler(std::unique_ptr<ss::http::request>) {
216216

217217
auto [reconfiguration_states, reconciliations]
218218
= co_await ss::when_all_succeed(
219-
_controller->get_api().local().get_partitions_reconfiguration_state(
220-
ntps, deadline),
219+
_controller->get_api()
220+
.local()
221+
.get_partitions_leader_reconfiguration_state(ntps, deadline),
221222
_controller->get_api().local().get_global_reconciliation_state(
222223
ntps, deadline));
223224

tests/rptest/tests/nodes_decommissioning_test.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -438,6 +438,61 @@ def test_decommissioning_working_node(
438438
if not delete_topic:
439439
self.verify()
440440

441+
@cluster(num_nodes=6)
442+
def test_decommission_status(self):
443+
self.start_redpanda()
444+
self._create_topics(replication_factors=[3])
445+
self.start_producer()
446+
to_decommission = random.choice(self.redpanda.nodes)
447+
to_decommission_id = self.redpanda.node_id(to_decommission)
448+
self.logger.info(
449+
f"decommissioning node: {to_decommission_id}",
450+
)
451+
# Block recovery to ensure decommissioning takes some time
452+
# and we see some status in the middle of decommissioning
453+
self._set_recovery_rate(0)
454+
self._decommission(to_decommission_id)
455+
456+
def validate_decommission_status():
457+
def check_decommission_status(rpc_node: ClusterNode):
458+
status = self.admin.get_decommission_status(
459+
id=to_decommission_id, node=rpc_node
460+
)
461+
finished = status["finished"]
462+
replicas_left = status["replicas_left"]
463+
partition_meta = []
464+
for p in status.get("partitions", []):
465+
if p["topic"] != self._topic:
466+
# We are only producing data to self._topic
467+
continue
468+
has_valid_meta = (
469+
p["partition_size"] > 0 and p["bytes_left_to_move"] > 0
470+
)
471+
if not has_valid_meta:
472+
self.logger.debug(
473+
f"partition {p} is not reporting valid metadata"
474+
)
475+
partition_meta.append(has_valid_meta)
476+
return (
477+
not finished
478+
and replicas_left > 0
479+
and partition_meta
480+
and all(partition_meta)
481+
)
482+
483+
return all(check_decommission_status(n) for n in self.redpanda.nodes)
484+
485+
self.redpanda.wait_until(
486+
validate_decommission_status,
487+
timeout_sec=30,
488+
backoff_sec=1,
489+
err_msg="Decommission status not reported as in_progress on all nodes",
490+
retry_on_exc=True,
491+
)
492+
self._set_recovery_rate(2 << 30)
493+
494+
self._wait_for_node_removed(to_decommission_id)
495+
441496
@cluster(num_nodes=6, log_allow_list=CHAOS_LOG_ALLOW_LIST)
442497
def test_decommissioning_node_rf_1_replica(self):
443498
self.start_redpanda()

0 commit comments

Comments (0)