Skip to content

Commit 206a4c5

Browse files
authored
Merge pull request ceph#59248 from kamoltat/wip-ksirivad-improve-netsplit-warning
HealthMonitor: Add topology-aware netsplit detection and warning Reviewed-by: Anthony D'Atri <[email protected]> Reviewed-by: Samuel Just <[email protected]>
2 parents e3b2df6 + a5248f5 commit 206a4c5

File tree

7 files changed

+465
-4
lines changed

7 files changed

+465
-4
lines changed

doc/rados/operations/health-checks.rst

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,24 @@ To adjust the warning threshold, run the following command:
153153

154154
ceph config set global mon_data_size_warn <size>
155155

156+
MON_NETSPLIT
157+
____________
158+
159+
A network partition has occurred among Ceph Monitors. This health check is
160+
raised when one or more monitors detect that at least two Ceph Monitors have
161+
lost connectivity or reachability, based on their individual connection scores,
162+
which are frequently updated. This warning only appears when
163+
the cluster is provisioned with at least three Ceph Monitors and is using the
164+
``connectivity`` election strategy.
165+
166+
Network partitions are reported in two ways:
167+
- As location-level netsplits (e.g., "Netsplit detected between dc1 and dc2") when
168+
no monitor in one location can communicate with any monitor in another location
169+
- As individual monitor netsplits (e.g., "Netsplit detected between mon.a and mon.d")
170+
when only specific monitors are disconnected across locations
171+
172+
The system prioritizes reporting at the highest topology level (``datacenter``, ``rack``, etc.)
173+
when possible, to better help operators identify infrastructure-level network issues.
156174

157175
AUTH_INSECURE_GLOBAL_ID_RECLAIM
158176
_______________________________

src/mon/ConnectionTracker.cc

Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,10 @@
2121
#undef dout_prefix
2222
#define dout_prefix _prefix(_dout, rank, epoch, version)
2323

24+
// Writes the fixed "DirectedGraph " tag that precedes log lines emitted while
// dout_prefix is defined as _dgraph_prefix(_dout, cct). The cct argument is
// accepted but not used by this function.
static std::ostream& _dgraph_prefix(std::ostream *_dout, CephContext *cct) {
  std::ostream& out = *_dout;
  out << "DirectedGraph ";
  return out;
}
27+
2428
// Writes the ConnectionTracker log prefix: the rank, the version, and the
// epoch (inside "ConnectionTracker(...)"), followed by a trailing space.
static std::ostream& _prefix(std::ostream *_dout, int rank, epoch_t epoch, uint64_t version) {
  std::ostream& out = *_dout;
  out << "rank: " << rank;
  out << " version: " << version;
  out << " ConnectionTracker(" << epoch << ") ";
  return out;
}
@@ -270,6 +274,166 @@ void ConnectionTracker::notify_rank_removed(int rank_removed, int new_rank)
270274
increase_version();
271275
}
272276

277+
#undef dout_prefix
278+
#define dout_prefix _dgraph_prefix(_dout, cct)
279+
280+
void DirectedGraph::add_outgoing_edge(unsigned from, unsigned to)
281+
{
282+
if (outgoing_edges[from].find(to) == outgoing_edges[from].end()) {
283+
outgoing_edges[from].insert(to);
284+
} else {
285+
ldout(cct, 30) << "Outgoing edge from " << from << " to " << to
286+
<< " already exists in the graph" << dendl;
287+
}
288+
}
289+
290+
void DirectedGraph::add_incoming_edge(unsigned to, unsigned from)
291+
{
292+
if (incoming_edges[to].find(from) == incoming_edges[to].end()) {
293+
incoming_edges[to].insert(from);
294+
} else {
295+
ldout(cct, 30) << "Incoming edge to " << to << " from " << to
296+
<< " already exists in the graph" << dendl;
297+
}
298+
}
299+
300+
bool DirectedGraph::has_outgoing_edge(unsigned from, unsigned to) const
301+
{
302+
auto from_it = outgoing_edges.find(from);
303+
if (from_it == outgoing_edges.end()) {
304+
ldout(cct, 30) << "Node " << from
305+
<< " has no outgoing edges" << dendl;
306+
return false;
307+
}
308+
return from_it->second.find(to) != from_it->second.end();
309+
}
310+
311+
bool DirectedGraph::has_incoming_edge(unsigned to, unsigned from) const
312+
{
313+
auto to_it = incoming_edges.find(to);
314+
if (to_it == incoming_edges.end()) {
315+
ldout(cct, 30) << "Node " << to
316+
<< " has no incoming edges" << dendl;
317+
return false;
318+
}
319+
return to_it->second.find(from) != to_it->second.end();
320+
}
321+
322+
#undef dout_prefix
323+
#define dout_prefix _prefix(_dout, rank, epoch, version)
324+
325+
std::set<std::pair<unsigned, unsigned>> ConnectionTracker::get_netsplit(
326+
std::set<unsigned> &mons_down)
327+
{
328+
ldout(cct, 30) << __func__ << dendl;
329+
/*
330+
* The netsplit detection algorithm is as follows:
331+
* 1. Build a directed connectivity graph from peer reports and my reports,
332+
* excluding down monitors.
333+
* 2. Find missing connections (partitions).
334+
* 3. Return the set of pairs of monitors that are in a netsplit.
335+
* O(m^2) time complexity, where m is the number of monitors.
336+
* O(m^2) space complexity.
337+
*/
338+
// Step 1: Build a directed connectivity graph
339+
// from peer reports and my reports. Exclude down monitors.
340+
// peer_reports:
341+
// 1: {current={0:true,2:true},history={0:0.93,2:0.99},epoch=1,epoch_version=1},
342+
// 2: {current={0:true,1:true},history={0:0.93,1:0.85},epoch=1,epoch_version=1}
343+
// O(m^2) time complexity, where m is the number of monitors
344+
auto mons_down_end = mons_down.end();
345+
peer_reports[rank] = my_reports;
346+
DirectedGraph bdg(cct);
347+
for (const auto& [reporter_rank, report] : peer_reports) {
348+
if (reporter_rank < 0) continue;
349+
if (mons_down.find(reporter_rank) != mons_down_end) {
350+
ldout(cct, 30) << "Skipping down monitor: " << reporter_rank << dendl;
351+
continue;
352+
}
353+
for (const auto& [peer_rank, is_connected] : report.current) {
354+
if (peer_rank < 0) continue;
355+
if (mons_down.find(peer_rank) != mons_down_end) {
356+
ldout(cct, 30) << "Skipping down monitor: " << peer_rank << dendl;
357+
continue;
358+
}
359+
if (is_connected) {
360+
bdg.add_outgoing_edge(reporter_rank, peer_rank);
361+
bdg.add_incoming_edge(peer_rank, reporter_rank);
362+
}
363+
}
364+
}
365+
// For debugging purposes:
366+
if (cct->_conf->subsys.should_gather(ceph_subsys_mon, 30)) {
367+
ldout(cct, 30) << "Directed graph: " << dendl;
368+
369+
ldout(cct, 30) << "Outgoing edges: {";
370+
bool outer_first = true;
371+
for (const auto& [node, edges] : bdg.outgoing_edges) {
372+
if (!outer_first) *_dout << ", ";
373+
outer_first = false;
374+
*_dout << node << " -> {";
375+
bool inner_first = true;
376+
for (const auto& edge : edges) {
377+
if (!inner_first) *_dout << ", ";
378+
inner_first = false;
379+
*_dout << edge;
380+
}
381+
*_dout << "}";
382+
}
383+
*_dout << "}" << dendl;
384+
385+
ldout(cct, 30) << "Incoming edges: {";
386+
bool outer_first = true;
387+
for (const auto& [node, edges] : bdg.incoming_edges) {
388+
if (!outer_first) *_dout << ", ";
389+
outer_first = false;
390+
*_dout << node << " <- {";
391+
bool inner_first = true;
392+
for (const auto& edge : edges) {
393+
if (!inner_first) *_dout << ", ";
394+
inner_first = false;
395+
*_dout << edge;
396+
}
397+
*_dout << "}";
398+
}
399+
*_dout << "}" << dendl;
400+
}
401+
// Step 2: Find missing connections (partitions)
402+
// Only consider it a partition if both node and peer doesn't
403+
// have edges to each other AND have > 0 incoming edges.
404+
// looping through incoming edges garantees that we are not
405+
// considering a node without incoming edges as a partition.
406+
// As for nodes that are not in quourm, they are already exlcuded
407+
// in the previous step.
408+
// O(m^2) time complexity, where m is the number of monitors
409+
std::set<std::pair<unsigned, unsigned>> nsp_pairs;
410+
for (const auto& [node, _] : bdg.incoming_edges) {
411+
for (const auto& [peer, _] : bdg.incoming_edges) {
412+
// Skip self-connections
413+
if (node == peer) continue;
414+
// Check for bidirectional communication failure
415+
if (!bdg.has_outgoing_edge(node, peer) &&
416+
!bdg.has_outgoing_edge(peer, node) &&
417+
!bdg.has_incoming_edge(node, peer) &&
418+
!bdg.has_incoming_edge(peer, node)) {
419+
// Normalize order to avoid duplicates
420+
unsigned first = std::min(node, peer);
421+
unsigned second = std::max(node, peer);
422+
nsp_pairs.insert(std::make_pair(first, second));
423+
}
424+
}
425+
}
426+
// For debugging purposes:
427+
if (cct->_conf->subsys.should_gather(ceph_subsys_mon, 30)) {
428+
ldout(cct, 30) << "Netsplit pairs: " << dendl;
429+
for (const auto& nsp_pair : nsp_pairs) {
430+
ldout(cct, 30) << "(" << nsp_pair.first << ", "
431+
<< nsp_pair.second << ") " << dendl;
432+
}
433+
}
434+
return nsp_pairs;
435+
}
436+
273437
bool ConnectionTracker::is_clean(int mon_rank, int monmap_size)
274438
{
275439
ldout(cct, 30) << __func__ << dendl;

src/mon/ConnectionTracker.h

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,21 @@ struct ConnectionReport {
5151
};
5252
WRITE_CLASS_ENCODER(ConnectionReport);
5353

54+
struct DirectedGraph {
55+
// The set of nodes in the graph
56+
// works with only non-negative ranks
57+
// because we only run the algorithm when
58+
// all monitors have valid ranks.
59+
std::map<unsigned, std::set<unsigned>> outgoing_edges;
60+
std::map<unsigned, std::set<unsigned>> incoming_edges;
61+
CephContext *cct;
62+
DirectedGraph(CephContext *c) : cct(c) {}
63+
void add_outgoing_edge(unsigned from, unsigned to);
64+
void add_incoming_edge(unsigned to, unsigned from);
65+
bool has_outgoing_edge(unsigned from, unsigned to) const;
66+
bool has_incoming_edge(unsigned to, unsigned from) const;
67+
};
68+
5469
class RankProvider {
5570
public:
5671
/**
@@ -127,6 +142,12 @@ class ConnectionTracker {
127142
* current and history of each peer_report.
128143
*/
129144
bool is_clean(int mon_rank, int monmap_size);
145+
/**
146+
* Get the set of monitor pairs that are disconnected
147+
* due to network partitions.
148+
* This is a set of pairs (rank1, rank2) where rank1 < rank2.
149+
*/
150+
std::set<std::pair<unsigned, unsigned>> get_netsplit(std::set<unsigned> &mons_down);
130151
/**
131152
* Encode this ConnectionTracker. Useful both for storing on disk
132153
* and for sending off to peers for decoding and import

src/mon/Elector.cc

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -771,6 +771,11 @@ bool Elector::peer_tracker_is_clean()
771771
return peer_tracker.is_clean(mon->rank, paxos_size());
772772
}
773773

774+
std::set<std::pair<unsigned, unsigned>> Elector::get_netsplit_peer_tracker(std::set<unsigned> &mons_down)
775+
{
776+
return peer_tracker.get_netsplit(mons_down);
777+
}
778+
774779
bool Elector::is_tiebreaker(int rank) const
775780
{
776781
return mon->monmap->tiebreaker_mon == mon->monmap->get_name(rank);

src/mon/Elector.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -379,6 +379,9 @@ class Elector : public ElectionOwner, RankProvider {
379379
* https://tracker.ceph.com/issues/58049
380380
*/
381381
bool peer_tracker_is_clean();
382+
383+
std::set<std::pair<unsigned, unsigned>> get_netsplit_peer_tracker(std::set<unsigned> &mons_down);
384+
382385
/**
383386
* Forget everything about our peers. :(
384387
*/

0 commit comments

Comments
 (0)