|
21 | 21 | #undef dout_prefix |
22 | 22 | #define dout_prefix _prefix(_dout, rank, epoch, version) |
23 | 23 |
|
| 24 | +static std::ostream& _dgraph_prefix(std::ostream *_dout, CephContext *cct) { |
| 25 | + return *_dout << "DirectedGraph "; |
| 26 | +} |
| 27 | + |
24 | 28 | static std::ostream& _prefix(std::ostream *_dout, int rank, epoch_t epoch, uint64_t version) { |
25 | 29 | return *_dout << "rank: " << rank << " version: "<< version << " ConnectionTracker(" << epoch << ") "; |
26 | 30 | } |
@@ -270,6 +274,166 @@ void ConnectionTracker::notify_rank_removed(int rank_removed, int new_rank) |
270 | 274 | increase_version(); |
271 | 275 | } |
272 | 276 |
|
| 277 | +#undef dout_prefix |
| 278 | +#define dout_prefix _dgraph_prefix(_dout, cct) |
| 279 | + |
| 280 | +void DirectedGraph::add_outgoing_edge(unsigned from, unsigned to) |
| 281 | +{ |
| 282 | + if (outgoing_edges[from].find(to) == outgoing_edges[from].end()) { |
| 283 | + outgoing_edges[from].insert(to); |
| 284 | + } else { |
| 285 | + ldout(cct, 30) << "Outgoing edge from " << from << " to " << to |
| 286 | + << " already exists in the graph" << dendl; |
| 287 | + } |
| 288 | +} |
| 289 | + |
| 290 | +void DirectedGraph::add_incoming_edge(unsigned to, unsigned from) |
| 291 | +{ |
| 292 | + if (incoming_edges[to].find(from) == incoming_edges[to].end()) { |
| 293 | + incoming_edges[to].insert(from); |
| 294 | + } else { |
| 295 | + ldout(cct, 30) << "Incoming edge to " << to << " from " << to |
| 296 | + << " already exists in the graph" << dendl; |
| 297 | + } |
| 298 | +} |
| 299 | + |
| 300 | +bool DirectedGraph::has_outgoing_edge(unsigned from, unsigned to) const |
| 301 | +{ |
| 302 | + auto from_it = outgoing_edges.find(from); |
| 303 | + if (from_it == outgoing_edges.end()) { |
| 304 | + ldout(cct, 30) << "Node " << from |
| 305 | + << " has no outgoing edges" << dendl; |
| 306 | + return false; |
| 307 | + } |
| 308 | + return from_it->second.find(to) != from_it->second.end(); |
| 309 | +} |
| 310 | + |
| 311 | +bool DirectedGraph::has_incoming_edge(unsigned to, unsigned from) const |
| 312 | +{ |
| 313 | + auto to_it = incoming_edges.find(to); |
| 314 | + if (to_it == incoming_edges.end()) { |
| 315 | + ldout(cct, 30) << "Node " << to |
| 316 | + << " has no incoming edges" << dendl; |
| 317 | + return false; |
| 318 | + } |
| 319 | + return to_it->second.find(from) != to_it->second.end(); |
| 320 | +} |
| 321 | + |
| 322 | +#undef dout_prefix |
| 323 | +#define dout_prefix _prefix(_dout, rank, epoch, version) |
| 324 | + |
| 325 | +std::set<std::pair<unsigned, unsigned>> ConnectionTracker::get_netsplit( |
| 326 | + std::set<unsigned> &mons_down) |
| 327 | +{ |
| 328 | + ldout(cct, 30) << __func__ << dendl; |
| 329 | + /* |
| 330 | + * The netsplit detection algorithm is as follows: |
| 331 | + * 1. Build a directed connectivity graph from peer reports and my reports, |
| 332 | + * excluding down monitors. |
| 333 | + * 2. Find missing connections (partitions). |
| 334 | + * 3. Return the set of pairs of monitors that are in a netsplit. |
| 335 | + * O(m^2) time complexity, where m is the number of monitors. |
| 336 | + * O(m^2) space complexity. |
| 337 | + */ |
| 338 | + // Step 1: Build a directed connectivity graph |
| 339 | + // from peer reports and my reports. Exclude down monitors. |
| 340 | + // peer_reports: |
| 341 | + // 1: {current={0:true,2:true},history={0:0.93,2:0.99},epoch=1,epoch_version=1}, |
| 342 | + // 2: {current={0:true,1:true},history={0:0.93,1:0.85},epoch=1,epoch_version=1} |
| 343 | + // O(m^2) time complexity, where m is the number of monitors |
| 344 | + auto mons_down_end = mons_down.end(); |
| 345 | + peer_reports[rank] = my_reports; |
| 346 | + DirectedGraph bdg(cct); |
| 347 | + for (const auto& [reporter_rank, report] : peer_reports) { |
| 348 | + if (reporter_rank < 0) continue; |
| 349 | + if (mons_down.find(reporter_rank) != mons_down_end) { |
| 350 | + ldout(cct, 30) << "Skipping down monitor: " << reporter_rank << dendl; |
| 351 | + continue; |
| 352 | + } |
| 353 | + for (const auto& [peer_rank, is_connected] : report.current) { |
| 354 | + if (peer_rank < 0) continue; |
| 355 | + if (mons_down.find(peer_rank) != mons_down_end) { |
| 356 | + ldout(cct, 30) << "Skipping down monitor: " << peer_rank << dendl; |
| 357 | + continue; |
| 358 | + } |
| 359 | + if (is_connected) { |
| 360 | + bdg.add_outgoing_edge(reporter_rank, peer_rank); |
| 361 | + bdg.add_incoming_edge(peer_rank, reporter_rank); |
| 362 | + } |
| 363 | + } |
| 364 | + } |
| 365 | + // For debugging purposes: |
| 366 | + if (cct->_conf->subsys.should_gather(ceph_subsys_mon, 30)) { |
| 367 | + ldout(cct, 30) << "Directed graph: " << dendl; |
| 368 | + |
| 369 | + ldout(cct, 30) << "Outgoing edges: {"; |
| 370 | + bool outer_first = true; |
| 371 | + for (const auto& [node, edges] : bdg.outgoing_edges) { |
| 372 | + if (!outer_first) *_dout << ", "; |
| 373 | + outer_first = false; |
| 374 | + *_dout << node << " -> {"; |
| 375 | + bool inner_first = true; |
| 376 | + for (const auto& edge : edges) { |
| 377 | + if (!inner_first) *_dout << ", "; |
| 378 | + inner_first = false; |
| 379 | + *_dout << edge; |
| 380 | + } |
| 381 | + *_dout << "}"; |
| 382 | + } |
| 383 | + *_dout << "}" << dendl; |
| 384 | + |
| 385 | + ldout(cct, 30) << "Incoming edges: {"; |
| 386 | + bool outer_first = true; |
| 387 | + for (const auto& [node, edges] : bdg.incoming_edges) { |
| 388 | + if (!outer_first) *_dout << ", "; |
| 389 | + outer_first = false; |
| 390 | + *_dout << node << " <- {"; |
| 391 | + bool inner_first = true; |
| 392 | + for (const auto& edge : edges) { |
| 393 | + if (!inner_first) *_dout << ", "; |
| 394 | + inner_first = false; |
| 395 | + *_dout << edge; |
| 396 | + } |
| 397 | + *_dout << "}"; |
| 398 | + } |
| 399 | + *_dout << "}" << dendl; |
| 400 | + } |
| 401 | + // Step 2: Find missing connections (partitions) |
| 402 | + // Only consider it a partition if both node and peer doesn't |
| 403 | + // have edges to each other AND have > 0 incoming edges. |
| 404 | + // looping through incoming edges garantees that we are not |
| 405 | + // considering a node without incoming edges as a partition. |
| 406 | + // As for nodes that are not in quourm, they are already exlcuded |
| 407 | + // in the previous step. |
| 408 | + // O(m^2) time complexity, where m is the number of monitors |
| 409 | + std::set<std::pair<unsigned, unsigned>> nsp_pairs; |
| 410 | + for (const auto& [node, _] : bdg.incoming_edges) { |
| 411 | + for (const auto& [peer, _] : bdg.incoming_edges) { |
| 412 | + // Skip self-connections |
| 413 | + if (node == peer) continue; |
| 414 | + // Check for bidirectional communication failure |
| 415 | + if (!bdg.has_outgoing_edge(node, peer) && |
| 416 | + !bdg.has_outgoing_edge(peer, node) && |
| 417 | + !bdg.has_incoming_edge(node, peer) && |
| 418 | + !bdg.has_incoming_edge(peer, node)) { |
| 419 | + // Normalize order to avoid duplicates |
| 420 | + unsigned first = std::min(node, peer); |
| 421 | + unsigned second = std::max(node, peer); |
| 422 | + nsp_pairs.insert(std::make_pair(first, second)); |
| 423 | + } |
| 424 | + } |
| 425 | + } |
| 426 | + // For debugging purposes: |
| 427 | + if (cct->_conf->subsys.should_gather(ceph_subsys_mon, 30)) { |
| 428 | + ldout(cct, 30) << "Netsplit pairs: " << dendl; |
| 429 | + for (const auto& nsp_pair : nsp_pairs) { |
| 430 | + ldout(cct, 30) << "(" << nsp_pair.first << ", " |
| 431 | + << nsp_pair.second << ") " << dendl; |
| 432 | + } |
| 433 | + } |
| 434 | + return nsp_pairs; |
| 435 | +} |
| 436 | + |
273 | 437 | bool ConnectionTracker::is_clean(int mon_rank, int monmap_size) |
274 | 438 | { |
275 | 439 | ldout(cct, 30) << __func__ << dendl; |
|
0 commit comments