Skip to content

Commit 765c4a8

Browse files
Feature/route programming data (#21523)
This PR adds test cases and supporting utilities to measure route programming time under high‑scale BGP IPv6 scenarios, building on the refactoring in PR #21335. It focuses on quantifying how long it takes for routes to be fully programmed after BGP/connection events (e.g., convergence, admin flaps), and verifying that the measured RP times stay within expected limits and similar to the convergence time. Signed-off-by: Priyansh Tratiya <ptratiya@microsoft.com>
1 parent fd90377 commit 765c4a8

File tree

1 file changed

+106
-90
lines changed

1 file changed

+106
-90
lines changed

tests/bgp/test_ipv6_bgp_scale.py

Lines changed: 106 additions & 90 deletions
Original file line numberDiff line numberDiff line change
@@ -414,6 +414,64 @@ def compress_expected_routes(expected_routes):
414414
return b64_str
415415

416416

417+
def get_route_programming_start_time_from_syslog(duthost, connection_type, action, LOG_STAMP, syslog='/var/log/syslog'):
418+
"""
419+
Parse syslog for the first route programming event time. Returns the timestamp of the first route change event.
420+
"""
421+
state = 'down' if action == 'shutdown' else 'up'
422+
if connection_type == 'ports':
423+
cmd = f'grep "swss#portmgrd: " | grep "admin status to {state}"'
424+
elif connection_type == 'bgp_sessions':
425+
cmd = f'grep "admin state is set to \'{state}\'"'
426+
else:
427+
logger.info("[FLAP TEST] No RP analysis for connection_type: %s", connection_type)
428+
return None
429+
log_pattern = f'/{LOG_STAMP}/ {{found=1}} found'
430+
pattern = f'sudo awk "{log_pattern}" {syslog} | {cmd} | head -n 1'
431+
syslog_stamp = duthost.shell(pattern)['stdout'].strip()
432+
shut_time_str = " ".join(syslog_stamp.split()[:4])
433+
rp_start_time = datetime.datetime.strptime(shut_time_str, "%Y %b %d %H:%M:%S.%f")
434+
return rp_start_time
435+
436+
437+
def get_route_programming_metrics_from_sairedis_replay(duthost, start_time, sairedislog='/var/log/swss/sairedis.rec'):
438+
nhg_pattern = "|r|SAI_OBJECT_TYPE_NEXT_HOP_GROUP:"
439+
route_pattern = "|R|SAI_OBJECT_TYPE_ROUTE_ENTRY"
440+
ts_regex = re.compile(r'\d{4}-\d{2}-\d{2}\.\d{2}:\d{2}:\d{2}\.\d+')
441+
442+
def read_lines(path):
443+
try:
444+
return duthost.shell(f"sudo grep -e '{nhg_pattern}' -e '{route_pattern}' {path}")['stdout'].splitlines()
445+
except Exception as e:
446+
logger.warning("Failed to read %s: %s", path, e)
447+
return []
448+
lines = read_lines(sairedislog)
449+
if not lines:
450+
logger.warning("No RP events in %s, trying fallback", sairedislog)
451+
lines = read_lines(sairedislog + ".1")
452+
if not lines:
453+
return {
454+
"RP Start Time": start_time,
455+
"Route Programming Duration": None,
456+
"RP Error": "No RP events found"
457+
}
458+
deltas = []
459+
route_events_count = 0
460+
for line in lines:
461+
m = ts_regex.search(line)
462+
if not m:
463+
continue
464+
ts = datetime.datetime.strptime(m.group(0), "%Y-%m-%d.%H:%M:%S.%f")
465+
if ts <= start_time:
466+
continue
467+
if nhg_pattern in line:
468+
deltas.append((ts - start_time).total_seconds())
469+
elif route_pattern in line:
470+
route_events_count += 1
471+
return {"RP Start Time": start_time, "Route Programming Duration": deltas[-1] if deltas else None,
472+
"Route Events Count": route_events_count, "NextHopGroup Events Count": len(deltas)}
473+
474+
417475
def _select_targets_to_flap(bgp_peers_info, all_flap, flapping_count):
418476
"""Selects flapping_neighbors, injection_neighbor, flapping_ports, injection_port"""
419477
bgp_neighbors = list(bgp_peers_info.keys())
@@ -449,6 +507,7 @@ def flapper(duthost, pdp, bgp_peers_info, transient_setup, flapping_count, conne
449507
- On startup action: Reuses the previously determined injection/flapping selections to restore connectivity and
450508
again validates route convergence and traffic recovery.
451509
- Measures and validates data plane downtime across the operations, helping to detect issues in convergence times.
510+
- Reports and validates route programming data from syslog/sairedis logs for control plane convergence.
452511
- Returns details about the selected connections and test traffic for subsequent phases.
453512
454513
Returns:
@@ -482,11 +541,14 @@ def flapper(duthost, pdp, bgp_peers_info, transient_setup, flapping_count, conne
482541
expected_routes = deepcopy(startup_routes)
483542
remove_routes_with_nexthops(startup_routes, nexthops_to_remove, expected_routes)
484543
compressed_routes = compress_expected_routes(expected_routes)
485-
else:
544+
elif action == 'startup':
486545
compressed_routes = transient_setup['compressed_startup_routes']
487546
injection_port = transient_setup['injection_port']
488547
flapping_connections = transient_setup['flapping_connections']
489548
prefixes = transient_setup['prefixes']
549+
else:
550+
logger.warning(f"Action {action} provided is not supported, skipping flapper function")
551+
return {}
490552

491553
pkts = generate_packets(
492554
prefixes,
@@ -503,6 +565,8 @@ def flapper(duthost, pdp, bgp_peers_info, transient_setup, flapping_count, conne
503565
flush_counters(pdp, exp_mask)
504566
traffic_thread.start()
505567
start_time = datetime.datetime.now()
568+
LOG_STAMP = "RP_ANALYSIS_STAMP_%s" % start_time.strftime("%Y%m%d_%H%M%S")
569+
duthost.shell('sudo logger "%s"' % LOG_STAMP)
506570
try:
507571
result = check_bgp_routes_converged(
508572
duthost=duthost,
@@ -528,6 +592,18 @@ def flapper(duthost, pdp, bgp_peers_info, transient_setup, flapping_count, conne
528592
# Ensure traffic is stopped
529593
terminated.set()
530594
traffic_thread.join()
595+
rp_start_time = get_route_programming_start_time_from_syslog(duthost, connection_type, action, LOG_STAMP)
596+
if rp_start_time:
597+
RP_metrics = get_route_programming_metrics_from_sairedis_replay(duthost, rp_start_time)
598+
logger.info(f"[FLAP TEST] Route programming metrics after {action}: {RP_metrics}")
599+
test_results[f"{current_test}_RP"] = RP_metrics
600+
RP_duration = RP_metrics.get('Route Programming Duration')
601+
if RP_duration is not None and RP_duration > _get_max_time('controlplane_convergence'):
602+
_restore(duthost, connection_type, flapping_connections, all_flap)
603+
pytest.fail(f"RP Time during {current_test} is too long: {RP_duration} seconds")
604+
else:
605+
logger.info(f"[FLAP TEST] No Route Programming metrics found after {action}")
606+
test_results[f"{current_test}_RP"] = "No RP metrics found"
531607

532608
return {
533609
"flapping_connections": flapping_connections,
@@ -537,95 +613,6 @@ def flapper(duthost, pdp, bgp_peers_info, transient_setup, flapping_count, conne
537613
} if action == 'shutdown' else {}
538614

539615

540-
def test_port_flap_with_syslog(
541-
request,
542-
duthost,
543-
bgp_peers_info,
544-
setup_routes_before_test
545-
):
546-
global current_test, test_results
547-
current_test = request.node.name
548-
TIMESTAMP = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
549-
LOG_STAMP = "ONLY_ANALYSIS_LOGS_AFTER_THIS_LINE_%s" % TIMESTAMP
550-
TMP_SYSLOG_FILEPATH = "/tmp/syslog_after_bgp_flapping_%s.log" % TIMESTAMP
551-
bgp_neighbors = [hostname for hostname in bgp_peers_info.keys()]
552-
flapping_neighbor = random.choice(bgp_neighbors)
553-
flapping_ports = [bgp_peers_info[flapping_neighbor][DUT_PORT]]
554-
logger.info("Flapping port: %s", flapping_ports)
555-
556-
startup_routes = get_all_bgp_ipv6_routes(duthost, True)
557-
nexthops_to_remove = [b[IPV6_KEY] for b in bgp_peers_info.values() if b[DUT_PORT] in flapping_ports]
558-
expected_routes = deepcopy(startup_routes)
559-
remove_routes_with_nexthops(startup_routes, nexthops_to_remove, expected_routes)
560-
compressed_expected_routes = compress_expected_routes(expected_routes)
561-
duthost.shell('sudo logger "%s"' % LOG_STAMP)
562-
try:
563-
result = check_bgp_routes_converged(
564-
duthost=duthost,
565-
expected_routes=compressed_expected_routes,
566-
shutdown_connections=flapping_ports,
567-
connection_type='ports',
568-
shutdown_all_connections=False,
569-
timeout=_get_max_time('controlplane_convergence'),
570-
compressed=True,
571-
action='shutdown'
572-
)
573-
574-
if not result.get("converged"):
575-
pytest.fail("BGP routes are not stable in long time")
576-
577-
duthost.shell('sudo logger -f /var/log/swss/sairedis.rec')
578-
duthost.shell('sudo awk "/%s/ {found=1; next} found" %s > %s'
579-
% (LOG_STAMP, '/var/log/syslog', TMP_SYSLOG_FILEPATH))
580-
581-
last_group_update = duthost.shell('sudo cat %s | grep "|C|SAI_OBJECT_TYPE_NEXT_HOP_GROUP_MEMBER||" | tail -n 1'
582-
% TMP_SYSLOG_FILEPATH)['stdout']
583-
last_group_update_time_str = re.search(r'\d{4}-\d{2}-\d{2}\.\d{2}:\d{2}:\d{2}\.\d+', last_group_update).group(0)
584-
last_group_update_time = datetime.datetime.strptime(last_group_update_time_str, "%Y-%m-%d.%H:%M:%S.%f")
585-
586-
port_shut_log = duthost.shell('sudo cat %s | grep "Configure %s admin status to down" | tail -n 1'
587-
% (TMP_SYSLOG_FILEPATH, flapping_ports[0]))['stdout']
588-
port_shut_time_str = " ".join(port_shut_log.split()[:4])
589-
port_shut_time = datetime.datetime.strptime(port_shut_time_str, "%Y %b %d %H:%M:%S.%f")
590-
591-
time_gap = (last_group_update_time - port_shut_time).total_seconds()
592-
if time_gap > _get_max_time('controlplane_convergence'):
593-
pytest.fail("Time is too long, from port shut to last group update is %s seconds" % time_gap)
594-
logger.info("Time difference between port shut and last nexthop group update is %s seconds", time_gap)
595-
test_results[current_test] = "Time between port shut and last nexthop group update is %s seconds" % time_gap
596-
finally:
597-
duthost.no_shutdown_multiple(flapping_ports)
598-
599-
600-
@pytest.mark.parametrize("flapping_port_count", [1, 10, 20, 'all'])
601-
def test_sessions_flapping(
602-
request,
603-
duthost,
604-
ptfadapter,
605-
bgp_peers_info,
606-
flapping_port_count,
607-
setup_routes_before_test
608-
):
609-
'''
610-
Validates that both control plane and data plane remain functional with acceptable downtime when BGP sessions are
611-
flapped (brought down and back up), simulating various failure or maintenance scenarios.
612-
613-
Uses the flapper function to orchestrate the flapping of BGP sessions and measure convergence times.
614-
615-
Parameters range from flapping a single session to all sessions.
616-
617-
Expected result:
618-
Dataplane downtime is less than MAX_DOWNTIME_PORT_FLAPPING or MAX_DOWNTIME_UNISOLATION for all ports.
619-
'''
620-
pdp = ptfadapter.dataplane
621-
pdp.set_qlen(PACKET_QUEUE_LENGTH)
622-
623-
# Measure shutdown convergence
624-
transient_setup = flapper(duthost, pdp, bgp_peers_info, None, flapping_port_count, 'ports', 'shutdown')
625-
# Measure startup convergence
626-
flapper(duthost, pdp, None, transient_setup, flapping_port_count, 'ports', 'startup')
627-
628-
629616
def test_nexthop_group_member_scale(
630617
duthost,
631618
ptfadapter,
@@ -785,3 +772,32 @@ def test_nexthop_group_member_scale(
785772
validate_rx_tx_counters(pdp, end_time, start_time, exp_mask, _get_max_time('dataplane_downtime', 1))
786773
if not result.get("converged"):
787774
pytest.fail("BGP routes are not stable in long time")
775+
776+
777+
@pytest.mark.parametrize("flapping_port_count", [1, 10, 20, 'all'])
778+
def test_sessions_flapping(
779+
request,
780+
duthost,
781+
ptfadapter,
782+
bgp_peers_info,
783+
flapping_port_count,
784+
setup_routes_before_test
785+
):
786+
'''
787+
Validates that both control plane and data plane remain functional with acceptable downtime when BGP sessions are
788+
flapped (brought down and back up), simulating various failure or maintenance scenarios.
789+
790+
Uses the flapper function to orchestrate the flapping of BGP sessions and measure convergence times.
791+
792+
Parameters range from flapping a single session to all sessions.
793+
794+
Expected result:
795+
Dataplane downtime is less than MAX_DOWNTIME_PORT_FLAPPING or MAX_DOWNTIME_UNISOLATION for all ports.
796+
'''
797+
pdp = ptfadapter.dataplane
798+
pdp.set_qlen(PACKET_QUEUE_LENGTH)
799+
800+
# Measure shutdown convergence
801+
transient_setup = flapper(duthost, pdp, bgp_peers_info, None, flapping_port_count, 'ports', 'shutdown')
802+
# Measure startup convergence
803+
flapper(duthost, pdp, None, transient_setup, flapping_port_count, 'ports', 'startup')

0 commit comments

Comments
 (0)