@@ -414,6 +414,64 @@ def compress_expected_routes(expected_routes):
414414 return b64_str
415415
416416
417+ def get_route_programming_start_time_from_syslog (duthost , connection_type , action , LOG_STAMP , syslog = '/var/log/syslog' ):
418+ """
419+ Parse syslog for the first route programming event time. Returns the timestamp of the first route change event.
420+ """
421+ state = 'down' if action == 'shutdown' else 'up'
422+ if connection_type == 'ports' :
423+ cmd = f'grep "swss#portmgrd: " | grep "admin status to { state } "'
424+ elif connection_type == 'bgp_sessions' :
425+ cmd = f'grep "admin state is set to \' { state } \' "'
426+ else :
427+ logger .info ("[FLAP TEST] No RP analysis for connection_type: %s" , connection_type )
428+ return None
429+ log_pattern = f'/{ LOG_STAMP } / {{found=1}} found'
430+ pattern = f'sudo awk "{ log_pattern } " { syslog } | { cmd } | head -n 1'
431+ syslog_stamp = duthost .shell (pattern )['stdout' ].strip ()
432+ shut_time_str = " " .join (syslog_stamp .split ()[:4 ])
433+ rp_start_time = datetime .datetime .strptime (shut_time_str , "%Y %b %d %H:%M:%S.%f" )
434+ return rp_start_time
435+
436+
437+ def get_route_programming_metrics_from_sairedis_replay (duthost , start_time , sairedislog = '/var/log/swss/sairedis.rec' ):
438+ nhg_pattern = "|r|SAI_OBJECT_TYPE_NEXT_HOP_GROUP:"
439+ route_pattern = "|R|SAI_OBJECT_TYPE_ROUTE_ENTRY"
440+ ts_regex = re .compile (r'\d{4}-\d{2}-\d{2}\.\d{2}:\d{2}:\d{2}\.\d+' )
441+
442+ def read_lines (path ):
443+ try :
444+ return duthost .shell (f"sudo grep -e '{ nhg_pattern } ' -e '{ route_pattern } ' { path } " )['stdout' ].splitlines ()
445+ except Exception as e :
446+ logger .warning ("Failed to read %s: %s" , path , e )
447+ return []
448+ lines = read_lines (sairedislog )
449+ if not lines :
450+ logger .warning ("No RP events in %s, trying fallback" , sairedislog )
451+ lines = read_lines (sairedislog + ".1" )
452+ if not lines :
453+ return {
454+ "RP Start Time" : start_time ,
455+ "Route Programming Duration" : None ,
456+ "RP Error" : "No RP events found"
457+ }
458+ deltas = []
459+ route_events_count = 0
460+ for line in lines :
461+ m = ts_regex .search (line )
462+ if not m :
463+ continue
464+ ts = datetime .datetime .strptime (m .group (0 ), "%Y-%m-%d.%H:%M:%S.%f" )
465+ if ts <= start_time :
466+ continue
467+ if nhg_pattern in line :
468+ deltas .append ((ts - start_time ).total_seconds ())
469+ elif route_pattern in line :
470+ route_events_count += 1
471+ return {"RP Start Time" : start_time , "Route Programming Duration" : deltas [- 1 ] if deltas else None ,
472+ "Route Events Count" : route_events_count , "NextHopGroup Events Count" : len (deltas )}
473+
474+
417475def _select_targets_to_flap (bgp_peers_info , all_flap , flapping_count ):
418476 """Selects flapping_neighbors, injection_neighbor, flapping_ports, injection_port"""
419477 bgp_neighbors = list (bgp_peers_info .keys ())
@@ -449,6 +507,7 @@ def flapper(duthost, pdp, bgp_peers_info, transient_setup, flapping_count, conne
449507 - On startup action: Reuses the previously determined injection/flapping selections to restore connectivity and
450508 again validates route convergence and traffic recovery.
451509 - Measures and validates data plane downtime across the operations, helping to detect issues in convergence times.
510+ - Reports and validates route programming data from syslog/sairedis logs for control plane convergence.
452511 - Returns details about the selected connections and test traffic for subsequent phases.
453512
454513 Returns:
@@ -482,11 +541,14 @@ def flapper(duthost, pdp, bgp_peers_info, transient_setup, flapping_count, conne
482541 expected_routes = deepcopy (startup_routes )
483542 remove_routes_with_nexthops (startup_routes , nexthops_to_remove , expected_routes )
484543 compressed_routes = compress_expected_routes (expected_routes )
485- else :
544+ elif action == 'startup' :
486545 compressed_routes = transient_setup ['compressed_startup_routes' ]
487546 injection_port = transient_setup ['injection_port' ]
488547 flapping_connections = transient_setup ['flapping_connections' ]
489548 prefixes = transient_setup ['prefixes' ]
549+ else :
550+ logger .warning (f"Action { action } provided is not supported, skipping flapper function" )
551+ return {}
490552
491553 pkts = generate_packets (
492554 prefixes ,
@@ -503,6 +565,8 @@ def flapper(duthost, pdp, bgp_peers_info, transient_setup, flapping_count, conne
503565 flush_counters (pdp , exp_mask )
504566 traffic_thread .start ()
505567 start_time = datetime .datetime .now ()
568+ LOG_STAMP = "RP_ANALYSIS_STAMP_%s" % start_time .strftime ("%Y%m%d_%H%M%S" )
569+ duthost .shell ('sudo logger "%s"' % LOG_STAMP )
506570 try :
507571 result = check_bgp_routes_converged (
508572 duthost = duthost ,
@@ -528,6 +592,18 @@ def flapper(duthost, pdp, bgp_peers_info, transient_setup, flapping_count, conne
528592 # Ensure traffic is stopped
529593 terminated .set ()
530594 traffic_thread .join ()
595+ rp_start_time = get_route_programming_start_time_from_syslog (duthost , connection_type , action , LOG_STAMP )
596+ if rp_start_time :
597+ RP_metrics = get_route_programming_metrics_from_sairedis_replay (duthost , rp_start_time )
598+ logger .info (f"[FLAP TEST] Route programming metrics after { action } : { RP_metrics } " )
599+ test_results [f"{ current_test } _RP" ] = RP_metrics
600+ RP_duration = RP_metrics .get ('Route Programming Duration' )
601+ if RP_duration is not None and RP_duration > _get_max_time ('controlplane_convergence' ):
602+ _restore (duthost , connection_type , flapping_connections , all_flap )
603+ pytest .fail (f"RP Time during { current_test } is too long: { RP_duration } seconds" )
604+ else :
605+ logger .info (f"[FLAP TEST] No Route Programming metrics found after { action } " )
606+ test_results [f"{ current_test } _RP" ] = "No RP metrics found"
531607
532608 return {
533609 "flapping_connections" : flapping_connections ,
@@ -537,95 +613,6 @@ def flapper(duthost, pdp, bgp_peers_info, transient_setup, flapping_count, conne
537613 } if action == 'shutdown' else {}
538614
539615
540- def test_port_flap_with_syslog (
541- request ,
542- duthost ,
543- bgp_peers_info ,
544- setup_routes_before_test
545- ):
546- global current_test , test_results
547- current_test = request .node .name
548- TIMESTAMP = datetime .datetime .now ().strftime ("%Y%m%d_%H%M%S" )
549- LOG_STAMP = "ONLY_ANALYSIS_LOGS_AFTER_THIS_LINE_%s" % TIMESTAMP
550- TMP_SYSLOG_FILEPATH = "/tmp/syslog_after_bgp_flapping_%s.log" % TIMESTAMP
551- bgp_neighbors = [hostname for hostname in bgp_peers_info .keys ()]
552- flapping_neighbor = random .choice (bgp_neighbors )
553- flapping_ports = [bgp_peers_info [flapping_neighbor ][DUT_PORT ]]
554- logger .info ("Flapping port: %s" , flapping_ports )
555-
556- startup_routes = get_all_bgp_ipv6_routes (duthost , True )
557- nexthops_to_remove = [b [IPV6_KEY ] for b in bgp_peers_info .values () if b [DUT_PORT ] in flapping_ports ]
558- expected_routes = deepcopy (startup_routes )
559- remove_routes_with_nexthops (startup_routes , nexthops_to_remove , expected_routes )
560- compressed_expected_routes = compress_expected_routes (expected_routes )
561- duthost .shell ('sudo logger "%s"' % LOG_STAMP )
562- try :
563- result = check_bgp_routes_converged (
564- duthost = duthost ,
565- expected_routes = compressed_expected_routes ,
566- shutdown_connections = flapping_ports ,
567- connection_type = 'ports' ,
568- shutdown_all_connections = False ,
569- timeout = _get_max_time ('controlplane_convergence' ),
570- compressed = True ,
571- action = 'shutdown'
572- )
573-
574- if not result .get ("converged" ):
575- pytest .fail ("BGP routes are not stable in long time" )
576-
577- duthost .shell ('sudo logger -f /var/log/swss/sairedis.rec' )
578- duthost .shell ('sudo awk "/%s/ {found=1; next} found" %s > %s'
579- % (LOG_STAMP , '/var/log/syslog' , TMP_SYSLOG_FILEPATH ))
580-
581- last_group_update = duthost .shell ('sudo cat %s | grep "|C|SAI_OBJECT_TYPE_NEXT_HOP_GROUP_MEMBER||" | tail -n 1'
582- % TMP_SYSLOG_FILEPATH )['stdout' ]
583- last_group_update_time_str = re .search (r'\d{4}-\d{2}-\d{2}\.\d{2}:\d{2}:\d{2}\.\d+' , last_group_update ).group (0 )
584- last_group_update_time = datetime .datetime .strptime (last_group_update_time_str , "%Y-%m-%d.%H:%M:%S.%f" )
585-
586- port_shut_log = duthost .shell ('sudo cat %s | grep "Configure %s admin status to down" | tail -n 1'
587- % (TMP_SYSLOG_FILEPATH , flapping_ports [0 ]))['stdout' ]
588- port_shut_time_str = " " .join (port_shut_log .split ()[:4 ])
589- port_shut_time = datetime .datetime .strptime (port_shut_time_str , "%Y %b %d %H:%M:%S.%f" )
590-
591- time_gap = (last_group_update_time - port_shut_time ).total_seconds ()
592- if time_gap > _get_max_time ('controlplane_convergence' ):
593- pytest .fail ("Time is too long, from port shut to last group update is %s seconds" % time_gap )
594- logger .info ("Time difference between port shut and last nexthop group update is %s seconds" , time_gap )
595- test_results [current_test ] = "Time between port shut and last nexthop group update is %s seconds" % time_gap
596- finally :
597- duthost .no_shutdown_multiple (flapping_ports )
598-
599-
600- @pytest .mark .parametrize ("flapping_port_count" , [1 , 10 , 20 , 'all' ])
601- def test_sessions_flapping (
602- request ,
603- duthost ,
604- ptfadapter ,
605- bgp_peers_info ,
606- flapping_port_count ,
607- setup_routes_before_test
608- ):
609- '''
610- Validates that both control plane and data plane remain functional with acceptable downtime when BGP sessions are
611- flapped (brought down and back up), simulating various failure or maintenance scenarios.
612-
613- Uses the flapper function to orchestrate the flapping of BGP sessions and measure convergence times.
614-
615- Parameters range from flapping a single session to all sessions.
616-
617- Expected result:
618- Dataplane downtime is less than MAX_DOWNTIME_PORT_FLAPPING or MAX_DOWNTIME_UNISOLATION for all ports.
619- '''
620- pdp = ptfadapter .dataplane
621- pdp .set_qlen (PACKET_QUEUE_LENGTH )
622-
623- # Measure shutdown convergence
624- transient_setup = flapper (duthost , pdp , bgp_peers_info , None , flapping_port_count , 'ports' , 'shutdown' )
625- # Measure startup convergence
626- flapper (duthost , pdp , None , transient_setup , flapping_port_count , 'ports' , 'startup' )
627-
628-
629616def test_nexthop_group_member_scale (
630617 duthost ,
631618 ptfadapter ,
@@ -785,3 +772,32 @@ def test_nexthop_group_member_scale(
785772 validate_rx_tx_counters (pdp , end_time , start_time , exp_mask , _get_max_time ('dataplane_downtime' , 1 ))
786773 if not result .get ("converged" ):
787774 pytest .fail ("BGP routes are not stable in long time" )
775+
776+
777+ @pytest .mark .parametrize ("flapping_port_count" , [1 , 10 , 20 , 'all' ])
778+ def test_sessions_flapping (
779+ request ,
780+ duthost ,
781+ ptfadapter ,
782+ bgp_peers_info ,
783+ flapping_port_count ,
784+ setup_routes_before_test
785+ ):
786+ '''
787+ Validates that both control plane and data plane remain functional with acceptable downtime when BGP sessions are
788+ flapped (brought down and back up), simulating various failure or maintenance scenarios.
789+
790+ Uses the flapper function to orchestrate the flapping of BGP sessions and measure convergence times.
791+
792+ Parameters range from flapping a single session to all sessions.
793+
794+ Expected result:
795+ Dataplane downtime is less than MAX_DOWNTIME_PORT_FLAPPING or MAX_DOWNTIME_UNISOLATION for all ports.
796+ '''
797+ pdp = ptfadapter .dataplane
798+ pdp .set_qlen (PACKET_QUEUE_LENGTH )
799+
800+ # Measure shutdown convergence
801+ transient_setup = flapper (duthost , pdp , bgp_peers_info , None , flapping_port_count , 'ports' , 'shutdown' )
802+ # Measure startup convergence
803+ flapper (duthost , pdp , None , transient_setup , flapping_port_count , 'ports' , 'startup' )
0 commit comments