@@ -333,13 +333,23 @@ def kill(self):
333333 # https://github.com/firecracker-microvm/firecracker/pull/4442/commits/d63eb7a65ffaaae0409d15ed55d99ecbd29bc572
334334
335335 # filter ps results for the jailer's unique id
336- _ , stdout , stderr = utils .check_output (
337- f"ps ax -o cmd -ww | grep { self .jailer .jailer_id } "
336+ _ , stdout , stderr = utils .run_cmd (
337+ f"ps ax -o pid, cmd -ww | grep { self .jailer .jailer_id } "
338338 )
339+
340+ assert not stderr , f"error querying processes using `ps`: { stderr } "
341+
342+ offenders = []
343+ for proc in stdout .splitlines ():
344+ _ , cmd = proc .lower ().split (maxsplit = 1 )
345+ if "firecracker" in proc and not cmd .startswith ("screen" ):
346+ offenders .append (proc )
347+
339348 # make sure firecracker was killed
340- assert (
341- stderr == "" and "firecracker" not in stdout
342- ), f"Firecracker reported its pid { self .firecracker_pid } , which was killed, but there still exist processes using the supposedly dead Firecracker's jailer_id: { stdout } "
349+ assert not offenders , (
350+ f"Firecracker reported its pid { self .firecracker_pid } , which was killed, but there still exist processes using the supposedly dead Firecracker's jailer_id: \n "
351+ + "\n " .join (offenders )
352+ )
343353
344354 if self .uffd_handler and self .uffd_handler .is_running ():
345355 self .uffd_handler .kill ()
@@ -605,7 +615,12 @@ def spawn(
605615 # pylint: disable=subprocess-run-check
606616 # pylint: disable=too-many-branches
607617 self .jailer .setup ()
608- self .api = Api (self .jailer .api_socket_path ())
618+ self .api = Api (
619+ self .jailer .api_socket_path (),
620+ on_error = lambda verb , uri , err_msg : self ._dump_debug_information (
621+ f"Error during { verb } { uri } : { err_msg } "
622+ ),
623+ )
609624
610625 if log_file is not None :
611626 self .log_file = Path (self .path ) / log_file
@@ -673,21 +688,33 @@ def spawn(
673688 if emit_metrics :
674689 self .monitors .append (FCMetricsMonitor (self ))
675690
676- # Wait for the jailer to create resources needed, and Firecracker to
677- # create its API socket.
678- # We expect the jailer to start within 80 ms. However, we wait for
679- # 1 sec since we are rechecking the existence of the socket 5 times
680- # and leave 0.2 delay between them.
681- if "no-api" not in self .jailer .extra_args :
682- self ._wait_create ()
691+ # Ensure Firecracker is in as good a state as possible with respect to guest
692+ # responsiveness / API availability.
693+ # If we are using a config file and it has a network device specified,
694+ # use SSH to wait until guest userspace is available. If we are
695+ # using the API, wait until the log message indicating the API server
696+ # has finished initializing is printed (if logging is enabled), or
697+ # until the API socket file has been created.
698+ # If none of these apply, do a last ditch effort to make sure the
699+ # Firecracker process itself at least came up by checking
700+ # for the startup log message. Otherwise, you're on your own, kid.
683701 if "config-file" in self .jailer .extra_args and self .iface :
684702 self .wait_for_ssh_up ()
685- if self .log_file and log_level in ("Trace" , "Debug" , "Info" ):
703+ elif "no-api" not in self .jailer .extra_args :
704+ if self .log_file and log_level in ("Trace" , "Debug" , "Info" ):
705+ self .check_log_message ("API server started." )
706+ else :
707+ self ._wait_for_api_socket ()
708+ elif self .log_file and log_level in ("Trace" , "Debug" , "Info" ):
686709 self .check_log_message ("Running Firecracker" )
687710
688711 @retry (wait = wait_fixed (0.2 ), stop = stop_after_attempt (5 ), reraise = True )
689- def _wait_create (self ):
712+ def _wait_for_api_socket (self ):
690713 """Wait until the API socket and chroot folder are available."""
714+
715+ # We expect the jailer to start within 80 ms. However, we wait for
716+ # 1 sec since we are rechecking the existence of the socket 5 times
717+ # and leave 0.2 delay between them.
691718 os .stat (self .jailer .api_socket_path ())
692719
693720 @retry (wait = wait_fixed (0.2 ), stop = stop_after_attempt (5 ), reraise = True )
@@ -1086,10 +1113,12 @@ def thread_backtraces(self):
10861113 backtraces = []
10871114 for thread_name , thread_pids in utils .get_threads (self .firecracker_pid ).items ():
10881115 for pid in thread_pids :
1089- backtraces .append (
1090- f"{ thread_name } ({ pid = } ):\n "
1091- f"{ utils .check_output (f'cat /proc/{ pid } /stack' ).stdout } "
1092- )
1116+ try :
1117+ stack = Path (f"/proc/{ pid } /stack" ).read_text ("UTF-8" )
1118+ except FileNotFoundError :
1119+ continue # process might've gone away between get_threads() call and here
1120+
1121+ backtraces .append (f"{ thread_name } ({ pid = } ):\n { stack } " )
10931122 return "\n " .join (backtraces )
10941123
10951124 def _dump_debug_information (self , what : str ):
0 commit comments