1 file changed: +24 -10 lines changed
Original file line number | Diff line number | Diff line change
@@ -405,21 +405,35 @@ def fetch_logs(
405405 job_subdir = self .job_dir [len (nemo_run_home ) + 1 :] # +1 to remove the leading path separator
406406 self .pvc_job_dir = os .path .join (self .pvc_nemo_run_dir , job_subdir )
407407
408- cmd .extend ([glob . glob ( f"{ self .pvc_job_dir } /logs/output-* .log" )])
408+ cmd .extend ([f"{ self .pvc_job_dir } /logs/output-{ i } .log" for i in range ( self . nodes )])
409409
410410 logger .info (f"Attempting to stream logs with command: { cmd } " )
411411
412412 proc = subprocess .Popen (cmd , stdout = subprocess .PIPE , text = True , bufsize = 1 )
413413
414- try :
415- for line in iter (proc .stdout .readline , "" ):
416- if line :
417- yield line .rstrip ("\n " )
418- if proc .poll () is not None :
419- break
420- finally :
421- proc .terminate ()
422- proc .wait (timeout = 2 )
414+ if stream :
415+ while True :
416+ try :
417+ for line in iter (proc .stdout .readline , "" ):
418+ if line :
419+ yield line .rstrip ("\n " )
420+ if proc .poll () is not None :
421+ break
422+ except Exception as e :
423+ logger .error (f"Error streaming logs: { e } " )
424+ time .sleep (3 )
425+ continue
426+
427+ else :
428+ try :
429+ for line in iter (proc .stdout .readline , "" ):
430+ if line :
431+ yield line .rstrip ("\n " )
432+ if proc .poll () is not None :
433+ break
434+ finally :
435+ proc .terminate ()
436+ proc .wait (timeout = 2 )
423437
424438 def cancel (self , job_id : str ):
425439 # Retrieve the authentication token for the REST calls
You can’t perform that action at this time.
0 commit comments