Skip to content

Commit 16f771d

Browse files
committed
test
Signed-off-by: oliver könig <okoenig@nvidia.com>
1 parent a9f5f31 commit 16f771d

File tree

1 file changed

+24
-10
lines changed

1 file changed

+24
-10
lines changed

nemo_run/core/execution/dgxcloud.py

Lines changed: 24 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -405,21 +405,35 @@ def fetch_logs(
405405
job_subdir = self.job_dir[len(nemo_run_home) + 1 :] # +1 to skip the leading path separator
406406
self.pvc_job_dir = os.path.join(self.pvc_nemo_run_dir, job_subdir)
407407

408-
cmd.extend([glob.glob(f"{self.pvc_job_dir}/logs/output-*.log")])
408+
cmd.extend([f"{self.pvc_job_dir}/logs/output-{i}.log" for i in range(self.nodes)])
409409

410410
logger.info(f"Attempting to stream logs with command: {cmd}")
411411

412412
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, text=True, bufsize=1)
413413

414-
try:
415-
for line in iter(proc.stdout.readline, ""):
416-
if line:
417-
yield line.rstrip("\n")
418-
if proc.poll() is not None:
419-
break
420-
finally:
421-
proc.terminate()
422-
proc.wait(timeout=2)
414+
if stream:
415+
while True:
416+
try:
417+
for line in iter(proc.stdout.readline, ""):
418+
if line:
419+
yield line.rstrip("\n")
420+
if proc.poll() is not None:
421+
break
422+
except Exception as e:
423+
logger.error(f"Error streaming logs: {e}")
424+
time.sleep(3)
425+
continue
426+
427+
else:
428+
try:
429+
for line in iter(proc.stdout.readline, ""):
430+
if line:
431+
yield line.rstrip("\n")
432+
if proc.poll() is not None:
433+
break
434+
finally:
435+
proc.terminate()
436+
proc.wait(timeout=2)
423437

424438
def cancel(self, job_id: str):
425439
# Retrieve the authentication token for the REST calls

0 commit comments

Comments
 (0)