Skip to content

Commit 82ef039

Browse files
Improve error logging for manifest build failures
Add detailed diagnostics when srun/rclone fails:
- Log the full command that was executed
- Capture all SLURM environment variables
- Print SLURM memory vars to stderr for quick debugging
- Include full stdout and stderr in error log

This helps diagnose environment-related failures like SLURM_MEM_* variable conflicts.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent b8f4335 commit 82ef039

File tree

1 file changed

+28
-3
lines changed

1 file changed

+28
-3
lines changed

src/xfer/cli.py

Lines changed: 28 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -263,16 +263,41 @@ def manifest_build(
263263
# Write error details to xfer-err/
264264
err_file = err_dir / f"manifest_build-{run_id}.log"
265265
with err_file.open("w", encoding="utf-8") as ef:
266-
ef.write(f"Exception: {exc}\n")
267-
import traceback
266+
ef.write(f"Exception: {exc}\n\n")
267+
268+
# Log the command that was run
269+
ef.write("--- COMMAND ---\n")
270+
ef.write(" ".join(shlex.quote(c) for c in srun_cmd) + "\n\n")
268271

272+
# Log relevant SLURM environment variables
273+
ef.write("--- SLURM ENVIRONMENT ---\n")
274+
slurm_vars = {k: v for k, v in os.environ.items() if k.startswith("SLURM")}
275+
for k, v in sorted(slurm_vars.items()):
276+
ef.write(f"{k}={v}\n")
277+
ef.write("\n")
278+
279+
import traceback
280+
ef.write("--- TRACEBACK ---\n")
269281
ef.write(traceback.format_exc())
270-
# If subprocess.CalledProcessError, try to write stderr
282+
283+
# If subprocess.CalledProcessError, write stdout and stderr
271284
if hasattr(exc, "stderr") and exc.stderr:
272285
with err_file.open("a", encoding="utf-8") as ef:
273286
ef.write("\n--- STDERR ---\n")
274287
ef.write(str(exc.stderr))
288+
if hasattr(exc, "stdout") and exc.stdout:
289+
with err_file.open("a", encoding="utf-8") as ef:
290+
ef.write("\n--- STDOUT ---\n")
291+
ef.write(str(exc.stdout))
292+
293+
# Also print key info to stderr for visibility in job logs
275294
eprint(f"ERROR: srun/rclone failed, see {err_file}")
295+
eprint(f"Command: {' '.join(shlex.quote(c) for c in srun_cmd)}")
296+
if hasattr(exc, "stderr") and exc.stderr:
297+
eprint(f"stderr: {exc.stderr}")
298+
slurm_mem_vars = {k: v for k, v in os.environ.items() if "MEM" in k and k.startswith("SLURM")}
299+
if slurm_mem_vars:
300+
eprint(f"SLURM memory env vars: {slurm_mem_vars}")
276301
raise
277302

278303
# Build JSONL

0 commit comments

Comments
 (0)