+# Launches an Isaac Lab Reach/Franka training job on Nebius AI serverless from OSMO.
+# Mounts shared storage for logs/checkpoints, then waits for completion and surfaces failures.
 workflow:
   name: osmo-isaac-reachy-rl
   pool: default
@@ -6,7 +8,6 @@ workflow:
   nebius-cli:
     cpu: 1
     memory: 2Gi
-    platform: L40S
 
   tasks:
     - name: trigger-reachy-training
@@ -16,9 +17,9 @@ workflow:
       environment:
         SUBNET_ID: "vpcsubnet-XXX"
         PARENT_ID: "project-XXX"
-        FILESYSTEM_ID: "computefilesystem-XXX"
         NEBIUS_IAM_TOKEN: "NEBIUS_IAM_TOKEN"
         NGC_API_KEY: "nvapi-XXX"
+        FILESYSTEM_ID: "computefilesystem-XXX"
         # RL training parameters
         TASK: "Isaac-Reach-Franka-v0"  # swap for Isaac-Reach-Reachy-v0 once the extension is installed
         NUM_ENVS: "4096"
@@ -43,24 +44,58 @@ workflow:
         mkdir -p /root/.nebius
         printf 'current-profile: default\nprofiles:\n  default:\n    endpoint: api.eu-north1.nebius.cloud:443\n    parent-id: %s\n' \
           "$PARENT_ID" > /root/.nebius/config.yaml
+        mkdir -p {{output}}
 
         echo "--- 3. Launch Isaac Lab training job ---"
-        RUN_NAME="${TASK}-$(date +%s)"
-        JOB_OUTPUT=$(nebius ai job create \
-          --name "reachy-train-$(date +%s)" \
-          --image "nvcr.io/nvidia/isaac-lab:2.3.2" \
-          --registry-username "\$oauthtoken" \
-          --registry-password "$NGC_API_KEY" \
-          --platform "gpu-l40s-d" \
-          --preset "1gpu-16vcpu-96gb" \
-          --volume "$FILESYSTEM_ID:/mnt/storage" \
-          --container-command "bash" \
-          --args "-c '/workspace/isaaclab/isaaclab.sh -p /workspace/isaaclab/scripts/reinforcement_learning/rsl_rl/train.py --task $TASK --headless --num_envs $NUM_ENVS --max_iterations $MAX_ITERATIONS +runner.log_root_path=/mnt/storage/logs 2>&1'" \
-          --timeout "120m" \
-          --subnet-id "$SUBNET_ID" \
-          --parent-id "$PARENT_ID")
+        # The OSMO task is only a CPU launcher pod; the actual serverless
+        # target is chosen by the --platform flag on the Nebius AI job below.
+        if [[ -z "${FILESYSTEM_ID:-}" || "${FILESYSTEM_ID}" == "computefilesystem-XXX" ]]; then
+          echo "FILESYSTEM_ID must be set to a real filestore ID for persistent training outputs." >&2
+          exit 1
+        fi
+
+        LOG_ROOT_PATH="/mnt/storage/logs"
+        RESULTS_PATH="${LOG_ROOT_PATH}/${TASK}/"
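+        # rsl_rl is expected to create timestamped run directories under this
+        # path; adjust RESULTS_PATH if your runner nests its logs differently.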
+        echo "Using filestore for logs/checkpoints: ${FILESYSTEM_ID}"
+        JOB_CMD=(
+          nebius ai job create
+          --name "reachy-train-$(date +%s)"
+          --image "nvcr.io/nvidia/isaac-lab:2.3.2"
+          --registry-username "\$oauthtoken"
+          --registry-password "$NGC_API_KEY"
+          --platform "gpu-h100-sxm"
+          --preset "1gpu-16vcpu-200gb"
+          --volume "${FILESYSTEM_ID}:/mnt/storage"
+        )
+
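+        # Building the command as a bash array keeps each flag/value pair a
+        # single word, so IDs and tokens survive without extra escaping; the
+        # --args value below nests single quotes so the full training command
+        # line reaches the container's bash -c intact.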
+        JOB_CMD+=(
+          --container-command "bash"
+          --args "-c '/workspace/isaaclab/isaaclab.sh -p /workspace/isaaclab/scripts/reinforcement_learning/rsl_rl/train.py --task $TASK --headless --num_envs $NUM_ENVS --max_iterations $MAX_ITERATIONS +runner.log_root_path=${LOG_ROOT_PATH} 2>&1'"
+          --timeout "120m"
+          --subnet-id "$SUBNET_ID"
+          --parent-id "$PARENT_ID"
+        )
+
+        JOB_CREATE_RESPONSE_FILE="{{output}}/job-create-response.txt"
+        JOB_CREATE_ERROR_FILE="{{output}}/job-create-error.txt"
+        JOB_ID_FILE="{{output}}/job-id.txt"
+
+        if ! JOB_OUTPUT=$("${JOB_CMD[@]}" 2>&1); then
+          printf '%s\n' "$JOB_OUTPUT" | tee "$JOB_CREATE_ERROR_FILE"
+          echo "Job create failed before a job ID was returned." | tee -a "$JOB_CREATE_ERROR_FILE"
+          echo "If the plain serverless Isaac workflow succeeds, the likely culprit is the filestore mount: ${FILESYSTEM_ID}:/mnt/storage" | tee -a "$JOB_CREATE_ERROR_FILE"
+          exit 1
+        fi
+
+        printf '%s\n' "$JOB_OUTPUT" > "$JOB_CREATE_RESPONSE_FILE"
         echo "Job create response: $JOB_OUTPUT"
         JOB_ID=$(echo "$JOB_OUTPUT" | grep -oE 'aijob-[a-z0-9]+' | head -1)
+        if [[ -z "$JOB_ID" ]]; then
+          echo "Job create response did not include a job ID." | tee "$JOB_CREATE_ERROR_FILE"
+          printf '%s\n' "$JOB_OUTPUT" | tee -a "$JOB_CREATE_ERROR_FILE"
+          exit 1
+        fi
+        printf '%s\n' "$JOB_ID" > "$JOB_ID_FILE"
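+        # Note: the aijob- pattern above is a best-effort parse of the CLI's
+        # human-readable output; if the CLI can emit machine-readable output,
+        # parsing that instead would be more robust.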
         echo "Submitted training job: $JOB_ID"
 
         echo "--- 4. Wait for training completion ---"
@@ -72,7 +107,7 @@ workflow:
         case "$STATUS" in
           SUCCEEDED|COMPLETED)
             echo "Training completed successfully."
-            echo "Logs and checkpoints saved to: /mnt/storage/logs/$TASK/"
+            echo "Logs and checkpoints path: ${RESULTS_PATH}"
             break ;;
           FAILED|ERROR|CANCELLED)
             echo "Training failed with status: $STATUS"