Skip to content

Commit e131fe5

Browse files
author
Timothy Le
committed
Updated examples with agnostic platforms
1 parent a53b165 commit e131fe5

File tree

2 files changed

+57
-20
lines changed

2 files changed

+57
-20
lines changed

applications/osmo/workflows/osmo/test_isaac_lab_serverless.yaml renamed to applications/osmo/workflows/osmo/test_serverless_isaac.yaml

Lines changed: 5 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,5 @@
1+
# Launches an Isaac Lab job on Nebius AI serverless from an OSMO CPU-only helper pod.
2+
# Waits for the remote job to finish and reports the final serverless status in the task logs.
13
workflow:
24
name: osmo-isaac-rl-launcher
35
pool: default
@@ -6,7 +8,6 @@ workflow:
68
nebius-cli:
79
cpu: 1
810
memory: 2Gi
9-
platform: L40S
1011

1112
tasks:
1213
- name: trigger-isaac-l40s-job
@@ -18,7 +19,6 @@ workflow:
1819
PARENT_ID: "project-XXX"
1920
NEBIUS_IAM_TOKEN: "<nebius iam get-access-token result>"
2021
NGC_API_KEY: "nvapi-XXX"
21-
2222
command:
2323
- bash
2424
- -lc
@@ -40,8 +40,10 @@ workflow:
4040
"$PARENT_ID" > /root/.nebius/config.yaml
4141
4242
echo "--- 3. Launch Isaac Lab L40S job ---"
43+
# The OSMO task is just a CPU launcher pod. The actual serverless
44+
# target is selected by the Nebius AI job platform below.
4345
JOB_OUTPUT=$(nebius ai job create \
44-
--name "isaac-reachy-$(date +%s)" \
46+
--name "isaac-serverless-$(date +%s)" \
4547
--image "nvcr.io/nvidia/isaac-lab:2.3.2" \
4648
--registry-username "\$oauthtoken" \
4749
--registry-password "$NGC_API_KEY" \

applications/osmo/workflows/osmo/test_isaac_reachy.yaml renamed to applications/osmo/workflows/osmo/test_serverless_isaac_reachy.yaml

Lines changed: 52 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
# Launches an Isaac Lab Reach/Franka training job on Nebius AI serverless from OSMO.
2+
# Mounts shared storage for logs/checkpoints, then waits for completion and surfaces failures.
13
workflow:
24
name: osmo-isaac-reachy-rl
35
pool: default
@@ -6,7 +8,6 @@ workflow:
68
nebius-cli:
79
cpu: 1
810
memory: 2Gi
9-
platform: L40S
1011

1112
tasks:
1213
- name: trigger-reachy-training
@@ -16,9 +17,9 @@ workflow:
1617
environment:
1718
SUBNET_ID: "vpcsubnet-XXX"
1819
PARENT_ID: "project-XXX"
19-
FILESYSTEM_ID: "computefilesystem-XXX"
2020
NEBIUS_IAM_TOKEN: "NEBIUS_IAM_TOKEN"
2121
NGC_API_KEY: "nvapi-XXX"
22+
FILESYSTEM_ID: "computefilesystem-XXX"
2223
# RL training parameters
2324
TASK: "Isaac-Reach-Franka-v0" # swap for Isaac-Reach-Reachy-v0 when extension is installed
2425
NUM_ENVS: "4096"
@@ -43,24 +44,58 @@ workflow:
4344
mkdir -p /root/.nebius
4445
printf 'current-profile: default\nprofiles:\n default:\n endpoint: api.eu-north1.nebius.cloud:443\n parent-id: %s\n' \
4546
"$PARENT_ID" > /root/.nebius/config.yaml
47+
mkdir -p {{output}}
4648
4749
echo "--- 3. Launch Isaac Lab training job ---"
48-
RUN_NAME="${TASK}-$(date +%s)"
49-
JOB_OUTPUT=$(nebius ai job create \
50-
--name "reachy-train-$(date +%s)" \
51-
--image "nvcr.io/nvidia/isaac-lab:2.3.2" \
52-
--registry-username "\$oauthtoken" \
53-
--registry-password "$NGC_API_KEY" \
54-
--platform "gpu-l40s-d" \
55-
--preset "1gpu-16vcpu-96gb" \
56-
--volume "$FILESYSTEM_ID:/mnt/storage" \
57-
--container-command "bash" \
58-
--args "-c '/workspace/isaaclab/isaaclab.sh -p /workspace/isaaclab/scripts/reinforcement_learning/rsl_rl/train.py --task $TASK --headless --num_envs $NUM_ENVS --max_iterations $MAX_ITERATIONS +runner.log_root_path=/mnt/storage/logs 2>&1'" \
59-
--timeout "120m" \
60-
--subnet-id "$SUBNET_ID" \
61-
--parent-id "$PARENT_ID")
50+
# The OSMO task is just a CPU launcher pod. The actual serverless
51+
# target is selected by the Nebius AI job platform below.
52+
if [[ -z "${FILESYSTEM_ID:-}" || "${FILESYSTEM_ID}" == "computefilesystem-XXX" ]]; then
53+
echo "FILESYSTEM_ID must be set to a real filestore ID for persistent training outputs." >&2
54+
exit 1
55+
fi
56+
57+
LOG_ROOT_PATH="/mnt/storage/logs"
58+
RESULTS_PATH="${LOG_ROOT_PATH}/${TASK}/"
59+
echo "Using filestore for logs/checkpoints: ${FILESYSTEM_ID}"
60+
JOB_CMD=(
61+
nebius ai job create
62+
--name "reachy-train-$(date +%s)"
63+
--image "nvcr.io/nvidia/isaac-lab:2.3.2"
64+
--registry-username "\$oauthtoken"
65+
--registry-password "$NGC_API_KEY"
66+
--platform "gpu-h100-sxm"
67+
--preset "1gpu-16vcpu-200gb"
68+
--volume "${FILESYSTEM_ID}:/mnt/storage"
69+
)
70+
71+
JOB_CMD+=(
72+
--container-command "bash"
73+
--args "-c '/workspace/isaaclab/isaaclab.sh -p /workspace/isaaclab/scripts/reinforcement_learning/rsl_rl/train.py --task $TASK --headless --num_envs $NUM_ENVS --max_iterations $MAX_ITERATIONS +runner.log_root_path=${LOG_ROOT_PATH} 2>&1'"
74+
--timeout "120m"
75+
--subnet-id "$SUBNET_ID"
76+
--parent-id "$PARENT_ID"
77+
)
78+
79+
JOB_CREATE_RESPONSE_FILE="{{output}}/job-create-response.txt"
80+
JOB_CREATE_ERROR_FILE="{{output}}/job-create-error.txt"
81+
JOB_ID_FILE="{{output}}/job-id.txt"
82+
83+
if ! JOB_OUTPUT=$("${JOB_CMD[@]}" 2>&1); then
84+
printf '%s\n' "$JOB_OUTPUT" | tee "$JOB_CREATE_ERROR_FILE"
85+
echo "Job create failed before a job ID was returned." | tee -a "$JOB_CREATE_ERROR_FILE"
86+
echo "If the plain serverless Isaac workflow succeeds, the likely culprit is the filestore mount: ${FILESYSTEM_ID}:/mnt/storage" | tee -a "$JOB_CREATE_ERROR_FILE"
87+
exit 1
88+
fi
89+
90+
printf '%s\n' "$JOB_OUTPUT" > "$JOB_CREATE_RESPONSE_FILE"
6291
echo "Job create response: $JOB_OUTPUT"
6392
JOB_ID=$(echo "$JOB_OUTPUT" | grep -oE 'aijob-[a-z0-9]+' | head -1)
93+
if [[ -z "$JOB_ID" ]]; then
94+
echo "Job create response did not include a job ID." | tee "$JOB_CREATE_ERROR_FILE"
95+
printf '%s\n' "$JOB_OUTPUT" | tee -a "$JOB_CREATE_ERROR_FILE"
96+
exit 1
97+
fi
98+
printf '%s\n' "$JOB_ID" > "$JOB_ID_FILE"
6499
echo "Submitted training job: $JOB_ID"
65100
66101
echo "--- 4. Wait for training completion ---"
@@ -72,7 +107,7 @@ workflow:
72107
case "$STATUS" in
73108
SUCCEEDED|COMPLETED)
74109
echo "Training completed successfully."
75-
echo "Logs and checkpoints saved to: /mnt/storage/logs/$TASK/"
110+
echo "Logs and checkpoints path: ${RESULTS_PATH}"
76111
break ;;
77112
FAILED|ERROR|CANCELLED)
78113
echo "Training failed with status: $STATUS"

0 commit comments

Comments
 (0)