Skip to content

Commit 1b030d3

Browse files
authored
Merge pull request #873 from nebius/osmo-workflow-example
OSMO workflow examples using serverless
2 parents 62c845a + e131fe5 commit 1b030d3

File tree

2 files changed

+190
-0
lines changed

2 files changed

+190
-0
lines changed
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
# Launches an Isaac Lab job on Nebius AI serverless from an OSMO CPU-only helper pod.
2+
# Waits for the remote job to finish and reports the final serverless status in the task logs.
3+
workflow:
4+
name: osmo-isaac-rl-launcher
5+
pool: default
6+
7+
resources:
8+
nebius-cli:
9+
cpu: 1
10+
memory: 2Gi
11+
12+
tasks:
13+
- name: trigger-isaac-l40s-job
14+
image: ubuntu:22.04
15+
resource: nebius-cli
16+
17+
environment:
18+
SUBNET_ID: "vpcsubnet-XXX"
19+
PARENT_ID: "project-XXX"
20+
NEBIUS_IAM_TOKEN: "<nebius iam get-access-token result>"
21+
NGC_API_KEY: "nvapi-XXX"
22+
command:
23+
- bash
24+
- -lc
25+
args:
26+
- |
27+
set -euo pipefail
28+
29+
echo "--- 1. Install Nebius CLI ---"
30+
apt-get update && apt-get install -y curl bash jq
31+
32+
# -f makes curl exit non-zero on an HTTP error instead of piping the error
# page into bash; with `pipefail` set above, that aborts the task immediately.
curl -fsSL https://storage.eu-north1.nebius.cloud/cli/install.sh | bash
33+
34+
export PATH="$PATH:/root/.nebius/bin"
35+
nebius version
36+
37+
echo "--- 2. Configure Nebius credentials ---"
38+
mkdir -p /root/.nebius
39+
# Write a minimal CLI profile. `default` must be nested under `profiles`, and
# `endpoint` / `parent-id` under `default`, for the YAML to describe a profile.
printf 'current-profile: default\nprofiles:\n  default:\n    endpoint: api.eu-north1.nebius.cloud:443\n    parent-id: %s\n' \
  "$PARENT_ID" > /root/.nebius/config.yaml
41+
42+
echo "--- 3. Launch Isaac Lab L40S job ---"
43+
# The OSMO task is just a CPU launcher pod. The actual serverless
44+
# target is selected by the Nebius AI job platform below.
45+
JOB_OUTPUT=$(nebius ai job create \
46+
--name "isaac-serverless-$(date +%s)" \
47+
--image "nvcr.io/nvidia/isaac-lab:2.3.2" \
48+
--registry-username "\$oauthtoken" \
49+
--registry-password "$NGC_API_KEY" \
50+
--platform "gpu-l40s-d" \
51+
--preset "1gpu-16vcpu-96gb" \
52+
--container-command "bash" \
53+
--args "-c '/workspace/isaaclab/isaaclab.sh -p -c \"import isaaclab; print(isaaclab.__version__)\"'" \
54+
--timeout "60m" \
55+
--subnet-id "$SUBNET_ID" \
56+
--parent-id "$PARENT_ID")
57+
echo "Job create response: $JOB_OUTPUT"
58+
# `|| true` keeps a no-match grep (non-zero under `pipefail`) from aborting the
# script silently, so the explicit check below can report the problem.
JOB_ID=$(echo "$JOB_OUTPUT" | grep -oP 'aijob-\w+' | head -1 || true)
if [[ -z "$JOB_ID" ]]; then
  echo "Job create response did not include a job ID." >&2
  exit 1
fi
59+
echo "Submitted job: $JOB_ID"
60+
61+
echo "--- 4. Wait for job completion ---"
62+
# Poll the job every 30s until it reaches a terminal state.
# A failed lookup is reported and retried (up to 5 times in a row) rather than
# aborting via errexit with the error text hidden inside JOB_RAW.
lookup_failures=0
while true; do
  if ! JOB_RAW=$(nebius ai job get --id "$JOB_ID" --format json 2>&1); then
    lookup_failures=$((lookup_failures + 1))
    echo "Job lookup failed (${lookup_failures}/5): $JOB_RAW" >&2
    if (( lookup_failures >= 5 )); then
      echo "Giving up after 5 consecutive job lookup failures." >&2
      exit 1
    fi
    sleep 30
    continue
  fi
  lookup_failures=0

  # Prefer the structured status; fall back to scanning the raw text only when
  # jq cannot parse it. `|| true` keeps a no-match grep from tripping pipefail.
  if ! STATUS=$(printf '%s\n' "$JOB_RAW" | jq -r '.status.phase // .status.state // empty' 2>/dev/null); then
    STATUS=$(printf '%s\n' "$JOB_RAW" \
      | grep -oE 'RUNNING|SUCCEEDED|COMPLETED|FAILED|ERROR|CANCELLED|STARTING|PENDING' \
      | head -1 || true)
  fi
  echo "Job status: $STATUS"

  case "$STATUS" in
    SUCCEEDED|COMPLETED)
      echo "Isaac Lab job completed successfully."
      break ;;
    FAILED|ERROR|CANCELLED)
      echo "Isaac Lab job failed with status: $STATUS"
      exit 1 ;;
    *)
      sleep 30 ;;
  esac
done
Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
# Launches an Isaac Lab Reach/Franka training job on Nebius AI serverless from OSMO.
2+
# Mounts shared storage for logs/checkpoints, then waits for completion and surfaces failures.
3+
workflow:
4+
name: osmo-isaac-reachy-rl
5+
pool: default
6+
7+
resources:
8+
nebius-cli:
9+
cpu: 1
10+
memory: 2Gi
11+
12+
tasks:
13+
- name: trigger-reachy-training
14+
image: ubuntu:22.04
15+
resource: nebius-cli
16+
17+
environment:
18+
SUBNET_ID: "vpcsubnet-XXX"
19+
PARENT_ID: "project-XXX"
20+
# Placeholder: replace with the output of `nebius iam get-access-token`; do not commit a real token.
NEBIUS_IAM_TOKEN: "<nebius iam get-access-token result>"
21+
NGC_API_KEY: "nvapi-XXX"
22+
FILESYSTEM_ID: "computefilesystem-XXX"
23+
# RL training parameters
24+
TASK: "Isaac-Reach-Franka-v0" # swap for Isaac-Reach-Reachy-v0 when extension is installed
25+
NUM_ENVS: "4096"
26+
MAX_ITERATIONS: "500"
27+
28+
command:
29+
- bash
30+
- -lc
31+
args:
32+
- |
33+
set -euo pipefail
34+
35+
echo "--- 1. Install Nebius CLI ---"
36+
apt-get update && apt-get install -y curl bash jq
37+
38+
# -f makes curl exit non-zero on an HTTP error instead of piping the error
# page into bash; with `pipefail` set above, that aborts the task immediately.
curl -fsSL https://storage.eu-north1.nebius.cloud/cli/install.sh | bash
39+
40+
export PATH="$PATH:/root/.nebius/bin"
41+
nebius version
42+
43+
echo "--- 2. Configure Nebius credentials ---"
44+
mkdir -p /root/.nebius
45+
# Write a minimal CLI profile. `default` must be nested under `profiles`, and
# `endpoint` / `parent-id` under `default`, for the YAML to describe a profile.
printf 'current-profile: default\nprofiles:\n  default:\n    endpoint: api.eu-north1.nebius.cloud:443\n    parent-id: %s\n' \
  "$PARENT_ID" > /root/.nebius/config.yaml
47+
mkdir -p {{output}}
48+
49+
echo "--- 3. Launch Isaac Lab training job ---"
50+
# The OSMO task is just a CPU launcher pod. The actual serverless
51+
# target is selected by the Nebius AI job platform below.
52+
if [[ -z "${FILESYSTEM_ID:-}" || "${FILESYSTEM_ID}" == "computefilesystem-XXX" ]]; then
53+
echo "FILESYSTEM_ID must be set to a real filestore ID for persistent training outputs." >&2
54+
exit 1
55+
fi
56+
57+
LOG_ROOT_PATH="/mnt/storage/logs"
58+
RESULTS_PATH="${LOG_ROOT_PATH}/${TASK}/"
59+
echo "Using filestore for logs/checkpoints: ${FILESYSTEM_ID}"
60+
JOB_CMD=(
61+
nebius ai job create
62+
--name "reachy-train-$(date +%s)"
63+
--image "nvcr.io/nvidia/isaac-lab:2.3.2"
64+
--registry-username "\$oauthtoken"
65+
--registry-password "$NGC_API_KEY"
66+
--platform "gpu-h100-sxm"
67+
--preset "1gpu-16vcpu-200gb"
68+
--volume "${FILESYSTEM_ID}:/mnt/storage"
69+
)
70+
71+
JOB_CMD+=(
72+
--container-command "bash"
73+
--args "-c '/workspace/isaaclab/isaaclab.sh -p /workspace/isaaclab/scripts/reinforcement_learning/rsl_rl/train.py --task $TASK --headless --num_envs $NUM_ENVS --max_iterations $MAX_ITERATIONS +runner.log_root_path=${LOG_ROOT_PATH} 2>&1'"
74+
--timeout "120m"
75+
--subnet-id "$SUBNET_ID"
76+
--parent-id "$PARENT_ID"
77+
)
78+
79+
JOB_CREATE_RESPONSE_FILE="{{output}}/job-create-response.txt"
80+
JOB_CREATE_ERROR_FILE="{{output}}/job-create-error.txt"
81+
JOB_ID_FILE="{{output}}/job-id.txt"
82+
83+
if ! JOB_OUTPUT=$("${JOB_CMD[@]}" 2>&1); then
84+
printf '%s\n' "$JOB_OUTPUT" | tee "$JOB_CREATE_ERROR_FILE"
85+
echo "Job create failed before a job ID was returned." | tee -a "$JOB_CREATE_ERROR_FILE"
86+
echo "If the plain serverless Isaac workflow succeeds, the likely culprit is the filestore mount: ${FILESYSTEM_ID}:/mnt/storage" | tee -a "$JOB_CREATE_ERROR_FILE"
87+
exit 1
88+
fi
89+
90+
printf '%s\n' "$JOB_OUTPUT" > "$JOB_CREATE_RESPONSE_FILE"
91+
echo "Job create response: $JOB_OUTPUT"
92+
# `|| true`: under `set -euo pipefail` a no-match grep would otherwise abort
# here, making the empty-ID diagnostic that follows unreachable.
JOB_ID=$(echo "$JOB_OUTPUT" | grep -oE 'aijob-[a-z0-9]+' | head -1 || true)
93+
if [[ -z "$JOB_ID" ]]; then
94+
echo "Job create response did not include a job ID." | tee "$JOB_CREATE_ERROR_FILE"
95+
printf '%s\n' "$JOB_OUTPUT" | tee -a "$JOB_CREATE_ERROR_FILE"
96+
exit 1
97+
fi
98+
printf '%s\n' "$JOB_ID" > "$JOB_ID_FILE"
99+
echo "Submitted training job: $JOB_ID"
100+
101+
echo "--- 4. Wait for training completion ---"
102+
# Poll the training job every 60s until it reaches a terminal state.
# A failed lookup is reported and retried (up to 5 times in a row) rather than
# aborting via errexit with the error text hidden inside JOB_RAW.
lookup_failures=0
while true; do
  if ! JOB_RAW=$(nebius ai job get --id "$JOB_ID" --format json 2>&1); then
    lookup_failures=$((lookup_failures + 1))
    echo "Job lookup failed (${lookup_failures}/5): $JOB_RAW" >&2
    if (( lookup_failures >= 5 )); then
      echo "Giving up after 5 consecutive job lookup failures." >&2
      exit 1
    fi
    sleep 60
    continue
  fi
  lookup_failures=0

  # Prefer the structured status; fall back to scanning the raw text only when
  # jq cannot parse it. `|| true` keeps a no-match grep from tripping pipefail.
  if ! STATUS=$(printf '%s\n' "$JOB_RAW" | jq -r '.status.phase // .status.state // empty' 2>/dev/null); then
    STATUS=$(printf '%s\n' "$JOB_RAW" \
      | grep -oE 'RUNNING|SUCCEEDED|COMPLETED|FAILED|ERROR|CANCELLED|STARTING|PENDING' \
      | head -1 || true)
  fi
  echo "Training job status: $STATUS"

  case "$STATUS" in
    SUCCEEDED|COMPLETED)
      echo "Training completed successfully."
      echo "Logs and checkpoints path: ${RESULTS_PATH}"
      break ;;
    FAILED|ERROR|CANCELLED)
      echo "Training failed with status: $STATUS"
      # Best-effort log tail: a failing `logs` call must not replace the exit
      # status reported for the failed job.
      nebius ai job logs "$JOB_ID" 2>&1 | tail -50 || true
      exit 1 ;;
    *)
      sleep 60 ;;
  esac
done

0 commit comments

Comments
 (0)