|
# Workflow with a single CPU-only task that installs the Nebius CLI, submits
# an Isaac Lab RL training job to Nebius, and polls that job until it reaches
# a terminal state.
#
# NOTE(review): the indentation of this file was reconstructed from a
# whitespace-collapsed copy; confirm the nesting against the workflow schema
# of the consuming tool before use.
workflow:
  name: osmo-isaac-reachy-rl
  pool: default

  resources:
    nebius-cli:
      cpu: 1
      memory: 2Gi
      platform: L40S

  tasks:
    - name: trigger-reachy-training
      image: ubuntu:22.04
      resource: nebius-cli

      # All values below are placeholders and must be replaced before running.
      # NOTE(review): NEBIUS_IAM_TOKEN and NGC_API_KEY are credentials; prefer
      # injecting them from a secret store over committing real values here.
      environment:
        SUBNET_ID: "vpcsubnet-XXX"
        PARENT_ID: "project-XXX"
        FILESYSTEM_ID: "computefilesystem-XXX"
        NEBIUS_IAM_TOKEN: "NEBIUS_IAM_TOKEN"
        NGC_API_KEY: "nvapi-XXX"
        # RL training parameters
        TASK: "Isaac-Reach-Franka-v0"  # swap for Isaac-Reach-Reachy-v0 when extension is installed
        NUM_ENVS: "4096"
        MAX_ITERATIONS: "500"

      command:
        - bash
        - -lc
      args:
        - |
          set -euo pipefail

          echo "--- 1. Install Nebius CLI ---"
          apt-get update && apt-get install -y curl bash jq

          curl -sSL https://storage.eu-north1.nebius.cloud/cli/install.sh | bash

          export PATH="$PATH:/root/.nebius/bin"
          nebius version

          echo "--- 2. Configure Nebius credentials ---"
          mkdir -p /root/.nebius
          # The profile settings must be nested under profiles.default, so the
          # embedded indentation in this format string is significant.
          # NOTE(review): nesting reconstructed from a whitespace-collapsed
          # copy; confirm against the Nebius CLI config format.
          printf 'current-profile: default\nprofiles:\n  default:\n    endpoint: api.eu-north1.nebius.cloud:443\n    parent-id: %s\n' \
            "$PARENT_ID" > /root/.nebius/config.yaml

          echo "--- 3. Launch Isaac Lab training job ---"
          # Command executed inside the training container. It is wrapped in
          # single quotes below, so it must not contain single quotes itself.
          TRAIN_CMD="/workspace/isaaclab/isaaclab.sh -p /workspace/isaaclab/scripts/reinforcement_learning/rsl_rl/train.py --task $TASK --headless --num_envs $NUM_ENVS --max_iterations $MAX_ITERATIONS +runner.log_root_path=/mnt/storage/logs 2>&1"
          JOB_OUTPUT=$(nebius ai job create \
            --name "reachy-train-$(date +%s)" \
            --image "nvcr.io/nvidia/isaac-lab:2.3.2" \
            --registry-username "\$oauthtoken" \
            --registry-password "$NGC_API_KEY" \
            --platform "gpu-l40s-d" \
            --preset "1gpu-16vcpu-96gb" \
            --volume "$FILESYSTEM_ID:/mnt/storage" \
            --container-command "bash" \
            --args "-c '$TRAIN_CMD'" \
            --timeout "120m" \
            --subnet-id "$SUBNET_ID" \
            --parent-id "$PARENT_ID")
          echo "Job create response: $JOB_OUTPUT"
          # `|| true` keeps pipefail/errexit from aborting silently when grep
          # finds no match; the empty result is reported explicitly instead.
          JOB_ID=$(echo "$JOB_OUTPUT" | grep -oE 'aijob-[a-z0-9]+' | head -1 || true)
          if [ -z "$JOB_ID" ]; then
            echo "ERROR: no job ID (aijob-...) found in the create response." >&2
            exit 1
          fi
          echo "Submitted training job: $JOB_ID"

          echo "--- 4. Wait for training completion ---"
          POLL_INTERVAL_SECONDS=60
          # Upper bound on polling so an unrecognised or stuck status cannot
          # loop forever. Defaults to 180 minutes (the job itself is submitted
          # with a 120m timeout); override with POLL_TIMEOUT_MINUTES.
          POLL_DEADLINE=$(( $(date +%s) + ${POLL_TIMEOUT_MINUTES:-180} * 60 ))
          while true; do
            # A failed status query is tolerated and retried on the next
            # iteration instead of aborting the whole workflow.
            JOB_RAW=$(nebius ai job get --id "$JOB_ID" --format json 2>&1 || true)
            STATUS=$(echo "$JOB_RAW" | jq -r '.status.phase // .status.state // empty' 2>/dev/null || true)
            # jq exits 0 with empty output when neither field is present, so
            # the text fallback is keyed on an empty result, not on jq's exit
            # code.
            if [ -z "$STATUS" ]; then
              STATUS=$(echo "$JOB_RAW" | grep -oE '(RUNNING|SUCCEEDED|COMPLETED|FAILED|ERROR|CANCELLED|STARTING|PENDING)' | head -1 || true)
            fi
            echo "Training job status: ${STATUS:-UNKNOWN}"
            case "$STATUS" in
              SUCCEEDED|COMPLETED)
                echo "Training completed successfully."
                echo "Logs and checkpoints saved to: /mnt/storage/logs/$TASK/"
                break ;;
              FAILED|ERROR|CANCELLED)
                echo "Training failed with status: $STATUS"
                # Best effort: a failing log fetch must not mask the exit below.
                # NOTE(review): `job get` takes --id while `job logs` is given
                # the ID positionally; confirm against the Nebius CLI reference.
                nebius ai job logs "$JOB_ID" 2>&1 | tail -50 || true
                exit 1 ;;
              *)
                if [ "$(date +%s)" -ge "$POLL_DEADLINE" ]; then
                  echo "ERROR: timed out waiting for job $JOB_ID (last status: ${STATUS:-UNKNOWN})." >&2
                  exit 1
                fi
                sleep "$POLL_INTERVAL_SECONDS" ;;
            esac
          done
0 commit comments