Commit 3663942

Merge pull request #19 from allenai/cross-bench-eval-v2
Cross-benchmark evaluation: DimSpec system + LIBERO/CALVIN reproduction fixes
2 parents 2f1bcd7 + e917ee2 commit 3663942

File tree

238 files changed (+2319 −1170 lines)

Note: large commits have some content hidden by default; two added files below appear without their paths.


.claude/skills/run-evaluation/SKILL.md

Lines changed: 13 additions & 0 deletions

@@ -249,6 +249,19 @@ vla-eval merge -c configs/libero_spatial.yaml -o results/libero_spatial.json
 vla-eval test --all
 ```
 
+### Parallel evaluations of different models
+
+Shard result files are named by benchmark + shard ID (e.g.
+`LIBEROBenchmark_libero_spatial_shard0of10.json`). If two evals use the
+same benchmark config, shard count, and output directory, they will
+collide. The orchestrator prevents this with a file lock — the second
+eval will **fail immediately** with `FileExistsError` rather than
+silently overwriting results.
+
+If you hit this error, either:
+- Use **different output directories** (modify `output_dir` in the config), or
+- Use **different shard counts** (e.g. `--num-shards 10` vs `--num-shards 8`).
+
 ### Troubleshooting
 
 | Problem | Solution |
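The fail-fast file lock described in the SKILL.md addition can be sketched with Python's exclusive-create open mode, one standard way to get exactly this behavior. The orchestrator's real implementation isn't shown in this diff; `reserve_shard` and the `{"status": "running"}` payload are illustrative, only the filename pattern comes from the docs above.

```python
import json
import os
import tempfile

def reserve_shard(output_dir, benchmark, suite, shard, num_shards):
    """Claim a shard result file atomically; fail fast on collision.

    The filename pattern mirrors the docs, e.g.
    LIBEROBenchmark_libero_spatial_shard0of10.json
    """
    path = os.path.join(
        output_dir, f"{benchmark}_{suite}_shard{shard}of{num_shards}.json"
    )
    # mode "x" = exclusive create: raises FileExistsError if the file
    # already exists, instead of silently truncating/overwriting it.
    with open(path, "x") as f:
        json.dump({"status": "running"}, f)
    return path

out_dir = tempfile.mkdtemp()
reserve_shard(out_dir, "LIBEROBenchmark", "libero_spatial", 0, 10)
try:
    # Second eval with the same config, shard count, and output dir:
    reserve_shard(out_dir, "LIBEROBenchmark", "libero_spatial", 0, 10)
except FileExistsError:
    print("collision detected, failing fast")
```

Because `open(..., "x")` is atomic at the filesystem level, two concurrent evals cannot both win the race for the same shard file.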

.dockerignore

Lines changed: 1 addition & 0 deletions

@@ -7,4 +7,5 @@
 !docker/calvin_validation_data/
 !docker/init_states/
 !docker/*_entrypoint.sh
+!docker/*.patch
 

configs/model_servers/cogact/cogact.yaml

Lines changed: 0 additions & 1 deletion

@@ -2,7 +2,6 @@
 # Weight: CogACT/CogACT-Base (HuggingFace)
 # Output: 16 actions × 7-DoF (future_action_window_size=15 → 16 steps)
 #
-# Usage: vla-eval serve --config configs/model_servers/cogact/cogact.yaml
 #
 # Available checkpoints:
 # CogACT/CogACT-Small (action_model_type: DiT-S)

configs/model_servers/groot/groot.yaml

Lines changed: 0 additions & 1 deletion

@@ -2,7 +2,6 @@
 # Weight: nvidia/GR00T-N1.6-3B (HuggingFace)
 # Action chunking enabled (chunk_size=16).
 #
-# Usage: vla-eval serve --config configs/model_servers/groot/groot.yaml
 #
 # Available embodiment_tags for foundation model: GR1, ROBOCASA_PANDA_OMRON, BEHAVIOR_R1_PRO
 # Fine-tuned checkpoints may support: LIBERO_PANDA, OXE_GOOGLE, OXE_WIDOWX, UNITREE_G1
(new file; path hidden in the commit view)

Lines changed: 7 additions & 0 deletions

@@ -0,0 +1,7 @@
+# GR00T N1.6 — SimplerEnv Google Robot (official NVIDIA checkpoint)
+script: "src/vla_eval/model_servers/groot.py"
+args:
+  model_path: nvidia/GR00T-N1.6-fractal
+  embodiment_tag: OXE_GOOGLE
+  chunk_size: 16
+  port: 8000
(new file; path hidden in the commit view)

Lines changed: 9 additions & 0 deletions

@@ -0,0 +1,9 @@
+# GR00T N1.6 — SimplerEnv WidowX (official NVIDIA checkpoint)
+script: "src/vla_eval/model_servers/groot.py"
+args:
+  model_path: nvidia/GR00T-N1.6-bridge
+  embodiment_tag: OXE_WIDOWX
+  image_resolution: 256
+  chunk_size: 16
+  bridge_rotation: true
+  port: 8000

configs/model_servers/oft/libero_spatial.yaml

Lines changed: 0 additions & 1 deletion

@@ -2,7 +2,6 @@
 # Weight: moojink/openvla-7b-oft-finetuned-libero-spatial (HuggingFace)
 # Action chunking enabled (parallel decoding, 26× faster than OpenVLA).
 #
-# Usage: vla-eval serve --config configs/model_servers/oft/libero_spatial.yaml
 extends: _base.yaml
 args:
   pretrained_checkpoint: moojink/openvla-7b-oft-finetuned-libero-spatial
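The `extends: _base.yaml` key above is a config-inheritance mechanism: a child config inherits the base file's settings and overrides the keys it declares. A minimal sketch of one way such a key might be resolved, assuming a shallow merge where child keys win and nested `args` dicts are combined (the merge semantics and the base file's contents here are illustrative assumptions, not vla-eval's actual code):

```python
def resolve_extends(config, load):
    """Merge a config dict onto the base named by its `extends` key.

    `load` maps a filename to a config dict. Child keys win; the
    nested `args` mappings are merged shallowly. Mutates `config`
    by popping the `extends` key.
    """
    base_name = config.pop("extends", None)
    if base_name is None:
        return config  # no inheritance; return as-is
    base = dict(load(base_name))
    merged_args = {**base.get("args", {}), **config.get("args", {})}
    merged = {**base, **config}  # child's top-level keys override
    if merged_args:
        merged["args"] = merged_args
    return merged

# Hypothetical base-file contents for illustration only:
files = {
    "_base.yaml": {
        "script": "src/vla_eval/model_servers/oft.py",  # assumed path
        "args": {"port": 8000},
    },
}
cfg = resolve_extends(
    {"extends": "_base.yaml",
     "args": {"pretrained_checkpoint":
              "moojink/openvla-7b-oft-finetuned-libero-spatial"}},
    files.__getitem__,
)
print(cfg["args"]["port"])  # 8000 — inherited from the base config
```

A shallow merge like this keeps override behavior predictable: per-checkpoint configs only need to state what differs from the base.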

configs/model_servers/openvla/openvla.yaml

Lines changed: 0 additions & 1 deletion

@@ -2,7 +2,6 @@
 # Weight: openvla/openvla-7b (HuggingFace)
 # No action chunking (chunk_size=1), returns 7-dim actions.
 #
-# Usage: vla-eval serve --config configs/model_servers/openvla/openvla.yaml
 #
 # For task-specific unnormalization, set unnorm_key to match the
 # training dataset (e.g. "bridge_orig" for BridgeData V2).

configs/model_servers/pi0/libero.yaml

Lines changed: 0 additions & 1 deletion

@@ -1,7 +1,6 @@
 # π₀.5 model server — LIBERO (direct OpenPI inference)
 # Loads the policy checkpoint directly; no external server needed.
 #
-# Usage: vla-eval serve --config configs/model_servers/pi0/libero.yaml
 #
 # Available config_name values (see openpi repo):
 # pi0_fast_libero, pi05_libero, pi0_fast_droid, pi05_droid, ...

configs/model_servers/pi0/libero_fast.yaml

Lines changed: 0 additions & 1 deletion

@@ -1,6 +1,5 @@
 # π₀-FAST model server — LIBERO (direct OpenPI inference)
 #
-# Usage: vla-eval serve --config configs/model_servers/pi0/libero_fast.yaml
 #
 # Pi0-FAST uses FAST tokenizer — a different, lower-performing variant than Pi0.5.
 # For Pi0.5 (96.85% on LIBERO), use libero.yaml instead.

0 commit comments