diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json new file mode 100644 index 00000000..9e64b618 --- /dev/null +++ b/.devcontainer/devcontainer.json @@ -0,0 +1,32 @@ +{ + // docker build . -t simpler-env + "image": "simpler_env", + "runArgs": [ + "--name=simpler_env-ynn", + "--net=host", + "--ipc=host", + "--gpus", + "all", + "-e", + "NVIDIA_DRIVER_CAPABILITIES=all", + "--privileged" + ], + "mounts": [ + "source=/home/takanori.yoshimoto/code/SimplerEnv,target=/app/SimplerEnv,type=bind", + "source=/home/takanori.yoshimoto/code/hsr_openpi/checkpoints,target=/data/checkpoints,type=bind" + ], + "workspaceFolder": "/app/SimplerEnv", + "customizations": { + "vscode": { + "extensions": [ + "ms-python.python", + "ms-python.vscode-pylance", + "ms-python.black-formatter", + "ms-python.isort", + "ms-azuretools.vscode-docker", + "github.copilot", + "github.vscode-pull-request-github" + ] + } + } +} \ No newline at end of file diff --git a/.gitignore b/.gitignore index dfb15106..d8002eb9 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,9 @@ __pycache__ _ext *.pyc *.so +.venv/ +results/ +checkpoints/ build/ dist/ *.egg-info/ diff --git a/.gitmodules b/.gitmodules index 046394c1..c8ba23d8 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,9 @@ [submodule "ManiSkill2_real2sim"] path = ManiSkill2_real2sim - url = https://github.com/simpler-env/ManiSkill2_real2sim + url = https://github.com/airoa-org/ManiSkill2_real2sim.git + # url = https://github.com/allenzren/ManiSkill2_real2sim.git + # url = https://github.com/simpler-env/ManiSkill2_real2sim +[submodule "Isaac-GR00T"] + path = Isaac-GR00T + url = https://github.com/NVIDIA/Isaac-GR00T.git + branch = main diff --git a/Isaac-GR00T b/Isaac-GR00T new file mode 160000 index 00000000..4ea96a16 --- /dev/null +++ b/Isaac-GR00T @@ -0,0 +1 @@ +Subproject commit 4ea96a16b15cfdbbd787b6b4f519a12687281330 diff --git a/ManiSkill2_real2sim b/ManiSkill2_real2sim index ef7a4d4f..024cfe15 160000 --- a/ManiSkill2_real2sim +++ b/ManiSkill2_real2sim @@ -1 +1 @@ -Subproject commit ef7a4d4fdf4b69f2c2154db5b15b9ac8dfe10682 +Subproject commit 024cfe1582c91e0a9f3436e07bd87dd167013cea diff --git a/README.md b/README.md index d46213f7..9228f400 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# SimplerEnv: Simulated Manipulation Policy Evaluation Environments for Real Robot Setups +# SimplerEnv: Simulated Manipulation Policy Evaluation Environments for Real Robot Setups (Multi-model Support 🔥) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/simpler-env/SimplerEnv/blob/main/example.ipynb) @@ -6,7 +6,7 @@ Significant progress has been made in building generalist robot manipulation policies, yet their scalable and reproducible evaluation remains challenging, as real-world evaluation is operationally expensive and inefficient. We propose employing physical simulators as efficient, scalable, and informative complements to real-world evaluations. These simulation evaluations offer valuable quantitative metrics for checkpoint selection, insights into potential real-world policy behaviors or failure modes, and standardized setups to enhance reproducibility. -This repository's code is based in the [SAPIEN](https://sapien.ucsd.edu/) simulator and the CPU based [ManiSkill2](https://maniskill2.github.io/) benchmark. We have also integrated the Bridge dataset environments into ManiSkill3, which offers GPU parallelization and can run 10-15x faster than the ManiSkill2 version. 
For instructions on how to use the GPU parallelized environments and evaluate policies on them, see: https://github.com/simpler-env/SimplerEnv/tree/maniskill3
+This repository is based on the [SAPIEN](https://sapien.ucsd.edu/) simulator and the [ManiSkill2](https://maniskill2.github.io/) benchmark (we will also integrate the evaluation envs into ManiSkill3 once it is complete).

 This repository encompasses 2 real-to-sim evaluation setups:
 - `Visual Matching` evaluation: Matching real & sim visual appearances for policy evaluation by overlaying real-world images onto simulation backgrounds and adjusting foreground object and robot textures in simulation.
@@ -23,12 +23,45 @@ We hope that our work guides and inspires future real-to-sim evaluation efforts.
 - [Code Structure](#code-structure)
 - [Adding New Policies](#adding-new-policies)
 - [Adding New Real-to-Sim Evaluation Environments and Robots](#adding-new-real-to-sim-evaluation-environments-and-robots)
-  - [Full Installation (RT-1 and Octo Inference, Env Building)](#full-installation-rt-1-and-octo-inference-env-building)
+  - [Full Installation (RT-1, Octo, OpenVLA Inference, Env Building)](#full-installation-rt-1-octo-openvla-inference-env-building)
   - [RT-1 Inference Setup](#rt-1-inference-setup)
   - [Octo Inference Setup](#octo-inference-setup)
+  - [OpenVLA Inference Setup](#openvla-inference-setup)
 - [Troubleshooting](#troubleshooting)
 - [Citation](#citation)

+## Benchmark @ GoogleSheets
+> [!TIP]
+> We maintain a public Google Sheet documenting the latest SOTA models' performance and fine-tuned weights on SimplerEnv, making community benchmarking more accessible. Contributions and updates are welcome!
+>
+> [simpler env benchmark @ GoogleSheets 📊](https://docs.google.com/spreadsheets/d/1cLhEW9QnVkP4rqxsFVzdBVRyBWVdSm0d5zp1L_-QJx4/edit?usp=sharing)
+
+
+
+## Models
+> [!NOTE]
+> Hello everyone!
+> Issues and discussions are now fully open on this repository. We warmly welcome you to: 🤗
+> Discuss any problems you encounter 🙋
+> Submit fixes ✅
+> Support new models! 🚀
+> Given the significant differences in environments across models and the specific dependencies required for simulator rendering, I will soon provide a Docker solution and a benchmark performance table. I'll also do my best to address any issues you run into.
+> Thank you for your support and contributions! 🎉
+>
+> To support state input, we use the submodule `ManiSkill2_real2sim` from https://github.com/allenzren/ManiSkill2_real2sim
+
+| Model Name | Support | Note |
+| ----------- | ----- | ----- |
+| Octo | ✅ | |
+| RT1 | ✅ | |
+| OpenVLA | ✅ | |
+| CogACT | ✅ | OpenVLA-based |
+| SpatialVLA | ✅ | [transformers == 4.47.0](https://github.com/SpatialVLA/SpatialVLA) |
+| Pi0/Pi0-Fast (openpi version) | ✅ | [openpi](https://github.com/Physical-Intelligence/openpi) |
+| Pi0/Pi0-Fast (lerobot version) | ✅ | [lerobot](https://github.com/huggingface/lerobot) |
+| GR00T | ✅ | [Isaac-GR00T](https://github.com/NVIDIA/Isaac-GR00T) |
+

 ## Getting Started

@@ -71,13 +104,13 @@ Prerequisites:
 Create an anaconda environment:
 ```
-conda create -n simpler_env python=3.10 (3.10 or 3.11)
+conda create -n simpler_env python=3.10 (3.10 or 3.11; `pyproject.toml` requires Python <3.12)
 conda activate simpler_env
 ```

 Clone this repo:
 ```
-git clone https://github.com/simpler-env/SimplerEnv --recurse-submodules
+git clone https://github.com/simpler-env/SimplerEnv --recurse-submodules --depth 1
 ```

 Install numpy<2.0 (otherwise errors in IK might occur in pinocchio):
@@ -97,15 +130,15 @@ cd {this_repo}
 pip install -e .
 ```

-**If you'd like to perform evaluations on our provided agents (e.g., RT-1, Octo), or add new robots and environments, please additionally follow the full installation instructions [here](#full-installation-rt-1-and-octo-inference-env-building).**
+**If you'd like to perform evaluations on our provided agents (e.g., RT-1, Octo, OpenVLA), or add new robots and environments, please additionally follow the full installation instructions [here](#full-installation-rt-1-octo-openvla-inference-env-building).**


 ## Examples

-- Simple RT-1 and Octo evaluation script on prepackaged environments with visual matching evaluation setup: see [`simpler_env/simple_inference_visual_matching_prepackaged_envs.py`](https://github.com/simpler-env/SimplerEnv/blob/main/simpler_env/simple_inference_visual_matching_prepackaged_envs.py).
+- Simple RT-1, Octo, and OpenVLA evaluation script on prepackaged environments with visual matching evaluation setup: see [`simpler_env/simple_inference_visual_matching_prepackaged_envs.py`](https://github.com/simpler-env/SimplerEnv/blob/main/simpler_env/simple_inference_visual_matching_prepackaged_envs.py).
 - Colab notebook for RT-1 and Octo inference: see [this link](https://colab.research.google.com/github/simpler-env/SimplerEnv/blob/main/example.ipynb).
 - Environment interactive visualization and manual control: see [`ManiSkill2_real2sim/mani_skill2_real2sim/examples/demo_manual_control_custom_envs.py`](https://github.com/simpler-env/ManiSkill2_real2sim/blob/main/mani_skill2_real2sim/examples/demo_manual_control_custom_envs.py)
-- Policy inference scripts to reproduce our Google Robot and WidowX real-to-sim evaluation results with sweeps over object / robot poses and advanced loggings. These contain both visual matching and variant aggregation evaluation setups along with RT-1, RT-1-X, and Octo policies. See [`scripts/`](https://github.com/simpler-env/SimplerEnv/tree/main/scripts).
+- Policy inference scripts to reproduce our Google Robot and WidowX real-to-sim evaluation results with sweeps over object / robot poses and advanced logging. These contain both visual matching and variant aggregation evaluation setups along with RT-1, RT-1-X, Octo, and OpenVLA policies. See [`scripts/`](https://github.com/simpler-env/SimplerEnv/tree/main/scripts).
- Real-to-sim evaluation videos from running `scripts/*.sh`: see [this link](https://huggingface.co/datasets/xuanlinli17/simpler-env-eval-example-videos/tree/main). ## Current Environments @@ -183,6 +216,7 @@ simpler_env/ policies/: policy implementations rt1/: RT-1 policy implementation octo/: Octo policy implementation + openvla/: OpenVLA policy implementation utils/: env/: environment building and observation utilities debug/: debugging tools for policies and robots @@ -205,7 +239,7 @@ scripts/: example bash scripts for policy inference under our variant aggregatio If you want to use existing environments for evaluating new policies, you can keep `./ManiSkill2_real2sim` as is. -1. Implement new policy inference scripts in `simpler_env/policies/{your_new_policy}`, following the examples for RT-1 (`simpler_env/policies/rt1`) and Octo (`simpler_env/policies/octo`) policies. +1. Implement new policy inference scripts in `simpler_env/policies/{your_new_policy}`, following the examples for RT-1 (`simpler_env/policies/rt1`), Octo (`simpler_env/policies/octo`), and OpenVLA (`simpler_env/policies/openvla`) policies. 2. You can now use `simpler_env/simple_inference_visual_matching_prepackaged_envs.py` to perform policy evaluations in simulation. - If the policy behaviors deviate a lot from those in the real-world, you can write similar scripts as in `simpler_env/utils/debug/{policy_name}_inference_real_video.py` to debug the policy behaviors. The debugging script performs policy inference by feeding real eval video frames into the policy. If the policy behavior still deviates significantly from real, this may suggest that policy actions are processed incorrectly into the simulation environments. Please double check action orderings and action spaces. 3. If you'd like to perform customized evaluations, @@ -219,7 +253,7 @@ If you want to use existing environments for evaluating new policies, you can ke We provide a step-by-step guide to add new real-to-sim evaluation environments and robots in [this README](ADDING_NEW_ENVS_ROBOTS.md) -## Full Installation (RT-1 and Octo Inference, Env Building) +## Full Installation (RT-1, Octo, OpenVLA Inference, Env Building) If you'd like to perform evaluations on our provided agents (e.g., RT-1, Octo), or add new robots and environments, please follow the full installation instructions below. @@ -289,6 +323,13 @@ If you are using CUDA 12, then to use GPU for Octo inference, you need CUDA vers `PATH=/usr/local/cuda-12.3/bin:$PATH LD_LIBRARY_PATH=/usr/local/cuda-12.3/lib64:$LD_LIBRARY_PATH bash scripts/octo_xxx_script.sh` +### OpenVLA Inference Setup + +``` +pip install torch==2.3.1 torchvision==0.18.1 timm==0.9.10 tokenizers==0.15.2 accelerate==0.32.1 +pip install flash-attn==2.6.1 --no-build-isolation +``` + ## Troubleshooting 1. If you encounter issues such as @@ -307,6 +348,10 @@ Follow [this link](https://maniskill.readthedocs.io/en/latest/user_guide/getting TypeError: 'NoneType' object is not subscriptable ``` +3. Please also refer to the original repo or [vulkan_setup](https://github.com/SpatialVLA/SpatialVLA/issues/3#issuecomment-2641739404) if you encounter any problems. + +4. `tensorflow-2.15.0` conflicts with `tensorflow-2.15.1`? +The dlimp library has not been maintained for a long time, so the TensorFlow version might be out of date. A reliable solution is to comment out tensorflow==2.15.0 in the requirements file, install all other dependencies, and then install tensorflow==2.15.0 finally. 
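+For example, a minimal sketch of this workaround (assuming the `tensorflow==2.15.0` pin lives in `requirements_full_install.txt`):
+```
+# comment out the pinned tensorflow entry
+sed -i 's/^tensorflow==2.15.0/# tensorflow==2.15.0/' requirements_full_install.txt
+# install everything else first
+pip install -r requirements_full_install.txt
+# install tensorflow last
+pip install tensorflow==2.15.0
+```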
Currently, using tensorflow==2.15.0 has not caused any problems.

 ## Citation

diff --git a/docker/10_nvidia.json b/docker/10_nvidia.json
new file mode 100644
index 00000000..bc224f29
--- /dev/null
+++ b/docker/10_nvidia.json
@@ -0,0 +1,6 @@
+{
+    "file_format_version": "1.0.0",
+    "ICD": {
+        "library_path": "libEGL_nvidia.so.0"
+    }
+}
\ No newline at end of file
diff --git a/docker/Dockerfile b/docker/Dockerfile
new file mode 100644
index 00000000..7526dccf
--- /dev/null
+++ b/docker/Dockerfile
@@ -0,0 +1,93 @@
+FROM nvidia/cuda:12.1.0-devel-ubuntu22.04
+
+# Basic tools
+ENV DEBIAN_FRONTEND=noninteractive \
+    NVIDIA_DRIVER_CAPABILITIES=all \
+    TZ=Asia/Tokyo \
+    PYTHONUNBUFFERED=1 \
+    VK_ICD_FILENAMES=/etc/vulkan/icd.d/nvidia_icd.json
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    bash-completion build-essential ca-certificates cmake curl git git-lfs \
+    htop libegl1 libxext6 libjpeg-dev libpng-dev libvulkan1 rsync \
+    tmux unzip vim wget xvfb pkg-config ffmpeg \
+    libglvnd-dev libgl1-mesa-dev libegl1-mesa-dev libgles2-mesa-dev libglib2.0-0 \
+    libsm6 libxrender1 libgomp1 libglu1-mesa libxi6 software-properties-common \
+    python3.11 python3.11-dev python3-pip && \
+    rm -rf /var/lib/apt/lists/*
+
+# Make Python 3.11 the default
+RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1 && \
+    update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1 && \
+    python3 -m pip install --no-cache-dir --upgrade pip setuptools wheel
+
+# Enable Git LFS
+RUN git lfs install
+
+# Copy the Vulkan/EGL configuration files
+COPY docker/nvidia_icd.json /etc/vulkan/icd.d/nvidia_icd.json
+COPY docker/nvidia_layers.json /etc/vulkan/implicit_layer.d/nvidia_layers.json
+COPY docker/10_nvidia.json /usr/share/glvnd/egl_vendor.d/10_nvidia.json
+
+# Working directory
+WORKDIR /workspace
+
+# Copy the source code (needed for installation at build time)
+# Note: at runtime, run_docker.sh mounts the host code here, overwriting this copy
+COPY . /workspace/
+
+# Install SimplerEnv and ManiSkill2_real2sim
+RUN cd /workspace/ManiSkill2_real2sim && pip install --no-cache-dir -e . && cd /workspace
+RUN pip install --no-cache-dir -e /workspace
+RUN pip install --no-cache-dir -r /workspace/requirements_full_install.txt
+
+# PyTorch and torchvision (CUDA 12.1)
+RUN pip install --no-cache-dir --index-url https://download.pytorch.org/whl/cu121 \
+    torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1
+
+# Flash Attention
+RUN pip install --no-cache-dir --no-deps \
+    https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.1.post4/flash_attn-2.7.1.post4+cu12torch2.5cxx11abiFALSE-cp311-cp311-linux_x86_64.whl
+
+# Isaac-GR00T base dependencies (--no-deps to avoid duplicate installs)
+RUN cd /workspace/Isaac-GR00T && pip install --no-cache-dir -e .[base] --no-deps && cd /workspace
+
+# Additional libraries (explicitly install the dependencies that tend to cause problems)
+RUN pip install --no-cache-dir \
+    pandas==2.2.3 \
+    "pydantic>=2,<3" typing_extensions==4.12.2 --force-reinstall
+
+RUN pip install --no-cache-dir \
+    albumentations==1.4.18 albucore==0.0.17 scikit-image==0.25.2 lazy_loader==0.4 --no-deps
+
+RUN pip install --no-cache-dir \
+    decord==0.6.0 av==12.3.0 --no-deps
+
+RUN pip install --no-cache-dir \
+    nptyping==2.5.0 numpydantic==1.6.10 --no-deps
+
+RUN pip install --no-cache-dir \
+    diffusers==0.30.2 timm==1.0.14 peft==0.14.0
+
+RUN pip install --no-cache-dir \
+    transformers==4.51.3 --force-reinstall --no-deps
+
+RUN pip install --no-cache-dir \
+    pyzmq --no-deps
+
+RUN pip install --no-cache-dir \
+    "tokenizers>=0.21,<0.22" --no-deps
+
+# PyTorch3D (built from source)
+RUN pip install --no-cache-dir "git+https://github.com/facebookresearch/pytorch3d.git"
+
+# Force NumPy < 2.0 again (PyTorch3D may have upgraded it)
+RUN pip install --no-cache-dir --force-reinstall "numpy>=1.24.4,<2.0"
+
+# Copy the entrypoint script
+COPY docker/entrypoint.sh /usr/local/bin/entrypoint.sh
+RUN chmod +x /usr/local/bin/entrypoint.sh
+
+# Set the entrypoint (automatically copies the config files at startup)
+ENTRYPOINT ["/usr/local/bin/entrypoint.sh"]
+CMD ["bash"]
diff --git a/docker/README.md b/docker/README.md
new file mode 100644
index 00000000..029e8f9f
--- /dev/null
+++ b/docker/README.md
@@ -0,0 +1,83 @@
+# SimplerEnv Docker Environment
+
+This directory contains the scripts for building the Docker environment for SimplerEnv (benchmark-v2).
+
+## Prerequisites
+
+- NVIDIA Driver >= 535 / CUDA 12.1 equivalent
+- Docker installed
+- nvidia-docker (NVIDIA Container Toolkit) installed
+- A Linux node with a GPU
+
+## Quick Start
+
+### 1. Build the Docker image
+
+Run the following from the repository root directory.
+
+```bash
+bash docker/build_docker.sh
+```
+
+The build takes roughly 30-60 minutes. This process installs the following:
+
+- Python 3.11
+- PyTorch 2.5.1 (CUDA 12.1)
+- SimplerEnv and ManiSkill2_real2sim
+- Isaac-GR00T and all of its dependencies
+- Flash Attention 2.7.1
+- PyTorch3D
+- Other required libraries
+
+### 2. Start the container
+
+```bash
+bash docker/run_docker.sh
+```
+
+### 3. Enter the container
+
+```bash
+docker exec -it simplerenv bash
+```
+
+## Usage
+
+Inside the container, all dependencies are already installed, so you can run the evaluation scripts directly.
+
+### Evaluation on WidowX (Bridge)
+
+```bash
+python scripts/gr00t/evaluate_bridge.py \
+    --ckpt-path /path/to/checkpoint-group6/
+```
+
+### Evaluation on Google Robot (Fractal)
+
+```bash
+python scripts/gr00t/evaluate_fractal.py \
+    --ckpt-path /path/to/checkpoint-group6/
+```
+
+## Technical Details
+
+### Main packages installed
+
+- **PyTorch**: 2.5.1 (CUDA 12.1)
+- **Flash Attention**: 2.7.1.post4
+- **Transformers**: 4.51.3
+- **Diffusers**: 0.30.2
+- **Timm**: 1.0.14
+- **PEFT**: 0.14.0
+- **PyTorch3D**: latest from source
+- **Others**: pandas, pydantic, albumentations, decord, av, nptyping, numpydantic, pyzmq, tokenizers
+
+### Dockerfile structure
+
+1. Base image: `nvidia/cuda:12.1.0-devel-ubuntu22.04`
+2. Install system packages
+3. Set up Python 3.11
+4. Install SimplerEnv and ManiSkill2_real2sim
+5. Install PyTorch and the other deep-learning libraries
+6. Install the Isaac-GR00T dependencies
+7. Overwrite the configuration files
\ No newline at end of file
diff --git a/docker/build_docker.sh b/docker/build_docker.sh
new file mode 100644
index 00000000..2c8f80be
--- /dev/null
+++ b/docker/build_docker.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+
+# SimplerEnv Docker Image Build Script
+# Usage: bash docker/build_docker.sh
+
+set -e
+
+IMAGE_NAME="simplerenv"
+TAG="benchmark-v2"
+
+echo "Building Docker image: ${IMAGE_NAME}:${TAG}"
+echo "This may take a while (30-60 minutes)..."
+
+docker build \
+    -f docker/Dockerfile \
+    -t ${IMAGE_NAME}:${TAG} \
+    .
diff --git a/docker/entrypoint.sh b/docker/entrypoint.sh
new file mode 100644
index 00000000..a5b0bb46
--- /dev/null
+++ b/docker/entrypoint.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+set -e
+
+# Vulkan configuration (required for rendering)
+export VK_ICD_FILENAMES=/etc/vulkan/icd.d/nvidia_icd.json
+
+# Overwrite the GR00T configuration files (runs on every container start)
+echo "Copying GR00T configuration files..."
+cp /workspace/scripts/gr00t/data_config.py /workspace/Isaac-GR00T/gr00t/experiment/data_config.py
+cp /workspace/scripts/gr00t/embodiment_tags.py /workspace/Isaac-GR00T/gr00t/data/embodiment_tags.py
+echo "Configuration files copied successfully."
+
+# Execute the command passed to the container
+exec "$@"
diff --git a/docker/nvidia_icd.json b/docker/nvidia_icd.json
new file mode 100644
index 00000000..dbd5a51a
--- /dev/null
+++ b/docker/nvidia_icd.json
@@ -0,0 +1,7 @@
+{
+    "file_format_version": "1.0.0",
+    "ICD": {
+        "library_path": "libGLX_nvidia.so.0",
+        "api_version": "1.2.155"
+    }
+}
\ No newline at end of file
diff --git a/docker/nvidia_layers.json b/docker/nvidia_layers.json
new file mode 100644
index 00000000..baaded95
--- /dev/null
+++ b/docker/nvidia_layers.json
@@ -0,0 +1,21 @@
+{
+    "file_format_version": "1.0.0",
+    "layer": {
+        "name": "VK_LAYER_NV_optimus",
+        "type": "INSTANCE",
+        "library_path": "libGLX_nvidia.so.0",
+        "api_version": "1.2.155",
+        "implementation_version": "1",
+        "description": "NVIDIA Optimus layer",
+        "functions": {
+            "vkGetInstanceProcAddr": "vk_optimusGetInstanceProcAddr",
+            "vkGetDeviceProcAddr": "vk_optimusGetDeviceProcAddr"
+        },
+        "enable_environment": {
+            "__NV_PRIME_RENDER_OFFLOAD": "1"
+        },
+        "disable_environment": {
+            "DISABLE_LAYER_NV_OPTIMUS_1": ""
+        }
+    }
+}
\ No newline at end of file
diff --git a/docker/run_docker.sh b/docker/run_docker.sh
new file mode 100644
index 00000000..b5b5107d
--- /dev/null
+++ b/docker/run_docker.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+
+# SimplerEnv Docker Container Run Script
+# Usage: bash docker/run_docker.sh
+
+set -e
+
+IMAGE_NAME="simplerenv"
+TAG="benchmark-v2"
+CONTAINER_NAME="simplerenv"
+
+# Start the Docker container, passing the current user's DISPLAY environment variable and the X11 socket
+docker run --gpus all -it -d \
+    --name ${CONTAINER_NAME} \
+    --shm-size=250g \
+    -e DISPLAY=$DISPLAY \
+    -v $(pwd):/workspace \
+    -v /tmp/.X11-unix:/tmp/.X11-unix \
+    -v $HOME/.Xauthority:/root/.Xauthority \
+    --net host \
+    ${IMAGE_NAME}:${TAG}
+
+echo ""
+echo "Container started successfully!"
+echo "Container name: ${CONTAINER_NAME}" +echo "" +echo "To enter the container, run:" +echo " docker exec -it ${CONTAINER_NAME} bash" +echo "" +echo "To stop the container, run:" +echo " docker stop ${CONTAINER_NAME}" + diff --git a/pyproject.toml b/pyproject.toml index 1e2b786d..77f61767 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,3 +1,24 @@ +[project] +name = "simpler_env" +version = "0.0.1" +description = "SimplerEnv" +readme = "README.md" +requires-python = ">=3.10,<3.12" +dependencies = [ + "mani_skill2_real2sim", + "numpy<2.0", + "mediapy", + "transforms3d", + "matplotlib", + "statsmodels", +] + + +[project.optional-dependencies] +torch = [ + "tensorflow-cpu", +] + [tool.black] # https://github.com/psf/black line-length = 120 @@ -9,3 +30,10 @@ profile = "black" line_length = 120 force_sort_within_sections = "True" order_by_type = "False" + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.uv.sources] +mani_skill2_real2sim = { path = "ManiSkill2_real2sim", editable = true } \ No newline at end of file diff --git a/scripts/README.md b/scripts/README.md new file mode 100644 index 00000000..9ba35369 --- /dev/null +++ b/scripts/README.md @@ -0,0 +1,183 @@ +## Introduction + +# SimplerEnvのMRを提出する際の注意事項 + +- MRを提出する際に、WASABIにアップされているモデル重みのパスを記載してください。 +- 運営側でも並行してコード修正やバグ対応を進めております。そのため、MR提出日の時点で、各チームの提出ブランチに改めて `benchmark-v2` ブランチをマージしていただき、**Merge Conflictがない状態**にしていただけますよう、ご協力をお願いいたします。 +- また、以下の評価スクリプトについては、各チームのモデルと統合した上で、**最後まで正常に実行できること**をご確認ください。 + +Google Robot Evaluation Script +``` +CUDA_VISIBLE_DEVICES= python scripts/rt1/evaluate_fractal.py --ckpt-path /path/to/ckpt +``` + +WidowX Evaluation Script +``` +CUDA_VISIBLE_DEVICES= python scripts/openpi/challenge_widowx.py --ckpt /path/to/ckpt +``` + +### Google Robot Evaluation Task List + +| Task | Challenge | Task Definition | Task | Randomizer Pool | +| ----- | --------- | --------------------------------------------------- | ----------------------------------- | ---------------------------------------------------------------------------------- | +| 1-1 | 1 | pick `` | `pick_object_visual_matching` | ``,``,``,``,`` | +| 1-2 | 1 | pick `` | `pick_object_variant_agg` | ``,``,``,`` | +| 2-1 | 1 | pick `` | `pick_object_among_visual_matching` | ``,``,``,``,`` | +| 2-2 | 1 | pick `` | `pick_object_among_variant_agg` | ``,``,``,`` | +| 3-1 | 1 | open/close `` drawer | `drawer_visual_matching` | ``,``,`` | +| 3-2 | 1 | open/close `` drawer | `drawer_variant_agg` | ``,``,``,`` | +| 4-1 | 2 | move `` near `` | `move_near_visual_matching` | ``,``,``,`` | +| 4-2 | 2 | move `` near `` | `move_near_variant_agg` | ``,``,``,``,`` | +| 5-1 | 2 | open top drawer -> place `` into top drawer | `put_in_drawer_visual_matching` | ``,``,`` | +| 5-2 | 2 | open top drawer -> place `` into top drawer | `put_in_drawer_variant_agg` | ``,``,``,``,`` | + + +### WidowX Evaluation Task List + +| Task | Challenge | Task Definition | +| ---- | --------- | ----------------------------- | +| 1 | 1 | Pick `` | +| 2 | 2 | Stacking `` on `` | +| 3 | 2 | Put `` on `` | +| 4 | 2 | Put `` in basket | + +**ランダマイザープール** + +| Pool Name | Element | +| ---------- | ---------------------------------------------------------- | +| `` | `green cube`, `yellow cube`, `eggplant`, `spoon`, `carrot` | +| `` | `plate`, `towel` | +| `` | `green cube`, `yellow cube` | + + +## 環境構築 + +詳細なそれぞれの環境のドキュメント +- [RT-1](scripts/rt1/README.md) +- [OpenPi](scripts/openpi/README.md) +- [lerobot-pi0](scripts/lerobotpi/README.md) + +### 1. 
Docker構築 +```bash +git clone https://github.com/airoa-org/SimplerEnv.git +cd SimplerEnv +git checkout origin/benchmark-v2 +git submodule update --init --recursive +cp docker/Dockerfile . +docker build -t simpler_env . +``` + +### 2. モデルインストール~実行 + +#### RT-1 + +インストール +```bash +# Install Google Cloud SDK +cd .. +curl -O https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-cli-linux-x86_64.tar.gz +tar -xf google-cloud-cli-linux-x86_64.tar.gz +./google-cloud-sdk/install.sh +cd SimplerEnv + +# Create a virtual environment +uv venv -p 3.10 scripts/rt1/.venv + +# Install dependencies +source $(pwd)/scripts/rt1/.venv/bin/activate +uv pip install tensorflow==2.15.0 +uv pip install -r requirements_full_install.txt +uv pip install -e . +uv pip install tensorflow[and-cuda]==2.15.1 + +# If you encounter an import error. +# ImportError: This version of TensorFlow Probability requires TensorFlow version >= 2.16; Detected an installation of version 2.15.1. Please upgrade TensorFlow to proceed. +# Do the following. +uv pip uninstall tensorflow-probability +uv pip install "tensorflow-probability==0.22.1" --no-deps + + +# Make a checkpoint dir: +mkdir checkpoints + +# RT-1-X +gsutil -m cp -r gs://gdm-robotics-open-x-embodiment/open_x_embodiment_and_rt_x_oss/rt_1_x_tf_trained_for_002272480_step.zip . +unzip rt_1_x_tf_trained_for_002272480_step.zip +mv rt_1_x_tf_trained_for_002272480_step checkpoints +rm rt_1_x_tf_trained_for_002272480_step.zip + +# RT-1-Converged +gsutil -m cp -r gs://gdm-robotics-open-x-embodiment/open_x_embodiment_and_rt_x_oss/rt_1_tf_trained_for_000400120 . +mv rt_1_tf_trained_for_000400120 checkpoints + +# RT-1-15% +gsutil -m cp -r gs://gdm-robotics-open-x-embodiment/open_x_embodiment_and_rt_x_oss/rt_1_tf_trained_for_000058240 . +mv rt_1_tf_trained_for_000058240 checkpoints + +# RT-1-Begin +gsutil -m cp -r gs://gdm-robotics-open-x-embodiment/open_x_embodiment_and_rt_x_oss/rt_1_tf_trained_for_000001120 . +mv rt_1_tf_trained_for_000001120 checkpoints +``` + +実行 +```bash +# fractal +CUDA_VISIBLE_DEVICES=1 python scripts/rt1/evaluate_fractal.py --ckpt-path checkpoints/rt_1_tf_trained_for_000400120 +``` + +#### OpenPi + +インストール +```bash +git clone your_openpi +# サブモジュールの中にもbranchの切り替えとかが必要の可能性がある +# 例 +# git branch -a +# git checkout remotes/origin/feature/fractal +cd your_openpi +GIT_LFS_SKIP_SMUDGE=1 UV_PROJECT_ENVIRONMENT=../scripts/openpi/.venv uv sync +source ../scripts/openpi/.venv/bin/activate +GIT_LFS_SKIP_SMUDGE=1 uv pip install -e . +cd .. + +source $(pwd)/scripts/openpi/.venv/bin/activate +uv pip install -e . ".[torch]" + +huggingface-cli download --resume-download --repo-type model HaomingSong/openpi0-fractal-lora --local-dir /path/to/ckpt +``` + +実行 +1. OpenPiの場合モデルを起動 +```bash +# current directory: /app/SimplerEnV +cd openpi +export SERVER_ARGS="policy:checkpoint --policy.config=pi0_bridge_low_mem_finetune --policy.dir=/path/to/ckpt" +uv run scripts/serve_policy.py $SERVER_ARGS +``` + +2. 実行 +```bash +# current directory: /app/SimplerEnV +CUDA_VISIBLE_DEVICES=1 python scripts/openpi/challenge_widowx.py --ckpt /path/to/ckpt --control-freq 5 +``` + +#### lerobot-pi0 + +インストール +```bash +uv venv -p 3.10 scripts/lerobotpi/.venv +source $(pwd)/scripts/lerobotpi/.venv/bin/activate + +uv pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cu124 +uv pip install git+https://github.com/huggingface/lerobot.git@ce2b9724bfe1b5a4c45e61b1890eef3f5ab0909c#egg=lerobot[pi0] +uv pip install -e . 
".[torch]" +uv pip install pytest +``` + +実行 +```bash +huggingface-cli login +CUDA_VISIBLE_DEVICES=1 python scripts/lerobotpi/evaluate_fractal.py --ckpt-path HaomingSong/lerobot-pi0-fractal +CUDA_VISIBLE_DEVICES=1 python scripts/lerobotpi/evaluate_fractal.py --ckpt-path lerobot/pi0 +``` + diff --git a/scripts/gr00t/data_config.py b/scripts/gr00t/data_config.py new file mode 100644 index 00000000..bd91a28a --- /dev/null +++ b/scripts/gr00t/data_config.py @@ -0,0 +1,1066 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from abc import ABC, abstractmethod + +from gr00t.data.dataset import ModalityConfig +from gr00t.data.transform.base import ComposedModalityTransform, ModalityTransform +from gr00t.data.transform.concat import ConcatTransform +from gr00t.data.transform.state_action import ( + StateActionSinCosTransform, + StateActionToTensor, + StateActionTransform, +) +from gr00t.data.transform.video import ( + VideoColorJitter, + VideoCrop, + VideoResize, + VideoToNumpy, + VideoToTensor, +) +from gr00t.model.transforms import GR00TTransform + + +class BaseDataConfig(ABC): + @abstractmethod + def modality_config(self) -> dict[str, ModalityConfig]: + pass + + @abstractmethod + def transform(self) -> ModalityTransform: + pass + + +########################################################################################### + + +class FourierGr1ArmsOnlyDataConfig(BaseDataConfig): + video_keys = ["video.ego_view"] + state_keys = [ + "state.left_arm", + "state.right_arm", + "state.left_hand", + "state.right_hand", + ] + action_keys = [ + "action.left_arm", + "action.right_arm", + "action.left_hand", + "action.right_hand", + ] + language_keys = ["annotation.human.action.task_description"] + observation_indices = [0] + action_indices = list(range(16)) + + def modality_config(self) -> dict[str, ModalityConfig]: + video_modality = ModalityConfig( + delta_indices=self.observation_indices, + modality_keys=self.video_keys, + ) + + state_modality = ModalityConfig( + delta_indices=self.observation_indices, + modality_keys=self.state_keys, + ) + + action_modality = ModalityConfig( + delta_indices=self.action_indices, + modality_keys=self.action_keys, + ) + + language_modality = ModalityConfig( + delta_indices=self.observation_indices, + modality_keys=self.language_keys, + ) + + modality_configs = { + "video": video_modality, + "state": state_modality, + "action": action_modality, + "language": language_modality, + } + + return modality_configs + + def transform(self) -> ModalityTransform: + transforms = [ + # video transforms + VideoToTensor(apply_to=self.video_keys), + VideoCrop(apply_to=self.video_keys, scale=0.95), + VideoResize(apply_to=self.video_keys, height=224, width=224, interpolation="linear"), + VideoColorJitter( + apply_to=self.video_keys, + brightness=0.3, + contrast=0.4, + saturation=0.5, + hue=0.08, + ), + VideoToNumpy(apply_to=self.video_keys), + # state 
transforms + StateActionToTensor(apply_to=self.state_keys), + StateActionSinCosTransform(apply_to=self.state_keys), + # action transforms + StateActionToTensor(apply_to=self.action_keys), + StateActionTransform( + apply_to=self.action_keys, + normalization_modes={key: "min_max" for key in self.action_keys}, + ), + # concat transforms + ConcatTransform( + video_concat_order=self.video_keys, + state_concat_order=self.state_keys, + action_concat_order=self.action_keys, + ), + # model-specific transform + GR00TTransform( + state_horizon=len(self.observation_indices), + action_horizon=len(self.action_indices), + max_state_dim=64, + max_action_dim=32, + ), + ] + return ComposedModalityTransform(transforms=transforms) + + +########################################################################################### + + +class So100DataConfig(BaseDataConfig): + video_keys = ["video.webcam"] + state_keys = ["state.single_arm", "state.gripper"] + action_keys = ["action.single_arm", "action.gripper"] + language_keys = ["annotation.human.task_description"] + observation_indices = [0] + action_indices = list(range(16)) + + def modality_config(self) -> dict[str, ModalityConfig]: + video_modality = ModalityConfig( + delta_indices=self.observation_indices, + modality_keys=self.video_keys, + ) + + state_modality = ModalityConfig( + delta_indices=self.observation_indices, + modality_keys=self.state_keys, + ) + + action_modality = ModalityConfig( + delta_indices=self.action_indices, + modality_keys=self.action_keys, + ) + + language_modality = ModalityConfig( + delta_indices=self.observation_indices, + modality_keys=self.language_keys, + ) + + modality_configs = { + "video": video_modality, + "state": state_modality, + "action": action_modality, + "language": language_modality, + } + + return modality_configs + + def transform(self) -> ModalityTransform: + transforms = [ + # video transforms + VideoToTensor(apply_to=self.video_keys), + VideoCrop(apply_to=self.video_keys, scale=0.95), + VideoResize(apply_to=self.video_keys, height=224, width=224, interpolation="linear"), + VideoColorJitter( + apply_to=self.video_keys, + brightness=0.3, + contrast=0.4, + saturation=0.5, + hue=0.08, + ), + VideoToNumpy(apply_to=self.video_keys), + # state transforms + StateActionToTensor(apply_to=self.state_keys), + StateActionTransform( + apply_to=self.state_keys, + normalization_modes={key: "min_max" for key in self.state_keys}, + ), + # action transforms + StateActionToTensor(apply_to=self.action_keys), + StateActionTransform( + apply_to=self.action_keys, + normalization_modes={key: "min_max" for key in self.action_keys}, + ), + # concat transforms + ConcatTransform( + video_concat_order=self.video_keys, + state_concat_order=self.state_keys, + action_concat_order=self.action_keys, + ), + # model-specific transform + GR00TTransform( + state_horizon=len(self.observation_indices), + action_horizon=len(self.action_indices), + max_state_dim=64, + max_action_dim=32, + ), + ] + return ComposedModalityTransform(transforms=transforms) + + +########################################################################################### + + +class So100DualCamDataConfig(So100DataConfig): + video_keys = ["video.front", "video.wrist"] + state_keys = ["state.single_arm", "state.gripper"] + action_keys = ["action.single_arm", "action.gripper"] + language_keys = ["annotation.human.task_description"] + observation_indices = [0] + action_indices = list(range(16)) + + 
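+# NOTE: So100DualCamDataConfig (above) only overrides the modality keys for the
+# front + wrist cameras; it inherits modality_config() and transform() unchanged
+# from So100DataConfig, so both camera streams get the same video transforms.
+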
+########################################################################################### + + +class UnitreeG1DataConfig(BaseDataConfig): + video_keys = ["video.rs_view"] + state_keys = ["state.left_arm", "state.right_arm", "state.left_hand", "state.right_hand"] + action_keys = ["action.left_arm", "action.right_arm", "action.left_hand", "action.right_hand"] + language_keys = ["annotation.human.task_description"] + observation_indices = [0] + action_indices = list(range(16)) + + def modality_config(self) -> dict[str, ModalityConfig]: + video_modality = ModalityConfig( + delta_indices=self.observation_indices, + modality_keys=self.video_keys, + ) + + state_modality = ModalityConfig( + delta_indices=self.observation_indices, + modality_keys=self.state_keys, + ) + + action_modality = ModalityConfig( + delta_indices=self.action_indices, + modality_keys=self.action_keys, + ) + + language_modality = ModalityConfig( + delta_indices=self.observation_indices, + modality_keys=self.language_keys, + ) + + modality_configs = { + "video": video_modality, + "state": state_modality, + "action": action_modality, + "language": language_modality, + } + + return modality_configs + + def transform(self) -> ModalityTransform: + transforms = [ + # video transforms + VideoToTensor(apply_to=self.video_keys), + VideoCrop(apply_to=self.video_keys, scale=0.95), + VideoResize(apply_to=self.video_keys, height=224, width=224, interpolation="linear"), + VideoColorJitter( + apply_to=self.video_keys, + brightness=0.3, + contrast=0.4, + saturation=0.5, + hue=0.08, + ), + VideoToNumpy(apply_to=self.video_keys), + # state transforms + StateActionToTensor(apply_to=self.state_keys), + StateActionTransform( + apply_to=self.state_keys, + normalization_modes={key: "min_max" for key in self.state_keys}, + ), + # action transforms + StateActionToTensor(apply_to=self.action_keys), + StateActionTransform( + apply_to=self.action_keys, + normalization_modes={key: "min_max" for key in self.action_keys}, + ), + # concat transforms + ConcatTransform( + video_concat_order=self.video_keys, + state_concat_order=self.state_keys, + action_concat_order=self.action_keys, + ), + # model-specific transform + GR00TTransform( + state_horizon=len(self.observation_indices), + action_horizon=len(self.action_indices), + max_state_dim=64, + max_action_dim=32, + ), + ] + return ComposedModalityTransform(transforms=transforms) + + +class UnitreeG1FullBodyDataConfig(UnitreeG1DataConfig): + video_keys = ["video.rs_view"] + state_keys = [ + "state.left_leg", + "state.right_leg", + "state.waist", + "state.left_arm", + "state.right_arm", + "state.left_hand", + "state.right_hand", + ] + action_keys = ["action.left_arm", "action.right_arm", "action.left_hand", "action.right_hand"] + language_keys = ["annotation.human.task_description"] + observation_indices = [0] + action_indices = list(range(16)) + + +########################################################################################### + + +class FourierGr1FullUpperBodyDataConfig(BaseDataConfig): + video_keys = ["video.front_view"] + state_keys = [ + "state.left_arm", + "state.right_arm", + "state.left_hand", + "state.right_hand", + "state.waist", + "state.neck", + ] + action_keys = [ + "action.left_arm", + "action.right_arm", + "action.left_hand", + "action.right_hand", + "action.waist", + "action.neck", + ] + language_keys = ["annotation.human.action.task_description"] + observation_indices = [0] + action_indices = list(range(16)) + + def modality_config(self): + video_modality = ModalityConfig( + 
delta_indices=self.observation_indices, + modality_keys=self.video_keys, + ) + state_modality = ModalityConfig( + delta_indices=self.observation_indices, + modality_keys=self.state_keys, + ) + action_modality = ModalityConfig( + delta_indices=self.action_indices, + modality_keys=self.action_keys, + ) + language_modality = ModalityConfig( + delta_indices=self.observation_indices, + modality_keys=self.language_keys, + ) + modality_configs = { + "video": video_modality, + "state": state_modality, + "action": action_modality, + "language": language_modality, + } + return modality_configs + + def transform(self): + transforms = [ + # video transforms + VideoToTensor(apply_to=self.video_keys), + VideoCrop(apply_to=self.video_keys, scale=0.95), + VideoResize(apply_to=self.video_keys, height=224, width=224, interpolation="linear"), + VideoColorJitter( + apply_to=self.video_keys, + brightness=0.3, + contrast=0.4, + saturation=0.5, + hue=0.08, + ), + VideoToNumpy(apply_to=self.video_keys), + # state transforms + StateActionToTensor(apply_to=self.state_keys), + StateActionTransform( + apply_to=self.state_keys, + normalization_modes={key: "min_max" for key in self.state_keys}, + ), + # action transforms + StateActionToTensor(apply_to=self.action_keys), + StateActionTransform( + apply_to=self.action_keys, + normalization_modes={key: "min_max" for key in self.action_keys}, + ), + # concat transforms + ConcatTransform( + video_concat_order=self.video_keys, + state_concat_order=self.state_keys, + action_concat_order=self.action_keys, + ), + GR00TTransform( + state_horizon=len(self.observation_indices), + action_horizon=len(self.action_indices), + max_state_dim=64, + max_action_dim=32, + ), + ] + + return ComposedModalityTransform(transforms=transforms) + + +########################################################################################### + + +class BimanualPandaGripperDataConfig(BaseDataConfig): + video_keys = [ + "video.right_wrist_view", + "video.left_wrist_view", + "video.front_view", + ] + state_keys = [ + "state.right_arm_eef_pos", + "state.right_arm_eef_quat", + "state.right_gripper_qpos", + "state.left_arm_eef_pos", + "state.left_arm_eef_quat", + "state.left_gripper_qpos", + ] + action_keys = [ + "action.right_arm_eef_pos", + "action.right_arm_eef_rot", + "action.right_gripper_close", + "action.left_arm_eef_pos", + "action.left_arm_eef_rot", + "action.left_gripper_close", + ] + + language_keys = ["annotation.human.action.task_description"] + observation_indices = [0] + action_indices = list(range(16)) + + # Used in StateActionTransform for normalization and target rotations + state_normalization_modes = { + "state.right_arm_eef_pos": "min_max", + "state.right_gripper_qpos": "min_max", + "state.left_arm_eef_pos": "min_max", + "state.left_gripper_qpos": "min_max", + } + state_target_rotations = { + "state.right_arm_eef_quat": "rotation_6d", + "state.left_arm_eef_quat": "rotation_6d", + } + action_normalization_modes = { + "action.right_gripper_close": "binary", + "action.left_gripper_close": "binary", + } + + def modality_config(self): + video_modality = ModalityConfig( + delta_indices=self.observation_indices, + modality_keys=self.video_keys, + ) + state_modality = ModalityConfig( + delta_indices=self.observation_indices, + modality_keys=self.state_keys, + ) + action_modality = ModalityConfig( + delta_indices=self.action_indices, + modality_keys=self.action_keys, + ) + language_modality = ModalityConfig( + delta_indices=self.observation_indices, + modality_keys=self.language_keys, + ) + 
modality_configs = { + "video": video_modality, + "state": state_modality, + "action": action_modality, + "language": language_modality, + } + return modality_configs + + def transform(self): + transforms = [ + # video transforms + VideoToTensor(apply_to=self.video_keys), + VideoCrop(apply_to=self.video_keys, scale=0.95), + VideoResize(apply_to=self.video_keys, height=224, width=224, interpolation="linear"), + VideoColorJitter( + apply_to=self.video_keys, + brightness=0.3, + contrast=0.4, + saturation=0.5, + hue=0.08, + ), + VideoToNumpy(apply_to=self.video_keys), + # state transforms + StateActionToTensor(apply_to=self.state_keys), + StateActionTransform( + apply_to=self.state_keys, + normalization_modes=self.state_normalization_modes, + target_rotations=self.state_target_rotations, + ), + # action transforms + StateActionToTensor(apply_to=self.action_keys), + StateActionTransform( + apply_to=self.action_keys, + normalization_modes=self.action_normalization_modes, + ), + # concat transforms + ConcatTransform( + video_concat_order=self.video_keys, + state_concat_order=self.state_keys, + action_concat_order=self.action_keys, + ), + GR00TTransform( + state_horizon=len(self.observation_indices), + action_horizon=len(self.action_indices), + max_state_dim=64, + max_action_dim=32, + ), + ] + + return ComposedModalityTransform(transforms=transforms) + + +########################################################################################### + + +class BimanualPandaHandDataConfig(BimanualPandaGripperDataConfig): + video_keys = [ + "video.right_wrist_view", + "video.left_wrist_view", + "video.ego_view", + ] + state_keys = [ + "state.right_arm_eef_pos", + "state.right_arm_eef_quat", + "state.right_hand", + "state.left_arm_eef_pos", + "state.left_arm_eef_quat", + "state.left_hand", + ] + action_keys = [ + "action.right_arm_eef_pos", + "action.right_arm_eef_rot", + "action.right_hand", + "action.left_arm_eef_pos", + "action.left_arm_eef_rot", + "action.left_hand", + ] + language_keys = ["annotation.human.action.task_description"] + observation_indices = [0] + action_indices = list(range(16)) + + # Used in StateActionTransform for normalization and target rotations + state_normalization_modes = { + "state.right_arm_eef_pos": "min_max", + "state.right_hand": "min_max", + "state.left_arm_eef_pos": "min_max", + "state.left_hand": "min_max", + } + action_normalization_modes = { + "action.right_hand": "min_max", + "action.left_hand": "min_max", + } + state_target_rotations = { + "state.right_arm_eef_quat": "rotation_6d", + "state.left_arm_eef_quat": "rotation_6d", + } + + +########################################################################################### + + +class SinglePandaGripperDataConfig(BimanualPandaGripperDataConfig): + video_keys = [ + "video.left_view", + "video.right_view", + "video.wrist_view", + ] + state_keys = [ + "state.end_effector_position_relative", + "state.end_effector_rotation_relative", + "state.gripper_qpos", + "state.base_position", + "state.base_rotation", + ] + action_keys = [ + "action.end_effector_position", + "action.end_effector_rotation", + "action.gripper_close", + "action.base_motion", + "action.control_mode", + ] + + language_keys = ["annotation.human.action.task_description"] + observation_indices = [0] + action_indices = list(range(16)) + + # Used in StateActionTransform for normalization and target rotations + state_normalization_modes = { + "state.end_effector_position_relative": "min_max", + "state.end_effector_rotation_relative": "min_max", + 
"state.gripper_qpos": "min_max", + "state.base_position": "min_max", + "state.base_rotation": "min_max", + } + state_target_rotations = { + "state.end_effector_rotation_relative": "rotation_6d", + "state.base_rotation": "rotation_6d", + } + action_normalization_modes = { + "action.end_effector_position": "min_max", + "action.end_effector_rotation": "min_max", + "action.gripper_close": "binary", + "action.base_motion": "min_max", + "action.control_mode": "binary", + } + + +########################################################################################### + + +class FourierGr1ArmsWaistDataConfig(FourierGr1ArmsOnlyDataConfig): + video_keys = ["video.ego_view"] + state_keys = [ + "state.left_arm", + "state.right_arm", + "state.left_hand", + "state.right_hand", + "state.waist", + ] + action_keys = [ + "action.left_arm", + "action.right_arm", + "action.left_hand", + "action.right_hand", + "action.waist", + ] + language_keys = ["annotation.human.coarse_action"] + observation_indices = [0] + action_indices = list(range(16)) + + def modality_config(self): + return super().modality_config() + + def transform(self): + return super().transform() + + +########################################################################################### + + +class OxeDroidDataConfig: + video_keys = [ + "video.exterior_image_1", + "video.exterior_image_2", + "video.wrist_image", + ] + state_keys = [ + "state.eef_position", + "state.eef_rotation", + "state.gripper_position", + ] + action_keys = [ + "action.eef_position_delta", + "action.eef_rotation_delta", + "action.gripper_position", + ] + language_keys = ["annotation.language.language_instruction"] + observation_indices = [0] + action_indices = list(range(16)) + + def modality_config(self): + video_modality = ModalityConfig( + delta_indices=self.observation_indices, + modality_keys=self.video_keys, + ) + state_modality = ModalityConfig( + delta_indices=self.observation_indices, + modality_keys=self.state_keys, + ) + action_modality = ModalityConfig( + delta_indices=self.action_indices, + modality_keys=self.action_keys, + ) + language_modality = ModalityConfig( + delta_indices=self.observation_indices, + modality_keys=self.language_keys, + ) + modality_configs = { + "video": video_modality, + "state": state_modality, + "action": action_modality, + "language": language_modality, + } + return modality_configs + + def transform(self): + transforms = [ + # video transforms + VideoToTensor(apply_to=self.video_keys), + VideoCrop(apply_to=self.video_keys, scale=0.95), + VideoResize(apply_to=self.video_keys, height=224, width=224, interpolation="linear"), + VideoColorJitter( + apply_to=self.video_keys, + brightness=0.3, + contrast=0.4, + saturation=0.5, + hue=0.08, + ), + VideoToNumpy(apply_to=self.video_keys), + # state transforms + StateActionToTensor(apply_to=self.state_keys), + StateActionTransform( + apply_to=self.state_keys, + normalization_modes={ + "state.eef_position": "min_max", + "state.gripper_position": "min_max", + }, + target_rotations={ + "state.eef_rotation": "rotation_6d", + }, + ), + # action transforms + StateActionToTensor(apply_to=self.action_keys), + StateActionTransform( + apply_to=self.action_keys, + normalization_modes={ + "action.gripper_position": "binary", + }, + target_rotations={"action.eef_rotation_delta": "axis_angle"}, + ), + # concat transforms + ConcatTransform( + video_concat_order=self.video_keys, + state_concat_order=self.state_keys, + action_concat_order=self.action_keys, + ), + GR00TTransform( + 
state_horizon=len(self.observation_indices), + action_horizon=len(self.action_indices), + max_state_dim=64, + max_action_dim=32, + ), + ] + + return ComposedModalityTransform(transforms=transforms) + + +########################################################################################### + + +class AgibotGenie1DataConfig: + video_keys = [ + "video.top_head", + "video.hand_left", + "video.hand_right", + ] + state_keys = [ + "state.left_arm_joint_position", + "state.right_arm_joint_position", + "state.left_effector_position", + "state.right_effector_position", + "state.head_position", + "state.waist_position", + ] + action_keys = [ + "action.left_arm_joint_position", + "action.right_arm_joint_position", + "action.left_effector_position", + "action.right_effector_position", + "action.head_position", + "action.waist_position", + "action.robot_velocity", + ] + language_keys = ["annotation.language.action_text"] + observation_indices = [0] + action_indices = list(range(16)) + + def modality_config(self): + video_modality = ModalityConfig( + delta_indices=self.observation_indices, + modality_keys=self.video_keys, + ) + state_modality = ModalityConfig( + delta_indices=self.observation_indices, + modality_keys=self.state_keys, + ) + action_modality = ModalityConfig( + delta_indices=self.action_indices, + modality_keys=self.action_keys, + ) + language_modality = ModalityConfig( + delta_indices=self.observation_indices, + modality_keys=self.language_keys, + ) + modality_configs = { + "video": video_modality, + "state": state_modality, + "action": action_modality, + "language": language_modality, + } + return modality_configs + + def transform(self): + transforms = [ + # video transforms + VideoToTensor(apply_to=self.video_keys), + VideoCrop(apply_to=self.video_keys, scale=0.95), + VideoResize(apply_to=self.video_keys, height=224, width=224, interpolation="linear"), + VideoColorJitter( + apply_to=self.video_keys, + brightness=0.3, + contrast=0.4, + saturation=0.5, + hue=0.08, + ), + VideoToNumpy(apply_to=self.video_keys), + # state transforms + StateActionToTensor(apply_to=self.state_keys), + StateActionTransform( + apply_to=self.state_keys, + normalization_modes={ + "state.left_arm_joint_position": "min_max", + "state.right_arm_joint_position": "min_max", + "state.left_effector_position": "min_max", + "state.right_effector_position": "min_max", + "state.head_position": "min_max", + "state.waist_position": "min_max", + }, + ), + # action transforms + StateActionToTensor(apply_to=self.action_keys), + StateActionTransform( + apply_to=self.action_keys, + normalization_modes={ + "action.left_arm_joint_position": "min_max", + "action.right_arm_joint_position": "min_max", + "action.left_effector_position": "min_max", + "action.right_effector_position": "min_max", + "action.head_position": "min_max", + "action.waist_position": "min_max", + "action.robot_velocity": "min_max", + }, + ), + # concat transforms + ConcatTransform( + video_concat_order=self.video_keys, + state_concat_order=self.state_keys, + action_concat_order=self.action_keys, + ), + GR00TTransform( + state_horizon=len(self.observation_indices), + action_horizon=len(self.action_indices), + max_state_dim=64, + max_action_dim=32, + ), + ] + + return ComposedModalityTransform(transforms=transforms) + +class GoogleRobotDataConfig_chunk1(BaseDataConfig): + video_keys = [ + "video.image", + ] + state_keys = [ + "state.ee_pos", + "state.ee_rot", + "state.gripper", + ] + action_keys = [ + "action.delta_ee_pos", + "action.delta_ee_rot", + 
"action.gripper", + ] + language_keys = ["annotation.human.task_description"] + observation_indices = [0] + action_indices = list(range(1)) + + def modality_config(self) -> dict[str, ModalityConfig]: + video_modality = ModalityConfig( + delta_indices=self.observation_indices, + modality_keys=self.video_keys, + ) + + state_modality = ModalityConfig( + delta_indices=self.observation_indices, + modality_keys=self.state_keys, + ) + + action_modality = ModalityConfig( + delta_indices=self.action_indices, + modality_keys=self.action_keys, + ) + + language_modality = ModalityConfig( + delta_indices=self.observation_indices, + modality_keys=self.language_keys, + ) + + modality_configs = { + "video": video_modality, + "state": state_modality, + "action": action_modality, + "language": language_modality, + } + + return modality_configs + + def transform(self) -> ModalityTransform: + transforms = [ + # video transforms + VideoToTensor(apply_to=self.video_keys), + VideoCrop(apply_to=self.video_keys, scale=0.95), + VideoResize(apply_to=self.video_keys, height=224, width=224, interpolation="linear"), + VideoColorJitter( + apply_to=self.video_keys, + brightness=0.3, + contrast=0.4, + saturation=0.5, + hue=0.08, + ), + VideoToNumpy(apply_to=self.video_keys), + # state transforms + StateActionToTensor(apply_to=self.state_keys), + StateActionTransform( + apply_to=self.state_keys, + normalization_modes={key: "min_max" for key in self.state_keys}, + ), + # action transforms + StateActionToTensor(apply_to=self.action_keys), + StateActionTransform( + apply_to=self.action_keys, + normalization_modes={key: "min_max" for key in self.action_keys}, + ), + # concat transforms + ConcatTransform( + video_concat_order=self.video_keys, + state_concat_order=self.state_keys, + action_concat_order=self.action_keys, + ), + # model-specific transform + GR00TTransform( + state_horizon=len(self.observation_indices), + action_horizon=len(self.action_indices), + max_state_dim=64, + max_action_dim=32, + ), + ] + return ComposedModalityTransform(transforms=transforms) + + +class WidowXDataConfig_chunk1(BaseDataConfig): + video_keys = ["video.image"] + state_keys = ["state.ee_pos", "state.ee_rot", "state.gripper"] + action_keys = ["action.delta_ee_pos", "action.delta_ee_rot", "action.gripper"] + language_keys = ["annotation.human.task_description"] + observation_indices = [0] + action_indices = list(range(1)) + + def modality_config(self) -> dict[str, ModalityConfig]: + video_modality = ModalityConfig( + delta_indices=self.observation_indices, + modality_keys=self.video_keys, + ) + + state_modality = ModalityConfig( + delta_indices=self.observation_indices, + modality_keys=self.state_keys, + ) + + action_modality = ModalityConfig( + delta_indices=self.action_indices, + modality_keys=self.action_keys, + ) + language_modality = ModalityConfig( + delta_indices=self.observation_indices, + modality_keys=self.language_keys, + ) + + modality_configs = { + "video": video_modality, + "state": state_modality, + "action": action_modality, + "language": language_modality, + } + + return modality_configs + + def transform(self) -> ModalityTransform: + transforms = [ + # video transforms + VideoToTensor(apply_to=self.video_keys), + VideoCrop(apply_to=self.video_keys, scale=0.95), + VideoResize(apply_to=self.video_keys, height=224, width=224, interpolation="linear"), + VideoColorJitter( + apply_to=self.video_keys, + brightness=0.3, + contrast=0.4, + saturation=0.5, + hue=0.08, + ), + VideoToNumpy(apply_to=self.video_keys), + # state transforms + 
StateActionToTensor(apply_to=self.state_keys), + StateActionTransform( + apply_to=self.state_keys, + normalization_modes={key: "min_max" for key in self.state_keys}, + ), + # action transforms + StateActionToTensor(apply_to=self.action_keys), + StateActionTransform( + apply_to=self.action_keys, + normalization_modes={key: "min_max" for key in self.action_keys}, + ), + # concat transforms + ConcatTransform( + video_concat_order=self.video_keys, + state_concat_order=self.state_keys, + action_concat_order=self.action_keys, + ), + # model-specific transform + GR00TTransform( + state_horizon=len(self.observation_indices), + action_horizon=len(self.action_indices), + max_state_dim=64, + max_action_dim=32, + ), + ] + return ComposedModalityTransform(transforms=transforms) + +########################################################################################### + +DATA_CONFIG_MAP = { + "fourier_gr1_arms_waist": FourierGr1ArmsWaistDataConfig(), + "fourier_gr1_arms_only": FourierGr1ArmsOnlyDataConfig(), + "fourier_gr1_full_upper_body": FourierGr1FullUpperBodyDataConfig(), + "bimanual_panda_gripper": BimanualPandaGripperDataConfig(), + "bimanual_panda_hand": BimanualPandaHandDataConfig(), + "single_panda_gripper": SinglePandaGripperDataConfig(), + "so100": So100DataConfig(), + "so100_dualcam": So100DualCamDataConfig(), + "unitree_g1": UnitreeG1DataConfig(), + "unitree_g1_full_body": UnitreeG1FullBodyDataConfig(), + "oxe_droid": OxeDroidDataConfig(), + "agibot_genie1": AgibotGenie1DataConfig(), + "google_robot": GoogleRobotDataConfig_chunk1(), + "widowx": WidowXDataConfig_chunk1(), +} diff --git a/scripts/gr00t/embodiment_tags.py b/scripts/gr00t/embodiment_tags.py new file mode 100644 index 00000000..d11608bd --- /dev/null +++ b/scripts/gr00t/embodiment_tags.py @@ -0,0 +1,56 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from enum import Enum + + +class EmbodimentTag(Enum): + GR1 = "gr1" + """ + The GR1 dataset. + """ + + OXE_DROID = "oxe_droid" + """ + The OxE Droid dataset. + """ + + AGIBOT_GENIE1 = "agibot_genie1" + """ + The AgiBot Genie-1 with gripper dataset. + """ + + NEW_EMBODIMENT = "new_embodiment" + """ + Any new embodiment for finetuning. + """ + GOOGLE_ROBOT = "google_robot" + """ + The Google Robot dataset. + """ + WIDOWX = "widowx" + """ + The Widow-X 250 dataset. 
+ """ + +# Embodiment tag string: to projector index in the Action Expert Module +EMBODIMENT_TAG_MAPPING = { + EmbodimentTag.NEW_EMBODIMENT.value: 31, + EmbodimentTag.OXE_DROID.value: 17, + EmbodimentTag.AGIBOT_GENIE1.value: 26, + EmbodimentTag.GR1.value: 24, + EmbodimentTag.GOOGLE_ROBOT.value: 13, + EmbodimentTag.WIDOWX.value: 15, +} diff --git a/scripts/gr00t/evaluate_bridge.py b/scripts/gr00t/evaluate_bridge.py new file mode 100644 index 00000000..460460a5 --- /dev/null +++ b/scripts/gr00t/evaluate_bridge.py @@ -0,0 +1,45 @@ +import argparse + +from simpler_env.evaluation.bridge_tasks import ( + widowx_task1_pick_object, + widowx_task2_stack_cube, + widowx_task3_put_object_on_top, + widowx_task4_put_object_in_basket +) +from simpler_env.policies.gr00t.gr00t_model import Gr00tInference + + +def parse_args(): + parser = argparse.ArgumentParser(description="Run Comprehensive ManiSkill2 Evaluation") + parser.add_argument("--ckpt-path", type=str, required=True, help="Path to the checkpoint to evaluate.") + parser.add_argument("--control-freq", type=int, default=5, help="Set control frequency (default->5)") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + ckpt_path = args.ckpt_path + + policy = Gr00tInference(saved_model_path=ckpt_path, policy_setup="widowx_bridge", action_scale=1.0) + + print("Policy initialized. Starting evaluation...") + + tasks = [ + widowx_task1_pick_object, + widowx_task2_stack_cube, + widowx_task3_put_object_on_top, + widowx_task4_put_object_in_basket + ] + + final_scores = [] + episode_counter = 0 + for task in tasks: + cur_scores = task( + env_policy=policy, ckpt_path=args.ckpt_path, episode_id=episode_counter, control_freq=args.control_freq + ) + final_scores += cur_scores + episode_counter += 1 + + print("\nEvaluation finished.") + print(f"Final calculated scores: {final_scores}") + diff --git a/scripts/gr00t/evaluate_fractal.py b/scripts/gr00t/evaluate_fractal.py new file mode 100644 index 00000000..94eb75e4 --- /dev/null +++ b/scripts/gr00t/evaluate_fractal.py @@ -0,0 +1,25 @@ +import argparse + +from simpler_env.evaluation.fractal_tasks import run_comprehensive_evaluation +from simpler_env.policies.gr00t.gr00t_model import Gr00tInference + + +def parse_args(): + parser = argparse.ArgumentParser(description="Run Comprehensive ManiSkill2 Evaluation") + parser.add_argument("--ckpt-path", type=str, required=True, help="Path to the checkpoint to evaluate.") + parser.add_argument("--control-freq", type=int, default=3, help="Set control frequency (default->3)") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + ckpt_path = args.ckpt_path + + policy = Gr00tInference(saved_model_path=ckpt_path, policy_setup="google_robot", action_scale=1.0) + + print("Policy initialized. Starting evaluation...") + + final_scores = run_comprehensive_evaluation(env_policy=policy, ckpt_path=args.ckpt_path, control_freq=args.control_freq) + + print("\nEvaluation finished.") + print(f"Final calculated scores: {final_scores}") diff --git a/scripts/hsr_openpi/README.md b/scripts/hsr_openpi/README.md new file mode 100644 index 00000000..fde4b226 --- /dev/null +++ b/scripts/hsr_openpi/README.md @@ -0,0 +1,15 @@ + +```bash +git clone https://github.com/airoa-org/hsr_openpi.git +cd hsr_openpi +git checkout remotes/origin/release +GIT_LFS_SKIP_SMUDGE=1 UV_PROJECT_ENVIRONMENT=../scripts/hsr_openpi/.venv uv sync +GIT_LFS_SKIP_SMUDGE=1 uv pip install -e . +cd .. 
+ +source $(pwd)/scripts/hsr_openpi/.venv/bin/activate +uv pip install -e . ".[torch]" + + +python scripts/hsr_openpi/evaluate_fractal.py --ckpt-path /data/checkpoints/pi0_fractal_low_mem_finetune2/my_experiment/17000 +``` \ No newline at end of file diff --git a/scripts/hsr_openpi/evaluate_fractal.py b/scripts/hsr_openpi/evaluate_fractal.py new file mode 100644 index 00000000..bc113acb --- /dev/null +++ b/scripts/hsr_openpi/evaluate_fractal.py @@ -0,0 +1,39 @@ +import argparse + +from openpi_client import action_chunk_broker + +from openpi.policies import policy_config as _policy_config +from openpi.training import config as _config +from simpler_env.evaluation.fractal_tasks import run_comprehensive_evaluation +from simpler_env.policies.adapter import AiroaToSimplerFractalAdapter +from simpler_env.policies.hsr_openpi.pi0_or_fast import OpenpiToAiroaPolicy + + +def parse_args(): + parser = argparse.ArgumentParser(description="Run Comprehensive ManiSkill2 Evaluation") + parser.add_argument("--ckpt-path", type=str, required=True, help="Path to the checkpoint to evaluate.") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + ckpt_path = args.ckpt_path + + policy = OpenpiToAiroaPolicy( + policy=action_chunk_broker.ActionChunkBroker( + policy=_policy_config.create_trained_policy( + _config.get_config("pi0_fractal_low_mem_finetune"), + ckpt_path, + ), + action_horizon=10, + ), + ) + + env_policy = AiroaToSimplerFractalAdapter(policy=policy) + + print("Policy initialized. Starting evaluation...") + + final_scores = run_comprehensive_evaluation(env_policy=env_policy, ckpt_path=args.ckpt_path) + + print("\nEvaluation finished.") + print(f"Final calculated scores: {final_scores}") diff --git a/scripts/lerobotpi/README.md b/scripts/lerobotpi/README.md new file mode 100644 index 00000000..6f1b7bca --- /dev/null +++ b/scripts/lerobotpi/README.md @@ -0,0 +1,16 @@ + +```bash +uv venv -p 3.10 scripts/lerobotpi/.venv + + +# uv pip install -e . +source $(pwd)/scripts/lerobotpi/.venv/bin/activate +uv pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cu124 +uv pip install git+https://github.com/huggingface/lerobot.git@ce2b9724bfe1b5a4c45e61b1890eef3f5ab0909c#egg=lerobot[pi0] +uv pip install -e . ".[torch]" +uv pip install pytest + +huggingface-cli login +CUDA_VISIBLE_DEVICES=1 python scripts/lerobotpi/evaluate_fractal.py --ckpt-path HaomingSong/lerobot-pi0-fractal +CUDA_VISIBLE_DEVICES=1 python scripts/lerobotpi/evaluate_fractal.py --ckpt-path lerobot/pi0 +``` \ No newline at end of file diff --git a/scripts/lerobotpi/evaluate_fractal.py b/scripts/lerobotpi/evaluate_fractal.py new file mode 100644 index 00000000..4fb77a82 --- /dev/null +++ b/scripts/lerobotpi/evaluate_fractal.py @@ -0,0 +1,24 @@ +import argparse + +from simpler_env.evaluation.fractal_tasks import run_comprehensive_evaluation +from simpler_env.policies.lerobotpi.pi0_or_fast import LerobotPiFastInference + + +def parse_args(): + parser = argparse.ArgumentParser(description="Run Comprehensive ManiSkill2 Evaluation") + parser.add_argument("--ckpt-path", type=str, required=True, help="Path to the checkpoint to evaluate.") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + ckpt_path = args.ckpt_path + + policy = LerobotPiFastInference(saved_model_path=ckpt_path, policy_setup="google_robot", action_scale=1.0) + + print("Policy initialized. 
Starting evaluation...") + + final_scores = run_comprehensive_evaluation(env_policy=policy, ckpt_path=args.ckpt_path) + + print("\nEvaluation finished.") + print(f"Final calculated scores: {final_scores}") diff --git a/scripts/misc/octo_drawer_variant_agg_alt_urdf.sh b/scripts/misc/octo_drawer_variant_agg_alt_urdf.sh deleted file mode 100644 index 9440be5d..00000000 --- a/scripts/misc/octo_drawer_variant_agg_alt_urdf.sh +++ /dev/null @@ -1,87 +0,0 @@ -# shader_dir=rt means that we turn on ray-tracing rendering; this is quite crucial for the open / close drawer task as policies often rely on shadows to infer depth - - - -declare -a policy_models=( -"octo-base" -) - -declare -a env_names=( -OpenTopDrawerCustomInScene-v0 -OpenMiddleDrawerCustomInScene-v0 -OpenBottomDrawerCustomInScene-v0 -CloseTopDrawerCustomInScene-v0 -CloseMiddleDrawerCustomInScene-v0 -CloseBottomDrawerCustomInScene-v0 -) - -urdf_version=recolor_sim - -EXTRA_ARGS="--enable-raytracing --additional-env-build-kwargs urdf_version=${urdf_version}" - - -# base setup -scene_name=frl_apartment_stage_simple - -EvalSim() { - echo ${policy_model} ${env_name} - - python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ - --env-name ${env_name} --scene-name ${scene_name} \ - --robot-init-x 0.65 0.85 3 --robot-init-y -0.2 0.2 3 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0.0 0.0 1 \ - --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \ - ${EXTRA_ARGS} -} - - -for policy_model in "${policy_models[@]}"; do - for env_name in "${env_names[@]}"; do - EvalSim - done -done - - -# backgrounds - -declare -a scene_names=( -"modern_bedroom_no_roof" -"modern_office_no_roof" -) - -for scene_name in "${scene_names[@]}"; do - for policy_model in "${policy_models[@]}"; do - for env_name in "${env_names[@]}"; do - EXTRA_ARGS="--additional-env-build-kwargs shader_dir=rt urdf_version=${urdf_version}" - EvalSim - done - done -done - - -# lightings -scene_name=frl_apartment_stage_simple - -for policy_model in "${policy_models[@]}"; do - for env_name in "${env_names[@]}"; do - EXTRA_ARGS="--additional-env-build-kwargs shader_dir=rt urdf_version=${urdf_version} light_mode=brighter" - EvalSim - EXTRA_ARGS="--additional-env-build-kwargs shader_dir=rt urdf_version=${urdf_version} light_mode=darker" - EvalSim - done -done - - -# new cabinets -scene_name=frl_apartment_stage_simple - -for policy_model in "${policy_models[@]}"; do - for env_name in "${env_names[@]}"; do - EXTRA_ARGS="--additional-env-build-kwargs shader_dir=rt urdf_version=${urdf_version} station_name=mk_station2" - EvalSim - EXTRA_ARGS="--additional-env-build-kwargs shader_dir=rt urdf_version=${urdf_version} station_name=mk_station3" - EvalSim - done -done diff --git a/scripts/misc/octo_move_near_variant_agg_alt_urdf.sh b/scripts/misc/octo_move_near_variant_agg_alt_urdf.sh deleted file mode 100644 index 11c1aa09..00000000 --- a/scripts/misc/octo_move_near_variant_agg_alt_urdf.sh +++ /dev/null @@ -1,144 +0,0 @@ - -gpu_id=0 - -declare -a arr=("octo-base") -urdf_version=recolor_sim - -for policy_model in "${arr[@]}"; do echo "$policy_model"; done - - -# base setup - -env_name=MoveNearGoogleInScene-v0 -scene_name=google_pick_coke_can_1_v4 - -for policy_model in "${arr[@]}"; - -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static \ - --control-freq 3 
--sim-freq 513 --max-episode-steps 80 \ - --env-name ${env_name} --scene-name ${scene_name} \ - --robot-init-x 0.35 0.35 1 --robot-init-y 0.21 0.21 1 --obj-variation-mode episode --obj-episode-range 0 60 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.09 -0.09 1 \ - --additional-env-build-kwargs urdf_version=${urdf_version}; - -done - - - -# distractor - -for policy_model in "${arr[@]}"; - -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ - --env-name ${env_name} --scene-name ${scene_name} \ - --robot-init-x 0.35 0.35 1 --robot-init-y 0.21 0.21 1 --obj-variation-mode episode --obj-episode-range 0 60 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.09 -0.09 1 \ - --additional-env-build-kwargs no_distractor=True urdf_version=${urdf_version}; - -done - - -# backgrounds - -env_name=MoveNearGoogleInScene-v0 -declare -a scene_arr=("google_pick_coke_can_1_v4_alt_background" \ - "google_pick_coke_can_1_v4_alt_background_2") - -for scene_name in "${scene_arr[@]}"; - -do for policy_model in "${arr[@]}"; - -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ - --env-name ${env_name} --scene-name ${scene_name} \ - --robot-init-x 0.35 0.35 1 --robot-init-y 0.21 0.21 1 --obj-variation-mode episode --obj-episode-range 0 60 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.09 -0.09 1 \ - --additional-env-build-kwargs urdf_version=${urdf_version}; - -done - -done - - - - - -# lighting - -env_name=MoveNearGoogleInScene-v0 -scene_name=google_pick_coke_can_1_v4 - -for policy_model in "${arr[@]}"; - -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ - --env-name ${env_name} --scene-name ${scene_name} \ - --robot-init-x 0.35 0.35 1 --robot-init-y 0.21 0.21 1 --obj-variation-mode episode --obj-episode-range 0 60 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.09 -0.09 1 \ - --additional-env-build-kwargs slightly_darker_lighting=True urdf_version=${urdf_version}; - -CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ - --env-name ${env_name} --scene-name ${scene_name} \ - --robot-init-x 0.35 0.35 1 --robot-init-y 0.21 0.21 1 --obj-variation-mode episode --obj-episode-range 0 60 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.09 -0.09 1 \ - --additional-env-build-kwargs slightly_brighter_lighting=True urdf_version=${urdf_version}; - -done - - - - - -# table textures - -env_name=MoveNearGoogleInScene-v0 -declare -a scene_arr=("Baked_sc1_staging_objaverse_cabinet1_h870" \ - "Baked_sc1_staging_objaverse_cabinet2_h870") - -for scene_name in "${scene_arr[@]}"; - -do for policy_model in "${arr[@]}"; - -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ - 
--env-name ${env_name} --scene-name ${scene_name} \ - --robot-init-x 0.35 0.35 1 --robot-init-y 0.21 0.21 1 --obj-variation-mode episode --obj-episode-range 0 60 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.09 -0.09 1 \ - --additional-env-build-kwargs slightly_brighter_lighting=True urdf_version=${urdf_version}; - -done - -done - - - - -# camera orientations - -declare -a env_arr=("MoveNearAltGoogleCameraInScene-v0" \ - "MoveNearAltGoogleCamera2InScene-v0") -scene_name=google_pick_coke_can_1_v4 - -for env_name in "${env_arr[@]}"; - -do for policy_model in "${arr[@]}"; - -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ - --env-name ${env_name} --scene-name ${scene_name} \ - --robot-init-x 0.35 0.35 1 --robot-init-y 0.21 0.21 1 --obj-variation-mode episode --obj-episode-range 0 60 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.09 -0.09 1 \ - --additional-env-build-kwargs slightly_brighter_lighting=True urdf_version=${urdf_version}; - -done - -done diff --git a/scripts/misc/octo_pick_coke_can_variant_agg_alt_urdf.sh b/scripts/misc/octo_pick_coke_can_variant_agg_alt_urdf.sh deleted file mode 100644 index 5d7b3dc5..00000000 --- a/scripts/misc/octo_pick_coke_can_variant_agg_alt_urdf.sh +++ /dev/null @@ -1,183 +0,0 @@ - -gpu_id=0 - -declare -a arr=("octo-base") - -# lr_switch=laying horizontally but flipped left-right to match real eval; upright=standing; laid_vertically=laying vertically -declare -a coke_can_options_arr=("lr_switch=True" "upright=True" "laid_vertically=True") -urdf_version=recolor_sim -# recolor_sim - -for policy_model in "${arr[@]}"; do echo "$policy_model"; done - - -# base setup - -env_name=GraspSingleOpenedCokeCanInScene-v0 -scene_name=google_pick_coke_can_1_v4 - -for coke_can_option in "${coke_can_options_arr[@]}"; - -do for policy_model in "${arr[@]}"; - -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ - --env-name ${env_name} --scene-name ${scene_name} \ - --robot-init-x 0.35 0.35 1 --robot-init-y 0.20 0.20 1 --obj-init-x -0.35 -0.12 5 --obj-init-y -0.02 0.42 5 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ - --additional-env-build-kwargs ${coke_can_option} urdf_version=${urdf_version}; - -done - -done - - - -# table textures - -env_name=GraspSingleOpenedCokeCanInScene-v0 - -declare -a scene_arr=("Baked_sc1_staging_objaverse_cabinet1_h870" \ - "Baked_sc1_staging_objaverse_cabinet2_h870") - -for coke_can_option in "${coke_can_options_arr[@]}"; - -do for scene_name in "${scene_arr[@]}"; - -do for policy_model in "${arr[@]}"; - -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ - --env-name ${env_name} --scene-name ${scene_name} \ - --robot-init-x 0.35 0.35 1 --robot-init-y 0.20 0.20 1 --obj-init-x -0.35 -0.12 5 --obj-init-y -0.02 0.42 5 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ - --additional-env-build-kwargs ${coke_can_option} urdf_version=${urdf_version}; - -done - -done - -done - - - - -# distractors - 
-env_name=GraspSingleOpenedCokeCanDistractorInScene-v0 -scene_name=google_pick_coke_can_1_v4 - -for coke_can_option in "${coke_can_options_arr[@]}"; - -do for policy_model in "${arr[@]}"; - -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ - --env-name ${env_name} --scene-name ${scene_name} \ - --robot-init-x 0.35 0.35 1 --robot-init-y 0.20 0.20 1 --obj-init-x -0.35 -0.12 5 --obj-init-y -0.02 0.42 5 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ - --additional-env-build-kwargs ${coke_can_option} urdf_version=${urdf_version}; - -CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ - --env-name ${env_name} --scene-name ${scene_name} \ - --robot-init-x 0.35 0.35 1 --robot-init-y 0.20 0.20 1 --obj-init-x -0.35 -0.12 5 --obj-init-y -0.02 0.42 5 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ - --additional-env-build-kwargs ${coke_can_option} urdf_version=${urdf_version} distractor_config=more; - -done - -done - - - - -# backgrounds - -env_name=GraspSingleOpenedCokeCanInScene-v0 -declare -a scene_arr=("google_pick_coke_can_1_v4_alt_background" \ - "google_pick_coke_can_1_v4_alt_background_2") - -for coke_can_option in "${coke_can_options_arr[@]}"; - -do for scene_name in "${scene_arr[@]}"; - -do for policy_model in "${arr[@]}"; - -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ - --env-name ${env_name} --scene-name ${scene_name} \ - --robot-init-x 0.35 0.35 1 --robot-init-y 0.20 0.20 1 --obj-init-x -0.35 -0.12 5 --obj-init-y -0.02 0.42 5 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ - --additional-env-build-kwargs ${coke_can_option} urdf_version=${urdf_version}; - -done - -done - -done - - - -# lightings - -env_name=GraspSingleOpenedCokeCanInScene-v0 -scene_name=google_pick_coke_can_1_v4 - -for coke_can_option in "${coke_can_options_arr[@]}"; - -do for policy_model in "${arr[@]}"; - -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ - --env-name ${env_name} --scene-name ${scene_name} \ - --robot-init-x 0.35 0.35 1 --robot-init-y 0.20 0.20 1 --obj-init-x -0.35 -0.12 5 --obj-init-y -0.02 0.42 5 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ - --additional-env-build-kwargs ${coke_can_option} urdf_version=${urdf_version} slightly_darker_lighting=True; - -CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ - --env-name ${env_name} --scene-name ${scene_name} \ - --robot-init-x 0.35 0.35 1 --robot-init-y 0.20 0.20 1 --obj-init-x -0.35 -0.12 5 --obj-init-y -0.02 0.42 5 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ - --additional-env-build-kwargs ${coke_can_option} urdf_version=${urdf_version} 
slightly_brighter_lighting=True; - -done - -done - - - - -# camera orientations - -declare -a env_arr=("GraspSingleOpenedCokeCanAltGoogleCameraInScene-v0" \ - "GraspSingleOpenedCokeCanAltGoogleCamera2InScene-v0") -scene_name=google_pick_coke_can_1_v4 - -for coke_can_option in "${coke_can_options_arr[@]}"; - -do for env_name in "${env_arr[@]}"; - -do for policy_model in "${arr[@]}"; - -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ - --env-name ${env_name} --scene-name ${scene_name} \ - --robot-init-x 0.35 0.35 1 --robot-init-y 0.20 0.20 1 --obj-init-x -0.35 -0.12 5 --obj-init-y -0.02 0.42 5 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ - --additional-env-build-kwargs ${coke_can_option} urdf_version=${urdf_version}; - -done - -done - -done diff --git a/scripts/octo_drawer_visual_matching.sh b/scripts/octo_drawer_visual_matching.sh deleted file mode 100644 index 41e8c893..00000000 --- a/scripts/octo_drawer_visual_matching.sh +++ /dev/null @@ -1,135 +0,0 @@ -# shader_dir=rt means that we turn on ray-tracing rendering; this is quite crucial for the open / close drawer task as policies often rely on shadows to infer depth - - - -declare -a policy_models=( -"octo-base" -# "octo-server" -) - -declare -a env_names=( -OpenTopDrawerCustomInScene-v0 -OpenMiddleDrawerCustomInScene-v0 -OpenBottomDrawerCustomInScene-v0 -CloseTopDrawerCustomInScene-v0 -CloseMiddleDrawerCustomInScene-v0 -CloseBottomDrawerCustomInScene-v0 -) - -# URDF variations -declare -a urdf_version_arr=("recolor_cabinet_visual_matching_1" "recolor_tabletop_visual_matching_1" "recolor_tabletop_visual_matching_2" None) - -for urdf_version in "${urdf_version_arr[@]}"; do - -EXTRA_ARGS="--enable-raytracing --additional-env-build-kwargs station_name=mk_station_recolor light_mode=simple disable_bad_material=True urdf_version=${urdf_version}" - -EvalOverlay() { -# A0 -python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ - --env-name ${env_name} --scene-name dummy_drawer \ - --robot-init-x 0.644 0.644 1 --robot-init-y -0.179 -0.179 1 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.03 -0.03 1 \ - --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \ - --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_a0.png \ - ${EXTRA_ARGS} - -# A1 -python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ - --env-name ${env_name} --scene-name dummy_drawer \ - --robot-init-x 0.765 0.765 1 --robot-init-y -0.182 -0.182 1 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.02 -0.02 1 \ - --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \ - --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_a1.png \ - ${EXTRA_ARGS} - -# A2 -python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ - --env-name ${env_name} --scene-name dummy_drawer \ - --robot-init-x 0.889 0.889 1 --robot-init-y -0.203 -0.203 1 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.06 -0.06 1 \ - 
--obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \ - --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_a2.png \ - ${EXTRA_ARGS} - -# B0 -python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ - --env-name ${env_name} --scene-name dummy_drawer \ - --robot-init-x 0.652 0.652 1 --robot-init-y 0.009 0.009 1 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ - --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \ - --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_b0.png \ - ${EXTRA_ARGS} - -# B1 -python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ - --env-name ${env_name} --scene-name dummy_drawer \ - --robot-init-x 0.752 0.752 1 --robot-init-y 0.009 0.009 1 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ - --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \ - --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_b1.png \ - ${EXTRA_ARGS} - -# B2 -python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ - --env-name ${env_name} --scene-name dummy_drawer \ - --robot-init-x 0.851 0.851 1 --robot-init-y 0.035 0.035 1 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ - --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \ - --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_b2.png \ - ${EXTRA_ARGS} - -# C0 -python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ - --env-name ${env_name} --scene-name dummy_drawer \ - --robot-init-x 0.665 0.665 1 --robot-init-y 0.224 0.224 1 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ - --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \ - --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_c0.png \ - ${EXTRA_ARGS} - -# C1 -python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ - --env-name ${env_name} --scene-name dummy_drawer \ - --robot-init-x 0.765 0.765 1 --robot-init-y 0.222 0.222 1 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.025 -0.025 1 \ - --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \ - --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_c1.png \ - ${EXTRA_ARGS} - -# C2 -python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ - --env-name ${env_name} --scene-name dummy_drawer \ - --robot-init-x 0.865 0.865 1 --robot-init-y 0.222 0.222 1 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.025 -0.025 1 \ - --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \ - --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_c2.png \ - ${EXTRA_ARGS} -} - - -for policy_model in "${policy_models[@]}"; do - for env_name in "${env_names[@]}"; do - EvalOverlay - done 
-done - - -done diff --git a/scripts/octo_move_near_visual_matching.sh b/scripts/octo_move_near_visual_matching.sh deleted file mode 100644 index dd89caee..00000000 --- a/scripts/octo_move_near_visual_matching.sh +++ /dev/null @@ -1,48 +0,0 @@ - - -gpu_id=0 - -declare -a policy_models=( - "octo-base" - # "octo-server" -) - -env_name=MoveNearGoogleBakedTexInScene-v0 -# env_name=MoveNearGoogleBakedTexInScene-v1 -scene_name=google_pick_coke_can_1_v4 -rgb_overlay_path=./ManiSkill2_real2sim/data/real_inpainting/google_move_near_real_eval_1.png -# rgb_overlay_path=./ManiSkill2_real2sim/data/real_inpainting/google_move_near_real_eval_2.png - -# URDF variations -declare -a urdf_version_arr=(None "recolor_tabletop_visual_matching_1" "recolor_tabletop_visual_matching_2" "recolor_cabinet_visual_matching_1") - -for policy_model in "${policy_models[@]}"; do echo "$policy_model"; done - - -for urdf_version in "${urdf_version_arr[@]}"; - -do for policy_model in "${policy_models[@]}"; - -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ - --env-name ${env_name} --scene-name ${scene_name} \ - --rgb-overlay-path ${rgb_overlay_path} \ - --robot-init-x 0.35 0.35 1 --robot-init-y 0.21 0.21 1 --obj-variation-mode episode --obj-episode-range 0 60 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.09 -0.09 1 \ - --additional-env-build-kwargs urdf_version=${urdf_version} \ - --additional-env-save-tags baked_except_bpb_orange; # google_move_near_real_eval_1.png - -# do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ -# --robot google_robot_static \ -# --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ -# --env-name ${env_name} --scene-name ${scene_name} \ -# --rgb-overlay-path ${rgb_overlay_path} \ -# --robot-init-x 0.36 0.36 1 --robot-init-y 0.22 0.22 1 --obj-variation-mode episode --obj-episode-range 0 60 \ -# --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.028 -0.028 1 \ -# --additional-env-build-kwargs urdf_version=${urdf_version} \ -# --additional-env-save-tags baked_except_bpb_orange; # google_move_near_real_eval_2.png - -done - -done diff --git a/scripts/openpi/README.md b/scripts/openpi/README.md new file mode 100644 index 00000000..4d02babd --- /dev/null +++ b/scripts/openpi/README.md @@ -0,0 +1,19 @@ + +```bash +git clone https://github.com/Physical-Intelligence/openpi.git +cd openpi +GIT_LFS_SKIP_SMUDGE=1 UV_PROJECT_ENVIRONMENT=../scripts/openpi/.venv uv sync +source ../scripts/openpi/.venv/bin/activate +GIT_LFS_SKIP_SMUDGE=1 uv pip install -e . +cd .. + +source $(pwd)/scripts/openpi/.venv/bin/activate +uv pip install -e . 
".[torch]" + + +huggingface-cli download --resume-download --repo-type model HaomingSong/openpi0-fractal-lora --local-dir /data/checkpoints/openpi0-fractal-lora + +python scripts/openpi/evaluate_fractal2.py --ckpt-path /data/checkpoints/openpi0-fractal-lora +CUDA_VISIBLE_DEVICES=1 python scripts/openpi/evaluate_fractal.py --ckpt-path HaomingSong/openpi0-bridge-lora +``` + diff --git a/scripts/openpi/challenge_google_robot.py b/scripts/openpi/challenge_google_robot.py new file mode 100644 index 00000000..a180cbe0 --- /dev/null +++ b/scripts/openpi/challenge_google_robot.py @@ -0,0 +1,25 @@ +import argparse + +from simpler_env.evaluation.fractal_tasks import run_comprehensive_evaluation +from simpler_env.policies.openpi.pi0_or_fast import OpenPiFastInference + + +def parse_args(): + parser = argparse.ArgumentParser(description="Run Comprehensive ManiSkill2 Evaluation") + parser.add_argument("--ckpt-path", type=str, required=True, help="Path to the checkpoint to evaluate.") + parser.add_argument("--control-freq", type=int, default=3, help="Set control frequency (default->3)") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + ckpt_path = args.ckpt_path + + policy = OpenPiFastInference(saved_model_path=ckpt_path, policy_setup="google_robot", action_scale=1.0) + + print("Policy initialized. Starting evaluation...") + + final_scores = run_comprehensive_evaluation(env_policy=policy, ckpt_path=args.ckpt_path, control_freq=args.control_freq) + + print("\nEvaluation finished.") + print(f"Final calculated scores: {final_scores}") diff --git a/scripts/openpi/challenge_widowx.py b/scripts/openpi/challenge_widowx.py new file mode 100644 index 00000000..503997bc --- /dev/null +++ b/scripts/openpi/challenge_widowx.py @@ -0,0 +1,42 @@ +import argparse + +from simpler_env.evaluation.bridge_tasks import ( + widowx_task1_pick_object, + widowx_task2_stack_cube, + widowx_task3_put_object_on_top, + widowx_task4_put_object_in_basket +) +from simpler_env.policies.openpi.pi0_or_fast import OpenPiFastInference + + +def parse_args(): + parser = argparse.ArgumentParser(description="Run Comprehensive ManiSkill2 Evaluation") + parser.add_argument("--ckpt-path", type=str, required=True, help="Path to the checkpoint to evaluate.") + parser.add_argument("--control-freq", type=int, default=5, help="Set control frequency (default->5)") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + ckpt_path = args.ckpt_path + + policy = OpenPiFastInference(saved_model_path=ckpt_path, policy_setup="widowx_bridge", action_scale=1.0) + + print("Policy initialized. 
Starting evaluation...") + + tasks = [ + widowx_task1_pick_object, + widowx_task2_stack_cube, + widowx_task3_put_object_on_top, + widowx_task4_put_object_in_basket + ] + + final_scores = [] + for task in tasks: + cur_scores = task( + env_policy=policy, ckpt_path=args.ckpt_path, control_freq=args.control_freq + ) + final_scores += cur_scores + + print("\nEvaluation finished.") + print(f"Final calculated scores: {final_scores}") diff --git a/scripts/openpi/evaluate_bridge.py b/scripts/openpi/evaluate_bridge.py new file mode 100644 index 00000000..8874e48c --- /dev/null +++ b/scripts/openpi/evaluate_bridge.py @@ -0,0 +1,24 @@ +import argparse + +from simpler_env.evaluation.evaluate_bridge import run_comprehensive_evaluation_bridge +from simpler_env.policies.openpi.pi0_or_fast import OpenPiFastInference + + +def parse_args(): + parser = argparse.ArgumentParser(description="Run Comprehensive ManiSkill2 Evaluation") + parser.add_argument("--ckpt-path", type=str, required=True, help="Path to the checkpoint to evaluate.") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + ckpt_path = args.ckpt_path + + policy = OpenPiFastInference(saved_model_path=ckpt_path, policy_setup="widowx_bridge", action_scale=1.0) + + print("Policy initialized. Starting evaluation...") + + final_scores = run_comprehensive_evaluation_bridge(env_policy=policy, ckpt_path=args.ckpt_path) + + print("\nEvaluation finished.") + print(f"Final calculated scores: {final_scores}") diff --git a/scripts/openpi/evaluate_fractal.py b/scripts/openpi/evaluate_fractal.py new file mode 100644 index 00000000..45bcef89 --- /dev/null +++ b/scripts/openpi/evaluate_fractal.py @@ -0,0 +1,24 @@ +import argparse + +from simpler_env.evaluation.fractal_tasks import run_comprehensive_evaluation +from simpler_env.policies.openpi.pi0_or_fast import OpenPiFastInference + + +def parse_args(): + parser = argparse.ArgumentParser(description="Run Comprehensive ManiSkill2 Evaluation") + parser.add_argument("--ckpt-path", type=str, required=True, help="Path to the checkpoint to evaluate.") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + ckpt_path = args.ckpt_path + + policy = OpenPiFastInference(saved_model_path=ckpt_path, policy_setup="google_robot", action_scale=1.0) + + print("Policy initialized. 
Starting evaluation...") + + final_scores = run_comprehensive_evaluation(env_policy=policy, ckpt_path=args.ckpt_path) + + print("\nEvaluation finished.") + print(f"Final calculated scores: {final_scores}") diff --git a/scripts/openpi/evaluate_fractal2.py b/scripts/openpi/evaluate_fractal2.py new file mode 100644 index 00000000..48862b4d --- /dev/null +++ b/scripts/openpi/evaluate_fractal2.py @@ -0,0 +1,39 @@ +import argparse + +from openpi_client import action_chunk_broker + +from openpi.policies import policy_config as _policy_config +from openpi.training import config as _config +from simpler_env.evaluation.fractal_tasks import run_comprehensive_evaluation +from simpler_env.policies.adapter import AiroaToSimplerFractalAdapter +from simpler_env.policies.hsr_openpi.pi0_or_fast import OpenpiToAiroaPolicy + + +def parse_args(): + parser = argparse.ArgumentParser(description="Run Comprehensive ManiSkill2 Evaluation") + parser.add_argument("--ckpt-path", type=str, required=True, help="Path to the checkpoint to evaluate.") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + ckpt_path = args.ckpt_path + + policy = OpenpiToAiroaPolicy( + policy=action_chunk_broker.ActionChunkBroker( + policy=_policy_config.create_trained_policy( + _config.get_config("pi0_fractal_lora"), + ckpt_path, + ), + action_horizon=10, + ), + ) + + env_policy = AiroaToSimplerFractalAdapter(policy=policy) + + print("Policy initialized. Starting evaluation...") + + final_scores = run_comprehensive_evaluation(env_policy=env_policy, ckpt_path=args.ckpt_path) + + print("\nEvaluation finished.") + print(f"Final calculated scores: {final_scores}") diff --git a/scripts/openpi/task_bridge_1.py b/scripts/openpi/task_bridge_1.py new file mode 100644 index 00000000..2c01f79d --- /dev/null +++ b/scripts/openpi/task_bridge_1.py @@ -0,0 +1,27 @@ +import argparse + +from simpler_env.evaluation.bridge_tasks import widowx_task1_pick_object +from simpler_env.policies.openpi.pi0_or_fast import OpenPiFastInference + + +def parse_args(): + parser = argparse.ArgumentParser(description="Run Comprehensive ManiSkill2 Evaluation") + parser.add_argument("--ckpt-path", type=str, required=True, help="Path to the checkpoint to evaluate.") + parser.add_argument("--control-freq", type=int, default=5, help="Set control frequency (default->5)") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + ckpt_path = args.ckpt_path + + policy = OpenPiFastInference(saved_model_path=ckpt_path, policy_setup="widowx_bridge", action_scale=1.0) + + print("Policy initialized. 
Starting evaluation...") + + final_scores = widowx_task1_pick_object( + env_policy=policy, ckpt_path=args.ckpt_path, control_freq=args.control_freq + ) + + print("\nEvaluation finished.") + print(f"Final calculated scores: {final_scores}") diff --git a/scripts/openpi/task_bridge_2.py b/scripts/openpi/task_bridge_2.py new file mode 100644 index 00000000..e6761208 --- /dev/null +++ b/scripts/openpi/task_bridge_2.py @@ -0,0 +1,27 @@ +import argparse + +from simpler_env.evaluation.bridge_tasks import widowx_task2_stack_cube +from simpler_env.policies.openpi.pi0_or_fast import OpenPiFastInference + + +def parse_args(): + parser = argparse.ArgumentParser(description="Run Comprehensive ManiSkill2 Evaluation") + parser.add_argument("--ckpt-path", type=str, required=True, help="Path to the checkpoint to evaluate.") + parser.add_argument("--control-freq", type=int, default=5, help="Set control frequency (default->5)") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + ckpt_path = args.ckpt_path + + policy = OpenPiFastInference(saved_model_path=ckpt_path, policy_setup="widowx_bridge", action_scale=1.0) + + print("Policy initialized. Starting evaluation...") + + final_scores = widowx_task2_stack_cube( + env_policy=policy, ckpt_path=args.ckpt_path, control_freq=args.control_freq + ) + + print("\nEvaluation finished.") + print(f"Final calculated scores: {final_scores}") diff --git a/scripts/openpi/task_bridge_3.py b/scripts/openpi/task_bridge_3.py new file mode 100644 index 00000000..b7b7d5c0 --- /dev/null +++ b/scripts/openpi/task_bridge_3.py @@ -0,0 +1,27 @@ +import argparse + +from simpler_env.evaluation.bridge_tasks import widowx_task3_put_object_on_top +from simpler_env.policies.openpi.pi0_or_fast import OpenPiFastInference + + +def parse_args(): + parser = argparse.ArgumentParser(description="Run Comprehensive ManiSkill2 Evaluation") + parser.add_argument("--ckpt-path", type=str, required=True, help="Path to the checkpoint to evaluate.") + parser.add_argument("--control-freq", type=int, default=5, help="Set control frequency (default->5)") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + ckpt_path = args.ckpt_path + + policy = OpenPiFastInference(saved_model_path=ckpt_path, policy_setup="widowx_bridge", action_scale=1.0) + + print("Policy initialized. Starting evaluation...") + + final_scores = widowx_task3_put_object_on_top( + env_policy=policy, ckpt_path=args.ckpt_path, control_freq=args.control_freq + ) + + print("\nEvaluation finished.") + print(f"Final calculated scores: {final_scores}") diff --git a/scripts/openpi/task_bridge_4.py b/scripts/openpi/task_bridge_4.py new file mode 100644 index 00000000..8fa919e3 --- /dev/null +++ b/scripts/openpi/task_bridge_4.py @@ -0,0 +1,27 @@ +import argparse + +from simpler_env.evaluation.bridge_tasks import widowx_task4_put_object_in_basket +from simpler_env.policies.openpi.pi0_or_fast import OpenPiFastInference + + +def parse_args(): + parser = argparse.ArgumentParser(description="Run Comprehensive ManiSkill2 Evaluation") + parser.add_argument("--ckpt-path", type=str, required=True, help="Path to the checkpoint to evaluate.") + parser.add_argument("--control-freq", type=int, default=5, help="Set control frequency (default->5)") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + ckpt_path = args.ckpt_path + + policy = OpenPiFastInference(saved_model_path=ckpt_path, policy_setup="widowx_bridge", action_scale=1.0) + + print("Policy initialized. 
Starting evaluation...") + + final_scores = widowx_task4_put_object_in_basket( + env_policy=policy, ckpt_path=args.ckpt_path, control_freq=args.control_freq + ) + + print("\nEvaluation finished.") + print(f"Final calculated scores: {final_scores}") diff --git a/scripts/originals/bridge.sh b/scripts/originals/bridge.sh new file mode 100644 index 00000000..720460d5 --- /dev/null +++ b/scripts/originals/bridge.sh @@ -0,0 +1,53 @@ +# scripts/bridge.sh +ckpt_path=$1 +policy_model=$2 +action_ensemble_temp=$3 +logging_dir=$4 +gpu_id=$5 + + +scene_name=bridge_table_1_v1 +robot=widowx +rgb_overlay_path=ManiSkill2_real2sim/data/real_inpainting/bridge_real_eval_1.png +robot_init_x=0.147 +robot_init_y=0.028 + +CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} --action-ensemble-temp ${action_ensemble_temp} --logging-dir ${logging_dir} \ + --robot ${robot} --policy-setup widowx_bridge \ + --control-freq 5 --sim-freq 500 --max-episode-steps 60 \ + --env-name PutCarrotOnPlateInScene-v0 --scene-name ${scene_name} \ + --rgb-overlay-path ${rgb_overlay_path} \ + --robot-init-x ${robot_init_x} ${robot_init_x} 1 --robot-init-y ${robot_init_y} ${robot_init_y} 1 --obj-variation-mode episode --obj-episode-range 0 24 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1; + +CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} --action-ensemble-temp ${action_ensemble_temp} --logging-dir ${logging_dir} \ + --robot ${robot} --policy-setup widowx_bridge \ + --control-freq 5 --sim-freq 500 --max-episode-steps 60 \ + --env-name StackGreenCubeOnYellowCubeBakedTexInScene-v0 --scene-name ${scene_name} \ + --rgb-overlay-path ${rgb_overlay_path} \ + --robot-init-x ${robot_init_x} ${robot_init_x} 1 --robot-init-y ${robot_init_y} ${robot_init_y} 1 --obj-variation-mode episode --obj-episode-range 0 24 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1; + +CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} --action-ensemble-temp ${action_ensemble_temp} --logging-dir ${logging_dir} \ + --robot ${robot} --policy-setup widowx_bridge \ + --control-freq 5 --sim-freq 500 --max-episode-steps 60 \ + --env-name PutSpoonOnTableClothInScene-v0 --scene-name ${scene_name} \ + --rgb-overlay-path ${rgb_overlay_path} \ + --robot-init-x ${robot_init_x} ${robot_init_x} 1 --robot-init-y ${robot_init_y} ${robot_init_y} 1 --obj-variation-mode episode --obj-episode-range 0 24 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1; + + +scene_name=bridge_table_1_v2 +robot=widowx_sink_camera_setup +rgb_overlay_path=ManiSkill2_real2sim/data/real_inpainting/bridge_sink.png +robot_init_x=0.127 +robot_init_y=0.06 + +CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} --action-ensemble-temp ${action_ensemble_temp} --logging-dir ${logging_dir} \ + --robot ${robot} --policy-setup widowx_bridge \ + --control-freq 5 --sim-freq 500 --max-episode-steps 120 \ + --env-name PutEggplantInBasketScene-v0 --scene-name ${scene_name} \ + --rgb-overlay-path ${rgb_overlay_path} \ + --robot-init-x ${robot_init_x} ${robot_init_x} 1 --robot-init-y ${robot_init_y} ${robot_init_y} 1 --obj-variation-mode episode --obj-episode-range 0 24 \ + --robot-init-rot-quat-center 0 0 0 1 
--robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1; + diff --git a/scripts/octo_drawer_variant_agg.sh b/scripts/originals/drawer_variant_agg.sh similarity index 76% rename from scripts/octo_drawer_variant_agg.sh rename to scripts/originals/drawer_variant_agg.sh index a566b369..a29de36d 100644 --- a/scripts/octo_drawer_variant_agg.sh +++ b/scripts/originals/drawer_variant_agg.sh @@ -1,10 +1,12 @@ +# scripts/drawer_variant_agg.sh # shader_dir=rt means that we turn on ray-tracing rendering; this is quite crucial for the open / close drawer task as policies often rely on shadows to infer depth +ckpt_path=$1 +policy_model=$2 +action_ensemble_temp=$3 +logging_dir=$4 +gpu_id=$5 - - -declare -a policy_models=( -"octo-base" -) +declare -a ckpt_paths=(${ckpt_path}) declare -a env_names=( OpenTopDrawerCustomInScene-v0 @@ -22,9 +24,9 @@ EXTRA_ARGS="--enable-raytracing" scene_name=frl_apartment_stage_simple EvalSim() { - echo ${policy_model} ${env_name} + echo ${ckpt_path} ${env_name} - python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ + CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} --action-ensemble-temp ${action_ensemble_temp} --logging-dir ${logging_dir} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ --env-name ${env_name} --scene-name ${scene_name} \ @@ -35,7 +37,7 @@ EvalSim() { } -for policy_model in "${policy_models[@]}"; do +for ckpt_path in "${ckpt_paths[@]}"; do for env_name in "${env_names[@]}"; do EvalSim done @@ -50,7 +52,7 @@ declare -a scene_names=( ) for scene_name in "${scene_names[@]}"; do - for policy_model in "${policy_models[@]}"; do + for ckpt_path in "${ckpt_paths[@]}"; do for env_name in "${env_names[@]}"; do EXTRA_ARGS="--additional-env-build-kwargs shader_dir=rt" EvalSim @@ -62,7 +64,7 @@ done # lightings scene_name=frl_apartment_stage_simple -for policy_model in "${policy_models[@]}"; do +for ckpt_path in "${ckpt_paths[@]}"; do for env_name in "${env_names[@]}"; do EXTRA_ARGS="--additional-env-build-kwargs shader_dir=rt light_mode=brighter" EvalSim @@ -75,7 +77,7 @@ done # new cabinets scene_name=frl_apartment_stage_simple -for policy_model in "${policy_models[@]}"; do +for ckpt_path in "${ckpt_paths[@]}"; do for env_name in "${env_names[@]}"; do EXTRA_ARGS="--additional-env-build-kwargs shader_dir=rt station_name=mk_station2" EvalSim diff --git a/scripts/originals/drawer_visual_matching.sh b/scripts/originals/drawer_visual_matching.sh new file mode 100644 index 00000000..83655b1f --- /dev/null +++ b/scripts/originals/drawer_visual_matching.sh @@ -0,0 +1,136 @@ +# scripts/drawer_visual_matching.sh +# shader_dir=rt means that we turn on ray-tracing rendering; this is quite crucial for the open / close drawer task as policies often rely on shadows to infer depth +ckpt_path=$1 +policy_model=$2 +action_ensemble_temp=$3 +logging_dir=$4 +gpu_id=$5 + +declare -a ckpt_paths=($ckpt_path) + +declare -a env_names=( + OpenTopDrawerCustomInScene-v0 + OpenMiddleDrawerCustomInScene-v0 + OpenBottomDrawerCustomInScene-v0 + CloseTopDrawerCustomInScene-v0 + CloseMiddleDrawerCustomInScene-v0 + CloseBottomDrawerCustomInScene-v0 +) + +# URDF variations +declare -a urdf_version_arr=("recolor_cabinet_visual_matching_1" "recolor_tabletop_visual_matching_1" "recolor_tabletop_visual_matching_2" None) + +for urdf_version in "${urdf_version_arr[@]}"; do + + EXTRA_ARGS="--enable-raytracing --additional-env-build-kwargs station_name=mk_station_recolor 
light_mode=simple disable_bad_material=True urdf_version=${urdf_version}" + + EvalOverlay() { + # A0 + echo ${ckpt_path} ${env_name} ${urdf_version} + CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} --action-ensemble-temp ${action_ensemble_temp} --logging-dir ${logging_dir} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ + --env-name ${env_name} --scene-name dummy_drawer \ + --robot-init-x 0.644 0.644 1 --robot-init-y -0.179 -0.179 1 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.03 -0.03 1 \ + --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \ + --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_a0.png \ + ${EXTRA_ARGS} + + # A1 + CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} --action-ensemble-temp ${action_ensemble_temp} --logging-dir ${logging_dir} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ + --env-name ${env_name} --scene-name dummy_drawer \ + --robot-init-x 0.765 0.765 1 --robot-init-y -0.182 -0.182 1 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.02 -0.02 1 \ + --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \ + --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_a1.png \ + ${EXTRA_ARGS} + + # A2 + CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} --action-ensemble-temp ${action_ensemble_temp} --logging-dir ${logging_dir} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ + --env-name ${env_name} --scene-name dummy_drawer \ + --robot-init-x 0.889 0.889 1 --robot-init-y -0.203 -0.203 1 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.06 -0.06 1 \ + --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \ + --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_a2.png \ + ${EXTRA_ARGS} + + # B0 + CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} --action-ensemble-temp ${action_ensemble_temp} --logging-dir ${logging_dir} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ + --env-name ${env_name} --scene-name dummy_drawer \ + --robot-init-x 0.652 0.652 1 --robot-init-y 0.009 0.009 1 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ + --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \ + --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_b0.png \ + ${EXTRA_ARGS} + + # B1 + CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} --action-ensemble-temp ${action_ensemble_temp} --logging-dir ${logging_dir} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ + --env-name ${env_name} --scene-name dummy_drawer \ + --robot-init-x 0.752 0.752 1 --robot-init-y 0.009 0.009 1 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ + --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \ + --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_b1.png \ + ${EXTRA_ARGS} + + # B2 + CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py 
--policy-model ${policy_model} --ckpt-path ${ckpt_path} --action-ensemble-temp ${action_ensemble_temp} --logging-dir ${logging_dir} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ + --env-name ${env_name} --scene-name dummy_drawer \ + --robot-init-x 0.851 0.851 1 --robot-init-y 0.035 0.035 1 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ + --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \ + --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_b2.png \ + ${EXTRA_ARGS} + + # C0 + CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} --action-ensemble-temp ${action_ensemble_temp} --logging-dir ${logging_dir} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ + --env-name ${env_name} --scene-name dummy_drawer \ + --robot-init-x 0.665 0.665 1 --robot-init-y 0.224 0.224 1 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ + --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \ + --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_c0.png \ + ${EXTRA_ARGS} + + # C1 + CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} --action-ensemble-temp ${action_ensemble_temp} --logging-dir ${logging_dir} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ + --env-name ${env_name} --scene-name dummy_drawer \ + --robot-init-x 0.765 0.765 1 --robot-init-y 0.222 0.222 1 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.025 -0.025 1 \ + --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \ + --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_c1.png \ + ${EXTRA_ARGS} + + # C2 + CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} --action-ensemble-temp ${action_ensemble_temp} --logging-dir ${logging_dir} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ + --env-name ${env_name} --scene-name dummy_drawer \ + --robot-init-x 0.865 0.865 1 --robot-init-y 0.222 0.222 1 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.025 -0.025 1 \ + --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \ + --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_c2.png \ + ${EXTRA_ARGS} + } + + + for ckpt_path in "${ckpt_paths[@]}"; do + for env_name in "${env_names[@]}"; do + EvalOverlay + done + done + +done diff --git a/scripts/octo_move_near_variant_agg.sh b/scripts/originals/move_near_variant_agg.sh similarity index 74% rename from scripts/octo_move_near_variant_agg.sh rename to scripts/originals/move_near_variant_agg.sh index 43733968..55054ea6 100644 --- a/scripts/octo_move_near_variant_agg.sh +++ b/scripts/originals/move_near_variant_agg.sh @@ -1,9 +1,13 @@ +# scripts/move_near_variant_agg.sh +ckpt_path=$1 +policy_model=$2 +action_ensemble_temp=$3 +logging_dir=$4 +gpu_id=$5 -gpu_id=0 +declare -a arr=("${ckpt_path}") -declare -a arr=("octo-base") - -for policy_model in "${arr[@]}"; do echo "$policy_model"; done +for ckpt_path in "${arr[@]}"; do echo "$ckpt_path"; done # base setup @@ -11,9 +15,9 @@ for policy_model in "${arr[@]}"; do echo "$policy_model"; done env_name=MoveNearGoogleInScene-v0 scene_name=google_pick_coke_can_1_v4 
-for policy_model in "${arr[@]}"; +for ckpt_path in "${arr[@]}"; -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} --action-ensemble-temp ${action_ensemble_temp} --logging-dir ${logging_dir} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ --env-name ${env_name} --scene-name ${scene_name} \ @@ -26,9 +30,9 @@ done # distractor -for policy_model in "${arr[@]}"; +for ckpt_path in "${arr[@]}"; -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} --action-ensemble-temp ${action_ensemble_temp} --logging-dir ${logging_dir} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ --env-name ${env_name} --scene-name ${scene_name} \ @@ -47,9 +51,9 @@ declare -a scene_arr=("google_pick_coke_can_1_v4_alt_background" \ for scene_name in "${scene_arr[@]}"; -do for policy_model in "${arr[@]}"; +do for ckpt_path in "${arr[@]}"; -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} --action-ensemble-temp ${action_ensemble_temp} --logging-dir ${logging_dir} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ --env-name ${env_name} --scene-name ${scene_name} \ @@ -69,9 +73,9 @@ done env_name=MoveNearGoogleInScene-v0 scene_name=google_pick_coke_can_1_v4 -for policy_model in "${arr[@]}"; +for ckpt_path in "${arr[@]}"; -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} --action-ensemble-temp ${action_ensemble_temp} --logging-dir ${logging_dir} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ --env-name ${env_name} --scene-name ${scene_name} \ @@ -79,7 +83,7 @@ do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy- --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.09 -0.09 1 \ --additional-env-build-kwargs slightly_darker_lighting=True; -CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ +CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} --action-ensemble-temp ${action_ensemble_temp} --logging-dir ${logging_dir} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ --env-name ${env_name} --scene-name ${scene_name} \ @@ -101,9 +105,9 @@ declare -a scene_arr=("Baked_sc1_staging_objaverse_cabinet1_h870" \ for scene_name in "${scene_arr[@]}"; -do for policy_model in "${arr[@]}"; +do for ckpt_path in "${arr[@]}"; -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} 
--action-ensemble-temp ${action_ensemble_temp} --logging-dir ${logging_dir} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ --env-name ${env_name} --scene-name ${scene_name} \ @@ -125,9 +129,9 @@ scene_name=google_pick_coke_can_1_v4 for env_name in "${env_arr[@]}"; -do for policy_model in "${arr[@]}"; +do for ckpt_path in "${arr[@]}"; -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} --action-ensemble-temp ${action_ensemble_temp} --logging-dir ${logging_dir} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ --env-name ${env_name} --scene-name ${scene_name} \ diff --git a/scripts/originals/move_near_visual_matching.sh b/scripts/originals/move_near_visual_matching.sh new file mode 100644 index 00000000..eaee31b1 --- /dev/null +++ b/scripts/originals/move_near_visual_matching.sh @@ -0,0 +1,37 @@ +# scripts/move_near_visual_matching.sh +ckpt_path=$1 +policy_model=$2 +action_ensemble_temp=$3 +logging_dir=$4 +gpu_id=$5 + +declare -a arr=($ckpt_path) + +env_name=MoveNearGoogleBakedTexInScene-v0 +# env_name=MoveNearGoogleBakedTexInScene-v1 +scene_name=google_pick_coke_can_1_v4 +rgb_overlay_path=./ManiSkill2_real2sim/data/real_inpainting/google_move_near_real_eval_1.png + +# URDF variations +declare -a urdf_version_arr=(None "recolor_tabletop_visual_matching_1" "recolor_tabletop_visual_matching_2" "recolor_cabinet_visual_matching_1") + +for ckpt_path in "${arr[@]}"; do echo "$ckpt_path"; done + + +for urdf_version in "${urdf_version_arr[@]}"; + +do for ckpt_path in "${arr[@]}"; + +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} --action-ensemble-temp ${action_ensemble_temp} --logging-dir ${logging_dir} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ + --env-name ${env_name} --scene-name ${scene_name} \ + --rgb-overlay-path ${rgb_overlay_path} \ + --robot-init-x 0.35 0.35 1 --robot-init-y 0.21 0.21 1 --obj-variation-mode episode --obj-episode-range 0 60 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.09 -0.09 1 \ + --additional-env-build-kwargs urdf_version=${urdf_version} \ + --additional-env-save-tags baked_except_bpb_orange; + +done + +done diff --git a/scripts/octo_pick_coke_can_variant_agg.sh b/scripts/originals/pick_coke_can_variant_agg.sh similarity index 78% rename from scripts/octo_pick_coke_can_variant_agg.sh rename to scripts/originals/pick_coke_can_variant_agg.sh index c446b5d1..9be76965 100644 --- a/scripts/octo_pick_coke_can_variant_agg.sh +++ b/scripts/originals/pick_coke_can_variant_agg.sh @@ -1,12 +1,16 @@ +# scripts/pick_coke_can_variant_agg.sh +ckpt_path=$1 +policy_model=$2 +action_ensemble_temp=$3 +logging_dir=$4 +gpu_id=$5 -gpu_id=0 - -declare -a arr=("octo-base") +declare -a arr=($ckpt_path) # lr_switch=laying horizontally but flipped left-right to match real eval; upright=standing; laid_vertically=laying vertically declare -a coke_can_options_arr=("lr_switch=True" "upright=True" "laid_vertically=True") -for policy_model in "${arr[@]}"; do echo "$policy_model"; done +for ckpt_path in "${arr[@]}"; do echo "$ckpt_path"; done # base setup @@ -16,9 +20,9 @@ scene_name=google_pick_coke_can_1_v4 for coke_can_option in "${coke_can_options_arr[@]}"; -do 
for policy_model in "${arr[@]}"; +do for ckpt_path in "${arr[@]}"; -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} --action-ensemble-temp ${action_ensemble_temp} --logging-dir ${logging_dir} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ --env-name ${env_name} --scene-name ${scene_name} \ @@ -44,9 +48,9 @@ for coke_can_option in "${coke_can_options_arr[@]}"; do for scene_name in "${scene_arr[@]}"; -do for policy_model in "${arr[@]}"; +do for ckpt_path in "${arr[@]}"; -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} --action-ensemble-temp ${action_ensemble_temp} --logging-dir ${logging_dir} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ --env-name ${env_name} --scene-name ${scene_name} \ @@ -70,9 +74,9 @@ scene_name=google_pick_coke_can_1_v4 for coke_can_option in "${coke_can_options_arr[@]}"; -do for policy_model in "${arr[@]}"; +do for ckpt_path in "${arr[@]}"; -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} --action-ensemble-temp ${action_ensemble_temp} --logging-dir ${logging_dir} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ --env-name ${env_name} --scene-name ${scene_name} \ @@ -80,7 +84,7 @@ do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy- --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ --additional-env-build-kwargs ${coke_can_option}; -CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ +CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} --action-ensemble-temp ${action_ensemble_temp} --logging-dir ${logging_dir} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ --env-name ${env_name} --scene-name ${scene_name} \ @@ -105,9 +109,9 @@ for coke_can_option in "${coke_can_options_arr[@]}"; do for scene_name in "${scene_arr[@]}"; -do for policy_model in "${arr[@]}"; +do for ckpt_path in "${arr[@]}"; -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} --action-ensemble-temp ${action_ensemble_temp} --logging-dir ${logging_dir} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ --env-name ${env_name} --scene-name ${scene_name} \ @@ -130,9 +134,9 @@ scene_name=google_pick_coke_can_1_v4 for coke_can_option in "${coke_can_options_arr[@]}"; -do for policy_model in "${arr[@]}"; +do for ckpt_path in "${arr[@]}"; -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py 
--policy-model ${policy_model} --ckpt-path ${ckpt_path} --action-ensemble-temp ${action_ensemble_temp} --logging-dir ${logging_dir} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ --env-name ${env_name} --scene-name ${scene_name} \ @@ -140,7 +144,7 @@ do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy- --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ --additional-env-build-kwargs ${coke_can_option} slightly_darker_lighting=True; -CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ +CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} --action-ensemble-temp ${action_ensemble_temp} --logging-dir ${logging_dir} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ --env-name ${env_name} --scene-name ${scene_name} \ @@ -165,9 +169,9 @@ for coke_can_option in "${coke_can_options_arr[@]}"; do for env_name in "${env_arr[@]}"; -do for policy_model in "${arr[@]}"; +do for ckpt_path in "${arr[@]}"; -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} --action-ensemble-temp ${action_ensemble_temp} --logging-dir ${logging_dir} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ --env-name ${env_name} --scene-name ${scene_name} \ diff --git a/scripts/octo_pick_coke_can_visual_matching.sh b/scripts/originals/pick_coke_can_visual_matching.sh similarity index 70% rename from scripts/octo_pick_coke_can_visual_matching.sh rename to scripts/originals/pick_coke_can_visual_matching.sh index c633a339..f36e29bc 100644 --- a/scripts/octo_pick_coke_can_visual_matching.sh +++ b/scripts/originals/pick_coke_can_visual_matching.sh @@ -1,12 +1,12 @@ +# scripts/pick_coke_can_visual_matching.sh +ckpt_path=$1 +policy_model=$2 +action_ensemble_temp=$3 +logging_dir=$4 +gpu_id=$5 +declare -a arr=($ckpt_path) -gpu_id=0 - -declare -a policy_models=( - "octo-base" - # "octo-small" - # "octo-server" -) # lr_switch=laying horizontally but flipped left-right to match real eval; upright=standing; laid_vertically=laying vertically declare -a coke_can_options_arr=("lr_switch=True" "upright=True" "laid_vertically=True") @@ -18,18 +18,19 @@ env_name=GraspSingleOpenedCokeCanInScene-v0 scene_name=google_pick_coke_can_1_v4 rgb_overlay_path=./ManiSkill2_real2sim/data/real_inpainting/google_coke_can_real_eval_1.png -for policy_model in "${policy_models[@]}"; do echo "$policy_model"; done +for ckpt_path in "${arr[@]}"; do echo "$ckpt_path"; done -for urdf_version in "${urdf_version_arr[@]}"; -do for policy_model in "${policy_models[@]}"; +for urdf_version in "${urdf_version_arr[@]}"; do for coke_can_option in "${coke_can_options_arr[@]}"; -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static --policy-setup google_robot \ - --control-freq 3 --sim-freq 501 --max-episode-steps 80 \ +do for ckpt_path in "${arr[@]}"; + +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} --action-ensemble-temp ${action_ensemble_temp} --logging-dir ${logging_dir} \ + --robot 
google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ --env-name ${env_name} --scene-name ${scene_name} \ --rgb-overlay-path ${rgb_overlay_path} \ --robot-init-x 0.35 0.35 1 --robot-init-y 0.20 0.20 1 --obj-init-x -0.35 -0.12 5 --obj-init-y -0.02 0.42 5 \ diff --git a/scripts/octo_put_in_drawer_variant_agg.sh b/scripts/originals/put_in_drawer_variant_agg.sh similarity index 76% rename from scripts/octo_put_in_drawer_variant_agg.sh rename to scripts/originals/put_in_drawer_variant_agg.sh index b833a896..37a0036d 100644 --- a/scripts/octo_put_in_drawer_variant_agg.sh +++ b/scripts/originals/put_in_drawer_variant_agg.sh @@ -1,10 +1,12 @@ +# scripts/put_in_drawer_variant_agg.sh # shader_dir=rt means that we turn on ray-tracing rendering; this is quite crucial for the open / close drawer task as policies often rely on shadows to infer depth +ckpt_paths=($1) +policy_model=$2 +action_ensemble_temp=$3 +logging_dir=$4 +gpu_id=$5 - - -declare -a policy_models=( -"octo-base" -) +declare -a arr=($ckpt_path) declare -a env_names=( PlaceIntoClosedTopDrawerCustomInScene-v0 @@ -17,9 +19,9 @@ EXTRA_ARGS="--enable-raytracing --additional-env-build-kwargs model_ids=apple" scene_name=frl_apartment_stage_simple EvalSim() { - echo ${policy_model} ${env_name} + echo ${ckpt_path} ${env_name} - python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ + CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} --action-ensemble-temp ${action_ensemble_temp} --logging-dir ${logging_dir} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 200 \ --env-name ${env_name} --scene-name ${scene_name} \ @@ -30,7 +32,7 @@ EvalSim() { } -for policy_model in "${policy_models[@]}"; do +for ckpt_path in "${ckpt_paths[@]}"; do for env_name in "${env_names[@]}"; do EvalSim done @@ -45,7 +47,7 @@ declare -a scene_names=( ) for scene_name in "${scene_names[@]}"; do - for policy_model in "${policy_models[@]}"; do + for ckpt_path in "${ckpt_paths[@]}"; do for env_name in "${env_names[@]}"; do EXTRA_ARGS="--additional-env-build-kwargs shader_dir=rt model_ids=apple" EvalSim @@ -57,7 +59,7 @@ done # lightings scene_name=frl_apartment_stage_simple -for policy_model in "${policy_models[@]}"; do +for ckpt_path in "${ckpt_paths[@]}"; do for env_name in "${env_names[@]}"; do EXTRA_ARGS="--additional-env-build-kwargs shader_dir=rt light_mode=brighter model_ids=apple" EvalSim @@ -70,7 +72,7 @@ done # new cabinets scene_name=frl_apartment_stage_simple -for policy_model in "${policy_models[@]}"; do +for ckpt_path in "${ckpt_paths[@]}"; do for env_name in "${env_names[@]}"; do EXTRA_ARGS="--additional-env-build-kwargs shader_dir=rt station_name=mk_station2 model_ids=apple" EvalSim diff --git a/scripts/octo_put_in_drawer_visual_matching.sh b/scripts/originals/put_in_drawer_visual_matching.sh similarity index 72% rename from scripts/octo_put_in_drawer_visual_matching.sh rename to scripts/originals/put_in_drawer_visual_matching.sh index f945815c..fbfb3634 100644 --- a/scripts/octo_put_in_drawer_visual_matching.sh +++ b/scripts/originals/put_in_drawer_visual_matching.sh @@ -1,11 +1,12 @@ +# scripts/put_in_drawer_visual_matching.sh # shader_dir=rt means that we turn on ray-tracing rendering; this is quite crucial for the open / close drawer task as policies often rely on shadows to infer depth +ckpt_paths=($1) +policy_model=$2 +action_ensemble_temp=$3 +logging_dir=$4 +gpu_id=$5 - - -declare -a 
policy_models=( -"octo-base" -# "octo-server" -) +declare -a arr=($ckpt_path) declare -a env_names=( PlaceIntoClosedTopDrawerCustomInScene-v0 @@ -13,6 +14,7 @@ PlaceIntoClosedTopDrawerCustomInScene-v0 # PlaceIntoClosedBottomDrawerCustomInScene-v0 ) + # URDF variations declare -a urdf_version_arr=("recolor_cabinet_visual_matching_1" "recolor_tabletop_visual_matching_1" "recolor_tabletop_visual_matching_2" None) @@ -20,9 +22,10 @@ for urdf_version in "${urdf_version_arr[@]}"; do EXTRA_ARGS="--enable-raytracing --additional-env-build-kwargs station_name=mk_station_recolor light_mode=simple disable_bad_material=True urdf_version=${urdf_version} model_ids=baked_apple_v2" + EvalOverlay() { # A0 -python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ +CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} --action-ensemble-temp ${action_ensemble_temp} --logging-dir ${logging_dir} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 200 \ --env-name ${env_name} --scene-name dummy_drawer \ @@ -31,9 +34,9 @@ python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path --obj-init-x-range -0.08 -0.02 3 --obj-init-y-range -0.02 0.08 3 \ --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_a0.png \ ${EXTRA_ARGS} - +echo "Done A0" # B0 -python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ +CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} --action-ensemble-temp ${action_ensemble_temp} --logging-dir ${logging_dir} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 200 \ --env-name ${env_name} --scene-name dummy_drawer \ @@ -42,9 +45,9 @@ python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path --obj-init-x-range -0.08 -0.02 3 --obj-init-y-range -0.02 0.08 3 \ --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_b0.png \ ${EXTRA_ARGS} - +echo "Done B0" # C0 -python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ +CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} --action-ensemble-temp ${action_ensemble_temp} --logging-dir ${logging_dir} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 200 \ --env-name ${env_name} --scene-name dummy_drawer \ @@ -53,14 +56,16 @@ python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path --obj-init-x-range -0.08 -0.02 3 --obj-init-y-range -0.02 0.08 3 \ --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_c0.png \ ${EXTRA_ARGS} +echo "Done C0" } -for policy_model in "${policy_models[@]}"; do +for ckpt_path in "${ckpt_paths[@]}"; do for env_name in "${env_names[@]}"; do EvalOverlay done done + done diff --git a/scripts/originals/run_cogact.sh b/scripts/originals/run_cogact.sh new file mode 100644 index 00000000..be4df642 --- /dev/null +++ b/scripts/originals/run_cogact.sh @@ -0,0 +1,39 @@ +model_name=cogact +tasks=( + bridge.sh + drawer_variant_agg.sh + drawer_visual_matching.sh + pick_coke_can_variant_agg.sh + pick_coke_can_visual_matching.sh + move_near_variant_agg.sh + move_near_visual_matching.sh + put_in_drawer_variant_agg.sh + put_in_drawer_visual_matching.sh +) + +ckpts=( + CogACT/CogACT-Large +) + +action_ensemble_temp=-0.8 +for ckpt_path 
in ${ckpts[@]}; do
+    base_dir=$(dirname $ckpt_path)
+
+    # evaluation in simulator
+    # logging_dir=$base_dir/simpler_env/$(basename $ckpt_path)${action_ensemble_temp}
+    logging_dir=results/$(basename $ckpt_path)${action_ensemble_temp}
+    mkdir -p $logging_dir
+    for i in ${!tasks[@]}; do
+        task=${tasks[$i]}
+        echo "🚀 running $task ..."
+        device=0
+        session_name=CUDA${device}-$(basename $logging_dir)-${task}
+        bash scripts/$task $ckpt_path $model_name $action_ensemble_temp $logging_dir $device
+    done
+
+    # statistics of evaluation results
+    echo "🚀 all tasks DONE! Calculating metrics..."
+    python tools/calc_metrics_evaluation_videos.py \
+        --log-dir-root $logging_dir \
+        >>$logging_dir/total.metrics
+done diff --git a/scripts/originals/run_gr00t.sh b/scripts/originals/run_gr00t.sh new file mode 100644 index 00000000..10a9fb78 --- /dev/null +++ b/scripts/originals/run_gr00t.sh @@ -0,0 +1,45 @@
+# source scripts/env.sh
+# XDG_RUNTIME_DIR=/usr/lib
+# LD_LIBRARY_PATH="/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}"
+
+model_name=gr00t
+tasks=(
+    bridge.sh
+
+    # drawer_variant_agg.sh
+    # drawer_visual_matching.sh
+
+    # move_near_variant_agg.sh
+    # move_near_visual_matching.sh
+
+    # pick_coke_can_variant_agg.sh
+    # pick_coke_can_visual_matching.sh
+
+    # put_in_drawer_variant_agg.sh
+    # put_in_drawer_visual_matching.sh
+)
+
+ckpts=(
+    # path to your local model
+)
+
+action_ensemble_temp=-0.8
+
+for ckpt_path in ${ckpts[@]}; do
+
+    logging_dir=results/$(basename $ckpt_path)${action_ensemble_temp}
+
+    mkdir -p $logging_dir
+    for i in ${!tasks[@]}; do
+        task=${tasks[$i]}
+        echo "🚀 running $task ..."
+        device=0
+        bash scripts/$task $ckpt_path $model_name $action_ensemble_temp $logging_dir $device
+    done
+
+    # statistics of evaluation results
+    echo "🚀 all tasks DONE! Calculating metrics..."
+    python tools/calc_metrics_evaluation_videos.py \
+        --log-dir-root $logging_dir \
+        >>$logging_dir/total.metrics
+done diff --git a/scripts/originals/run_lerobotpifast.sh b/scripts/originals/run_lerobotpifast.sh new file mode 100644 index 00000000..d2afc1ed --- /dev/null +++ b/scripts/originals/run_lerobotpifast.sh @@ -0,0 +1,40 @@
+# XDG_RUNTIME_DIR=/usr/lib
+# LD_LIBRARY_PATH="/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}"
+MUJOCO_GL="egl"
+
+model_name=lerobotpifast
+
+tasks=(
+    bridge.sh
+    drawer_variant_agg.sh
+    drawer_visual_matching.sh
+    move_near_variant_agg.sh
+    move_near_visual_matching.sh
+    pick_coke_can_variant_agg.sh
+    pick_coke_can_visual_matching.sh
+    put_in_drawer_variant_agg.sh
+    put_in_drawer_visual_matching.sh
+)
+
+ckpts=(
+lerobot/pi0
+)
+
+action_ensemble_temp=-0.8
+
+for ckpt_path in ${ckpts[@]}; do
+
+    logging_dir=results/$(basename $ckpt_path)${action_ensemble_temp}
+    mkdir -p $logging_dir
+    for i in ${!tasks[@]}; do
+        task=${tasks[$i]}
+        cuda=$((i % 8))
+        GPU_IDX=${cuda} bash scripts/$task $ckpt_path $model_name $action_ensemble_temp $logging_dir
+    done
+
+    # statistics of evaluation results
+    echo "🚀 all tasks DONE! Calculating metrics..."
+ python tools/calc_metrics_evaluation_videos.py \ + --log-dir-root $logging_dir \ + >>$logging_dir/total.metrics +done diff --git a/scripts/originals/run_octo.sh b/scripts/originals/run_octo.sh new file mode 100644 index 00000000..bb6116d3 --- /dev/null +++ b/scripts/originals/run_octo.sh @@ -0,0 +1,39 @@ +model_name=octo-base +tasks=( + bridge.sh + drawer_variant_agg.sh + drawer_visual_matching.sh + move_near_variant_agg.sh + move_near_visual_matching.sh + pick_coke_can_variant_agg.sh + pick_coke_can_visual_matching.sh + put_in_drawer_variant_agg.sh + put_in_drawer_visual_matching.sh +) + +ckpts=( + None +) + +action_ensemble_temp=-0.8 +for ckpt_path in ${ckpts[@]}; do + base_dir=$(dirname $ckpt_path) + + # evaluation in simulator + # logging_dir=$base_dir/simpler_env/$(basename $ckpt_path)${action_ensemble_temp} + logging_dir=results/$(basename $ckpt_path)${action_ensemble_temp} + mkdir -p $logging_dir + for i in ${!tasks[@]}; do + task=${tasks[$i]} + echo "🚀 running $task ..." + device=0 + session_name=CUDA${device}-$(basename $logging_dir)-${task} + bash scripts/$task $ckpt_path $model_name $action_ensemble_temp $logging_dir $device + done + + # statistics evalution results + echo "🚀 all tasks DONE! Calculating metrics..." + python tools/calc_metrics_evaluation_videos.py \ + --log-dir-root $logging_dir \ + >>$logging_dir/total.metrics +done diff --git a/scripts/originals/run_openpifast.sh b/scripts/originals/run_openpifast.sh new file mode 100644 index 00000000..416ba62a --- /dev/null +++ b/scripts/originals/run_openpifast.sh @@ -0,0 +1,48 @@ +XDG_RUNTIME_DIR=/usr/lib +LD_LIBRARY_PATH="/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}" + +model_name=openpifast + +tasks=( + bridge.sh + drawer_variant_agg.sh + drawer_visual_matching.sh + move_near_variant_agg.sh + move_near_visual_matching.sh + pick_coke_can_variant_agg.sh + pick_coke_can_visual_matching.sh + put_in_drawer_variant_agg.sh + put_in_drawer_visual_matching.sh +) + +ckpts=( + +) + + +# to start server, please refe to https://github.com/Physical-Intelligence/openpi +# e.g., +# cd /cpfs01/openpi +# uv run scripts/serve_policy.py policy:checkpoint \ +# --policy.config=pi0_fast_bridge_lora_pt_tokenizer \ +# --policy.dir=/cpfs01/openpi/checkpoints/pi0_fast_bridge_lora_pt_tokenizer/pi0_fast_bridge_lora_pt_tokenizer/29999 + +action_ensemble_temp=-0.8 + +# client inference +for ckpt_path in ${ckpts[@]}; do + + logging_dir=results/$(basename $ckpt_path)${action_ensemble_temp} + mkdir -p $logging_dir + for i in ${!tasks[@]}; do + task=${tasks[$i]} + cuda=$((i % 8)) + GPU_IDX=${cuda} bash scripts/$task $ckpt_path $model_name $action_ensemble_temp $logging_dir + done + + # statistics evalution results + echo "🚀 all tasks DONE! Calculating metrics..." 
+ python tools/calc_metrics_evaluation_videos.py \ + --log-dir-root $logging_dir \ + >>$logging_dir/total.metrics +done diff --git a/scripts/originals/run_openvla.sh b/scripts/originals/run_openvla.sh new file mode 100644 index 00000000..fb5f3c43 --- /dev/null +++ b/scripts/originals/run_openvla.sh @@ -0,0 +1,39 @@ +model_name=openvla +tasks=( + bridge.sh + drawer_variant_agg.sh + drawer_visual_matching.sh + move_near_variant_agg.sh + move_near_visual_matching.sh + pick_coke_can_variant_agg.sh + pick_coke_can_visual_matching.sh + put_in_drawer_variant_agg.sh + put_in_drawer_visual_matching.sh +) + +ckpts=( + ~/projects/vla_ptm/pretrained/openvla-7b +) + +action_ensemble_temp=-0.8 +for ckpt_path in ${ckpts[@]}; do + base_dir=$(dirname $ckpt_path) + + # evaluation in simulator + # logging_dir=$base_dir/simpler_env/$(basename $ckpt_path)${action_ensemble_temp} + logging_dir=results/$(basename $ckpt_path)${action_ensemble_temp} + mkdir -p $logging_dir + for i in ${!tasks[@]}; do + task=${tasks[$i]} + echo "🚀 running $task ..." + device=0 + session_name=CUDA${device}-$(basename $logging_dir)-${task} + bash scripts/$task $ckpt_path $model_name $action_ensemble_temp $logging_dir $device + done + + # statistics evalution results + echo "🚀 all tasks DONE! Calculating metrics..." + python tools/calc_metrics_evaluation_videos.py \ + --log-dir-root $logging_dir \ + >>$logging_dir/total.metrics +done diff --git a/scripts/originals/run_rt1.sh b/scripts/originals/run_rt1.sh new file mode 100644 index 00000000..eb803c34 --- /dev/null +++ b/scripts/originals/run_rt1.sh @@ -0,0 +1,39 @@ +model_name=rt1 +tasks=( + bridge.sh + drawer_variant_agg.sh + drawer_visual_matching.sh + move_near_variant_agg.sh + move_near_visual_matching.sh + pick_coke_can_variant_agg.sh + pick_coke_can_visual_matching.sh + put_in_drawer_variant_agg.sh + put_in_drawer_visual_matching.sh +) + +ckpts=( + ../pretrained/rt_1_tf_trained_for_000400120 +) + +action_ensemble_temp=-0.8 +for ckpt_path in ${ckpts[@]}; do + base_dir=$(dirname $ckpt_path) + + # evaluation in simulator + # logging_dir=$base_dir/simpler_env/$(basename $ckpt_path)${action_ensemble_temp} + logging_dir=results/$(basename $ckpt_path)${action_ensemble_temp} + mkdir -p $logging_dir + for i in ${!tasks[@]}; do + task=${tasks[$i]} + echo "🚀 running $task ..." + device=0 + session_name=CUDA${device}-$(basename $logging_dir)-${task} + bash scripts/$task $ckpt_path $model_name $action_ensemble_temp $logging_dir $device + done + + # statistics evalution results + echo "🚀 all tasks DONE! Calculating metrics..." 
+ python tools/calc_metrics_evaluation_videos.py \ + --log-dir-root $logging_dir \ + >>$logging_dir/total.metrics +done diff --git a/scripts/originals/run_spatialvla.sh b/scripts/originals/run_spatialvla.sh new file mode 100644 index 00000000..370370e8 --- /dev/null +++ b/scripts/originals/run_spatialvla.sh @@ -0,0 +1,42 @@ +model_name=spatialvla +tasks=( + bridge.sh + drawer_variant_agg.sh + drawer_visual_matching.sh + move_near_variant_agg.sh + move_near_visual_matching.sh + pick_coke_can_variant_agg.sh + pick_coke_can_visual_matching.sh + + # put_in_drawer_variant_agg.sh + # put_in_drawer_visual_matching.sh +) + +ckpts=( + IPEC-COMMUNITY/spatialvla-4b-224-pt # or a local path +) + +action_ensemble_temp=-0.8 +for ckpt_path in ${ckpts[@]}; do + # 🤗 NOTE: set hf cache to avoid confilcts + # base_dir=$(dirname $ckpt_path) + # export HF_MODULES_CACHE=$base_dir/hf_cache/modules + # mkdir -p $HF_MODULES_CACHE + # logging_dir=$base_dir/simpler_env/$(basename $ckpt_path)${action_ensemble_temp} + + logging_dir=results/$(basename $ckpt_path)${action_ensemble_temp} + + mkdir -p $logging_dir + for i in ${!tasks[@]}; do + task=${tasks[$i]} + echo "🚀 running $task ..." + device=0 + bash scripts/$task $ckpt_path $model_name $action_ensemble_temp $logging_dir $device + done + + # statistics evalution results + echo "🚀 all tasks DONE! Calculating metrics..." + python tools/calc_metrics_evaluation_videos.py \ + --log-dir-root $logging_dir \ + >>$logging_dir/total.metrics +done diff --git a/scripts/rt1/README.md b/scripts/rt1/README.md new file mode 100644 index 00000000..236fe8d2 --- /dev/null +++ b/scripts/rt1/README.md @@ -0,0 +1,45 @@ + +```bash +# Install Google Cloud SDK +cd .. +curl -O https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-cli-linux-x86_64.tar.gz +tar -xf google-cloud-cli-linux-x86_64.tar.gz +./google-cloud-sdk/install.sh +cd SimplerEnv + +# Create a virtual environment +uv venv -p 3.10 scripts/rt1/.venv + +# Install dependencies +source $(pwd)/scripts/rt1/.venv/bin/activate +uv pip install tensorflow==2.15.0 +uv pip install -r requirements_full_install.txt +uv pip install -e . +uv pip install tensorflow[and-cuda]==2.15.1 + + +# Make a checkpoint dir: +mkdir checkpoints + +# RT-1-X +gsutil -m cp -r gs://gdm-robotics-open-x-embodiment/open_x_embodiment_and_rt_x_oss/rt_1_x_tf_trained_for_002272480_step.zip . +unzip rt_1_x_tf_trained_for_002272480_step.zip +mv rt_1_x_tf_trained_for_002272480_step checkpoints +rm rt_1_x_tf_trained_for_002272480_step.zip + +# RT-1-Converged +gsutil -m cp -r gs://gdm-robotics-open-x-embodiment/open_x_embodiment_and_rt_x_oss/rt_1_tf_trained_for_000400120 . +mv rt_1_tf_trained_for_000400120 checkpoints + +# RT-1-15% +gsutil -m cp -r gs://gdm-robotics-open-x-embodiment/open_x_embodiment_and_rt_x_oss/rt_1_tf_trained_for_000058240 . +mv rt_1_tf_trained_for_000058240 checkpoints + +# RT-1-Begin +gsutil -m cp -r gs://gdm-robotics-open-x-embodiment/open_x_embodiment_and_rt_x_oss/rt_1_tf_trained_for_000001120 . 
+mv rt_1_tf_trained_for_000001120 checkpoints + + +# Evaluate +CUDA_VISIBLE_DEVICES=3 python scripts/rt1/evaluate_fractal.py --ckpt-path checkpoints/rt_1_tf_trained_for_000400120 +``` \ No newline at end of file diff --git a/scripts/rt1/evaluate_fractal.py b/scripts/rt1/evaluate_fractal.py new file mode 100644 index 00000000..ffbd9d59 --- /dev/null +++ b/scripts/rt1/evaluate_fractal.py @@ -0,0 +1,24 @@ +import argparse + +from simpler_env.evaluation.fractal_tasks import run_comprehensive_evaluation +from simpler_env.policies.rt1.rt1_model import RT1Inference + + +def parse_args(): + parser = argparse.ArgumentParser(description="Run Comprehensive ManiSkill2 Evaluation") + parser.add_argument("--ckpt-path", type=str, required=True, help="Path to the checkpoint to evaluate.") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + ckpt_path = args.ckpt_path + + policy = RT1Inference(saved_model_path=ckpt_path, policy_setup="google_robot") + + print("Policy initialized. Starting evaluation...") + + final_scores = run_comprehensive_evaluation(env_policy=policy, ckpt_path=args.ckpt_path) + + print("\nEvaluation finished.") + print(f"Final calculated scores: {final_scores}") diff --git a/scripts/score.py b/scripts/score.py new file mode 100644 index 00000000..759c1d25 --- /dev/null +++ b/scripts/score.py @@ -0,0 +1,129 @@ +import argparse +from collections import OrderedDict +import csv +import json +import os + + +def parse_args(): + parser = argparse.ArgumentParser(description="Compute episode scores from results.json") + parser.add_argument("--result-path", required=True, help="Directory that contains results.json and where scores.csv will be written") + parser.add_argument("--policy-setup", required=True, help="Policy setup string; must contain either 'widowx' or 'google_robot'") + return parser.parse_args() + + +# You can modify the following to customize your scoring config +# 1) The "rubrics_score" dict MUST preserve order (use OrderedDict or rely on Python 3.7+ insertion order). +# 2) Weights MUST sum to 1.0. 
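+# Example (illustrative only): a rubric for a hypothetical new task could be added as
+#   my_task_rubrics = OrderedDict([("reached_obj", 0.5), ("grasped_obj", 0.5)])
+# and registered below, e.g. SCORING_CONFIG["google_robot"]["my_new_task"] = my_task_rubrics,
+# as long as its weights sum to 1.0. The script is then invoked as
+#   python scripts/score.py --result-path <logging_dir> --policy-setup google_robot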
+widowx_rubrics_score = OrderedDict(
+    [
+        ("is_src_obj_grasped", 0.3),
+        ("consecutive_grasp", 0.3),
+        ("src_on_target", 0.4),
+    ]
+)
+google_robot_move_near_rubrics_score = OrderedDict(
+    [
+        ("moved_correct_obj", 0.3),
+        ("near_tgt_obj", 0.3),
+        ("is_closest_to_tg", 0.4),
+    ]
+)
+google_robot_put_in_rubrics_score = OrderedDict(
+    [
+        ("is_drawer_open", 0.3),
+        ("grasped", 0.3),
+        ("has_contact", 0.4),
+    ]
+)
+SCORING_CONFIG = {
+    "widowx": {
+        "widowx_task2_stack_cube": widowx_rubrics_score,
+        "widowx_task3_put_object_on_top": widowx_rubrics_score,
+        "widowx_task4_put_object_in_basket": widowx_rubrics_score,
+    },
+    "google_robot": {
+        "fractal_move_near_visual_matching": google_robot_move_near_rubrics_score,
+        "fractal_move_near_variant_agg": google_robot_move_near_rubrics_score,
+        "fractal_put_in_drawer_visual_matching": google_robot_put_in_rubrics_score,
+        "fractal_put_in_drawer_variant_agg": google_robot_put_in_rubrics_score,
+    },
+}
+
+
+def ensure_weights_sum_to_one(weights: OrderedDict):
+    total = sum(weights.values())
+    if abs(total - 1.0) > 1e-9:
+        raise ValueError(f"Scoring weights must sum to 1.0, got {total}")
+
+
+def pick_policy(policy_setup: str) -> str:
+    s = policy_setup.lower()
+    if "widowx" in s:
+        return "widowx"
+    if "google_robot" in s:
+        return "google_robot"
+    raise ValueError("`--policy-setup` must contain either 'widowx' or 'google_robot'.")
+
+
+def main():
+    args = parse_args()
+
+    base_path = os.path.abspath(args.result_path)
+    json_file = os.path.join(base_path, "results.json")
+    csv_file = os.path.join(base_path, "scores.csv")
+
+    policy = pick_policy(args.policy_setup)
+    scoring_config = SCORING_CONFIG[policy]
+    for rubrics_score in scoring_config.values():
+        ensure_weights_sum_to_one(rubrics_score)
+
+    # Load json file
+    try:
+        with open(json_file, "r", encoding="utf-8") as f:
+            data = json.load(f)
+            if not isinstance(data, list):
+                data = []
+    except Exception:
+        data = []
+
+    score_list = []
+    for task in data:
+        task_name = task.get("task_name", "")
+        if task_name not in scoring_config.keys():
+            continue
+        trials = task.get("episodes", [])
+        for trial in trials:
+            trial_id = trial.get("trial_id")
+            episode_stats = trial.get("episode_stats", {})
+            # final_status = trial["final_status"]
+            score = 0.0
+
+            # Accumulate scores
+            rubrics_score = scoring_config[task_name]
+            for subtask in rubrics_score.keys():
+                status = episode_stats.get(subtask, {}).get("status", False)
+                if not status:
+                    break
+                score += rubrics_score[subtask]
+
+            cur_score_data = {
+                "task_name": task_name,
+                "trial_id": trial_id,
+                "score": score,
+            }
+            score_list.append(cur_score_data)
+
+    # Export as csv file
+    cols = ["task_name", "trial_id", "score"]
+    os.makedirs(os.path.dirname(csv_file), exist_ok=True)
+    with open(csv_file, "w", newline="", encoding="utf-8") as f:
+        writer = csv.DictWriter(f, fieldnames=cols)
+        writer.writeheader()
+        writer.writerows(score_list)
+
+    print(f"Scores saved to {csv_file}, total {len(score_list)} rows.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/setup.py b/setup.py deleted file mode 100644 index 2ff8c4ba..00000000 --- a/setup.py +++ /dev/null @@ -1,9 +0,0 @@
-from setuptools import find_packages, setup
-
-setup(
-    name="simpler_env",
-    version="0.0.1",
-    author="Xuanlin Li",
-    packages=find_packages(include=["simpler_env*"]),
-    python_requires=">=3.10",
-)
diff --git a/simpler_env/__init__.py b/simpler_env/__init__.py index e8b8aa53..cbecacd6 100644 --- a/simpler_env/__init__.py +++ b/simpler_env/__init__.py @@ -1,6 +1,5 @@ import 
gymnasium as gym import mani_skill2_real2sim.envs -import warnings ENVIRONMENTS = [ "google_robot_pick_coke_can", @@ -71,18 +70,10 @@ } -def make(task_name, **kwargs): +def make(task_name): """Creates simulated eval environment from task name.""" assert task_name in ENVIRONMENTS, f"Task {task_name} is not supported. Environments: \n {ENVIRONMENTS}" - env_name, env_kwargs = ENVIRONMENT_MAP[task_name] - - env_kwargs["obs_mode"] = "rgbd", - env_kwargs["prepackaged_config"] = True - - for key, value in kwargs.items(): - if key in env_kwargs: - warnings.warn(f"default value [{env_kwargs[key]}] for Key {key} changes to value [{value}]") - env_kwargs[key] = value - - env = gym.make(env_name, **env_kwargs) + env_name, kwargs = ENVIRONMENT_MAP[task_name] + kwargs["prepackaged_config"] = True + env = gym.make(env_name, obs_mode="rgbd", **kwargs) return env diff --git a/simpler_env/evaluation/argparse.py b/simpler_env/evaluation/argparse.py index d17e4d88..9da3e992 100644 --- a/simpler_env/evaluation/argparse.py +++ b/simpler_env/evaluation/argparse.py @@ -44,7 +44,7 @@ def get_args(): help="Obtain image observation from this camera for policy input. None = default", ) parser.add_argument("--action-scale", type=float, default=1.0) - + parser.add_argument("--action-ensemble-temp", type=float, default=-0.8) parser.add_argument("--control-freq", type=int, default=3) parser.add_argument("--sim-freq", type=int, default=513) parser.add_argument("--max-episode-steps", type=int, default=80) @@ -99,7 +99,6 @@ def get_args(): default=[-0.02, 0.42, 5], help="[ymin, ymax, num]", ) - parser.add_argument( "--additional-env-build-kwargs", nargs="+", diff --git a/simpler_env/evaluation/bridge_tasks.py b/simpler_env/evaluation/bridge_tasks.py new file mode 100644 index 00000000..c7c30425 --- /dev/null +++ b/simpler_env/evaluation/bridge_tasks.py @@ -0,0 +1,193 @@ +from typing import Any, Dict, List + +from . 
import random_envs
+from ..policies.base import AiroaBasePolicy
+from .config import ManiSkill2Config
+from .evaluate import _run_single_evaluation
+from .maniskill2_evaluator import run_maniskill2_eval_single_episode, get_robot_control_mode
+import numpy as np
+
+# Global episode counter
+global_episode_counter = 0
+
+def get_next_episode_id():
+    global global_episode_counter
+    current_id = global_episode_counter
+    global_episode_counter += 1
+    return current_id
+
+def custom_run_single_evaluation_with_counter(env_policy: AiroaBasePolicy, cfg: ManiSkill2Config, ckpt_path: str, start_episode_id: int = 0) -> List[bool]:
+    """Evaluation function that uses a different episode_id for each object episode."""
+    if cfg.additional_env_build_kwargs:
+        if "urdf_version" in cfg.additional_env_build_kwargs:
+            kwargs_info = cfg.additional_env_build_kwargs["urdf_version"]
+        else:
+            kwargs_info = cfg.additional_env_build_kwargs
+    else:
+        kwargs_info = None
+
+    print(f"  ▶️ Running: env={cfg.env_name}, scene={cfg.scene_name}, kwargs={kwargs_info}")
+
+    control_mode = get_robot_control_mode(cfg.robot, cfg.policy_model)
+    success_arr = []
+
+    # Custom handling for obj_variation_mode == "episode"
+    if cfg.obj_variation_mode == "episode":
+        rng = np.random.RandomState(42)
+        sampled_ids = rng.choice(range(36), size=cfg.obj_episode_range[1], replace=True)
+
+        for idx, obj_episode_id in enumerate(sampled_ids):
+            # episode_id restarts from 0 for each task
+            episode_id = idx
+
+            success = run_maniskill2_eval_single_episode(
+                model=env_policy,
+                task_name=cfg.task_name,
+                episode_id=episode_id,
+                ckpt_path=cfg.ckpt_path,
+                robot_name=cfg.robot,
+                env_name=cfg.env_name,
+                scene_name=cfg.scene_name,
+                robot_init_x=cfg.robot_init_x_range[0],
+                robot_init_y=cfg.robot_init_y_range[0],
+                robot_init_quat=np.array(cfg.robot_init_rot_quat_center),
+                control_mode=control_mode,
+                obj_episode_id=obj_episode_id,
+                additional_env_build_kwargs=cfg.additional_env_build_kwargs,
+                rgb_overlay_path=cfg.rgb_overlay_path,
+                control_freq=cfg.control_freq,
+                sim_freq=cfg.sim_freq,
+                max_episode_steps=cfg.max_episode_steps,
+                enable_raytracing=cfg.enable_raytracing,
+                additional_env_save_tags=cfg.additional_env_save_tags,
+                obs_camera_name=cfg.obs_camera_name,
+                logging_dir="./results",
+            )
+            success_arr.append(success)
+    else:
+        # Other variation modes fall back to the original implementation
+        return _run_single_evaluation(env_policy, cfg, ckpt_path)
+
+    # Print the success rate
+    print(f"  ✅ Success Rate: {np.mean(success_arr):.2%}")
+    return success_arr
+
+
+def widowx_task1_pick_object(env_policy: AiroaBasePolicy, ckpt_path: str, episode_id: int, control_freq: int = 5) -> List[List[bool]]:
+    max_time = 24
+    max_episode_steps = max_time * control_freq
+
+    results: List[List[bool]] = []
+
+    cfg = ManiSkill2Config(
+        policy_setup="widowx_bridge",
+        robot="widowx",
+        control_freq=control_freq,
+        sim_freq=500,
+        max_episode_steps=max_episode_steps,
+        env_name="GraspRandomObjectInScene-v0",
+        scene_name="bridge_table_1_v1",
+        rgb_overlay_path="ManiSkill2_real2sim/data/real_inpainting/bridge_real_eval_1.png",
+        robot_init_x_range=[0.147, 0.147, 1],
+        robot_init_y_range=[0.028, 0.028, 1],
+        obj_variation_mode="episode",
+        obj_episode_range=[0, 30],
+        robot_init_rot_quat_center=[0, 0, 0, 1],
+        robot_init_rot_rpy_range=[0, 0, 1, 0, 0, 1, 0, 0, 1],
+        ckpt_path=ckpt_path,
+        task_name="widowx_task1_pick_object",
+        episode_id=episode_id,
+    )
+    results.append(custom_run_single_evaluation_with_counter(env_policy, cfg, ckpt_path, episode_id))
+
+    return results
+
+
+def widowx_task2_stack_cube(env_policy: AiroaBasePolicy, ckpt_path: str, episode_id: int, control_freq: int = 5) -> 
List[List[bool]]: + max_time = 24 + max_episode_steps = max_time * control_freq + + results: List[List[bool]] = [] + + cfg = ManiSkill2Config( + policy_setup="widowx_bridge", + robot="widowx", + control_freq=control_freq, + sim_freq=500, + max_episode_steps=max_episode_steps, + env_name="StackRandomGreenYellowCubeInScene-v0", + scene_name="bridge_table_1_v1", + rgb_overlay_path="ManiSkill2_real2sim/data/real_inpainting/bridge_real_eval_1.png", + robot_init_x_range=[0.147, 0.147, 1], + robot_init_y_range=[0.028, 0.028, 1], + obj_variation_mode="episode", + obj_episode_range=[0, 30], + robot_init_rot_quat_center=[0, 0, 0, 1], + robot_init_rot_rpy_range=[0, 0, 1, 0, 0, 1, 0, 0, 1], + ckpt_path=ckpt_path, + task_name="widowx_task2_stack_cube", + episode_id=episode_id, + ) + results.append(custom_run_single_evaluation_with_counter(env_policy, cfg, ckpt_path, episode_id)) + + return results + + +def widowx_task3_put_object_on_top(env_policy: AiroaBasePolicy, ckpt_path: str, episode_id: int, control_freq: int = 5) -> List[List[bool]]: + max_time = 24 + max_episode_steps = max_time * control_freq + + results: List[List[bool]] = [] + + cfg = ManiSkill2Config( + policy_setup="widowx_bridge", + robot="widowx", + control_freq=control_freq, + sim_freq=500, + max_episode_steps=max_episode_steps, + env_name="PutRandomObjectOnRandomTopInScene-v0", + scene_name="bridge_table_1_v1", + rgb_overlay_path="ManiSkill2_real2sim/data/real_inpainting/bridge_real_eval_1.png", + robot_init_x_range=[0.147, 0.147, 1], + robot_init_y_range=[0.028, 0.028, 1], + obj_variation_mode="episode", + obj_episode_range=[0, 30], + robot_init_rot_quat_center=[0, 0, 0, 1], + robot_init_rot_rpy_range=[0, 0, 1, 0, 0, 1, 0, 0, 1], + ckpt_path=ckpt_path, + task_name="widowx_task3_put_object_on_top", + episode_id=episode_id, + ) + results.append(custom_run_single_evaluation_with_counter(env_policy, cfg, ckpt_path, episode_id)) + + return results + + +def widowx_task4_put_object_in_basket(env_policy: AiroaBasePolicy, ckpt_path: str, episode_id: int, control_freq: int = 5) -> List[List[bool]]: + max_time = 48 + max_episode_steps = max_time * control_freq + + results: List[List[bool]] = [] + + cfg = ManiSkill2Config( + policy_setup="widowx_bridge", + robot="widowx_sink_camera_setup", + control_freq=control_freq, + sim_freq=500, + max_episode_steps=max_episode_steps, + env_name="PutRandomObjectInBasketScene-v0", + scene_name="bridge_table_1_v2", + rgb_overlay_path="ManiSkill2_real2sim/data/real_inpainting/bridge_sink.png", + robot_init_x_range=[0.127, 0.127, 1], + robot_init_y_range=[0.06, 0.06, 1], + obj_variation_mode="episode", + obj_episode_range=[0, 30], + robot_init_rot_quat_center=[0, 0, 0, 1], + robot_init_rot_rpy_range=[0, 0, 1, 0, 0, 1, 0, 0, 1], + ckpt_path=ckpt_path, + task_name="widowx_task4_put_object_in_basket", + episode_id=episode_id, + ) + results.append(custom_run_single_evaluation_with_counter(env_policy, cfg, ckpt_path, episode_id)) + + return results diff --git a/simpler_env/evaluation/config.py b/simpler_env/evaluation/config.py new file mode 100644 index 00000000..96523a9a --- /dev/null +++ b/simpler_env/evaluation/config.py @@ -0,0 +1,95 @@ +from dataclasses import dataclass, field +from typing import Any, Dict, List, Optional, Tuple, Union + +import numpy as np +from sapien.core import Pose +from transforms3d.euler import euler2quat + + +def parse_range_tuple(t): + return np.linspace(t[0], t[1], int(t[2])) + + +@dataclass +class ManiSkill2Config: + # Required parameters + env_name: str # required=True in argparse + 
task_name: str
+    episode_id: int = 0
+
+    # Policy settings
+    policy_model: str = "rt1"
+    policy_setup: str = "google_robot"
+    ckpt_path: Optional[str] = None
+
+    # Environment settings
+    scene_name: str = "google_pick_coke_can_1_v4"
+    enable_raytracing: bool = False
+    robot: str = "google_robot_static"
+    obs_camera_name: Optional[str] = None
+    action_scale: float = 1.0
+
+    # Control settings
+    control_freq: int = 3
+    sim_freq: int = 513
+    max_episode_steps: int = 80
+    rgb_overlay_path: Optional[str] = None
+
+    # Robot initialization ranges
+    robot_variation_mode: str = "xy"  # choices: ["xy", "episode_xy"]
+    robot_episode_range: List[int] = field(default_factory=lambda: [0, 60])
+    robot_init_x_range: List[float] = field(default_factory=lambda: [0.35, 0.35, 1])  # start, end, len
+    robot_init_y_range: List[float] = field(default_factory=lambda: [0.20, 0.20, 1])
+    robot_init_rot_quat_center: List[float] = field(default_factory=lambda: [1, 0, 0, 0])
+    robot_init_rot_rpy_range: List[float] = field(default_factory=lambda: [0, 0, 1, 0, 0, 1, 0, 0, 1])
+
+    # Object variation settings
+    obj_variation_mode: str = "xy"  # choices: ["xy", "episode", "episode_xy"]
+    obj_episode_range: List[int] = field(default_factory=lambda: [0, 60])
+    obj_init_x_range: List[float] = field(default_factory=lambda: [-0.35, -0.12, 5])
+    obj_init_y_range: List[float] = field(default_factory=lambda: [-0.02, 0.42, 5])
+
+    # Additional settings
+    additional_env_build_kwargs: Optional[Dict[str, Any]] = None
+    additional_env_save_tags: Optional[str] = None
+    logging_dir: str = "./results"
+    tf_memory_limit: int = 3072
+    octo_init_rng: int = 0
+
+    # Computed attributes (will be set in __post_init__)
+    robot_init_xs: List[float] = field(init=False)
+    robot_init_ys: List[float] = field(init=False)
+    robot_init_quats: List[List[float]] = field(init=False)
+    obj_init_xs: Optional[List[float]] = field(init=False, default=None)
+    obj_init_ys: Optional[List[float]] = field(init=False, default=None)
+
+    def __post_init__(self):
+        """Run the same computations as the argparse post-processing."""
+        # Robot pose calculations
+        if self.robot_variation_mode == "xy":
+            self.robot_init_xs = parse_range_tuple(self.robot_init_x_range)
+            self.robot_init_ys = parse_range_tuple(self.robot_init_y_range)
+
+        # Robot orientation calculations
+        self.robot_init_quats = []
+        for r in parse_range_tuple(self.robot_init_rot_rpy_range[:3]):
+            for p in parse_range_tuple(self.robot_init_rot_rpy_range[3:6]):
+                for y in parse_range_tuple(self.robot_init_rot_rpy_range[6:]):
+                    quat = (Pose(q=euler2quat(r, p, y)) * Pose(q=self.robot_init_rot_quat_center)).q
+                    self.robot_init_quats.append(quat)
+
+        # Object position calculations
+        if self.obj_variation_mode == "xy":
+            self.obj_init_xs = parse_range_tuple(self.obj_init_x_range)
+            self.obj_init_ys = parse_range_tuple(self.obj_init_y_range)
+
+        # Update logging info if using different camera
+        if self.obs_camera_name is not None:
+            if self.additional_env_save_tags is None:
+                self.additional_env_save_tags = f"obs_camera_{self.obs_camera_name}"
+            else:
+                self.additional_env_save_tags = self.additional_env_save_tags + f"_obs_camera_{self.obs_camera_name}"
+
+        # Validate obj_variation_mode
+        if self.obj_variation_mode not in ["xy", "episode", "episode_xy"]:
+            raise ValueError(f"obj_variation_mode must be 'xy', 'episode', or 'episode_xy', got {self.obj_variation_mode}")
diff --git a/simpler_env/evaluation/evaluate.py b/simpler_env/evaluation/evaluate.py new file mode 100644 index 00000000..6eb1ddac --- /dev/null +++ b/simpler_env/evaluation/evaluate.py @@ -0,0 +1,843 @@ 
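+# Illustrative example of the robust score computed by calculate_robust_score()
+# below: for two evaluation runs with per-episode successes
+# [True, True, False, True] and [True, False, False, True], the per-run success
+# rates are 0.75 and 0.50, so mean = 0.625, std = 0.125, and with the default
+# penalty_factor of 0.5 the robust score is 0.625 - 0.5 * 0.125 = 0.5625.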
+import random +from typing import Any, Dict, List, Tuple + +import numpy as np + +from ..policies.base import AiroaBasePolicy +from .config import ManiSkill2Config +from .maniskill2_evaluator import maniskill2_evaluator + + +def calculate_robust_score(results: List[List[bool]], penalty_factor: float = 0.5) -> float: + if not results: + return 0.0 + success_rates = [np.mean(run) for run in results if run] + if not success_rates: + return 0.0 + mean_success_rate = np.mean(success_rates) + std_dev = np.std(success_rates) + robust_score = mean_success_rate - penalty_factor * std_dev + return max(0.0, robust_score) + + +def _run_single_evaluation(env_policy: AiroaBasePolicy, cfg: ManiSkill2Config, ckpt_path: str) -> List[bool]: + if cfg.additional_env_build_kwargs: + if "urdf_version" in cfg.additional_env_build_kwargs: + kwargs_info = cfg.additional_env_build_kwargs["urdf_version"] + else: + kwargs_info = cfg.additional_env_build_kwargs + else: + kwargs_info = None + + print(f" ▶️ Running: env={cfg.env_name}, scene={cfg.scene_name}, kwargs={kwargs_info}") + success_arr = maniskill2_evaluator(env_policy, cfg) + print(f" ✅ Success Rate: {np.mean(success_arr):.2%}") + return success_arr + + +def pick_object_visual_matching(env_policy: AiroaBasePolicy, ckpt_path: str) -> List[List[bool]]: + print("\n--- pick_object_visual_matching ---") + results: List[List[bool]] = [] + + direction_orientationions_arr = [ + {"lr_switch": True}, + {"upright": True}, + {"laid_vertically": True}, + ] + + urdf_versions = [None, "recolor_tabletop_visual_matching_1", "recolor_tabletop_visual_matching_2", "recolor_cabinet_visual_matching_1"] + + base_kwargs = dict( + robot="google_robot_static", + policy_setup="google_robot", + control_freq=3, + sim_freq=513, + max_episode_steps=160, + ckpt_path=ckpt_path, + robot_init_x_range=[0.35, 0.35, 1], + robot_init_y_range=[0.20, 0.20, 1], + obj_init_x_range=[-0.35, -0.12, 8], + obj_init_y_range=[-0.02, 0.42, 8], + obj_variation_mode="episode_xy", + obj_episode_range=[0, 5], + robot_init_rot_quat_center=[0, 0, 0, 1], + robot_init_rot_rpy_range=[0, 0, 1, 0, 0, 1, 0, 0, 1], + ) + + for urdf in urdf_versions: + for orientation in direction_orientationions_arr: + cfg = ManiSkill2Config( + **base_kwargs, + env_name="GraspSingleRandomObjectInScene-v0", + scene_name="google_pick_coke_can_1_v4", + rgb_overlay_path="./ManiSkill2_real2sim/data/real_inpainting/google_coke_can_real_eval_1.png", + additional_env_build_kwargs={**orientation, "urdf_version": urdf}, + ) + results.append(_run_single_evaluation(env_policy, cfg, ckpt_path)) + + return results + + +def pick_object_variant_agg(env_policy: AiroaBasePolicy, ckpt_path: str) -> List[List[bool]]: + print("\n--- pick_object_variant_agg ---") + results: List[List[bool]] = [] + + object_orientation = [ + {"lr_switch": True}, + {"upright": True}, + {"laid_vertically": True}, + ] + + base_kwargs = dict( + robot="google_robot_static", + policy_setup="google_robot", + control_freq=3, + sim_freq=513, + max_episode_steps=160, + ckpt_path=ckpt_path, + robot_init_x_range=[0.35, 0.35, 1], + robot_init_y_range=[0.20, 0.20, 1], + obj_init_x_range=[-0.35, -0.12, 5], + obj_init_y_range=[-0.02, 0.42, 5], + obj_variation_mode="episode_xy", + obj_episode_range=[0, 4], + robot_init_rot_quat_center=[0, 0, 0, 1], + robot_init_rot_rpy_range=[0, 0, 1, 0, 0, 1, 0, 0, 1], + ) + + # base + for orientation in object_orientation: + cfg = ManiSkill2Config( + **base_kwargs, + env_name="GraspSingleRandomObjectInScene-v0", + scene_name="google_pick_coke_can_1_v4", + 
additional_env_build_kwargs={**orientation}, + ) + results.append(_run_single_evaluation(env_policy, cfg, ckpt_path)) + + # table textures (baked) + baked_scenes = ["Baked_sc1_staging_objaverse_cabinet1_h870", "Baked_sc1_staging_objaverse_cabinet2_h870"] + for scene in baked_scenes: + for orientation in object_orientation: + cfg = ManiSkill2Config( + **base_kwargs, + env_name="GraspSingleRandomObjectInScene-v0", + scene_name=scene, + additional_env_build_kwargs={**orientation}, + ) + results.append(_run_single_evaluation(env_policy, cfg, ckpt_path)) + + # backgrounds + bg_scenes = ["google_pick_coke_can_1_v4_alt_background", "google_pick_coke_can_1_v4_alt_background_2"] + for scene in bg_scenes: + for orientation in object_orientation: + cfg = ManiSkill2Config( + **base_kwargs, + env_name="GraspSingleRandomObjectInScene-v0", + scene_name=scene, + additional_env_build_kwargs={**orientation}, + ) + results.append(_run_single_evaluation(env_policy, cfg, ckpt_path)) + + # lightings (darker / brighter) + for orientation in object_orientation: + cfg = ManiSkill2Config( + **base_kwargs, + env_name="GraspSingleRandomObjectInScene-v0", + scene_name="google_pick_coke_can_1_v4", + additional_env_build_kwargs={**orientation, "slightly_darker_lighting": True}, + ) + results.append(_run_single_evaluation(env_policy, cfg, ckpt_path)) + + cfg = ManiSkill2Config( + **base_kwargs, + env_name="GraspSingleRandomObjectInScene-v0", + scene_name="google_pick_coke_can_1_v4", + additional_env_build_kwargs={**orientation, "slightly_brighter_lighting": True}, + ) + results.append(_run_single_evaluation(env_policy, cfg, ckpt_path)) + + # camera orientations + alt_envs = ["GraspSingleRandomObjectAltGoogleCameraInScene-v0", "GraspSingleRandomObjectAltGoogleCamera2InScene-v0"] + for env in alt_envs: + for orientation in object_orientation: + cfg = ManiSkill2Config( + **base_kwargs, + env_name=env, + scene_name="google_pick_coke_can_1_v4", + additional_env_build_kwargs={**orientation}, + ) + results.append(_run_single_evaluation(env_policy, cfg, ckpt_path)) + + return results + + +# scenes = ["google_pick_coke_can_1_v4", +# "google_pick_coke_can_1_v4_alt_background", +# "google_pick_coke_can_1_v4_alt_background_2", +# "Baked_sc1_staging_objaverse_cabinet1_h870", +# "Baked_sc1_staging_objaverse_cabinet2_h870"] +# object_orientation = [ +# {"lr_switch": True}, +# {"upright": True}, +# {"laid_vertically": True}, +# ] +# envs = ["GraspSingleRandomObjectInScene-v0", "GraspSingleRandomObjectAltGoogleCameraInScene-v0", "GraspSingleRandomObjectAltGoogleCamera2InScene-v0"] +# lightings = [None, "darker", "brighter"] + +# trail_num = 30 +# for t in range(trial_num): +# random_kwargs() = np.random(scenes, object_orientation, envs, lightings) +# cfg = ManiSkill2Config(**base_kwargs, **random_kwargs) + + +def pick_object_variant_agg( + env_policy: "AiroaBasePolicy", + ckpt_path: str, + num_trials: int = 30, +) -> List[List[bool]]: + print("\n--- pick_object_variant_agg (single-pool, one-pick-per-trial) ---") + + results: List[List[bool]] = [] + + scenes = [ + "google_pick_coke_can_1_v4", + "google_pick_coke_can_1_v4_alt_background", + "google_pick_coke_can_1_v4_alt_background_2", + "Baked_sc1_staging_objaverse_cabinet1_h870", + "Baked_sc1_staging_objaverse_cabinet2_h870", + ] + envs = [ + "GraspSingleRandomObjectInScene-v0", + "GraspSingleRandomObjectAltGoogleCameraInScene-v0", + "GraspSingleRandomObjectAltGoogleCamera2InScene-v0", + ] + object_orientation = [ + {"lr_switch": True}, + {"upright": True}, + {"laid_vertically": True}, + 
]
+    lightings = [None, "darker", "brighter"]
+
+    base_kwargs: Dict[str, Any] = dict(
+        robot="google_robot_static",
+        policy_setup="google_robot",
+        control_freq=3,
+        sim_freq=513,
+        max_episode_steps=160,
+        ckpt_path=ckpt_path,
+        robot_init_x_range=[0.35, 0.35, 1],
+        robot_init_y_range=[0.20, 0.20, 1],
+        obj_init_x_range=[-0.35, -0.12, 5],
+        obj_init_y_range=[-0.02, 0.42, 5],
+        obj_variation_mode="episode_xy",
+        obj_episode_range=[0, 4],
+        robot_init_rot_quat_center=[0, 0, 0, 1],
+        robot_init_rot_rpy_range=[0, 0, 1, 0, 0, 1, 0, 0, 1],
+    )
+
+    # Single pool of variations; kept as (kind, value) pairs so we can tell which one was picked
+    pool = (
+        [("scene", s) for s in scenes]
+        + [("env", e) for e in envs]
+        + [("orientation", o) for o in object_orientation]
+        + [("lighting", l) for l in lightings]
+    )
+
+    # Current configuration (initial values)
+    current = {
+        "scene": scenes[0],
+        "env": envs[0],
+        "orientation": object_orientation[0],
+        "lighting": None,  # default lighting
+    }
+
+    for _ in range(num_trials):
+        kind, value = random.choice(pool)  # pick exactly one variation per trial
+        current[kind] = value  # update only the chosen kind
+
+        # Build additional_env_build_kwargs
+        add_kwargs = dict(current["orientation"])
+        if current["lighting"] == "darker":
+            add_kwargs["slightly_darker_lighting"] = True
+        elif current["lighting"] == "brighter":
+            add_kwargs["slightly_brighter_lighting"] = True
+
+        cfg = ManiSkill2Config(
+            **base_kwargs,
+            env_name=current["env"],
+            scene_name=current["scene"],
+            additional_env_build_kwargs=add_kwargs,
+        )
+        results.append(_run_single_evaluation(env_policy, cfg, ckpt_path))
+
+    return results
+
+
+def pick_object_among_visual_matching(env_policy: AiroaBasePolicy, ckpt_path: str) -> List[List[bool]]:
+    print("\n--- pick_object_among_visual_matching ---")
+    results: List[List[bool]] = []
+
+    object_orientation = [
+        {"lr_switch": True},
+        {"upright": True},
+        {"laid_vertically": True},
+    ]
+
+    urdf_versions = [None, "recolor_tabletop_visual_matching_1", "recolor_tabletop_visual_matching_2", "recolor_cabinet_visual_matching_1"]
+
+    base_kwargs = dict(
+        robot="google_robot_static",
+        policy_setup="google_robot",
+        control_freq=3,
+        sim_freq=513,
+        max_episode_steps=160,
+        ckpt_path=ckpt_path,
+        robot_init_x_range=[0.35, 0.35, 1],
+        robot_init_y_range=[0.20, 0.20, 1],
+        obj_init_x_range=[-0.35, -0.12, 5],
+        obj_init_y_range=[-0.02, 0.42, 5],
+        obj_variation_mode="episode_xy",
+        obj_episode_range=[0, 5],
+        robot_init_rot_quat_center=[0, 0, 0, 1],
+        robot_init_rot_rpy_range=[0, 0, 1, 0, 0, 1, 0, 0, 1],
+    )
+
+    for urdf in urdf_versions:
+        for orientation in object_orientation:
+            cfg = ManiSkill2Config(
+                **base_kwargs,
+                env_name="GraspSingleRandomObjectDistractorInScene-v0",
+                scene_name="google_pick_coke_can_1_v4",
+                rgb_overlay_path="./ManiSkill2_real2sim/data/real_inpainting/google_coke_can_real_eval_1.png",
+                additional_env_build_kwargs={**orientation, "urdf_version": urdf, "distractor_config": "less"},
+            )
+            results.append(_run_single_evaluation(env_policy, cfg, ckpt_path))
+
+    return results
+
+
+def pick_object_among_variant_agg(env_policy: AiroaBasePolicy, ckpt_path: str) -> List[List[bool]]:
+    print("\n--- pick_object_among_variant_agg ---")
+    results: List[List[bool]] = []
+
+    object_orientation = [
+        {"lr_switch": True},
+        {"upright": True},
+        {"laid_vertically": True},
+    ]
+
+    base_kwargs = dict(
+        robot="google_robot_static",
+        policy_setup="google_robot",
+        control_freq=3,
+        sim_freq=513,
+        max_episode_steps=160,
+        ckpt_path=ckpt_path,
+        robot_init_x_range=[0.35, 0.35, 1],
+        robot_init_y_range=[0.20, 0.20, 1],
+        obj_init_x_range=[-0.35, -0.12, 5],
+        obj_init_y_range=[-0.02, 
0.42, 5], + obj_variation_mode="episode_xy", + obj_episode_range=[0, 5], + robot_init_rot_quat_center=[0, 0, 0, 1], + robot_init_rot_rpy_range=[0, 0, 1, 0, 0, 1, 0, 0, 1], + ) + + # base + for orientation in object_orientation: + cfg = ManiSkill2Config( + **base_kwargs, + env_name="GraspSingleRandomObjectDistractorInScene-v0", + scene_name="google_pick_coke_can_1_v4", + additional_env_build_kwargs={**orientation, "distractor_config": "less"}, + ) + results.append(_run_single_evaluation(env_policy, cfg, ckpt_path)) + + # table textures (baked) + baked_scenes = ["Baked_sc1_staging_objaverse_cabinet1_h870", "Baked_sc1_staging_objaverse_cabinet2_h870"] + for scene in baked_scenes: + for orientation in object_orientation: + cfg = ManiSkill2Config( + **base_kwargs, + env_name="GraspSingleRandomObjectDistractorInScene-v0", + scene_name=scene, + additional_env_build_kwargs={**orientation, "distractor_config": "less"}, + ) + results.append(_run_single_evaluation(env_policy, cfg, ckpt_path)) + + # backgrounds + bg_scenes = ["google_pick_coke_can_1_v4_alt_background", "google_pick_coke_can_1_v4_alt_background_2"] + for scene in bg_scenes: + for orientation in object_orientation: + cfg = ManiSkill2Config( + **base_kwargs, + env_name="GraspSingleRandomObjectDistractorInScene-v0", + scene_name=scene, + additional_env_build_kwargs={**orientation, "distractor_config": "less"}, + ) + results.append(_run_single_evaluation(env_policy, cfg, ckpt_path)) + + # lightings (darker / brighter) + for orientation in object_orientation: + cfg = ManiSkill2Config( + **base_kwargs, + env_name="GraspSingleRandomObjectDistractorInScene-v0", + scene_name="google_pick_coke_can_1_v4", + additional_env_build_kwargs={**orientation, "slightly_darker_lighting": True, "distractor_config": "less"}, + ) + results.append(_run_single_evaluation(env_policy, cfg, ckpt_path)) + + cfg = ManiSkill2Config( + **base_kwargs, + env_name="GraspSingleRandomObjectDistractorInScene-v0", + scene_name="google_pick_coke_can_1_v4", + additional_env_build_kwargs={**orientation, "slightly_brighter_lighting": True, "distractor_config": "less"}, + ) + results.append(_run_single_evaluation(env_policy, cfg, ckpt_path)) + + # camera orientations + alt_envs = ["GraspSingleRandomObjectDistractorAltGoogleCameraInScene-v0", "GraspSingleRandomObjectDistractorAltGoogleCamera2InScene-v0"] + for env in alt_envs: + for orientation in object_orientation: + cfg = ManiSkill2Config( + **base_kwargs, + env_name=env, + scene_name="google_pick_coke_can_1_v4", + additional_env_build_kwargs={**orientation, "distractor_config": "less"}, + ) + results.append(_run_single_evaluation(env_policy, cfg, ckpt_path)) + + return results + + +def drawer_visual_matching(env_policy: AiroaBasePolicy, ckpt_path: str) -> List[List[bool]]: + print("\n--- drawer_visual_matching ---") + results: List[List[bool]] = [] + + env_names = [ + "OpenTopDrawerCustomInScene-v0", + "OpenMiddleDrawerCustomInScene-v0", + "OpenBottomDrawerCustomInScene-v0", + "CloseTopDrawerCustomInScene-v0", + "CloseMiddleDrawerCustomInScene-v0", + "CloseBottomDrawerCustomInScene-v0", + ] + urdf_versions = ["recolor_cabinet_visual_matching_1", "recolor_tabletop_visual_matching_1", "recolor_tabletop_visual_matching_2", None] + + base = dict( + robot="google_robot_static", + policy_setup="google_robot", + control_freq=3, + sim_freq=513, + max_episode_steps=113, + ckpt_path=ckpt_path, + robot_init_rot_quat_center=[0, 0, 0, 1], + obj_init_x_range=[0, 0, 1], + obj_init_y_range=[0, 0, 1], + scene_name="dummy_drawer", + ) + + # 9 
overlay poses (A0/A1/A2/B0/B1/B2/C0/C1/C2) + overlay_poses = [ + # A0/A1/A2 + dict( + robot_init_x_range=[0.644, 0.644, 1], + robot_init_y_range=[-0.179, -0.179, 1], + robot_init_rot_rpy_range=[0, 0, 1, 0, 0, 1, -0.03, -0.03, 1], + rgb_overlay_path="./ManiSkill2_real2sim/data/real_inpainting/open_drawer_a0.png", + ), + dict( + robot_init_x_range=[0.765, 0.765, 1], + robot_init_y_range=[-0.182, -0.182, 1], + robot_init_rot_rpy_range=[0, 0, 1, 0, 0, 1, -0.02, -0.02, 1], + rgb_overlay_path="./ManiSkill2_real2sim/data/real_inpainting/open_drawer_a1.png", + ), + dict( + robot_init_x_range=[0.889, 0.889, 1], + robot_init_y_range=[-0.203, -0.203, 1], + robot_init_rot_rpy_range=[0, 0, 1, 0, 0, 1, -0.06, -0.06, 1], + rgb_overlay_path="./ManiSkill2_real2sim/data/real_inpainting/open_drawer_a2.png", + ), + # B0/B1/B2 + dict( + robot_init_x_range=[0.652, 0.652, 1], + robot_init_y_range=[0.009, 0.009, 1], + robot_init_rot_rpy_range=[0, 0, 1, 0, 0, 1, 0, 0, 1], + rgb_overlay_path="./ManiSkill2_real2sim/data/real_inpainting/open_drawer_b0.png", + ), + dict( + robot_init_x_range=[0.752, 0.752, 1], + robot_init_y_range=[0.009, 0.009, 1], + robot_init_rot_rpy_range=[0, 0, 1, 0, 0, 1, 0, 0, 1], + rgb_overlay_path="./ManiSkill2_real2sim/data/real_inpainting/open_drawer_b1.png", + ), + dict( + robot_init_x_range=[0.851, 0.851, 1], + robot_init_y_range=[0.035, 0.035, 1], + robot_init_rot_rpy_range=[0, 0, 1, 0, 0, 1, 0, 0, 1], + rgb_overlay_path="./ManiSkill2_real2sim/data/real_inpainting/open_drawer_b2.png", + ), + # C0/C1/C2 + dict( + robot_init_x_range=[0.665, 0.665, 1], + robot_init_y_range=[0.224, 0.224, 1], + robot_init_rot_rpy_range=[0, 0, 1, 0, 0, 1, 0, 0, 1], + rgb_overlay_path="./ManiSkill2_real2sim/data/real_inpainting/open_drawer_c0.png", + ), + dict( + robot_init_x_range=[0.765, 0.765, 1], + robot_init_y_range=[0.222, 0.222, 1], + robot_init_rot_rpy_range=[0, 0, 1, 0, 0, 1, -0.025, -0.025, 1], + rgb_overlay_path="./ManiSkill2_real2sim/data/real_inpainting/open_drawer_c1.png", + ), + dict( + robot_init_x_range=[0.865, 0.865, 1], + robot_init_y_range=[0.222, 0.222, 1], + robot_init_rot_rpy_range=[0, 0, 1, 0, 0, 1, -0.025, -0.025, 1], + rgb_overlay_path="./ManiSkill2_real2sim/data/real_inpainting/open_drawer_c2.png", + ), + ] + + add_base = dict(station_name="mk_station_recolor", light_mode="simple", disable_bad_material=True) + + for urdf in urdf_versions: + add_kwargs = {**add_base, "urdf_version": urdf} + for env_name in env_names: + for pose in overlay_poses: + cfg = ManiSkill2Config( + **base, + env_name=env_name, + additional_env_build_kwargs=add_kwargs, + **pose, + ) + results.append(_run_single_evaluation(env_policy, cfg, ckpt_path)) + + return results + + +def drawer_variant_agg(env_policy: AiroaBasePolicy, ckpt_path: str) -> List[List[bool]]: + print("\n--- drawer_variant_agg ---") + results: List[List[bool]] = [] + + env_names = [ + "OpenTopDrawerCustomInScene-v0", + "OpenMiddleDrawerCustomInScene-v0", + "OpenBottomDrawerCustomInScene-v0", + "CloseTopDrawerCustomInScene-v0", + "CloseMiddleDrawerCustomInScene-v0", + "CloseBottomDrawerCustomInScene-v0", + ] + + base = dict( + robot="google_robot_static", + policy_setup="google_robot", + control_freq=3, + sim_freq=513, + max_episode_steps=113, + ckpt_path=ckpt_path, + robot_init_x_range=[0.65, 0.85, 3], + robot_init_y_range=[-0.2, 0.2, 3], + robot_init_rot_quat_center=[0, 0, 0, 1], + robot_init_rot_rpy_range=[0, 0, 1, 0, 0, 1, 0.0, 0.0, 1], + obj_init_x_range=[0, 0, 1], + obj_init_y_range=[0, 0, 1], + ) + + # base (enable raytracing) + for 
env_name in env_names: + cfg = ManiSkill2Config(**base, env_name=env_name, scene_name="frl_apartment_stage_simple", enable_raytracing=True) + results.append(_run_single_evaluation(env_policy, cfg, ckpt_path)) + + # backgrounds (shader_dir=rt) + for scene in ["modern_bedroom_no_roof", "modern_office_no_roof"]: + merged = {"shader_dir": "rt"} + for env_name in env_names: + cfg = ManiSkill2Config(**base, env_name=env_name, scene_name=scene, additional_env_build_kwargs=merged) + results.append(_run_single_evaluation(env_policy, cfg, ckpt_path)) + + # lightings + for light in ["brighter", "darker"]: + merged = {"shader_dir": "rt", "light_mode": light} + for env_name in env_names: + cfg = ManiSkill2Config(**base, env_name=env_name, scene_name="frl_apartment_stage_simple", additional_env_build_kwargs=merged) + results.append(_run_single_evaluation(env_policy, cfg, ckpt_path)) + + # new cabinets + for station in ["mk_station2", "mk_station3"]: + merged = {"shader_dir": "rt", "station_name": station} + for env_name in env_names: + cfg = ManiSkill2Config(**base, env_name=env_name, scene_name="frl_apartment_stage_simple", additional_env_build_kwargs=merged) + results.append(_run_single_evaluation(env_policy, cfg, ckpt_path)) + + return results + + +def move_near_variant_agg(env_policy: AiroaBasePolicy, ckpt_path: str) -> List[List[bool]]: + print("\n--- move_near_variant_agg ---") + results: List[List[bool]] = [] + + base_kwargs = dict( + robot="google_robot_static", + policy_setup="google_robot", + control_freq=3, + sim_freq=513, + max_episode_steps=80, + robot_init_x_range=[0.35, 0.35, 1], + robot_init_y_range=[0.21, 0.21, 1], + obj_variation_mode="episode", + obj_episode_range=[0, 30], + robot_init_rot_quat_center=[0, 0, 0, 1], + robot_init_rot_rpy_range=[0, 0, 1, 0, 0, 1, -0.09, -0.09, 1], + ckpt_path=ckpt_path, + ) + + # base + cfg = ManiSkill2Config(**base_kwargs, env_name="MoveNearGoogleInScene-v0", scene_name="google_pick_coke_can_1_v4") + results.append(_run_single_evaluation(env_policy, cfg, ckpt_path)) + + # distractor(no_distractor=True) + cfg = ManiSkill2Config( + **base_kwargs, + env_name="MoveNearGoogleInScene-v0", + scene_name="google_pick_coke_can_1_v4", + additional_env_build_kwargs={"no_distractor": True}, + ) + results.append(_run_single_evaluation(env_policy, cfg, ckpt_path)) + + # backgrounds + for scene in ["google_pick_coke_can_1_v4_alt_background", "google_pick_coke_can_1_v4_alt_background_2"]: + cfg = ManiSkill2Config(**base_kwargs, env_name="MoveNearGoogleInScene-v0", scene_name=scene) + results.append(_run_single_evaluation(env_policy, cfg, ckpt_path)) + + # lighting + for k in [{"slightly_darker_lighting": True}, {"slightly_brighter_lighting": True}]: + cfg = ManiSkill2Config( + **base_kwargs, + env_name="MoveNearGoogleInScene-v0", + scene_name="google_pick_coke_can_1_v4", + additional_env_build_kwargs=k, + ) + results.append(_run_single_evaluation(env_policy, cfg, ckpt_path)) + + # table textures (baked) + for scene in ["Baked_sc1_staging_objaverse_cabinet1_h870", "Baked_sc1_staging_objaverse_cabinet2_h870"]: + cfg = ManiSkill2Config(**base_kwargs, env_name="MoveNearGoogleInScene-v0", scene_name=scene) + results.append(_run_single_evaluation(env_policy, cfg, ckpt_path)) + + # camera orientations + for env in ["MoveNearAltGoogleCameraInScene-v0", "MoveNearAltGoogleCamera2InScene-v0"]: + cfg = ManiSkill2Config(**base_kwargs, env_name=env, scene_name="google_pick_coke_can_1_v4") + results.append(_run_single_evaluation(env_policy, cfg, ckpt_path)) + + return results + + +def 
move_near_visual_matching(env_policy: AiroaBasePolicy, ckpt_path: str) -> List[List[bool]]: + print("\n--- move_near_visual_matching ---") + results: List[List[bool]] = [] + + base_kwargs = dict( + robot="google_robot_static", + policy_setup="google_robot", + control_freq=3, + sim_freq=513, + max_episode_steps=160, + robot_init_x_range=[0.35, 0.35, 1], + robot_init_y_range=[0.21, 0.21, 1], + obj_variation_mode="episode", + obj_episode_range=[0, 30], + robot_init_rot_quat_center=[0, 0, 0, 1], + robot_init_rot_rpy_range=[0, 0, 1, 0, 0, 1, -0.09, -0.09, 1], + ckpt_path=ckpt_path, + ) + + urdf_versions = [None, "recolor_tabletop_visual_matching_1", "recolor_tabletop_visual_matching_2", "recolor_cabinet_visual_matching_1"] + for urdf in urdf_versions: + cfg = ManiSkill2Config( + **base_kwargs, + env_name="MoveNearGoogleBakedTexInScene-v0", + scene_name="google_pick_coke_can_1_v4", + rgb_overlay_path="./ManiSkill2_real2sim/data/real_inpainting/google_move_near_real_eval_1.png", + additional_env_save_tags="baked_except_bpb_orange", + additional_env_build_kwargs={"urdf_version": urdf}, + ) + results.append(_run_single_evaluation(env_policy, cfg, ckpt_path)) + + return results + + +def put_in_drawer_visual_matching(env_policy: AiroaBasePolicy, ckpt_path: str) -> List[List[bool]]: + print("\n--- put_in_drawer_visual_matching ---") + results: List[List[bool]] = [] + + env_names = ["PlaceIntoClosedTopDrawerCustomInScene-v0"] + urdf_versions = ["recolor_cabinet_visual_matching_1", "recolor_tabletop_visual_matching_1", "recolor_tabletop_visual_matching_2", None] + + base = dict( + robot="google_robot_static", + policy_setup="google_robot", + control_freq=3, + sim_freq=513, + max_episode_steps=400, + ckpt_path=ckpt_path, + robot_init_rot_quat_center=[0, 0, 0, 1], + obj_init_x_range=[-0.08, -0.02, 3], + obj_init_y_range=[-0.02, 0.08, 3], + obj_variation_mode="episode_xy", + obj_episode_range=[0, 3], + ) + + overlay_poses = [ + # A0 + dict( + robot_init_x_range=[0.644, 0.644, 1], + robot_init_y_range=[-0.179, -0.179, 1], + robot_init_rot_rpy_range=[0, 0, 1, 0, 0, 1, -0.03, -0.03, 1], + rgb_overlay_path="./ManiSkill2_real2sim/data/real_inpainting/open_drawer_a0.png", + ), + # B0 + dict( + robot_init_x_range=[0.652, 0.652, 1], + robot_init_y_range=[0.009, 0.009, 1], + robot_init_rot_rpy_range=[0, 0, 1, 0, 0, 1, 0, 0, 1], + rgb_overlay_path="./ManiSkill2_real2sim/data/real_inpainting/open_drawer_b0.png", + ), + # C0 + dict( + robot_init_x_range=[0.665, 0.665, 1], + robot_init_y_range=[0.224, 0.224, 1], + robot_init_rot_rpy_range=[0, 0, 1, 0, 0, 1, 0, 0, 1], + rgb_overlay_path="./ManiSkill2_real2sim/data/real_inpainting/open_drawer_c0.png", + ), + ] + + add_base = dict(station_name="mk_station_recolor", light_mode="simple", disable_bad_material=True, model_ids="baked_apple_v2") + + for urdf in urdf_versions: + add_kwargs = {**add_base, "urdf_version": urdf} + for env_name in env_names: + for pose in overlay_poses: + cfg = ManiSkill2Config( + **base, + env_name=env_name, + scene_name="dummy_drawer", + additional_env_build_kwargs=add_kwargs, + **pose, + ) + results.append(_run_single_evaluation(env_policy, cfg, ckpt_path)) + + return results + + +def put_in_drawer_variant_agg(env_policy: AiroaBasePolicy, ckpt_path: str) -> List[List[bool]]: + print("\n--- put_in_drawer_variant_agg ---") + results: List[List[bool]] = [] + + common = dict( + robot="google_robot_static", + policy_setup="google_robot", + control_freq=3, + sim_freq=513, + max_episode_steps=400, + ckpt_path=ckpt_path, + 
additional_env_build_kwargs={"model_ids": "apple"}, + robot_init_rot_quat_center=[0, 0, 0, 1], + obj_init_x_range=[-0.08, -0.02, 3], + obj_init_y_range=[-0.02, 0.08, 3], + obj_variation_mode="episode_xy", + obj_episode_range=[0, 3], + robot_init_x_range=[0.65, 0.65, 1], + robot_init_y_range=[-0.2, 0.2, 3], + robot_init_rot_rpy_range=[0, 0, 1, 0, 0, 1, 0.0, 0.0, 1], + ) + + env_names = ["PlaceIntoClosedTopDrawerCustomInScene-v0"] + + # base (enable raytracing) + for env_name in env_names: + cfg = ManiSkill2Config(**common, env_name=env_name, scene_name="frl_apartment_stage_simple", enable_raytracing=True) + results.append(_run_single_evaluation(env_policy, cfg, ckpt_path)) + + # backgrounds (shader_dir=rt) + for scene in ["modern_bedroom_no_roof", "modern_office_no_roof"]: + merged = common["additional_env_build_kwargs"].copy() + merged["shader_dir"] = "rt" + for env_name in env_names: + cfg = ManiSkill2Config(**common, env_name=env_name, scene_name=scene, additional_env_build_kwargs=merged) + results.append(_run_single_evaluation(env_policy, cfg, ckpt_path)) + + # lightings + for light_mode in ["brighter", "darker"]: + merged = common["additional_env_build_kwargs"].copy() + merged.update({"shader_dir": "rt", "light_mode": light_mode}) + for env_name in env_names: + cfg = ManiSkill2Config(**common, env_name=env_name, scene_name="frl_apartment_stage_simple", additional_env_build_kwargs=merged) + results.append(_run_single_evaluation(env_policy, cfg, ckpt_path)) + + # new cabinets + for station in ["mk_station2", "mk_station3"]: + merged = common["additional_env_build_kwargs"].copy() + merged.update({"shader_dir": "rt", "station_name": station}) + for env_name in env_names: + cfg = ManiSkill2Config(**common, env_name=env_name, scene_name="frl_apartment_stage_simple", additional_env_build_kwargs=merged) + results.append(_run_single_evaluation(env_policy, cfg, ckpt_path)) + + return results + + +# ====================================================================== +# 総合評価(重み付け・スコアリングは従来どおり) +# ====================================================================== +SIM_WEIGHT = 0.3 +VISUAL_MATCHING_WEIGHT = 0.7 + + +def run_comprehensive_evaluation(env_policy: AiroaBasePolicy, ckpt_path: str) -> Dict[str, float]: + print("=" * 80) + print(f"🚀 STARTING COMPREHENSIVE EVALUATION 🚀") + print(f"Checkpoint: {ckpt_path}") + print(f"Weights: Sim={SIM_WEIGHT}, VisualMatching={VISUAL_MATCHING_WEIGHT}") + print("=" * 80) + + vm_results: List[List[bool]] = [] + sim_results: List[List[bool]] = [] + + vm_results += pick_object_visual_matching(env_policy, ckpt_path) + sim_results += pick_object_variant_agg(env_policy, ckpt_path) + + vm_results += pick_object_among_visual_matching(env_policy, ckpt_path) + sim_results += pick_object_among_variant_agg(env_policy, ckpt_path) + + vm_results += drawer_visual_matching(env_policy, ckpt_path) + sim_results += drawer_variant_agg(env_policy, ckpt_path) + + vm_results += move_near_visual_matching(env_policy, ckpt_path) + sim_results += move_near_variant_agg(env_policy, ckpt_path) + + vm_results += put_in_drawer_visual_matching(env_policy, ckpt_path) + sim_results += put_in_drawer_variant_agg(env_policy, ckpt_path) + + # ロバストスコア + sim_score = calculate_robust_score(sim_results) + vm_score = calculate_robust_score(vm_results) + + total_weight = SIM_WEIGHT + VISUAL_MATCHING_WEIGHT + final_score = 0.0 if total_weight == 0 else (sim_score * SIM_WEIGHT + vm_score * VISUAL_MATCHING_WEIGHT) / total_weight + + print("\n" + "=" * 80) + print("📊 EVALUATION SUMMARY 📊") + print("-" 
* 80) + print(f"Simulation Score (Robust): {sim_score:.4f}") + print(f" - Total Simulation Runs: {len(sim_results)}") + print(f"Visual Matching Score (Robust): {vm_score:.4f}") + print(f" - Total Visual Matching Runs: {len(vm_results)}") + print("-" * 80) + print(f"🏆 Final Weighted Score: {final_score:.4f}") + print("=" * 80) + + return { + "final_score": final_score, + "simulation_robust_score": sim_score, + "visual_matching_robust_score": vm_score, + } diff --git a/simpler_env/evaluation/evaluate_bridge.py b/simpler_env/evaluation/evaluate_bridge.py new file mode 100644 index 00000000..aae17cc3 --- /dev/null +++ b/simpler_env/evaluation/evaluate_bridge.py @@ -0,0 +1,102 @@ +from typing import Any, Dict, List + +from ..policies.base import AiroaBasePolicy +from .config import ManiSkill2Config +from .evaluate import _run_single_evaluation, calculate_robust_score + +# env_name → task_name の対応表 +ENV_TO_TASK: Dict[str, str] = { + "PutSpoonOnTableClothInScene-v0": "widowx_spoon_on_towel", + "PutCarrotOnPlateInScene-v0": "widowx_carrot_on_plate", + "StackGreenCubeOnYellowCubeBakedTexInScene-v0": "widowx_stack_cube", + "PutEggplantInBasketScene-v0": "widowx_put_eggplant_in_basket", +} + +def bridge(env_policy: AiroaBasePolicy, ckpt_path: str) -> List[List[bool]]: + """ + scripts/bridge.sh を Python で忠実に再現。 + 4 本の実行(3 + 1)を順番に回し、各 run の成功配列を返す。 + """ + print("\n--- bridge (scripts/bridge.sh) ---") + results: List[List[bool]] = [] + + # ====== bridge_table_1_v1 ====== + common_v1 = dict( + policy_setup="widowx_bridge", + robot="widowx", + control_freq=5, + sim_freq=500, + max_episode_steps=60, + rgb_overlay_path="ManiSkill2_real2sim/data/real_inpainting/bridge_real_eval_1.png", + robot_init_x_range=[0.147, 0.147, 1], + robot_init_y_range=[0.028, 0.028, 1], + obj_variation_mode="episode", + obj_episode_range=[0, 24], + robot_init_rot_quat_center=[0, 0, 0, 1], + robot_init_rot_rpy_range=[0, 0, 1, 0, 0, 1, 0, 0, 1], + ckpt_path=ckpt_path, + ) + envs_v1 = [ + ("PutCarrotOnPlateInScene-v0", "bridge_table_1_v1"), + ("StackGreenCubeOnYellowCubeBakedTexInScene-v0", "bridge_table_1_v1"), + ("PutSpoonOnTableClothInScene-v0", "bridge_table_1_v1"), + ] + for env_name, scene_name in envs_v1: + task_name = ENV_TO_TASK.get(env_name) + cfg = ManiSkill2Config( + **common_v1, + env_name=env_name, + scene_name=scene_name, + task_name=task_name, + ) + results.append(_run_single_evaluation(env_policy, cfg, ckpt_path)) + + # ====== bridge_table_1_v2 ====== + # 注意: ここだけ max_episode_steps=120、ロボット/カメラセットアップが異なる + cfg_v2 = ManiSkill2Config( + policy_setup="widowx_bridge", + robot="widowx_sink_camera_setup", + control_freq=5, + sim_freq=500, + max_episode_steps=120, + env_name="PutEggplantInBasketScene-v0", + scene_name="bridge_table_1_v2", + rgb_overlay_path="ManiSkill2_real2sim/data/real_inpainting/bridge_sink.png", + robot_init_x_range=[0.127, 0.127, 1], + robot_init_y_range=[0.06, 0.06, 1], + obj_variation_mode="episode", + obj_episode_range=[0, 24], + robot_init_rot_quat_center=[0, 0, 0, 1], + robot_init_rot_rpy_range=[0, 0, 1, 0, 0, 1, 0, 0, 1], + ckpt_path=ckpt_path, + task_name=ENV_TO_TASK["PutEggplantInBasketScene-v0"], # v2 はバスケット固定 + ) + results.append(_run_single_evaluation(env_policy, cfg_v2, ckpt_path)) + + return results + + +def run_comprehensive_evaluation_bridge(env_policy: AiroaBasePolicy, ckpt_path: str) -> Dict[str, Any]: + """ + Bridge 用の総合実行。scripts/bridge.sh を 1:1 で再現し、 + ロバストスコアも合わせて出力する(便宜機能)。 + """ + print("=" * 80) + print("🚧 STARTING BRIDGE EVALUATION (bridge.sh) 🚧") + print(f"Checkpoint: 
{ckpt_path}") + print("=" * 80) + + runs = bridge(env_policy, ckpt_path) + vm_score = calculate_robust_score(runs) # overlay 相当なので Visual Matching としてロバストスコアを算出 + + print("\n" + "=" * 80) + print("📊 BRIDGE EVALUATION SUMMARY 📊") + print("-" * 80) + print(f"Visual Matching Score (Robust): {vm_score:.4f}") + print(f" - Total Runs: {len(runs)}") + print("=" * 80) + + return { + "visual_matching_robust_score": vm_score, + "num_runs": len(runs), + } diff --git a/simpler_env/evaluation/fractal_tasks.py b/simpler_env/evaluation/fractal_tasks.py new file mode 100644 index 00000000..45a7a641 --- /dev/null +++ b/simpler_env/evaluation/fractal_tasks.py @@ -0,0 +1,779 @@ +import random +from typing import Any, Dict, List, Tuple + +import numpy as np +from statsmodels.stats.proportion import proportion_confint + +from ..policies.base import AiroaBasePolicy +from .config import ManiSkill2Config +from .maniskill2_evaluator import maniskill2_evaluator + + +def calculate_score(results: List[bool], penalty_factor: float = 0.5) -> Tuple[float, float, float]: + """ + Args: + results: 各エピソードの成功(True)/失敗(False)の一次元リスト + penalty_factor: 予備パラメータ(現状未使用) + + Returns: + (successes_rate, wilson_lower_bound_95, sample_std_dev) + """ + if not results: + return 0.0, 0.0, 0.0 + + n = len(results) + successes = int(np.sum(results)) # True を 1 として加算 + successes_rate = successes / n + + # Wilsonの二側95%CIの下限 + wilson_lower, _ = proportion_confint(successes, n, alpha=0.05, method="wilson") + wilson_lower = float(wilson_lower) + + # 0/1系列の標本標準偏差(エピソード間のばらつき) + std_dev = float(np.std(results, ddof=1)) if n > 1 else 0.0 + + return float(successes_rate), wilson_lower, std_dev + + +def pick_object_visual_matching(env_policy: AiroaBasePolicy, ckpt_path: str, control_freq: int = 3, num_trials: int = 30) -> List[List[bool]]: + max_time = 53 + max_episode_steps = max_time * control_freq # Original: 160 + + print("\n--- pick_object_visual_matching ---") + results: List[List[bool]] = [] + + direction_orientationions_arr = [ + {"lr_switch": True}, + {"upright": True}, + {"laid_vertically": True}, + ] + urdf_versions = [None, "recolor_tabletop_visual_matching_1", "recolor_tabletop_visual_matching_2", "recolor_cabinet_visual_matching_1"] + + base_kwargs = dict( + robot="google_robot_static", + policy_setup="google_robot", + control_freq=control_freq, + sim_freq=513, + max_episode_steps=max_episode_steps, + ckpt_path=ckpt_path, + robot_init_x_range=[0.35, 0.35, 1], + robot_init_y_range=[0.20, 0.20, 1], + obj_init_x_range=[-0.35, -0.12, 8], + obj_init_y_range=[-0.02, 0.42, 8], + obj_variation_mode="episode_xy", + obj_episode_range=[0, 1], + robot_init_rot_quat_center=[0, 0, 0, 1], + robot_init_rot_rpy_range=[0, 0, 1, 0, 0, 1, 0, 0, 1], + task_name="fractal_pick_object_visual_matching", + ) + + for i in range(num_trials): + urdf = random.choice(urdf_versions) + orientation = random.choice(direction_orientationions_arr) + cfg = ManiSkill2Config( + **base_kwargs, + env_name="GraspSingleRandomObjectInScene-v0", + scene_name="google_pick_coke_can_1_v4", + rgb_overlay_path="./ManiSkill2_real2sim/data/real_inpainting/google_coke_can_real_eval_1.png", + additional_env_build_kwargs={**orientation, "urdf_version": urdf}, + episode_id=i, + ) + results.append(maniskill2_evaluator(env_policy, cfg)) + + return results + + +def pick_object_variant_agg(env_policy: "AiroaBasePolicy", ckpt_path: str, control_freq: int = 3, num_trials: int = 30) -> List[List[bool]]: + """ + ランダムにシーン/姿勢/環境/照明を選んで評価を繰り返す。 + 戻り値は各トライアルの _run_single_evaluation の結果のリスト。 + """ + 
max_time = 53 + max_episode_steps = max_time * control_freq # Original: 160 + + print("\n--- pick_object_variant_agg ---") + + results: List[List[bool]] = [] + + scenes = [ + "google_pick_coke_can_1_v4", + "google_pick_coke_can_1_v4_alt_background", + "google_pick_coke_can_1_v4_alt_background_2", + "Baked_sc1_staging_objaverse_cabinet1_h870", + "Baked_sc1_staging_objaverse_cabinet2_h870", + ] + + object_orientation = [ + {"lr_switch": True}, + {"upright": True}, + {"laid_vertically": True}, + ] + + envs = [ + "GraspSingleRandomObjectInScene-v0", + "GraspSingleRandomObjectAltGoogleCameraInScene-v0", + "GraspSingleRandomObjectAltGoogleCamera2InScene-v0", + ] + + lightings = [None, "darker", "brighter"] + + base_kwargs: Dict[str, Any] = dict( + robot="google_robot_static", + policy_setup="google_robot", + control_freq=control_freq, + sim_freq=513, + max_episode_steps=max_episode_steps, + ckpt_path=ckpt_path, + robot_init_x_range=[0.35, 0.35, 1], + robot_init_y_range=[0.20, 0.20, 1], + obj_init_x_range=[-0.35, -0.12, 5], + obj_init_y_range=[-0.02, 0.42, 5], + obj_variation_mode="episode_xy", + obj_episode_range=[0, 1], + robot_init_rot_quat_center=[0, 0, 0, 1], + robot_init_rot_rpy_range=[0, 0, 1, 0, 0, 1, 0, 0, 1], + task_name="fractal_pick_object_variant_agg", + ) + + for i in range(num_trials): + scene_name = random.choice(scenes) + env_name = random.choice(envs) + orientation = random.choice(object_orientation) + lighting = random.choice(lightings) + + add_kwargs = dict(orientation) + if lighting == "darker": + add_kwargs["slightly_darker_lighting"] = True + elif lighting == "brighter": + add_kwargs["slightly_brighter_lighting"] = True + + cfg = ManiSkill2Config( + **base_kwargs, + env_name=env_name, + scene_name=scene_name, + additional_env_build_kwargs=add_kwargs, + episode_id=i, + ) + + results.append(maniskill2_evaluator(env_policy, cfg)) + + return results + + +def pick_object_among_visual_matching(env_policy: AiroaBasePolicy, ckpt_path: str, control_freq: int = 3, num_trials: int = 30) -> List[List[bool]]: + max_time = 53 + max_episode_steps = max_time * control_freq # Original: 160 + + print("\n--- pick_object_among_visual_matching ---") + results: List[List[bool]] = [] + + object_orientation = [ + {"lr_switch": True}, + {"upright": True}, + {"laid_vertically": True}, + ] + urdf_versions = [None, "recolor_tabletop_visual_matching_1", "recolor_tabletop_visual_matching_2", "recolor_cabinet_visual_matching_1"] + + base_kwargs = dict( + robot="google_robot_static", + policy_setup="google_robot", + control_freq=control_freq, + sim_freq=513, + max_episode_steps=max_episode_steps, + ckpt_path=ckpt_path, + robot_init_x_range=[0.35, 0.35, 1], + robot_init_y_range=[0.20, 0.20, 1], + obj_init_x_range=[-0.35, -0.12, 5], + obj_init_y_range=[-0.02, 0.42, 5], + obj_variation_mode="episode_xy", + obj_episode_range=[0, 1], + robot_init_rot_quat_center=[0, 0, 0, 1], + robot_init_rot_rpy_range=[0, 0, 1, 0, 0, 1, 0, 0, 1], + task_name="fractal_pick_object_among_visual_matching", + ) + + for i in range(num_trials): + urdf = random.choice(urdf_versions) + orientation = random.choice(object_orientation) + cfg = ManiSkill2Config( + **base_kwargs, + env_name="GraspSingleRandomObjectDistractorInScene-v0", + scene_name="google_pick_coke_can_1_v4", + rgb_overlay_path="./ManiSkill2_real2sim/data/real_inpainting/google_coke_can_real_eval_1.png", + additional_env_build_kwargs={**orientation, "urdf_version": urdf, "distractor_config": "less"}, + episode_id=i, + ) + results.append(maniskill2_evaluator(env_policy, 
cfg)) + + return results + + +def pick_object_among_variant_agg(env_policy: AiroaBasePolicy, ckpt_path: str, control_freq: int = 3, num_trials: int = 30) -> List[List[bool]]: + max_time = 53 + max_episode_steps = max_time * control_freq # Original: 160 + + print("\n--- pick_object_among_variant_agg ---") + results: List[List[bool]] = [] + + scenes = [ + "google_pick_coke_can_1_v4", + "google_pick_coke_can_1_v4_alt_background", + "google_pick_coke_can_1_v4_alt_background_2", + "Baked_sc1_staging_objaverse_cabinet1_h870", + "Baked_sc1_staging_objaverse_cabinet2_h870", + ] + envs = [ + "GraspSingleRandomObjectDistractorInScene-v0", + "GraspSingleRandomObjectDistractorAltGoogleCameraInScene-v0", + "GraspSingleRandomObjectDistractorAltGoogleCamera2InScene-v0", + ] + object_orientation = [ + {"lr_switch": True}, + {"upright": True}, + {"laid_vertically": True}, + ] + lightings = [None, "darker", "brighter"] + + base_kwargs = dict( + robot="google_robot_static", + policy_setup="google_robot", + control_freq=control_freq, + sim_freq=513, + max_episode_steps=max_episode_steps, + ckpt_path=ckpt_path, + robot_init_x_range=[0.35, 0.35, 1], + robot_init_y_range=[0.20, 0.20, 1], + obj_init_x_range=[-0.35, -0.12, 5], + obj_init_y_range=[-0.02, 0.42, 5], + obj_variation_mode="episode_xy", + obj_episode_range=[0, 1], + robot_init_rot_quat_center=[0, 0, 0, 1], + robot_init_rot_rpy_range=[0, 0, 1, 0, 0, 1, 0, 0, 1], + task_name="fractal_pick_object_among_variant_agg", + ) + + for i in range(num_trials): + scene = random.choice(scenes) + env = random.choice(envs) + orientation = random.choice(object_orientation) + lighting = random.choice(lightings) + + add_kwargs = {**orientation, "distractor_config": "less"} + if lighting == "darker": + add_kwargs["slightly_darker_lighting"] = True + elif lighting == "brighter": + add_kwargs["slightly_brighter_lighting"] = True + + cfg = ManiSkill2Config( + **base_kwargs, + env_name=env, + scene_name=scene, + additional_env_build_kwargs=add_kwargs, + episode_id=i, + ) + results.append(maniskill2_evaluator(env_policy, cfg)) + + return results + + +def drawer_visual_matching(env_policy: AiroaBasePolicy, ckpt_path: str, control_freq: int = 3, num_trials: int = 30) -> List[List[bool]]: + max_time = 38 + max_episode_steps = max_time * control_freq # Original: 113 + + print("\n--- drawer_visual_matching ---") + results: List[List[bool]] = [] + + env_names = [ + "OpenTopDrawerCustomInScene-v0", + "OpenMiddleDrawerCustomInScene-v0", + "OpenBottomDrawerCustomInScene-v0", + "CloseTopDrawerCustomInScene-v0", + "CloseMiddleDrawerCustomInScene-v0", + "CloseBottomDrawerCustomInScene-v0", + ] + urdf_versions = ["recolor_cabinet_visual_matching_1", "recolor_tabletop_visual_matching_1", "recolor_tabletop_visual_matching_2", None] + + base = dict( + robot="google_robot_static", + policy_setup="google_robot", + control_freq=control_freq, + sim_freq=513, + max_episode_steps=max_episode_steps, + ckpt_path=ckpt_path, + robot_init_rot_quat_center=[0, 0, 0, 1], + obj_init_x_range=[0, 0, 1], + obj_init_y_range=[0, 0, 1], + scene_name="dummy_drawer", + task_name="fractal_drawer_visual_matching", + ) + + overlay_poses = [ + dict( + robot_init_x_range=[0.644, 0.644, 1], + robot_init_y_range=[-0.179, -0.179, 1], + robot_init_rot_rpy_range=[0, 0, 1, 0, 0, 1, -0.03, -0.03, 1], + rgb_overlay_path="./ManiSkill2_real2sim/data/real_inpainting/open_drawer_a0.png", + ), + dict( + robot_init_x_range=[0.765, 0.765, 1], + robot_init_y_range=[-0.182, -0.182, 1], + robot_init_rot_rpy_range=[0, 0, 1, 0, 0, 1, -0.02, 
-0.02, 1], + rgb_overlay_path="./ManiSkill2_real2sim/data/real_inpainting/open_drawer_a1.png", + ), + dict( + robot_init_x_range=[0.889, 0.889, 1], + robot_init_y_range=[-0.203, -0.203, 1], + robot_init_rot_rpy_range=[0, 0, 1, 0, 0, 1, -0.06, -0.06, 1], + rgb_overlay_path="./ManiSkill2_real2sim/data/real_inpainting/open_drawer_a2.png", + ), + dict( + robot_init_x_range=[0.652, 0.652, 1], + robot_init_y_range=[0.009, 0.009, 1], + robot_init_rot_rpy_range=[0, 0, 1, 0, 0, 1, 0, 0, 1], + rgb_overlay_path="./ManiSkill2_real2sim/data/real_inpainting/open_drawer_b0.png", + ), + dict( + robot_init_x_range=[0.752, 0.752, 1], + robot_init_y_range=[0.009, 0.009, 1], + robot_init_rot_rpy_range=[0, 0, 1, 0, 0, 1, 0, 0, 1], + rgb_overlay_path="./ManiSkill2_real2sim/data/real_inpainting/open_drawer_b1.png", + ), + dict( + robot_init_x_range=[0.851, 0.851, 1], + robot_init_y_range=[0.035, 0.035, 1], + robot_init_rot_rpy_range=[0, 0, 1, 0, 0, 1, 0, 0, 1], + rgb_overlay_path="./ManiSkill2_real2sim/data/real_inpainting/open_drawer_b2.png", + ), + dict( + robot_init_x_range=[0.665, 0.665, 1], + robot_init_y_range=[0.224, 0.224, 1], + robot_init_rot_rpy_range=[0, 0, 1, 0, 0, 1, 0, 0, 1], + rgb_overlay_path="./ManiSkill2_real2sim/data/real_inpainting/open_drawer_c0.png", + ), + dict( + robot_init_x_range=[0.765, 0.765, 1], + robot_init_y_range=[0.222, 0.222, 1], + robot_init_rot_rpy_range=[0, 0, 1, 0, 0, 1, -0.025, -0.025, 1], + rgb_overlay_path="./ManiSkill2_real2sim/data/real_inpainting/open_drawer_c1.png", + ), + dict( + robot_init_x_range=[0.865, 0.865, 1], + robot_init_y_range=[0.222, 0.222, 1], + robot_init_rot_rpy_range=[0, 0, 1, 0, 0, 1, -0.025, -0.025, 1], + rgb_overlay_path="./ManiSkill2_real2sim/data/real_inpainting/open_drawer_c2.png", + ), + ] + + add_base = dict(station_name="mk_station_recolor", light_mode="simple", disable_bad_material=True) + + for i in range(num_trials): + urdf = random.choice(urdf_versions) + env_name = random.choice(env_names) + pose = random.choice(overlay_poses) + cfg = ManiSkill2Config( + **base, + env_name=env_name, + additional_env_build_kwargs={**add_base, "urdf_version": urdf}, + **pose, + episode_id=i, + ) + results.append(maniskill2_evaluator(env_policy, cfg)) + + return results + + +def drawer_variant_agg(env_policy: AiroaBasePolicy, ckpt_path: str, control_freq: int = 3, num_trials: int = 30) -> List[List[bool]]: + max_time = 38 + max_episode_steps = max_time * control_freq # Original: 113 + + print("\n--- drawer_variant_agg ---") + results: List[List[bool]] = [] + + env_names = [ + "OpenTopDrawerCustomInScene-v0", + "OpenMiddleDrawerCustomInScene-v0", + "OpenBottomDrawerCustomInScene-v0", + "CloseTopDrawerCustomInScene-v0", + "CloseMiddleDrawerCustomInScene-v0", + "CloseBottomDrawerCustomInScene-v0", + ] + + base = dict( + robot="google_robot_static", + policy_setup="google_robot", + control_freq=control_freq, + sim_freq=513, + max_episode_steps=max_episode_steps, + ckpt_path=ckpt_path, + robot_init_x_range=[0.65, 0.85, 3], + robot_init_y_range=[-0.2, 0.2, 3], + robot_variation_mode="episode_xy", + robot_episode_range=[0, 1], + robot_init_rot_quat_center=[0, 0, 0, 1], + robot_init_rot_rpy_range=[0, 0, 1, 0, 0, 1, 0.0, 0.0, 1], + obj_init_x_range=[0, 0, 1], + obj_init_y_range=[0, 0, 1], + task_name="fractal_drawer_variant_agg", + ) + + background_scenes = ["frl_apartment_stage_simple", "modern_bedroom_no_roof", "modern_office_no_roof"] + stations = ["mk_station", "mk_station2", "mk_station3"] + lightings = [None, "brighter", "darker"] + + for i in 
range(num_trials): + env_name = random.choice(env_names) + scene = random.choice(background_scenes) + station = random.choice(stations) + # enable_raytracing = random.choice([True, False]) + light = random.choice(lightings) + + additional_env_build_kwargs = { + # "shader_dir": "rt", # v100とかはray tracingに対応していない + "light_mode": light, + "station_name": station, + } + + cfg = ManiSkill2Config( + **base, + env_name=env_name, + scene_name=scene, + additional_env_build_kwargs=additional_env_build_kwargs, + enable_raytracing=False, + episode_id=i, + ) + + results.append(maniskill2_evaluator(env_policy, cfg)) + + return results + + +def move_near_visual_matching(env_policy: AiroaBasePolicy, ckpt_path: str, control_freq: int = 3, num_trials: int = 30) -> List[List[bool]]: + max_time = 53 + max_episode_steps = max_time * control_freq # Original: 160 + + print("\n--- move_near_visual_matching ---") + results: List[List[bool]] = [] + + base_kwargs = dict( + robot="google_robot_static", + policy_setup="google_robot", + control_freq=control_freq, + sim_freq=513, + max_episode_steps=max_episode_steps, + robot_init_x_range=[0.35, 0.35, 1], + robot_init_y_range=[0.21, 0.21, 1], + obj_variation_mode="episode", + obj_episode_range=[0, 1], + robot_init_rot_quat_center=[0, 0, 0, 1], + robot_init_rot_rpy_range=[0, 0, 1, 0, 0, 1, -0.09, -0.09, 1], + ckpt_path=ckpt_path, + task_name="fractal_move_near_visual_matching", + ) + + urdf_versions = [None, "recolor_tabletop_visual_matching_1", "recolor_tabletop_visual_matching_2", "recolor_cabinet_visual_matching_1"] + + for i in range(num_trials): + urdf = random.choice(urdf_versions) + cfg = ManiSkill2Config( + **base_kwargs, + env_name="MoveNearGoogleBakedTexInScene-v0", + scene_name="google_pick_coke_can_1_v4", + rgb_overlay_path="./ManiSkill2_real2sim/data/real_inpainting/google_move_near_real_eval_1.png", + additional_env_save_tags="baked_except_bpb_orange", + additional_env_build_kwargs={"urdf_version": urdf}, + episode_id=i, + ) + results.append(maniskill2_evaluator(env_policy, cfg)) + + return results + + +def move_near_variant_agg(env_policy: AiroaBasePolicy, ckpt_path: str, control_freq: int = 3, num_trials: int = 30) -> List[List[bool]]: + max_time = 27 + max_episode_steps = max_time * control_freq # Original: 80 + + print("\n--- move_near_variant_agg ---") + results: List[List[bool]] = [] + + base_kwargs = dict( + robot="google_robot_static", + policy_setup="google_robot", + control_freq=control_freq, + sim_freq=513, + max_episode_steps=max_episode_steps, + robot_init_x_range=[0.35, 0.35, 1], + robot_init_y_range=[0.21, 0.21, 1], + obj_variation_mode="episode", + obj_episode_range=[0, 1], # TODO: widowxとおなじようランダマイズする + robot_init_rot_quat_center=[0, 0, 0, 1], + robot_init_rot_rpy_range=[0, 0, 1, 0, 0, 1, -0.09, -0.09, 1], + task_name="fractal_move_near_variant_agg", + ckpt_path=ckpt_path, + ) + + envs = ["MoveNearGoogleInScene-v0", "MoveNearAltGoogleCameraInScene-v0", "MoveNearAltGoogleCamera2InScene-v0"] + scenes = [ + "google_pick_coke_can_1_v4", + "google_pick_coke_can_1_v4_alt_background", + "google_pick_coke_can_1_v4_alt_background_2", + "Baked_sc1_staging_objaverse_cabinet1_h870", + "Baked_sc1_staging_objaverse_cabinet2_h870", + ] + extras = [None, {"no_distractor": True}, {"slightly_darker_lighting": True}, {"slightly_brighter_lighting": True}] + + for i in range(num_trials): + env = random.choice(envs) + scene = random.choice(scenes) + extra = random.choice(extras) + + cfg = ManiSkill2Config( + **base_kwargs, + env_name=env, + scene_name=scene, + 
additional_env_build_kwargs=extra if extra else None, + episode_id=i, + ) + results.append(maniskill2_evaluator(env_policy, cfg)) + + return results + + +def put_in_drawer_visual_matching(env_policy: AiroaBasePolicy, ckpt_path: str, control_freq: int = 3, num_trials: int = 30) -> List[List[bool]]: + max_time = 133 + max_episode_steps = max_time * control_freq # Original: 400 + + print("\n--- put_in_drawer_visual_matching ---") + results: List[List[bool]] = [] + + env_names = ["PlaceIntoClosedTopDrawerCustomInScene-v0"] + urdf_versions = ["recolor_cabinet_visual_matching_1", "recolor_tabletop_visual_matching_1", "recolor_tabletop_visual_matching_2", None] + + base = dict( + robot="google_robot_static", + policy_setup="google_robot", + control_freq=control_freq, + sim_freq=513, + max_episode_steps=max_episode_steps, + ckpt_path=ckpt_path, + robot_init_rot_quat_center=[0, 0, 0, 1], + obj_init_x_range=[-0.08, -0.02, 3], + obj_init_y_range=[-0.02, 0.08, 3], + obj_variation_mode="episode_xy", + obj_episode_range=[0, 1], + task_name="fractal_put_in_drawer_visual_matching", + ) + + overlay_poses = [ + dict( + robot_init_x_range=[0.644, 0.644, 1], + robot_init_y_range=[-0.179, -0.179, 1], + robot_init_rot_rpy_range=[0, 0, 1, 0, 0, 1, -0.03, -0.03, 1], + rgb_overlay_path="./ManiSkill2_real2sim/data/real_inpainting/open_drawer_a0.png", + ), + dict( + robot_init_x_range=[0.652, 0.652, 1], + robot_init_y_range=[0.009, 0.009, 1], + robot_init_rot_rpy_range=[0, 0, 1, 0, 0, 1, 0, 0, 1], + rgb_overlay_path="./ManiSkill2_real2sim/data/real_inpainting/open_drawer_b0.png", + ), + dict( + robot_init_x_range=[0.665, 0.665, 1], + robot_init_y_range=[0.224, 0.224, 1], + robot_init_rot_rpy_range=[0, 0, 1, 0, 0, 1, 0, 0, 1], + rgb_overlay_path="./ManiSkill2_real2sim/data/real_inpainting/open_drawer_c0.png", + ), + ] + + model_ids = [ + "baked_opened_pepsi_can_v2", + "baked_opened_coke_can_v2", + "baked_opened_7up_can_v2", + "baked_opened_redbull_can_v2", + "baked_blue_plastic_bottle_v2", + "baked_apple_v2", + "baked_orange_v2", + "baked_sponge_v2", + ] + + for i in range(num_trials): + env_name = random.choice(env_names) + urdf = random.choice(urdf_versions) + pose = random.choice(overlay_poses) + model_id = random.choice(model_ids) + + additional_env_build_kwargs = { + "station_name": "mk_station_recolor", + "light_mode": "simple", + "disable_bad_material": True, + "model_ids": model_id, + "urdf_version": urdf, + } + + cfg = ManiSkill2Config( + **base, + env_name=env_name, + scene_name="dummy_drawer", + additional_env_build_kwargs=additional_env_build_kwargs, + **pose, + episode_id=i, + ) + results.append(maniskill2_evaluator(env_policy, cfg)) + + return results + + +def put_in_drawer_variant_agg(env_policy: AiroaBasePolicy, ckpt_path: str, control_freq: int = 3, num_trials: int = 30) -> List[List[bool]]: + max_time = 133 + max_episode_steps = max_time * control_freq # Original: 400 + + print("\n--- put_in_drawer_variant_agg ---") + results: List[List[bool]] = [] + + common = dict( + robot="google_robot_static", + policy_setup="google_robot", + control_freq=control_freq, + sim_freq=513, + max_episode_steps=max_episode_steps, + ckpt_path=ckpt_path, + robot_init_rot_quat_center=[0, 0, 0, 1], + obj_init_x_range=[-0.08, -0.02, 3], + obj_init_y_range=[-0.02, 0.08, 3], + obj_variation_mode="episode_xy", + obj_episode_range=[0, 1], + robot_init_x_range=[0.65, 0.65, 1], + robot_init_y_range=[-0.2, 0.2, 3], + robot_variation_mode="episode_xy", + robot_episode_range=[0, 1], + robot_init_rot_rpy_range=[0, 0, 1, 0, 0, 1, 0.0, 
0.0, 1], + task_name="fractal_put_in_drawer_variant_agg", + ) + + env_names = ["PlaceIntoClosedTopDrawerCustomInScene-v0"] + background_scenes = ["modern_bedroom_no_roof", "modern_office_no_roof"] + stations = ["mk_station", "mk_station2", "mk_station3"] + lightings = [None, "brighter", "darker"] + model_ids = [ + "pepsi_can", + "baked_opened_pepsi_can_v2", + "coke_can", + "opened_coke_can", + "baked_opened_coke_can_v2", + "sprite_can", + "opened_sprite_can", + "baked_opened_7up_can_v2", + "fanta_can", + "opened_fanta_can", + "redbull_can", + "baked_opened_redbull_can_v2", + "blue_plastic_bottle", + "baked_blue_plastic_bottle_v2", + "apple", + "baked_apple_v2", + "orange", + "baked_orange_v2", + "sponge", + "baked_sponge_v2", + ] + + for i in range(num_trials): + env_name = random.choice(env_names) + scene = random.choice(background_scenes) + station = random.choice(stations) + light = random.choice(lightings) + model_id = random.choice(model_ids) + + additional_env_build_kwargs = { + "model_ids": model_id, + # "shader_dir": "rt", + "light_mode": light, + "station_name": station, + } + + cfg = ManiSkill2Config( + **common, + env_name=env_name, + scene_name=scene, + additional_env_build_kwargs=additional_env_build_kwargs, + episode_id=i, + ) + results.append(maniskill2_evaluator(env_policy, cfg)) + + return results + + +# ====================================================================== +# 総合評価(重み付け・スコアリングは従来どおり) +# ====================================================================== + + +def run_comprehensive_evaluation(env_policy: AiroaBasePolicy, ckpt_path: str, control_freq: int = 3) -> Dict[str, float]: + print("=" * 80) + print(f"🚀 STARTING COMPREHENSIVE EVALUATION 🚀") + print(f"Checkpoint: {ckpt_path}") + print("=" * 80) + + # fix seed + random.seed(42) + np.random.seed(42) + + num_trials = 3 + # vm_results: List[List[bool]] = [] + # sim_results: List[List[bool]] = [] + + # vm_results += pick_object_visual_matching(env_policy, ckpt_path, num_trials) + # sim_results += pick_object_variant_agg(env_policy, ckpt_path, num_trials) + + # vm_results += pick_object_among_visual_matching(env_policy, ckpt_path, num_trials) + # sim_results += pick_object_among_variant_agg(env_policy, ckpt_path, num_trials) + + # vm_results += drawer_visual_matching(env_policy, ckpt_path, num_trials) + # sim_results += drawer_variant_agg(env_policy, ckpt_path, num_trials) + + # vm_results += move_near_visual_matching(env_policy, ckpt_path, num_trials) + # sim_results += move_near_variant_agg(env_policy, ckpt_path, num_trials) + + # vm_results += put_in_drawer_visual_matching(env_policy, ckpt_path, num_trials) + # sim_results += put_in_drawer_variant_agg(env_policy, ckpt_path, num_trials) + + results = pick_object_visual_matching(env_policy, ckpt_path, control_freq, num_trials) + success_rate, wilson_score, standard_deviation = calculate_score(results) + print( + f"Pick Object Visual Matching Success Rate: {success_rate:.4f}, Wilson Score: {wilson_score:.4f}, Standard Deviation: {standard_deviation:.4f}" + ) + results = pick_object_variant_agg(env_policy, ckpt_path, control_freq, num_trials) + success_rate, wilson_score, standard_deviation = calculate_score(results) + print(f"Pick Object Variant Agg Success Rate: {success_rate:.4f}, Wilson Score: {wilson_score:.4f}, Standard Deviation: {standard_deviation:.4f}") + + results = pick_object_among_visual_matching(env_policy, ckpt_path, control_freq, num_trials) + success_rate, wilson_score, standard_deviation = calculate_score(results) + print( + f"Pick Object 
Among Visual Matching Success Rate: {success_rate:.4f}, Wilson Score: {wilson_score:.4f}, Standard Deviation: {standard_deviation:.4f}" + ) + results = pick_object_among_variant_agg(env_policy, ckpt_path, control_freq, num_trials) + success_rate, wilson_score, standard_deviation = calculate_score(results) + print( + f"Pick Object Among Variant Agg Success Rate: {success_rate:.4f}, Wilson Score: {wilson_score:.4f}, Standard Deviation: {standard_deviation:.4f}" + ) + results = drawer_visual_matching(env_policy, ckpt_path, control_freq, num_trials) + + results = drawer_variant_agg(env_policy, ckpt_path, control_freq, num_trials) + success_rate, wilson_score, standard_deviation = calculate_score(results) + print(f"Drawer Variant Agg Success Rate: {success_rate:.4f}, Wilson Score: {wilson_score:.4f}, Standard Deviation: {standard_deviation:.4f}") + results = move_near_visual_matching(env_policy, ckpt_path, control_freq, num_trials) + success_rate, wilson_score, standard_deviation = calculate_score(results) + print( + f"Move Near Visual Matching Success Rate: {success_rate:.4f}, Wilson Score: {wilson_score:.4f}, Standard Deviation: {standard_deviation:.4f}" + ) + results = move_near_variant_agg(env_policy, ckpt_path, control_freq, num_trials) + + results = put_in_drawer_visual_matching(env_policy, ckpt_path, control_freq, num_trials) + success_rate, wilson_score, standard_deviation = calculate_score(results) + print( + f"Put In Drawer Visual Matching Success Rate: {success_rate:.4f}, Wilson Score: {wilson_score:.4f}, Standard Deviation: {standard_deviation:.4f}" + ) + results = put_in_drawer_variant_agg(env_policy, ckpt_path, control_freq, num_trials) + success_rate, wilson_score, standard_deviation = calculate_score(results) + print( + f"Put In Drawer Variant Agg Success Rate: {success_rate:.4f}, Wilson Score: {wilson_score:.4f}, Standard Deviation: {standard_deviation:.4f}" + ) diff --git a/simpler_env/evaluation/maniskill2_evaluator.py b/simpler_env/evaluation/maniskill2_evaluator.py index 99a788e0..4ece8452 100644 --- a/simpler_env/evaluation/maniskill2_evaluator.py +++ b/simpler_env/evaluation/maniskill2_evaluator.py @@ -2,18 +2,26 @@ Evaluate a model on ManiSkill2 environment. 
""" +import json import os import numpy as np +from tqdm import tqdm from transforms3d.euler import quat2euler from simpler_env.utils.env.env_builder import build_maniskill2_env, get_robot_control_mode from simpler_env.utils.env.observation_utils import get_image_from_maniskill2_obs_dict -from simpler_env.utils.visualization import write_video +from simpler_env.utils.visualization import write_interval_video, write_video +INF_COST = 999 +master_seed = 42 +rng = np.random.RandomState(master_seed) +success_threshold_s = 5 # [s] def run_maniskill2_eval_single_episode( model, + task_name, + episode_id, ckpt_path, robot_name, env_name, @@ -36,7 +44,6 @@ def run_maniskill2_eval_single_episode( additional_env_save_tags=None, logging_dir="./results", ): - if additional_env_build_kwargs is None: additional_env_build_kwargs = {} @@ -62,7 +69,7 @@ def run_maniskill2_eval_single_episode( **additional_env_build_kwargs, **kwargs, ) - + # __import__('ipdb').set_trace() # initialize environment env_reset_options = { "robot_init_options": { @@ -82,9 +89,12 @@ def run_maniskill2_eval_single_episode( env_reset_options["obj_init_options"] = { "episode_id": obj_episode_id, } - obs, _ = env.reset(options=env_reset_options) + # episode_seed = np.random.randint(0, 100000) + episode_seed = rng.randint(0, 100000) + + obs, _ = env.reset(options=env_reset_options, seed=episode_seed) # for long-horizon environments, we check if the current subtask is the final subtask - is_final_subtask = env.is_final_subtask() + is_final_subtask = env.is_final_subtask() # Obtain language instruction if instruction is not None: @@ -103,64 +113,96 @@ def run_maniskill2_eval_single_episode( # Initialize model model.reset(task_description) + # Initialize subtasks stats + init_episode_stats = env.episode_stats + subtasks = {} + for k, v in init_episode_stats.items(): + subtasks[k] = {"status": v, "cost_time": INF_COST} + timestep = 0 success = "failure" + final_cost_time = INF_COST + success_timestep = 0 + # action_ensemble = model.action_ensemble_temp if hasattr(model, "action_ensemble") else "none" # Step the environment - while not (predicted_terminated or truncated): - # step the model; "raw_action" is raw model action output; "action" is the processed action to be sent into maniskill env - raw_action, action = model.step(image, task_description) - predicted_actions.append(raw_action) - predicted_terminated = bool(action["terminate_episode"][0] > 0) - if predicted_terminated: - if not is_final_subtask: - # advance the environment to the next subtask - predicted_terminated = False - env.advance_to_next_subtask() - - # step the environment - obs, reward, done, truncated, info = env.step( - np.concatenate([action["world_vector"], action["rot_axangle"], action["gripper"]]), - ) - - success = "success" if done else "failure" - new_task_description = env.get_language_instruction() - if new_task_description != task_description: - task_description = new_task_description - print(task_description) - is_final_subtask = env.is_final_subtask() + task_descriptions = [] + with tqdm(total=max_episode_steps, desc=f"Episode {episode_id}", leave=False) as pbar: + while not (predicted_terminated or truncated): + # step the model; "raw_action" is raw model action output; "action" is the processed action to be sent into maniskill env + raw_action, action = model.step(image, task_description, eef_pos=obs["agent"]["eef_pos"]) + predicted_actions.append(raw_action) + predicted_terminated = bool(action["terminate_episode"][0] > 0) + if predicted_terminated: + 
if not is_final_subtask: + # advance the environment to the next subtask + predicted_terminated = False + env.advance_to_next_subtask() + + # step the environment + obs, reward, done, truncated, info = env.step( + np.concatenate([action["world_vector"], action["rot_axangle"], action["gripper"]]), + ) - print(timestep, info) + cur_episode_stats = info.get("episode_stats", {}) + for k, v in cur_episode_stats.items(): + subtasks[k]["status"] = v + subtasks[k]["cost_time"] = min(subtasks[k]["cost_time"], timestep) if v else INF_COST - image = get_image_from_maniskill2_obs_dict(env, obs, camera_name=obs_camera_name) - images.append(image) - timestep += 1 + final_cost_time = min(final_cost_time, timestep) if info["success"] else INF_COST + success_timestep = success_timestep + 1 if info["success"] else 0 + + if success_timestep >= success_threshold_s * control_freq: + print("Succeeded in advance") + break + + success = "success" if done else "failure" + new_task_description = env.get_language_instruction() + if new_task_description != task_description: + task_description = new_task_description + print(task_description) + is_final_subtask = env.is_final_subtask() + + # print(timestep, info) + + image = get_image_from_maniskill2_obs_dict(env, obs, camera_name=obs_camera_name) + images.append(image) + task_descriptions.append(task_description) + timestep += 1 + pbar.update(1) episode_stats = info.get("episode_stats", {}) - # save video - env_save_name = env_name - for k, v in additional_env_build_kwargs.items(): - env_save_name = env_save_name + f"_{k}_{v}" - if additional_env_save_tags is not None: - env_save_name = env_save_name + f"_{additional_env_save_tags}" ckpt_path_basename = ckpt_path if ckpt_path[-1] != "/" else ckpt_path[:-1] ckpt_path_basename = ckpt_path_basename.split("/")[-1] - if obj_variation_mode == "xy": - video_name = f"{success}_obj_{obj_init_x}_{obj_init_y}" - elif obj_variation_mode == "episode": - video_name = f"{success}_obj_episode_{obj_episode_id}" - for k, v in episode_stats.items(): - video_name = video_name + f"_{k}_{v}" - video_name = video_name + ".mp4" - if rgb_overlay_path is not None: - rgb_overlay_path_str = os.path.splitext(os.path.basename(rgb_overlay_path))[0] + base_logging_dir = os.path.join(logging_dir, ckpt_path_basename) + + # このロジックはあったほうがいいかも + # if not os.path.exists(base_logging_dir): + # ckpt_logging_dir = base_logging_dir + # else: + # i = 1 + # while True: + # new_logging_dir = f"{base_logging_dir}_{i}" + # if not os.path.exists(new_logging_dir): + # ckpt_logging_dir = new_logging_dir + # break + # i += 1 + # logging_dir = os.path.join(ckpt_logging_dir, task_name) + # os.makedirs(logging_dir, exist_ok=True) + + ckpt_logging_dir = base_logging_dir + logging_dir = os.path.join(ckpt_logging_dir, task_name) + os.makedirs(logging_dir, exist_ok=True) + + # save video + if success == "success": + success_emoji = "✅" else: - rgb_overlay_path_str = "None" - r, p, y = quat2euler(robot_init_quat) - video_path = f"{ckpt_path_basename}/{scene_name}/{control_mode}/{env_save_name}/rob_{robot_init_x}_{robot_init_y}_rot_{r:.3f}_{p:.3f}_{y:.3f}_rgb_overlay_{rgb_overlay_path_str}/{video_name}" - video_path = os.path.join(logging_dir, video_path) - write_video(video_path, images, fps=5) + success_emoji = "❌" + video_path = os.path.join(logging_dir, f"{episode_id}_{success}.mp4") + write_video(video_path, images, fps=control_freq) + print(f"{success_emoji} Video saved to {video_path}") # save action trajectory action_path = video_path.replace(".mp4", ".png") @@ -169,6 
+211,37 @@ def run_maniskill2_eval_single_episode( action_path = action_root + os.path.basename(action_path) model.visualize_epoch(predicted_actions, images, save_path=action_path) + # save json summary + episode_record = { + "trial_id": episode_id, + **({"obj_episode": obj_episode_id} if obj_variation_mode in ("episode", "episode_xy") else {"obj_init_xy": [obj_init_x, obj_init_y]}), + "task_description": task_description, + "episode_stats": subtasks, + "final_status": success, + "final_cost_time": final_cost_time, + } + + # Details + r, p, y = quat2euler(robot_init_quat) + episode_record["details"] = { + "scene_name": scene_name, + "control_mode": control_mode, + "env_save_name": env_name, + "rgb_overlay_path": (os.path.splitext(os.path.basename(rgb_overlay_path))[0] if rgb_overlay_path else "None"), + "robot_init_xy": [float(f"{robot_init_x:.3f}"), float(f"{robot_init_y:.3f}")], + "robot_init_rpy": [float(f"{r:.3f}"), float(f"{p:.3f}"), float(f"{y:.3f}")], + "add_info": ( + (f"{success}_obj_{obj_init_x}_{obj_init_y}") + if obj_variation_mode in ("xy", "episode_xy") + else (f"{success}_idx_{episode_id}_obj_episode_{obj_episode_id}") + ) + + "".join([f"_{k}_{v}" for k, v in episode_stats.items()]), + "additional_env_build_kwargs": additional_env_build_kwargs, + } + + # Write to json + _append_episode_to_json(ckpt_logging_dir, task_name, episode_record) + return success == "success" @@ -177,43 +250,131 @@ def maniskill2_evaluator(model, args): success_arr = [] # run inference - for robot_init_x in args.robot_init_xs: - for robot_init_y in args.robot_init_ys: + if args.robot_variation_mode == "xy": + for robot_init_x in args.robot_init_xs: + for robot_init_y in args.robot_init_ys: + for robot_init_quat in args.robot_init_quats: + success = _run_single_evaluation(model, args, control_mode, robot_init_x, robot_init_y, robot_init_quat) + success_arr += success + + if args.robot_variation_mode == "episode_xy": + for robot_episode_id in range(args.robot_episode_range[0], args.robot_episode_range[1]): + robot_init_x = rng.uniform(args.robot_init_x_range[0], args.robot_init_x_range[1]) + robot_init_y = rng.uniform(args.robot_init_y_range[0], args.robot_init_y_range[1]) for robot_init_quat in args.robot_init_quats: - kwargs = dict( - model=model, - ckpt_path=args.ckpt_path, - robot_name=args.robot, - env_name=args.env_name, - scene_name=args.scene_name, - robot_init_x=robot_init_x, - robot_init_y=robot_init_y, - robot_init_quat=robot_init_quat, - control_mode=control_mode, - additional_env_build_kwargs=args.additional_env_build_kwargs, - rgb_overlay_path=args.rgb_overlay_path, - control_freq=args.control_freq, - sim_freq=args.sim_freq, - max_episode_steps=args.max_episode_steps, - enable_raytracing=args.enable_raytracing, - additional_env_save_tags=args.additional_env_save_tags, - obs_camera_name=args.obs_camera_name, - logging_dir=args.logging_dir, + success = _run_single_evaluation(model, args, control_mode, robot_init_x, robot_init_y, robot_init_quat) + success_arr += success + + return success_arr + + +def _run_single_evaluation(model, args, control_mode, robot_init_x, robot_init_y, robot_init_quat): + kwargs = dict( + model=model, + task_name=args.task_name, + episode_id=args.episode_id, + ckpt_path=args.ckpt_path, + robot_name=args.robot, + env_name=args.env_name, + scene_name=args.scene_name, + robot_init_x=robot_init_x, + robot_init_y=robot_init_y, + robot_init_quat=robot_init_quat, + control_mode=control_mode, + additional_env_build_kwargs=args.additional_env_build_kwargs, + 
rgb_overlay_path=args.rgb_overlay_path, + control_freq=args.control_freq, + sim_freq=args.sim_freq, + max_episode_steps=args.max_episode_steps, + enable_raytracing=args.enable_raytracing, + additional_env_save_tags=args.additional_env_save_tags, + obs_camera_name=args.obs_camera_name, + logging_dir=args.logging_dir, + ) + success_arr = [] + + if args.obj_variation_mode == "xy": + for obj_init_x in args.obj_init_xs: + for obj_init_y in args.obj_init_ys: + success = run_maniskill2_eval_single_episode( + obj_init_x=obj_init_x, + obj_init_y=obj_init_y, + **kwargs, ) - if args.obj_variation_mode == "xy": - for obj_init_x in args.obj_init_xs: - for obj_init_y in args.obj_init_ys: - success_arr.append( - run_maniskill2_eval_single_episode( - obj_init_x=obj_init_x, - obj_init_y=obj_init_y, - **kwargs, - ) - ) - elif args.obj_variation_mode == "episode": - for obj_episode_id in range(args.obj_episode_range[0], args.obj_episode_range[1]): - success_arr.append(run_maniskill2_eval_single_episode(obj_episode_id=obj_episode_id, **kwargs)) - else: - raise NotImplementedError() + success_arr.append(success) + elif args.obj_variation_mode == "episode": + sampled_ids = rng.choice(range(36), size=args.obj_episode_range[1], replace=True) + for idx, obj_episode_id in enumerate(sampled_ids): + if kwargs["episode_id"] is None: + kwargs["episode_id"] = idx + success = run_maniskill2_eval_single_episode(obj_episode_id=obj_episode_id, **kwargs) + success_arr.append(success) + + elif args.obj_variation_mode == "episode_xy": + for obj_episode_id in range(args.obj_episode_range[0], args.obj_episode_range[1]): + obj_init_x = rng.uniform(args.obj_init_x_range[0], args.obj_init_x_range[1]) + obj_init_y = rng.uniform(args.obj_init_y_range[0], args.obj_init_y_range[1]) + success = run_maniskill2_eval_single_episode(obj_init_x=obj_init_x, obj_init_y=obj_init_y, **kwargs) + success_arr.append(success) + + else: + raise NotImplementedError() return success_arr + + +def _json_default(o): + if isinstance(o, np.bool_): + return bool(o) + if isinstance(o, np.integer): + return int(o) + if isinstance(o, np.floating): + return float(o) + if isinstance(o, np.ndarray): + return o.tolist() + return str(o) + + +def _sanitize_for_json(x): + if isinstance(x, dict): + return {k: _sanitize_for_json(v) for k, v in x.items()} + if isinstance(x, (list, tuple)): + return [_sanitize_for_json(v) for v in x] + if isinstance(x, np.ndarray): + return x.tolist() + if isinstance(x, np.generic): + return x.item() + return x + + +def _append_episode_to_json(ckpt_logging_dir, task_name, episode_record): + json_path = os.path.join(ckpt_logging_dir, "results.json") + data = [] + + # Read existing json file + if os.path.exists(json_path): + try: + with open(json_path, "r", encoding="utf-8") as f: + data = json.load(f) + if not isinstance(data, list): + data = [] + except Exception: + data = [] + + # Update + idx = None + for i, item in enumerate(data): + if isinstance(item, dict) and item.get("task_name") == task_name and isinstance(item.get("episodes"), list): + idx = i + break + + episode_record = _sanitize_for_json(episode_record) + + if idx is None: + data.append({"task_name": task_name, "episodes": [episode_record]}) + else: + data[idx]["episodes"].append(episode_record) + + # Write + with open(json_path, "w", encoding="utf-8") as f: + json.dump(data, f, ensure_ascii=False, indent=2, default=_json_default) diff --git a/simpler_env/evaluation/random_envs.py b/simpler_env/evaluation/random_envs.py new file mode 100644 index 00000000..7987fefc --- 
/dev/null +++ b/simpler_env/evaluation/random_envs.py @@ -0,0 +1,442 @@ +from collections import OrderedDict +from typing import List + +import sapien.core as sapien +from transforms3d.euler import euler2quat +import numpy as np + +from mani_skill2_real2sim.utils.registration import register_env +from mani_skill2_real2sim.utils.common import random_choice +from mani_skill2_real2sim.envs.custom_scenes.put_on_in_scene import PutOnBridgeInSceneEnv +from mani_skill2_real2sim import ASSET_DIR + +DEFAULT_OBJECTS = [ + "yellow_cube_3cm", + "green_cube_3cm", + "eggplant", + "bridge_spoon_generated_modified", + "bridge_carrot_generated_modified", +] + +DEFAULT_TOPS = [ + "bridge_plate_objaverse_larger", + "table_cloth_generated_shorter", +] + + +# Task 1 +@register_env("GraspRandomObjectInScene-v0", max_episode_steps=60) +class GraspRandomObjectInScene(PutOnBridgeInSceneEnv): + def __init__(self, candidate_source_names: List[str] = DEFAULT_OBJECTS, grasp_hold_seconds: float = 3.0, **kwargs): + xy_center = np.array([-0.16, 0.00]) + half_edge_length_x = 0.075 + half_edge_length_y = 0.075 + grid_pos = np.array([[0, 0], [0, 1], [1, 0], [1, 1]]) * 2 - 1 + grid_pos = ( + grid_pos * np.array([half_edge_length_x, half_edge_length_y])[None] + + xy_center[None] + ) + + xy_configs = [] + for i, p1 in enumerate(grid_pos): + for j, p2 in enumerate(grid_pos): + if i != j: + xy_configs.append(np.array([p1, p2])) + + quat_configs = [ + np.array([[1, 0, 0, 0], [1, 0, 0, 0]]), + np.array([euler2quat(0, 0, np.pi / 2), [1, 0, 0, 0]]), + np.array([euler2quat(0, 0, np.pi), [1, 0, 0, 0]]), + ] + + self._placeholder_src = "__random_src_placeholder__" + self._user_src_pool = candidate_source_names + + self.grasp_hold_seconds = grasp_hold_seconds + self.consecutive_grasp = 0 + + super().__init__( + source_obj_name=self._placeholder_src, + target_obj_name="dummy_sink_target_plane", + xy_configs=xy_configs, + quat_configs=quat_configs, + **kwargs, + ) + + def _initialize_actors(self): + super()._initialize_actors() + if self.episode_target_obj is not None: + self.episode_target_obj.hide_visual() + + def reset(self, seed=None, options=None): + if options is None: + options = dict() + options = options.copy() + self.set_episode_rng(seed) + + self._grasp_hold_steps = int(self.grasp_hold_seconds * self.control_freq) + + if self._user_src_pool is not None: + src_candidates = list(self._user_src_pool) + else: + ban = {"sink", "dummy_sink_target_plane", "table_cloth_generated_shorter"} + ban_kw = ["sink", "plane", "cloth", "towel", "target"] + src_candidates = [ + k for k in self.model_db.keys() + if (k not in ban) and all(kw not in k for kw in ban_kw) + ] + assert len(src_candidates) > 0, "No valid source objects to grasp." 
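+ # Note: the source object for this episode is drawn with the per-episode RNG below, so a given seed reproduces the same object sequence.
+ # Success in evaluate() requires check_grasp() to hold for grasp_hold_seconds * control_freq consecutive control steps, after which it is latched.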
+ + self._source_obj_name = random_choice(src_candidates, rng=self._episode_rng) + + self.consecutive_grasp = 0 + self._grasp_success_locked = False + + obs, info = super().reset(seed=self._episode_seed, options=options) + info.update({"random_source_obj_name": self._source_obj_name}) + return obs, info + + def _initialize_episode_stats(self): + self.episode_stats = OrderedDict( + is_src_obj_grasped=False, + consecutive_grasp=False, + grasp_stable=False, + ) + + def evaluate(self, **kwargs): + is_src_obj_grasped = self.agent.check_grasp(self.episode_source_obj) + + if self._grasp_success_locked: + success = True + consecutive_grasp = True + else: + if is_src_obj_grasped: + self.consecutive_grasp += 1 + else: + self.consecutive_grasp = 0 + + consecutive_grasp = self.consecutive_grasp >= self._grasp_hold_steps + success = consecutive_grasp + + if success: + self._grasp_success_locked = True + + self.episode_stats["is_src_obj_grasped"] = ( + self.episode_stats["is_src_obj_grasped"] or is_src_obj_grasped + ) + self.episode_stats["consecutive_grasp"] = ( + self.episode_stats["consecutive_grasp"] or consecutive_grasp + ) + self.episode_stats["grasp_stable"] = success + + return dict( + is_src_obj_grasped=is_src_obj_grasped, + consecutive_grasp=consecutive_grasp, + episode_stats=self.episode_stats, + success=success, + ) + + def get_language_instruction(self, **kwargs): + src_name = self._get_instruction_obj_name(self.episode_source_obj.name) + return f"pick up {src_name}" + + +# Task 2 +@register_env("StackRandomGreenYellowCubeInScene-v0", max_episode_steps=60) +class StackRandomGreenYellowCubeInScene(PutOnBridgeInSceneEnv): + def __init__(self, **kwargs): + xy_center = np.array([-0.16, 0.00]) + half_edge_length_xs = [0.05, 0.1] + half_edge_length_ys = [0.05, 0.1] + xy_configs = [] + for (half_edge_length_x, half_edge_length_y) in zip( + half_edge_length_xs, half_edge_length_ys + ): + grid_pos = np.array([[0, 0], [0, 1], [1, 0], [1, 1]]) * 2 - 1 + grid_pos = ( + grid_pos * np.array([half_edge_length_x, half_edge_length_y])[None] + + xy_center[None] + ) + for i, p1 in enumerate(grid_pos): + for j, p2 in enumerate(grid_pos): + if i != j: + xy_configs.append(np.array([p1, p2])) + + quat_configs = [np.array([[1, 0, 0, 0], [1, 0, 0, 0]])] + + self._placeholder_src = "__random_src_placeholder__" + self._placeholder_tgt = "__random_tgt_placeholder__" + + super().__init__( + source_obj_name=self._placeholder_src, + target_obj_name=self._placeholder_tgt, + xy_configs=xy_configs, + quat_configs=quat_configs, + **kwargs, + ) + + def reset(self, seed=None, options=None): + if options is None: + options = dict() + options = options.copy() + + self.set_episode_rng(seed) + + green = "green_cube_3cm" + yellow = "yellow_cube_3cm" + + if random_choice([0, 1], rng=self._episode_rng): + src, tgt = green, yellow + else: + src, tgt = yellow, green + + self._source_obj_name = src + self._target_obj_name = tgt + + obs, info = super().reset(seed=self._episode_seed, options=options) + info.update({ + "top_cube": self._source_obj_name, + "bottom_cube": self._target_obj_name, + }) + return obs, info + + def get_language_instruction(self, **kwargs): + src = "green block" if "green" in self.episode_source_obj.name else "yellow block" + tgt = "yellow block" if "yellow" in self.episode_target_obj.name else "green block" + return f"stack the {src} on the {tgt}" + + +# Task 3 +@register_env("PutRandomObjectOnRandomTopInScene-v0", max_episode_steps=60) +class PutRandomObjectOnRandomTopInScene(PutOnBridgeInSceneEnv): + def 
__init__( + self, + candidate_source_names: List[str] = DEFAULT_OBJECTS, + candidate_target_names: List[str] = DEFAULT_TOPS, + **kwargs, + ): + xy_center = np.array([-0.16, 0.00]) + half_edge_length_x = 0.075 + half_edge_length_y = 0.075 + grid_pos = np.array([[0, 0], [0, 1], [1, 0], [1, 1]]) * 2 - 1 + grid_pos = ( + grid_pos * np.array([half_edge_length_x, half_edge_length_y])[None] + + xy_center[None] + ) + + xy_configs = [] + for i, p1 in enumerate(grid_pos): + for j, p2 in enumerate(grid_pos): + if i != j: + xy_configs.append(np.array([p1, p2])) + + quat_configs = [ + np.array([[1, 0, 0, 0], [1, 0, 0, 0]]), + np.array([euler2quat(0, 0, np.pi / 2), [1, 0, 0, 0]]), + np.array([euler2quat(0, 0, np.pi), [1, 0, 0, 0]]), + ] + + self._placeholder_src = "__random_src_placeholder__" + self._placeholder_tgt = "__random_tgt_placeholder__" + + self._user_src_pool = candidate_source_names + self._user_tgt_pool = candidate_target_names + + super().__init__( + source_obj_name=self._placeholder_src, + target_obj_name=self._placeholder_tgt, + xy_configs=xy_configs, + quat_configs=quat_configs, + **kwargs, + ) + + def reset(self, seed=None, options=None): + if options is None: + options = dict() + options = options.copy() + + self.set_episode_rng(seed) + + if self._user_src_pool is not None: + src_candidates = list(self._user_src_pool) + else: + ban = {"sink", "dummy_sink_target_plane"} + ban_kw = ["sink", "plane", "cloth", "towel", "target"] + src_candidates = [ + k for k in self.model_db.keys() + if (k not in ban) and all(kw not in k for kw in ban_kw) + ] + + if self._user_tgt_pool is not None: + tgt_candidates = list(self._user_tgt_pool) + else: + prefer_kw = ["plate", "bowl", "tray", "cloth", "towel"] + tgt_candidates = [ + k for k in self.model_db.keys() + if any(kw in k for kw in prefer_kw) + ] + + assert len(src_candidates) > 0, "No valid source objects for random put-on task." + assert len(tgt_candidates) > 0, "No valid container candidates for random put-on task." 
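+ # The source object and the container are drawn independently with the per-episode RNG; if they happen to be the same asset, the source is re-sampled from the remaining candidates below.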
+ + chosen_src = random_choice(src_candidates, rng=self._episode_rng) + chosen_tgt = random_choice(tgt_candidates, rng=self._episode_rng) + if chosen_src == chosen_tgt and len(src_candidates) > 1: + alt = [x for x in src_candidates if x != chosen_tgt] + chosen_src = random_choice(alt, rng=self._episode_rng) + + self._source_obj_name = chosen_src + self._target_obj_name = chosen_tgt + + obs, info = super().reset(seed=self._episode_seed, options=options) + info.update({ + "random_source_obj_name": chosen_src, + "random_container_obj_name": chosen_tgt, + }) + return obs, info + + def evaluate(self, **kwargs): + tgt_name = self.episode_target_obj.name if self.episode_target_obj is not None else "" + soft = ("cloth" in tgt_name) or ("towel" in tgt_name) + if soft: + return super().evaluate(success_require_src_completely_on_target=False, **kwargs) + else: + return super().evaluate(success_require_src_completely_on_target=True, **kwargs) + + def get_language_instruction(self, **kwargs): + src_name = self._get_instruction_obj_name(self.episode_source_obj.name) + tgt_name = self._get_instruction_obj_name(self.episode_target_obj.name) + tgt = "plate" if "plate" in tgt_name else "towel" + return f"put {src_name} on the {tgt}" + + +# Task 4 +@register_env("PutRandomObjectInBasketScene-v0", max_episode_steps=120) +class PutRandomObjectInBasketScene(PutOnBridgeInSceneEnv): + def __init__(self, candidate_source_names: List[str] = DEFAULT_OBJECTS, **kwargs): + target_obj_name = "dummy_sink_target_plane" + + target_xy = np.array([-0.125, 0.025]) + xy_center = [-0.105, 0.206] + half_span_x = 0.01 + half_span_y = 0.015 + num_x = 2 + num_y = 4 + + grid_pos = [] + for x in np.linspace(-half_span_x, half_span_x, num_x): + for y in np.linspace(-half_span_y, half_span_y, num_y): + grid_pos.append(np.array([x + xy_center[0], y + xy_center[1]])) + xy_configs = [np.stack([pos, target_xy], axis=0) for pos in grid_pos] + + quat_configs = [ + np.array([euler2quat(0, 0, 0, "sxyz"), [1, 0, 0, 0]]), + np.array([euler2quat(0, 0, 1 * np.pi / 4, "sxyz"), [1, 0, 0, 0]]), + np.array([euler2quat(0, 0, -1 * np.pi / 4, "sxyz"), [1, 0, 0, 0]]), + ] + + self._placeholder_src = "__random_src_placeholder__" + self._candidate_source_names = candidate_source_names + + super().__init__( + source_obj_name=self._placeholder_src, + target_obj_name=target_obj_name, + xy_configs=xy_configs, + quat_configs=quat_configs, + rgb_always_overlay_objects=["sink", "dummy_sink_target_plane"], + **kwargs, + ) + + def _load_model(self): + super()._load_model() + self.sink_id = "sink" + self.sink = self._build_actor_helper( + self.sink_id, + self._scene, + density=self.model_db[self.sink_id].get("density", 1000), + physical_material=self._scene.create_physical_material( + static_friction=self.obj_static_friction, + dynamic_friction=self.obj_dynamic_friction, + restitution=0.0, + ), + root_dir=self.asset_root, + ) + self.sink.name = self.sink_id + + def _initialize_actors(self): + self.agent.robot.set_pose(sapien.Pose([-10, 0, 0])) + self.sink.set_pose( + sapien.Pose([-0.16, 0.13, 0.88], [1, 0, 0, 0]) + ) + self.sink.lock_motion() + super()._initialize_actors() + + def evaluate(self, *args, **kwargs): + return super().evaluate( + success_require_src_completely_on_target=False, + z_flag_required_offset=0.06, + *args, + **kwargs, + ) + + def reset(self, seed=None, options=None): + if options is None: + options = dict() + options = options.copy() + + self.set_episode_rng(seed) + + if self._candidate_source_names is not None: + candidates
= list(self._candidate_source_names) + else: + forbid = { + "sink", + "dummy_sink_target_plane", + "table_cloth_generated_shorter", + } + ban_keywords = ["sink", "plane", "cloth", "towel", "target"] + candidates = [ + k + for k in self.model_db.keys() + if (k not in forbid) + and all(kw not in k for kw in ban_keywords) + ] + + assert len(candidates) > 0, "No valid source objects found for random basket task." + self._source_obj_name = random_choice(candidates, rng=self._episode_rng) + + obs, info = super().reset(seed=self._episode_seed, options=options) + info.update({"random_source_obj_name": self._source_obj_name}) + return obs, info + + def _setup_prepackaged_env_init_config(self): + ret = super()._setup_prepackaged_env_init_config() + ret["robot"] = "widowx_sink_camera_setup" + ret["scene_name"] = "bridge_table_1_v2" + ret["rgb_overlay_path"] = str(ASSET_DIR / "real_inpainting/bridge_sink.png") + return ret + + def _additional_prepackaged_config_reset(self, options): + options["robot_init_options"] = { + "init_xy": [0.127, 0.06], + "init_rot_quat": [0, 0, 0, 1], + } + return False + + def _setup_lighting(self): + if self.bg_name is not None: + return + shadow = self.enable_shadow + self._scene.set_ambient_light([0.3, 0.3, 0.3]) + self._scene.add_directional_light( + [0, 0, -1], + [0.3, 0.3, 0.3], + position=[0, 0, 1], + shadow=shadow, + scale=5, + shadow_map_size=2048, + ) + + def get_language_instruction(self, **kwargs): + src_name = self._get_instruction_obj_name(self.episode_source_obj.name) + return f"put {src_name} into yellow basket" \ No newline at end of file diff --git a/simpler_env/main_inference.py b/simpler_env/main_inference.py index 320f2685..60336140 100644 --- a/simpler_env/main_inference.py +++ b/simpler_env/main_inference.py @@ -5,8 +5,6 @@ from simpler_env.evaluation.argparse import get_args from simpler_env.evaluation.maniskill2_evaluator import maniskill2_evaluator -from simpler_env.policies.octo.octo_server_model import OctoServerInference -from simpler_env.policies.rt1.rt1_model import RT1Inference try: from simpler_env.policies.octo.octo_model import OctoInference @@ -17,7 +15,6 @@ if __name__ == "__main__": args = get_args() - os.environ["DISPLAY"] = "" # prevent a single jax process from taking up all the GPU memory os.environ["XLA_PYTHON_CLIENT_PREALLOCATE"] = "false" @@ -28,9 +25,10 @@ gpus[0], [tf.config.LogicalDeviceConfiguration(memory_limit=args.tf_memory_limit)], ) - + print(f"**** {args.policy_model} ****") # policy model creation; update this if you are using a new policy model if args.policy_model == "rt1": + from simpler_env.policies.rt1.rt1_model import RT1Inference assert args.ckpt_path is not None model = RT1Inference( saved_model_path=args.ckpt_path, @@ -38,6 +36,7 @@ action_scale=args.action_scale, ) elif "octo" in args.policy_model: + from simpler_env.policies.octo.octo_server_model import OctoServerInference if args.ckpt_path is None or args.ckpt_path == "None": args.ckpt_path = args.policy_model if "server" in args.policy_model: @@ -53,6 +52,56 @@ init_rng=args.octo_init_rng, action_scale=args.action_scale, ) + elif args.policy_model == "openvla": + assert args.ckpt_path is not None + from simpler_env.policies.openvla.openvla_model import OpenVLAInference + model = OpenVLAInference( + saved_model_path=args.ckpt_path, + policy_setup=args.policy_setup, + action_scale=args.action_scale, + ) + elif args.policy_model == "cogact": + from simpler_env.policies.sim_cogact import CogACTInference + assert args.ckpt_path is not None + model = 
CogACTInference( + saved_model_path=args.ckpt_path, # e.g., CogACT/CogACT-Base + policy_setup=args.policy_setup, + action_scale=args.action_scale, + action_model_type='DiT-L', + cfg_scale=1.5 # cfg from 1.5 to 7 also performs well + ) + elif args.policy_model == "spatialvla": + assert args.ckpt_path is not None + from simpler_env.policies.spatialvla.spatialvla_model import SpatialVLAInference + model = SpatialVLAInference( + saved_model_path=args.ckpt_path, + policy_setup=args.policy_setup, + action_scale=args.action_scale, + ) + elif args.policy_model == "openpifast": + assert args.ckpt_path is not None + from simpler_env.policies.openpi.pi0_or_fast import OpenPiFastInference + model = OpenPiFastInference( + saved_model_path=args.ckpt_path, + policy_setup=args.policy_setup, + action_scale=args.action_scale, + ) + elif args.policy_model == "lerobotpifast": + assert args.ckpt_path is not None + from simpler_env.policies.lerobotpi.pi0_or_fast import LerobotPiFastInference + model = LerobotPiFastInference( + saved_model_path=args.ckpt_path, + policy_setup=args.policy_setup, + action_scale=args.action_scale, + ) + elif args.policy_model == "gr00t": + assert args.ckpt_path is not None + from simpler_env.policies.gr00t.gr00t_model import Gr00tInference + model = Gr00tInference( + saved_model_path=args.ckpt_path, + policy_setup=args.policy_setup, + action_scale=args.action_scale, + ) else: raise NotImplementedError() diff --git a/simpler_env/policies/__init__.py b/simpler_env/policies/__init__.py deleted file mode 100644 index 8b137891..00000000 --- a/simpler_env/policies/__init__.py +++ /dev/null @@ -1 +0,0 @@ - diff --git a/simpler_env/policies/adapter.py b/simpler_env/policies/adapter.py new file mode 100644 index 00000000..6b7de3e8 --- /dev/null +++ b/simpler_env/policies/adapter.py @@ -0,0 +1,219 @@ +from typing import Dict, Optional, Sequence + +import matplotlib.pyplot as plt +import numpy as np +import tensorflow as tf + +from simpler_env.utils.geometry import euler2axangle, mat2euler, quat2mat + + +class BaseAdapter: + def __init__(self, policy): + self.policy = policy + + def reset(self, task_description): + pass + + def preprocess(self, image: np.ndarray, prompt: str, eef_pos: np.ndarray) -> Dict: + pass + + def postprocess(self, outputs: Dict) -> Dict: + pass + + def step(self, image: np.ndarray, prompt: str, eef_pos: np.ndarray) -> tuple[dict[str, np.ndarray], dict[str, np.ndarray]]: + inputs = self.preprocess(image, prompt, eef_pos) + outputs = self.policy.step(inputs) + state_gripper = inputs["state"][-1] + action_gripper = outputs["actions"][-1] + # print(f"state: {state_gripper} action: {action_gripper}") + final_outputs = self.postprocess(outputs) + simpler_outputs = { + "world_vector": outputs["actions"][:3], + "rot_axangle": outputs["actions"][3:6], + "gripper": outputs["actions"][6:], + "terminate_episode": outputs["terminate_episode"], + } + final_simpler_outputs = { + "world_vector": final_outputs["actions"][:3], + "rot_axangle": final_outputs["actions"][3:6], + "gripper": final_outputs["actions"][6:], + "terminate_episode": final_outputs["terminate_episode"], + } + return simpler_outputs, final_simpler_outputs + + def _resize_image(self, image: np.ndarray) -> np.ndarray: + image = tf.image.resize( + image, + size=(256, 256), + method="lanczos3", + antialias=True, + ) + image = tf.cast(tf.clip_by_value(tf.round(image), 0, 255), tf.uint8).numpy() + return image + + def visualize_epoch(self, predicted_raw_actions: Sequence[np.ndarray], images: Sequence[np.ndarray], save_path: 
str) -> None: + images = [self._resize_image(image) for image in images] + ACTION_DIM_LABELS = ["x", "y", "z", "roll", "pitch", "yaw", "grasp"] + + img_strip = np.concatenate(np.array(images[::3]), axis=1) + + # set up plt figure + figure_layout = [["image"] * len(ACTION_DIM_LABELS), ACTION_DIM_LABELS] + plt.rcParams.update({"font.size": 12}) + fig, axs = plt.subplot_mosaic(figure_layout) + fig.set_size_inches([45, 10]) + + # plot actions + pred_actions = np.array([np.concatenate([a["world_vector"], a["rot_axangle"], a["gripper"]], axis=-1) for a in predicted_raw_actions]) + for action_dim, action_label in enumerate(ACTION_DIM_LABELS): + # actions have batch, horizon, dim, in this example we just take the first action for simplicity + axs[action_label].plot(pred_actions[:, action_dim], label="predicted action") + axs[action_label].set_title(action_label) + axs[action_label].set_xlabel("Time in one episode") + + axs["image"].imshow(img_strip) + axs["image"].set_xlabel("Time in one episode (subsampled)") + plt.legend() + plt.savefig(save_path) + + +# from https://github.com/allenzren/open-pi-zero/blob/main/src/agent/env_adapter/simpler.py +class AiroaToSimplerBridgeAdapter(BaseAdapter): + def __init__(self, policy): + super().__init__(policy) + # EE pose in Bridge data was relative to a top-down pose, instead of robot base + self.default_rot = np.array( + [[0, 0, 1.0], [0, 1.0, 0], [-1.0, 0, 0]] + ) # https://github.com/rail-berkeley/bridge_data_robot/blob/b841131ecd512bafb303075bd8f8b677e0bf9f1f/widowx_envs/widowx_controller/src/widowx_controller/widowx_controller.py#L203 + + def reset(self, task_description): + pass + + def preprocess(self, image: np.ndarray, prompt: str, eef_pos: np.ndarray) -> dict: + + proprio = eef_pos + rm_bridge = quat2mat(proprio[3:7]) + rpy_bridge_converted = mat2euler(rm_bridge @ self.default_rot.T) + gripper_openness = proprio[7] + proprio = np.concatenate( + [ + proprio[:3], + rpy_bridge_converted, + [gripper_openness], + ] + ) + + inputs = { + "image": image, + "prompt": prompt, + "state": proprio, + } + return inputs + + def postprocess(self, outputs: dict) -> dict: + action = outputs["actions"] + roll, pitch, yaw = action[3:6] + action_rotation_ax, action_rotation_angle = euler2axangle(roll, pitch, yaw) + + """from simpler octo inference: https://github.com/allenzren/SimplerEnv/blob/7d39d8a44e6d5ec02d4cdc9101bb17f5913bcd2a/simpler_env/policies/octo/octo_model.py#L234-L235""" + # trained with [0, 1], 0 for close, 1 for open + # convert to -1 close, 1 open for simpler + action_gripper = action[-1] + action_gripper = 2.0 * (action_gripper > 0.5) - 1.0 + + action = np.concatenate( + [ + action[:3], + action_rotation_ax * action_rotation_angle, + [action_gripper], + ] + ) + return { + "actions": action, + "terminate_episode": outputs["terminate_episode"], + } + + +class AiroaToSimplerFractalAdapter(BaseAdapter): + def __init__(self, policy): + super().__init__(policy) + # Constants + self.sticky_gripper_num_repeat = 15 # same used in Octo. Note this is for every individual action, not every action chunk. Control freq is 3Hz, so roughly sticky for 5 seconds. 
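+ # Illustrative timing, assuming the 3 Hz control frequency noted above: once postprocess() below detects a gripper change (|relative action| > 0.5),
+ # that command is repeated for sticky_gripper_num_repeat consecutive steps (~5 s) before new gripper commands take effect again.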
+ + def reset(self, task_description): + self.sticky_action_is_on = False + self.gripper_action_repeat = 0 + self.sticky_gripper_action = 0.0 + + def preprocess(self, image: np.ndarray, prompt: str, eef_pos: np.ndarray) -> dict: + """convert wxyz quat from simpler to xyzw used in fractal""" + quat_xyzw = np.roll(eef_pos[3:7], -1) + gripper_width = eef_pos[7] # from simpler, 0 for close, 1 for open continuous + gripper_closedness = ( + 1 - gripper_width + ) # TODO(allenzren): change fractal data processing in training so also use gripper openness in proprio (as in bridge) instead of closedness + proprio = np.concatenate( + ( + eef_pos[:3], + quat_xyzw, + [gripper_closedness], + ) + ) + + # H W C [0, 255] + inputs = { + "image": image, + "prompt": prompt, + "state": proprio, + } + return inputs + + def postprocess(self, outputs: dict) -> dict: + action = outputs["actions"] + roll, pitch, yaw = action[3:6] + action_rotation_ax, action_rotation_angle = euler2axangle(roll, pitch, yaw) + + """from simpler octo inference: https://github.com/allenzren/SimplerEnv/blob/7d39d8a44e6d5ec02d4cdc9101bb17f5913bcd2a/simpler_env/policies/octo/octo_model.py#L187""" + # trained with [0, 1], 0 for close, 1 for open + # convert to -1 open, 1 close for simpler + + gripper_action = action[-1] + + gripper_action = (gripper_action * 2) - 1 # [0, 1] -> [-1, 1] -1 close, 1 open + + # without sticky + relative_gripper_action = -gripper_action + # if self.previous_gripper_action is None: + # relative_gripper_action = -1 # open + # else: + # relative_gripper_action = -action + # self.previous_gripper_action = action + + # switch to sticky closing + if np.abs(relative_gripper_action) > 0.5 and self.sticky_action_is_on is False: + self.sticky_action_is_on = True + self.sticky_gripper_action = relative_gripper_action + + # sticky closing + if self.sticky_action_is_on: + self.gripper_action_repeat += 1 + relative_gripper_action = self.sticky_gripper_action + + # reaching maximum sticky + if self.gripper_action_repeat == self.sticky_gripper_num_repeat: + self.sticky_action_is_on = False + self.gripper_action_repeat = 0 + self.sticky_gripper_action = 0.0 + + action = np.concatenate( + [ + action[:3], + action_rotation_ax * action_rotation_angle, + [relative_gripper_action], + ] + ) + + return { + "actions": action, + "terminate_episode": outputs["terminate_episode"], + } diff --git a/simpler_env/policies/base.py b/simpler_env/policies/base.py new file mode 100644 index 00000000..73eb48a7 --- /dev/null +++ b/simpler_env/policies/base.py @@ -0,0 +1,12 @@ +from abc import abstractmethod +from typing import Dict + + +class AiroaBasePolicy: + @abstractmethod + def step(self, obs: Dict) -> Dict: + pass + + @abstractmethod + def reset(self) -> None: + pass diff --git a/simpler_env/policies/gr00t/geometry.py b/simpler_env/policies/gr00t/geometry.py new file mode 100644 index 00000000..f1e7164a --- /dev/null +++ b/simpler_env/policies/gr00t/geometry.py @@ -0,0 +1,463 @@ +""" +Mostly copied from transforms3d library + +""" + +import math + +import numpy as np + +_FLOAT_EPS = np.finfo(np.float64).eps + +# axis sequences for Euler angles +_NEXT_AXIS = [1, 2, 0, 1] + +# map axes strings to/from tuples of inner axis, parity, repetition, frame +_AXES2TUPLE = { + "sxyz": (0, 0, 0, 0), + "sxyx": (0, 0, 1, 0), + "sxzy": (0, 1, 0, 0), + "sxzx": (0, 1, 1, 0), + "syzx": (1, 0, 0, 0), + "syzy": (1, 0, 1, 0), + "syxz": (1, 1, 0, 0), + "syxy": (1, 1, 1, 0), + "szxy": (2, 0, 0, 0), + "szxz": (2, 0, 1, 0), + "szyx": (2, 1, 0, 0), + "szyz": (2, 1, 1, 
0), + "rzyx": (0, 0, 0, 1), + "rxyx": (0, 0, 1, 1), + "ryzx": (0, 1, 0, 1), + "rxzx": (0, 1, 1, 1), + "rxzy": (1, 0, 0, 1), + "ryzy": (1, 0, 1, 1), + "rzxy": (1, 1, 0, 1), + "ryxy": (1, 1, 1, 1), + "ryxz": (2, 0, 0, 1), + "rzxz": (2, 0, 1, 1), + "rxyz": (2, 1, 0, 1), + "rzyz": (2, 1, 1, 1), +} + +_TUPLE2AXES = dict((v, k) for k, v in _AXES2TUPLE.items()) + +# For testing whether a number is close to zero +_EPS4 = np.finfo(float).eps * 4.0 + + +def mat2euler(mat, axes="sxyz"): + """Return Euler angles from rotation matrix for specified axis sequence. + + Note that many Euler angle triplets can describe one matrix. + + Parameters + ---------- + mat : array-like shape (3, 3) or (4, 4) + Rotation matrix or affine. + axes : str, optional + Axis specification; one of 24 axis sequences as string or encoded + tuple - e.g. ``sxyz`` (the default). + + Returns + ------- + ai : float + First rotation angle (according to `axes`). + aj : float + Second rotation angle (according to `axes`). + ak : float + Third rotation angle (according to `axes`). + + Examples + -------- + >>> R0 = euler2mat(1, 2, 3, 'syxz') + >>> al, be, ga = mat2euler(R0, 'syxz') + >>> R1 = euler2mat(al, be, ga, 'syxz') + >>> np.allclose(R0, R1) + True + """ + try: + firstaxis, parity, repetition, frame = _AXES2TUPLE[axes.lower()] + except (AttributeError, KeyError): + _TUPLE2AXES[axes] # validation + firstaxis, parity, repetition, frame = axes + + i = firstaxis + j = _NEXT_AXIS[i + parity] + k = _NEXT_AXIS[i - parity + 1] + + M = np.array(mat, dtype=np.float64, copy=False)[:3, :3] + if repetition: + sy = math.sqrt(M[i, j] * M[i, j] + M[i, k] * M[i, k]) + if sy > _EPS4: + ax = math.atan2(M[i, j], M[i, k]) + ay = math.atan2(sy, M[i, i]) + az = math.atan2(M[j, i], -M[k, i]) + else: + ax = math.atan2(-M[j, k], M[j, j]) + ay = math.atan2(sy, M[i, i]) + az = 0.0 + else: + cy = math.sqrt(M[i, i] * M[i, i] + M[j, i] * M[j, i]) + if cy > _EPS4: + ax = math.atan2(M[k, j], M[k, k]) + ay = math.atan2(-M[k, i], cy) + az = math.atan2(M[j, i], M[i, i]) + else: + ax = math.atan2(-M[j, k], M[j, j]) + ay = math.atan2(-M[k, i], cy) + az = 0.0 + + if parity: + ax, ay, az = -ax, -ay, -az + if frame: + ax, az = az, ax + return ax, ay, az + + +def quat2mat(q): + """Calculate rotation matrix corresponding to quaternion + + Parameters + ---------- + q : 4 element array-like + + Returns + ------- + M : (3,3) array + Rotation matrix corresponding to input quaternion *q* + + Notes + ----- + Rotation matrix applies to column vectors, and is applied to the + left of coordinate vectors. The algorithm here allows quaternions that + have not been normalized. + + References + ---------- + Algorithm from http://en.wikipedia.org/wiki/Rotation_matrix#Quaternion + + Examples + -------- + >>> import numpy as np + >>> M = quat2mat([1, 0, 0, 0]) # Identity quaternion + >>> np.allclose(M, np.eye(3)) + True + >>> M = quat2mat([0, 1, 0, 0]) # 180 degree rotn around axis 0 + >>> np.allclose(M, np.diag([1, -1, -1])) + True + """ + w, x, y, z = q + Nq = w * w + x * x + y * y + z * z + if Nq < _FLOAT_EPS: + return np.eye(3) + s = 2.0 / Nq + X = x * s + Y = y * s + Z = z * s + wX = w * X + wY = w * Y + wZ = w * Z + xX = x * X + xY = x * Y + xZ = x * Z + yY = y * Y + yZ = y * Z + zZ = z * Z + return np.array( + [ + [1.0 - (yY + zZ), xY - wZ, xZ + wY], + [xY + wZ, 1.0 - (xX + zZ), yZ - wX], + [xZ - wY, yZ + wX, 1.0 - (xX + yY)], + ] + ) + + +# Checks if a matrix is a valid rotation matrix. 
+def isrotation( + R: np.ndarray, + thresh=1e-6, +) -> bool: + Rt = np.transpose(R) + shouldBeIdentity = np.dot(Rt, R) + iden = np.identity(3, dtype=R.dtype) + n = np.linalg.norm(iden - shouldBeIdentity) + return n < thresh + + +def euler2mat(ai, aj, ak, axes="sxyz"): + """Return rotation matrix from Euler angles and axis sequence. + + Parameters + ---------- + ai : float + First rotation angle (according to `axes`). + aj : float + Second rotation angle (according to `axes`). + ak : float + Third rotation angle (according to `axes`). + axes : str, optional + Axis specification; one of 24 axis sequences as string or encoded + tuple - e.g. ``sxyz`` (the default). + + Returns + ------- + mat : array (3, 3) + Rotation matrix or affine. + + Examples + -------- + >>> R = euler2mat(1, 2, 3, 'syxz') + >>> np.allclose(np.sum(R[0]), -1.34786452) + True + >>> R = euler2mat(1, 2, 3, (0, 1, 0, 1)) + >>> np.allclose(np.sum(R[0]), -0.383436184) + True + """ + try: + firstaxis, parity, repetition, frame = _AXES2TUPLE[axes] + except (AttributeError, KeyError): + _TUPLE2AXES[axes] # validation + firstaxis, parity, repetition, frame = axes + + i = firstaxis + j = _NEXT_AXIS[i + parity] + k = _NEXT_AXIS[i - parity + 1] + + if frame: + ai, ak = ak, ai + if parity: + ai, aj, ak = -ai, -aj, -ak + + si, sj, sk = math.sin(ai), math.sin(aj), math.sin(ak) + ci, cj, ck = math.cos(ai), math.cos(aj), math.cos(ak) + cc, cs = ci * ck, ci * sk + sc, ss = si * ck, si * sk + + M = np.eye(3) + if repetition: + M[i, i] = cj + M[i, j] = sj * si + M[i, k] = sj * ci + M[j, i] = sj * sk + M[j, j] = -cj * ss + cc + M[j, k] = -cj * cs - sc + M[k, i] = -sj * ck + M[k, j] = cj * sc + cs + M[k, k] = cj * cc - ss + else: + M[i, i] = cj * ck + M[i, j] = sj * sc - cs + M[i, k] = sj * cc + ss + M[j, i] = cj * sk + M[j, j] = sj * ss + cc + M[j, k] = sj * cs - sc + M[k, i] = -sj + M[k, j] = cj * si + M[k, k] = cj * ci + return M + + +def euler2axangle(ai, aj, ak, axes="sxyz"): + """Return angle, axis corresponding to Euler angles, axis specification + + Parameters + ---------- + ai : float + First rotation angle (according to `axes`). + aj : float + Second rotation angle (according to `axes`). + ak : float + Third rotation angle (according to `axes`). + axes : str, optional + Axis specification; one of 24 axis sequences as string or encoded + tuple - e.g. ``sxyz`` (the default). + + Returns + ------- + vector : array shape (3,) + axis around which rotation occurs + theta : scalar + angle of rotation + + Examples + -------- + >>> vec, theta = euler2axangle(0, 1.5, 0, 'szyx') + >>> np.allclose(vec, [0, 1, 0]) + True + >>> theta + 1.5 + """ + return quat2axangle(euler2quat(ai, aj, ak, axes)) + + +def euler2quat(ai, aj, ak, axes="sxyz"): + """Return `quaternion` from Euler angles and axis sequence `axes` + + Parameters + ---------- + ai : float + First rotation angle (according to `axes`). + aj : float + Second rotation angle (according to `axes`). + ak : float + Third rotation angle (according to `axes`). + axes : str, optional + Axis specification; one of 24 axis sequences as string or encoded + tuple - e.g. ``sxyz`` (the default). 
+ + Returns + ------- + quat : array shape (4,) + Quaternion in w, x, y z (real, then vector) format + + Examples + -------- + >>> q = euler2quat(1, 2, 3, 'ryxz') + >>> np.allclose(q, [0.435953, 0.310622, -0.718287, 0.444435]) + True + """ + try: + firstaxis, parity, repetition, frame = _AXES2TUPLE[axes.lower()] + except (AttributeError, KeyError): + _TUPLE2AXES[axes] # validation + firstaxis, parity, repetition, frame = axes + + i = firstaxis + 1 + j = _NEXT_AXIS[i + parity - 1] + 1 + k = _NEXT_AXIS[i - parity] + 1 + + if frame: + ai, ak = ak, ai + if parity: + aj = -aj + + ai = ai / 2.0 + aj = aj / 2.0 + ak = ak / 2.0 + ci = math.cos(ai) + si = math.sin(ai) + cj = math.cos(aj) + sj = math.sin(aj) + ck = math.cos(ak) + sk = math.sin(ak) + cc = ci * ck + cs = ci * sk + sc = si * ck + ss = si * sk + + q = np.empty((4,)) + if repetition: + q[0] = cj * (cc - ss) + q[i] = cj * (cs + sc) + q[j] = sj * (cc + ss) + q[k] = sj * (cs - sc) + else: + q[0] = cj * cc + sj * ss + q[i] = cj * sc - sj * cs + q[j] = cj * ss + sj * cc + q[k] = cj * cs - sj * sc + if parity: + q[j] *= -1.0 + + return q + + +def quat2axangle(quat, identity_thresh=None): + """Convert quaternion to rotation of angle around axis + + Parameters + ---------- + quat : 4 element sequence + w, x, y, z forming quaternion. + identity_thresh : None or scalar, optional + Threshold below which the norm of the vector part of the quaternion (x, + y, z) is deemed to be 0, leading to the identity rotation. None (the + default) leads to a threshold estimated based on the precision of the + input. + + Returns + ------- + theta : scalar + angle of rotation. + vector : array shape (3,) + axis around which rotation occurs. + + Examples + -------- + >>> vec, theta = quat2axangle([0, 1, 0, 0]) + >>> vec + array([1., 0., 0.]) + >>> np.allclose(theta, np.pi) + True + + If this is an identity rotation, we return a zero angle and an arbitrary + vector: + + >>> quat2axangle([1, 0, 0, 0]) + (array([1., 0., 0.]), 0.0) + + If any of the quaternion values are not finite, we return a NaN in the + angle, and an arbitrary vector: + + >>> quat2axangle([1, np.inf, 0, 0]) + (array([1., 0., 0.]), nan) + + Notes + ----- + A quaternion for which x, y, z are all equal to 0, is an identity rotation. + In this case we return a 0 angle and an arbitrary vector, here [1, 0, 0]. + + The algorithm allows for quaternions that have not been normalized. + """ + quat = np.asarray(quat) + Nq = np.sum(quat**2) + if not np.isfinite(Nq): + return np.array([1.0, 0, 0]), float("nan") + if identity_thresh is None: + try: + identity_thresh = np.finfo(Nq.type).eps * 3 + except (AttributeError, ValueError): # Not a numpy type or not float + identity_thresh = _FLOAT_EPS * 3 + if Nq < _FLOAT_EPS**2: # Results unreliable after normalization + return np.array([1.0, 0, 0]), 0.0 + if Nq != 1: # Normalize if not normalized + s = math.sqrt(Nq) + quat = quat / s + xyz = quat[1:] + len2 = np.sum(xyz**2) + if len2 < identity_thresh**2: + # if vec is nearly 0,0,0, this is an identity rotation + return np.array([1.0, 0, 0]), 0.0 + # Make sure w is not slightly above 1 or below -1 + theta = 2 * math.acos(max(min(quat[0], 1), -1)) + return xyz / math.sqrt(len2), theta + + +def quat2euler(quaternion, axes="sxyz"): + """Euler angles from `quaternion` for specified axis sequence `axes` + + Parameters + ---------- + q : 4 element sequence + w, x, y, z of quaternion + axes : str, optional + Axis specification; one of 24 axis sequences as string or encoded + tuple - e.g. ``sxyz`` (the default). 
+ + Returns + ------- + ai : float + First rotation angle (according to `axes`). + aj : float + Second rotation angle (according to `axes`). + ak : float + Third rotation angle (according to `axes`). + + Examples + -------- + >>> angles = quat2euler([0.99810947, 0.06146124, 0, 0]) + >>> np.allclose(angles, [0.123, 0, 0]) + True + """ + return mat2euler(quat2mat(quaternion), axes) \ No newline at end of file diff --git a/simpler_env/policies/gr00t/gr00t_model.py b/simpler_env/policies/gr00t/gr00t_model.py new file mode 100644 index 00000000..4d4366a7 --- /dev/null +++ b/simpler_env/policies/gr00t/gr00t_model.py @@ -0,0 +1,377 @@ +from typing import Optional, Sequence, List +import os +import matplotlib.pyplot as plt +import numpy as np +from transforms3d.euler import euler2axangle +from collections import deque +from PIL import Image +import torch +import cv2 as cv +from simpler_env.utils.action.action_ensemble import ActionEnsembler +from .geometry import quat2mat, mat2euler, euler2axangle +import numpy as np +import torch +from scipy.spatial.transform import Rotation as R + +from gr00t.eval.robot import RobotInferenceClient +from gr00t.experiment.data_config import DATA_CONFIG_MAP +from gr00t.model.policy import Gr00tPolicy + + +class Gr00tInference: + def __init__( + self, + saved_model_path: str = "pretrained/pi0", + unnorm_key: Optional[str] = None, + policy_setup: str = "widowx_bridge", + exec_horizon: int = 1, + image_size: list[int] = [224, 224], + action_scale: float = 1.0, + action_ensemble_temp: float = -0.8, + ) -> None: + gpu_idx = os.environ.get("GPU_IDX", 0) + self.device = f"cuda:{gpu_idx}" + os.environ["TOKENIZERS_PARALLELISM"] = "false" + if policy_setup == "widowx_bridge": + action_ensemble = False + data_config = "widowx" + image_size = [320, 256] + self.sticky_gripper_num_repeat = 1 + # EE pose in Bridge data was relative to a top-down pose, instead of robot base + self.default_rot = np.array([[0, 0, 1.0], [0, 1.0, 0], [-1.0, 0, 0]]) # https://github.com/rail-berkeley/bridge_data_robot/blob/b841131ecd512bafb303075bd8f8b677e0bf9f1f/widowx_envs/widowx_controller/src/widowx_controller/widowx_controller.py#L203 + embodiment_tag = "new_embodiment" # new_embodiment or widowx + elif policy_setup == "google_robot": + data_config = "google_robot" + action_ensemble = False + image_size = [320, 256] + self.sticky_gripper_num_repeat = 10 + embodiment_tag = "new_embodiment" # new_embodiment or google_robot + else: + raise NotImplementedError( + f"Policy setup {policy_setup} not supported for octo models. The other datasets can be found in the huggingface config.json file." 
+ ) + self.policy_setup = policy_setup + + # load model + data_config = DATA_CONFIG_MAP[data_config] + modality_config = data_config.modality_config() + transforms = data_config.transform() + + self.policy_client = Gr00tPolicy( + model_path=saved_model_path, + modality_config=modality_config, + modality_transform=transforms, + embodiment_tag=embodiment_tag, # embodiment_tag or "gr1" + device="cuda", + denoising_steps=16 + ) + + self.image_size = image_size + self.action_scale = action_scale + self.obs_horizon = 1 + self.obs_interval = 1 + self.pred_action_horizon = 5 + self.image_history = deque(maxlen=self.obs_horizon) + self.exec_horizon = exec_horizon + + self.sticky_action_is_on = False + self.gripper_action_repeat = 0 + self.sticky_gripper_action = 0.0 + self.previous_gripper_action = None + + self.action_ensemble = action_ensemble + self.action_ensemble_temp = action_ensemble_temp + + if self.action_ensemble: + self.action_ensembler = ActionEnsembler( + self.pred_action_horizon, self.action_ensemble_temp + ) + else: + self.action_ensembler = None + + self.task = None + self.task_description = None + + def reset(self, task_description: str) -> None: + self.image_history.clear() + if self.action_ensemble: + self.action_ensembler.reset() + self.task_description = task_description + self.sticky_action_is_on = False + self.gripper_action_repeat = 0 + self.sticky_gripper_action = 0.0 + self.previous_gripper_action = None + self.action_plan = deque() + + def preprocess_widowx_proprio(self, eef_pos) -> np.array: + """convert ee rotation to the frame of top-down + https://github.com/allenzren/open-pi-zero/blob/c3df7fb062175c16f69d7ca4ce042958ea238fb7/src/agent/env_adapter/simpler.py#L167 + """ + # StateEncoding.POS_EULER: xyz + rpy + pad + gripper(openness) + proprio = eef_pos + rm_bridge = quat2mat(proprio[3:7]) + rpy_bridge_converted = mat2euler(rm_bridge @ self.default_rot.T) + gripper_openness = proprio[7] # from simpler, 0 for close, 1 for open + raw_proprio = np.concatenate( + [ + proprio[:3], + rpy_bridge_converted, + [gripper_openness], + ] + ) + return raw_proprio + + def preprocess_google_robot_proprio(self, eef_pos) -> np.array: + """convert wxyz quat from simpler to xyzw used in fractal + https://github.com/allenzren/open-pi-zero/blob/c3df7fb062175c16f69d7ca4ce042958ea238fb7/src/agent/env_adapter/simpler.py#L204 + """ + # StateEncoding.POS_QUAT: xyz + q_xyzw + gripper(closeness) + quat_xyzw = np.roll(eef_pos[3:7], -1) + # quaternion -> Euler angle conversion + norm = np.linalg.norm(quat_xyzw) + if norm > 1e-12: + quat = quat_xyzw / norm + else: + quat = np.array([0,0,0,1], dtype=np.float32) + + rot = R.from_quat(quat) + rpy = rot.as_euler('yxz', degrees=False) # roll, pitch, yaw(yxz) + gripper_width = eef_pos[ + 7 + ] # from simpler, 0 for close, 1 for open + # need invert as the training data comes from closeness + gripper_closedness = ( + 1 - gripper_width + ) # TODO(allenzren): change fractal data processing in training so also use gripper openness in proprio (as in bridge) instead of closedness + raw_proprio = np.concatenate( + ( + eef_pos[:3], + rpy, + [gripper_closedness], + ) + ) + return raw_proprio + + def step( + self, image: np.ndarray, task_description: Optional[str] = None, *args, **kwargs + ) -> tuple[dict[str, np.ndarray], dict[str, np.ndarray]]: + """ + Input: + image: np.ndarray of shape (H, W, 3), uint8 + task_description: Optional[str], task description; if different from previous task description, policy state is reset + Output: + raw_action: dict; raw policy action output +
action: dict; processed action to be sent to the maniskill2 environment, with the following keys: + - 'world_vector': np.ndarray of shape (3,), xyz translation of robot end-effector + - 'rot_axangle': np.ndarray of shape (3,), axis-angle representation of end-effector rotation + - 'gripper': np.ndarray of shape (1,), gripper action + - 'terminate_episode': np.ndarray of shape (1,), 1 if episode should be terminated, 0 otherwise + """ + if task_description is not None: + if task_description != self.task_description: + self.reset(task_description) + + assert image.dtype == np.uint8 + image = self._resize_image(image) + self._add_image_to_history(image) + images: List[Image.Image] = self._obtain_image_history() + + eef_pos = kwargs.get("eef_pos", None) + if self.policy_setup == "widowx_bridge": + state = self.preprocess_widowx_proprio(eef_pos) + batch = { + "video.image": np.array(images[0][None]), # numpy (b h w c) + "state.ee_pos": state[0:3][None], + "state.ee_rot": state[3:6][None], + "state.gripper": state[6:7][None], + "annotation.human.action.task_description": [task_description], + } + if not self.action_plan: + actions = self.policy_client.get_action(batch) + delta_pos = np.asarray(actions["action.delta_ee_pos"]) + delta_rot = np.asarray(actions["action.delta_ee_rot"]) + gripper = np.asarray(actions["action.gripper"]) + + # make all arrays 2-D so their dimensions are consistent + if delta_pos.ndim == 1: + delta_pos = delta_pos.reshape(1, -1) + if delta_rot.ndim == 1: + delta_rot = delta_rot.reshape(1, -1) + + # make sure gripper is a 2-D array of shape (H, 1) + if gripper.ndim == 0: + # scalar case + gripper = gripper.reshape(1, 1) + elif gripper.ndim == 1: + # (H,) -> (H,1) + gripper = gripper[:, None] + elif gripper.ndim == 2 and gripper.shape[1] != 1: + # assume the shape is (1, H) + if gripper.shape[0] == 1: + gripper = gripper.T + if gripper.shape[1] != 1: + gripper = gripper.reshape(-1, 1) + + # binarize + gripper = np.clip(gripper, 0.0, 1.0) + gripper = (gripper > 0.5).astype(np.float32) + + action_chunk_size = min(len(delta_pos), len(delta_rot), len(gripper)) + action_chunk = np.concatenate([delta_pos[:action_chunk_size], delta_rot[:action_chunk_size], gripper[:action_chunk_size]], axis=-1) + self.action_plan.extend(action_chunk) + + elif self.policy_setup == "google_robot": + state = self.preprocess_google_robot_proprio(eef_pos) + batch = { + "video.image": np.array(images[0][None]), + "state.ee_pos": state[0:3][None], + "state.ee_rot": state[3:6][None], + "state.gripper": state[6:7][None], + "annotation.human.action.task_description": [task_description], + } + + if not self.action_plan: + actions = self.policy_client.get_action(batch) + delta_pos = np.asarray(actions["action.delta_ee_pos"]) + delta_rot = np.asarray(actions["action.delta_ee_rot"]) + gripper = np.asarray(actions["action.gripper"]) + + # make all arrays 2-D so their dimensions are consistent + if delta_pos.ndim == 1: + delta_pos = delta_pos.reshape(1, -1) + if delta_rot.ndim == 1: + delta_rot = delta_rot.reshape(1, -1) + + # make sure gripper is a 2-D array of shape (H, 1) + if gripper.ndim == 0: + # scalar case + gripper = gripper.reshape(1, 1) + elif gripper.ndim == 1: + # (H,) -> (H,1) + gripper = gripper[:, None] + elif gripper.ndim == 2 and gripper.shape[1] != 1: + # assume the shape is (1, H) + if gripper.shape[0] == 1: + gripper = gripper.T + if gripper.shape[1] != 1: + gripper = gripper.reshape(-1, 1) + + # binarize + gripper = np.where(gripper < 0.5, 0.0, 1.0) + action_chunk_size = min(len(delta_pos), len(delta_rot), len(gripper)) + action_chunk = np.concatenate([delta_pos[:action_chunk_size], delta_rot[:action_chunk_size],
gripper[:action_chunk_size]], axis=-1) + + self.action_plan.extend(action_chunk) + raw_actions = self.action_plan.popleft() + + + # if self.action_ensemble: + # raw_actions = self.action_ensembler.ensemble_action(raw_actions)[None] + + raw_action = { + "world_vector": np.array(raw_actions[:3]), + "rotation_delta": np.array(raw_actions[3:6]), + "open_gripper": np.array( + raw_actions[6:7] + ), # range [0, 1]; 1 = open; 0 = close + } + + # process raw_action to obtain the action to be sent to the maniskill2 environment + action = {} + action["world_vector"] = raw_action["world_vector"] * self.action_scale + action_rotation_delta = np.asarray( + raw_action["rotation_delta"], dtype=np.float64 + ) + roll, pitch, yaw = action_rotation_delta + action_rotation_ax, action_rotation_angle = euler2axangle(roll, pitch, yaw) + action_rotation_axangle = action_rotation_ax * action_rotation_angle + action["rot_axangle"] = action_rotation_axangle * self.action_scale + + if self.policy_setup == "google_robot": + action["gripper"] = 0 + current_gripper_action = raw_action["open_gripper"] + if self.previous_gripper_action is None: + relative_gripper_action = np.array([0]) + self.previous_gripper_action = current_gripper_action + else: + relative_gripper_action = self.previous_gripper_action - current_gripper_action + + # fix a bug in the SIMPLER code here + # self.previous_gripper_action = current_gripper_action + + if np.abs(relative_gripper_action) > 0.5 and (not self.sticky_action_is_on): + self.sticky_action_is_on = True + self.sticky_gripper_action = relative_gripper_action + self.previous_gripper_action = current_gripper_action + + if self.sticky_action_is_on: + self.gripper_action_repeat += 1 + relative_gripper_action = self.sticky_gripper_action + + if self.gripper_action_repeat == self.sticky_gripper_num_repeat: + self.sticky_action_is_on = False + self.gripper_action_repeat = 0 + self.sticky_gripper_action = 0.0 + + action["gripper"] = relative_gripper_action + + elif self.policy_setup == "widowx_bridge": + action["gripper"] = 2.0 * (raw_action["open_gripper"] > 0.5) - 1.0 + + action["terminate_episode"] = np.array([0.0]) + return raw_action, action + + def _resize_image(self, image: np.ndarray) -> np.ndarray: + image = cv.resize(image, tuple(self.image_size), interpolation=cv.INTER_AREA) + return image + + def _add_image_to_history(self, image: np.ndarray) -> None: + if len(self.image_history) == 0: + self.image_history.extend([image] * self.obs_horizon) + else: + self.image_history.append(image) + + def _obtain_image_history(self) -> List[Image.Image]: + image_history = list(self.image_history) + images = image_history[:: self.obs_interval] + # images = [Image.fromarray(image).convert("RGB") for image in images] + return images + + def visualize_epoch( + self, + predicted_raw_actions: Sequence[np.ndarray], + images: Sequence[np.ndarray], + save_path: str, + ) -> None: + images = [self._resize_image(image) for image in images] + ACTION_DIM_LABELS = ["x", "y", "z", "roll", "pitch", "yaw", "grasp"] + + img_strip = np.concatenate(np.array(images[::3]), axis=1) + + # set up plt figure + figure_layout = [["image"] * len(ACTION_DIM_LABELS), ACTION_DIM_LABELS] + plt.rcParams.update({"font.size": 12}) + fig, axs = plt.subplot_mosaic(figure_layout) + fig.set_size_inches([45, 10]) + + # plot actions + pred_actions = np.array( + [ + np.concatenate( + [a["world_vector"], a["rotation_delta"], a["open_gripper"]], axis=-1 + ) + for a in predicted_raw_actions + ] + ) + for action_dim, action_label in 
enumerate(ACTION_DIM_LABELS): + # actions have batch, horizon, dim, in this example we just take the first action for simplicity + axs[action_label].plot( + pred_actions[:, action_dim], label="predicted action" + ) + axs[action_label].set_title(action_label) + axs[action_label].set_xlabel("Time in one episode") + + axs["image"].imshow(img_strip) + axs["image"].set_xlabel("Time in one episode (subsampled)") + plt.legend() + plt.savefig(save_path) diff --git a/simpler_env/policies/hsr_openpi/pi0_or_fast.py b/simpler_env/policies/hsr_openpi/pi0_or_fast.py new file mode 100644 index 00000000..a26c6813 --- /dev/null +++ b/simpler_env/policies/hsr_openpi/pi0_or_fast.py @@ -0,0 +1,18 @@ +from typing import Dict + +import numpy as np + +from simpler_env.policies.base import AiroaBasePolicy + + +class OpenpiToAiroaPolicy(AiroaBasePolicy): + def __init__(self, policy): + self.policy = policy + + def step(self, obs: Dict) -> Dict: + outputs = self.policy.infer(obs) + outputs["terminate_episode"] = np.zeros(outputs["actions"].shape[0]) + return outputs + + def reset(self) -> None: + self.policy.reset() diff --git a/simpler_env/policies/lerobotpi/geometry.py b/simpler_env/policies/lerobotpi/geometry.py new file mode 100644 index 00000000..f1e7164a --- /dev/null +++ b/simpler_env/policies/lerobotpi/geometry.py @@ -0,0 +1,463 @@ +""" +Mostly copied from transforms3d library + +""" + +import math + +import numpy as np + +_FLOAT_EPS = np.finfo(np.float64).eps + +# axis sequences for Euler angles +_NEXT_AXIS = [1, 2, 0, 1] + +# map axes strings to/from tuples of inner axis, parity, repetition, frame +_AXES2TUPLE = { + "sxyz": (0, 0, 0, 0), + "sxyx": (0, 0, 1, 0), + "sxzy": (0, 1, 0, 0), + "sxzx": (0, 1, 1, 0), + "syzx": (1, 0, 0, 0), + "syzy": (1, 0, 1, 0), + "syxz": (1, 1, 0, 0), + "syxy": (1, 1, 1, 0), + "szxy": (2, 0, 0, 0), + "szxz": (2, 0, 1, 0), + "szyx": (2, 1, 0, 0), + "szyz": (2, 1, 1, 0), + "rzyx": (0, 0, 0, 1), + "rxyx": (0, 0, 1, 1), + "ryzx": (0, 1, 0, 1), + "rxzx": (0, 1, 1, 1), + "rxzy": (1, 0, 0, 1), + "ryzy": (1, 0, 1, 1), + "rzxy": (1, 1, 0, 1), + "ryxy": (1, 1, 1, 1), + "ryxz": (2, 0, 0, 1), + "rzxz": (2, 0, 1, 1), + "rxyz": (2, 1, 0, 1), + "rzyz": (2, 1, 1, 1), +} + +_TUPLE2AXES = dict((v, k) for k, v in _AXES2TUPLE.items()) + +# For testing whether a number is close to zero +_EPS4 = np.finfo(float).eps * 4.0 + + +def mat2euler(mat, axes="sxyz"): + """Return Euler angles from rotation matrix for specified axis sequence. + + Note that many Euler angle triplets can describe one matrix. + + Parameters + ---------- + mat : array-like shape (3, 3) or (4, 4) + Rotation matrix or affine. + axes : str, optional + Axis specification; one of 24 axis sequences as string or encoded + tuple - e.g. ``sxyz`` (the default). + + Returns + ------- + ai : float + First rotation angle (according to `axes`). + aj : float + Second rotation angle (according to `axes`). + ak : float + Third rotation angle (according to `axes`). 
+ + Examples + -------- + >>> R0 = euler2mat(1, 2, 3, 'syxz') + >>> al, be, ga = mat2euler(R0, 'syxz') + >>> R1 = euler2mat(al, be, ga, 'syxz') + >>> np.allclose(R0, R1) + True + """ + try: + firstaxis, parity, repetition, frame = _AXES2TUPLE[axes.lower()] + except (AttributeError, KeyError): + _TUPLE2AXES[axes] # validation + firstaxis, parity, repetition, frame = axes + + i = firstaxis + j = _NEXT_AXIS[i + parity] + k = _NEXT_AXIS[i - parity + 1] + + M = np.array(mat, dtype=np.float64, copy=False)[:3, :3] + if repetition: + sy = math.sqrt(M[i, j] * M[i, j] + M[i, k] * M[i, k]) + if sy > _EPS4: + ax = math.atan2(M[i, j], M[i, k]) + ay = math.atan2(sy, M[i, i]) + az = math.atan2(M[j, i], -M[k, i]) + else: + ax = math.atan2(-M[j, k], M[j, j]) + ay = math.atan2(sy, M[i, i]) + az = 0.0 + else: + cy = math.sqrt(M[i, i] * M[i, i] + M[j, i] * M[j, i]) + if cy > _EPS4: + ax = math.atan2(M[k, j], M[k, k]) + ay = math.atan2(-M[k, i], cy) + az = math.atan2(M[j, i], M[i, i]) + else: + ax = math.atan2(-M[j, k], M[j, j]) + ay = math.atan2(-M[k, i], cy) + az = 0.0 + + if parity: + ax, ay, az = -ax, -ay, -az + if frame: + ax, az = az, ax + return ax, ay, az + + +def quat2mat(q): + """Calculate rotation matrix corresponding to quaternion + + Parameters + ---------- + q : 4 element array-like + + Returns + ------- + M : (3,3) array + Rotation matrix corresponding to input quaternion *q* + + Notes + ----- + Rotation matrix applies to column vectors, and is applied to the + left of coordinate vectors. The algorithm here allows quaternions that + have not been normalized. + + References + ---------- + Algorithm from http://en.wikipedia.org/wiki/Rotation_matrix#Quaternion + + Examples + -------- + >>> import numpy as np + >>> M = quat2mat([1, 0, 0, 0]) # Identity quaternion + >>> np.allclose(M, np.eye(3)) + True + >>> M = quat2mat([0, 1, 0, 0]) # 180 degree rotn around axis 0 + >>> np.allclose(M, np.diag([1, -1, -1])) + True + """ + w, x, y, z = q + Nq = w * w + x * x + y * y + z * z + if Nq < _FLOAT_EPS: + return np.eye(3) + s = 2.0 / Nq + X = x * s + Y = y * s + Z = z * s + wX = w * X + wY = w * Y + wZ = w * Z + xX = x * X + xY = x * Y + xZ = x * Z + yY = y * Y + yZ = y * Z + zZ = z * Z + return np.array( + [ + [1.0 - (yY + zZ), xY - wZ, xZ + wY], + [xY + wZ, 1.0 - (xX + zZ), yZ - wX], + [xZ - wY, yZ + wX, 1.0 - (xX + yY)], + ] + ) + + +# Checks if a matrix is a valid rotation matrix. +def isrotation( + R: np.ndarray, + thresh=1e-6, +) -> bool: + Rt = np.transpose(R) + shouldBeIdentity = np.dot(Rt, R) + iden = np.identity(3, dtype=R.dtype) + n = np.linalg.norm(iden - shouldBeIdentity) + return n < thresh + + +def euler2mat(ai, aj, ak, axes="sxyz"): + """Return rotation matrix from Euler angles and axis sequence. + + Parameters + ---------- + ai : float + First rotation angle (according to `axes`). + aj : float + Second rotation angle (according to `axes`). + ak : float + Third rotation angle (according to `axes`). + axes : str, optional + Axis specification; one of 24 axis sequences as string or encoded + tuple - e.g. ``sxyz`` (the default). + + Returns + ------- + mat : array (3, 3) + Rotation matrix or affine. 
+ + Examples + -------- + >>> R = euler2mat(1, 2, 3, 'syxz') + >>> np.allclose(np.sum(R[0]), -1.34786452) + True + >>> R = euler2mat(1, 2, 3, (0, 1, 0, 1)) + >>> np.allclose(np.sum(R[0]), -0.383436184) + True + """ + try: + firstaxis, parity, repetition, frame = _AXES2TUPLE[axes] + except (AttributeError, KeyError): + _TUPLE2AXES[axes] # validation + firstaxis, parity, repetition, frame = axes + + i = firstaxis + j = _NEXT_AXIS[i + parity] + k = _NEXT_AXIS[i - parity + 1] + + if frame: + ai, ak = ak, ai + if parity: + ai, aj, ak = -ai, -aj, -ak + + si, sj, sk = math.sin(ai), math.sin(aj), math.sin(ak) + ci, cj, ck = math.cos(ai), math.cos(aj), math.cos(ak) + cc, cs = ci * ck, ci * sk + sc, ss = si * ck, si * sk + + M = np.eye(3) + if repetition: + M[i, i] = cj + M[i, j] = sj * si + M[i, k] = sj * ci + M[j, i] = sj * sk + M[j, j] = -cj * ss + cc + M[j, k] = -cj * cs - sc + M[k, i] = -sj * ck + M[k, j] = cj * sc + cs + M[k, k] = cj * cc - ss + else: + M[i, i] = cj * ck + M[i, j] = sj * sc - cs + M[i, k] = sj * cc + ss + M[j, i] = cj * sk + M[j, j] = sj * ss + cc + M[j, k] = sj * cs - sc + M[k, i] = -sj + M[k, j] = cj * si + M[k, k] = cj * ci + return M + + +def euler2axangle(ai, aj, ak, axes="sxyz"): + """Return angle, axis corresponding to Euler angles, axis specification + + Parameters + ---------- + ai : float + First rotation angle (according to `axes`). + aj : float + Second rotation angle (according to `axes`). + ak : float + Third rotation angle (according to `axes`). + axes : str, optional + Axis specification; one of 24 axis sequences as string or encoded + tuple - e.g. ``sxyz`` (the default). + + Returns + ------- + vector : array shape (3,) + axis around which rotation occurs + theta : scalar + angle of rotation + + Examples + -------- + >>> vec, theta = euler2axangle(0, 1.5, 0, 'szyx') + >>> np.allclose(vec, [0, 1, 0]) + True + >>> theta + 1.5 + """ + return quat2axangle(euler2quat(ai, aj, ak, axes)) + + +def euler2quat(ai, aj, ak, axes="sxyz"): + """Return `quaternion` from Euler angles and axis sequence `axes` + + Parameters + ---------- + ai : float + First rotation angle (according to `axes`). + aj : float + Second rotation angle (according to `axes`). + ak : float + Third rotation angle (according to `axes`). + axes : str, optional + Axis specification; one of 24 axis sequences as string or encoded + tuple - e.g. ``sxyz`` (the default). 
+ + Returns + ------- + quat : array shape (4,) + Quaternion in w, x, y z (real, then vector) format + + Examples + -------- + >>> q = euler2quat(1, 2, 3, 'ryxz') + >>> np.allclose(q, [0.435953, 0.310622, -0.718287, 0.444435]) + True + """ + try: + firstaxis, parity, repetition, frame = _AXES2TUPLE[axes.lower()] + except (AttributeError, KeyError): + _TUPLE2AXES[axes] # validation + firstaxis, parity, repetition, frame = axes + + i = firstaxis + 1 + j = _NEXT_AXIS[i + parity - 1] + 1 + k = _NEXT_AXIS[i - parity] + 1 + + if frame: + ai, ak = ak, ai + if parity: + aj = -aj + + ai = ai / 2.0 + aj = aj / 2.0 + ak = ak / 2.0 + ci = math.cos(ai) + si = math.sin(ai) + cj = math.cos(aj) + sj = math.sin(aj) + ck = math.cos(ak) + sk = math.sin(ak) + cc = ci * ck + cs = ci * sk + sc = si * ck + ss = si * sk + + q = np.empty((4,)) + if repetition: + q[0] = cj * (cc - ss) + q[i] = cj * (cs + sc) + q[j] = sj * (cc + ss) + q[k] = sj * (cs - sc) + else: + q[0] = cj * cc + sj * ss + q[i] = cj * sc - sj * cs + q[j] = cj * ss + sj * cc + q[k] = cj * cs - sj * sc + if parity: + q[j] *= -1.0 + + return q + + +def quat2axangle(quat, identity_thresh=None): + """Convert quaternion to rotation of angle around axis + + Parameters + ---------- + quat : 4 element sequence + w, x, y, z forming quaternion. + identity_thresh : None or scalar, optional + Threshold below which the norm of the vector part of the quaternion (x, + y, z) is deemed to be 0, leading to the identity rotation. None (the + default) leads to a threshold estimated based on the precision of the + input. + + Returns + ------- + theta : scalar + angle of rotation. + vector : array shape (3,) + axis around which rotation occurs. + + Examples + -------- + >>> vec, theta = quat2axangle([0, 1, 0, 0]) + >>> vec + array([1., 0., 0.]) + >>> np.allclose(theta, np.pi) + True + + If this is an identity rotation, we return a zero angle and an arbitrary + vector: + + >>> quat2axangle([1, 0, 0, 0]) + (array([1., 0., 0.]), 0.0) + + If any of the quaternion values are not finite, we return a NaN in the + angle, and an arbitrary vector: + + >>> quat2axangle([1, np.inf, 0, 0]) + (array([1., 0., 0.]), nan) + + Notes + ----- + A quaternion for which x, y, z are all equal to 0, is an identity rotation. + In this case we return a 0 angle and an arbitrary vector, here [1, 0, 0]. + + The algorithm allows for quaternions that have not been normalized. + """ + quat = np.asarray(quat) + Nq = np.sum(quat**2) + if not np.isfinite(Nq): + return np.array([1.0, 0, 0]), float("nan") + if identity_thresh is None: + try: + identity_thresh = np.finfo(Nq.type).eps * 3 + except (AttributeError, ValueError): # Not a numpy type or not float + identity_thresh = _FLOAT_EPS * 3 + if Nq < _FLOAT_EPS**2: # Results unreliable after normalization + return np.array([1.0, 0, 0]), 0.0 + if Nq != 1: # Normalize if not normalized + s = math.sqrt(Nq) + quat = quat / s + xyz = quat[1:] + len2 = np.sum(xyz**2) + if len2 < identity_thresh**2: + # if vec is nearly 0,0,0, this is an identity rotation + return np.array([1.0, 0, 0]), 0.0 + # Make sure w is not slightly above 1 or below -1 + theta = 2 * math.acos(max(min(quat[0], 1), -1)) + return xyz / math.sqrt(len2), theta + + +def quat2euler(quaternion, axes="sxyz"): + """Euler angles from `quaternion` for specified axis sequence `axes` + + Parameters + ---------- + q : 4 element sequence + w, x, y, z of quaternion + axes : str, optional + Axis specification; one of 24 axis sequences as string or encoded + tuple - e.g. ``sxyz`` (the default). 
+ + Returns + ------- + ai : float + First rotation angle (according to `axes`). + aj : float + Second rotation angle (according to `axes`). + ak : float + Third rotation angle (according to `axes`). + + Examples + -------- + >>> angles = quat2euler([0.99810947, 0.06146124, 0, 0]) + >>> np.allclose(angles, [0.123, 0, 0]) + True + """ + return mat2euler(quat2mat(quaternion), axes) \ No newline at end of file diff --git a/simpler_env/policies/lerobotpi/pi0_or_fast.py b/simpler_env/policies/lerobotpi/pi0_or_fast.py new file mode 100644 index 00000000..3323b2cc --- /dev/null +++ b/simpler_env/policies/lerobotpi/pi0_or_fast.py @@ -0,0 +1,280 @@ +from collections import deque +import os +from typing import List, Optional, Sequence + +import cv2 as cv +import matplotlib.pyplot as plt +import numpy as np +from PIL import Image +import torch +from transforms3d.euler import euler2axangle + +from simpler_env.utils.action.action_ensemble import ActionEnsembler + +from .geometry import mat2euler, quat2mat + + +def auto_model_fn(path): + import sys + + sys.path.append(path) # noqa + from lerobot.policies.pi0.modeling_pi0 import PI0Policy + + return PI0Policy + + +class LerobotPiFastInference: + def __init__( + self, + saved_model_path: str = "pretrained/pi0", + unnorm_key: Optional[str] = None, + policy_setup: str = "widowx_bridge", + exec_horizon: int = 4, + image_size: list[int] = [224, 224], + action_scale: float = 1.0, + action_ensemble_temp: float = -0.8, + ) -> None: + gpu_idx = os.environ.get("GPU_IDX", 0) + self.device = f"cuda:{gpu_idx}" + os.environ["TOKENIZERS_PARALLELISM"] = "false" + if policy_setup == "widowx_bridge": + unnorm_key = "bridge_orig/1.0.0" if unnorm_key is None else unnorm_key + action_ensemble = True + self.sticky_gripper_num_repeat = 1 + # EE pose in Bridge data was relative to a top-down pose, instead of robot base + self.default_rot = np.array( + [[0, 0, 1.0], [0, 1.0, 0], [-1.0, 0, 0]] + ) # https://github.com/rail-berkeley/bridge_data_robot/blob/b841131ecd512bafb303075bd8f8b677e0bf9f1f/widowx_envs/widowx_controller/src/widowx_controller/widowx_controller.py#L203 + elif policy_setup == "google_robot": + unnorm_key = "fractal20220817_data/0.1.0" if unnorm_key is None else unnorm_key + action_ensemble = True + self.sticky_gripper_num_repeat = 10 + else: + raise NotImplementedError( + f"Policy setup {policy_setup} not supported for octo models. The other datasets can be found in the huggingface config.json file." + ) + self.policy_setup = policy_setup + self.unnorm_key = unnorm_key + + print(f"*** policy_setup: {policy_setup}, unnorm_key: {unnorm_key} ***") + + # TODO: add pi0 loading ... 
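+        # Note: auto_model_fn appends `saved_model_path` to sys.path before importing
+        # lerobot's PI0Policy, so the checkpoint directory must also make the `lerobot`
+        # package importable (e.g. by shipping a lerobot source tree next to the weights).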
+ PI0Policy = auto_model_fn(saved_model_path) + self.vla = PI0Policy.from_pretrained(saved_model_path) + self.vla.to(self.device) + self.vla.reset() + + self.image_size = image_size + self.action_scale = action_scale + self.obs_horizon = 1 + self.obs_interval = 1 + self.pred_action_horizon = 5 + self.image_history = deque(maxlen=self.obs_horizon) + self.exec_horizon = exec_horizon + + self.sticky_action_is_on = False + self.gripper_action_repeat = 0 + self.sticky_gripper_action = 0.0 + self.previous_gripper_action = None + + self.action_ensemble = action_ensemble + self.action_ensemble_temp = action_ensemble_temp + + if self.action_ensemble: + self.action_ensembler = ActionEnsembler(self.pred_action_horizon, self.action_ensemble_temp) + else: + self.action_ensembler = None + + self.task = None + self.task_description = None + + def reset(self, task_description: str) -> None: + self.image_history.clear() + if self.action_ensemble: + self.action_ensembler.reset() + self.task_description = task_description + self.sticky_action_is_on = False + self.gripper_action_repeat = 0 + self.sticky_gripper_action = 0.0 + self.previous_gripper_action = None + self.action_plan = deque() + + def preprocess_widowx_proprio(self, eef_pos) -> np.array: + """convert ee rotation to the frame of top-down + https://github.com/allenzren/open-pi-zero/blob/c3df7fb062175c16f69d7ca4ce042958ea238fb7/src/agent/env_adapter/simpler.py#L167 + """ + # StateEncoding.POS_EULER: xyz + rpy + pad + gripper(openness) + proprio = eef_pos + rm_bridge = quat2mat(proprio[3:7]) + rpy_bridge_converted = mat2euler(rm_bridge @ self.default_rot.T) + gripper_openness = proprio[7] # from simpler, 0 for close, 1 for open + raw_proprio = np.concatenate( + [ + proprio[:3], + rpy_bridge_converted, + np.zeros(1), + [gripper_openness], + ] + ) + return raw_proprio + + def preprocess_google_robot_proprio(self, eef_pos) -> np.array: + """convert wxyz quat from simpler to xyzw used in fractal + https://github.com/allenzren/open-pi-zero/blob/c3df7fb062175c16f69d7ca4ce042958ea238fb7/src/agent/env_adapter/simpler.py#L204 + """ + # StateEncoding.POS_QUAT: xyz + q_xyzw + gripper(closeness) + quat_xyzw = np.roll(eef_pos[3:7], -1) + gripper_width = eef_pos[7] # from simpler, 0 for close, 1 for open + # need invert as the training data comes from closeness + gripper_closedness = ( + 1 - gripper_width + ) # TODO(allenzren): change fractal data processing in training so also use gripper openness in proprio (as in bridge) instead of closedness + raw_proprio = np.concatenate( + ( + eef_pos[:3], + quat_xyzw, + [gripper_closedness], + ) + ) + return raw_proprio + + def step(self, image: np.ndarray, task_description: Optional[str] = None, *args, **kwargs) -> tuple[dict[str, np.ndarray], dict[str, np.ndarray]]: + """ + Input: + image: np.ndarray of shape (H, W, 3), uint8 + task_description: Optional[str], task description; if different from previous task description, policy state is reset + Output: + raw_action: dict; raw policy action output + action: dict; processed action to be sent to the maniskill2 environment, with the following keys: + - 'world_vector': np.ndarray of shape (3,), xyz translation of robot end-effector + - 'rot_axangle': np.ndarray of shape (3,), axis-angle representation of end-effector rotation + - 'gripper': np.ndarray of shape (1,), gripper action + - 'terminate_episode': np.ndarray of shape (1,), 1 if episode should be terminated, 0 otherwise + """ + if task_description is not None: + if task_description != self.task_description: + 
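+                # a changed instruction clears the image history, the queued action chunk, and the ensembler state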
self.reset(task_description) + + assert image.dtype == np.uint8 + image = self._resize_image(image) + self._add_image_to_history(image) + images: List[Image.Image] = self._obtain_image_history() + + eef_pos = kwargs.get("eef_pos", None) + if self.policy_setup == "widowx_bridge": + state = self.preprocess_widowx_proprio(eef_pos) + image_key = "observation.images.image_0" + elif self.policy_setup == "google_robot": + state = self.preprocess_google_robot_proprio(eef_pos) + image_key = "observation.images.image" + + # if self.action_ensemble: + # raw_actions = self.action_ensembler.ensemble_action(raw_actions)[None] + + if not self.action_plan: + observation = { + "observation.state": torch.from_numpy(state).unsqueeze(0).to(self.device).float(), + image_key: torch.from_numpy(images[0] / 255).permute(2, 0, 1).unsqueeze(0).to(self.device).float(), + "task": [task_description], + } + + # model output gripper action, +1 = open, 0 = close + action_chunk = self.vla.select_action(observation)[0][: self.pred_action_horizon].cpu().numpy() + self.action_plan.extend(action_chunk[: self.exec_horizon]) + + raw_actions = self.action_plan.popleft() + + raw_action = { + "world_vector": np.array(raw_actions[:3]), + "rotation_delta": np.array(raw_actions[3:6]), + "open_gripper": np.array(raw_actions[6:7]), # range [0, 1]; 1 = open; 0 = close + } + + # process raw_action to obtain the action to be sent to the maniskill2 environment + action = {} + action["world_vector"] = raw_action["world_vector"] * self.action_scale + action_rotation_delta = np.asarray(raw_action["rotation_delta"], dtype=np.float64) + roll, pitch, yaw = action_rotation_delta + action_rotation_ax, action_rotation_angle = euler2axangle(roll, pitch, yaw) + action_rotation_axangle = action_rotation_ax * action_rotation_angle + action["rot_axangle"] = action_rotation_axangle * self.action_scale + + if self.policy_setup == "google_robot": + action["gripper"] = 0 + current_gripper_action = raw_action["open_gripper"] + if self.previous_gripper_action is None: + relative_gripper_action = np.array([0]) + self.previous_gripper_action = current_gripper_action + else: + relative_gripper_action = self.previous_gripper_action - current_gripper_action + + # fix a bug in the SIMPLER code here + # self.previous_gripper_action = current_gripper_action + + if np.abs(relative_gripper_action) > 0.5 and (not self.sticky_action_is_on): + self.sticky_action_is_on = True + self.sticky_gripper_action = relative_gripper_action + self.previous_gripper_action = current_gripper_action + + if self.sticky_action_is_on: + self.gripper_action_repeat += 1 + relative_gripper_action = self.sticky_gripper_action + + if self.gripper_action_repeat == self.sticky_gripper_num_repeat: + self.sticky_action_is_on = False + self.gripper_action_repeat = 0 + self.sticky_gripper_action = 0.0 + + action["gripper"] = relative_gripper_action + + elif self.policy_setup == "widowx_bridge": + action["gripper"] = 2.0 * (raw_action["open_gripper"] > 0.5) - 1.0 + + action["terminate_episode"] = np.array([0.0]) + return raw_action, action + + def _resize_image(self, image: np.ndarray) -> np.ndarray: + image = cv.resize(image, tuple(self.image_size), interpolation=cv.INTER_AREA) + return image + + def _add_image_to_history(self, image: np.ndarray) -> None: + if len(self.image_history) == 0: + self.image_history.extend([image] * self.obs_horizon) + else: + self.image_history.append(image) + + def _obtain_image_history(self) -> List[Image.Image]: + image_history = list(self.image_history) + images = 
image_history[:: self.obs_interval] + # images = [Image.fromarray(image).convert("RGB") for image in images] + return images + + def visualize_epoch( + self, + predicted_raw_actions: Sequence[np.ndarray], + images: Sequence[np.ndarray], + save_path: str, + ) -> None: + images = [self._resize_image(image) for image in images] + ACTION_DIM_LABELS = ["x", "y", "z", "roll", "pitch", "yaw", "grasp"] + + img_strip = np.concatenate(np.array(images[::3]), axis=1) + + # set up plt figure + figure_layout = [["image"] * len(ACTION_DIM_LABELS), ACTION_DIM_LABELS] + plt.rcParams.update({"font.size": 12}) + fig, axs = plt.subplot_mosaic(figure_layout) + fig.set_size_inches([45, 10]) + + # plot actions + pred_actions = np.array([np.concatenate([a["world_vector"], a["rotation_delta"], a["open_gripper"]], axis=-1) for a in predicted_raw_actions]) + for action_dim, action_label in enumerate(ACTION_DIM_LABELS): + # actions have batch, horizon, dim, in this example we just take the first action for simplicity + axs[action_label].plot(pred_actions[:, action_dim], label="predicted action") + axs[action_label].set_title(action_label) + axs[action_label].set_xlabel("Time in one episode") + + axs["image"].imshow(img_strip) + axs["image"].set_xlabel("Time in one episode (subsampled)") + plt.legend() + plt.savefig(save_path) diff --git a/simpler_env/policies/octo/octo_model.py b/simpler_env/policies/octo/octo_model.py index 92110985..e05a51e0 100644 --- a/simpler_env/policies/octo/octo_model.py +++ b/simpler_env/policies/octo/octo_model.py @@ -57,7 +57,9 @@ def __init__( self.action_std = self.model.dataset_statistics[dataset_id]["action"]["std"] else: raise NotImplementedError() - + print( + f"*** policy setup: {policy_setup}, dataset_id: {dataset_id}, model_type: {model_type}, mean {self.action_mean}, std {self.action_std} ***" + ) self.image_size = image_size self.action_scale = action_scale self.horizon = horizon @@ -127,7 +129,9 @@ def reset(self, task_description: str) -> None: # self.gripper_is_closed = False self.previous_gripper_action = None - def step(self, image: np.ndarray, task_description: Optional[str] = None, *args, **kwargs) -> tuple[dict[str, np.ndarray], dict[str, np.ndarray]]: + def step( + self, image: np.ndarray, task_description: Optional[str] = None, *args, **kwargs + ) -> tuple[dict[str, np.ndarray], dict[str, np.ndarray]]: """ Input: image: np.ndarray of shape (H, W, 3), uint8 @@ -241,7 +245,9 @@ def step(self, image: np.ndarray, task_description: Optional[str] = None, *args, return raw_action, action - def visualize_epoch(self, predicted_raw_actions: Sequence[np.ndarray], images: Sequence[np.ndarray], save_path: str) -> None: + def visualize_epoch( + self, predicted_raw_actions: Sequence[np.ndarray], images: Sequence[np.ndarray], save_path: str + ) -> None: images = [self._resize_image(image) for image in images] ACTION_DIM_LABELS = ["x", "y", "z", "roll", "pitch", "yaw", "grasp"] diff --git a/simpler_env/policies/openpi/geometry.py b/simpler_env/policies/openpi/geometry.py new file mode 100644 index 00000000..f1e7164a --- /dev/null +++ b/simpler_env/policies/openpi/geometry.py @@ -0,0 +1,463 @@ +""" +Mostly copied from transforms3d library + +""" + +import math + +import numpy as np + +_FLOAT_EPS = np.finfo(np.float64).eps + +# axis sequences for Euler angles +_NEXT_AXIS = [1, 2, 0, 1] + +# map axes strings to/from tuples of inner axis, parity, repetition, frame +_AXES2TUPLE = { + "sxyz": (0, 0, 0, 0), + "sxyx": (0, 0, 1, 0), + "sxzy": (0, 1, 0, 0), + "sxzx": (0, 1, 1, 0), + "syzx": (1, 
0, 0, 0), + "syzy": (1, 0, 1, 0), + "syxz": (1, 1, 0, 0), + "syxy": (1, 1, 1, 0), + "szxy": (2, 0, 0, 0), + "szxz": (2, 0, 1, 0), + "szyx": (2, 1, 0, 0), + "szyz": (2, 1, 1, 0), + "rzyx": (0, 0, 0, 1), + "rxyx": (0, 0, 1, 1), + "ryzx": (0, 1, 0, 1), + "rxzx": (0, 1, 1, 1), + "rxzy": (1, 0, 0, 1), + "ryzy": (1, 0, 1, 1), + "rzxy": (1, 1, 0, 1), + "ryxy": (1, 1, 1, 1), + "ryxz": (2, 0, 0, 1), + "rzxz": (2, 0, 1, 1), + "rxyz": (2, 1, 0, 1), + "rzyz": (2, 1, 1, 1), +} + +_TUPLE2AXES = dict((v, k) for k, v in _AXES2TUPLE.items()) + +# For testing whether a number is close to zero +_EPS4 = np.finfo(float).eps * 4.0 + + +def mat2euler(mat, axes="sxyz"): + """Return Euler angles from rotation matrix for specified axis sequence. + + Note that many Euler angle triplets can describe one matrix. + + Parameters + ---------- + mat : array-like shape (3, 3) or (4, 4) + Rotation matrix or affine. + axes : str, optional + Axis specification; one of 24 axis sequences as string or encoded + tuple - e.g. ``sxyz`` (the default). + + Returns + ------- + ai : float + First rotation angle (according to `axes`). + aj : float + Second rotation angle (according to `axes`). + ak : float + Third rotation angle (according to `axes`). + + Examples + -------- + >>> R0 = euler2mat(1, 2, 3, 'syxz') + >>> al, be, ga = mat2euler(R0, 'syxz') + >>> R1 = euler2mat(al, be, ga, 'syxz') + >>> np.allclose(R0, R1) + True + """ + try: + firstaxis, parity, repetition, frame = _AXES2TUPLE[axes.lower()] + except (AttributeError, KeyError): + _TUPLE2AXES[axes] # validation + firstaxis, parity, repetition, frame = axes + + i = firstaxis + j = _NEXT_AXIS[i + parity] + k = _NEXT_AXIS[i - parity + 1] + + M = np.array(mat, dtype=np.float64, copy=False)[:3, :3] + if repetition: + sy = math.sqrt(M[i, j] * M[i, j] + M[i, k] * M[i, k]) + if sy > _EPS4: + ax = math.atan2(M[i, j], M[i, k]) + ay = math.atan2(sy, M[i, i]) + az = math.atan2(M[j, i], -M[k, i]) + else: + ax = math.atan2(-M[j, k], M[j, j]) + ay = math.atan2(sy, M[i, i]) + az = 0.0 + else: + cy = math.sqrt(M[i, i] * M[i, i] + M[j, i] * M[j, i]) + if cy > _EPS4: + ax = math.atan2(M[k, j], M[k, k]) + ay = math.atan2(-M[k, i], cy) + az = math.atan2(M[j, i], M[i, i]) + else: + ax = math.atan2(-M[j, k], M[j, j]) + ay = math.atan2(-M[k, i], cy) + az = 0.0 + + if parity: + ax, ay, az = -ax, -ay, -az + if frame: + ax, az = az, ax + return ax, ay, az + + +def quat2mat(q): + """Calculate rotation matrix corresponding to quaternion + + Parameters + ---------- + q : 4 element array-like + + Returns + ------- + M : (3,3) array + Rotation matrix corresponding to input quaternion *q* + + Notes + ----- + Rotation matrix applies to column vectors, and is applied to the + left of coordinate vectors. The algorithm here allows quaternions that + have not been normalized. 
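+
+    A quaternion whose squared norm is below machine epsilon is treated as the
+    identity rotation and yields the identity matrix.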
+ + References + ---------- + Algorithm from http://en.wikipedia.org/wiki/Rotation_matrix#Quaternion + + Examples + -------- + >>> import numpy as np + >>> M = quat2mat([1, 0, 0, 0]) # Identity quaternion + >>> np.allclose(M, np.eye(3)) + True + >>> M = quat2mat([0, 1, 0, 0]) # 180 degree rotn around axis 0 + >>> np.allclose(M, np.diag([1, -1, -1])) + True + """ + w, x, y, z = q + Nq = w * w + x * x + y * y + z * z + if Nq < _FLOAT_EPS: + return np.eye(3) + s = 2.0 / Nq + X = x * s + Y = y * s + Z = z * s + wX = w * X + wY = w * Y + wZ = w * Z + xX = x * X + xY = x * Y + xZ = x * Z + yY = y * Y + yZ = y * Z + zZ = z * Z + return np.array( + [ + [1.0 - (yY + zZ), xY - wZ, xZ + wY], + [xY + wZ, 1.0 - (xX + zZ), yZ - wX], + [xZ - wY, yZ + wX, 1.0 - (xX + yY)], + ] + ) + + +# Checks if a matrix is a valid rotation matrix. +def isrotation( + R: np.ndarray, + thresh=1e-6, +) -> bool: + Rt = np.transpose(R) + shouldBeIdentity = np.dot(Rt, R) + iden = np.identity(3, dtype=R.dtype) + n = np.linalg.norm(iden - shouldBeIdentity) + return n < thresh + + +def euler2mat(ai, aj, ak, axes="sxyz"): + """Return rotation matrix from Euler angles and axis sequence. + + Parameters + ---------- + ai : float + First rotation angle (according to `axes`). + aj : float + Second rotation angle (according to `axes`). + ak : float + Third rotation angle (according to `axes`). + axes : str, optional + Axis specification; one of 24 axis sequences as string or encoded + tuple - e.g. ``sxyz`` (the default). + + Returns + ------- + mat : array (3, 3) + Rotation matrix or affine. + + Examples + -------- + >>> R = euler2mat(1, 2, 3, 'syxz') + >>> np.allclose(np.sum(R[0]), -1.34786452) + True + >>> R = euler2mat(1, 2, 3, (0, 1, 0, 1)) + >>> np.allclose(np.sum(R[0]), -0.383436184) + True + """ + try: + firstaxis, parity, repetition, frame = _AXES2TUPLE[axes] + except (AttributeError, KeyError): + _TUPLE2AXES[axes] # validation + firstaxis, parity, repetition, frame = axes + + i = firstaxis + j = _NEXT_AXIS[i + parity] + k = _NEXT_AXIS[i - parity + 1] + + if frame: + ai, ak = ak, ai + if parity: + ai, aj, ak = -ai, -aj, -ak + + si, sj, sk = math.sin(ai), math.sin(aj), math.sin(ak) + ci, cj, ck = math.cos(ai), math.cos(aj), math.cos(ak) + cc, cs = ci * ck, ci * sk + sc, ss = si * ck, si * sk + + M = np.eye(3) + if repetition: + M[i, i] = cj + M[i, j] = sj * si + M[i, k] = sj * ci + M[j, i] = sj * sk + M[j, j] = -cj * ss + cc + M[j, k] = -cj * cs - sc + M[k, i] = -sj * ck + M[k, j] = cj * sc + cs + M[k, k] = cj * cc - ss + else: + M[i, i] = cj * ck + M[i, j] = sj * sc - cs + M[i, k] = sj * cc + ss + M[j, i] = cj * sk + M[j, j] = sj * ss + cc + M[j, k] = sj * cs - sc + M[k, i] = -sj + M[k, j] = cj * si + M[k, k] = cj * ci + return M + + +def euler2axangle(ai, aj, ak, axes="sxyz"): + """Return angle, axis corresponding to Euler angles, axis specification + + Parameters + ---------- + ai : float + First rotation angle (according to `axes`). + aj : float + Second rotation angle (according to `axes`). + ak : float + Third rotation angle (according to `axes`). + axes : str, optional + Axis specification; one of 24 axis sequences as string or encoded + tuple - e.g. ``sxyz`` (the default). 
+ + Returns + ------- + vector : array shape (3,) + axis around which rotation occurs + theta : scalar + angle of rotation + + Examples + -------- + >>> vec, theta = euler2axangle(0, 1.5, 0, 'szyx') + >>> np.allclose(vec, [0, 1, 0]) + True + >>> theta + 1.5 + """ + return quat2axangle(euler2quat(ai, aj, ak, axes)) + + +def euler2quat(ai, aj, ak, axes="sxyz"): + """Return `quaternion` from Euler angles and axis sequence `axes` + + Parameters + ---------- + ai : float + First rotation angle (according to `axes`). + aj : float + Second rotation angle (according to `axes`). + ak : float + Third rotation angle (according to `axes`). + axes : str, optional + Axis specification; one of 24 axis sequences as string or encoded + tuple - e.g. ``sxyz`` (the default). + + Returns + ------- + quat : array shape (4,) + Quaternion in w, x, y z (real, then vector) format + + Examples + -------- + >>> q = euler2quat(1, 2, 3, 'ryxz') + >>> np.allclose(q, [0.435953, 0.310622, -0.718287, 0.444435]) + True + """ + try: + firstaxis, parity, repetition, frame = _AXES2TUPLE[axes.lower()] + except (AttributeError, KeyError): + _TUPLE2AXES[axes] # validation + firstaxis, parity, repetition, frame = axes + + i = firstaxis + 1 + j = _NEXT_AXIS[i + parity - 1] + 1 + k = _NEXT_AXIS[i - parity] + 1 + + if frame: + ai, ak = ak, ai + if parity: + aj = -aj + + ai = ai / 2.0 + aj = aj / 2.0 + ak = ak / 2.0 + ci = math.cos(ai) + si = math.sin(ai) + cj = math.cos(aj) + sj = math.sin(aj) + ck = math.cos(ak) + sk = math.sin(ak) + cc = ci * ck + cs = ci * sk + sc = si * ck + ss = si * sk + + q = np.empty((4,)) + if repetition: + q[0] = cj * (cc - ss) + q[i] = cj * (cs + sc) + q[j] = sj * (cc + ss) + q[k] = sj * (cs - sc) + else: + q[0] = cj * cc + sj * ss + q[i] = cj * sc - sj * cs + q[j] = cj * ss + sj * cc + q[k] = cj * cs - sj * sc + if parity: + q[j] *= -1.0 + + return q + + +def quat2axangle(quat, identity_thresh=None): + """Convert quaternion to rotation of angle around axis + + Parameters + ---------- + quat : 4 element sequence + w, x, y, z forming quaternion. + identity_thresh : None or scalar, optional + Threshold below which the norm of the vector part of the quaternion (x, + y, z) is deemed to be 0, leading to the identity rotation. None (the + default) leads to a threshold estimated based on the precision of the + input. + + Returns + ------- + theta : scalar + angle of rotation. + vector : array shape (3,) + axis around which rotation occurs. + + Examples + -------- + >>> vec, theta = quat2axangle([0, 1, 0, 0]) + >>> vec + array([1., 0., 0.]) + >>> np.allclose(theta, np.pi) + True + + If this is an identity rotation, we return a zero angle and an arbitrary + vector: + + >>> quat2axangle([1, 0, 0, 0]) + (array([1., 0., 0.]), 0.0) + + If any of the quaternion values are not finite, we return a NaN in the + angle, and an arbitrary vector: + + >>> quat2axangle([1, np.inf, 0, 0]) + (array([1., 0., 0.]), nan) + + Notes + ----- + A quaternion for which x, y, z are all equal to 0, is an identity rotation. + In this case we return a 0 angle and an arbitrary vector, here [1, 0, 0]. + + The algorithm allows for quaternions that have not been normalized. 
+ """ + quat = np.asarray(quat) + Nq = np.sum(quat**2) + if not np.isfinite(Nq): + return np.array([1.0, 0, 0]), float("nan") + if identity_thresh is None: + try: + identity_thresh = np.finfo(Nq.type).eps * 3 + except (AttributeError, ValueError): # Not a numpy type or not float + identity_thresh = _FLOAT_EPS * 3 + if Nq < _FLOAT_EPS**2: # Results unreliable after normalization + return np.array([1.0, 0, 0]), 0.0 + if Nq != 1: # Normalize if not normalized + s = math.sqrt(Nq) + quat = quat / s + xyz = quat[1:] + len2 = np.sum(xyz**2) + if len2 < identity_thresh**2: + # if vec is nearly 0,0,0, this is an identity rotation + return np.array([1.0, 0, 0]), 0.0 + # Make sure w is not slightly above 1 or below -1 + theta = 2 * math.acos(max(min(quat[0], 1), -1)) + return xyz / math.sqrt(len2), theta + + +def quat2euler(quaternion, axes="sxyz"): + """Euler angles from `quaternion` for specified axis sequence `axes` + + Parameters + ---------- + q : 4 element sequence + w, x, y, z of quaternion + axes : str, optional + Axis specification; one of 24 axis sequences as string or encoded + tuple - e.g. ``sxyz`` (the default). + + Returns + ------- + ai : float + First rotation angle (according to `axes`). + aj : float + Second rotation angle (according to `axes`). + ak : float + Third rotation angle (according to `axes`). + + Examples + -------- + >>> angles = quat2euler([0.99810947, 0.06146124, 0, 0]) + >>> np.allclose(angles, [0.123, 0, 0]) + True + """ + return mat2euler(quat2mat(quaternion), axes) \ No newline at end of file diff --git a/simpler_env/policies/openpi/pi0_or_fast.py b/simpler_env/policies/openpi/pi0_or_fast.py new file mode 100644 index 00000000..98397f66 --- /dev/null +++ b/simpler_env/policies/openpi/pi0_or_fast.py @@ -0,0 +1,287 @@ +from typing import Optional, Sequence, List +import os +import matplotlib.pyplot as plt +import numpy as np +from transforms3d.euler import euler2axangle +from collections import deque +from PIL import Image +import torch +import cv2 as cv +from simpler_env.utils.action.action_ensemble import ActionEnsembler +from .geometry import quat2mat, mat2euler +import numpy as np +import torch +from openpi_client import websocket_client_policy as _websocket_client_policy + + +class OpenPiFastInference: + def __init__( + self, + saved_model_path: str = "pretrained/pi0", + unnorm_key: Optional[str] = None, + policy_setup: str = "widowx_bridge", + exec_horizon: int = 4, + image_size: list[int] = [224, 224], + action_scale: float = 1.0, + action_ensemble_temp: float = -0.8, + ) -> None: + gpu_idx = os.environ.get("GPU_IDX", 0) + self.device = f"cuda:{gpu_idx}" + os.environ["TOKENIZERS_PARALLELISM"] = "false" + if policy_setup == "widowx_bridge": + unnorm_key = "bridge_orig/1.0.0" if unnorm_key is None else unnorm_key + action_ensemble = True + self.sticky_gripper_num_repeat = 1 + # EE pose in Bridge data was relative to a top-down pose, instead of robot base + self.default_rot = np.array([[0, 0, 1.0], [0, 1.0, 0], [-1.0, 0, 0]]) # https://github.com/rail-berkeley/bridge_data_robot/blob/b841131ecd512bafb303075bd8f8b677e0bf9f1f/widowx_envs/widowx_controller/src/widowx_controller/widowx_controller.py#L203 + elif policy_setup == "google_robot": + unnorm_key = ( + "fractal20220817_data/0.1.0" if unnorm_key is None else unnorm_key + ) + action_ensemble = True + self.sticky_gripper_num_repeat = 10 + else: + raise NotImplementedError( + f"Policy setup {policy_setup} not supported for octo models. The other datasets can be found in the huggingface config.json file." 
+ ) + self.policy_setup = policy_setup + self.unnorm_key = unnorm_key + + print(f"*** policy_setup: {policy_setup}, unnorm_key: {unnorm_key} ***") + + # TODO: add pi-fast loading ... + host: str = "0.0.0.0" + port: int = 8000 + self.policy_client = _websocket_client_policy.WebsocketClientPolicy(host, port) + + self.image_size = image_size + self.action_scale = action_scale + self.obs_horizon = 1 + self.obs_interval = 1 + self.pred_action_horizon = 5 + self.image_history = deque(maxlen=self.obs_horizon) + self.exec_horizon = exec_horizon + + self.sticky_action_is_on = False + self.gripper_action_repeat = 0 + self.sticky_gripper_action = 0.0 + self.previous_gripper_action = None + + self.action_ensemble = action_ensemble + self.action_ensemble_temp = action_ensemble_temp + + if self.action_ensemble: + self.action_ensembler = ActionEnsembler( + self.pred_action_horizon, self.action_ensemble_temp + ) + else: + self.action_ensembler = None + + self.task = None + self.task_description = None + + def reset(self, task_description: str) -> None: + self.image_history.clear() + if self.action_ensemble: + self.action_ensembler.reset() + self.task_description = task_description + self.sticky_action_is_on = False + self.gripper_action_repeat = 0 + self.sticky_gripper_action = 0.0 + self.previous_gripper_action = None + self.action_plan = deque() + + def preprocess_widowx_proprio(self, eef_pos) -> np.array: + """convert ee rotation to the frame of top-down + https://github.com/allenzren/open-pi-zero/blob/c3df7fb062175c16f69d7ca4ce042958ea238fb7/src/agent/env_adapter/simpler.py#L167 + """ + # StateEncoding.POS_EULER: xyz + rpy + pad + gripper(openness) + proprio = eef_pos + rm_bridge = quat2mat(proprio[3:7]) + rpy_bridge_converted = mat2euler(rm_bridge @ self.default_rot.T) + gripper_openness = proprio[7] # from simpler, 0 for close, 1 for open + raw_proprio = np.concatenate( + [ + proprio[:3], + rpy_bridge_converted, + np.zeros(1), + [gripper_openness], + ] + ) + return raw_proprio + + def preprocess_google_robot_proprio(self, eef_pos) -> np.array: + """convert wxyz quat from simpler to xyzw used in fractal + https://github.com/allenzren/open-pi-zero/blob/c3df7fb062175c16f69d7ca4ce042958ea238fb7/src/agent/env_adapter/simpler.py#L204 + """ + # StateEncoding.POS_QUAT: xyz + q_xyzw + gripper(closeness) + quat_xyzw = np.roll(eef_pos[3:7], -1) + gripper_width = eef_pos[ + 7 + ] # from simpler, 0 for close, 1 for open + # need invert as the training data comes from closeness + gripper_closedness = ( + 1 - gripper_width + ) # TODO(allenzren): change fractal data processing in training so also use gripper openness in proprio (as in bridge) instead of closedness + raw_proprio = np.concatenate( + ( + eef_pos[:3], + quat_xyzw, + [gripper_closedness], + ) + ) + return raw_proprio + + def step( + self, image: np.ndarray, task_description: Optional[str] = None, *args, **kwargs + ) -> tuple[dict[str, np.ndarray], dict[str, np.ndarray]]: + """ + Input: + image: np.ndarray of shape (H, W, 3), uint8 + task_description: Optional[str], task description; if different from previous task description, policy state is reset + Output: + raw_action: dict; raw policy action output + action: dict; processed action to be sent to the maniskill2 environment, with the following keys: + - 'world_vector': np.ndarray of shape (3,), xyz translation of robot end-effector + - 'rot_axangle': np.ndarray of shape (3,), axis-angle representation of end-effector rotation + - 'gripper': np.ndarray of shape (1,), gripper action + - 
'terminate_episode': np.ndarray of shape (1,), 1 if episode should be terminated, 0 otherwise + """ + if task_description is not None: + if task_description != self.task_description: + self.reset(task_description) + + assert image.dtype == np.uint8 + image = self._resize_image(image) + self._add_image_to_history(image) + images: List[Image.Image] = self._obtain_image_history() + + eef_pos = kwargs.get("eef_pos", None) + if self.policy_setup == "widowx_bridge": + state = self.preprocess_widowx_proprio(eef_pos) + image_key = "observation.images.image_0" + elif self.policy_setup == "google_robot": + state = self.preprocess_google_robot_proprio(eef_pos) + image_key = "observation.images.image" + + # if self.action_ensemble: + # raw_actions = self.action_ensembler.ensemble_action(raw_actions)[None] + + if not self.action_plan: + observation = { + "observation/state": state, + "observation/primary_image": np.array(images[0]), + "prompt": task_description, + } + action_chunk = self.policy_client.infer(observation)["actions"][:self.pred_action_horizon] + self.action_plan.extend(action_chunk[: self.exec_horizon]) + + raw_actions = self.action_plan.popleft() + + raw_action = { + "world_vector": np.array(raw_actions[:3]), + "rotation_delta": np.array(raw_actions[3:6]), + "open_gripper": np.array( + raw_actions[6:7] + ), # range [0, 1]; 1 = open; 0 = close + } + + # process raw_action to obtain the action to be sent to the maniskill2 environment + action = {} + action["world_vector"] = raw_action["world_vector"] * self.action_scale + action_rotation_delta = np.asarray( + raw_action["rotation_delta"], dtype=np.float64 + ) + roll, pitch, yaw = action_rotation_delta + action_rotation_ax, action_rotation_angle = euler2axangle(roll, pitch, yaw) + action_rotation_axangle = action_rotation_ax * action_rotation_angle + action["rot_axangle"] = action_rotation_axangle * self.action_scale + + if self.policy_setup == "google_robot": + action["gripper"] = 0 + current_gripper_action = raw_action["open_gripper"] + if self.previous_gripper_action is None: + relative_gripper_action = np.array([0]) + self.previous_gripper_action = current_gripper_action + else: + relative_gripper_action = self.previous_gripper_action - current_gripper_action + + # fix a bug in the SIMPLER code here + # self.previous_gripper_action = current_gripper_action + + if np.abs(relative_gripper_action) > 0.5 and (not self.sticky_action_is_on): + self.sticky_action_is_on = True + self.sticky_gripper_action = relative_gripper_action + self.previous_gripper_action = current_gripper_action + + if self.sticky_action_is_on: + self.gripper_action_repeat += 1 + relative_gripper_action = self.sticky_gripper_action + + if self.gripper_action_repeat == self.sticky_gripper_num_repeat: + self.sticky_action_is_on = False + self.gripper_action_repeat = 0 + self.sticky_gripper_action = 0.0 + + action["gripper"] = relative_gripper_action + + elif self.policy_setup == "widowx_bridge": + action["gripper"] = 2.0 * (raw_action["open_gripper"] > 0.5) - 1.0 + + action["terminate_episode"] = np.array([0.0]) + return raw_action, action + + def _resize_image(self, image: np.ndarray) -> np.ndarray: + image = cv.resize(image, tuple(self.image_size), interpolation=cv.INTER_AREA) + return image + + def _add_image_to_history(self, image: np.ndarray) -> None: + if len(self.image_history) == 0: + self.image_history.extend([image] * self.obs_horizon) + else: + self.image_history.append(image) + + def _obtain_image_history(self) -> List[Image.Image]: + image_history = 
list(self.image_history) + images = image_history[:: self.obs_interval] + # images = [Image.fromarray(image).convert("RGB") for image in images] + return images + + def visualize_epoch( + self, + predicted_raw_actions: Sequence[np.ndarray], + images: Sequence[np.ndarray], + save_path: str, + ) -> None: + images = [self._resize_image(image) for image in images] + ACTION_DIM_LABELS = ["x", "y", "z", "roll", "pitch", "yaw", "grasp"] + + img_strip = np.concatenate(np.array(images[::3]), axis=1) + + # set up plt figure + figure_layout = [["image"] * len(ACTION_DIM_LABELS), ACTION_DIM_LABELS] + plt.rcParams.update({"font.size": 12}) + fig, axs = plt.subplot_mosaic(figure_layout) + fig.set_size_inches([45, 10]) + + # plot actions + pred_actions = np.array( + [ + np.concatenate( + [a["world_vector"], a["rotation_delta"], a["open_gripper"]], axis=-1 + ) + for a in predicted_raw_actions + ] + ) + for action_dim, action_label in enumerate(ACTION_DIM_LABELS): + # actions have batch, horizon, dim, in this example we just take the first action for simplicity + axs[action_label].plot( + pred_actions[:, action_dim], label="predicted action" + ) + axs[action_label].set_title(action_label) + axs[action_label].set_xlabel("Time in one episode") + + axs["image"].imshow(img_strip) + axs["image"].set_xlabel("Time in one episode (subsampled)") + plt.legend() + plt.savefig(save_path) diff --git a/simpler_env/policies/openvla/openvla_model.py b/simpler_env/policies/openvla/openvla_model.py new file mode 100644 index 00000000..715c7796 --- /dev/null +++ b/simpler_env/policies/openvla/openvla_model.py @@ -0,0 +1,181 @@ +from typing import Optional, Sequence +import os +import matplotlib.pyplot as plt +import numpy as np +from transforms3d.euler import euler2axangle +from transformers import AutoModelForVision2Seq, AutoProcessor +from PIL import Image +import torch +import cv2 as cv + + +class OpenVLAInference: + def __init__( + self, + saved_model_path: str = "openvla/openvla-7b", + unnorm_key: Optional[str] = None, + policy_setup: str = "widowx_bridge", + horizon: int = 1, + pred_action_horizon: int = 1, + exec_horizon: int = 1, + image_size: list[int] = [224, 224], + action_scale: float = 1.0, + ) -> None: + os.environ["TOKENIZERS_PARALLELISM"] = "false" + if policy_setup == "widowx_bridge": + unnorm_key = "bridge_orig" if unnorm_key is None else unnorm_key + self.sticky_gripper_num_repeat = 1 + elif policy_setup == "google_robot": + unnorm_key = "fractal20220817_data" if unnorm_key is None else unnorm_key + self.sticky_gripper_num_repeat = 15 + else: + raise NotImplementedError( + f"Policy setup {policy_setup} not supported for octo models. The other datasets can be found in the huggingface config.json file." 
+ ) + self.policy_setup = policy_setup + self.unnorm_key = unnorm_key + + print(f"*** policy_setup: {policy_setup}, unnorm_key: {unnorm_key} ***") + self.processor = AutoProcessor.from_pretrained(saved_model_path, trust_remote_code=True) + self.vla = AutoModelForVision2Seq.from_pretrained( + saved_model_path, + attn_implementation="flash_attention_2", # [Optional] Requires `flash_attn` + torch_dtype=torch.bfloat16, + low_cpu_mem_usage=True, + trust_remote_code=True, + ).cuda() + + self.image_size = image_size + self.action_scale = action_scale + self.horizon = horizon + self.pred_action_horizon = pred_action_horizon + self.exec_horizon = exec_horizon + + self.sticky_action_is_on = False + self.gripper_action_repeat = 0 + self.sticky_gripper_action = 0.0 + self.previous_gripper_action = None + + self.task = None + self.task_description = None + self.num_image_history = 0 + + def reset(self, task_description: str) -> None: + self.task_description = task_description + self.num_image_history = 0 + + self.sticky_action_is_on = False + self.gripper_action_repeat = 0 + self.sticky_gripper_action = 0.0 + self.previous_gripper_action = None + + def step( + self, image: np.ndarray, task_description: Optional[str] = None, *args, **kwargs + ) -> tuple[dict[str, np.ndarray], dict[str, np.ndarray]]: + """ + Input: + image: np.ndarray of shape (H, W, 3), uint8 + task_description: Optional[str], task description; if different from previous task description, policy state is reset + Output: + raw_action: dict; raw policy action output + action: dict; processed action to be sent to the maniskill2 environment, with the following keys: + - 'world_vector': np.ndarray of shape (3,), xyz translation of robot end-effector + - 'rot_axangle': np.ndarray of shape (3,), axis-angle representation of end-effector rotation + - 'gripper': np.ndarray of shape (1,), gripper action + - 'terminate_episode': np.ndarray of shape (1,), 1 if episode should be terminated, 0 otherwise + """ + if task_description is not None: + if task_description != self.task_description: + self.reset(task_description) + + assert image.dtype == np.uint8 + image = self._resize_image(image) + + image: Image.Image = Image.fromarray(image) + prompt = task_description + + # predict action (7-dof; un-normalize for bridgev2) + inputs = self.processor(prompt, image).to("cuda:0", dtype=torch.bfloat16) + raw_actions = self.vla.predict_action(**inputs, unnorm_key=self.unnorm_key, do_sample=False)[None] + # print(f"*** raw actions {raw_actions} ***") + + raw_action = { + "world_vector": np.array(raw_actions[0, :3]), + "rotation_delta": np.array(raw_actions[0, 3:6]), + "open_gripper": np.array(raw_actions[0, 6:7]), # range [0, 1]; 1 = open; 0 = close + } + + # process raw_action to obtain the action to be sent to the maniskill2 environment + action = {} + action["world_vector"] = raw_action["world_vector"] * self.action_scale + action_rotation_delta = np.asarray(raw_action["rotation_delta"], dtype=np.float64) + roll, pitch, yaw = action_rotation_delta + action_rotation_ax, action_rotation_angle = euler2axangle(roll, pitch, yaw) + action_rotation_axangle = action_rotation_ax * action_rotation_angle + action["rot_axangle"] = action_rotation_axangle * self.action_scale + + if self.policy_setup == "google_robot": + current_gripper_action = raw_action["open_gripper"] + if self.previous_gripper_action is None: + relative_gripper_action = np.array([0]) + else: + relative_gripper_action = self.previous_gripper_action - current_gripper_action + 
self.previous_gripper_action = current_gripper_action + + if np.abs(relative_gripper_action) > 0.5 and (not self.sticky_action_is_on): + self.sticky_action_is_on = True + self.sticky_gripper_action = relative_gripper_action + + if self.sticky_action_is_on: + self.gripper_action_repeat += 1 + relative_gripper_action = self.sticky_gripper_action + + if self.gripper_action_repeat == self.sticky_gripper_num_repeat: + self.sticky_action_is_on = False + self.gripper_action_repeat = 0 + self.sticky_gripper_action = 0.0 + + action["gripper"] = relative_gripper_action + + elif self.policy_setup == "widowx_bridge": + action["gripper"] = 2.0 * (raw_action["open_gripper"] > 0.5) - 1.0 + + action["terminate_episode"] = np.array([0.0]) + + return raw_action, action + + def _resize_image(self, image: np.ndarray) -> np.ndarray: + image = cv.resize(image, tuple(self.image_size), interpolation=cv.INTER_AREA) + return image + + def visualize_epoch( + self, predicted_raw_actions: Sequence[np.ndarray], images: Sequence[np.ndarray], save_path: str + ) -> None: + images = [self._resize_image(image) for image in images] + ACTION_DIM_LABELS = ["x", "y", "z", "roll", "pitch", "yaw", "grasp"] + + img_strip = np.concatenate(np.array(images[::3]), axis=1) + + # set up plt figure + figure_layout = [["image"] * len(ACTION_DIM_LABELS), ACTION_DIM_LABELS] + plt.rcParams.update({"font.size": 12}) + fig, axs = plt.subplot_mosaic(figure_layout) + fig.set_size_inches([45, 10]) + + # plot actions + pred_actions = np.array( + [ + np.concatenate([a["world_vector"], a["rotation_delta"], a["open_gripper"]], axis=-1) + for a in predicted_raw_actions + ] + ) + for action_dim, action_label in enumerate(ACTION_DIM_LABELS): + # actions have batch, horizon, dim, in this example we just take the first action for simplicity + axs[action_label].plot(pred_actions[:, action_dim], label="predicted action") + axs[action_label].set_title(action_label) + axs[action_label].set_xlabel("Time in one episode") + + axs["image"].imshow(img_strip) + axs["image"].set_xlabel("Time in one episode (subsampled)") + plt.legend() + plt.savefig(save_path) diff --git a/simpler_env/policies/rt1/rt1_model.py b/simpler_env/policies/rt1/rt1_model.py index c838217b..9542e9b8 100644 --- a/simpler_env/policies/rt1/rt1_model.py +++ b/simpler_env/policies/rt1/rt1_model.py @@ -116,7 +116,9 @@ def reset(self, task_description: str) -> None: self._initialize_task_description(task_description) @staticmethod - def _small_action_filter_google_robot(raw_action: dict[str, np.ndarray | tf.Tensor], arm_movement: bool = False, gripper: bool = True) -> dict[str, np.ndarray | tf.Tensor]: + def _small_action_filter_google_robot( + raw_action: dict[str, np.ndarray | tf.Tensor], arm_movement: bool = False, gripper: bool = True + ) -> dict[str, np.ndarray | tf.Tensor]: # small action filtering for google robot if arm_movement: raw_action["world_vector"] = tf.where( @@ -147,7 +149,7 @@ def _small_action_filter_google_robot(raw_action: dict[str, np.ndarray | tf.Tens ) return raw_action - def step(self, image: np.ndarray, task_description: Optional[str] = None) -> tuple[dict[str, np.ndarray], dict[str, np.ndarray]]: + def step(self, image: np.ndarray, task_description: Optional[str] = None, *args, **kwargs) -> tuple[dict[str, np.ndarray], dict[str, np.ndarray]]: """ Input: image: np.ndarray of shape (H, W, 3), uint8 @@ -165,7 +167,7 @@ def step(self, image: np.ndarray, task_description: Optional[str] = None) -> tup # task description has changed; update language embedding # 
self._initialize_task_description(task_description) self.reset(task_description) - + assert image.dtype == np.uint8 image = self._resize_image(image) self.observation["image"] = image @@ -188,11 +190,7 @@ def step(self, image: np.ndarray, task_description: Optional[str] = None) -> tup if self.action_rotation_mode == "axis_angle": action_rotation_delta = np.asarray(raw_action["rotation_delta"], dtype=np.float64) action_rotation_angle = np.linalg.norm(action_rotation_delta) - action_rotation_ax = ( - action_rotation_delta / action_rotation_angle - if action_rotation_angle > 1e-6 - else np.array([0.0, 1.0, 0.0]) - ) + action_rotation_ax = action_rotation_delta / action_rotation_angle if action_rotation_angle > 1e-6 else np.array([0.0, 1.0, 0.0]) action["rot_axangle"] = action_rotation_ax * action_rotation_angle * self.action_scale elif self.action_rotation_mode in ["rpy", "ypr", "pry"]: if self.action_rotation_mode == "rpy": @@ -256,9 +254,7 @@ def visualize_epoch(self, predicted_raw_actions: Sequence[np.ndarray], images: S for action_sub_dimension in range(action[action_name].shape[0]): # print(action_name, action_sub_dimension) title = f"{action_name}_{action_sub_dimension}" - predicted_action_name_to_values_over_time[title].append( - predicted_raw_actions[i][action_name][action_sub_dimension] - ) + predicted_action_name_to_values_over_time[title].append(predicted_raw_actions[i][action_name][action_sub_dimension]) figure_layout = [["image"] * len(figure_layout), figure_layout] diff --git a/simpler_env/policies/sim_cogact/__init__.py b/simpler_env/policies/sim_cogact/__init__.py new file mode 100644 index 00000000..43a37cfc --- /dev/null +++ b/simpler_env/policies/sim_cogact/__init__.py @@ -0,0 +1,2 @@ +from .cogact_policy import CogACTInference +from .adaptive_ensemble import AdaptiveEnsembler \ No newline at end of file diff --git a/simpler_env/policies/sim_cogact/adaptive_ensemble.py b/simpler_env/policies/sim_cogact/adaptive_ensemble.py new file mode 100644 index 00000000..520f85cc --- /dev/null +++ b/simpler_env/policies/sim_cogact/adaptive_ensemble.py @@ -0,0 +1,44 @@ +""" +adaptive_ensemble.py + +""" + +from collections import deque +import numpy as np + + +class AdaptiveEnsembler: + def __init__(self, pred_action_horizon, adaptive_ensemble_alpha=0.0): + self.pred_action_horizon = pred_action_horizon + self.action_history = deque(maxlen=self.pred_action_horizon) + self.adaptive_ensemble_alpha = adaptive_ensemble_alpha + + def reset(self): + self.action_history.clear() + + def ensemble_action(self, cur_action): + self.action_history.append(cur_action) + num_actions = len(self.action_history) + if cur_action.ndim == 1: + curr_act_preds = np.stack(self.action_history) + else: + curr_act_preds = np.stack( + [pred_actions[i] for (i, pred_actions) in zip(range(num_actions - 1, -1, -1), self.action_history)] + ) + + # calculate cosine similarity between the current prediction and all previous predictions + ref = curr_act_preds[num_actions-1, :] + previous_pred = curr_act_preds + dot_product = np.sum(previous_pred * ref, axis=1) + norm_previous_pred = np.linalg.norm(previous_pred, axis=1) + norm_ref = np.linalg.norm(ref) + cos_similarity = dot_product / (norm_previous_pred * norm_ref + 1e-7) + + # compute the weights for each prediction + weights = np.exp(self.adaptive_ensemble_alpha * cos_similarity) + weights = weights / weights.sum() + + # compute the weighted average across all predictions for this timestep + cur_action = np.sum(weights[:, None] * curr_act_preds, axis=0) + + return 
cur_action \ No newline at end of file diff --git a/simpler_env/policies/sim_cogact/cogact_policy.py b/simpler_env/policies/sim_cogact/cogact_policy.py new file mode 100644 index 00000000..f3609018 --- /dev/null +++ b/simpler_env/policies/sim_cogact/cogact_policy.py @@ -0,0 +1,232 @@ +""" +cogact_policy.py + +""" +from collections import deque +from typing import Optional, Sequence +import os +from PIL import Image +import torch +import cv2 as cv +import matplotlib.pyplot as plt +import numpy as np + +from transforms3d.euler import euler2axangle +from transformers import AutoModelForVision2Seq, AutoProcessor + + +from vla import load_vla +from sim_cogact.adaptive_ensemble import AdaptiveEnsembler + +class CogACTInference: + def __init__( + self, + saved_model_path: str = 'CogACT/CogACT-Base', + unnorm_key: Optional[str] = None, + policy_setup: str = "widowx_bridge", + horizon: int = 0, + action_ensemble_horizon: Optional[int] = None, + image_size: list[int] = [224, 224], + future_action_window_size: int = 15, + action_dim: int = 7, + action_model_type: str = "DiT-L", + action_scale: float = 1.0, + cfg_scale: float = 1.5, + use_ddim: bool = True, + num_ddim_steps: int = 10, + use_bf16: bool = True, + action_ensemble = True, + adaptive_ensemble_alpha = 0.1, + ) -> None: + os.environ["TOKENIZERS_PARALLELISM"] = "false" + if policy_setup == "widowx_bridge": + unnorm_key = "bridge_orig" if unnorm_key is None else unnorm_key + action_ensemble = action_ensemble + adaptive_ensemble_alpha = adaptive_ensemble_alpha + if action_ensemble_horizon is None: + # Set 7 for widowx_bridge to fix the window size of motion scale between each frame. see appendix in our paper for details + action_ensemble_horizon = 7 + self.sticky_gripper_num_repeat = 1 + elif policy_setup == "google_robot": + unnorm_key = "fractal20220817_data" if unnorm_key is None else unnorm_key + action_ensemble = action_ensemble + adaptive_ensemble_alpha = adaptive_ensemble_alpha + if action_ensemble_horizon is None: + # Set 2 for google_robot to fix the window size of motion scale between each frame. see appendix in our paper for details + action_ensemble_horizon = 2 + self.sticky_gripper_num_repeat = 10 + else: + raise NotImplementedError( + f"Policy setup {policy_setup} not supported for octo models. The other datasets can be found in the huggingface config.json file." 
+ ) + self.policy_setup = policy_setup + self.unnorm_key = unnorm_key + + print(f"*** policy_setup: {policy_setup}, unnorm_key: {unnorm_key} ***") + self.use_ddim = use_ddim + self.num_ddim_steps = num_ddim_steps + self.vla = load_vla( + saved_model_path, # choose from ['CogACT/CogACT-Small', 'CogACT/CogACT-Base', 'CogACT/CogACT-Large'] or the local path + load_for_training=False, + action_model_type=action_model_type, # choose from ['DiT-Small', 'DiT-Base', 'DiT-Large'] to match the model weight + future_action_window_size=future_action_window_size, + action_dim=action_dim, + ) + + if use_bf16: + self.vla.vlm = self.vla.vlm.to(torch.bfloat16) + self.vla = self.vla.to("cuda").eval() + self.cfg_scale = cfg_scale + + self.image_size = image_size + self.action_scale = action_scale + self.horizon = horizon + self.action_ensemble = action_ensemble + self.adaptive_ensemble_alpha = adaptive_ensemble_alpha + self.action_ensemble_horizon = action_ensemble_horizon + self.sticky_action_is_on = False + self.gripper_action_repeat = 0 + self.sticky_gripper_action = 0.0 + self.previous_gripper_action = None + + self.task_description = None + self.image_history = deque(maxlen=self.horizon) + if self.action_ensemble: + self.action_ensembler = AdaptiveEnsembler(self.action_ensemble_horizon, self.adaptive_ensemble_alpha) + else: + self.action_ensembler = None + self.num_image_history = 0 + + def _add_image_to_history(self, image: np.ndarray) -> None: + self.image_history.append(image) + self.num_image_history = min(self.num_image_history + 1, self.horizon) + + def reset(self, task_description: str) -> None: + self.task_description = task_description + self.image_history.clear() + if self.action_ensemble: + self.action_ensembler.reset() + self.num_image_history = 0 + + self.sticky_action_is_on = False + self.gripper_action_repeat = 0 + self.sticky_gripper_action = 0.0 + self.previous_gripper_action = None + + def step( + self, image: np.ndarray, task_description: Optional[str] = None, *args, **kwargs + ) -> tuple[dict[str, np.ndarray], dict[str, np.ndarray]]: + """ + Input: + image: np.ndarray of shape (H, W, 3), uint8 + task_description: Optional[str], task description; if different from previous task description, policy state is reset + Output: + raw_action: dict; raw policy action output + action: dict; processed action to be sent to the maniskill2 environment, with the following keys: + - 'world_vector': np.ndarray of shape (3,), xyz translation of robot end-effector + - 'rot_axangle': np.ndarray of shape (3,), axis-angle representation of end-effector rotation + - 'gripper': np.ndarray of shape (1,), gripper action + - 'terminate_episode': np.ndarray of shape (1,), 1 if episode should be terminated, 0 otherwise + """ + if task_description is not None: + if task_description != self.task_description: + self.reset(task_description) + + assert image.dtype == np.uint8 + self._add_image_to_history(self._resize_image(image)) + image: Image.Image = Image.fromarray(image) + raw_actions, normalized_actions = self.vla.predict_action(image=image, + instruction=self.task_description, + unnorm_key=self.unnorm_key, + do_sample=False, + cfg_scale=self.cfg_scale, + use_ddim=self.use_ddim, + num_ddim_steps=self.num_ddim_steps, + ) + + if self.action_ensemble: + raw_actions = self.action_ensembler.ensemble_action(raw_actions)[None] + raw_action = { + "world_vector": np.array(raw_actions[0, :3]), + "rotation_delta": np.array(raw_actions[0, 3:6]), + "open_gripper": np.array(raw_actions[0, 6:7]), # range [0, 1]; 1 = open; 0 = 
close + } + + # process raw_action to obtain the action to be sent to the maniskill2 environment + action = {} + action["world_vector"] = raw_action["world_vector"] * self.action_scale + action_rotation_delta = np.asarray(raw_action["rotation_delta"], dtype=np.float64) + + roll, pitch, yaw = action_rotation_delta + axes, angles = euler2axangle(roll, pitch, yaw) + action_rotation_axangle = axes * angles + action["rot_axangle"] = action_rotation_axangle * self.action_scale + + if self.policy_setup == "google_robot": + action["gripper"] = 0 + current_gripper_action = raw_action["open_gripper"] + if self.previous_gripper_action is None: + relative_gripper_action = np.array([0]) + self.previous_gripper_action = current_gripper_action + else: + relative_gripper_action = self.previous_gripper_action - current_gripper_action + # fix a bug in the SIMPLER code here + # self.previous_gripper_action = current_gripper_action + + if np.abs(relative_gripper_action) > 0.5 and (not self.sticky_action_is_on): + self.sticky_action_is_on = True + self.sticky_gripper_action = relative_gripper_action + self.previous_gripper_action = current_gripper_action + + if self.sticky_action_is_on: + self.gripper_action_repeat += 1 + relative_gripper_action = self.sticky_gripper_action + + if self.gripper_action_repeat == self.sticky_gripper_num_repeat: + self.sticky_action_is_on = False + self.gripper_action_repeat = 0 + self.sticky_gripper_action = 0.0 + + action["gripper"] = relative_gripper_action + + elif self.policy_setup == "widowx_bridge": + action["gripper"] = 2.0 * (raw_action["open_gripper"] > 0.5) - 1.0 + + action["terminate_episode"] = np.array([0.0]) + return raw_action, action + + def _resize_image(self, image: np.ndarray) -> np.ndarray: + image = cv.resize(image, tuple(self.image_size), interpolation=cv.INTER_AREA) + return image + + def visualize_epoch( + self, predicted_raw_actions: Sequence[np.ndarray], images: Sequence[np.ndarray], save_path: str + ) -> None: + images = [self._resize_image(image) for image in images] + ACTION_DIM_LABELS = ["x", "y", "z", "roll", "pitch", "yaw", "grasp"] + + img_strip = np.concatenate(np.array(images[::3]), axis=1) + + # set up plt figure + figure_layout = [["image"] * len(ACTION_DIM_LABELS), ACTION_DIM_LABELS] + plt.rcParams.update({"font.size": 12}) + fig, axs = plt.subplot_mosaic(figure_layout) + fig.set_size_inches([45, 10]) + + # plot actions + pred_actions = np.array( + [ + np.concatenate([a["world_vector"], a["rotation_delta"], a["open_gripper"]], axis=-1) + for a in predicted_raw_actions + ] + ) + for action_dim, action_label in enumerate(ACTION_DIM_LABELS): + # actions have batch, horizon, dim, in this example we just take the first action for simplicity + axs[action_label].plot(pred_actions[:, action_dim], label="predicted action") + axs[action_label].set_title(action_label) + axs[action_label].set_xlabel("Time in one episode") + + axs["image"].imshow(img_strip) + axs["image"].set_xlabel("Time in one episode (subsampled)") + plt.legend() + plt.savefig(save_path) diff --git a/scripts/rt1x_bridge.sh b/simpler_env/policies/sim_cogact/scripts/cogact_bridge.sh similarity index 89% rename from scripts/rt1x_bridge.sh rename to simpler_env/policies/sim_cogact/scripts/cogact_bridge.sh index c4608ce0..2a3f8827 100644 --- a/scripts/rt1x_bridge.sh +++ b/simpler_env/policies/sim_cogact/scripts/cogact_bridge.sh @@ -1,8 +1,6 @@ - - gpu_id=0 -policy_model=rt1 -ckpt_path=checkpoints/rt_1_x_tf_trained_for_002272480_step/ +policy_model=cogact +ckpt_path=CogACT/CogACT-Base # 
CogACT/CogACT-Large CogACT/CogACT-Small scene_name=bridge_table_1_v1 robot=widowx @@ -13,23 +11,23 @@ robot_init_y=0.028 CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} \ --robot ${robot} --policy-setup widowx_bridge \ - --control-freq 5 --sim-freq 500 --max-episode-steps 60 \ - --env-name PutCarrotOnPlateInScene-v0 --scene-name ${scene_name} \ + --control-freq 5 --sim-freq 500 --max-episode-steps 120 \ + --env-name StackGreenCubeOnYellowCubeBakedTexInScene-v0 --scene-name ${scene_name} \ --rgb-overlay-path ${rgb_overlay_path} \ --robot-init-x ${robot_init_x} ${robot_init_x} 1 --robot-init-y ${robot_init_y} ${robot_init_y} 1 --obj-variation-mode episode --obj-episode-range 0 24 \ --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1; CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} \ --robot ${robot} --policy-setup widowx_bridge \ - --control-freq 5 --sim-freq 500 --max-episode-steps 60 \ - --env-name StackGreenCubeOnYellowCubeBakedTexInScene-v0 --scene-name ${scene_name} \ + --control-freq 5 --sim-freq 500 --max-episode-steps 120 \ + --env-name PutCarrotOnPlateInScene-v0 --scene-name ${scene_name} \ --rgb-overlay-path ${rgb_overlay_path} \ --robot-init-x ${robot_init_x} ${robot_init_x} 1 --robot-init-y ${robot_init_y} ${robot_init_y} 1 --obj-variation-mode episode --obj-episode-range 0 24 \ --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1; CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} \ --robot ${robot} --policy-setup widowx_bridge \ - --control-freq 5 --sim-freq 500 --max-episode-steps 60 \ + --control-freq 5 --sim-freq 500 --max-episode-steps 120 \ --env-name PutSpoonOnTableClothInScene-v0 --scene-name ${scene_name} \ --rgb-overlay-path ${rgb_overlay_path} \ --robot-init-x ${robot_init_x} ${robot_init_x} 1 --robot-init-y ${robot_init_y} ${robot_init_y} 1 --obj-variation-mode episode --obj-episode-range 0 24 \ @@ -48,6 +46,4 @@ CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-mod --env-name PutEggplantInBasketScene-v0 --scene-name ${scene_name} \ --rgb-overlay-path ${rgb_overlay_path} \ --robot-init-x ${robot_init_x} ${robot_init_x} 1 --robot-init-y ${robot_init_y} ${robot_init_y} 1 --obj-variation-mode episode --obj-episode-range 0 24 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1; - - + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1; \ No newline at end of file diff --git a/scripts/rt1_drawer_variant_agg.sh b/simpler_env/policies/sim_cogact/scripts/cogact_drawer_variant_agg.sh similarity index 87% rename from scripts/rt1_drawer_variant_agg.sh rename to simpler_env/policies/sim_cogact/scripts/cogact_drawer_variant_agg.sh index 983229b6..0bac2a90 100644 --- a/scripts/rt1_drawer_variant_agg.sh +++ b/simpler_env/policies/sim_cogact/scripts/cogact_drawer_variant_agg.sh @@ -1,14 +1,10 @@ # shader_dir=rt means that we turn on ray-tracing rendering; this is quite crucial for the open / close drawer task as policies often rely on shadows to infer depth - - +gpu_id=0 declare -a ckpt_paths=( -"./checkpoints/rt_1_tf_trained_for_000400120/" -"./checkpoints/rt_1_tf_trained_for_000058240/" -"./checkpoints/rt_1_x_tf_trained_for_002272480_step/" -"./checkpoints/rt_1_tf_trained_for_000001120/" +"CogACT/CogACT-Base" ) - 
+# CogACT/CogACT-Large CogACT/CogACT-Small declare -a env_names=( OpenTopDrawerCustomInScene-v0 OpenMiddleDrawerCustomInScene-v0 @@ -27,7 +23,7 @@ scene_name=frl_apartment_stage_simple EvalSim() { echo ${ckpt_path} ${env_name} - python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} \ + CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model cogact --ckpt-path None \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ --env-name ${env_name} --scene-name ${scene_name} \ diff --git a/scripts/rt1_drawer_visual_matching.sh b/simpler_env/policies/sim_cogact/scripts/cogact_drawer_visual_matching.sh similarity index 82% rename from scripts/rt1_drawer_visual_matching.sh rename to simpler_env/policies/sim_cogact/scripts/cogact_drawer_visual_matching.sh index 77b78d2d..c302e827 100644 --- a/scripts/rt1_drawer_visual_matching.sh +++ b/simpler_env/policies/sim_cogact/scripts/cogact_drawer_visual_matching.sh @@ -1,12 +1,8 @@ # shader_dir=rt means that we turn on ray-tracing rendering; this is quite crucial for the open / close drawer task as policies often rely on shadows to infer depth - - +gpu_id=0 declare -a ckpt_paths=( -"./checkpoints/rt_1_tf_trained_for_000400120/" -"./checkpoints/rt_1_tf_trained_for_000058240/" -"./checkpoints/rt_1_x_tf_trained_for_002272480_step/" -"./checkpoints/rt_1_tf_trained_for_000001120/" +"CogACT/CogACT-Base" ) declare -a env_names=( @@ -27,7 +23,7 @@ EXTRA_ARGS="--enable-raytracing --additional-env-build-kwargs station_name=mk_st EvalOverlay() { # A0 -python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} \ +CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model cogact --ckpt-path ${ckpt_path} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ --env-name ${env_name} --scene-name dummy_drawer \ @@ -38,7 +34,7 @@ python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} ${EXTRA_ARGS} # A1 -python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} \ +CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model cogact --ckpt-path ${ckpt_path} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ --env-name ${env_name} --scene-name dummy_drawer \ @@ -49,7 +45,7 @@ python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} ${EXTRA_ARGS} # A2 -python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} \ +CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model cogact --ckpt-path ${ckpt_path} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ --env-name ${env_name} --scene-name dummy_drawer \ @@ -60,7 +56,7 @@ python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} ${EXTRA_ARGS} # B0 -python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} \ +CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model cogact --ckpt-path ${ckpt_path} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ --env-name ${env_name} --scene-name dummy_drawer \ @@ -71,7 +67,7 @@ python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} ${EXTRA_ARGS} # B1 -python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} \ +CUDA_VISIBLE_DEVICES=${gpu_id} python 
simpler_env/main_inference.py --policy-model cogact --ckpt-path ${ckpt_path} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ --env-name ${env_name} --scene-name dummy_drawer \ @@ -82,7 +78,7 @@ python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} ${EXTRA_ARGS} # B2 -python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} \ +CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model cogact --ckpt-path ${ckpt_path} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ --env-name ${env_name} --scene-name dummy_drawer \ @@ -93,7 +89,7 @@ python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} ${EXTRA_ARGS} # C0 -python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} \ +CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model cogact --ckpt-path ${ckpt_path} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ --env-name ${env_name} --scene-name dummy_drawer \ @@ -104,7 +100,7 @@ python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} ${EXTRA_ARGS} # C1 -python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} \ +CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model cogact --ckpt-path ${ckpt_path} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ --env-name ${env_name} --scene-name dummy_drawer \ @@ -115,7 +111,7 @@ python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} ${EXTRA_ARGS} # C2 -python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} \ +CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model cogact --ckpt-path ${ckpt_path} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ --env-name ${env_name} --scene-name dummy_drawer \ diff --git a/scripts/rt1_move_near_variant_agg.sh b/simpler_env/policies/sim_cogact/scripts/cogact_move_near_variant_agg.sh similarity index 87% rename from scripts/rt1_move_near_variant_agg.sh rename to simpler_env/policies/sim_cogact/scripts/cogact_move_near_variant_agg.sh index 7c95760c..b5b11b1c 100644 --- a/scripts/rt1_move_near_variant_agg.sh +++ b/simpler_env/policies/sim_cogact/scripts/cogact_move_near_variant_agg.sh @@ -1,11 +1,7 @@ - gpu_id=0 -declare -a arr=("./checkpoints/rt_1_x_tf_trained_for_002272480_step/" \ - "./checkpoints/rt_1_tf_trained_for_000400120/" \ - "./checkpoints/rt_1_tf_trained_for_000058240/" \ - "./checkpoints/rt_1_tf_trained_for_000001120/") - +declare -a arr=("CogACT/CogACT-Base") +# CogACT/CogACT-Large CogACT/CogACT-Small for ckpt_path in "${arr[@]}"; do echo "$ckpt_path"; done @@ -16,7 +12,7 @@ scene_name=google_pick_coke_can_1_v4 for ckpt_path in "${arr[@]}"; -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} \ +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model cogact --ckpt-path ${ckpt_path} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ --env-name ${env_name} --scene-name ${scene_name} \ @@ -31,7 +27,7 @@ done for ckpt_path in "${arr[@]}"; -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} \ +do CUDA_VISIBLE_DEVICES=${gpu_id} 
python simpler_env/main_inference.py --policy-model cogact --ckpt-path ${ckpt_path} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ --env-name ${env_name} --scene-name ${scene_name} \ @@ -52,7 +48,7 @@ for scene_name in "${scene_arr[@]}"; do for ckpt_path in "${arr[@]}"; -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} \ +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model cogact --ckpt-path ${ckpt_path} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ --env-name ${env_name} --scene-name ${scene_name} \ @@ -74,7 +70,7 @@ scene_name=google_pick_coke_can_1_v4 for ckpt_path in "${arr[@]}"; -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} \ +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model cogact --ckpt-path ${ckpt_path} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ --env-name ${env_name} --scene-name ${scene_name} \ @@ -82,7 +78,7 @@ do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy- --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.09 -0.09 1 \ --additional-env-build-kwargs slightly_darker_lighting=True; -CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} \ +CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model cogact --ckpt-path ${ckpt_path} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ --env-name ${env_name} --scene-name ${scene_name} \ @@ -106,7 +102,7 @@ for scene_name in "${scene_arr[@]}"; do for ckpt_path in "${arr[@]}"; -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} \ +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model cogact --ckpt-path ${ckpt_path} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ --env-name ${env_name} --scene-name ${scene_name} \ @@ -130,7 +126,7 @@ for env_name in "${env_arr[@]}"; do for ckpt_path in "${arr[@]}"; -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} \ +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model cogact --ckpt-path ${ckpt_path} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ --env-name ${env_name} --scene-name ${scene_name} \ diff --git a/scripts/rt1_move_near_visual_matching.sh b/simpler_env/policies/sim_cogact/scripts/cogact_move_near_visual_matching.sh similarity index 77% rename from scripts/rt1_move_near_visual_matching.sh rename to simpler_env/policies/sim_cogact/scripts/cogact_move_near_visual_matching.sh index 1949333d..833865c0 100644 --- a/scripts/rt1_move_near_visual_matching.sh +++ b/simpler_env/policies/sim_cogact/scripts/cogact_move_near_visual_matching.sh @@ -1,10 +1,7 @@ gpu_id=0 -declare -a arr=("./checkpoints/rt_1_tf_trained_for_000400120/" \ - "./checkpoints/rt_1_x_tf_trained_for_002272480_step/" \ - "./checkpoints/rt_1_tf_trained_for_000058240/" \ - "./checkpoints/rt_1_tf_trained_for_000001120/") +declare -a arr=("CogACT/CogACT-Base") env_name=MoveNearGoogleBakedTexInScene-v0 # env_name=MoveNearGoogleBakedTexInScene-v1 @@ -21,7 +18,7 
@@ for urdf_version in "${urdf_version_arr[@]}"; do for ckpt_path in "${arr[@]}"; -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} \ +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model cogact --ckpt-path ${ckpt_path} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ --env-name ${env_name} --scene-name ${scene_name} \ diff --git a/scripts/rt1_pick_coke_can_variant_agg.sh b/simpler_env/policies/sim_cogact/scripts/cogact_pick_coke_can_variant_agg.sh similarity index 89% rename from scripts/rt1_pick_coke_can_variant_agg.sh rename to simpler_env/policies/sim_cogact/scripts/cogact_pick_coke_can_variant_agg.sh index 63b36e1e..79d1a387 100644 --- a/scripts/rt1_pick_coke_can_variant_agg.sh +++ b/simpler_env/policies/sim_cogact/scripts/cogact_pick_coke_can_variant_agg.sh @@ -1,10 +1,6 @@ - gpu_id=0 -declare -a arr=("./checkpoints/rt_1_x_tf_trained_for_002272480_step/" \ - "./checkpoints/rt_1_tf_trained_for_000400120/" \ - "./checkpoints/rt_1_tf_trained_for_000058240/" \ - "./checkpoints/rt_1_tf_trained_for_000001120/") +declare -a arr=("CogACT/CogACT-Base") # lr_switch=laying horizontally but flipped left-right to match real eval; upright=standing; laid_vertically=laying vertically declare -a coke_can_options_arr=("lr_switch=True" "upright=True" "laid_vertically=True") @@ -21,7 +17,7 @@ for coke_can_option in "${coke_can_options_arr[@]}"; do for ckpt_path in "${arr[@]}"; -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} \ +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model cogact --ckpt-path ${ckpt_path} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ --env-name ${env_name} --scene-name ${scene_name} \ @@ -49,7 +45,7 @@ do for scene_name in "${scene_arr[@]}"; do for ckpt_path in "${arr[@]}"; -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} \ +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model cogact --ckpt-path ${ckpt_path} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ --env-name ${env_name} --scene-name ${scene_name} \ @@ -75,7 +71,7 @@ for coke_can_option in "${coke_can_options_arr[@]}"; do for ckpt_path in "${arr[@]}"; -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} \ +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model cogact --ckpt-path ${ckpt_path} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ --env-name ${env_name} --scene-name ${scene_name} \ @@ -83,7 +79,7 @@ do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy- --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ --additional-env-build-kwargs ${coke_can_option}; -CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} \ +CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model cogact --ckpt-path ${ckpt_path} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ --env-name ${env_name} --scene-name ${scene_name} \ @@ -110,7 +106,7 @@ do for scene_name in "${scene_arr[@]}"; do for ckpt_path in "${arr[@]}"; 
-do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} \ +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model cogact --ckpt-path ${ckpt_path} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ --env-name ${env_name} --scene-name ${scene_name} \ @@ -135,7 +131,7 @@ for coke_can_option in "${coke_can_options_arr[@]}"; do for ckpt_path in "${arr[@]}"; -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} \ +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model cogact --ckpt-path ${ckpt_path} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ --env-name ${env_name} --scene-name ${scene_name} \ @@ -143,7 +139,7 @@ do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy- --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ --additional-env-build-kwargs ${coke_can_option} slightly_darker_lighting=True; -CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} \ +CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model cogact --ckpt-path ${ckpt_path} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ --env-name ${env_name} --scene-name ${scene_name} \ @@ -170,7 +166,7 @@ do for env_name in "${env_arr[@]}"; do for ckpt_path in "${arr[@]}"; -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} \ +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model cogact --ckpt-path ${ckpt_path} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ --env-name ${env_name} --scene-name ${scene_name} \ diff --git a/scripts/rt1_pick_coke_can_visual_matching.sh b/simpler_env/policies/sim_cogact/scripts/cogact_pick_coke_can_visual_matching.sh similarity index 80% rename from scripts/rt1_pick_coke_can_visual_matching.sh rename to simpler_env/policies/sim_cogact/scripts/cogact_pick_coke_can_visual_matching.sh index b0086684..3e5fbc25 100644 --- a/scripts/rt1_pick_coke_can_visual_matching.sh +++ b/simpler_env/policies/sim_cogact/scripts/cogact_pick_coke_can_visual_matching.sh @@ -1,11 +1,6 @@ - - gpu_id=0 -declare -a arr=("./checkpoints/rt_1_tf_trained_for_000400120/" \ - "./checkpoints/rt_1_tf_trained_for_000058240/" \ - "./checkpoints/rt_1_x_tf_trained_for_002272480_step/" \ - "./checkpoints/rt_1_tf_trained_for_000001120/" ) +declare -a arr=("CogACT/CogACT-Base") # lr_switch=laying horizontally but flipped left-right to match real eval; upright=standing; laid_vertically=laying vertically declare -a coke_can_options_arr=("lr_switch=True" "upright=True" "laid_vertically=True") @@ -27,7 +22,7 @@ do for coke_can_option in "${coke_can_options_arr[@]}"; do for ckpt_path in "${arr[@]}"; -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} \ +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model cogact --ckpt-path ${ckpt_path} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ --env-name ${env_name} --scene-name ${scene_name} \ diff --git a/scripts/rt1_put_in_drawer_variant_agg.sh 
b/simpler_env/policies/sim_cogact/scripts/cogact_put_in_drawer_variant_agg.sh similarity index 87% rename from scripts/rt1_put_in_drawer_variant_agg.sh rename to simpler_env/policies/sim_cogact/scripts/cogact_put_in_drawer_variant_agg.sh index 8aacf728..04d11634 100644 --- a/scripts/rt1_put_in_drawer_variant_agg.sh +++ b/simpler_env/policies/sim_cogact/scripts/cogact_put_in_drawer_variant_agg.sh @@ -1,12 +1,9 @@ # shader_dir=rt means that we turn on ray-tracing rendering; this is quite crucial for the open / close drawer task as policies often rely on shadows to infer depth - +gpu_id=0 declare -a ckpt_paths=( -"./checkpoints/rt_1_tf_trained_for_000400120/" -"./checkpoints/rt_1_tf_trained_for_000058240/" -"./checkpoints/rt_1_x_tf_trained_for_002272480_step/" -"./checkpoints/rt_1_tf_trained_for_000001120/" +"CogACT/CogACT-Base" ) declare -a env_names=( @@ -22,7 +19,7 @@ scene_name=frl_apartment_stage_simple EvalSim() { echo ${ckpt_path} ${env_name} - python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} \ + CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model cogact --ckpt-path ${ckpt_path} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 200 \ --env-name ${env_name} --scene-name ${scene_name} \ diff --git a/scripts/rt1_put_in_drawer_visual_matching.sh b/simpler_env/policies/sim_cogact/scripts/cogact_put_in_drawer_visual_matching.sh similarity index 83% rename from scripts/rt1_put_in_drawer_visual_matching.sh rename to simpler_env/policies/sim_cogact/scripts/cogact_put_in_drawer_visual_matching.sh index 3d250f6d..c1b3d23e 100644 --- a/scripts/rt1_put_in_drawer_visual_matching.sh +++ b/simpler_env/policies/sim_cogact/scripts/cogact_put_in_drawer_visual_matching.sh @@ -1,10 +1,7 @@ # shader_dir=rt means that we turn on ray-tracing rendering; this is quite crucial for the open / close drawer task as policies often rely on shadows to infer depth - +gpu_id=0 declare -a ckpt_paths=( -"./checkpoints/rt_1_tf_trained_for_000400120/" -"./checkpoints/rt_1_tf_trained_for_000058240/" -"./checkpoints/rt_1_x_tf_trained_for_002272480_step/" -"./checkpoints/rt_1_tf_trained_for_000001120/" +"CogACT/CogACT-Base" ) @@ -25,7 +22,7 @@ EXTRA_ARGS="--enable-raytracing --additional-env-build-kwargs station_name=mk_st EvalOverlay() { # A0 -python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} \ +CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model cogact --ckpt-path ${ckpt_path} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 200 \ --env-name ${env_name} --scene-name dummy_drawer \ @@ -36,7 +33,7 @@ python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} ${EXTRA_ARGS} # B0 -python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} \ +CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model cogact --ckpt-path ${ckpt_path} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 200 \ --env-name ${env_name} --scene-name dummy_drawer \ @@ -47,7 +44,7 @@ python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} ${EXTRA_ARGS} # C0 -python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} \ +CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model cogact --ckpt-path ${ckpt_path} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 200 \ 
--env-name ${env_name} --scene-name dummy_drawer \ diff --git a/scripts/octo_bridge.sh b/simpler_env/policies/sim_cogact/scripts/octo_bridge.sh similarity index 92% rename from scripts/octo_bridge.sh rename to simpler_env/policies/sim_cogact/scripts/octo_bridge.sh index 607498b0..e4f4d686 100644 --- a/scripts/octo_bridge.sh +++ b/simpler_env/policies/sim_cogact/scripts/octo_bridge.sh @@ -1,5 +1,3 @@ - - gpu_id=0 declare -a policy_models=( "octo-small" @@ -15,13 +13,13 @@ rgb_overlay_path=ManiSkill2_real2sim/data/real_inpainting/bridge_real_eval_1.png robot_init_x=0.147 robot_init_y=0.028 -for init_rng in 0 2 4; +for init_rng in 0 2 4 6 8; do for policy_model in "${policy_models[@]}"; do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} \ --robot ${robot} --policy-setup widowx_bridge --octo-init-rng ${init_rng} \ - --control-freq 5 --sim-freq 500 --max-episode-steps 60 \ + --control-freq 5 --sim-freq 500 --max-episode-steps 120 \ --env-name StackGreenCubeOnYellowCubeBakedTexInScene-v0 --scene-name ${scene_name} \ --rgb-overlay-path ${rgb_overlay_path} \ --robot-init-x ${robot_init_x} ${robot_init_x} 1 --robot-init-y ${robot_init_y} ${robot_init_y} 1 --obj-variation-mode episode --obj-episode-range 0 24 \ @@ -30,7 +28,7 @@ do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy- CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} \ --robot ${robot} --policy-setup widowx_bridge --octo-init-rng ${init_rng} \ - --control-freq 5 --sim-freq 500 --max-episode-steps 60 \ + --control-freq 5 --sim-freq 500 --max-episode-steps 120 \ --env-name PutCarrotOnPlateInScene-v0 --scene-name ${scene_name} \ --rgb-overlay-path ${rgb_overlay_path} \ --robot-init-x ${robot_init_x} ${robot_init_x} 1 --robot-init-y ${robot_init_y} ${robot_init_y} 1 --obj-variation-mode episode --obj-episode-range 0 24 \ @@ -39,7 +37,7 @@ CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-mod CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} \ --robot ${robot} --policy-setup widowx_bridge --octo-init-rng ${init_rng} \ - --control-freq 5 --sim-freq 500 --max-episode-steps 60 \ + --control-freq 5 --sim-freq 500 --max-episode-steps 120 \ --env-name PutSpoonOnTableClothInScene-v0 --scene-name ${scene_name} \ --rgb-overlay-path ${rgb_overlay_path} \ --robot-init-x ${robot_init_x} ${robot_init_x} 1 --robot-init-y ${robot_init_y} ${robot_init_y} 1 --obj-variation-mode episode --obj-episode-range 0 24 \ @@ -60,7 +58,7 @@ rgb_overlay_path=ManiSkill2_real2sim/data/real_inpainting/bridge_sink.png robot_init_x=0.127 robot_init_y=0.06 -for init_rng in 0 2 4; +for init_rng in 0 2 4 6 8; do for policy_model in "${policy_models[@]}"; diff --git a/simpler_env/policies/spatialvla/spatialvla_model.py b/simpler_env/policies/spatialvla/spatialvla_model.py new file mode 100644 index 00000000..d1c2ba8f --- /dev/null +++ b/simpler_env/policies/spatialvla/spatialvla_model.py @@ -0,0 +1,240 @@ +from typing import Optional, Sequence, List +import os +import matplotlib.pyplot as plt +import numpy as np +from transforms3d.euler import euler2axangle +from transformers import AutoModel, AutoProcessor +from collections import deque +from PIL import Image +import torch +import cv2 as cv + +from simpler_env.utils.action.action_ensemble import ActionEnsembler + + +class SpatialVLAInference: + def 
__init__( + self, + saved_model_path: str = "IPEC-COMMUNITY/spatialvla-4b-224-pt", + unnorm_key: Optional[str] = None, + policy_setup: str = "widowx_bridge", + exec_horizon: int = 1, + image_size: list[int] = [224, 224], + action_scale: float = 1.0, + action_ensemble_temp: float = -0.8, + ) -> None: + os.environ["TOKENIZERS_PARALLELISM"] = "false" + if policy_setup == "widowx_bridge": + unnorm_key = "bridge_orig/1.0.0" if unnorm_key is None else unnorm_key + action_ensemble = True + self.sticky_gripper_num_repeat = 1 + elif policy_setup == "google_robot": + unnorm_key = ( + "fractal20220817_data/0.1.0" if unnorm_key is None else unnorm_key + ) + action_ensemble = True + self.sticky_gripper_num_repeat = 10 + else: + raise NotImplementedError( + f"Policy setup {policy_setup} not supported for octo models. The other datasets can be found in the huggingface config.json file." + ) + self.policy_setup = policy_setup + self.unnorm_key = unnorm_key + + print(f"*** policy_setup: {policy_setup}, unnorm_key: {unnorm_key} ***") + self.processor = AutoProcessor.from_pretrained( + saved_model_path, trust_remote_code=True + ) + self.vla = ( + AutoModel.from_pretrained( + saved_model_path, + torch_dtype=torch.bfloat16, + trust_remote_code=True, + ) + .eval() + .cuda() + ) + + self.image_size = image_size + self.action_scale = action_scale + self.obs_horizon = (self.processor.num_obs_steps - 1) * self.processor.obs_delta + 1 + self.obs_interval = self.processor.obs_delta + self.pred_action_horizon = self.processor.action_chunk_size + self.image_history = deque(maxlen=self.obs_horizon) + self.exec_horizon = exec_horizon + + self.sticky_action_is_on = False + self.gripper_action_repeat = 0 + self.sticky_gripper_action = 0.0 + self.previous_gripper_action = None + + self.action_ensemble = action_ensemble + self.action_ensemble_temp = action_ensemble_temp + + if self.action_ensemble: + self.action_ensembler = ActionEnsembler( + self.pred_action_horizon, self.action_ensemble_temp + ) + else: + self.action_ensembler = None + + self.task = None + self.task_description = None + + def reset(self, task_description: str) -> None: + self.image_history.clear() + if self.action_ensemble: + self.action_ensembler.reset() + self.task_description = task_description + self.sticky_action_is_on = False + self.gripper_action_repeat = 0 + self.sticky_gripper_action = 0.0 + self.previous_gripper_action = None + + def step( + self, image: np.ndarray, task_description: Optional[str] = None, *args, **kwargs + ) -> tuple[dict[str, np.ndarray], dict[str, np.ndarray]]: + """ + Input: + image: np.ndarray of shape (H, W, 3), uint8 + task_description: Optional[str], task description; if different from previous task description, policy state is reset + Output: + raw_action: dict; raw policy action output + action: dict; processed action to be sent to the maniskill2 environment, with the following keys: + - 'world_vector': np.ndarray of shape (3,), xyz translation of robot end-effector + - 'rot_axangle': np.ndarray of shape (3,), axis-angle representation of end-effector rotation + - 'gripper': np.ndarray of shape (1,), gripper action + - 'terminate_episode': np.ndarray of shape (1,), 1 if episode should be terminated, 0 otherwise + """ + if task_description is not None: + if task_description != self.task_description: + self.reset(task_description) + + assert image.dtype == np.uint8 + image = self._resize_image(image) + self._add_image_to_history(image) + images: List[Image.Image] = self._obtain_image_history() + prompt = task_description + 
+ # predict action (7-dof; un-normalize for bridgev2) + inputs = self.processor(images=images, text=prompt, unnorm_key=self.unnorm_key, return_tensors="pt", do_normalize=False) + with torch.no_grad(): + if hasattr(self.processor, "action_tokenizer"): + generation_outputs = self.vla.predict_action(inputs) + raw_actions = self.processor.decode_actions( + generation_outputs=generation_outputs, + unnorm_key=self.unnorm_key, + )["actions"] + else: + raw_actions = self.vla.predict_action(**inputs)["actions"] + raw_actions = raw_actions.cpu().numpy() + + if self.action_ensemble: + raw_actions = self.action_ensembler.ensemble_action(raw_actions)[None] + + raw_action = { + "world_vector": np.array(raw_actions[0, :3]), + "rotation_delta": np.array(raw_actions[0, 3:6]), + "open_gripper": np.array( + raw_actions[0, 6:7] + ), # range [0, 1]; 1 = open; 0 = close + } + + # process raw_action to obtain the action to be sent to the maniskill2 environment + action = {} + action["world_vector"] = raw_action["world_vector"] * self.action_scale + action_rotation_delta = np.asarray( + raw_action["rotation_delta"], dtype=np.float64 + ) + roll, pitch, yaw = action_rotation_delta + action_rotation_ax, action_rotation_angle = euler2axangle(roll, pitch, yaw) + action_rotation_axangle = action_rotation_ax * action_rotation_angle + action["rot_axangle"] = action_rotation_axangle * self.action_scale + + if self.policy_setup == "google_robot": + action["gripper"] = 0 + current_gripper_action = raw_action["open_gripper"] + if self.previous_gripper_action is None: + relative_gripper_action = np.array([0]) + self.previous_gripper_action = current_gripper_action + else: + relative_gripper_action = self.previous_gripper_action - current_gripper_action + # fix a bug in the SIMPLER code here + # self.previous_gripper_action = current_gripper_action + + if np.abs(relative_gripper_action) > 0.5 and (not self.sticky_action_is_on): + self.sticky_action_is_on = True + self.sticky_gripper_action = relative_gripper_action + self.previous_gripper_action = current_gripper_action + + if self.sticky_action_is_on: + self.gripper_action_repeat += 1 + relative_gripper_action = self.sticky_gripper_action + + if self.gripper_action_repeat == self.sticky_gripper_num_repeat: + self.sticky_action_is_on = False + self.gripper_action_repeat = 0 + self.sticky_gripper_action = 0.0 + + action["gripper"] = relative_gripper_action + + elif self.policy_setup == "widowx_bridge": + action["gripper"] = 2.0 * (raw_action["open_gripper"] > 0.5) - 1.0 + + action["terminate_episode"] = np.array([0.0]) + return raw_action, action + + def _resize_image(self, image: np.ndarray) -> np.ndarray: + image = cv.resize(image, tuple(self.image_size), interpolation=cv.INTER_AREA) + return image + + def _add_image_to_history(self, image: np.ndarray) -> None: + if len(self.image_history) == 0: + self.image_history.extend([image] * self.obs_horizon) + else: + self.image_history.append(image) + + def _obtain_image_history(self) -> List[Image.Image]: + image_history = list(self.image_history) + images = image_history[:: self.obs_interval] + images = [Image.fromarray(image).convert("RGB") for image in images] + return images + + def visualize_epoch( + self, + predicted_raw_actions: Sequence[np.ndarray], + images: Sequence[np.ndarray], + save_path: str, + ) -> None: + images = [self._resize_image(image) for image in images] + ACTION_DIM_LABELS = ["x", "y", "z", "roll", "pitch", "yaw", "grasp"] + + img_strip = np.concatenate(np.array(images[::3]), axis=1) + + # set up plt figure 
+ figure_layout = [["image"] * len(ACTION_DIM_LABELS), ACTION_DIM_LABELS] + plt.rcParams.update({"font.size": 12}) + fig, axs = plt.subplot_mosaic(figure_layout) + fig.set_size_inches([45, 10]) + + # plot actions + pred_actions = np.array( + [ + np.concatenate( + [a["world_vector"], a["rotation_delta"], a["open_gripper"]], axis=-1 + ) + for a in predicted_raw_actions + ] + ) + for action_dim, action_label in enumerate(ACTION_DIM_LABELS): + # actions have batch, horizon, dim, in this example we just take the first action for simplicity + axs[action_label].plot( + pred_actions[:, action_dim], label="predicted action" + ) + axs[action_label].set_title(action_label) + axs[action_label].set_xlabel("Time in one episode") + + axs["image"].imshow(img_strip) + axs["image"].set_xlabel("Time in one episode (subsampled)") + plt.legend() + plt.savefig(save_path) diff --git a/simpler_env/utils/action/action_ensemble.py b/simpler_env/utils/action/action_ensemble.py index 367ccf91..fc88fed2 100644 --- a/simpler_env/utils/action/action_ensemble.py +++ b/simpler_env/utils/action/action_ensemble.py @@ -21,7 +21,7 @@ def ensemble_action(self, cur_action): curr_act_preds = np.stack( [pred_actions[i] for (i, pred_actions) in zip(range(num_actions - 1, -1, -1), self.action_history)] ) - # more recent predictions get exponentially *less* weight than older predictions + # if temp > 0, more recent predictions get exponentially *less* weight than older predictions weights = np.exp(-self.action_ensemble_temp * np.arange(num_actions)) weights = weights / weights.sum() # compute the weighted average across all predictions for this timestep diff --git a/simpler_env/utils/geometry.py b/simpler_env/utils/geometry.py new file mode 100644 index 00000000..f1e7164a --- /dev/null +++ b/simpler_env/utils/geometry.py @@ -0,0 +1,463 @@ +""" +Mostly copied from transforms3d library + +""" + +import math + +import numpy as np + +_FLOAT_EPS = np.finfo(np.float64).eps + +# axis sequences for Euler angles +_NEXT_AXIS = [1, 2, 0, 1] + +# map axes strings to/from tuples of inner axis, parity, repetition, frame +_AXES2TUPLE = { + "sxyz": (0, 0, 0, 0), + "sxyx": (0, 0, 1, 0), + "sxzy": (0, 1, 0, 0), + "sxzx": (0, 1, 1, 0), + "syzx": (1, 0, 0, 0), + "syzy": (1, 0, 1, 0), + "syxz": (1, 1, 0, 0), + "syxy": (1, 1, 1, 0), + "szxy": (2, 0, 0, 0), + "szxz": (2, 0, 1, 0), + "szyx": (2, 1, 0, 0), + "szyz": (2, 1, 1, 0), + "rzyx": (0, 0, 0, 1), + "rxyx": (0, 0, 1, 1), + "ryzx": (0, 1, 0, 1), + "rxzx": (0, 1, 1, 1), + "rxzy": (1, 0, 0, 1), + "ryzy": (1, 0, 1, 1), + "rzxy": (1, 1, 0, 1), + "ryxy": (1, 1, 1, 1), + "ryxz": (2, 0, 0, 1), + "rzxz": (2, 0, 1, 1), + "rxyz": (2, 1, 0, 1), + "rzyz": (2, 1, 1, 1), +} + +_TUPLE2AXES = dict((v, k) for k, v in _AXES2TUPLE.items()) + +# For testing whether a number is close to zero +_EPS4 = np.finfo(float).eps * 4.0 + + +def mat2euler(mat, axes="sxyz"): + """Return Euler angles from rotation matrix for specified axis sequence. + + Note that many Euler angle triplets can describe one matrix. + + Parameters + ---------- + mat : array-like shape (3, 3) or (4, 4) + Rotation matrix or affine. + axes : str, optional + Axis specification; one of 24 axis sequences as string or encoded + tuple - e.g. ``sxyz`` (the default). + + Returns + ------- + ai : float + First rotation angle (according to `axes`). + aj : float + Second rotation angle (according to `axes`). + ak : float + Third rotation angle (according to `axes`). 
+ + Examples + -------- + >>> R0 = euler2mat(1, 2, 3, 'syxz') + >>> al, be, ga = mat2euler(R0, 'syxz') + >>> R1 = euler2mat(al, be, ga, 'syxz') + >>> np.allclose(R0, R1) + True + """ + try: + firstaxis, parity, repetition, frame = _AXES2TUPLE[axes.lower()] + except (AttributeError, KeyError): + _TUPLE2AXES[axes] # validation + firstaxis, parity, repetition, frame = axes + + i = firstaxis + j = _NEXT_AXIS[i + parity] + k = _NEXT_AXIS[i - parity + 1] + + M = np.array(mat, dtype=np.float64, copy=False)[:3, :3] + if repetition: + sy = math.sqrt(M[i, j] * M[i, j] + M[i, k] * M[i, k]) + if sy > _EPS4: + ax = math.atan2(M[i, j], M[i, k]) + ay = math.atan2(sy, M[i, i]) + az = math.atan2(M[j, i], -M[k, i]) + else: + ax = math.atan2(-M[j, k], M[j, j]) + ay = math.atan2(sy, M[i, i]) + az = 0.0 + else: + cy = math.sqrt(M[i, i] * M[i, i] + M[j, i] * M[j, i]) + if cy > _EPS4: + ax = math.atan2(M[k, j], M[k, k]) + ay = math.atan2(-M[k, i], cy) + az = math.atan2(M[j, i], M[i, i]) + else: + ax = math.atan2(-M[j, k], M[j, j]) + ay = math.atan2(-M[k, i], cy) + az = 0.0 + + if parity: + ax, ay, az = -ax, -ay, -az + if frame: + ax, az = az, ax + return ax, ay, az + + +def quat2mat(q): + """Calculate rotation matrix corresponding to quaternion + + Parameters + ---------- + q : 4 element array-like + + Returns + ------- + M : (3,3) array + Rotation matrix corresponding to input quaternion *q* + + Notes + ----- + Rotation matrix applies to column vectors, and is applied to the + left of coordinate vectors. The algorithm here allows quaternions that + have not been normalized. + + References + ---------- + Algorithm from http://en.wikipedia.org/wiki/Rotation_matrix#Quaternion + + Examples + -------- + >>> import numpy as np + >>> M = quat2mat([1, 0, 0, 0]) # Identity quaternion + >>> np.allclose(M, np.eye(3)) + True + >>> M = quat2mat([0, 1, 0, 0]) # 180 degree rotn around axis 0 + >>> np.allclose(M, np.diag([1, -1, -1])) + True + """ + w, x, y, z = q + Nq = w * w + x * x + y * y + z * z + if Nq < _FLOAT_EPS: + return np.eye(3) + s = 2.0 / Nq + X = x * s + Y = y * s + Z = z * s + wX = w * X + wY = w * Y + wZ = w * Z + xX = x * X + xY = x * Y + xZ = x * Z + yY = y * Y + yZ = y * Z + zZ = z * Z + return np.array( + [ + [1.0 - (yY + zZ), xY - wZ, xZ + wY], + [xY + wZ, 1.0 - (xX + zZ), yZ - wX], + [xZ - wY, yZ + wX, 1.0 - (xX + yY)], + ] + ) + + +# Checks if a matrix is a valid rotation matrix. +def isrotation( + R: np.ndarray, + thresh=1e-6, +) -> bool: + Rt = np.transpose(R) + shouldBeIdentity = np.dot(Rt, R) + iden = np.identity(3, dtype=R.dtype) + n = np.linalg.norm(iden - shouldBeIdentity) + return n < thresh + + +def euler2mat(ai, aj, ak, axes="sxyz"): + """Return rotation matrix from Euler angles and axis sequence. + + Parameters + ---------- + ai : float + First rotation angle (according to `axes`). + aj : float + Second rotation angle (according to `axes`). + ak : float + Third rotation angle (according to `axes`). + axes : str, optional + Axis specification; one of 24 axis sequences as string or encoded + tuple - e.g. ``sxyz`` (the default). + + Returns + ------- + mat : array (3, 3) + Rotation matrix or affine. 
+ + Examples + -------- + >>> R = euler2mat(1, 2, 3, 'syxz') + >>> np.allclose(np.sum(R[0]), -1.34786452) + True + >>> R = euler2mat(1, 2, 3, (0, 1, 0, 1)) + >>> np.allclose(np.sum(R[0]), -0.383436184) + True + """ + try: + firstaxis, parity, repetition, frame = _AXES2TUPLE[axes] + except (AttributeError, KeyError): + _TUPLE2AXES[axes] # validation + firstaxis, parity, repetition, frame = axes + + i = firstaxis + j = _NEXT_AXIS[i + parity] + k = _NEXT_AXIS[i - parity + 1] + + if frame: + ai, ak = ak, ai + if parity: + ai, aj, ak = -ai, -aj, -ak + + si, sj, sk = math.sin(ai), math.sin(aj), math.sin(ak) + ci, cj, ck = math.cos(ai), math.cos(aj), math.cos(ak) + cc, cs = ci * ck, ci * sk + sc, ss = si * ck, si * sk + + M = np.eye(3) + if repetition: + M[i, i] = cj + M[i, j] = sj * si + M[i, k] = sj * ci + M[j, i] = sj * sk + M[j, j] = -cj * ss + cc + M[j, k] = -cj * cs - sc + M[k, i] = -sj * ck + M[k, j] = cj * sc + cs + M[k, k] = cj * cc - ss + else: + M[i, i] = cj * ck + M[i, j] = sj * sc - cs + M[i, k] = sj * cc + ss + M[j, i] = cj * sk + M[j, j] = sj * ss + cc + M[j, k] = sj * cs - sc + M[k, i] = -sj + M[k, j] = cj * si + M[k, k] = cj * ci + return M + + +def euler2axangle(ai, aj, ak, axes="sxyz"): + """Return angle, axis corresponding to Euler angles, axis specification + + Parameters + ---------- + ai : float + First rotation angle (according to `axes`). + aj : float + Second rotation angle (according to `axes`). + ak : float + Third rotation angle (according to `axes`). + axes : str, optional + Axis specification; one of 24 axis sequences as string or encoded + tuple - e.g. ``sxyz`` (the default). + + Returns + ------- + vector : array shape (3,) + axis around which rotation occurs + theta : scalar + angle of rotation + + Examples + -------- + >>> vec, theta = euler2axangle(0, 1.5, 0, 'szyx') + >>> np.allclose(vec, [0, 1, 0]) + True + >>> theta + 1.5 + """ + return quat2axangle(euler2quat(ai, aj, ak, axes)) + + +def euler2quat(ai, aj, ak, axes="sxyz"): + """Return `quaternion` from Euler angles and axis sequence `axes` + + Parameters + ---------- + ai : float + First rotation angle (according to `axes`). + aj : float + Second rotation angle (according to `axes`). + ak : float + Third rotation angle (according to `axes`). + axes : str, optional + Axis specification; one of 24 axis sequences as string or encoded + tuple - e.g. ``sxyz`` (the default). 
+ + Returns + ------- + quat : array shape (4,) + Quaternion in w, x, y z (real, then vector) format + + Examples + -------- + >>> q = euler2quat(1, 2, 3, 'ryxz') + >>> np.allclose(q, [0.435953, 0.310622, -0.718287, 0.444435]) + True + """ + try: + firstaxis, parity, repetition, frame = _AXES2TUPLE[axes.lower()] + except (AttributeError, KeyError): + _TUPLE2AXES[axes] # validation + firstaxis, parity, repetition, frame = axes + + i = firstaxis + 1 + j = _NEXT_AXIS[i + parity - 1] + 1 + k = _NEXT_AXIS[i - parity] + 1 + + if frame: + ai, ak = ak, ai + if parity: + aj = -aj + + ai = ai / 2.0 + aj = aj / 2.0 + ak = ak / 2.0 + ci = math.cos(ai) + si = math.sin(ai) + cj = math.cos(aj) + sj = math.sin(aj) + ck = math.cos(ak) + sk = math.sin(ak) + cc = ci * ck + cs = ci * sk + sc = si * ck + ss = si * sk + + q = np.empty((4,)) + if repetition: + q[0] = cj * (cc - ss) + q[i] = cj * (cs + sc) + q[j] = sj * (cc + ss) + q[k] = sj * (cs - sc) + else: + q[0] = cj * cc + sj * ss + q[i] = cj * sc - sj * cs + q[j] = cj * ss + sj * cc + q[k] = cj * cs - sj * sc + if parity: + q[j] *= -1.0 + + return q + + +def quat2axangle(quat, identity_thresh=None): + """Convert quaternion to rotation of angle around axis + + Parameters + ---------- + quat : 4 element sequence + w, x, y, z forming quaternion. + identity_thresh : None or scalar, optional + Threshold below which the norm of the vector part of the quaternion (x, + y, z) is deemed to be 0, leading to the identity rotation. None (the + default) leads to a threshold estimated based on the precision of the + input. + + Returns + ------- + theta : scalar + angle of rotation. + vector : array shape (3,) + axis around which rotation occurs. + + Examples + -------- + >>> vec, theta = quat2axangle([0, 1, 0, 0]) + >>> vec + array([1., 0., 0.]) + >>> np.allclose(theta, np.pi) + True + + If this is an identity rotation, we return a zero angle and an arbitrary + vector: + + >>> quat2axangle([1, 0, 0, 0]) + (array([1., 0., 0.]), 0.0) + + If any of the quaternion values are not finite, we return a NaN in the + angle, and an arbitrary vector: + + >>> quat2axangle([1, np.inf, 0, 0]) + (array([1., 0., 0.]), nan) + + Notes + ----- + A quaternion for which x, y, z are all equal to 0, is an identity rotation. + In this case we return a 0 angle and an arbitrary vector, here [1, 0, 0]. + + The algorithm allows for quaternions that have not been normalized. + """ + quat = np.asarray(quat) + Nq = np.sum(quat**2) + if not np.isfinite(Nq): + return np.array([1.0, 0, 0]), float("nan") + if identity_thresh is None: + try: + identity_thresh = np.finfo(Nq.type).eps * 3 + except (AttributeError, ValueError): # Not a numpy type or not float + identity_thresh = _FLOAT_EPS * 3 + if Nq < _FLOAT_EPS**2: # Results unreliable after normalization + return np.array([1.0, 0, 0]), 0.0 + if Nq != 1: # Normalize if not normalized + s = math.sqrt(Nq) + quat = quat / s + xyz = quat[1:] + len2 = np.sum(xyz**2) + if len2 < identity_thresh**2: + # if vec is nearly 0,0,0, this is an identity rotation + return np.array([1.0, 0, 0]), 0.0 + # Make sure w is not slightly above 1 or below -1 + theta = 2 * math.acos(max(min(quat[0], 1), -1)) + return xyz / math.sqrt(len2), theta + + +def quat2euler(quaternion, axes="sxyz"): + """Euler angles from `quaternion` for specified axis sequence `axes` + + Parameters + ---------- + q : 4 element sequence + w, x, y, z of quaternion + axes : str, optional + Axis specification; one of 24 axis sequences as string or encoded + tuple - e.g. ``sxyz`` (the default). 
+ + Returns + ------- + ai : float + First rotation angle (according to `axes`). + aj : float + Second rotation angle (according to `axes`). + ak : float + Third rotation angle (according to `axes`). + + Examples + -------- + >>> angles = quat2euler([0.99810947, 0.06146124, 0, 0]) + >>> np.allclose(angles, [0.123, 0, 0]) + True + """ + return mat2euler(quat2mat(quaternion), axes) \ No newline at end of file diff --git a/simpler_env/utils/metrics.py b/simpler_env/utils/metrics.py index 39a251c9..6a7f207c 100644 --- a/simpler_env/utils/metrics.py +++ b/simpler_env/utils/metrics.py @@ -3,8 +3,9 @@ from typing import Sequence, Optional import numpy as np +from pandas.core.frame import frame_sub_kwargs -REAL_PERF = { # Real robot eval performance --> extract via: REAL_PERF[task][policy] +REAL_PERF = { # Real robot eval performance --> extract via: REAL_PERF[task][policy] "google_robot_pick_coke_can": { "rt-2-x": 0.907, "rt-1-converged": 0.853, @@ -68,7 +69,7 @@ } -SIMPLER_PERF = { # SIMPLER simulated eval performance --> extract via: SIMPLER_PERF[task][policy] +SIMPLER_PERF = { # SIMPLER simulated eval performance --> extract via: SIMPLER_PERF[task][policy] "google_robot_pick_coke_can": { "rt-2-x": 0.787, "rt-1-converged": 0.857, @@ -131,6 +132,187 @@ }, } +REF = { + "ckpt_name": [ + "RT-1(Converged)", + "RT-1(15%)", + "RT-1-X", + "RT-2-X", + "Octo-Base", + "Octo-Small", + "RT-1(begin)", + "OpenVLA", + "RoboVLM", + ], + "coke_can/matching_avg": [0.857, 0.710, 0.567, 0.787, 0.170, "nan", 0.027, 0.163, 0.727], + "coke_can/variant_avg": [0.898, 0.813, 0.490, 0.823, 0.006, "nan", 0.022, 0.545, "nan"], + "coke_can/matching/horizontal": [ + 0.960, + 0.860, + 0.820, + 0.740, + 0.210, + "nan", + 0.050, + 0.270, + 0.850, + ], + "coke_can/matching/vertical": [ + 0.900, + 0.790, + 0.330, + 0.740, + 0.210, + "nan", + 0.000, + 0.030, + 0.430, + ], + "coke_can/matching/standing": [ + 0.710, + 0.480, + 0.550, + 0.880, + 0.090, + "nan", + 0.030, + 0.190, + 0.900, + ], + "coke_can/variant/horizontal": [ + 0.969, + 0.920, + 0.569, + 0.822, + 0.005, + "nan", + 0.022, + 0.711, + "nan", + ], + "coke_can/variant/vertical": [ + 0.760, + 0.704, + 0.204, + 0.754, + 0.000, + "nan", + 0.013, + 0.271, + "nan", + ], + "coke_can/variant/standing": [ + 0.964, + 0.813, + 0.698, + 0.893, + 0.013, + "nan", + 0.031, + 0.653, + "nan", + ], + "move_near/variant": [0.500, 0.446, 0.323, 0.792, 0.031, "nan", 0.040, 0.477, "nan"], + "move_near/matching": [0.442, 0.354, 0.317, 0.779, 0.042, "nan", 0.050, 0.462, 0.663], + "drawer/matching_avg": [0.730, 0.565, 0.597, 0.250, 0.227, "nan", 0.139, 0.356, 0.268], + "drawer/variant_avg": [0.323, 0.267, 0.294, 0.353, 0.011, "nan", 0.069, 0.177, "nan"], + "drawer/matching/open": [0.601, 0.463, 0.296, 0.157, 0.009, "nan", 0.000, 0.194, 0.287], + "drawer/matching/close": [0.861, 0.667, 0.891, 0.343, 0.444, "nan", 0.278, 0.518, 0.25], + "drawer/variant/open": [0.270, 0.212, 0.069, 0.333, 0.000, "nan", 0.005, 0.158, "nan"], + "drawer/variant/close": [0.376, 0.323, 0.519, 0.372, 0.021, "nan", 0.132, 0.195, "nan"], + "apple_in_drawer/matching_avg": [0.065, 0.130, 0.213, 0.037, 0.000, 0.000, 0.000, "nan", 0.361], + "apple_in_drawer/variant_avg": [0.026, 0.021, 0.101, 0.206, 0.000, 0.000, 0.000, "nan", "nan"], + "put_spoon_on_tablecloth/matching_partial": [ + "nan", + "nan", + 0.167, + "nan", + 0.347, + 0.778, + "nan", + 0.041, + 0.375 + ], + "put_spoon_on_tablecloth/matching_entire": [ + "nan", + "nan", + 0.000, + "nan", + 0.125, + 0.472, + "nan", + 0.000, + 0.208 + ], + 
"put_carrot_on_plate/matching_partial": [ + "nan", + "nan", + 0.208, + "nan", + 0.528, + 0.278, + "nan", + 0.333, + 0.333 + ], + "put_carrot_on_plate/matching_entire": [ + "nan", + "nan", + 0.042, + "nan", + 0.083, + 0.097, + "nan", + 0.000, + 0.25 + ], + "stack_green_block_on_yellow_block/matching_partial": [ + "nan", + "nan", + 0.083, + "nan", + 0.319, + 0.403, + "nan", + 0.125, + 0.083 + ], + "stack_green_block_on_yellow_block/matching_entire": [ + "nan", + "nan", + 0.000, + "nan", + 0.000, + 0.042, + "nan", + 0.000, + 0.083 + ], + "put_eggplant_in_basket/matching_partial": [ + "nan", + "nan", + 0.000, + "nan", + 0.667, + 0.875, + "nan", + 0.083, + 0.000 + ], + "put_eggplant_in_basket/matching_entire": [ + "nan", + "nan", + 0.000, + "nan", + 0.431, + 0.569, + "nan", + 0.041, + 0.000 + ], +} + + def pearson_correlation(perf_sim: Sequence[float], perf_real: Sequence[float]) -> float: perf_sim, perf_real = np.array(perf_sim), np.array(perf_real) assert perf_sim.shape == perf_real.shape @@ -173,6 +355,7 @@ def print_all_kruskal_results( Prints out the Kruskal-Wallis test for each checkpoint """ from scipy.stats import kruskal + sim, real = np.array(sim), np.array(real) assert sim.shape == real.shape print(title) @@ -225,5 +408,4 @@ def get_dir_stats( results.append(1) elif succ_fail_pattern[1] in fname: results.append(0) - return results diff --git a/simpler_env/utils/visualization.py b/simpler_env/utils/visualization.py index 824f4c5b..15531a92 100644 --- a/simpler_env/utils/visualization.py +++ b/simpler_env/utils/visualization.py @@ -27,6 +27,20 @@ def write_video(path, images, fps=5): images_npy = images media.write_video(path, images_npy, fps=fps) +def write_interval_video(path, images, fps=5, interval=20): + # images: list of numpy arrays + root_dir = Path(path).parent + if not os.path.exists(root_dir): + os.makedirs(root_dir) + print(f"skip writting video D:") + os.system(f"touch {path}") + return + + if not isinstance(images[0], np.ndarray): + images_npy = [image.numpy() for image in images] + else: + images_npy = images + media.write_video(path, images_npy[::interval], fps=fps) def plot_pred_and_gt_action_trajectory(predicted_actions, gt_actions, stacked_images): """ diff --git a/simpler_env_miniconda.md b/simpler_env_miniconda.md new file mode 100644 index 00000000..9902993f --- /dev/null +++ b/simpler_env_miniconda.md @@ -0,0 +1,102 @@ +# SimplerEnv: Setup & Evaluation (group6/benchmark-v2) + +## 0. 前提 + +* NVIDIA Driver ≥ 535 / CUDA 12.1 相当 +* Linux + GPU ノード +* Git LFS が使えること(初回だけ): + + ```bash + conda install -c conda-forge -y git-lfs && git lfs install + ``` + +## 1. 取得 & ブランチ切替(サブモジュール込み) + +```bash +git clone https://github.com/simpler-env/SimplerEnv --recurse-submodules +cd SimplerEnv + +# 提出ブランチへ(compare: group6/benchmark-v2 → base: benchmark-v2) +git checkout group6/benchmark-v2 + +# 念のためサブモジュールを初期化 +git submodule update --init --recursive +``` + +## 2. Conda 環境 + +```bash +source ~/miniconda3/etc/profile.d/conda.sh +conda create -y -n simpler_env_group6 -c conda-forge python=3.11 +conda activate simpler_env_group6 +``` + +## 3. 依存インストール(SimperEnv / ManiSkill / System) + +```bash +# (A) SimplerEnv / ManiSkill2_real2sim(編集可能インストール) +cd ManiSkill2_real2sim && pip install -e . && cd .. +pip install -e . +pip install -r requirements_full_install.txt + +# (B) System系(Vulkan, ffmpeg, gsutil) +conda install -y -c conda-forge vulkan-tools vulkan-headers ffmpeg=4.2.2 gsutil +``` + +## 4. 
+## 4. PyTorch / Flash-Attn (CUDA 12.1)
+
+```bash
+pip install --index-url https://download.pytorch.org/whl/cu121 \
+    torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --no-cache-dir
+
+pip install --no-deps \
+    https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.1.post4/\
+flash_attn-2.7.1.post4+cu12torch2.5cxx11abiFALSE-cp311-cp311-linux_x86_64.whl
+```
+
+## 5. Isaac-GR00T (submodule; minimal set needed for inference)
+
+```bash
+cd Isaac-GR00T
+
+# base dependencies (heavy deps were installed above, hence --no-deps)
+pip install -e .[base] --no-deps
+
+# additional libraries (pinned to the versions that were needed)
+pip install \
+    pandas==2.2.3 \
+    "pydantic>=2,<3" typing_extensions==4.12.2 --force-reinstall \
+    albumentations==1.4.18 albucore==0.0.17 scikit-image==0.25.2 lazy_loader==0.4 --no-deps \
+    decord==0.6.0 av==12.3.0 --no-deps \
+    nptyping==2.5.0 numpydantic==1.6.10 --no-deps \
+    diffusers==0.30.2 timm==1.0.14 peft==0.14.0 \
+    transformers==4.51.3 --force-reinstall --no-deps \
+    pyzmq --no-deps \
+    "tokenizers>=0.21,<0.22" --no-deps \
+    "git+https://github.com/facebookresearch/pytorch3d.git"
+
+# overwrite the GR00T-side configs with this repo's versions
+cd ..
+cp ./scripts/gr00t/data_config.py ./Isaac-GR00T/gr00t/experiment/data_config.py
+cp ./scripts/gr00t/embodiment_tags.py ./Isaac-GR00T/gr00t/data/embodiment_tags.py
+```
+
+> `--no-deps` is used intentionally in several places to avoid unnecessary dependency conflicts.
+> If an additional dependency turns out to be required, install only that library as needed.
+
+## 6. Inference test
+
+WidowX (Bridge):
+
+```bash
+python scripts/gr00t/evaluate_bridge.py \
+    --ckpt-path /hogehoge-wasabi/path/to/checkpoint-group6/
+```
+
+Google Robot (Fractal):
+
+```bash
+python scripts/gr00t/evaluate_fractal.py \
+    --ckpt-path /hogehoge-wasabi/path/to/checkpoint-group6/
+```
+---
\ No newline at end of file
diff --git a/tools/calc_metrics_evaluation_videos.py b/tools/calc_metrics_evaluation_videos.py
index 1981ad25..4af72010 100644
--- a/tools/calc_metrics_evaluation_videos.py
+++ b/tools/calc_metrics_evaluation_videos.py
@@ -1,6 +1,7 @@
 import argparse
 import glob
 from pathlib import Path
+import pandas as pd
 
 import numpy as np
 from scipy.stats import kruskal
@@ -13,6 +14,7 @@
     print_all_kruskal_results,
 )
 
+from simpler_env.utils.metrics import REF
 
 # Calculate metrics for each task
@@ -21,28 +23,40 @@ def calc_pick_coke_can_stats(root_result_dir):
     # If you use a new checkpoint, please update the real evaluation results here
     coke_can_real_success = {
         "horizontal": {
-            "rt-2-x": 0.92,
-            "rt-1-converged": 0.96,
-            "rt-1-15pct": 1.0,
-            "rt-1-x": 0.88,
-            "rt-1-begin": 0.20,
-            "octo-base": 0.44,
+            # "rt-2-x": 0.92,
+            # "rt-1-converged": 0.96,
+            # "rt-1-15pct": 1.0,
+            # "rt-1-x": 0.88,
+            # "rt-1-begin": 0.20,
+            # "octo-base": 0.44,
+            # "openvla-7b": 0.44,
+            # "internvla-1_8b": 0.44,
+            # "internvla_diff-1_8b": 0.44,
+            "ours": 0.0,
         },
         "vertical": {
-            "rt-2-x": 0.80,
-            "rt-1-converged": 0.88,
-            "rt-1-15pct": 0.96,
-            "rt-1-x": 0.56,
-            "rt-1-begin": 0.00,
-            "octo-base": 0.20,
+            # "rt-2-x": 0.80,
+            # "rt-1-converged": 0.88,
+            # "rt-1-15pct": 0.96,
+            # "rt-1-x": 0.56,
+            # "rt-1-begin": 0.00,
+            # "octo-base": 0.20,
+            # "openvla-7b": 0.20,
+            # "internvla-1_8b": 0.44,
+            # "internvla_diff-1_8b": 0.44,
+            "ours": 0.0,
         },
         "standing": {
-            "rt-2-x": 1.00,
-            "rt-1-converged": 0.72,
-            "rt-1-15pct": 0.80,
-            "rt-1-x": 0.84,
-            "rt-1-begin": 0.20,
-            "octo-base": 0.24,
+            # "rt-2-x": 1.00,
+            # "rt-1-converged": 0.72,
+            # "rt-1-15pct": 0.80,
+            # "rt-1-x": 0.84,
+            # "rt-1-begin": 0.20,
+            # "octo-base": 0.24,
+            # "openvla-7b": 0.24,
+            # "internvla-1_8b": 0.44,
+            # "internvla_diff-1_8b": 0.44,
+            "ours": 0.0,
         },
     }
 
@@ -52,17 +66,18 @@ def calc_pick_coke_can_stats(root_result_dir):
        "vertical": 
"laid_vertically", "standing": "upright", } - n_orientations = len(coke_can_orientation_map_dict) # number of coke can orientations - n_trials_per_ckpt_per_orientation = ( - 25 # number of trials per checkpoint per coke can orientation; update if it is different - ) + n_orientations = len( + coke_can_orientation_map_dict + ) # number of coke can orientations + n_trials_per_ckpt_per_orientation = 25 # number of trials per checkpoint per coke can orientation; update if it is different # extra patterns required in file name; if you are using different visual matching overlay image, please update here extra_pattern_require_sim_variants = ["rgb_overlay_None"] extra_pattern_require_visual_matching = ["rgb_overlay_google_coke_can_real_eval_1"] # get simulation variant success coke_can_sim_variant_success = { - k1: {k2: [] for k2 in ckpt_alias_keys} for k1 in coke_can_orientation_map_dict.keys() + k1: {k2: [] for k2 in ckpt_alias_keys} + for k1 in coke_can_orientation_map_dict.keys() } # hardcoded variant aggregation result dirs; if you have new variants, please update here @@ -105,9 +120,15 @@ def calc_pick_coke_can_stats(root_result_dir): for coke_can_orientation in coke_can_orientation_map_dict.keys(): for ckpt_alias in ckpt_alias_keys: for variant in ( - base_variants + background_variants + lighting_variants + distractor_variants + table_texture_variants + base_variants + + background_variants + + lighting_variants + + distractor_variants + + table_texture_variants ): - variant = variant.format(coke_can_orientation_map_dict[coke_can_orientation]) + variant = variant.format( + coke_can_orientation_map_dict[coke_can_orientation] + ) variant = f"{root_result_dir}/{CKPT_MAPPING[ckpt_alias]}/{variant}" avg_sim_success = np.mean( get_dir_stats( @@ -117,8 +138,10 @@ def calc_pick_coke_can_stats(root_result_dir): ) if np.isnan(avg_sim_success): print(f"WARNING: avg_sim_success is nan for {variant}") - coke_can_sim_variant_success[coke_can_orientation][ckpt_alias].append(avg_sim_success) - + coke_can_sim_variant_success[coke_can_orientation][ckpt_alias].append( + avg_sim_success + ) + coke_can_sim_variant_success[coke_can_orientation][ckpt_alias] = np.mean( coke_can_sim_variant_success[coke_can_orientation][ckpt_alias] ) @@ -157,29 +180,40 @@ def calc_pick_coke_can_stats(root_result_dir): avg_orientation_sim_variant_results[-1].append( coke_can_sim_variant_success[coke_can_orientation][ckpt_alias] ) - avg_orientation_real_results[-1].append(coke_can_real_success[coke_can_orientation][ckpt_alias]) - avg_orientation_sim_variant_results[-1] = np.mean(avg_orientation_sim_variant_results[-1]) + avg_orientation_real_results[-1].append( + coke_can_real_success[coke_can_orientation][ckpt_alias] + ) + avg_orientation_sim_variant_results[-1] = np.mean( + avg_orientation_sim_variant_results[-1] + ) avg_orientation_real_results[-1] = np.mean(avg_orientation_real_results[-1]) print("avg_orientation_sim_variant_results", avg_orientation_sim_variant_results) print("avg_orientation_real_results", avg_orientation_real_results) print( "mean_maximum_rank_violation(avg_orientation_sim_variant_results, avg_orientation_real_results)", - mean_maximum_rank_violation(avg_orientation_sim_variant_results, avg_orientation_real_results), + mean_maximum_rank_violation( + avg_orientation_sim_variant_results, avg_orientation_real_results + ), ) print( "pearson_correlation(avg_orientation_sim_variant_results, avg_orientation_real_results)", - pearson_correlation(avg_orientation_sim_variant_results, avg_orientation_real_results), + 
pearson_correlation( + avg_orientation_sim_variant_results, avg_orientation_real_results + ), ) print("-" * 20) # get visual matching success coke_can_sim_visual_matching_success = { - k1: {k2: [] for k2 in ckpt_alias_keys} for k1 in coke_can_orientation_map_dict.keys() + k1: {k2: [] for k2 in ckpt_alias_keys} + for k1 in coke_can_orientation_map_dict.keys() } for coke_can_orientation in coke_can_orientation_map_dict.keys(): for ckpt_alias in ckpt_alias_keys: for variant in base_visual_matching_variants: - variant = variant.format(coke_can_orientation_map_dict[coke_can_orientation]) + variant = variant.format( + coke_can_orientation_map_dict[coke_can_orientation] + ) variant = f"{root_result_dir}/{CKPT_MAPPING[ckpt_alias]}/{variant}" avg_sim_success = np.mean( get_dir_stats( @@ -189,10 +223,18 @@ def calc_pick_coke_can_stats(root_result_dir): ) if np.isnan(avg_sim_success): print(f"WARNING: avg_sim_success is nan for {variant}") - coke_can_sim_visual_matching_success[coke_can_orientation][ckpt_alias].append(avg_sim_success) - print(f"Orientation {coke_can_orientation}, ckpt {ckpt_alias} all robot arm visual matching success: {coke_can_sim_visual_matching_success[coke_can_orientation][ckpt_alias]}") - coke_can_sim_visual_matching_success[coke_can_orientation][ckpt_alias] = np.mean( - coke_can_sim_visual_matching_success[coke_can_orientation][ckpt_alias] + coke_can_sim_visual_matching_success[coke_can_orientation][ + ckpt_alias + ].append(avg_sim_success) + print( + f"Orientation {coke_can_orientation}, ckpt {ckpt_alias} all robot arm visual matching success: {coke_can_sim_visual_matching_success[coke_can_orientation][ckpt_alias]}" + ) + coke_can_sim_visual_matching_success[coke_can_orientation][ckpt_alias] = ( + np.mean( + coke_can_sim_visual_matching_success[coke_can_orientation][ + ckpt_alias + ] + ) ) for coke_can_orientation in coke_can_orientation_map_dict.keys(): @@ -207,21 +249,27 @@ def calc_pick_coke_can_stats(root_result_dir): print( f"{coke_can_orientation} MMRV", mean_maximum_rank_violation( - list(coke_can_sim_visual_matching_success[coke_can_orientation].values()), + list( + coke_can_sim_visual_matching_success[coke_can_orientation].values() + ), list(coke_can_real_success[coke_can_orientation].values()), ), ) print( f"{coke_can_orientation} pearson correlation", pearson_correlation( - list(coke_can_sim_visual_matching_success[coke_can_orientation].values()), + list( + coke_can_sim_visual_matching_success[coke_can_orientation].values() + ), list(coke_can_real_success[coke_can_orientation].values()), ), ) print_all_kruskal_results( construct_unordered_trial_results( n_trials_per_ckpt_per_orientation, - list(coke_can_sim_visual_matching_success[coke_can_orientation].values()), + list( + coke_can_sim_visual_matching_success[coke_can_orientation].values() + ), ), construct_unordered_trial_results( n_trials_per_ckpt_per_orientation, @@ -239,8 +287,12 @@ def calc_pick_coke_can_stats(root_result_dir): avg_orientation_sim_visual_matching_results[-1].append( coke_can_sim_visual_matching_success[coke_can_orientation][ckpt_alias] ) - avg_orientation_real_results[-1].append(coke_can_real_success[coke_can_orientation][ckpt_alias]) - avg_orientation_sim_visual_matching_results[-1] = np.mean(avg_orientation_sim_visual_matching_results[-1]) + avg_orientation_real_results[-1].append( + coke_can_real_success[coke_can_orientation][ckpt_alias] + ) + avg_orientation_sim_visual_matching_results[-1] = np.mean( + avg_orientation_sim_visual_matching_results[-1] + ) avg_orientation_real_results[-1] = 
np.mean(avg_orientation_real_results[-1]) print( "avg_orientation_sim_visual_matching_results", @@ -249,11 +301,15 @@ def calc_pick_coke_can_stats(root_result_dir): print("avg_orientation_real_results", avg_orientation_real_results) print( "mean_maximum_rank_violation(avg_orientation_sim_visual_matching_results, avg_orientation_real_results)", - mean_maximum_rank_violation(avg_orientation_sim_visual_matching_results, avg_orientation_real_results), + mean_maximum_rank_violation( + avg_orientation_sim_visual_matching_results, avg_orientation_real_results + ), ) print( "pearson_correlation(avg_orientation_sim_visual_matching_results, avg_orientation_real_results)", - pearson_correlation(avg_orientation_sim_visual_matching_results, avg_orientation_real_results), + pearson_correlation( + avg_orientation_sim_visual_matching_results, avg_orientation_real_results + ), ) print_all_kruskal_results( construct_unordered_trial_results( @@ -270,18 +326,31 @@ def calc_pick_coke_can_stats(root_result_dir): print("*" * 20) for _ in range(3): print() + # process results and return + ret_dict = {} + ret_dict["coke_can/matching_avg"] = avg_orientation_sim_visual_matching_results + ret_dict["coke_can/variant_avg"] = avg_orientation_sim_variant_results + for key, val in coke_can_sim_visual_matching_success.items(): + ret_dict[f"coke_can/matching/{key}"] = [succ for _, succ in val.items()] + for key, val in coke_can_sim_variant_success.items(): + ret_dict[f"coke_can/variant/{key}"] = [succ for _, succ in val.items()] + return ret_dict def calc_move_near_stats(root_result_dir): print("***Move Near results***") # If you use a new checkpoint, please update the real evaluation results here move_near_real_success = { - "rt-2-x": 0.733, - "rt-1-converged": 0.633, - "rt-1-15pct": 0.583, - "rt-1-x": 0.45, - "rt-1-begin": 0.017, - "octo-base": 0.35, + # "rt-2-x": 0.733, + # "rt-1-converged": 0.633, + # "rt-1-15pct": 0.583, + # "rt-1-x": 0.45, + # "rt-1-begin": 0.017, + # "octo-base": 0.35, + # "openvla-7b": 0.35, + # "internvla-1_8b": 0.44, + # "internvla_diff-1_8b": 0.44, + "ours": 0.0, } ckpt_alias_keys = list(move_near_real_success.keys()) @@ -331,14 +400,24 @@ def calc_move_near_stats(root_result_dir): for ckpt_alias in ckpt_alias_keys: for variant in ( - base_variants + background_variants + lighting_variants + distractor_variants + table_texture_variants + base_variants + + background_variants + + lighting_variants + + distractor_variants + + table_texture_variants ): variant = f"{root_result_dir}/{CKPT_MAPPING[ckpt_alias]}/{variant}" - avg_sim_success = np.mean(get_dir_stats(variant, extra_pattern_require=extra_pattern_require_sim_variants)) + avg_sim_success = np.mean( + get_dir_stats( + variant, extra_pattern_require=extra_pattern_require_sim_variants + ) + ) if np.isnan(avg_sim_success): print(f"WARNING: avg_sim_success is nan for {variant}") move_near_sim_variant_success[ckpt_alias].append(avg_sim_success) - move_near_sim_variant_success[ckpt_alias] = np.mean(move_near_sim_variant_success[ckpt_alias]) + move_near_sim_variant_success[ckpt_alias] = np.mean( + move_near_sim_variant_success[ckpt_alias] + ) print("-" * 20) print("sim variant avg success", move_near_sim_variant_success) @@ -365,15 +444,20 @@ def calc_move_near_stats(root_result_dir): for variant in base_visual_matching_variants: variant = f"{root_result_dir}/{CKPT_MAPPING[ckpt_alias]}/{variant}" avg_sim_success = np.mean( - get_dir_stats(variant, extra_pattern_require=extra_pattern_require_visual_matching) + get_dir_stats( + variant, 
extra_pattern_require=extra_pattern_require_visual_matching + ) ) if np.isnan(avg_sim_success): print(f"WARNING: avg_sim_success is nan for {variant}") move_near_sim_visual_matching_success[ckpt_alias].append(avg_sim_success) - - print(f"Ckpt {ckpt_alias} all robot arm visual matching success: {move_near_sim_visual_matching_success[ckpt_alias]}") - move_near_sim_visual_matching_success[ckpt_alias] = np.mean(move_near_sim_visual_matching_success[ckpt_alias]) + print( + f"Ckpt {ckpt_alias} all robot arm visual matching success: {move_near_sim_visual_matching_success[ckpt_alias]}" + ) + move_near_sim_visual_matching_success[ckpt_alias] = np.mean( + move_near_sim_visual_matching_success[ckpt_alias] + ) print("sim visual matching success", move_near_sim_visual_matching_success) print("real success", move_near_real_success) @@ -392,8 +476,12 @@ def calc_move_near_stats(root_result_dir): ), ) print_all_kruskal_results( - construct_unordered_trial_results(n_trials_per_ckpt, list(move_near_sim_visual_matching_success.values())), - construct_unordered_trial_results(n_trials_per_ckpt, list(move_near_real_success.values())), + construct_unordered_trial_results( + n_trials_per_ckpt, list(move_near_sim_visual_matching_success.values()) + ), + construct_unordered_trial_results( + n_trials_per_ckpt, list(move_near_real_success.values()) + ), "avg kruskal:", ) @@ -401,26 +489,46 @@ def calc_move_near_stats(root_result_dir): for _ in range(3): print() + # TODO: process ret results + ret_dict = {} + for key, val in move_near_sim_variant_success.items(): + ret_dict["move_near/variant"] = [ + succ for _, succ in move_near_sim_variant_success.items() + ] + for key, val in move_near_sim_visual_matching_success.items(): + ret_dict["move_near/matching"] = [ + succ for _, succ in move_near_sim_visual_matching_success.items() + ] + return ret_dict + def calc_drawer_stats(root_result_dir): print("***Drawer results***") # If you use a new checkpoint, please update the real evaluation results here drawer_real_success = { "open": { - "rt-2-x": 0.333, - "rt-1-converged": 0.815, - "rt-1-15pct": 0.704, - "rt-1-x": 0.519, - "rt-1-begin": 0.000, - "octo-base": 0.148, + # "rt-2-x": 0.333, + # "rt-1-converged": 0.815, + # "rt-1-15pct": 0.704, + # "rt-1-x": 0.519, + # "rt-1-begin": 0.000, + # "octo-base": 0.148, + # "openvla-7b": 0.148, + # "internvla-1_8b": 0.44, + # "internvla_diff-1_8b": 0.44, + "ours": 0.0, }, "close": { - "rt-2-x": 0.630, - "rt-1-converged": 0.926, - "rt-1-15pct": 0.889, - "rt-1-x": 0.741, - "rt-1-begin": 0.000, - "octo-base": 0.519, + # "rt-2-x": 0.630, + # "rt-1-converged": 0.926, + # "rt-1-15pct": 0.889, + # "rt-1-x": 0.741, + # "rt-1-begin": 0.000, + # "octo-base": 0.519, + # "openvla-7b": 0.519, + # "internvla-1_8b": 0.44, + # "internvla_diff-1_8b": 0.44, + "ours": 0.0, }, } @@ -438,15 +546,15 @@ def calc_drawer_stats(root_result_dir): ], } n_tasks = len(drawer_task_map_dict) - n_trials_per_ckpt_per_task = ( - 27 # number of trials per checkpoint for all open / all close tasks; update if it is different - ) + n_trials_per_ckpt_per_task = 27 # number of trials per checkpoint for all open / all close tasks; update if it is different # extra patterns required in file name; if you are using different visual matching overlay image, please update here extra_pattern_require_sim_variants = ["rgb_overlay_None"] extra_pattern_require_visual_matching = ["rgb_overlay_open_drawer"] # get simulation variant success - drawer_sim_variant_success = {k1: {k2: [] for k2 in ckpt_alias_keys} for k1 in 
drawer_task_map_dict.keys()} + drawer_sim_variant_success = { + k1: {k2: [] for k2 in ckpt_alias_keys} for k1 in drawer_task_map_dict.keys() + } # hardcoded variant aggregation result dirs; if you have new variants, please update here base_variants = [ @@ -481,7 +589,12 @@ def calc_drawer_stats(root_result_dir): for drawer_task in drawer_task_map_dict.keys(): for ckpt_alias in ckpt_alias_keys: for specific_task in drawer_task_map_dict[drawer_task]: - for variant in base_variants + background_variants + lighting_variants + table_texture_variants: + for variant in ( + base_variants + + background_variants + + lighting_variants + + table_texture_variants + ): variant = variant.format(specific_task) variant = f"{root_result_dir}/{CKPT_MAPPING[ckpt_alias]}/{variant}" avg_sim_success = np.mean( @@ -492,7 +605,9 @@ def calc_drawer_stats(root_result_dir): ) if np.isnan(avg_sim_success): print(f"WARNING: avg_sim_success is nan for {variant}") - drawer_sim_variant_success[drawer_task][ckpt_alias].append(avg_sim_success) + drawer_sim_variant_success[drawer_task][ckpt_alias].append( + avg_sim_success + ) drawer_sim_variant_success[drawer_task][ckpt_alias] = np.mean( drawer_sim_variant_success[drawer_task][ckpt_alias] ) @@ -525,7 +640,9 @@ def calc_drawer_stats(root_result_dir): avg_sim_variant_results.append([]) avg_real_results.append([]) for drawer_task in drawer_task_map_dict.keys(): - avg_sim_variant_results[-1].append(drawer_sim_variant_success[drawer_task][ckpt_alias]) + avg_sim_variant_results[-1].append( + drawer_sim_variant_success[drawer_task][ckpt_alias] + ) avg_real_results[-1].append(drawer_real_success[drawer_task][ckpt_alias]) avg_sim_variant_results[-1] = np.mean(avg_sim_variant_results[-1]) avg_real_results[-1] = np.mean(avg_real_results[-1]) @@ -542,7 +659,9 @@ def calc_drawer_stats(root_result_dir): print("-" * 20) # get visual matching success - drawer_sim_visual_matching_success = {k1: {k2: [] for k2 in ckpt_alias_keys} for k1 in drawer_task_map_dict.keys()} + drawer_sim_visual_matching_success = { + k1: {k2: [] for k2 in ckpt_alias_keys} for k1 in drawer_task_map_dict.keys() + } for drawer_task in drawer_task_map_dict.keys(): for ckpt_alias in ckpt_alias_keys: for specific_task in drawer_task_map_dict[drawer_task]: @@ -557,13 +676,21 @@ def calc_drawer_stats(root_result_dir): ) if np.isnan(avg_sim_success): print(f"WARNING: avg_sim_success is nan for {variant}") - drawer_sim_visual_matching_success[drawer_task][ckpt_alias].append(avg_sim_success) + drawer_sim_visual_matching_success[drawer_task][ckpt_alias].append( + avg_sim_success + ) tmp_variant_avg_each_robot_arm = [] for i in range(len(base_visual_matching_variants)): tmp_variant_avg_each_robot_arm.append( - np.mean(drawer_sim_visual_matching_success[drawer_task][ckpt_alias][i::len(drawer_task_map_dict[drawer_task])]) + np.mean( + drawer_sim_visual_matching_success[drawer_task][ckpt_alias][ + i :: len(drawer_task_map_dict[drawer_task]) + ] + ) ) - print(f"Drawer task {drawer_task}, ckpt {ckpt_alias} all robot arm visual matching success: {tmp_variant_avg_each_robot_arm}") + print( + f"Drawer task {drawer_task}, ckpt {ckpt_alias} all robot arm visual matching success: {tmp_variant_avg_each_robot_arm}" + ) drawer_sim_visual_matching_success[drawer_task][ckpt_alias] = np.mean( drawer_sim_visual_matching_success[drawer_task][ckpt_alias] ) @@ -606,9 +733,13 @@ def calc_drawer_stats(root_result_dir): avg_sim_visual_matching_results.append([]) avg_real_results.append([]) for drawer_task in drawer_task_map_dict.keys(): - 
avg_sim_visual_matching_results[-1].append(drawer_sim_visual_matching_success[drawer_task][ckpt_alias]) + avg_sim_visual_matching_results[-1].append( + drawer_sim_visual_matching_success[drawer_task][ckpt_alias] + ) avg_real_results[-1].append(drawer_real_success[drawer_task][ckpt_alias]) - avg_sim_visual_matching_results[-1] = np.mean(avg_sim_visual_matching_results[-1]) + avg_sim_visual_matching_results[-1] = np.mean( + avg_sim_visual_matching_results[-1] + ) avg_real_results[-1] = np.mean(avg_real_results[-1]) print("avg_sim_visual_matching_results", avg_sim_visual_matching_results) print("avg_real_results", avg_real_results) @@ -621,27 +752,46 @@ def calc_drawer_stats(root_result_dir): pearson_correlation(avg_sim_visual_matching_results, avg_real_results), ) print_all_kruskal_results( - construct_unordered_trial_results(n_trials_per_ckpt_per_task * n_tasks, avg_sim_visual_matching_results), - construct_unordered_trial_results(n_trials_per_ckpt_per_task * n_tasks, avg_real_results), + construct_unordered_trial_results( + n_trials_per_ckpt_per_task * n_tasks, avg_sim_visual_matching_results + ), + construct_unordered_trial_results( + n_trials_per_ckpt_per_task * n_tasks, avg_real_results + ), "avg kruskal:", ) print("*" * 20) for _ in range(3): print() - - + + # TODO: process ret dict + ret_dict = { + "drawer/matching_avg": avg_sim_visual_matching_results, + "drawer/variant_avg": avg_sim_variant_results, + } + for key, val in drawer_sim_visual_matching_success.items(): + ret_dict[f"drawer/matching/{key}"] = [succ for _, succ in val.items()] + for key, val in drawer_sim_variant_success.items(): + ret_dict[f"drawer/variant/{key}"] = [succ for _, succ in val.items()] + return ret_dict + + def calc_long_horizon_apple_in_drawer_stats(root_result_dir): print("***Drawer results***") # If you use a new checkpoint, please update the real evaluation results here drawer_real_success = { "put_apple_into_top_drawer": { - "rt-2-x": 0.074, - "rt-1-converged": 0.185, - "rt-1-15pct": 0.185, - "rt-1-x": 0.407, - "rt-1-begin": 0.000, - "octo-base": 0.000, + # "rt-2-x": 0.074, + # "rt-1-converged": 0.185, + # "rt-1-15pct": 0.185, + # "rt-1-x": 0.407, + # "rt-1-begin": 0.000, + # "octo-base": 0.000, + # "openvla-7b": 0.000, + # "internvla-1_8b": 0.44, + # "internvla_diff-1_8b": 0.44, + "ours": 0.0, }, } @@ -652,9 +802,7 @@ def calc_long_horizon_apple_in_drawer_stats(root_result_dir): ], } n_tasks = len(drawer_task_map_dict) - n_trials_per_ckpt_per_task = ( - 27 # number of trials per checkpoint for each key in drawer_task_map_dict; update if it is different - ) + n_trials_per_ckpt_per_task = 27 # number of trials per checkpoint for each key in drawer_task_map_dict; update if it is different # extra patterns required in file name; if you are using different visual matching overlay image, please update here extra_pattern_require_sim_variants = ["rgb_overlay_None", "apple"] extra_pattern_require_visual_matching = ["rgb_overlay_open_drawer", "apple"] @@ -662,7 +810,9 @@ def calc_long_horizon_apple_in_drawer_stats(root_result_dir): extra_log_str_visual_matching = "model_ids_baked_apple_v2" # get simulation variant success - drawer_sim_variant_success = {k1: {k2: [] for k2 in ckpt_alias_keys} for k1 in drawer_task_map_dict.keys()} + drawer_sim_variant_success = { + k1: {k2: [] for k2 in ckpt_alias_keys} for k1 in drawer_task_map_dict.keys() + } # hardcoded variant aggregation result dirs; if you have new variants, please update here base_variants = [ @@ -697,8 +847,15 @@ def 
calc_long_horizon_apple_in_drawer_stats(root_result_dir): for drawer_task in drawer_task_map_dict.keys(): for ckpt_alias in ckpt_alias_keys: for specific_task in drawer_task_map_dict[drawer_task]: - for variant in base_variants + background_variants + lighting_variants + table_texture_variants: - variant = variant.format(specific_task) + f"_{extra_log_str_variant_agg}" + for variant in ( + base_variants + + background_variants + + lighting_variants + + table_texture_variants + ): + variant = ( + variant.format(specific_task) + f"_{extra_log_str_variant_agg}" + ) variant = f"{root_result_dir}/{CKPT_MAPPING[ckpt_alias]}/{variant}" avg_sim_success = np.mean( get_dir_stats( @@ -708,7 +865,9 @@ def calc_long_horizon_apple_in_drawer_stats(root_result_dir): ) if np.isnan(avg_sim_success): print(f"WARNING: avg_sim_success is nan for {variant}") - drawer_sim_variant_success[drawer_task][ckpt_alias].append(avg_sim_success) + drawer_sim_variant_success[drawer_task][ckpt_alias].append( + avg_sim_success + ) drawer_sim_variant_success[drawer_task][ckpt_alias] = np.mean( drawer_sim_variant_success[drawer_task][ckpt_alias] ) @@ -741,7 +900,9 @@ def calc_long_horizon_apple_in_drawer_stats(root_result_dir): avg_sim_variant_results.append([]) avg_real_results.append([]) for drawer_task in drawer_task_map_dict.keys(): - avg_sim_variant_results[-1].append(drawer_sim_variant_success[drawer_task][ckpt_alias]) + avg_sim_variant_results[-1].append( + drawer_sim_variant_success[drawer_task][ckpt_alias] + ) avg_real_results[-1].append(drawer_real_success[drawer_task][ckpt_alias]) avg_sim_variant_results[-1] = np.mean(avg_sim_variant_results[-1]) avg_real_results[-1] = np.mean(avg_real_results[-1]) @@ -758,12 +919,17 @@ def calc_long_horizon_apple_in_drawer_stats(root_result_dir): print("-" * 20) # get visual matching success - drawer_sim_visual_matching_success = {k1: {k2: [] for k2 in ckpt_alias_keys} for k1 in drawer_task_map_dict.keys()} + drawer_sim_visual_matching_success = { + k1: {k2: [] for k2 in ckpt_alias_keys} for k1 in drawer_task_map_dict.keys() + } for drawer_task in drawer_task_map_dict.keys(): for ckpt_alias in ckpt_alias_keys: for specific_task in drawer_task_map_dict[drawer_task]: for variant in base_visual_matching_variants: - variant = variant.format(specific_task) + f"_{extra_log_str_visual_matching}" + variant = ( + variant.format(specific_task) + + f"_{extra_log_str_visual_matching}" + ) variant = f"{root_result_dir}/{CKPT_MAPPING[ckpt_alias]}/{variant}" avg_sim_success = np.mean( get_dir_stats( @@ -773,13 +939,21 @@ def calc_long_horizon_apple_in_drawer_stats(root_result_dir): ) if np.isnan(avg_sim_success): print(f"WARNING: avg_sim_success is nan for {variant}") - drawer_sim_visual_matching_success[drawer_task][ckpt_alias].append(avg_sim_success) + drawer_sim_visual_matching_success[drawer_task][ckpt_alias].append( + avg_sim_success + ) tmp_variant_avg_each_robot_arm = [] for i in range(len(base_visual_matching_variants)): tmp_variant_avg_each_robot_arm.append( - np.mean(drawer_sim_visual_matching_success[drawer_task][ckpt_alias][i::len(drawer_task_map_dict[drawer_task])]) + np.mean( + drawer_sim_visual_matching_success[drawer_task][ckpt_alias][ + i :: len(drawer_task_map_dict[drawer_task]) + ] + ) ) - print(f"Drawer task {drawer_task}, ckpt {ckpt_alias} all robot arm visual matching success: {tmp_variant_avg_each_robot_arm}") + print( + f"Drawer task {drawer_task}, ckpt {ckpt_alias} all robot arm visual matching success: {tmp_variant_avg_each_robot_arm}" + ) 
drawer_sim_visual_matching_success[drawer_task][ckpt_alias] = np.mean( drawer_sim_visual_matching_success[drawer_task][ckpt_alias] ) @@ -822,9 +996,13 @@ def calc_long_horizon_apple_in_drawer_stats(root_result_dir): avg_sim_visual_matching_results.append([]) avg_real_results.append([]) for drawer_task in drawer_task_map_dict.keys(): - avg_sim_visual_matching_results[-1].append(drawer_sim_visual_matching_success[drawer_task][ckpt_alias]) + avg_sim_visual_matching_results[-1].append( + drawer_sim_visual_matching_success[drawer_task][ckpt_alias] + ) avg_real_results[-1].append(drawer_real_success[drawer_task][ckpt_alias]) - avg_sim_visual_matching_results[-1] = np.mean(avg_sim_visual_matching_results[-1]) + avg_sim_visual_matching_results[-1] = np.mean( + avg_sim_visual_matching_results[-1] + ) avg_real_results[-1] = np.mean(avg_real_results[-1]) print("avg_sim_visual_matching_results", avg_sim_visual_matching_results) print("avg_real_results", avg_real_results) @@ -837,14 +1015,29 @@ def calc_long_horizon_apple_in_drawer_stats(root_result_dir): pearson_correlation(avg_sim_visual_matching_results, avg_real_results), ) print_all_kruskal_results( - construct_unordered_trial_results(n_trials_per_ckpt_per_task * n_tasks, avg_sim_visual_matching_results), - construct_unordered_trial_results(n_trials_per_ckpt_per_task * n_tasks, avg_real_results), + construct_unordered_trial_results( + n_trials_per_ckpt_per_task * n_tasks, avg_sim_visual_matching_results + ), + construct_unordered_trial_results( + n_trials_per_ckpt_per_task * n_tasks, avg_real_results + ), "avg kruskal:", ) print("*" * 20) for _ in range(3): print() + + # TODO: process ret dict + ret_dict = { + "apple_in_drawer/matching_avg": avg_sim_visual_matching_results, + "apple_in_drawer/variant_avg": avg_sim_variant_results, + } + # for key, val in drawer_sim_visual_matching_success.items(): + # ret_dict[f"drawer/matching/{key}"] = [succ for _, succ in val.items()] + # for key, val in drawer_sim_variant_success.items(): + # ret_dict[f"drawer/variant/{key}"] = [succ for _, succ in val.items()] + return ret_dict def calc_bridge_put_on_env_stats(root_result_dir): @@ -852,42 +1045,78 @@ def calc_bridge_put_on_env_stats(root_result_dir): # If you use a new checkpoint, please update the real evaluation results here real_partial_success_dict = { "put_spoon_on_tablecloth": { - "rt-1-x": 0.042, - "octo-base": 0.500, - "octo-small": 0.542, + # "rt-1-x": 0.042, + # "octo-base": 0.500, + # "openvla-7b": 0.500, + # "internvla-1_8b": 0.44, + # "internvla_diff-1_8b": 0.44, + "ours": 0.0, + # "octo-small": 0.542, }, "put_carrot_on_plate": { - "rt-1-x": 0.167, - "octo-base": 0.500, - "octo-small": 0.208, + # "rt-1-x": 0.167, + # "octo-base": 0.500, + # "openvla-7b": 0.500, + # "internvla-1_8b": 0.44, + # "internvla_diff-1_8b": 0.44, + "ours": 0.0, + # "octo-small": 0.208, }, "stack_green_block_on_yellow_block": { - "rt-1-x": 0.000, - "octo-base": 0.292, - "octo-small": 0.583, + # "rt-1-x": 0.000, + # "octo-base": 0.292, + # "openvla-7b": 0.292, + # "internvla-1_8b": 0.44, + # "internvla_diff-1_8b": 0.44, + "ours": 0.0, + # "octo-small": 0.583, }, "put_eggplant_in_basket": { - "rt-1-x": 0.000, - "octo-base": 0.400, - "octo-small": 0.600, + # "rt-1-x": 0.000, + # "octo-base": 0.400, + # "openvla-7b": 0.400, + # "internvla-1_8b": 0.44, + # "internvla_diff-1_8b": 0.44, + "ours": 0.0, + # "octo-small": 0.600, }, } real_success_dict = { "put_spoon_on_tablecloth": { - "rt-1-x": 0.000, - "octo-base": 0.333, - "octo-small": 0.417, + # "rt-1-x": 0.000, + # 
"octo-base": 0.333, + # "openvla-7b": 0.333, + # "internvla-1_8b": 0.44, + # "internvla_diff-1_8b": 0.44, + "ours": 0.0, + # "octo-small": 0.417, + }, + "put_carrot_on_plate": { + # "rt-1-x": 0.00, + # "octo-base": 0.25, + # "octo-small": 0.083, + # "openvla-7b": 0, + # "internvla-1_8b": 0.44, + # "internvla_diff-1_8b": 0.44, + "ours": 0.0, }, - "put_carrot_on_plate": {"rt-1-x": 0.00, "octo-base": 0.25, "octo-small": 0.083}, "stack_green_block_on_yellow_block": { - "rt-1-x": 0.000, - "octo-base": 0.000, - "octo-small": 0.125, + # "rt-1-x": 0.000, + # "octo-base": 0.000, + # "openvla-7b": 0.000, + # "internvla-1_8b": 0.44, + # "internvla_diff-1_8b": 0.44, + "ours": 0.0, + # "octo-small": 0.125, }, "put_eggplant_in_basket": { - "rt-1-x": 0.000, - "octo-base": 0.250, - "octo-small": 0.400, + # "rt-1-x": 0.000, + # "octo-base": 0.250, + # "openvla-7b": 0.250, + # "internvla-1_8b": 0.44, + # "internvla_diff-1_8b": 0.44, + "ours": 0.0, + # "octo-small": 0.400, }, } @@ -931,6 +1160,7 @@ def calc_bridge_put_on_env_stats(root_result_dir): # partial_succ_fail_pattern = ['consecutive_grasp_True', 'consecutive_grasp_False'] # get visual matching success + ret_dict = {} for task in tasks: real_success = real_success_dict[task] real_partial_success = real_partial_success_dict[task] @@ -944,32 +1174,49 @@ def calc_bridge_put_on_env_stats(root_result_dir): # we average octo performance over different random seeds tmp = [] for seed in octo_seed_range: - tmp.extend([f"{variant}_octo_init_rng_{seed}" for variant in base_visual_matching_variants]) + tmp.extend( + [ + f"{variant}_octo_init_rng_{seed}" + for variant in base_visual_matching_variants + ] + ) base_visual_matching_variants = tmp for variant in base_visual_matching_variants: variant = f"{root_result_dir}/{CKPT_MAPPING[ckpt_alias]}/{variant}" avg_sim_success = np.mean( get_dir_stats( variant, - extra_pattern_require=extra_pattern_require_visual_matching[task], + extra_pattern_require=extra_pattern_require_visual_matching[ + task + ], succ_fail_pattern=succ_fail_pattern, ) ) avg_sim_partial_success = np.mean( get_dir_stats( variant, - extra_pattern_require=extra_pattern_require_visual_matching[task], + extra_pattern_require=extra_pattern_require_visual_matching[ + task + ], succ_fail_pattern=partial_succ_fail_pattern, ) ) if np.isnan(avg_sim_success) or np.isnan(avg_sim_partial_success): print(f"WARNING: avg_sim_success is nan for {variant}") sim_visual_matching_success[ckpt_alias].append(avg_sim_success) - sim_visual_matching_partial_success[ckpt_alias].append(avg_sim_partial_success) - sim_visual_matching_success[ckpt_alias] = np.mean(sim_visual_matching_success[ckpt_alias]) - sim_visual_matching_partial_success[ckpt_alias] = np.mean(sim_visual_matching_partial_success[ckpt_alias]) + sim_visual_matching_partial_success[ckpt_alias].append( + avg_sim_partial_success + ) + sim_visual_matching_success[ckpt_alias] = np.mean( + sim_visual_matching_success[ckpt_alias] + ) + sim_visual_matching_partial_success[ckpt_alias] = np.mean( + sim_visual_matching_partial_success[ckpt_alias] + ) - print("sim visual matching partial success", sim_visual_matching_partial_success) + print( + "sim visual matching partial success", sim_visual_matching_partial_success + ) print("real partial success", real_partial_success) print( "visual matching MMRV (partial success)", @@ -986,8 +1233,12 @@ def calc_bridge_put_on_env_stats(root_result_dir): ), ) print_all_kruskal_results( - construct_unordered_trial_results(n_trials_per_ckpt, 
list(sim_visual_matching_partial_success.values())), - construct_unordered_trial_results(n_trials_per_ckpt, list(real_partial_success.values())), + construct_unordered_trial_results( + n_trials_per_ckpt, list(sim_visual_matching_partial_success.values()) + ), + construct_unordered_trial_results( + n_trials_per_ckpt, list(real_partial_success.values()) + ), "avg kruskal (partial success):", ) @@ -995,15 +1246,23 @@ def calc_bridge_put_on_env_stats(root_result_dir): print("real success", real_success) print( "visual matching MMRV", - mean_maximum_rank_violation(list(sim_visual_matching_success.values()), list(real_success.values())), + mean_maximum_rank_violation( + list(sim_visual_matching_success.values()), list(real_success.values()) + ), ) print( "visual matching pearson correlation", - pearson_correlation(list(sim_visual_matching_success.values()), list(real_success.values())), + pearson_correlation( + list(sim_visual_matching_success.values()), list(real_success.values()) + ), ) print_all_kruskal_results( - construct_unordered_trial_results(n_trials_per_ckpt, list(sim_visual_matching_success.values())), - construct_unordered_trial_results(n_trials_per_ckpt, list(real_success.values())), + construct_unordered_trial_results( + n_trials_per_ckpt, list(sim_visual_matching_success.values()) + ), + construct_unordered_trial_results( + n_trials_per_ckpt, list(real_success.values()) + ), "avg kruskal:", ) @@ -1011,39 +1270,69 @@ def calc_bridge_put_on_env_stats(root_result_dir): for _ in range(3): print() + ret_dict[f"{task}/matching_partial"] = [ + succ for _, succ in sim_visual_matching_partial_success.items() + ] + ret_dict[f"{task}/matching_entire"] = [ + succ for _, succ in sim_visual_matching_success.items() + ] + + return ret_dict + # Define checkpoint alias-to-directory mapping; If you use a new checkpoint, please update the dict CKPT_MAPPING = { - "rt-2-x": "rt_2_x", - "rt-1-converged": "rt_1_tf_trained_for_000400120", - "rt-1-15pct": "rt_1_tf_trained_for_000058240", - "rt-1-x": "rt_1_x_tf_trained_for_002272480_step", - "rt-1-begin": "rt_1_tf_trained_for_000001120", - "octo-base": "octo-base", - "octo-small": "octo-small", - "octo-server": "octo-server", + # "rt-2-x": "rt_2_x", + # "rt-1-converged": "rt_1_tf_trained_for_000400120", + # "rt-1-15pct": "rt_1_tf_trained_for_000058240", + # "rt-1-x": "rt_1_x_tf_trained_for_002272480_step", + # "rt-1-begin": "rt_1_tf_trained_for_000001120", + # "octo-base": "octo-base", + # "openvla-7b": "openvla-7b", + # "internvla-1_8b": "internvla-1_8b", + # "internvla_diff-1_8b": "internvla_diff-1_8b", + "ours": "TODO", + # "octo-small": "octo-small", + # "octo-server": "octo-server", } parser = argparse.ArgumentParser() -parser.add_argument("--task", type=str, default="pick_coke_can", help="task name") -parser.add_argument("--log-dir-root", type=str, default="./results/", help="log directory") - +parser.add_argument( + "--log-dir-root", type=str, default="./results/", help="log directory" +) args = parser.parse_args() -if args.task == "pick_coke_can": - calc_pick_coke_can_stats(args.log_dir_root) -elif args.task == "move_near": - calc_move_near_stats(args.log_dir_root) -elif args.task == "drawer": - calc_drawer_stats(args.log_dir_root) -elif args.task == "long_horizon_apple_in_drawer": - calc_long_horizon_apple_in_drawer_stats(args.log_dir_root) -elif args.task == "bridge_put_on": - calc_bridge_put_on_env_stats(args.log_dir_root) -else: - raise ValueError(f"Unknown task: {args.task}") +# NOTE: replace the CKPT_MAPPING with the actual checkpoint 
directories
+CKPT_MAPPING["ours"] = Path(args.log_dir_root).name
+
+pick_coke_can_results = calc_pick_coke_can_stats(str(Path(args.log_dir_root).parent))
+move_near_real_results = calc_move_near_stats(str(Path(args.log_dir_root).parent))
+drawer_results = calc_drawer_stats(str(Path(args.log_dir_root).parent))
+long_horizon_apple_in_drawer_results = calc_long_horizon_apple_in_drawer_stats(str(Path(args.log_dir_root).parent))
+bridge_put_on_results = calc_bridge_put_on_env_stats(
+    str(Path(args.log_dir_root).parent)
+)
+
+results = {
+    **pick_coke_can_results,
+    **move_near_real_results,
+    **drawer_results,
+    **bridge_put_on_results,
+    **long_horizon_apple_in_drawer_results,
+    "ckpt_name": ["ours"],
+}
+
+df = pd.DataFrame(results)
+df_ref = pd.DataFrame(REF)
+results_df = pd.concat([df, df_ref], axis=0, ignore_index=True)
+results_df = results_df.transpose()
+results_df.to_markdown(f"{args.log_dir_root}/results.md", index=True)  # writes the ours-vs-reference table; returns None when a path is given
+
+df.loc[0, "ckpt_name"] = CKPT_MAPPING["ours"]
+df.to_csv(f"{args.log_dir_root}/results.csv", index=False, float_format="%.3f")
+print(df)
 exit(0)
 
 """