
Commit 6f5a5f7: refactor metrics

1 parent 69bca18

3 files changed (+133, −138 lines)


README.md

Lines changed: 5 additions & 7 deletions
@@ -19,7 +19,7 @@ We hope that our work guides and inspires future real-to-sim evaluation efforts.
 - [Installation](#installation)
 - [Examples](#examples)
 - [Current Environments](#current-environments)
-- [Compare Your Approach to SIMPLER](#compare-your-approach-to-simpler)
+- [Compare Your Policy Evaluation Approach to SIMPLER](#compare-your-policy-evaluation-approach-to-simpler)
 - [Code Structure](#code-structure)
 - [Adding New Policies](#adding-new-policies)
 - [Adding New Real-to-Sim Evaluation Environments and Robots](#adding-new-real-to-sim-evaluation-environments-and-robots)

@@ -128,17 +128,15 @@ We also support creating sub-tasks variations such as `google_robot_pick_{horizo
 By default, Google Robot environments use a control frequency of 3hz, and Bridge environments use a control frequency of 5hz. Simulation frequency is ~500hz.
 
 
-## Compare Your Approach to SIMPLER
+## Compare Your Policy Evaluation Approach to SIMPLER
 
-We make it easy to compare your approach for offline robot policy evaluation to SIMPLER. In [our paper](https://simpler-env.github.io/) we use two metrics to measure the quality of simulated evaluation pipelines: Mean Maximum Rank Violation (MMRV) and the Pearson Correlation Coefficient.
-Both capture, how well the offline evaluations reflect the policy's real-world performance during robot rollouts.
+We make it easy to compare your offline robot policy evaluation approach to SIMPLER. In [our paper](https://simpler-env.github.io/), we use two metrics to measure the quality of simulated evaluation pipelines: Mean Maximum Rank Violation (MMRV) and the Pearson Correlation Coefficient. Both capture how well the offline evaluations reflect the policy's real-world performance and behaviors during robot rollouts.
 
-To make comparisons easy, we provide all our raw policy evaluation data: performance values for all policies on all real-world tasks. We also provide the corresponding estimates for policy performance of SIMPLER and functions for computing the aforementioned metrics we report in the paper.
+To make comparisons easy, we provide our real and SIMPLER evaluation performance for all policies on all tasks. We also provide the corresponding functions for computing the aforementioned metrics we report in the paper.
 
 To compute the corresponding metrics for *your* offline policy evaluation approach `your_sim_eval(task, policy)`, you can use the following snippet:
 ```
-from simpler_env.utils.metrics import mean_maximum_rank_violation, pearson_correlation
-from tools.calc_metrics import REAL_PERF
+from simpler_env.utils.metrics import mean_maximum_rank_violation, pearson_correlation, REAL_PERF
 
 sim_eval_perf = [
     your_sim_eval(task="google_robot_move_near", policy=p)
simpler_env/utils/metrics.py

Lines changed: 126 additions & 0 deletions
@@ -4,6 +4,132 @@
 
 import numpy as np
 
+REAL_PERF = { # Real robot eval performance --> extract via: REAL_PERF[task][policy]
+    "google_robot_pick_coke_can": {
+        "rt-2-x": 0.907,
+        "rt-1-converged": 0.853,
+        "rt-1-15pct": 0.920,
+        "rt-1-x": 0.760,
+        "rt-1-begin": 0.133,
+        "octo-base": 0.293,
+    },
+    "google_robot_move_near": {
+        "rt-2-x": 0.733,
+        "rt-1-converged": 0.633,
+        "rt-1-15pct": 0.583,
+        "rt-1-x": 0.450,
+        "rt-1-begin": 0.017,
+        "octo-base": 0.350,
+    },
+    "google_robot_open_drawer": {
+        "rt-2-x": 0.333,
+        "rt-1-converged": 0.815,
+        "rt-1-15pct": 0.704,
+        "rt-1-x": 0.519,
+        "rt-1-begin": 0.000,
+        "octo-base": 0.148,
+    },
+    "google_robot_close_drawer": {
+        "rt-2-x": 0.630,
+        "rt-1-converged": 0.926,
+        "rt-1-15pct": 0.889,
+        "rt-1-x": 0.741,
+        "rt-1-begin": 0.000,
+        "octo-base": 0.519,
+    },
+    "google_robot_place_apple_in_closed_top_drawer": {
+        "rt-2-x": 0.074,
+        "rt-1-converged": 0.185,
+        "rt-1-15pct": 0.185,
+        "rt-1-x": 0.407,
+        "rt-1-begin": 0.000,
+        "octo-base": 0.000,
+    },
+    "widowx_spoon_on_towel": {
+        "rt-1-x": 0.000,
+        "octo-base": 0.333,
+        "octo-small": 0.417,
+    },
+    "widowx_carrot_on_plate": {
+        "rt-1-x": 0.000,
+        "octo-base": 0.250,
+        "octo-small": 0.083,
+    },
+    "widowx_stack_cube": {
+        "rt-1-x": 0.000,
+        "octo-base": 0.000,
+        "octo-small": 0.125,
+    },
+    "widowx_put_eggplant_in_basket": {
+        "rt-1-x": 0.000,
+        "octo-base": 0.250,
+        "octo-small": 0.400,
+    },
+}
+
+
+SIMPLER_PERF = { # SIMPLER simulated eval performance --> extract via: SIMPLER_PERF[task][policy]
+    "google_robot_pick_coke_can": {
+        "rt-2-x": 0.787,
+        "rt-1-converged": 0.857,
+        "rt-1-15pct": 0.710,
+        "rt-1-x": 0.567,
+        "rt-1-begin": 0.027,
+        "octo-base": 0.170,
+    },
+    "google_robot_move_near": {
+        "rt-2-x": 0.779,
+        "rt-1-converged": 0.442,
+        "rt-1-15pct": 0.354,
+        "rt-1-x": 0.317,
+        "rt-1-begin": 0.050,
+        "octo-base": 0.042,
+    },
+    "google_robot_open_drawer": {
+        "rt-2-x": 0.157,
+        "rt-1-converged": 0.601,
+        "rt-1-15pct": 0.463,
+        "rt-1-x": 0.296,
+        "rt-1-begin": 0.000,
+        "octo-base": 0.009,
+    },
+    "google_robot_close_drawer": {
+        "rt-2-x": 0.343,
+        "rt-1-converged": 0.861,
+        "rt-1-15pct": 0.667,
+        "rt-1-x": 0.891,
+        "rt-1-begin": 0.278,
+        "octo-base": 0.444,
+    },
+    "google_robot_place_apple_in_closed_top_drawer": {
+        "rt-2-x": 0.037,
+        "rt-1-converged": 0.065,
+        "rt-1-15pct": 0.130,
+        "rt-1-x": 0.213,
+        "rt-1-begin": 0.000,
+        "octo-base": 0.000,
+    },
+    "widowx_spoon_on_towel": {
+        "rt-1-x": 0.000,
+        "octo-base": 0.125,
+        "octo-small": 0.472,
+    },
+    "widowx_carrot_on_plate": {
+        "rt-1-x": 0.042,
+        "octo-base": 0.083,
+        "octo-small": 0.097,
+    },
+    "widowx_stack_cube": {
+        "rt-1-x": 0.000,
+        "octo-base": 0.000,
+        "octo-small": 0.042,
+    },
+    "widowx_put_eggplant_in_basket": {
+        "rt-1-x": 0.000,
+        "octo-base": 0.431,
+        "octo-small": 0.569,
+    },
+}
 
 def pearson_correlation(perf_sim: Sequence[float], perf_real: Sequence[float]) -> float:
     perf_sim, perf_real = np.array(perf_sim), np.array(perf_real)
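
The hunk above shows only the first lines of `pearson_correlation`. For context, a plausible reconstruction of both metric functions is sketched below, based on the definitions in the SIMPLER paper: Pearson correlation between simulated and real success rates, and MMRV as the mean, over policies, of the largest real-world performance gap whose ordering the simulation gets wrong. The exact function bodies in the repository may differ.

```python
from typing import Sequence

import numpy as np


def pearson_correlation(perf_sim: Sequence[float], perf_real: Sequence[float]) -> float:
    perf_sim, perf_real = np.array(perf_sim), np.array(perf_real)
    assert perf_sim.shape == perf_real.shape
    # Off-diagonal entry of the 2x2 correlation matrix is the Pearson r.
    return float(np.corrcoef(perf_sim, perf_real)[0, 1])


def mean_maximum_rank_violation(perf_sim: Sequence[float], perf_real: Sequence[float]) -> float:
    perf_sim, perf_real = np.array(perf_sim), np.array(perf_real)
    assert perf_sim.shape == perf_real.shape
    n = len(perf_sim)
    violations = []
    for i in range(n):
        worst = 0.0
        for j in range(n):
            # A rank violation occurs when sim and real disagree on the
            # ordering of policies i and j; its size is the real-world gap.
            if (perf_sim[i] > perf_sim[j]) != (perf_real[i] > perf_real[j]):
                worst = max(worst, abs(perf_real[i] - perf_real[j]))
        violations.append(worst)
    return float(np.mean(violations))
```

A perfect simulated evaluator would yield MMRV of 0.0 and Pearson r of 1.0 on each task.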

tools/calc_metrics.py

Lines changed: 2 additions & 131 deletions
@@ -2,7 +2,7 @@
 Computes metrics for evaluating simulated evaluation pipelines.
 
 Usage:
-from simpler_env.utils.metrics import mean_maximum_rank_violation, pearson_correlation
+from simpler_env.utils.metrics import mean_maximum_rank_violation, pearson_correlation, REAL_PERF
 
 sim_eval_perf = [
     your_sim_eval(task="google_robot_move_near", policy=p)

@@ -17,136 +17,7 @@
 """
 
 
-from simpler_env.utils.metrics import mean_maximum_rank_violation, pearson_correlation
-
-
-
-REAL_PERF = { # Real robot eval performance --> extract via: REAL_PERF[task][policy]
-    "google_robot_pick_coke_can": {
-        "rt-2-x": 0.907,
-        "rt-1-converged": 0.853,
-        "rt-1-15pct": 0.920,
-        "rt-1-x": 0.760,
-        "rt-1-begin": 0.133,
-        "octo-base": 0.293,
-    },
-    "google_robot_move_near": {
-        "rt-2-x": 0.733,
-        "rt-1-converged": 0.633,
-        "rt-1-15pct": 0.583,
-        "rt-1-x": 0.450,
-        "rt-1-begin": 0.017,
-        "octo-base": 0.350,
-    },
-    "google_robot_open_drawer": {
-        "rt-2-x": 0.333,
-        "rt-1-converged": 0.815,
-        "rt-1-15pct": 0.704,
-        "rt-1-x": 0.519,
-        "rt-1-begin": 0.000,
-        "octo-base": 0.148,
-    },
-    "google_robot_close_drawer": {
-        "rt-2-x": 0.630,
-        "rt-1-converged": 0.926,
-        "rt-1-15pct": 0.889,
-        "rt-1-x": 0.741,
-        "rt-1-begin": 0.000,
-        "octo-base": 0.519,
-    },
-    "google_robot_place_apple_in_closed_top_drawer": {
-        "rt-2-x": 0.074,
-        "rt-1-converged": 0.185,
-        "rt-1-15pct": 0.185,
-        "rt-1-x": 0.407,
-        "rt-1-begin": 0.000,
-        "octo-base": 0.000,
-    },
-    "widowx_spoon_on_towel": {
-        "rt-1-x": 0.000,
-        "octo-base": 0.333,
-        "octo-small": 0.417,
-    },
-    "widowx_carrot_on_plate": {
-        "rt-1-x": 0.000,
-        "octo-base": 0.250,
-        "octo-small": 0.083,
-    },
-    "widowx_stack_cube": {
-        "rt-1-x": 0.000,
-        "octo-base": 0.000,
-        "octo-small": 0.125,
-    },
-    "widowx_put_eggplant_in_basket": {
-        "rt-1-x": 0.000,
-        "octo-base": 0.250,
-        "octo-small": 0.400,
-    },
-}
-
-
-SIMPLER_PERF = { # SIMPLER simulated eval performance --> extract via: SIMPLER_PERF[task][policy]
-    "google_robot_pick_coke_can": {
-        "rt-2-x": 0.787,
-        "rt-1-converged": 0.857,
-        "rt-1-15pct": 0.710,
-        "rt-1-x": 0.567,
-        "rt-1-begin": 0.027,
-        "octo-base": 0.170,
-    },
-    "google_robot_move_near": {
-        "rt-2-x": 0.779,
-        "rt-1-converged": 0.442,
-        "rt-1-15pct": 0.354,
-        "rt-1-x": 0.317,
-        "rt-1-begin": 0.050,
-        "octo-base": 0.042,
-    },
-    "google_robot_open_drawer": {
-        "rt-2-x": 0.157,
-        "rt-1-converged": 0.601,
-        "rt-1-15pct": 0.463,
-        "rt-1-x": 0.296,
-        "rt-1-begin": 0.000,
-        "octo-base": 0.009,
-    },
-    "google_robot_close_drawer": {
-        "rt-2-x": 0.343,
-        "rt-1-converged": 0.861,
-        "rt-1-15pct": 0.667,
-        "rt-1-x": 0.891,
-        "rt-1-begin": 0.278,
-        "octo-base": 0.444,
-    },
-    "google_robot_place_apple_in_closed_top_drawer": {
-        "rt-2-x": 0.037,
-        "rt-1-converged": 0.065,
-        "rt-1-15pct": 0.130,
-        "rt-1-x": 0.213,
-        "rt-1-begin": 0.000,
-        "octo-base": 0.000,
-    },
-    "widowx_spoon_on_towel": {
-        "rt-1-x": 0.000,
-        "octo-base": 0.125,
-        "octo-small": 0.472,
-    },
-    "widowx_carrot_on_plate": {
-        "rt-1-x": 0.042,
-        "octo-base": 0.083,
-        "octo-small": 0.097,
-    },
-    "widowx_stack_cube": {
-        "rt-1-x": 0.000,
-        "octo-base": 0.000,
-        "octo-small": 0.042,
-    },
-    "widowx_put_eggplant_in_basket": {
-        "rt-1-x": 0.000,
-        "octo-base": 0.431,
-        "octo-small": 0.569,
-    },
-}
+from simpler_env.utils.metrics import mean_maximum_rank_violation, pearson_correlation, REAL_PERF, SIMPLER_PERF
 
 
 if __name__ == "__main__":
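
The body of the `__main__` block is not visible in this hunk. A minimal sketch of the pattern the refactored imports suggest, iterating over the shared dicts and reporting both metrics per task, is given below; only the import line is from the commit, the rest is our assumption.

```python
from simpler_env.utils.metrics import (
    REAL_PERF,
    SIMPLER_PERF,
    mean_maximum_rank_violation,
    pearson_correlation,
)

if __name__ == "__main__":
    # Compare SIMPLER's simulated success rates against the real-robot numbers.
    for task, real in REAL_PERF.items():
        policies = list(real.keys())
        real_perf = [real[p] for p in policies]
        sim_perf = [SIMPLER_PERF[task][p] for p in policies]
        mmrv = mean_maximum_rank_violation(sim_perf, real_perf)
        r = pearson_correlation(sim_perf, real_perf)
        print(f"{task}: MMRV={mmrv:.3f}, pearson_r={r:.3f}")
```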
