
Commit 6f5a5f7: refactor metrics

1 parent 69bca18

3 files changed (+133, −138 lines)


README.md

Lines changed: 5 additions & 7 deletions
@@ -19,7 +19,7 @@ We hope that our work guides and inspires future real-to-sim evaluation efforts.
 - [Installation](#installation)
 - [Examples](#examples)
 - [Current Environments](#current-environments)
-- [Compare Your Approach to SIMPLER](#compare-your-approach-to-simpler)
+- [Compare Your Policy Evaluation Approach to SIMPLER](#compare-your-policy-evaluation-approach-to-simpler)
 - [Code Structure](#code-structure)
 - [Adding New Policies](#adding-new-policies)
 - [Adding New Real-to-Sim Evaluation Environments and Robots](#adding-new-real-to-sim-evaluation-environments-and-robots)

@@ -128,17 +128,15 @@ We also support creating sub-tasks variations such as `google_robot_pick_{horizo
 By default, Google Robot environments use a control frequency of 3hz, and Bridge environments use a control frequency of 5hz. Simulation frequency is ~500hz.
 
 
-## Compare Your Approach to SIMPLER
+## Compare Your Policy Evaluation Approach to SIMPLER
 
-We make it easy to compare your approach for offline robot policy evaluation to SIMPLER. In [our paper](https://simpler-env.github.io/) we use two metrics to measure the quality of simulated evaluation pipelines: Mean Maximum Rank Violation (MMRV) and the Pearson Correlation Coefficient.
-Both capture, how well the offline evaluations reflect the policy's real-world performance during robot rollouts.
+We make it easy to compare your offline robot policy evaluation approach to SIMPLER. In [our paper](https://simpler-env.github.io/), we use two metrics to measure the quality of simulated evaluation pipelines: Mean Maximum Rank Violation (MMRV) and the Pearson Correlation Coefficient. Both capture how well the offline evaluations reflect the policy's real-world performance and behaviors during robot rollouts.
 
-To make comparisons easy, we provide all our raw policy evaluation data: performance values for all policies on all real-world tasks. We also provide the corresponding estimates for policy performance of SIMPLER and functions for computing the aforementioned metrics we report in the paper.
+To make comparisons easy, we provide our real and SIMPLER evaluation performance for all policies on all tasks. We also provide the corresponding functions for computing the aforementioned metrics we report in the paper.
 
 To compute the corresponding metrics for *your* offline policy evaluation approach `your_sim_eval(task, policy)`, you can use the following snippet:
 ```
-from simpler_env.utils.metrics import mean_maximum_rank_violation, pearson_correlation
-from tools.calc_metrics import REAL_PERF
+from simpler_env.utils.metrics import mean_maximum_rank_violation, pearson_correlation, REAL_PERF
 
 sim_eval_perf = [
     your_sim_eval(task="google_robot_move_near", policy=p)
simpler_env/utils/metrics.py

Lines changed: 126 additions & 0 deletions
@@ -4,6 +4,132 @@
 
 import numpy as np
 
+REAL_PERF = { # Real robot eval performance --> extract via: REAL_PERF[task][policy]
+    "google_robot_pick_coke_can": {
+        "rt-2-x": 0.907,
+        "rt-1-converged": 0.853,
+        "rt-1-15pct": 0.920,
+        "rt-1-x": 0.760,
+        "rt-1-begin": 0.133,
+        "octo-base": 0.293,
+    },
+    "google_robot_move_near": {
+        "rt-2-x": 0.733,
+        "rt-1-converged": 0.633,
+        "rt-1-15pct": 0.583,
+        "rt-1-x": 0.450,
+        "rt-1-begin": 0.017,
+        "octo-base": 0.350,
+    },
+    "google_robot_open_drawer": {
+        "rt-2-x": 0.333,
+        "rt-1-converged": 0.815,
+        "rt-1-15pct": 0.704,
+        "rt-1-x": 0.519,
+        "rt-1-begin": 0.000,
+        "octo-base": 0.148,
+    },
+    "google_robot_close_drawer": {
+        "rt-2-x": 0.630,
+        "rt-1-converged": 0.926,
+        "rt-1-15pct": 0.889,
+        "rt-1-x": 0.741,
+        "rt-1-begin": 0.000,
+        "octo-base": 0.519,
+    },
+    "google_robot_place_apple_in_closed_top_drawer": {
+        "rt-2-x": 0.074,
+        "rt-1-converged": 0.185,
+        "rt-1-15pct": 0.185,
+        "rt-1-x": 0.407,
+        "rt-1-begin": 0.000,
+        "octo-base": 0.000,
+    },
+    "widowx_spoon_on_towel": {
+        "rt-1-x": 0.000,
+        "octo-base": 0.333,
+        "octo-small": 0.417,
+    },
+    "widowx_carrot_on_plate": {
+        "rt-1-x": 0.000,
+        "octo-base": 0.250,
+        "octo-small": 0.083,
+    },
+    "widowx_stack_cube": {
+        "rt-1-x": 0.000,
+        "octo-base": 0.000,
+        "octo-small": 0.125,
+    },
+    "widowx_put_eggplant_in_basket": {
+        "rt-1-x": 0.000,
+        "octo-base": 0.250,
+        "octo-small": 0.400,
+    },
+}
+
+
+SIMPLER_PERF = { # SIMPLER simulated eval performance --> extract via: SIMPLER_PERF[task][policy]
+    "google_robot_pick_coke_can": {
+        "rt-2-x": 0.787,
+        "rt-1-converged": 0.857,
+        "rt-1-15pct": 0.710,
+        "rt-1-x": 0.567,
+        "rt-1-begin": 0.027,
+        "octo-base": 0.170,
+    },
+    "google_robot_move_near": {
+        "rt-2-x": 0.779,
+        "rt-1-converged": 0.442,
+        "rt-1-15pct": 0.354,
+        "rt-1-x": 0.317,
+        "rt-1-begin": 0.050,
+        "octo-base": 0.042,
+    },
+    "google_robot_open_drawer": {
+        "rt-2-x": 0.157,
+        "rt-1-converged": 0.601,
+        "rt-1-15pct": 0.463,
+        "rt-1-x": 0.296,
+        "rt-1-begin": 0.000,
+        "octo-base": 0.009,
+    },
+    "google_robot_close_drawer": {
+        "rt-2-x": 0.343,
+        "rt-1-converged": 0.861,
+        "rt-1-15pct": 0.667,
+        "rt-1-x": 0.891,
+        "rt-1-begin": 0.278,
+        "octo-base": 0.444,
+    },
+    "google_robot_place_apple_in_closed_top_drawer": {
+        "rt-2-x": 0.037,
+        "rt-1-converged": 0.065,
+        "rt-1-15pct": 0.130,
+        "rt-1-x": 0.213,
+        "rt-1-begin": 0.000,
+        "octo-base": 0.000,
+    },
+    "widowx_spoon_on_towel": {
+        "rt-1-x": 0.000,
+        "octo-base": 0.125,
+        "octo-small": 0.472,
+    },
+    "widowx_carrot_on_plate": {
+        "rt-1-x": 0.042,
+        "octo-base": 0.083,
+        "octo-small": 0.097,
+    },
+    "widowx_stack_cube": {
+        "rt-1-x": 0.000,
+        "octo-base": 0.000,
+        "octo-small": 0.042,
+    },
+    "widowx_put_eggplant_in_basket": {
+        "rt-1-x": 0.000,
+        "octo-base": 0.431,
+        "octo-small": 0.569,
+    },
+}
 
 def pearson_correlation(perf_sim: Sequence[float], perf_real: Sequence[float]) -> float:
     perf_sim, perf_real = np.array(perf_sim), np.array(perf_real)
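
The hunk above shows only the first lines of `pearson_correlation`. For context, a plausible reconstruction of both metric functions is sketched below, based on the definitions in the SIMPLER paper: Pearson correlation between simulated and real success rates, and MMRV as the mean, over policies, of the largest real-world performance gap whose ordering the simulation gets wrong. The exact function bodies in the repository may differ.

```python
from typing import Sequence

import numpy as np


def pearson_correlation(perf_sim: Sequence[float], perf_real: Sequence[float]) -> float:
    perf_sim, perf_real = np.array(perf_sim), np.array(perf_real)
    assert perf_sim.shape == perf_real.shape
    # Off-diagonal entry of the 2x2 correlation matrix is the Pearson r.
    return float(np.corrcoef(perf_sim, perf_real)[0, 1])


def mean_maximum_rank_violation(perf_sim: Sequence[float], perf_real: Sequence[float]) -> float:
    perf_sim, perf_real = np.array(perf_sim), np.array(perf_real)
    assert perf_sim.shape == perf_real.shape
    n = len(perf_sim)
    violations = []
    for i in range(n):
        worst = 0.0
        for j in range(n):
            # A rank violation occurs when sim and real disagree on the
            # ordering of policies i and j; its size is the real-world gap.
            if (perf_sim[i] > perf_sim[j]) != (perf_real[i] > perf_real[j]):
                worst = max(worst, abs(perf_real[i] - perf_real[j]))
        violations.append(worst)
    return float(np.mean(violations))
```

A perfect simulated evaluator would yield MMRV of 0.0 and Pearson r of 1.0 on each task.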

tools/calc_metrics.py

Lines changed: 2 additions & 131 deletions
@@ -2,7 +2,7 @@
 Computes metrics for evaluating simulated evaluation pipelines.
 
 Usage:
-from simpler_env.utils.metrics import mean_maximum_rank_violation, pearson_correlation
+from simpler_env.utils.metrics import mean_maximum_rank_violation, pearson_correlation, REAL_PERF
 
 sim_eval_perf = [
     your_sim_eval(task="google_robot_move_near", policy=p)

@@ -17,136 +17,7 @@
 """
 
 
-from simpler_env.utils.metrics import mean_maximum_rank_violation, pearson_correlation
-
-
-
-REAL_PERF = { # Real robot eval performance --> extract via: REAL_PERF[task][policy]
-    "google_robot_pick_coke_can": {
-        "rt-2-x": 0.907,
-        "rt-1-converged": 0.853,
-        "rt-1-15pct": 0.920,
-        "rt-1-x": 0.760,
-        "rt-1-begin": 0.133,
-        "octo-base": 0.293,
-    },
-    "google_robot_move_near": {
-        "rt-2-x": 0.733,
-        "rt-1-converged": 0.633,
-        "rt-1-15pct": 0.583,
-        "rt-1-x": 0.450,
-        "rt-1-begin": 0.017,
-        "octo-base": 0.350,
-    },
-    "google_robot_open_drawer": {
-        "rt-2-x": 0.333,
-        "rt-1-converged": 0.815,
-        "rt-1-15pct": 0.704,
-        "rt-1-x": 0.519,
-        "rt-1-begin": 0.000,
-        "octo-base": 0.148,
-    },
-    "google_robot_close_drawer": {
-        "rt-2-x": 0.630,
-        "rt-1-converged": 0.926,
-        "rt-1-15pct": 0.889,
-        "rt-1-x": 0.741,
-        "rt-1-begin": 0.000,
-        "octo-base": 0.519,
-    },
-    "google_robot_place_apple_in_closed_top_drawer": {
-        "rt-2-x": 0.074,
-        "rt-1-converged": 0.185,
-        "rt-1-15pct": 0.185,
-        "rt-1-x": 0.407,
-        "rt-1-begin": 0.000,
-        "octo-base": 0.000,
-    },
-    "widowx_spoon_on_towel": {
-        "rt-1-x": 0.000,
-        "octo-base": 0.333,
-        "octo-small": 0.417,
-    },
-    "widowx_carrot_on_plate": {
-        "rt-1-x": 0.000,
-        "octo-base": 0.250,
-        "octo-small": 0.083,
-    },
-    "widowx_stack_cube": {
-        "rt-1-x": 0.000,
-        "octo-base": 0.000,
-        "octo-small": 0.125,
-    },
-    "widowx_put_eggplant_in_basket": {
-        "rt-1-x": 0.000,
-        "octo-base": 0.250,
-        "octo-small": 0.400,
-    },
-}
-
-
-SIMPLER_PERF = { # SIMPLER simulated eval performance --> extract via: SIMPLER_PERF[task][policy]
-    "google_robot_pick_coke_can": {
-        "rt-2-x": 0.787,
-        "rt-1-converged": 0.857,
-        "rt-1-15pct": 0.710,
-        "rt-1-x": 0.567,
-        "rt-1-begin": 0.027,
-        "octo-base": 0.170,
-    },
-    "google_robot_move_near": {
-        "rt-2-x": 0.779,
-        "rt-1-converged": 0.442,
-        "rt-1-15pct": 0.354,
-        "rt-1-x": 0.317,
-        "rt-1-begin": 0.050,
-        "octo-base": 0.042,
-    },
-    "google_robot_open_drawer": {
-        "rt-2-x": 0.157,
-        "rt-1-converged": 0.601,
-        "rt-1-15pct": 0.463,
-        "rt-1-x": 0.296,
-        "rt-1-begin": 0.000,
-        "octo-base": 0.009,
-    },
-    "google_robot_close_drawer": {
-        "rt-2-x": 0.343,
-        "rt-1-converged": 0.861,
-        "rt-1-15pct": 0.667,
-        "rt-1-x": 0.891,
-        "rt-1-begin": 0.278,
-        "octo-base": 0.444,
-    },
-    "google_robot_place_apple_in_closed_top_drawer": {
-        "rt-2-x": 0.037,
-        "rt-1-converged": 0.065,
-        "rt-1-15pct": 0.130,
-        "rt-1-x": 0.213,
-        "rt-1-begin": 0.000,
-        "octo-base": 0.000,
-    },
-    "widowx_spoon_on_towel": {
-        "rt-1-x": 0.000,
-        "octo-base": 0.125,
-        "octo-small": 0.472,
-    },
-    "widowx_carrot_on_plate": {
-        "rt-1-x": 0.042,
-        "octo-base": 0.083,
-        "octo-small": 0.097,
-    },
-    "widowx_stack_cube": {
-        "rt-1-x": 0.000,
-        "octo-base": 0.000,
-        "octo-small": 0.042,
-    },
-    "widowx_put_eggplant_in_basket": {
-        "rt-1-x": 0.000,
-        "octo-base": 0.431,
-        "octo-small": 0.569,
-    },
-}
+from simpler_env.utils.metrics import mean_maximum_rank_violation, pearson_correlation, REAL_PERF, SIMPLER_PERF
 
 
 if __name__ == "__main__":
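
The body of the `__main__` block is not visible in this hunk. A minimal sketch of the pattern the refactored imports suggest, iterating over the shared dicts and reporting both metrics per task, is given below; only the import line is from the commit, the rest is our assumption.

```python
from simpler_env.utils.metrics import (
    REAL_PERF,
    SIMPLER_PERF,
    mean_maximum_rank_violation,
    pearson_correlation,
)

if __name__ == "__main__":
    # Compare SIMPLER's simulated success rates against the real-robot numbers.
    for task, real in REAL_PERF.items():
        policies = list(real.keys())
        real_perf = [real[p] for p in policies]
        sim_perf = [SIMPLER_PERF[task][p] for p in policies]
        mmrv = mean_maximum_rank_violation(sim_perf, real_perf)
        r = pearson_correlation(sim_perf, real_perf)
        print(f"{task}: MMRV={mmrv:.3f}, pearson_r={r:.3f}")
```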
