
Commit dd2b3a5

polish(zjow): rename 'eval reward' -> 'episode return' (#536)
* fix typo: 'final eval reward' -> 'eval episode return'; 'episode reward' -> 'episode return'
* formatting
* fix typo
* Polish code.
1 parent 01b1a8b commit dd2b3a5
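For context, the rename touches the env interface contract: when an episode finishes, an env now reports its whole-episode return under info['eval_episode_return'] instead of the old info['final_eval_reward'] (see the env_implementation_check.py and demo_env.py hunks below). A minimal, hypothetical sketch of that contract; the wrapped self._env and the running-return bookkeeping are illustrative only, not code from this commit:

from ding.envs import BaseEnvTimestep


class ReturnTrackingEnv:
    """Hypothetical wrapper showing where the renamed key gets filled in."""

    def __init__(self, env):
        self._env = env
        self._episode_return = 0.0

    def reset(self):
        self._episode_return = 0.0
        return self._env.reset()

    def step(self, action):
        obs, rew, done, info = self._env.step(action)
        self._episode_return += float(rew)
        if done:
            # Renamed in this commit: info['final_eval_reward'] -> info['eval_episode_return'].
            info['eval_episode_return'] = self._episode_return
        return BaseEnvTimestep(obs, rew, done, info)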

File tree

94 files changed (+492, -501 lines)


ding/design/serial_evaluator-activity.puml

Lines changed: 1 addition & 1 deletion
@@ -26,6 +26,6 @@ repeat
 endif
 repeat while (evaluate episodes are not enough?)
 |#FFCCCC|evaluator|
-:return eval_episode_reward;
+:return eval_episode_return;
 stop
 @enduml

ding/design/serial_main.puml

Lines changed: 1 addition & 1 deletion
@@ -31,7 +31,7 @@ loop
 evaluator -> evaluator: eval_performance
 alt reach eval stop_value
 learner -> learner: save checkpoint and exit
-else eval_reward is new highest
+else episode_return is new highest
 learner -> learner: save checkpoint
 end
 end

ding/entry/application_entry.py

Lines changed: 6 additions & 6 deletions
@@ -72,10 +72,10 @@ def eval(

 # Evaluate
 _, episode_info = evaluator.eval()
-reward = [e['final_eval_reward'] for e in episode_info]
-eval_reward = np.mean(to_ndarray(reward))
-print('Eval is over! The performance of your RL policy is {}'.format(eval_reward))
-return eval_reward
+reward = [e['eval_episode_return'] for e in episode_info]
+episode_return = np.mean(to_ndarray(reward))
+print('Eval is over! The performance of your RL policy is {}'.format(episode_return))
+return episode_return


 def collect_demo_data(
@@ -271,8 +271,8 @@ def episode_to_transitions_filter(data_path: str, expert_data_path: str, nstep:
 _dict = pickle.load(f)  # class is list; length is cfg.reward_model.collect_count
 post_process_data = []
 for i in range(len(_dict)):
-episode_rewards = torch.stack([_dict[i][j]['reward'] for j in range(_dict[i].__len__())], axis=0)
-if episode_rewards.sum() < min_episode_return:
+episode_returns = torch.stack([_dict[i][j]['reward'] for j in range(_dict[i].__len__())], axis=0)
+if episode_returns.sum() < min_episode_return:
 continue
 data = get_nstep_return_data(_dict[i], nstep)
 post_process_data.extend(data)
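A hedged usage sketch of the changed return value of ding.entry.eval; the checkpoint path is hypothetical and the CartPole off-policy PPO config import path is an assumption based on the test file further down in this diff, so adjust it to your checkout:

import torch
from copy import deepcopy

from ding.entry import eval
# Import path assumed; the config names appear in tests/test_application_entry.py below.
from dizoo.classic_control.cartpole.config.cartpole_offppo_config import \
    cartpole_offppo_config, cartpole_offppo_create_config

config = deepcopy(cartpole_offppo_config), deepcopy(cartpole_offppo_create_config)
state_dict = torch.load('./ckpt_best.pth.tar', map_location='cpu')  # hypothetical checkpoint path
episode_return = eval(config, seed=0, state_dict=state_dict)
# eval() now returns the mean 'eval_episode_return' over evaluation episodes (was 'eval reward').
print(episode_return)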

ding/entry/serial_entry.py

Lines changed: 1 addition & 1 deletion
@@ -121,7 +121,7 @@ def serial_pipeline(
 import pickle
 import numpy as np
 with open(os.path.join(cfg.exp_name, 'result.pkl'), 'wb') as f:
-eval_value_raw = [d['final_eval_reward'] for d in eval_info]
+eval_value_raw = [d['eval_episode_return'] for d in eval_info]
 final_data = {
 'stop': stop,
 'env_step': collector.envstep,

ding/entry/serial_entry_gail.py

Lines changed: 1 addition & 1 deletion
@@ -127,7 +127,7 @@ def serial_pipeline_gail(
 # Evaluate policy performance
 if evaluator.should_eval(learner.train_iter):
 stop, reward = evaluator.eval(learner.save_checkpoint, learner.train_iter, collector.envstep)
-reward_mean = np.array([r['final_eval_reward'] for r in reward]).mean()
+reward_mean = np.array([r['eval_episode_return'] for r in reward]).mean()
 if reward_mean >= best_reward:
 save_reward_model(cfg.exp_name, reward_model, 'best')
 best_reward = reward_mean

ding/entry/serial_entry_onpolicy.py

Lines changed: 1 addition & 1 deletion
@@ -102,7 +102,7 @@ def serial_pipeline_onpolicy(
 import pickle
 import numpy as np
 with open(os.path.join(cfg.exp_name, 'result.pkl'), 'wb') as f:
-eval_value_raw = [d['final_eval_reward'] for d in eval_info]
+eval_value_raw = [d['eval_episode_return'] for d in eval_info]
 final_data = {
 'stop': stop,
 'env_step': collector.envstep,

ding/entry/tests/test_application_entry.py

Lines changed: 4 additions & 4 deletions
@@ -34,16 +34,16 @@ def test_eval(self, setup_state_dict):
 cfg_for_stop_value = compile_config(cartpole_offppo_config, auto=True, create_cfg=cartpole_offppo_create_config)
 stop_value = cfg_for_stop_value.env.stop_value
 config = deepcopy(cartpole_offppo_config), deepcopy(cartpole_offppo_create_config)
-eval_reward = eval(config, seed=0, state_dict=setup_state_dict['eval'])
-assert eval_reward >= stop_value
+episode_return = eval(config, seed=0, state_dict=setup_state_dict['eval'])
+assert episode_return >= stop_value
 config = deepcopy(cartpole_offppo_config), deepcopy(cartpole_offppo_create_config)
-eval_reward = eval(
+episode_return = eval(
 config,
 seed=0,
 env_setting=[CartPoleEnv, None, [{} for _ in range(5)]],
 state_dict=setup_state_dict['eval']
 )
-assert eval_reward >= stop_value
+assert episode_return >= stop_value

 def test_collect_demo_data(self, setup_state_dict):
 config = deepcopy(cartpole_offppo_config), deepcopy(cartpole_offppo_create_config)

ding/envs/env/default_wrapper.py

Lines changed: 5 additions & 5 deletions
@@ -2,14 +2,14 @@
 from typing import Optional, List
 import copy

-final_eval_reward_wrapper = EasyDict(type='final_eval_reward')
+eval_episode_return_wrapper = EasyDict(type='eval_episode_return')


 def get_default_wrappers(env_wrapper_name: str, env_id: Optional[str] = None) -> List[dict]:
 if env_wrapper_name == 'mujoco_default':
 return [
 EasyDict(type='delay_reward', kwargs=dict(delay_reward_step=3)),
-copy.deepcopy(final_eval_reward_wrapper),
+copy.deepcopy(eval_episode_return_wrapper),
 ]
 elif env_wrapper_name == 'atari_default':
 wrapper_list = []
@@ -23,14 +23,14 @@ def get_default_wrappers(env_wrapper_name: str, env_id: Optional[str] = None) ->
 wrapper_list.append(EasyDict(type='scaled_float_frame'))
 wrapper_list.append(EasyDict(type='clip_reward'))
 wrapper_list.append(EasyDict(type='frame_stack', kwargs=dict(n_frames=4)))
-wrapper_list.append(copy.deepcopy(final_eval_reward_wrapper))
+wrapper_list.append(copy.deepcopy(eval_episode_return_wrapper))
 return wrapper_list
 elif env_wrapper_name == 'gym_hybrid_default':
 return [
 EasyDict(type='gym_hybrid_dict_action'),
-copy.deepcopy(final_eval_reward_wrapper),
+copy.deepcopy(eval_episode_return_wrapper),
 ]
 elif env_wrapper_name == 'default':
-return [copy.deepcopy(final_eval_reward_wrapper)]
+return [copy.deepcopy(eval_episode_return_wrapper)]
 else:
 raise NotImplementedError()
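A short hedged sketch of what the 'default' branch above now yields, based only on the code in this hunk:

from ding.envs.env.default_wrapper import get_default_wrappers

wrappers = get_default_wrappers('default')
# After this commit the wrapper spec type is 'eval_episode_return' (previously 'final_eval_reward').
assert wrappers[0].type == 'eval_episode_return'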

ding/envs/env/env_implementation_check.py

Lines changed: 2 additions & 2 deletions
@@ -74,7 +74,7 @@ def check_step(env: BaseEnv) -> None:
 for ndarray, space, name in zip([obs, rew], [env.observation_space, env.reward_space], ['obs', 'rew']):
 check_array_space(ndarray, space, name)
 if done:
-assert 'final_eval_reward' in info, "info dict should have 'final_eval_reward' key."
+assert 'eval_episode_return' in info, "info dict should have 'eval_episode_return' key."
 done_times += 1
 _ = env.reset()
 if done_times == 3:
@@ -163,7 +163,7 @@ def demonstrate_correct_procedure(env_fn: Callable) -> None:
 action = env.random_action()
 obs, rew, done, info = env.step(action)
 if done:
-assert 'final_eval_reward' in info
+assert 'eval_episode_return' in info
 done_times += 1
 obs = env.reset()
 # Seed will not change unless `seed` method is called again.

ding/envs/env/tests/demo_env.py

Lines changed: 1 addition & 1 deletion
@@ -59,7 +59,7 @@ def step(self, action: Any) -> 'BaseEnv.timestep':
 done = False
 info = {}
 if done:
-info['final_eval_reward'] = self.reward_space.sample() * 30
+info['eval_episode_return'] = self.reward_space.sample() * 30
 return BaseEnvTimestep(obs, rew, done, info)

 def seed(self, seed: int) -> None:
