
Commit 23f25ac
new g2048 clean up
1 parent e90546b

File tree: 7 files changed, +824 −185 lines


pufferlib/config/ocean/g2048.ini

Lines changed: 146 additions & 37 deletions
@@ -1,68 +1,177 @@
 [base]
 package = ocean
 env_name = puffer_g2048
-policy_name = Policy
+policy_name = G2048
 rnn_name = Recurrent
 
 [policy]
-hidden_size = 256
+; hidden_size = 256
+hidden_size = 512
 
 [rnn]
-input_size = 256
-hidden_size = 256
+; input_size = 256
+; hidden_size = 256
+input_size = 512
+hidden_size = 512
 
 [vec]
 num_envs = 4
 
 [env]
 num_envs = 4096
+reward_scaler = 0.67
+endgame_env_prob = 0.05
+scaffolding_ratio = 0.67
+use_heuristic_rewards = True
+snake_reward_weight = 0.0005
 
 [train]
-# https://wandb.ai/kywch/pufferlib/runs/n8xml0u9?nw=nwuserkywch
-total_timesteps = 3_000_000_000
+# 256 hidden: https://wandb.ai/kywch/pufferlib/runs/nvd0pfuj?nw=nwuserkywch
+# 512 hidden: https://wandb.ai/kywch/pufferlib/runs/2ch3my60?nw=nwuserkywch
+total_timesteps = 6_767_676_767
+; total_timesteps = 1_000_000_000
 anneal_lr = True
+min_learning_rate = 0.00005
 batch_size = auto
 bptt_horizon = 64
-minibatch_size = 65536
+minibatch_size = 32768
 
-adam_beta1 = 0.99
-adam_beta2 = 0.96
-adam_eps = 1.0e-10
-clip_coef = 0.1
-ent_coef = 0.02
-gae_lambda = 0.6
-gamma = 0.985
-learning_rate = 0.001
-max_grad_norm = 1.0
-prio_alpha = 0.99
-prio_beta0 = 0.40
-vf_clip_coef = 0.1
+clip_coef = 0.067
+ent_coef = 0.0267
+gae_lambda = 0.67
+gamma = 0.99567
+vf_clip_coef = 0.167
 vf_coef = 2.0
-vtrace_c_clip = 4.3
-vtrace_rho_clip = 1.6
+
+# for 256 hidden
+; learning_rate = 0.0005
+; max_grad_norm = 0.5
+
+# for 512 hidden
+learning_rate = 0.000467
+max_grad_norm = 0.5
+
+
+# These are newer puffer PPO params. Need more sweeping.
+adam_beta1 = 0.99
+adam_beta2 = 0.9999
+adam_eps = 0.0001
+prio_alpha = 0.8
+prio_beta0 = 0.1
+vtrace_c_clip = 2.0
+vtrace_rho_clip = 1.1
 
 
+### Targeted sweep
+
 [sweep]
 metric = score
 goal = maximize
+max_suggestion_cost = 7200
+sweep_only = endgame_env_prob, scaffolding_ratio, snake_reward_weight, learning_rate, max_grad_norm
+downsample = 1
+
+[sweep.env.endgame_env_prob]
+distribution = uniform
+min = 0.0
+mean = 0.03
+max = 0.2
+scale = auto
+
+[sweep.env.scaffolding_ratio]
+distribution = uniform
+min = 0.1
+mean = 0.5
+max = 0.8
+scale = auto
 
-[sweep.train.total_timesteps]
-distribution = log_normal
-min = 3e8
-max = 1e10
-mean = 1e9
-scale = time
+[sweep.env.snake_reward_weight]
+distribution = uniform
+min = 0.0001
+mean = 0.0007
+max = 0.0050
+scale = auto
 
 [sweep.train.learning_rate]
-distribution = log_normal
-min = 0.00001
-mean = 0.001
-max = 0.1
+distribution = uniform
+min = 0.0001
+mean = 0.0005
+max = 0.0030
+scale = 0.5
+
+[sweep.train.max_grad_norm]
+distribution = uniform
+min = 0.1
+mean = 0.5
+max = 2.0
 scale = 0.5
 
-[sweep.train.gae_lambda]
-distribution = logit_normal
-min = 0.01
-mean = 0.6
-max = 0.995
-scale = auto
+[sweep.train.vf_clip_coef]
+distribution = uniform
+min = 0.05
+max = 0.5
+mean = 0.2
+scale = auto
+
+
+### Broad sweep
+
+; [sweep]
+; metric = score
+; goal = maximize
+
+; [sweep.env.reward_scaler]
+; distribution = uniform
+; min = 0.1
+; mean = 0.5
+; max = 1.0
+; scale = auto
+
+; [sweep.env.scaffolding_ratio]
+; distribution = uniform
+; min = 0.0
+; mean = 0.5
+; max = 0.8
+; scale = auto
+
+; [sweep.env.snake_reward_weight]
+; distribution = uniform
+; min = 0.00001
+; mean = 0.00005
+; max = 0.0002
+; scale = auto
+
+; [sweep.train.total_timesteps]
+; distribution = log_normal
+; min = 3e8
+; max = 1e10
+; mean = 1e9
+; scale = time
+
+; [sweep.train.learning_rate]
+; distribution = log_normal
+; min = 0.00001
+; mean = 0.001
+; max = 0.1
+; scale = 0.5
+
+; [sweep.train.gamma]
+; distribution = logit_normal
+; min = 0.8
+; mean = 0.995
+; max = 0.9999
+; scale = auto
+
+; [sweep.train.gae_lambda]
+; distribution = logit_normal
+; min = 0.01
+; mean = 0.7
+; max = 0.995
+; scale = auto
+
+; [sweep.train.clip_coef]
+; distribution = log_normal
+; min = 0.001
+; max = 0.5
+; mean = 0.05
+; scale = auto
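
Side note (not part of the commit): the new [env] keys above can also be overridden programmatically before launching a run, using the same pufferl.load_config / pufferl.train calls that eval.py further down already uses. The override values in this sketch are hypothetical examples, not recommendations.

from pufferlib import pufferl

# Minimal sketch, assuming the pufferl API shown in eval.py below.
args = pufferl.load_config('puffer_g2048')         # loads the puffer_g2048 config (the ini above)
args['env']['endgame_env_prob'] = 0.1              # hypothetical override; swept in [sweep.env.endgame_env_prob]
args['env']['scaffolding_ratio'] = 0.5             # hypothetical override; swept in [sweep.env.scaffolding_ratio]
args['env']['snake_reward_weight'] = 0.001         # hypothetical override; swept in [sweep.env.snake_reward_weight]
args['train']['total_timesteps'] = 1_000_000_000   # shorter run than the 6_767_676_767 default
pufferl.train('puffer_g2048', args)                # same entry point finetune() uses in eval.py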

pufferlib/ocean/g2048/binding.c

Lines changed: 14 additions & 2 deletions
@@ -3,9 +3,15 @@
 #define Env Game
 #include "../env_binding.h"
 
-// g2048.h does not have a 'size' field, so my_init can just return 0
 static int my_init(Env* env, PyObject* args, PyObject* kwargs) {
-    // No custom initialization needed for 2048
+    env->can_go_over_65536 = unpack(kwargs, "can_go_over_65536");
+    env->reward_scaler = unpack(kwargs, "reward_scaler");
+    env->endgame_env_prob = unpack(kwargs, "endgame_env_prob");
+    env->scaffolding_ratio = unpack(kwargs, "scaffolding_ratio");
+    env->use_heuristic_rewards = unpack(kwargs, "use_heuristic_rewards");
+    env->snake_reward_weight = unpack(kwargs, "snake_reward_weight");
+    env->use_sparse_reward = unpack(kwargs, "use_sparse_reward");
+    init(env);
     return 0;
 }
 
@@ -15,5 +21,11 @@ static int my_log(PyObject* dict, Log* log) {
     assign_to_dict(dict, "merge_score", log->merge_score);
     assign_to_dict(dict, "episode_return", log->episode_return);
     assign_to_dict(dict, "episode_length", log->episode_length);
+    assign_to_dict(dict, "lifetime_max_tile", log->lifetime_max_tile);
+    assign_to_dict(dict, "reached_32768", log->reached_32768);
+    assign_to_dict(dict, "reached_65536", log->reached_65536);
+    assign_to_dict(dict, "monotonicity_reward", log->monotonicity_reward);
+    assign_to_dict(dict, "snake_state", log->snake_state);
+    assign_to_dict(dict, "snake_reward", log->snake_reward);
     return 0;
 }
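
The new snake_state / snake_reward / monotonicity_reward log fields correspond to the heuristic shaping terms controlled by use_heuristic_rewards and snake_reward_weight in the ini. The heuristic itself lives in g2048.h, which is not part of this excerpt, so the Python sketch below is only a guess at what a snake-pattern score for a 2048 board typically looks like; the function name, decay schedule, and example board are assumptions, not the commit's implementation.

# Illustrative sketch only: the actual heuristic is in g2048.h (not shown here).
import numpy as np

def snake_heuristic(board: np.ndarray, weight: float = 0.0005) -> float:
    """Score a 4x4 board of tile values along a boustrophedon ("snake") path.

    Tiles are visited row by row, alternating direction, and weighted by a
    geometrically decaying factor, so keeping the large tiles ordered along
    the snake path yields a higher score. `weight` mirrors snake_reward_weight.
    """
    path = []
    for r in range(board.shape[0]):
        row = board[r] if r % 2 == 0 else board[r][::-1]
        path.extend(row.tolist())
    decay = 0.5 ** np.arange(len(path))   # hypothetical decay schedule
    return weight * float(np.dot(decay, path))

# Example: a board with the largest tiles packed along the snake path scores high.
board = np.array([
    [2048, 1024, 512, 256],
    [  16,   32,  64, 128],
    [   8,    4,   2,   0],
    [   0,    0,   0,   0],
])
print(snake_heuristic(board))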

pufferlib/ocean/g2048/eval.py

Lines changed: 84 additions & 0 deletions
@@ -0,0 +1,84 @@
+from pufferlib import pufferl
+
+def evaluate(env_name, load_model_path):
+    args = pufferl.load_config(env_name)
+    args['vec']['num_envs'] = 1
+    args['env']['num_envs'] = 4096
+    args['load_model_path'] = load_model_path
+    # Turn off endgame envs and scaffolding episodes, which do not report results
+    args['env']['endgame_env_prob'] = 0
+    args['env']['scaffolding_ratio'] = 0
+    args['env']['can_go_over_65536'] = True
+
+    vecenv = pufferl.load_env(env_name, args)
+    policy = pufferl.load_policy(args, vecenv, env_name)
+    trainer = pufferl.PuffeRL(args['train'], vecenv, policy)
+
+    # Each evaluate() call runs for 64 ticks. NOTE: the bptt horizon might be short for g2048?
+    # Avg episode length from the current model is ~18000, so an avg episode takes ~300 epochs.
+    # It's hard to get the single best score because stats are already averaged across done envs.
+    for i in range(10000):
+        stats = trainer.evaluate()
+
+        trainer.epoch += 1
+        if i % 20 == 0:
+            trainer.print_dashboard()
+
+    trainer.close()
+
+    # Get the estimates
+    num_episodes = sum(stats['n'])
+    episode_lengths = sum(n * l for n, l in zip(stats['n'], stats['episode_length'])) / num_episodes
+    max_tiles = sum(n * m for n, m in zip(stats['n'], stats['score'])) / num_episodes
+    merge_scores = sum(n * s for n, s in zip(stats['n'], stats['merge_score'])) / num_episodes
+    reached_32768 = sum(n * s for n, s in zip(stats['n'], stats['reached_32768'])) / num_episodes
+    reached_65536 = sum(n * s for n, s in zip(stats['n'], stats['reached_65536'])) / num_episodes
+
+    print(f"Num episodes: {int(num_episodes)}")
+    print(f"Max tile avg: {max_tiles:.1f}")
+    # The stats from vecenv are averaged across envs that were done in the same tick. Cannot get the single max.
+    print(f"Episode length -- Avg: {episode_lengths:.1f}, Max: {max(stats['episode_length']):.1f}")
+    print(f"Merge score -- Avg: {merge_scores:.1f}, Max: {max(stats['merge_score']):.1f}")
+    print(f"Reached 32768 prob: {reached_32768*100:.2f} %")
+    print(f"Reached 65536 prob: {reached_65536*100:.2f} %")
+
+"""
+# hidden 256: https://wandb.ai/kywch/pufferlib/runs/nvd0pfuj?nw=nwuserkywch
+Num episodes: 154406
+Max tile avg: 22532.9
+Episode length -- Avg: 16667.2, Max: 26659.1
+Merge score -- Avg: 462797.9, Max: 744224.9
+Reached 32768 prob: 46.08 %
+Reached 65536 prob: 3.53 %
+
+# hidden 512: https://wandb.ai/kywch/pufferlib/runs/2ch3my60?nw=nwuserkywch
+Num episodes: 119243
+Max tile avg: 30662.2
+Episode length -- Avg: 21539.7, Max: 29680.3
+Merge score -- Avg: 618011.8, Max: 918755.8
+Reached 32768 prob: 68.25 %
+Reached 65536 prob: 13.09 %
+"""
+
+def finetune(env_name, load_model_path):
+    args = pufferl.load_config(env_name)
+    args['load_model_path'] = load_model_path
+    # args['env']['use_sparse_reward'] = True
+    args['env']['scaffolding_ratio'] = 0.85
+
+    # args['policy']['hidden_size'] = 512
+    # args['rnn']['input_size'] = 512
+    # args['rnn']['hidden_size'] = 512
+
+    args['train']['total_timesteps'] = 1_000_000_000
+    args['train']['learning_rate'] = 0.00005
+    args['train']['anneal_lr'] = False
+
+    args['wandb'] = True
+    args['tag'] = 'pg2048'
+
+    pufferl.train(env_name, args)
+
+if __name__ == '__main__':
+    evaluate('puffer_g2048', load_model_path='puffer_g2048_2ch3my60.pt')
+    # finetune('puffer_g2048', load_model_path='puffer_g2048_256_base.pt')
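
One note on the aggregation in evaluate() above: vecenv reports each stat as a per-tick average over the envs that finished during that tick, together with the episode count n, so the script re-weights by n before averaging. A toy example with made-up numbers:

# Toy numbers only, to illustrate the n-weighted averaging used in evaluate().
n = [3, 1]                             # episodes finished in two different ticks
episode_length = [18000.0, 22000.0]    # per-tick averages reported by vecenv

num_episodes = sum(n)                  # 4
avg_length = sum(c * l for c, l in zip(n, episode_length)) / num_episodes
print(avg_length)                      # (3*18000 + 1*22000) / 4 = 19000.0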
