Commit 23babab

Merge pull request #417 from kywch/g2048-65k
New baseline for G2048
2 parents: 17a0a1b + 473911d

File tree: 9 files changed (+897, -185 lines)

pufferlib/config/ocean/g2048.ini

Lines changed: 136 additions & 37 deletions
@@ -1,68 +1,167 @@
 [base]
 package = ocean
 env_name = puffer_g2048
-policy_name = Policy
+policy_name = G2048
 rnn_name = Recurrent

 [policy]
-hidden_size = 256
+hidden_size = 512

 [rnn]
-input_size = 256
-hidden_size = 256
+input_size = 512
+hidden_size = 512

 [vec]
 num_envs = 4

 [env]
 num_envs = 4096
+reward_scaler = 0.67
+endgame_env_prob = 0.05
+scaffolding_ratio = 0.67
+use_heuristic_rewards = True
+snake_reward_weight = 0.0005

 [train]
-# https://wandb.ai/kywch/pufferlib/runs/n8xml0u9?nw=nwuserkywch
-total_timesteps = 3_000_000_000
+# 512 hidden: https://wandb.ai/kywch/pufferlib/runs/5thsjr61?nw=nwuserkywch
+total_timesteps = 6_767_676_767
 anneal_lr = True
+min_learning_rate = 0.00005
 batch_size = auto
 bptt_horizon = 64
-minibatch_size = 65536
+minibatch_size = 32768

-adam_beta1 = 0.99
-adam_beta2 = 0.96
-adam_eps = 1.0e-10
-clip_coef = 0.1
-ent_coef = 0.02
-gae_lambda = 0.6
-gamma = 0.985
-learning_rate = 0.001
-max_grad_norm = 1.0
-prio_alpha = 0.99
-prio_beta0 = 0.40
-vf_clip_coef = 0.1
+clip_coef = 0.067
+ent_coef = 0.0267
+gae_lambda = 0.67
+gamma = 0.99567
+vf_clip_coef = 0.167
 vf_coef = 2.0
-vtrace_c_clip = 4.3
-vtrace_rho_clip = 1.6

+learning_rate = 0.000467
+max_grad_norm = 0.5
+
+
+# These are newer puffer PPO params. Need more sweeping.
+adam_beta1 = 0.99
+adam_beta2 = 0.9999
+adam_eps = 0.0001
+prio_alpha = 0.8
+prio_beta0 = 0.1
+vtrace_c_clip = 2.0
+vtrace_rho_clip = 1.1
+
+
+### Targeted sweep

 [sweep]
 metric = score
 goal = maximize
+max_suggestion_cost = 7200
+sweep_only = endgame_env_prob, scaffolding_ratio, snake_reward_weight, learning_rate, max_grad_norm
+downsample = 1

-[sweep.train.total_timesteps]
-distribution = log_normal
-min = 3e8
-max = 1e10
-mean = 1e9
-scale = time
+[sweep.env.endgame_env_prob]
+distribution = uniform
+min = 0.0
+mean = 0.03
+max = 0.2
+scale = auto
+
+[sweep.env.scaffolding_ratio]
+distribution = uniform
+min = 0.1
+mean = 0.5
+max = 0.8
+scale = auto
+
+[sweep.env.snake_reward_weight]
+distribution = uniform
+min = 0.0001
+mean = 0.0007
+max = 0.0050
+scale = auto

 [sweep.train.learning_rate]
-distribution = log_normal
-min = 0.00001
-mean = 0.001
-max = 0.1
+distribution = uniform
+min = 0.0001
+mean = 0.0005
+max = 0.0030
 scale = 0.5

-[sweep.train.gae_lambda]
-distribution = logit_normal
-min = 0.01
-mean = 0.6
-max = 0.995
-scale = auto
+[sweep.train.max_grad_norm]
+distribution = uniform
+min = 0.1
+mean = 0.5
+max = 2.0
+scale = 0.5
+
+[sweep.train.vf_clip_coef]
+distribution = uniform
+min = 0.05
+max = 0.5
+mean = 0.2
+scale = auto
+
+
+### Broad sweep
+
+; [sweep]
+; metric = score
+; goal = maximize
+
+; [sweep.env.reward_scaler]
+; distribution = uniform
+; min = 0.1
+; mean = 0.5
+; max = 1.0
+; scale = auto
+
+; [sweep.env.scaffolding_ratio]
+; distribution = uniform
+; min = 0.0
+; mean = 0.5
+; max = 0.8
+; scale = auto
+
+; [sweep.env.snake_reward_weight]
+; distribution = uniform
+; min = 0.00001
+; mean = 0.00005
+; max = 0.0002
+; scale = auto
+
+; [sweep.train.total_timesteps]
+; distribution = log_normal
+; min = 3e8
+; max = 1e10
+; mean = 1e9
+; scale = time
+
+; [sweep.train.learning_rate]
+; distribution = log_normal
+; min = 0.00001
+; mean = 0.001
+; max = 0.1
+; scale = 0.5
+
+; [sweep.train.gamma]
+; distribution = logit_normal
+; min = 0.8
+; mean = 0.995
+; max = 0.9999
+; scale = auto
+
+; [sweep.train.gae_lambda]
+; distribution = logit_normal
+; min = 0.01
+; mean = 0.7
+; max = 0.995
+; scale = auto
+
+; [sweep.train.clip_coef]
+; distribution = log_normal
+; min = 0.001
+; max = 0.5
+; mean = 0.05
+; scale = auto
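
The new [env] keys (reward_scaler, endgame_env_prob, scaffolding_ratio, use_heuristic_rewards, snake_reward_weight) can also be overridden per run without editing the .ini. A minimal sketch using the same pufferl helpers that eval.py below relies on; the override values here are illustrative, not tuned recommendations:

from pufferlib import pufferl

# Load the packaged puffer_g2048 config, then override a few of the new
# [env] options before training. The specific values below are examples only.
args = pufferl.load_config('puffer_g2048')
args['env']['endgame_env_prob'] = 0.1       # example value, not tuned
args['env']['scaffolding_ratio'] = 0.5      # example value, not tuned
args['env']['snake_reward_weight'] = 0.001  # example value, not tuned

pufferl.train('puffer_g2048', args)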

pufferlib/ocean/g2048/binding.c

Lines changed: 14 additions & 2 deletions
@@ -3,9 +3,15 @@
 #define Env Game
 #include "../env_binding.h"

-// g2048.h does not have a 'size' field, so my_init can just return 0
 static int my_init(Env* env, PyObject* args, PyObject* kwargs) {
-    // No custom initialization needed for 2048
+    env->can_go_over_65536 = unpack(kwargs, "can_go_over_65536");
+    env->reward_scaler = unpack(kwargs, "reward_scaler");
+    env->endgame_env_prob = unpack(kwargs, "endgame_env_prob");
+    env->scaffolding_ratio = unpack(kwargs, "scaffolding_ratio");
+    env->use_heuristic_rewards = unpack(kwargs, "use_heuristic_rewards");
+    env->snake_reward_weight = unpack(kwargs, "snake_reward_weight");
+    env->use_sparse_reward = unpack(kwargs, "use_sparse_reward");
+    init(env);
     return 0;
 }

@@ -15,5 +21,11 @@ static int my_log(PyObject* dict, Log* log) {
     assign_to_dict(dict, "merge_score", log->merge_score);
     assign_to_dict(dict, "episode_return", log->episode_return);
     assign_to_dict(dict, "episode_length", log->episode_length);
+    assign_to_dict(dict, "lifetime_max_tile", log->lifetime_max_tile);
+    assign_to_dict(dict, "reached_32768", log->reached_32768);
+    assign_to_dict(dict, "reached_65536", log->reached_65536);
+    assign_to_dict(dict, "monotonicity_reward", log->monotonicity_reward);
+    assign_to_dict(dict, "snake_state", log->snake_state);
+    assign_to_dict(dict, "snake_reward", log->snake_reward);
     return 0;
 }
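
The binding only forwards the new options and log fields; the heuristic reward itself lives in g2048.h, which is not part of this excerpt. As a rough, hypothetical illustration of what a snake-pattern shaping term for 2048 typically looks like (an assumption about the design, not the committed C code), the idea is to reward boards whose tiles decrease monotonically along a zigzag path:

# Hypothetical Python sketch of a snake-pattern heuristic for a 4x4 2048 board.
# 'board' is a list of 4 rows of tile values (0 = empty). The snake path walks
# the rows in alternating directions; the score is the fraction of consecutive
# pairs along that path that are non-increasing.
def snake_score(board):
    path = []
    for r, row in enumerate(board):
        path.extend(row if r % 2 == 0 else list(reversed(row)))
    monotone = sum(1 for a, b in zip(path, path[1:]) if a >= b)
    return monotone / (len(path) - 1)

# A shaping term would then scale this by the config's snake_reward_weight,
# e.g. reward += snake_reward_weight * snake_score(board).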

pufferlib/ocean/g2048/eval.py

Lines changed: 105 additions & 0 deletions
@@ -0,0 +1,105 @@
+from pufferlib import pufferl
+
+def evaluate(env_name, load_model_path):
+    args = pufferl.load_config(env_name)
+    args['vec']['num_envs'] = 1
+    args['env']['num_envs'] = 4096
+    args['load_model_path'] = load_model_path
+    # Turn off endgame_envs and scaffolding episodes, which do not report results
+    args['env']['endgame_env_prob'] = 0
+    args['env']['scaffolding_ratio'] = 0
+    args['env']['can_go_over_65536'] = True
+
+    vecenv = pufferl.load_env(env_name, args)
+    policy = pufferl.load_policy(args, vecenv, env_name)
+    trainer = pufferl.PuffeRL(args['train'], vecenv, policy)
+
+    # Each evaluate runs for 64 ticks. NOTE: bppt horizon might be short for g2048?
+    # Avg episode length from the current model is ~18000, so it takes ~300 epochs for an avg episode.
+    # It's hard to get the single best score because stats are already averaged across done envs.
+    for i in range(10000):
+        stats = trainer.evaluate()
+
+        trainer.epoch += 1
+        if i % 20 == 0:
+            trainer.print_dashboard()
+
+    trainer.close()
+
+    # Get the estimates
+    num_episodes = sum(stats['n'])
+    episode_lengths = sum(n * l for n, l in zip(stats['n'], stats['episode_length'])) / num_episodes
+    max_tiles = sum(n * m for n, m in zip(stats['n'], stats['score'])) / num_episodes
+    merge_scores = sum(n * s for n, s in zip(stats['n'], stats['merge_score'])) / num_episodes
+    reached_32768 = sum(n * s for n, s in zip(stats['n'], stats['reached_32768'])) / num_episodes
+    reached_65536 = sum(n * s for n, s in zip(stats['n'], stats['reached_65536'])) / num_episodes
+
+    print(f"Num episodes: {int(num_episodes)}")
+    print(f"Max tile avg: {max_tiles:.1f}")
+    # The stats from vecenv are averaged across envs that were done in the same tick. Cannot get the single max.
+    print(f"Episode length -- Avg: {episode_lengths:.1f}, Max: {max(stats['episode_length']):.1f}")
+    print(f"Merge score -- Avg: {merge_scores:.1f}, Max: {max(stats['merge_score']):.1f}")
+    print(f"Reached 32768 prob: {reached_32768*100:.2f} %")
+    print(f"Reached 65536 prob: {reached_65536*100:.2f} %")
+
+"""
+# hidden 256: https://wandb.ai/kywch/pufferlib/runs/nvd0pfuj?nw=nwuserkywch
+Num episodes: 154406
+Max tile avg: 22532.9
+Episode length -- Avg: 16667.2, Max: 26659.1
+Merge score -- Avg: 462797.9, Max: 744224.9
+Reached 32768 prob: 46.08 %
+Reached 65536 prob: 3.53 %
+
+# hidden 512: https://wandb.ai/kywch/pufferlib/runs/2ch3my60?nw=nwuserkywch
+Num episodes: 119243
+Max tile avg: 30662.2
+Episode length -- Avg: 21539.7, Max: 29680.3
+Merge score -- Avg: 618011.8, Max: 918755.8
+Reached 32768 prob: 68.25 %
+Reached 65536 prob: 13.09 %
+
+# hidden 512 (replication): https://wandb.ai/kywch/pufferlib/runs/5thsjr61?nw=nwuserkywch
+Num episodes: 115652
+Max tile avg: 31773.2
+Episode length -- Avg: 22196.4, Max: 30316.5
+Merge score -- Avg: 639395.6, Max: 909969.8
+Reached 32768 prob: 71.22 %
+Reached 65536 prob: 14.75 %
+"""
+
+def finetune(env_name, load_model_path):
+    args = pufferl.load_config(env_name)
+    args['load_model_path'] = load_model_path
+    # args['env']['use_sparse_reward'] = True
+    args['env']['scaffolding_ratio'] = 0.85
+
+    # args['policy']['hidden_size'] = 512
+    # args['rnn']['input_size'] = 512
+    # args['rnn']['hidden_size'] = 512
+
+    args['train']['total_timesteps'] = 1_000_000_000
+    args['train']['learning_rate'] = 0.00005
+    args['train']['anneal_lr'] = False
+
+    args['wandb'] = True
+    args['tag'] = 'pg2048'
+
+    pufferl.train(env_name, args)
+
+if __name__ == '__main__':
+    import os
+    import wandb
+
+    # https://wandb.ai/kywch/pufferlib/runs/5thsjr61?nw=nwuserkywch
+    wandb_run_id = '5thsjr61'
+    wandb.init(id=wandb_run_id, project='pufferlib', entity='kywch')
+
+    artifact = wandb.use_artifact(f'{wandb_run_id}:latest')
+    data_dir = artifact.download()
+    model_file = max(os.listdir(data_dir))
+    model_path = f'{data_dir}/{model_file}'
+    wandb.finish()
+
+    evaluate('puffer_g2048', load_model_path=model_path)
+    # finetune('puffer_g2048', load_model_path='puffer_g2048_256_base.pt')
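
Because vecenv stats arrive already averaged over whichever environments finished in the same tick, evaluate() recovers episode-weighted means by weighting each entry with its episode count n. The same computation as a small standalone helper (names here are illustrative):

# Episode-count-weighted mean of per-tick averaged stats, mirroring evaluate().
# counts[i] is the number of episodes behind values[i]; each values[i] is already
# a mean over those episodes, so the overall mean re-weights by counts.
def weighted_mean(counts, values):
    return sum(n * v for n, v in zip(counts, values)) / sum(counts)

# e.g. weighted_mean(stats['n'], stats['merge_score']) reproduces merge_scores above.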
