from pufferlib import pufferl

def evaluate(env_name, load_model_path):
    args = pufferl.load_config(env_name)
    args['vec']['num_envs'] = 1
    args['env']['num_envs'] = 4096
    args['load_model_path'] = load_model_path
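    # One vectorized worker hosting 4096 environments in parallel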
    # Turn off endgame envs and scaffolding episodes, since they do not report results
    args['env']['endgame_env_prob'] = 0
    args['env']['scaffolding_ratio'] = 0
    args['env']['can_go_over_65536'] = True

    vecenv = pufferl.load_env(env_name, args)
    policy = pufferl.load_policy(args, vecenv, env_name)
    trainer = pufferl.PuffeRL(args['train'], vecenv, policy)

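    # The trainer is used only as an evaluation driver below; trainer.train() is never called.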
    # Each evaluate() runs for 64 ticks. NOTE: the bptt horizon might be too short for g2048?
    # The avg episode length from the current model is ~18000 steps, so an avg episode spans ~300 epochs.
    # It's hard to get the single best score because stats are already averaged across done envs.
    for i in range(10000):
        stats = trainer.evaluate()

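        # Advance the epoch counter manually (evaluate() alone does not) so the
        # dashboard's progress counter keeps moving.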
        trainer.epoch += 1
        if i % 20 == 0:
            trainer.print_dashboard()

    trainer.close()

    # Aggregate the per-tick stats into per-episode estimates
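    # Each entry in the stats lists is a mean over the envs that finished on the
    # same tick, so weight each entry by 'n' (episodes finished that tick) to
    # recover overall per-episode averages.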
    num_episodes = sum(stats['n'])
    episode_lengths = sum(n * l for n, l in zip(stats['n'], stats['episode_length'])) / num_episodes
    max_tiles = sum(n * m for n, m in zip(stats['n'], stats['score'])) / num_episodes
    merge_scores = sum(n * s for n, s in zip(stats['n'], stats['merge_score'])) / num_episodes
    reached_32768 = sum(n * s for n, s in zip(stats['n'], stats['reached_32768'])) / num_episodes
    reached_65536 = sum(n * s for n, s in zip(stats['n'], stats['reached_65536'])) / num_episodes

    print(f"Num episodes: {int(num_episodes)}")
    print(f"Max tile avg: {max_tiles:.1f}")
    # The stats from vecenv are averaged across envs that were done in the same tick,
    # so the single max is not recoverable.
    print(f"Episode length -- Avg: {episode_lengths:.1f}, Max: {max(stats['episode_length']):.1f}")
    print(f"Merge score -- Avg: {merge_scores:.1f}, Max: {max(stats['merge_score']):.1f}")
    print(f"Reached 32768 prob: {reached_32768*100:.2f} %")
    print(f"Reached 65536 prob: {reached_65536*100:.2f} %")

| 45 | + """ |
| 46 | + # hidden 256: https://wandb.ai/kywch/pufferlib/runs/nvd0pfuj?nw=nwuserkywch |
| 47 | + Num episodes: 154406 |
| 48 | + Max tile avg: 22532.9 |
| 49 | + Episode length -- Avg: 16667.2, Max: 26659.1 |
| 50 | + Merge score -- Avg: 462797.9, Max: 744224.9 |
| 51 | + Reached 32768 prob: 46.08 % |
| 52 | + Reached 65536 prob: 3.53 % |
| 53 | +
|
| 54 | + # hidden 512: https://wandb.ai/kywch/pufferlib/runs/2ch3my60?nw=nwuserkywch |
| 55 | + Num episodes: 119243 |
| 56 | + Max tile avg: 30662.2 |
| 57 | + Episode length -- Avg: 21539.7, Max: 29680.3 |
| 58 | + Merge score -- Avg: 618011.8, Max: 918755.8 |
| 59 | + Reached 32768 prob: 68.25 % |
| 60 | + Reached 65536 prob: 13.09 % |
| 61 | + """ |

def finetune(env_name, load_model_path):
    args = pufferl.load_config(env_name)
    args['load_model_path'] = load_model_path
    # args['env']['use_sparse_reward'] = True
    args['env']['scaffolding_ratio'] = 0.85
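    # Scaffolding episodes stay on for fine-tuning; evaluate() above disables
    # them because they do not report results.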

    # args['policy']['hidden_size'] = 512
    # args['rnn']['input_size'] = 512
    # args['rnn']['hidden_size'] = 512
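    # NOTE: uncomment the three lines above when fine-tuning a 512-hidden checkpoint;
    # the sizes must match the architecture saved in load_model_path.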

    args['train']['total_timesteps'] = 1_000_000_000
    args['train']['learning_rate'] = 0.00005
    args['train']['anneal_lr'] = False
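    # A small, constant learning rate keeps updates gentle so fine-tuning does not
    # wipe out the loaded policy.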

    args['wandb'] = True
    args['tag'] = 'pg2048'

    pufferl.train(env_name, args)

if __name__ == '__main__':
    evaluate('puffer_g2048', load_model_path='puffer_g2048_2ch3my60.pt')
    # finetune('puffer_g2048', load_model_path='puffer_g2048_256_base.pt')