Skip to content

Commit c8eca6c

Browse files
committed
revert score to max tile based, lengthen tick limit
1 parent 6c5a8db commit c8eca6c

File tree

4 files changed

+25
-21
lines changed

4 files changed

+25
-21
lines changed

pufferlib/config/ocean/g2048.ini

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,11 @@ policy_name = Policy
55
rnn_name = Recurrent
66

77
[policy]
8-
hidden_size = 128
8+
hidden_size = 256
99

1010
[rnn]
11-
input_size = 128
12-
hidden_size = 128
11+
input_size = 256
12+
hidden_size = 256
1313

1414
[vec]
1515
num_envs = 4
@@ -18,6 +18,7 @@ num_envs = 4
1818
num_envs = 4096
1919

2020
[train]
21+
# https://wandb.ai/kywch/pufferlib/runs/n8xml0u9?nw=nwuserkywch
2122
total_timesteps = 3_000_000_000
2223
anneal_lr = True
2324
batch_size = auto
@@ -31,7 +32,7 @@ clip_coef = 0.1
3132
ent_coef = 0.02
3233
gae_lambda = 0.6
3334
gamma = 0.985
34-
learning_rate = 0.003
35+
learning_rate = 0.001
3536
max_grad_norm = 1.0
3637
prio_alpha = 0.99
3738
prio_beta0 = 0.40

pufferlib/ocean/g2048/binding.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ static int my_init(Env* env, PyObject* args, PyObject* kwargs) {
1212
static int my_log(PyObject* dict, Log* log) {
1313
assign_to_dict(dict, "perf", log->perf);
1414
assign_to_dict(dict, "score", log->score);
15-
assign_to_dict(dict, "max_tile", log->max_tile);
15+
assign_to_dict(dict, "merge_score", log->merge_score);
1616
assign_to_dict(dict, "episode_return", log->episode_return);
1717
assign_to_dict(dict, "episode_length", log->episode_length);
1818
return 0;

pufferlib/ocean/g2048/g2048.c

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -29,10 +29,10 @@ int main() {
2929

3030
if (IsKeyDown(KEY_LEFT_SHIFT)) {
3131
action = -1;
32-
if (IsKeyDown(KEY_W) || IsKeyDown(KEY_UP)) action = UP;
33-
else if (IsKeyDown(KEY_S) || IsKeyDown(KEY_DOWN)) action = DOWN;
34-
else if (IsKeyDown(KEY_A) || IsKeyDown(KEY_LEFT)) action = LEFT;
35-
else if (IsKeyDown(KEY_D) || IsKeyDown(KEY_RIGHT)) action = RIGHT;
32+
if (IsKeyPressed(KEY_W) || IsKeyPressed(KEY_UP)) action = UP;
33+
else if (IsKeyPressed(KEY_S) || IsKeyPressed(KEY_DOWN)) action = DOWN;
34+
else if (IsKeyPressed(KEY_A) || IsKeyPressed(KEY_LEFT)) action = LEFT;
35+
else if (IsKeyPressed(KEY_D) || IsKeyPressed(KEY_RIGHT)) action = RIGHT;
3636
env.actions[0] = action - 1;
3737
} else if (frame % 1 != 0) {
3838
continue;
@@ -48,11 +48,6 @@ int main() {
4848
c_step(&env);
4949
}
5050

51-
if (IsKeyDown(KEY_LEFT_SHIFT) && action > 0) {
52-
WaitTime(0.05);
53-
}
54-
55-
5651
}
5752

5853
free_linearlstm(net);

pufferlib/ocean/g2048/g2048.h

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -20,13 +20,13 @@
2020
#define INVALID_MOVE_PENALTY -0.05f
2121
#define GAME_OVER_PENALTY -1.0f
2222

23-
// To normalize perf from 0 to 1. Update when beaten.
24-
#define OBSERVED_MAX_SCORE 100000.0f
23+
// To normalize perf from 0 to 1. Reachable with hidden size 256.
24+
#define OBSERVED_MAX_TILE 4096.0f
2525

2626
typedef struct {
2727
float perf;
2828
float score;
29-
float max_tile;
29+
float merge_score;
3030
float episode_return;
3131
float episode_length;
3232
float n;
@@ -42,6 +42,7 @@ typedef struct {
4242
int tick;
4343
unsigned char grid[SIZE][SIZE];
4444
float episode_reward; // Accumulate episode reward
45+
int moves_made;
4546
int max_episode_ticks; // Dynamic max_ticks based on score
4647

4748
// Cached values to avoid recomputation
@@ -115,9 +116,9 @@ static inline unsigned char get_max_tile(Game* game) {
115116

116117
void add_log(Game* game) {
117118
unsigned char s = get_max_tile(game);
118-
game->log.max_tile += (float)(1 << s);
119-
game->log.score += (float)game->score;
120-
game->log.perf += (float)game->score / OBSERVED_MAX_SCORE;
119+
game->log.score += (float)(1 << s);
120+
game->log.perf += (float)(1 << s) / OBSERVED_MAX_TILE;
121+
game->log.merge_score += (float)game->score;
121122
game->log.episode_length += game->tick;
122123
game->log.episode_return += game->episode_reward;
123124
game->log.n += 1;
@@ -136,6 +137,7 @@ void c_reset(Game* game) {
136137
game->empty_count = SIZE * SIZE;
137138
game->game_over_cached = false;
138139
game->grid_changed = true;
140+
game->moves_made = 0;
139141
game->max_episode_ticks = BASE_MAX_TICKS;
140142

141143
if (game->terminals) game->terminals[0] = 0;
@@ -176,6 +178,7 @@ void add_random_tile(Game* game) {
176178
if (chosen_pos >= 0) {
177179
int i = chosen_pos / SIZE;
178180
int j = chosen_pos % SIZE;
181+
// Spawn rule: 90% chance of a 2 tile, 10% chance of a 4 tile (grid stores exponents: 1 -> 2, 2 -> 4)
179182
game->grid[i][j] = (rand() % 10 == 0) ? 2 : 1;
180183
game->empty_count--;
181184
game->grid_changed = true;
@@ -309,11 +312,13 @@ void c_step(Game* game) {
309312
game->tick++;
310313

311314
if (did_move) {
315+
game->moves_made++;
312316
add_random_tile(game);
313317
game->score += score_add;
314318
update_empty_count(game); // Update after adding tile
315319
// This is to limit infinite invalid moves during eval
316-
game->max_episode_ticks = max(BASE_MAX_TICKS, game->score / 20);
320+
// The tick limit doesn't need to be tight. (Does it need to be shown to the user?)
321+
game->max_episode_ticks = max(BASE_MAX_TICKS, game->score / 10);
317322
} else {
318323
reward = INVALID_MOVE_PENALTY;
319324
}
@@ -385,6 +390,9 @@ void c_render(Game* game) {
385390
// Draw score (format once per frame)
386391
snprintf(score_text, sizeof(score_text), "Score: %d", game->score);
387392
DrawText(score_text, 10, px * SIZE + 10, 24, PUFF_WHITE);
393+
394+
snprintf(score_text, sizeof(score_text), "Moves: %d", game->moves_made);
395+
DrawText(score_text, 210, px * SIZE + 10, 24, PUFF_WHITE);
388396

389397
EndDrawing();
390398
}

0 commit comments

Comments
 (0)