Merge pull request #404 from kywch/h2048

jsuarez5341 · web-flow · commit a8a5e33d1c46 · 2025-10-27T15:54:58.000-04:00
Modified G2048 for better sweeps
diff --git a/pufferlib/config/ocean/g2048.ini b/pufferlib/config/ocean/g2048.ini
@@ -5,11 +5,11 @@ policy_name = Policy
 rnn_name = Recurrent
 
 [policy]
-hidden_size = 128
+hidden_size = 256
 
 [rnn]
-input_size = 128
-hidden_size = 128
+input_size = 256
+hidden_size = 256
 
 [vec]
 num_envs = 4
@@ -18,23 +18,51 @@ num_envs = 4
 num_envs = 4096
 
 [train]
-total_timesteps = 5_000_000_000
-adam_beta1 = 0.9529488439604378
-adam_beta2 = 0.9993901829477296
-adam_eps = 2.745365927413118e-7
+# https://wandb.ai/kywch/pufferlib/runs/n8xml0u9?nw=nwuserkywch
+total_timesteps = 3_000_000_000
+anneal_lr = True
+batch_size = auto
 bptt_horizon = 64
-clip_coef = 0.596573170393339
-ent_coef = 0.02107417730003862
-gae_lambda = 0.9940613415815854
-gamma = 0.9889857974154952
-#learning_rate = 0.0032402460796988127
+minibatch_size = 65536
+
+adam_beta1 = 0.99
+adam_beta2 = 0.96
+adam_eps = 1.0e-10
+clip_coef = 0.1
+ent_coef = 0.02
+gae_lambda = 0.6
+gamma = 0.985
 learning_rate = 0.001
-max_grad_norm = 1.0752406726589745
-minibatch_size = 16384
-prio_alpha = 0.25297099593586336
-prio_beta0 = 0.940606268942572
+max_grad_norm = 1.0
+prio_alpha = 0.99
+prio_beta0 = 0.40
 vf_clip_coef = 0.1
-vf_coef = 1.6362878279900643
-vtrace_c_clip = 0
-vtrace_rho_clip = 1.2917509971869054
-anneal_lr = False
+vf_coef = 2.0
+vtrace_c_clip = 4.3
+vtrace_rho_clip = 1.6
+
+
+[sweep]
+metric = score
+goal = maximize
+
+[sweep.train.total_timesteps]
+distribution = log_normal
+min = 3e8
+max = 1e10
+mean = 1e9
+scale = time
+
+[sweep.train.learning_rate]
+distribution = log_normal
+min = 0.00001
+mean = 0.001
+max = 0.1
+scale = 0.5
+
+[sweep.train.gae_lambda]
+distribution = logit_normal
+min = 0.01
+mean = 0.6
+max = 0.995
+scale = auto
diff --git a/pufferlib/ocean/g2048/binding.c b/pufferlib/ocean/g2048/binding.c
@@ -1,9 +1,9 @@
-#include "2048.h"
+#include "g2048.h"
 
 #define Env Game
 #include "../env_binding.h"
 
-// 2048.h does not have a 'size' field, so my_init can just return 0
+// g2048.h does not have a 'size' field, so my_init can just return 0
 static int my_init(Env* env, PyObject* args, PyObject* kwargs) {
     // No custom initialization needed for 2048
     return 0;
@@ -12,6 +12,7 @@ static int my_init(Env* env, PyObject* args, PyObject* kwargs) {
 static int my_log(PyObject* dict, Log* log) {
     assign_to_dict(dict, "perf", log->perf);
     assign_to_dict(dict, "score", log->score);
+    assign_to_dict(dict, "merge_score", log->merge_score);
     assign_to_dict(dict, "episode_return", log->episode_return);
     assign_to_dict(dict, "episode_length", log->episode_length);
     return 0;
diff --git a/pufferlib/ocean/g2048/g2048.c b/pufferlib/ocean/g2048/g2048.c
@@ -1,6 +1,25 @@
-#include "2048.h"
+#include "g2048.h"
 #include "puffernet.h"
 
+// LinearLSTM variant with a hard-coded hidden size of 256. TODO: move this into puffernet.
+LinearLSTM* make_linearlstm_256(Weights* weights, int num_agents, int input_dim, int logit_sizes[], int num_actions) {
+    LinearLSTM* net = calloc(1, sizeof(LinearLSTM));
+    net->num_agents = num_agents;
+    net->obs = calloc(num_agents*input_dim, sizeof(float));
+    int hidden_dim = 256;
+    net->encoder = make_linear(weights, num_agents, input_dim, hidden_dim);
+    net->gelu1 = make_gelu(num_agents, hidden_dim);
+    int atn_sum = 0;
+    for (int i = 0; i < num_actions; i++) {
+        atn_sum += logit_sizes[i];
+    }
+    net->actor = make_linear(weights, num_agents, hidden_dim, atn_sum);
+    net->value_fn = make_linear(weights, num_agents, hidden_dim, 1);
+    net->lstm = make_lstm(weights, num_agents, hidden_dim, hidden_dim);
+    net->multidiscrete = make_multidiscrete(num_agents, logit_sizes, num_actions);
+    return net;
+}
+
 int main() {
     srand(time(NULL));
     Game env;
@@ -14,26 +33,27 @@ int main() {
     env.actions = actions;
     env.rewards = rewards;
 
-    Weights* weights = load_weights("resources/g2048/g2048_weights.bin", 134917);
+    Weights* weights = load_weights("resources/g2048/g2048_weights.bin", 531973);
     int logit_sizes[1] = {4};
-    LinearLSTM* net = make_linearlstm(weights, 1, 16, logit_sizes, 1);
+    LinearLSTM* net = make_linearlstm_256(weights, 1, 16, logit_sizes, 1);
     c_reset(&env);
     c_render(&env);
 
     // Main game loop
     int frame = 0;
+    int action = -1;
     while (!WindowShouldClose()) {
         c_render(&env);
         frame++;
-
-        int action = 0;
+        
         if (IsKeyDown(KEY_LEFT_SHIFT)) {
-            if (IsKeyPressed(KEY_W) || IsKeyPressed(KEY_UP)) action = UP;
-            else if (IsKeyPressed(KEY_S) || IsKeyPressed(KEY_DOWN)) action = DOWN;
-            else if (IsKeyPressed(KEY_A) || IsKeyPressed(KEY_LEFT)) action = LEFT;
-            else if (IsKeyPressed(KEY_D) || IsKeyPressed(KEY_RIGHT)) action = RIGHT;
+            action = -1;
+            if (IsKeyDown(KEY_W) || IsKeyDown(KEY_UP)) action = UP;
+            else if (IsKeyDown(KEY_S) || IsKeyDown(KEY_DOWN)) action = DOWN;
+            else if (IsKeyDown(KEY_A) || IsKeyDown(KEY_LEFT)) action = LEFT;
+            else if (IsKeyDown(KEY_D) || IsKeyDown(KEY_RIGHT)) action = RIGHT;
             env.actions[0] = action - 1;
-        } else if (frame % 10 != 0) {
+        } else if (frame % 1 != 0) {
             continue;
         } else {
             action = 1;
@@ -43,9 +63,14 @@ int main() {
             forward_linearlstm(net, net->obs, env.actions);
         }
 
-        if (action != 0) {
+        if (action > 0) {
             c_step(&env);
         }
+
+        if (IsKeyDown(KEY_LEFT_SHIFT) && action > 0) {
+            // Pause briefly after a manual move; input does not need to be highly responsive
+            WaitTime(0.1);
+        }        
     }
 
     free_linearlstm(net);
diff --git a/pufferlib/ocean/g2048/g2048.h b/pufferlib/ocean/g2048/g2048.h
@@ -5,22 +5,28 @@
 #include <math.h>
 #include <string.h>
 #include "raylib.h"
+#define max(a, b) (((a) > (b)) ? (a) : (b))
 
 #define SIZE 4
 #define EMPTY 0
 #define UP 1
 #define DOWN 2
 #define LEFT 3
 #define RIGHT 4
+#define BASE_MAX_TICKS 2000
 
 // Precomputed constants
-#define REWARD_MULTIPLIER 0.09090909f
+#define REWARD_MULTIPLIER 0.0625f
 #define INVALID_MOVE_PENALTY -0.05f
 #define GAME_OVER_PENALTY -1.0f
 
+// Divisor used to scale perf to roughly [0, 1]. A 4096 tile is reachable with hidden size 256.
+#define OBSERVED_MAX_TILE 4096.0f
+
 typedef struct {
     float perf;
     float score;
+    float merge_score;
     float episode_return;
     float episode_length;
     float n;
@@ -36,6 +42,8 @@ typedef struct {
     int tick;
     unsigned char grid[SIZE][SIZE];
     float episode_reward;           // Accumulate episode reward
+    int moves_made;
+    int max_episode_ticks;          // Dynamic max_ticks based on score
     
     // Cached values to avoid recomputation
     int empty_count;
@@ -93,9 +101,24 @@ static inline void update_empty_count(Game* game) {
     game->empty_count = count;
 }
 
+static inline unsigned char get_max_tile(Game* game) {
+    unsigned char max_tile = 0;
+    // Scan every cell and keep the largest value seen
+    for (int i = 0; i < SIZE; i++) {
+        for (int j = 0; j < SIZE; j++) {
+            if (game->grid[i][j] > max_tile) {
+                max_tile = game->grid[i][j];
+            }
+        }
+    }
+    return max_tile;
+}
+
 void add_log(Game* game) {
-    game->log.score = (float)(1 << game->score);
-    game->log.perf += ((float)game->score) * REWARD_MULTIPLIER;
+    unsigned char s = get_max_tile(game);
+    game->log.score += (float)(1 << s);
+    game->log.perf += (float)(1 << s) / OBSERVED_MAX_TILE;
+    game->log.merge_score += (float)game->score;
     game->log.episode_length += game->tick;
     game->log.episode_return += game->episode_reward;
     game->log.n += 1;
@@ -114,6 +137,8 @@ void c_reset(Game* game) {
     game->empty_count = SIZE * SIZE;
     game->game_over_cached = false;
     game->grid_changed = true;
+    game->moves_made = 0;
+    game->max_episode_ticks = BASE_MAX_TICKS;
     
     if (game->terminals) game->terminals[0] = 0;
     
@@ -153,6 +178,7 @@ void add_random_tile(Game* game) {
     if (chosen_pos >= 0) {
         int i = chosen_pos / SIZE;
         int j = chosen_pos % SIZE;
+        // Spawn rule: ~90% a 2-tile (stored as exponent 1), ~10% a 4-tile (stored as exponent 2)
         game->grid[i][j] = (rand() % 10 == 0) ? 2 : 1;
         game->empty_count--;
         game->grid_changed = true;
@@ -162,7 +188,7 @@ void add_random_tile(Game* game) {
 }
 
 // Optimized slide and merge with fewer memory operations
-static inline bool slide_and_merge(unsigned char* row, float* reward) {
+static inline bool slide_and_merge(unsigned char* row, float* reward, float* score_increase) {
     bool moved = false;
     int write_pos = 0;
     
@@ -183,6 +209,7 @@ static inline bool slide_and_merge(unsigned char* row, float* reward) {
         if (row[i] != EMPTY && row[i] == row[i + 1]) {
             row[i]++;
             *reward += ((float)row[i]) * REWARD_MULTIPLIER;
+            *score_increase += (float)(1 << (int)row[i]);
             // Shift remaining elements left
             for (int j = i + 1; j < SIZE - 1; j++) {
                 row[j] = row[j + 1];
@@ -195,7 +222,7 @@ static inline bool slide_and_merge(unsigned char* row, float* reward) {
     return moved;
 }
 
-bool move(Game* game, int direction, float* reward) {
+bool move(Game* game, int direction, float* reward, float* score_increase) {
     bool moved = false;
     unsigned char temp[SIZE];
     
@@ -207,7 +234,7 @@ bool move(Game* game, int direction, float* reward) {
                 temp[i] = game->grid[idx][col];
             }
             
-            if (slide_and_merge(temp, reward)) {
+            if (slide_and_merge(temp, reward, score_increase)) {
                 moved = true;
                 // Write back column
                 for (int i = 0; i < SIZE; i++) {
@@ -224,7 +251,7 @@ bool move(Game* game, int direction, float* reward) {
                 temp[i] = game->grid[row][idx];
             }
             
-            if (slide_and_merge(temp, reward)) {
+            if (slide_and_merge(temp, reward, score_increase)) {
                 moved = true;
                 // Write back row
                 for (int i = 0; i < SIZE; i++) {
@@ -235,9 +262,7 @@ bool move(Game* game, int direction, float* reward) {
         }
     }
 
-    if (!moved) {
-        *reward = INVALID_MOVE_PENALTY;
-    } else {
+    if (moved) {
         game->grid_changed = true;
         game->game_over_cached = false; // Invalidate cache
     }
@@ -280,34 +305,28 @@ bool is_game_over(Game* game) {
     return true;
 }
 
-// Optimized score calculation
-static inline unsigned char calc_score(Game* game) {
-    unsigned char max_tile = 0;
-    // Unroll loop for better performance
-    for (int i = 0; i < SIZE; i++) {
-        for (int j = 0; j < SIZE; j++) {
-            if (game->grid[i][j] > max_tile) {
-                max_tile = game->grid[i][j];
-            }
-        }
-    }
-    return max_tile;
-}
-
 void c_step(Game* game) {
     float reward = 0.0f;
-    bool did_move = move(game, game->actions[0] + 1, &reward);
+    float score_add = 0.0f;
+    bool did_move = move(game, game->actions[0] + 1, &reward, &score_add);
     game->tick++;
     
     if (did_move) {
+        game->moves_made++;
         add_random_tile(game);
-        game->score = calc_score(game);
+        game->score += score_add;
         update_empty_count(game); // Update after adding tile
+        // Cap the episode length so runs of invalid moves cannot go on forever during eval.
+        // The cap does not need to be tight. TODO: decide whether to show it to the user.
+        game->max_episode_ticks = max(BASE_MAX_TICKS, game->score / 10);
+    } else {
+        reward = INVALID_MOVE_PENALTY;
     }
-    
+
     bool game_over = is_game_over(game);
-    game->terminals[0] = game_over ? 1 : 0;
-    
+    bool max_ticks_reached = game->tick >= game->max_episode_ticks;
+    game->terminals[0] = (game_over || max_ticks_reached) ? 1 : 0;
+
     if (game_over) {
         reward = GAME_OVER_PENALTY;
     }
@@ -369,8 +388,11 @@ void c_render(Game* game) {
     }
     
     // Draw score (format once per frame)
-    snprintf(score_text, sizeof(score_text), "Score: %d", 1 << game->score);
+    snprintf(score_text, sizeof(score_text), "Score: %d", game->score);
     DrawText(score_text, 10, px * SIZE + 10, 24, PUFF_WHITE);
+
+    snprintf(score_text, sizeof(score_text), "Moves: %d", game->moves_made);
+    DrawText(score_text, 210, px * SIZE + 10, 24, PUFF_WHITE);
     
     EndDrawing();
 }
diff --git a/pufferlib/ocean/torch.py b/pufferlib/ocean/torch.py
diff --git a/pufferlib/resources/g2048/g2048_weights.bin b/pufferlib/resources/g2048/g2048_weights.bin