add max ticks, modify log

kywch · kywch · commit 6c5a8db86a54 · 2025-10-22T23:46:42.000-07:00
diff --git a/pufferlib/config/ocean/g2048.ini b/pufferlib/config/ocean/g2048.ini
@@ -5,11 +5,11 @@ policy_name = Policy
 rnn_name = Recurrent
 
 [policy]
-hidden_size = 1024
+hidden_size = 128
 
 [rnn]
-input_size = 1024
-hidden_size = 1024
+input_size = 128
+hidden_size = 128
 
 [vec]
 num_envs = 4
@@ -18,22 +18,36 @@ num_envs = 4
 num_envs = 4096
 
 [train]
-total_timesteps = 5_000_000_000
-adam_beta1 = 0.982603624444803
-adam_beta2 = 0.982603624444803
-adam_eps = 3.2888696338626164e-11
+total_timesteps = 3_000_000_000
+anneal_lr = True
+batch_size = auto
 bptt_horizon = 64
-clip_coef = 0.2709219986085283
-ent_coef = 0.09221187601118314
-gae_lambda = 0.5999999999999999
-gamma = 0.9913033082924563
-#learning_rate = 0.0032402460796988127
-learning_rate = 0.001370087925623787
-max_grad_norm = 3.382578348055827
-minibatch_size = 32768
-prio_alpha = 0.09999999999999998
-prio_beta0 = 0.941336023531629
-vf_clip_coef = 0.3229933703598912
-vf_coef = 3.591594736259073
-vtrace_c_clip = 1.405090934486193
-vtrace_rho_clip = 0.836535302835556
+minibatch_size = 65536
+
+adam_beta1 = 0.99
+adam_beta2 = 0.96
+adam_eps = 1.0e-10
+clip_coef = 0.1
+ent_coef = 0.02
+gae_lambda = 0.6
+gamma = 0.985
+learning_rate = 0.003
+max_grad_norm = 1.0
+prio_alpha = 0.99
+prio_beta0 = 0.40
+vf_clip_coef = 0.1
+vf_coef = 2.0
+vtrace_c_clip = 4.3
+vtrace_rho_clip = 1.6
+
+
+[sweep]
+metric = score
+goal = maximize
+
+[sweep.train.gae_lambda]
+distribution = logit_normal
+min = 0.01
+mean = 0.6
+max = 0.995
+scale = auto
diff --git a/pufferlib/ocean/g2048/binding.c b/pufferlib/ocean/g2048/binding.c
@@ -1,9 +1,9 @@
-#include "2048.h"
+#include "g2048.h"
 
 #define Env Game
 #include "../env_binding.h"
 
-// 2048.h does not have a 'size' field, so my_init can just return 0
+// g2048.h does not have a 'size' field, so my_init can just return 0
 static int my_init(Env* env, PyObject* args, PyObject* kwargs) {
     // No custom initialization needed for 2048
     return 0;
@@ -12,6 +12,7 @@ static int my_init(Env* env, PyObject* args, PyObject* kwargs) {
 static int my_log(PyObject* dict, Log* log) {
     assign_to_dict(dict, "perf", log->perf);
     assign_to_dict(dict, "score", log->score);
+    assign_to_dict(dict, "max_tile", log->max_tile);
     assign_to_dict(dict, "episode_return", log->episode_return);
     assign_to_dict(dict, "episode_length", log->episode_length);
     return 0;
diff --git a/pufferlib/ocean/g2048/g2048.c b/pufferlib/ocean/g2048/g2048.c
@@ -1,4 +1,4 @@
-#include "2048.h"
+#include "g2048.h"
 #include "puffernet.h"
 
 int main() {
diff --git a/pufferlib/ocean/g2048/g2048.h b/pufferlib/ocean/g2048/g2048.h
@@ -5,22 +5,28 @@
 #include <math.h>
 #include <string.h>
 #include "raylib.h"
+#define max(a, b) (((a) > (b)) ? (a) : (b))
 
 #define SIZE 4
 #define EMPTY 0
 #define UP 1
 #define DOWN 2
 #define LEFT 3
 #define RIGHT 4
+#define BASE_MAX_TICKS 2000
 
 // Precomputed constants
 #define REWARD_MULTIPLIER 0.0625f
 #define INVALID_MOVE_PENALTY -0.05f
 #define GAME_OVER_PENALTY -1.0f
 
+// Divisor that scales the episode score into perf (roughly 0 to 1). Raise it when an agent beats this score.
+#define OBSERVED_MAX_SCORE 100000.0f
+
 typedef struct {
     float perf;
     float score;
+    float max_tile;
     float episode_return;
     float episode_length;
     float n;
@@ -36,6 +42,7 @@ typedef struct {
     int tick;
     unsigned char grid[SIZE][SIZE];
     float episode_reward;           // Accumulate episode reward
+    int max_episode_ticks;          // Dynamic max_ticks based on score
     
     // Cached values to avoid recomputation
     int empty_count;
@@ -93,8 +100,7 @@ static inline void update_empty_count(Game* game) {
     game->empty_count = count;
 }
 
-// Optimized score calculation
-static inline unsigned char calc_score(Game* game) {
+static inline unsigned char get_max_tile(Game* game) {
     unsigned char max_tile = 0;
     // Unroll loop for better performance
     for (int i = 0; i < SIZE; i++) {
@@ -108,9 +114,10 @@ static inline unsigned char calc_score(Game* game) {
 }
 
 void add_log(Game* game) {
-    unsigned char s = calc_score(game);
-    game->log.score = (float)(1 << s);
-    game->log.perf += ((float)s) * 0.0909f;
+    unsigned char s = get_max_tile(game);
+    game->log.max_tile += (float)(1 << s);
+    game->log.score += (float)game->score;
+    game->log.perf += (float)game->score / OBSERVED_MAX_SCORE;
     game->log.episode_length += game->tick;
     game->log.episode_return += game->episode_reward;
     game->log.n += 1;
@@ -129,6 +136,7 @@ void c_reset(Game* game) {
     game->empty_count = SIZE * SIZE;
     game->game_over_cached = false;
     game->grid_changed = true;
+    game->max_episode_ticks = BASE_MAX_TICKS;
     
     if (game->terminals) game->terminals[0] = 0;
     
@@ -251,9 +259,7 @@ bool move(Game* game, int direction, float* reward, float* score_increase) {
         }
     }
 
-    if (!moved) {
-        *reward = INVALID_MOVE_PENALTY;
-    } else {
+    if (moved) {
         game->grid_changed = true;
         game->game_over_cached = false; // Invalidate cache
     }
@@ -306,11 +312,16 @@ void c_step(Game* game) {
         add_random_tile(game);
         game->score += score_add;
         update_empty_count(game); // Update after adding tile
+        // Tick cap grows with score; it bounds episodes that would otherwise loop on invalid moves during eval
+        game->max_episode_ticks = max(BASE_MAX_TICKS, game->score / 20);
+    } else {
+        reward = INVALID_MOVE_PENALTY;
     }
-    
+
     bool game_over = is_game_over(game);
-    game->terminals[0] = game_over ? 1 : 0;
-    
+    bool max_ticks_reached = game->tick >= game->max_episode_ticks;
+    game->terminals[0] = (game_over || max_ticks_reached) ? 1 : 0;
+
     if (game_over) {
         reward = GAME_OVER_PENALTY;
     }

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -1,4 +1,4 @@` |
| `1` | | `-#include "2048.h"` |
| | `1` | `+#include "g2048.h"` |
| `2` | `2` | `#include "puffernet.h"` |
| `3` | `3` | |
| `4` | `4` | `int main() {` |