Add obs_dist: limit the agent's observation to tiles within obs_dist (Chebyshev distance) of its position; the full grid now lives in a separate state buffer

layterz · layterz · commit 729bd12528fd · 2025-11-24T16:09:17.000-03:00
diff --git a/pufferlib/config/ocean/lock_key.ini b/pufferlib/config/ocean/lock_key.ini
@@ -9,6 +9,7 @@ num_envs = 4096
 num_keys = 1
 size = 8
 log_interval = 128
+obs_dist = 2
 
 [train]
 total_timesteps = 20_000_000
diff --git a/pufferlib/ocean/lock_key/binding.c b/pufferlib/ocean/lock_key/binding.c
@@ -6,6 +6,20 @@
 static int my_init(Env* env, PyObject* args, PyObject* kwargs) {
     env->size = unpack(kwargs, "size");
     env->num_keys = unpack(kwargs, "num_keys");
+    env->obs_dist = unpack(kwargs, "obs_dist");
+
+    int tiles = env->size * env->size;
+    env->state = (unsigned char*)calloc(tiles, sizeof(unsigned char));
+    if (!env->state) return -1;
+
+    return 0;
+}
+
+static int my_close(Env* env) {
+    if (env->state) {
+        free(env->state);
+        env->state = NULL;
+    }
     return 0;
 }
 
diff --git a/pufferlib/ocean/lock_key/lock_key.c b/pufferlib/ocean/lock_key/lock_key.c
@@ -1,14 +1,23 @@
+#include <time.h>
 #include "lock_key.h"
 
 int main() {
-    LockKey env = {.size = 8, .num_keys = 3};
-    env.observations = (unsigned char*)calloc(env.size*env.size, sizeof(unsigned char));
-    env.actions = (int*)calloc(1, sizeof(int));
-    env.rewards = (float*)calloc(1, sizeof(float));
-    env.terminals = (unsigned char*)calloc(1, sizeof(unsigned char));
+    srand((unsigned int)time(NULL));
+
+    LockKey env = {.size = 8, .num_keys = 3, .obs_dist = 2};
+
+    int tiles = env.size * env.size;
+
+    env.state        = (unsigned char*)calloc(tiles, sizeof(unsigned char));
+    env.observations = (unsigned char*)calloc(tiles, sizeof(unsigned char));
+    env.actions      = (int*)calloc(1, sizeof(int));
+    env.rewards      = (float*)calloc(1, sizeof(float));
+    env.terminals    = (unsigned char*)calloc(1, sizeof(unsigned char));
+    env.truncations  = (unsigned char*)calloc(1, sizeof(unsigned char)); // optional
 
     c_reset(&env);
     c_render(&env);
+
     while (!WindowShouldClose()) {
         if (IsKeyDown(KEY_LEFT_SHIFT)) {
             if (IsKeyDown(KEY_A) || IsKeyDown(KEY_LEFT)) {
@@ -20,18 +29,23 @@ int main() {
             } else if (IsKeyDown(KEY_S) || IsKeyDown(KEY_DOWN)) {
                 env.actions[0] = 3;
             } else {
-                env.actions[0] = -1;
+                env.actions[0] = -1; // no-op
             }
         } else {
-            env.actions[0] = rand() % 5;
+            env.actions[0] = rand() % 5; // 4 == no-op, still fine
         }
+
         c_step(&env);
         c_render(&env);
     }
+
+    free(env.state);
     free(env.observations);
     free(env.actions);
     free(env.rewards);
     free(env.terminals);
+    if (env.truncations) free(env.truncations);
+
     c_close(&env);
+    return 0;
 }
-
diff --git a/pufferlib/ocean/lock_key/lock_key.h b/pufferlib/ocean/lock_key/lock_key.h
@@ -9,7 +9,8 @@
 static const Color PUFF_RED        = (Color){187, 0, 0, 255};
 static const Color PUFF_CYAN       = (Color){0, 187, 187, 255};
 static const Color PUFF_GREEN      = (Color){0, 187, 0, 255};
-static const Color PUFF_BACKGROUND = (Color){45, 30, 20, 255};
+static const Color PUFF_BACKGROUND = (Color){65, 30, 40, 255};
+static const Color PUFF_BLACK      = (Color){0, 0, 0, 255};
 
 typedef struct {
     float perf;
@@ -22,7 +23,11 @@ typedef struct {
 typedef struct {
     Log log;
 
+    // observations: agent's partial view of state; tiles farther than obs_dist from the agent read as 0
     unsigned char* observations;
+    // state: full grid, one byte per tile (0 = empty, 1 = agent, 2 = lock, 3 = key)
+    unsigned char* state;
+
     int* actions;
     float* rewards;
     unsigned char* terminals;
@@ -34,12 +39,32 @@ typedef struct {
     int x;
     int y;
     int num_keys_collected;
+    int obs_dist;
 } LockKey;
 
 static inline int lk_pos(LockKey* env, int x, int y) {
     return y * env->size + x;
 }
 
+static inline int lk_visible(LockKey* env, int x, int y) {
+    int dx = x - env->x; if (dx < 0) dx = -dx;
+    int dy = y - env->y; if (dy < 0) dy = -dy;
+    return (dx > dy ? dx : dy) <= env->obs_dist;
+}
+
+static inline void lk_update_observations(LockKey* env) {
+    int tiles = env->size * env->size;
+    memset(env->observations, 0, tiles * sizeof(unsigned char));
+
+    for (int y = 0; y < env->size; y++) {
+        for (int x = 0; x < env->size; x++) {
+            if (!lk_visible(env, x, y)) continue;
+            int pos = lk_pos(env, x, y);
+            env->observations[pos] = env->state[pos];
+        }
+    }
+}
+
 void add_log(LockKey* env) {
     env->log.perf += (env->rewards[0] > 0) ? 1 : 0;
     env->log.score += env->rewards[0];
@@ -50,27 +75,28 @@ void add_log(LockKey* env) {
 
 static inline void c_reset(LockKey* env) {
     int tiles = env->size * env->size;
-    memset(env->observations, 0, tiles * sizeof(unsigned char));
+    memset(env->state, 0, tiles * sizeof(unsigned char));
 
     env->x = env->size / 2;
     env->y = env->size / 2;
     int player_pos = lk_pos(env, env->x, env->y);
-    env->observations[player_pos] = 1;
+    env->state[player_pos] = 1;
     env->tick = 0;
 
     int lock_idx;
     do lock_idx = rand() % tiles;
     while (lock_idx == player_pos);
-    env->observations[lock_idx] = 2;
+    env->state[lock_idx] = 2;
 
     for (int i = 0; i < env->num_keys; i++) {
         int key_idx;
         do key_idx = rand() % tiles;
-        while (env->observations[key_idx] != 0);
-        env->observations[key_idx] = 3;
+        while (env->state[key_idx] != 0);
+        env->state[key_idx] = 3;
     }
 
     env->num_keys_collected = 0;
+    lk_update_observations(env);
 }
 
 static inline void c_step(LockKey* env) {
@@ -79,17 +105,16 @@ static inline void c_step(LockKey* env) {
     env->terminals[0] = 0;
     if (env->truncations) env->truncations[0] = 0;
 
-    // clear agent from previous position if not on lock
-    if (env->observations[lk_pos(env, env->x, env->y)] != 2)
-        env->observations[lk_pos(env, env->x, env->y)] = 0;
+    int prev_pos = lk_pos(env, env->x, env->y);
+    if (env->state[prev_pos] != 2)
+        env->state[prev_pos] = 0;
 
     int a = env->actions[0];
     if (a == 0) env->x--;
     else if (a == 1) env->x++;
     else if (a == 2) env->y--;
     else if (a == 3) env->y++;
 
-    // terminal if out of bounds or max steps reached
     int max_steps = 3*env->size + env->num_keys*env->num_keys;
     if (env->tick > max_steps || env->x < 0 || env->x >= env->size || env->y < 0 || env->y >= env->size) {
         env->rewards[0] = -3.0f;
@@ -101,24 +126,23 @@ static inline void c_step(LockKey* env) {
 
     int pos = lk_pos(env, env->x, env->y);
 
-    // collect key
-    if (env->observations[pos] == 3) {
+    if (env->state[pos] == 3) {
         env->rewards[0] += 1.0f;
         env->num_keys_collected++;
     }
 
-    // open lock if all keys collected
-    if (env->observations[pos] == 2 && env->num_keys_collected == env->num_keys) {
+    if (env->state[pos] == 2 && env->num_keys_collected == env->num_keys) {
         env->rewards[0] = 3.0f;
         env->terminals[0] = 1;
         add_log(env);
         c_reset(env);
         return;
     }
 
-    // move agent, but don't override observations if on lock
-    if (env->observations[pos] != 2)
-        env->observations[pos] = 1;
+    if (env->state[pos] != 2)
+        env->state[pos] = 1;
+
+    lk_update_observations(env);
 }
 
 static inline void c_render(LockKey* env) {
@@ -130,11 +154,13 @@ static inline void c_render(LockKey* env) {
     if (IsKeyDown(KEY_ESCAPE)) exit(0);
 
     BeginDrawing();
-    ClearBackground(PUFF_BACKGROUND);
 
     for (int y = 0; y < env->size; y++) {
         for (int x = 0; x < env->size; x++) {
-            int pos = y * env->size + x;
+            Color bg = lk_visible(env, x, y) ? PUFF_BACKGROUND : PUFF_BLACK;
+            DrawRectangle(x * 64, y * 64, 64, 64, bg);
+
+            int pos = lk_pos(env, x, y);
             unsigned char v = env->observations[pos];
             if (!v) continue;
 
diff --git a/pufferlib/ocean/lock_key/lock_key.py b/pufferlib/ocean/lock_key/lock_key.py
@@ -4,16 +4,11 @@
 from pufferlib.ocean.lock_key import binding
 
 class LockKey(pufferlib.PufferEnv):
-    def __init__(self, num_envs=1, render_mode=None, log_interval=128, size=8, num_keys=3, buf=None, seed=0):
-        self.size = size
-        self.num_keys = num_keys
-
-        # C writes a flattened size*size uint8 grid with values {0,1,2}
+    def __init__(self, num_envs=1, render_mode=None, log_interval=128, size=8, num_keys=3, buf=None, seed=0, obs_dist=2):
         self.single_observation_space = gymnasium.spaces.Box(
-            low=0, high=2, shape=(size * size,), dtype=np.uint8
+            low=0, high=3, shape=(size * size,), dtype=np.uint8
         )
 
-        # 0=L, 1=R, 2=U, 3=D, 4=NOOP
         self.single_action_space = gymnasium.spaces.Discrete(5)
 
         self.render_mode = render_mode
@@ -26,6 +21,7 @@ def __init__(self, num_envs=1, render_mode=None, log_interval=128, size=8, num_k
             self.terminals, self.truncations, num_envs, seed,
             size=size,
             num_keys=num_keys,
+            obs_dist=obs_dist,
         )
 
     def reset(self, seed=0):