Add lock_key environment

layterz · layterz · commit 636cbb3a867e · 2025-11-24T12:37:03.000-03:00
diff --git a/pufferlib/config/ocean/lock_key.ini b/pufferlib/config/ocean/lock_key.ini
@@ -0,0 +1,17 @@
+[base]
+package = ocean
+env_name = puffer_lock_key
+policy_name = Policy
+rnn_name = Recurrent
+
+[env]
+num_envs = 4096
+num_keys = 1
+size = 8
+log_interval = 128
+
+[train]
+total_timesteps = 20_000_000
+gamma = 0.95
+learning_rate = 0.05
+minibatch_size = 32768
diff --git a/pufferlib/ocean/environment.py b/pufferlib/ocean/environment.py
@@ -163,6 +163,7 @@ def make_multiagent(buf=None, **kwargs):
     'spaces': make_spaces,
     'multiagent': make_multiagent,
     'slimevolley': 'SlimeVolley',
+    'lock_key': 'LockKey',
 }
 
 def env_creator(name='squared', *args, **kwargs):
diff --git a/pufferlib/ocean/lock_key/binding.c b/pufferlib/ocean/lock_key/binding.c
@@ -0,0 +1,18 @@
+#include "lock_key.h"
+
+#define Env LockKey 
+#include "../env_binding.h"
+
+// Copy the environment's keyword arguments ("size" and "num_keys") into env.
+// args is not used. Always returns 0.
+static int my_init(Env* env, PyObject* args, PyObject* kwargs) {
+    (void)args;
+
+    int grid_size = unpack(kwargs, "size");
+    env->size = grid_size;
+
+    int key_count = unpack(kwargs, "num_keys");
+    env->num_keys = key_count;
+
+    return 0;
+}
+
+// Publish the aggregated Log fields into the Python dict, in the order
+// score, perf, episode_return, episode_length. Always returns 0.
+static int my_log(PyObject* dict, Log* log) {
+    const struct {
+        char* key;
+        float value;
+    } entries[] = {
+        {"score", log->score},
+        {"perf", log->perf},
+        {"episode_return", log->episode_return},
+        {"episode_length", log->episode_length},
+    };
+    const int entry_count = (int)(sizeof(entries) / sizeof(entries[0]));
+
+    for (int i = 0; i < entry_count; i++) {
+        assign_to_dict(dict, entries[i].key, entries[i].value);
+    }
+    return 0;
+}
diff --git a/pufferlib/ocean/lock_key/lock_key b/pufferlib/ocean/lock_key/lock_key
diff --git a/pufferlib/ocean/lock_key/lock_key.c b/pufferlib/ocean/lock_key/lock_key.c
@@ -0,0 +1,37 @@
+#include "lock_key.h"
+
+// Interactive demo for a single 8x8 LockKey environment with 3 keys.
+//
+// While Left Shift is held, WASD / arrow keys choose the move (no key held
+// sends -1, which c_step treats as "stay"). Otherwise a random action in
+// 0..4 is sent every frame. The loop ends when the window is closed.
+//
+// Returns 0 on normal exit, EXIT_FAILURE if a buffer cannot be allocated.
+int main(void) {
+    LockKey env = {.size = 8, .num_keys = 3};
+
+    // truncations stays NULL: c_step only writes to it when it is non-NULL.
+    env.observations = calloc(env.size*env.size, sizeof(unsigned char));
+    env.actions = calloc(1, sizeof(int));
+    env.rewards = calloc(1, sizeof(float));
+    env.terminals = calloc(1, sizeof(unsigned char));
+
+    // c_reset/c_step dereference every buffer, so bail out before touching
+    // the environment if any allocation failed.
+    if (!env.observations || !env.actions || !env.rewards || !env.terminals) {
+        fprintf(stderr, "lock_key: failed to allocate environment buffers\n");
+        free(env.observations);
+        free(env.actions);
+        free(env.rewards);
+        free(env.terminals);
+        return EXIT_FAILURE;
+    }
+
+    c_reset(&env);
+    c_render(&env);
+    while (!WindowShouldClose()) {
+        if (IsKeyDown(KEY_LEFT_SHIFT)) {
+            if (IsKeyDown(KEY_A) || IsKeyDown(KEY_LEFT)) {
+                env.actions[0] = 0;
+            } else if (IsKeyDown(KEY_D) || IsKeyDown(KEY_RIGHT)) {
+                env.actions[0] = 1;
+            } else if (IsKeyDown(KEY_W) || IsKeyDown(KEY_UP)) {
+                env.actions[0] = 2;
+            } else if (IsKeyDown(KEY_S) || IsKeyDown(KEY_DOWN)) {
+                env.actions[0] = 3;
+            } else {
+                env.actions[0] = -1;
+            }
+        } else {
+            env.actions[0] = rand() % 5;
+        }
+        c_step(&env);
+        c_render(&env);
+    }
+    free(env.observations);
+    free(env.actions);
+    free(env.rewards);
+    free(env.terminals);
+    c_close(&env);
+    return 0;
+}
+
diff --git a/pufferlib/ocean/lock_key/lock_key.h b/pufferlib/ocean/lock_key/lock_key.h
@@ -0,0 +1,159 @@
+#ifndef LOCK_KEY_H
+#define LOCK_KEY_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "raylib.h"
+
+// Palette used by c_render: one color per non-empty cell code.
+static const Color PUFF_RED        = (Color){187, 0, 0, 255};    // lock cells (code 2)
+static const Color PUFF_CYAN       = (Color){0, 187, 187, 255};  // agent cell (code 1)
+static const Color PUFF_GREEN      = (Color){0, 187, 0, 255};    // key cells (code 3)
+static const Color PUFF_BACKGROUND = (Color){45, 30, 20, 255};   // clear color; also the fallback for unknown codes
+
+// Running totals that add_log() updates once per finished episode.
+// NOTE(review): the field layout is presumably consumed by ../env_binding.h
+// — confirm before reordering or retyping any field.
+typedef struct {
+    float perf;            // number of episodes whose final-step reward was > 0
+    float score;           // sum of final-step rewards
+    float episode_return;  // also the sum of final-step rewards (see add_log)
+    float episode_length;  // sum of env->tick at episode end
+    float n;               // number of episodes accumulated
+} Log;
+
+// State of a single LockKey environment.
+typedef struct {
+    Log log;                     // episode statistics accumulated by add_log()
+
+    // Buffers supplied by the caller; this header never allocates or frees them.
+    unsigned char* observations; // size*size cells, row-major: 0 empty, 1 agent, 2 lock, 3 key
+    int* actions;                // actions[0]: 0 = x-1, 1 = x+1, 2 = y-1, 3 = y+1, anything else = no move
+    float* rewards;              // rewards[0] is rewritten on every c_step
+    unsigned char* terminals;    // terminals[0] is 1 on the step an episode ends, else 0
+    unsigned char* truncations;  // optional: c_step clears [0] only when this is non-NULL
+
+    int size;                    // side length of the square grid
+    int num_keys;                // keys placed per episode; all must be collected to open the lock
+    int tick;                    // steps taken in the current episode
+    int x;                       // agent column
+    int y;                       // agent row
+    int num_keys_collected;      // keys picked up so far in the current episode
+} LockKey;
+
+// Flatten grid coordinates (x = column, y = row) into the row-major index
+// used for env->observations.
+static inline int lk_pos(LockKey* env, int x, int y) {
+    const int row_start = env->size * y;
+    return row_start + x;
+}
+
+// Fold the episode that just ended into env->log.
+// Call it while rewards[0] still holds the terminal reward and before
+// c_reset() zeroes env->tick.
+//
+// Declared static inline, like the other helpers in this header, so that
+// including the header from more than one translation unit cannot cause
+// duplicate-symbol link errors.
+//
+// NOTE(review): score and episode_return both add only the final step's
+// reward; the per-step penalties and key bonuses earned earlier in the
+// episode are not included — confirm this is intended.
+static inline void add_log(LockKey* env) {
+    const float final_reward = env->rewards[0];
+
+    env->log.perf += (final_reward > 0) ? 1 : 0;  // counts successful episodes
+    env->log.score += final_reward;
+    env->log.episode_return += final_reward;
+    env->log.episode_length += env->tick;
+    env->log.n++;
+}
+
+// Start a new episode: wipe the grid, put the agent on the centre cell,
+// then place one lock and num_keys keys on randomly chosen cells.
+// Placement retries rand() until a suitable cell comes up, so the grid needs
+// at least num_keys + 2 cells for these loops to terminate.
+static inline void c_reset(LockKey* env) {
+    const int cell_count = env->size * env->size;
+    memset(env->observations, 0, cell_count * sizeof(unsigned char));
+
+    env->tick = 0;
+    env->num_keys_collected = 0;
+
+    const int centre = env->size / 2;
+    env->x = centre;
+    env->y = centre;
+    const int agent_cell = lk_pos(env, centre, centre);
+    env->observations[agent_cell] = 1;
+
+    // Lock: any cell other than the agent's.
+    int lock_cell = rand() % cell_count;
+    while (lock_cell == agent_cell) {
+        lock_cell = rand() % cell_count;
+    }
+    env->observations[lock_cell] = 2;
+
+    // Keys: each one goes on a cell that is still empty.
+    for (int placed = 0; placed < env->num_keys; placed++) {
+        int key_cell = rand() % cell_count;
+        while (env->observations[key_cell] != 0) {
+            key_cell = rand() % cell_count;
+        }
+        env->observations[key_cell] = 3;
+    }
+}
+
+// Finish the current episode: publish the terminal reward, log the episode
+// and immediately start the next one.
+static inline void lk_end_episode(LockKey* env, float final_reward) {
+    env->rewards[0] = final_reward;
+    env->terminals[0] = 1;
+    add_log(env);
+    c_reset(env);
+}
+
+// Advance the environment by one step using actions[0].
+//
+// Rewards written to rewards[0]:
+//   -0.1  ordinary step
+//   +0.9  step that lands on a key (-0.1 step cost plus 1.0)
+//   -3.0  agent left the grid or the step limit was exceeded (episode ends)
+//   +3.0  agent reached the lock holding every key (episode ends)
+// When an episode ends the environment is reset before returning, so the
+// observations already describe the next episode.
+static inline void c_step(LockKey* env) {
+    env->tick++;
+    env->rewards[0] = -0.1f;
+    env->terminals[0] = 0;
+    if (env->truncations) {
+        env->truncations[0] = 0;
+    }
+
+    // Erase the agent marker from the cell it is leaving; a lock cell keeps
+    // its own code because the agent is never drawn on top of it.
+    const int prev_cell = lk_pos(env, env->x, env->y);
+    if (env->observations[prev_cell] != 2) {
+        env->observations[prev_cell] = 0;
+    }
+
+    switch (env->actions[0]) {
+        case 0: env->x--; break;
+        case 1: env->x++; break;
+        case 2: env->y--; break;
+        case 3: env->y++; break;
+        default: break;  // any other value leaves the agent where it is
+    }
+
+    // Failure: ran out of steps or walked off the grid.
+    const int step_limit = 3*env->size + env->num_keys*env->num_keys;
+    const int timed_out = env->tick > step_limit;
+    const int off_grid = env->x < 0 || env->x >= env->size
+                      || env->y < 0 || env->y >= env->size;
+    if (timed_out || off_grid) {
+        lk_end_episode(env, -3.0f);
+        return;
+    }
+
+    const int cell = lk_pos(env, env->x, env->y);
+    const unsigned char occupant = env->observations[cell];
+
+    // Picking up a key adds a bonus on top of the step penalty.
+    if (occupant == 3) {
+        env->rewards[0] += 1.0f;
+        env->num_keys_collected++;
+    }
+
+    if (occupant == 2) {
+        // Success: the lock opens only once every key has been collected.
+        if (env->num_keys_collected == env->num_keys) {
+            lk_end_episode(env, 3.0f);
+        }
+        // Otherwise the agent stands on the lock and the cell keeps code 2.
+        return;
+    }
+
+    env->observations[cell] = 1;
+}
+
+// Draw the grid with raylib, opening a window (64 pixels per cell, 5 FPS
+// target) the first time it is called. Exits the process while ESC is held.
+static inline void c_render(LockKey* env) {
+    const int cell_px = 64;
+
+    if (!IsWindowReady()) {
+        InitWindow(cell_px*env->size, cell_px*env->size, "LockKey");
+        SetTargetFPS(5);
+    }
+
+    if (IsKeyDown(KEY_ESCAPE)) {
+        exit(0);
+    }
+
+    BeginDrawing();
+    ClearBackground(PUFF_BACKGROUND);
+
+    for (int row = 0; row < env->size; row++) {
+        for (int col = 0; col < env->size; col++) {
+            Color fill;
+            switch (env->observations[lk_pos(env, col, row)]) {
+                case 0: continue;                 // empty cell: nothing to draw
+                case 1: fill = PUFF_CYAN; break;  // agent
+                case 2: fill = PUFF_RED; break;   // lock
+                case 3: fill = PUFF_GREEN; break; // key
+                default: fill = PUFF_BACKGROUND; break;
+            }
+            DrawRectangle(col * cell_px, row * cell_px, cell_px, cell_px, fill);
+        }
+    }
+
+    EndDrawing();
+}
+
+// Close the raylib window if one is open. env is not used.
+static inline void c_close(LockKey* env) {
+    (void)env;
+    if (!IsWindowReady()) {
+        return;
+    }
+    CloseWindow();
+}
+
+#endif
diff --git a/pufferlib/ocean/lock_key/lock_key.py b/pufferlib/ocean/lock_key/lock_key.py
@@ -0,0 +1,50 @@
+import gymnasium
+import numpy as np
+import pufferlib
+from pufferlib.ocean.lock_key import binding
+
+class LockKey(pufferlib.PufferEnv):
+    def __init__(self, num_envs=1, render_mode=None, log_interval=128, size=8, num_keys=3, buf=None, seed=0):
+        self.size = size
+        self.num_keys = num_keys
+
+        # C writes a flattened size*size uint8 grid with values {0,1,2}
+        self.single_observation_space = gymnasium.spaces.Box(
+            low=0, high=2, shape=(size * size,), dtype=np.uint8
+        )
+
+        # 0=L, 1=R, 2=U, 3=D, 4=NOOP
+        self.single_action_space = gymnasium.spaces.Discrete(5)
+
+        self.render_mode = render_mode
+        self.num_agents = num_envs
+        self.log_interval = log_interval
+        super().__init__(buf)
+
+        self.c_envs = binding.vec_init(
+            self.observations, self.actions, self.rewards,
+            self.terminals, self.truncations, num_envs, seed,
+            size=size,
+            num_keys=num_keys,
+        )
+
+    def reset(self, seed=0):
+        binding.vec_reset(self.c_envs, seed)
+        self.tick = 0
+        return self.observations, []
+
+    def step(self, actions):
+        self.tick += 1
+        self.actions[:] = actions
+        binding.vec_step(self.c_envs)
+        info = []
+        if self.tick % self.log_interval == 0:
+            info.append(binding.vec_log(self.c_envs))
+        return (self.observations, self.rewards,
+                self.terminals, self.truncations, info)
+
+    def render(self):
+        binding.vec_render(self.c_envs, 0)
+
+    def close(self):
+        binding.vec_close(self.c_envs)
diff --git a/pufferlib/resources/lock_key/puffer_lock_key_weights.bin b/pufferlib/resources/lock_key/puffer_lock_key_weights.bin

Original file line number	Diff line number	Diff line change
`@@ -163,6 +163,7 @@ def make_multiagent(buf=None, **kwargs):`
`163`	`163`	`'spaces': make_spaces,`
`164`	`164`	`'multiagent': make_multiagent,`
`165`	`165`	`'slimevolley': 'SlimeVolley',`
	`166`	`+ 'lock_key': 'LockKey',`
`166`	`167`	`}`
`167`	`168`
`168`	`169`	`def env_creator(name='squared', *args, **kwargs):`