diff --git a/pufferlib/config/ocean/lock_key.ini b/pufferlib/config/ocean/lock_key.ini
new file mode 100644
index 000000000..abe52fa05
--- /dev/null
+++ b/pufferlib/config/ocean/lock_key.ini
@@ -0,0 +1,18 @@
+[base]
+package = ocean
+env_name = puffer_lock_key
+policy_name = Policy
+rnn_name = Recurrent
+
+[env]
+num_envs = 4096
+num_keys = 1
+size = 8
+log_interval = 128
+obs_dist = 2
+
+[train]
+total_timesteps = 20_000_000
+gamma = 0.95
+learning_rate = 0.05
+minibatch_size = 32768
diff --git a/pufferlib/ocean/environment.py b/pufferlib/ocean/environment.py
index 6c56a4ea2..0c2ab1486 100644
--- a/pufferlib/ocean/environment.py
+++ b/pufferlib/ocean/environment.py
@@ -162,6 +162,7 @@ def make_multiagent(buf=None, **kwargs):
     'spaces': make_spaces,
     'multiagent': make_multiagent,
     'slimevolley': 'SlimeVolley',
+    'lock_key': 'LockKey',
 }
 
 def env_creator(name='squared', *args, **kwargs):
diff --git a/pufferlib/ocean/lock_key/binding.c b/pufferlib/ocean/lock_key/binding.c
new file mode 100644
index 000000000..5bec3944f
--- /dev/null
+++ b/pufferlib/ocean/lock_key/binding.c
@@ -0,0 +1,48 @@
+#include "lock_key.h"
+
+#define Env LockKey
+#include "../env_binding.h"
+
+// Read the env parameters from kwargs, validate them and allocate the state grid.
+static int my_init(Env* env, PyObject* args, PyObject* kwargs) {
+    env->size = unpack(kwargs, "size");
+    env->num_keys = unpack(kwargs, "num_keys");
+    env->obs_dist = unpack(kwargs, "obs_dist");
+    if (PyErr_Occurred()) return -1; // do not mask an error raised while unpacking
+
+    // c_reset() rejection-samples a free tile for the lock and for every key,
+    // so the grid must have room for the player, the lock and all the keys.
+    if (env->size < 2 || env->num_keys < 0 || env->obs_dist < 0
+            || env->num_keys > env->size * env->size - 2) {
+        PyErr_SetString(PyExc_ValueError,
+            "lock_key: need size >= 2, obs_dist >= 0 and 0 <= num_keys <= size*size - 2");
+        return -1;
+    }
+
+    int tiles = env->size * env->size;
+    env->state = (unsigned char*)calloc(tiles, sizeof(unsigned char));
+    if (!env->state) {
+        PyErr_NoMemory();
+        return -1;
+    }
+
+    return 0;
+}
+
+// Release the state grid allocated by my_init.
+static int my_close(Env* env) {
+    if (env->state) {
+        free(env->state);
+        env->state = NULL;
+    }
+    return 0;
+}
+
+// Copy the logged metrics into the Python dict.
+static int my_log(PyObject* dict, Log* log) {
+    assign_to_dict(dict, "score", log->score);
+    assign_to_dict(dict, "perf", log->perf);
+    assign_to_dict(dict, "episode_return", log->episode_return);
+    assign_to_dict(dict, "episode_length", log->episode_length);
+    return 0;
+}
diff --git a/pufferlib/ocean/lock_key/lock_key.c b/pufferlib/ocean/lock_key/lock_key.c
new file mode 100644
index 000000000..513c5c419
--- /dev/null
+++ b/pufferlib/ocean/lock_key/lock_key.c
@@ -0,0 +1,58 @@
+#include <stdio.h>
+#include <time.h>
+#include "lock_key.h"
+
+// Interactive demo: hold left shift and use WASD/arrows to steer, otherwise
+// the agent takes random actions.
+int main() {
+    srand((unsigned int)time(NULL));
+
+    LockKey env = {.size = 8, .num_keys = 3, .obs_dist = 2};
+
+    int tiles = env.size * env.size;
+
+    env.state = (unsigned char*)calloc(tiles, sizeof(unsigned char));
+    env.observations = (unsigned char*)calloc(tiles, sizeof(unsigned char));
+    env.actions = (int*)calloc(1, sizeof(int));
+    env.rewards = (float*)calloc(1, sizeof(float));
+    env.terminals = (unsigned char*)calloc(1, sizeof(unsigned char));
+    env.truncations = (unsigned char*)calloc(1, sizeof(unsigned char)); // optional
+    if (!env.state || !env.observations || !env.actions || !env.rewards || !env.terminals) {
+        fprintf(stderr, "lock_key: out of memory\n");
+        return 1;
+    }
+
+    c_reset(&env);
+    c_render(&env);
+
+    while (!WindowShouldClose()) {
+        if (IsKeyDown(KEY_LEFT_SHIFT)) {
+            if (IsKeyDown(KEY_A) || IsKeyDown(KEY_LEFT)) {
+                env.actions[0] = 0;
+            } else if (IsKeyDown(KEY_D) || IsKeyDown(KEY_RIGHT)) {
+                env.actions[0] = 1;
+            } else if (IsKeyDown(KEY_W) || IsKeyDown(KEY_UP)) {
+                env.actions[0] = 2;
+            } else if (IsKeyDown(KEY_S) || IsKeyDown(KEY_DOWN)) {
+                env.actions[0] = 3;
+            } else {
+                env.actions[0] = -1; // no-op
+            }
+        } else {
+            env.actions[0] = rand() % 5; // 4 == no-op, still fine
+        }
+
+        c_step(&env);
+        c_render(&env);
+    }
+
+    free(env.state);
+    free(env.observations);
+    free(env.actions);
+    free(env.rewards);
+    free(env.terminals);
+    if (env.truncations) free(env.truncations);
+
+    c_close(&env);
+    return 0;
+}
diff --git a/pufferlib/ocean/lock_key/lock_key.h b/pufferlib/ocean/lock_key/lock_key.h
new file mode 100644
index 000000000..8228ff9d2
--- /dev/null
+++ b/pufferlib/ocean/lock_key/lock_key.h
@@ -0,0 +1,212 @@
+#ifndef LOCK_KEY_H
+#define LOCK_KEY_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "raylib.h"
+
+// Tile codes stored in `state` and copied into `observations`.
+#define LK_EMPTY 0
+#define LK_PLAYER 1
+#define LK_LOCK 2
+#define LK_KEY 3
+
+static const Color PUFF_RED = (Color){187, 0, 0, 255};
+static const Color PUFF_CYAN = (Color){0, 187, 187, 255};
+static const Color PUFF_GREEN = (Color){0, 187, 0, 255};
+static const Color PUFF_BACKGROUND = (Color){65, 30, 40, 255};
+static const Color PUFF_BLACK = (Color){0, 0, 0, 255};
+
+// Per-field sums over finished episodes; `n` counts the episodes added.
+typedef struct {
+    float perf;
+    float score;
+    float episode_return;
+    float episode_length;
+    float n;
+} Log;
+
+typedef struct {
+    Log log;
+
+    // observations: partial view for agent
+    unsigned char* observations;
+    // state: full system state
+    unsigned char* state;
+
+    int* actions;
+    float* rewards;
+    unsigned char* terminals;
+    unsigned char* truncations;
+
+    int size;
+    int num_keys;
+    int tick;
+    int x;
+    int y;
+    int num_keys_collected;
+    int obs_dist;
+    // running sum of the rewards handed out in the current episode
+    float episode_return;
+} LockKey;
+
+// Flat index of tile (x, y) in the row-major size*size grids.
+static inline int lk_pos(LockKey* env, int x, int y) {
+    return y * env->size + x;
+}
+
+// 1 when (x, y) is at most obs_dist tiles from the player on both axes, else 0.
+static inline int lk_visible(LockKey* env, int x, int y) {
+    int dx = x - env->x; if (dx < 0) dx = -dx;
+    int dy = y - env->y; if (dy < 0) dy = -dy;
+    return (dx > dy ? dx : dy) <= env->obs_dist;
+}
+
+// Copy the visible window of `state` into `observations`; hidden tiles read 0.
+static inline void lk_update_observations(LockKey* env) {
+    int tiles = env->size * env->size;
+    memset(env->observations, 0, tiles * sizeof(unsigned char));
+
+    for (int y = 0; y < env->size; y++) {
+        for (int x = 0; x < env->size; x++) {
+            if (!lk_visible(env, x, y)) continue;
+            int pos = lk_pos(env, x, y);
+            env->observations[pos] = env->state[pos];
+        }
+    }
+}
+
+// Fold the episode that just ended into the running log.
+void add_log(LockKey* env) {
+    env->log.perf += (env->rewards[0] > 0) ? 1 : 0;
+    env->log.score += env->rewards[0];
+    // whole-episode return, not just the terminal reward
+    env->log.episode_return += env->episode_return;
+    env->log.episode_length += env->tick;
+    env->log.n++;
+}
+
+// Start a new episode: player in the centre, lock and keys on random free tiles.
+// Requires num_keys <= size*size - 2, otherwise the sampling loops never end.
+static inline void c_reset(LockKey* env) {
+    int tiles = env->size * env->size;
+    memset(env->state, LK_EMPTY, tiles * sizeof(unsigned char));
+
+    env->x = env->size / 2;
+    env->y = env->size / 2;
+    int player_pos = lk_pos(env, env->x, env->y);
+    env->state[player_pos] = LK_PLAYER;
+    env->tick = 0;
+    env->num_keys_collected = 0;
+    env->episode_return = 0.0f;
+
+    int lock_idx;
+    do lock_idx = rand() % tiles;
+    while (lock_idx == player_pos);
+    env->state[lock_idx] = LK_LOCK;
+
+    for (int i = 0; i < env->num_keys; i++) {
+        int key_idx;
+        do key_idx = rand() % tiles;
+        while (env->state[key_idx] != LK_EMPTY);
+        env->state[key_idx] = LK_KEY;
+    }
+
+    lk_update_observations(env);
+}
+
+// Finish the episode with `reward`, log it and immediately start the next one.
+static inline void lk_end_episode(LockKey* env, float reward) {
+    env->rewards[0] = reward;
+    env->terminals[0] = 1;
+    env->episode_return += reward;
+    add_log(env);
+    c_reset(env);
+}
+
+// Advance one tick. Actions 0-3 move left/right/up/down; anything else is a no-op.
+static inline void c_step(LockKey* env) {
+    env->tick++;
+    env->rewards[0] = -0.1f;
+    env->terminals[0] = 0;
+    if (env->truncations) env->truncations[0] = 0;
+
+    // Lift the player marker off the old tile; a lock stays where it is.
+    int prev_pos = lk_pos(env, env->x, env->y);
+    if (env->state[prev_pos] != LK_LOCK)
+        env->state[prev_pos] = LK_EMPTY;
+
+    int a = env->actions[0];
+    if (a == 0) env->x--;
+    else if (a == 1) env->x++;
+    else if (a == 2) env->y--;
+    else if (a == 3) env->y++;
+
+    // Running out of time or leaving the grid ends the episode with a penalty.
+    int max_steps = 3*env->size + env->num_keys*env->num_keys;
+    if (env->tick > max_steps || env->x < 0 || env->x >= env->size || env->y < 0 || env->y >= env->size) {
+        lk_end_episode(env, -3.0f);
+        return;
+    }
+
+    int pos = lk_pos(env, env->x, env->y);
+
+    if (env->state[pos] == LK_KEY) {
+        env->rewards[0] += 1.0f;
+        env->num_keys_collected++;
+    }
+
+    // The lock only opens once every key has been picked up.
+    if (env->state[pos] == LK_LOCK && env->num_keys_collected == env->num_keys) {
+        lk_end_episode(env, 3.0f);
+        return;
+    }
+
+    if (env->state[pos] != LK_LOCK)
+        env->state[pos] = LK_PLAYER;
+
+    env->episode_return += env->rewards[0];
+    lk_update_observations(env);
+}
+
+// Draw the grid: visible tiles on the background colour, hidden tiles black.
+static inline void c_render(LockKey* env) {
+    if (!IsWindowReady()) {
+        InitWindow(64*env->size, 64*env->size, "LockKey");
+        SetTargetFPS(5);
+    }
+
+    if (IsKeyDown(KEY_ESCAPE)) exit(0);
+
+    BeginDrawing();
+
+    for (int y = 0; y < env->size; y++) {
+        for (int x = 0; x < env->size; x++) {
+            Color bg = lk_visible(env, x, y) ? PUFF_BACKGROUND : PUFF_BLACK;
+            DrawRectangle(x * 64, y * 64, 64, 64, bg);
+
+            int pos = lk_pos(env, x, y);
+            unsigned char v = env->observations[pos];
+            if (!v) continue;
+
+            Color color =
+                (v == LK_PLAYER) ? PUFF_CYAN :
+                (v == LK_LOCK) ? PUFF_RED :
+                (v == LK_KEY) ? PUFF_GREEN :
+                PUFF_BACKGROUND;
+
+            DrawRectangle(x * 64, y * 64, 64, 64, color);
+        }
+    }
+
+    EndDrawing();
+}
+
+// Close the render window if one is open; this does not free any env buffers.
+static inline void c_close(LockKey* env) {
+    (void)env;
+    if (IsWindowReady()) CloseWindow();
+}
+
+#endif
diff --git a/pufferlib/ocean/lock_key/lock_key.py b/pufferlib/ocean/lock_key/lock_key.py
new file mode 100644
index 000000000..2da889ca8
--- /dev/null
+++ b/pufferlib/ocean/lock_key/lock_key.py
@@ -0,0 +1,67 @@
+import gymnasium
+import numpy as np
+import pufferlib
+from pufferlib.ocean.lock_key import binding
+
+
+class LockKey(pufferlib.PufferEnv):
+    """Grid world where the agent collects every key and then steps onto the lock.
+
+    Observations are the flattened size*size grid (0 empty, 1 player, 2 lock,
+    3 key) with tiles further than obs_dist from the player zeroed out.
+    Actions 0-3 move left/right/up/down; 4 leaves the agent in place.
+    """
+
+    def __init__(self, num_envs=1, render_mode=None, log_interval=128, size=8, num_keys=3, buf=None, seed=0, obs_dist=2):
+        # The C reset samples free tiles for the lock and every key, so it only
+        # terminates when the grid can hold the player, the lock and all keys.
+        if size < 2 or obs_dist < 0 or not 0 <= num_keys <= size * size - 2:
+            raise ValueError(
+                f'invalid lock_key config: size={size}, num_keys={num_keys}, obs_dist={obs_dist}'
+            )
+
+        self.single_observation_space = gymnasium.spaces.Box(
+            low=0, high=3, shape=(size * size,), dtype=np.uint8
+        )
+
+        self.single_action_space = gymnasium.spaces.Discrete(5)
+
+        self.render_mode = render_mode
+        self.num_agents = num_envs
+        self.log_interval = log_interval
+        # step() reads the counter, so it must exist even before the first reset()
+        self.tick = 0
+        super().__init__(buf)
+
+        self.c_envs = binding.vec_init(
+            self.observations, self.actions, self.rewards,
+            self.terminals, self.truncations, num_envs, seed,
+            size=size,
+            num_keys=num_keys,
+            obs_dist=obs_dist,
+        )
+
+    def reset(self, seed=0):
+        """Reset every environment and return (observations, infos)."""
+        binding.vec_reset(self.c_envs, seed)
+        self.tick = 0
+        return self.observations, []
+
+    def step(self, actions):
+        """Step every environment; a log entry is attached every log_interval steps."""
+        self.tick += 1
+        self.actions[:] = actions
+        binding.vec_step(self.c_envs)
+        info = []
+        if self.tick % self.log_interval == 0:
+            info.append(binding.vec_log(self.c_envs))
+        return (self.observations, self.rewards,
+                self.terminals, self.truncations, info)
+
+    def render(self):
+        """Render through the binding (called with index 0)."""
+        binding.vec_render(self.c_envs, 0)
+
+    def close(self):
+        """Close the C environments."""
+        binding.vec_close(self.c_envs)
diff --git a/pufferlib/resources/lock_key/puffer_lock_key_weights.bin b/pufferlib/resources/lock_key/puffer_lock_key_weights.bin
new file mode 100644
index 000000000..69e5e0f27
Binary files /dev/null and b/pufferlib/resources/lock_key/puffer_lock_key_weights.bin differ