diff --git a/pufferlib/config/ocean/tictactoe.ini b/pufferlib/config/ocean/tictactoe.ini
new file mode 100644
index 000000000..77f84adb9
--- /dev/null
+++ b/pufferlib/config/ocean/tictactoe.ini
@@ -0,0 +1,17 @@
+[base]
+package = ocean
+env_name = puffer_tictactoe
+policy_name = Policy
+rnn_name = Recurrent
+
+[env]
+num_envs = 1024
+
+[vec]
+num_envs = 8
+
+[train]
+total_timesteps = 100_000_000
+gamma = 0.99
+learning_rate = 0.1
+minibatch_size = 32768
diff --git a/pufferlib/ocean/environment.py b/pufferlib/ocean/environment.py
index ed1408ba6..6ceeb52ba 100644
--- a/pufferlib/ocean/environment.py
+++ b/pufferlib/ocean/environment.py
@@ -137,6 +137,7 @@ def make_multiagent(buf=None, **kwargs):
     'snake': 'Snake',
     'squared': 'Squared',
     'pysquared': 'PySquared',
+    'tictactoe': 'TicTacToe',
     'connect4': 'Connect4',
     'g2048': 'G2048',
     'terraform': 'Terraform',
diff --git a/pufferlib/ocean/tictactoe/binding.c b/pufferlib/ocean/tictactoe/binding.c
new file mode 100644
index 000000000..1eab90289
--- /dev/null
+++ b/pufferlib/ocean/tictactoe/binding.c
@@ -0,0 +1,17 @@
+#include "tictactoe.h"
+
+#define Env TicTacToe
+#include "../env_binding.h"
+
+static int my_init(Env* env, PyObject* args, PyObject* kwargs) {
+    // No custom parameters for tictactoe
+    return 0;
+}
+
+static int my_log(PyObject* dict, Log* log) {
+    assign_to_dict(dict, "perf", log->perf);
+    assign_to_dict(dict, "score", log->score);
+    assign_to_dict(dict, "episode_return", log->episode_return);
+    assign_to_dict(dict, "episode_length", log->episode_length);
+    return 0;
+}
diff --git a/pufferlib/ocean/tictactoe/tictactoe.h b/pufferlib/ocean/tictactoe/tictactoe.h
new file mode 100644
index 000000000..5e9f0b623
--- /dev/null
+++ b/pufferlib/ocean/tictactoe/tictactoe.h
@@ -0,0 +1,199 @@
+/* TicTacToe: Two-agent self-play tic-tac-toe */
+
+#include <stdlib.h>
+#include <string.h>
+#include "raylib.h"
+
+const unsigned char EMPTY = 0;
+const unsigned char AGENT = 1;
+const unsigned char ENEMY = 2;
+
+// Required struct. Only use floats!
+typedef struct {
+    float perf; // Win rate (1.0 for win, 0.0 for loss/draw)
+    float score; // Final reward of episode
+    float episode_return; // Sum of rewards over episode (same as score for single-reward games)
+    float episode_length; // Number of agent actions in episode
+    float n; // Required as the last field - number of completed episodes
+} Log;
+
+// Environment struct
+typedef struct {
+    Log log; // Required field
+    unsigned char* observations; // Required. 10 values per agent: 9 cells + 1 turn flag
+    int* actions; // Required. Position 0-8 for each agent
+    float* rewards; // Required
+    unsigned char* terminals; // Required
+    int tick; // Number of agent actions this episode
+    int num_moves; // Total moves on board
+    int current_player; // Which player's turn (0 or 1)
+} TicTacToe;
+
+void add_log(TicTacToe* env) {
+    // Log from agent 0's perspective
+    env->log.perf += (env->rewards[0] > 0) ? 1.0 : 0.0;
+    env->log.score += env->rewards[0];
+    env->log.episode_length += env->tick;
+    env->log.episode_return += env->rewards[0];
+    env->log.n++;
+}
+
+// Compute observations for both agents
+void compute_observations(TicTacToe* env) {
+    // Agent 0: board as-is (obs[0..8]) + turn flag (obs[9])
+    env->observations[9] = (env->current_player == 0) ? 1 : 0;
+
+    // Agent 1: flipped board (obs[10..18]) + turn flag (obs[19])
+    for (int i = 0; i < 9; i++) {
+        unsigned char cell = env->observations[i];
+        env->observations[10 + i] = (cell == 1) ? 2 : (cell == 2) ? 1 : 0;
+    }
+    env->observations[19] = (env->current_player == 1) ? 1 : 0;
+}
+
+// Check if a player has won
+int check_winner(TicTacToe* env, unsigned char player) {
+    unsigned char* board = env->observations;
+
+    // Check rows and columns
+    for (int i = 0; i < 3; i++) {
+        if ((board[i*3] == player && board[i*3+1] == player && board[i*3+2] == player) ||
+            (board[i] == player && board[i+3] == player && board[i+6] == player)) {
+            return 1;
+        }
+    }
+
+    // Check diagonals
+    if ((board[0] == player && board[4] == player && board[8] == player) ||
+        (board[2] == player && board[4] == player && board[6] == player)) {
+        return 1;
+    }
+
+    return 0;
+}
+
+// Required function
+void c_reset(TicTacToe* env) {
+    // Clear board (first 9 cells of observations)
+    memset(env->observations, EMPTY, 9 * sizeof(unsigned char));
+    env->tick = 0;
+    env->num_moves = 0;
+    env->current_player = rand() % 2; // Randomly choose who starts
+
+    // Compute observations for both agents
+    compute_observations(env);
+}
+
+// Required function
+void c_step(TicTacToe* env) {
+    env->tick++;
+
+    // Get action from current player
+    int action = env->actions[env->current_player];
+
+    // Zero out rewards and terminals at the start
+    env->terminals[0] = 0;
+    env->terminals[1] = 0;
+    env->rewards[0] = 0;
+    env->rewards[1] = 0;
+
+    // Check if move is valid
+    if (env->observations[action] != EMPTY) {
+        // Invalid move - current player loses
+        env->terminals[0] = 1;
+        env->terminals[1] = 1;
+        env->rewards[env->current_player] = -1.0;
+        env->rewards[1 - env->current_player] = 1.0;
+        add_log(env);
+        c_reset(env);
+        return;
+    }
+
+    // Make current player's move
+    unsigned char player_piece = (env->current_player == 0) ? AGENT : ENEMY;
+    env->observations[action] = player_piece;
+    env->num_moves++;
+
+    // Check if current player won
+    if (check_winner(env, player_piece)) {
+        env->terminals[0] = 1;
+        env->terminals[1] = 1;
+        env->rewards[env->current_player] = 1.0;
+        env->rewards[1 - env->current_player] = -1.0;
+        add_log(env);
+        c_reset(env);
+        return;
+    }
+
+    // Check for draw (board full)
+    if (env->num_moves == 9) {
+        env->terminals[0] = 1;
+        env->terminals[1] = 1;
+        env->rewards[0] = 0.0;
+        env->rewards[1] = 0.0;
+        add_log(env);
+        c_reset(env);
+        return;
+    }
+
+    // Switch to other player
+    env->current_player = 1 - env->current_player;
+
+    // Update observations for both agents
+    compute_observations(env);
+}
+
+// Required function
+void c_render(TicTacToe* env) {
+    if (!IsWindowReady()) {
+        InitWindow(600, 600, "PufferLib TicTacToe");
+        SetTargetFPS(5);
+    }
+
+    if (IsKeyDown(KEY_ESCAPE)) {
+        exit(0);
+    }
+
+    BeginDrawing();
+    ClearBackground((Color){6, 24, 24, 255});
+
+    int cell_size = 200;
+
+    // Draw grid lines
+    for (int i = 1; i < 3; i++) {
+        DrawLine(i * cell_size, 0, i * cell_size, 600, WHITE);
+        DrawLine(0, i * cell_size, 600, i * cell_size, WHITE);
+    }
+
+    // Draw X's and O's
+    for (int i = 0; i < 9; i++) {
+        int row = i / 3;
+        int col = i % 3;
+        int x = col * cell_size;
+        int y = row * cell_size;
+
+        if (env->observations[i] == AGENT) {
+            // Draw X (blue)
+            int margin = 40;
+            DrawLineEx((Vector2){x + margin, y + margin},
+                (Vector2){x + cell_size - margin, y + cell_size - margin},
+                8.0f, (Color){0, 187, 187, 255});
+            DrawLineEx((Vector2){x + cell_size - margin, y + margin},
+                (Vector2){x + margin, y + cell_size - margin},
+                8.0f, (Color){0, 187, 187, 255});
+        } else if (env->observations[i] == ENEMY) {
+            // Draw O (red)
+            DrawCircle(x + cell_size/2, y + cell_size/2, 60, (Color){187, 0, 0, 255});
+            DrawCircle(x + cell_size/2, y + cell_size/2, 40, (Color){6, 24, 24, 255});
+        }
+    }
+
+    EndDrawing();
+}
+
+// Required function
+void c_close(TicTacToe* env) {
+    if (IsWindowReady()) {
+        CloseWindow();
+    }
+}
diff --git a/pufferlib/ocean/tictactoe/tictactoe.py b/pufferlib/ocean/tictactoe/tictactoe.py
new file mode 100644
index 000000000..9407c4077
--- /dev/null
+++ b/pufferlib/ocean/tictactoe/tictactoe.py
@@ -0,0 +1,64 @@
+'''TicTacToe: Two-agent self-play tic-tac-toe (C version)'''
+
+import gymnasium
+import numpy as np
+
+import pufferlib
+from pufferlib.ocean.tictactoe import binding
+
+class TicTacToe(pufferlib.PufferEnv):
+    def __init__(self, num_envs=1, num_agents=2, render_mode=None, log_interval=128, buf=None, seed=0):
+        self.single_observation_space = gymnasium.spaces.Box(low=0, high=2,
+            shape=(10,), dtype=np.uint8)
+        self.single_action_space = gymnasium.spaces.Discrete(9)
+        self.render_mode = render_mode
+        self.num_agents = num_envs * num_agents
+        self.log_interval = log_interval
+
+        super().__init__(buf)
+        self.c_envs = binding.vec_init(self.observations, self.actions, self.rewards,
+            self.terminals, self.truncations, num_envs, seed)
+
+    def reset(self, seed=0):
+        binding.vec_reset(self.c_envs, seed)
+        self.tick = 0
+        return self.observations, []
+
+    def step(self, actions):
+        self.tick += 1
+
+        self.actions[:] = actions
+        binding.vec_step(self.c_envs)
+
+        info = []
+        if self.tick % self.log_interval == 0:
+            info.append(binding.vec_log(self.c_envs))
+
+        return (self.observations, self.rewards,
+            self.terminals, self.truncations, info)
+
+    def render(self):
+        binding.vec_render(self.c_envs, 0)
+
+    def close(self):
+        binding.vec_close(self.c_envs)
+
+if __name__ == '__main__':
+    N = 4096
+
+    env = TicTacToe(num_envs=N)
+    env.reset()
+    steps = 0
+
+    CACHE = 1024
+    actions = np.random.randint(0, 9, (CACHE, env.num_agents))
+
+    i = 0
+    import time
+    start = time.time()
+    while time.time() - start < 10:
+        env.step(actions[i % CACHE])
+        steps += N
+        i += 1
+
+    print('TicTacToe SPS:', int(steps / (time.time() - start)))