Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions pufferlib/config/ocean/tictactoe.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
[base]
package = ocean
env_name = puffer_tictactoe
policy_name = Policy
rnn_name = Recurrent

[env]
num_envs = 1024

[vec]
num_envs = 8

[train]
total_timesteps = 100_000_000
gamma = 0.99
learning_rate = 0.1
minibatch_size = 32768
1 change: 1 addition & 0 deletions pufferlib/ocean/environment.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,7 @@ def make_multiagent(buf=None, **kwargs):
'snake': 'Snake',
'squared': 'Squared',
'pysquared': 'PySquared',
'tictactoe': 'TicTacToe',
'connect4': 'Connect4',
'g2048': 'G2048',
'terraform': 'Terraform',
Expand Down
17 changes: 17 additions & 0 deletions pufferlib/ocean/tictactoe/binding.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#include "tictactoe.h"

#define Env TicTacToe
#include "../env_binding.h"

// Per-env init hook invoked by the shared env_binding.h machinery.
// TicTacToe takes no custom kwargs, so this always succeeds.
static int my_init(Env* env, PyObject* args, PyObject* kwargs) {
// No custom parameters for tictactoe
return 0;
}

// Export accumulated Log fields into the Python stats dict returned by
// vec_log. NOTE(review): log->n is not exported here — presumably the
// shared binding layer uses it to normalize these sums; confirm in
// env_binding.h.
static int my_log(PyObject* dict, Log* log) {
assign_to_dict(dict, "perf", log->perf);
assign_to_dict(dict, "score", log->score);
assign_to_dict(dict, "episode_return", log->episode_return);
assign_to_dict(dict, "episode_length", log->episode_length);
return 0;
}
199 changes: 199 additions & 0 deletions pufferlib/ocean/tictactoe/tictactoe.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,199 @@
/* TicTacToe: Two-agent self-play tic-tac-toe */

#include <stdlib.h>
#include <string.h>
#include "raylib.h"

// Board cell contents, stored directly in observations[0..8].
const unsigned char EMPTY = 0;
const unsigned char AGENT = 1; // player 0's piece (drawn as X)
const unsigned char ENEMY = 2; // player 1's piece (drawn as O)

// Required struct. Only use floats!
// Fields accumulate sums across completed episodes (see add_log); n counts
// episodes. All stats are recorded from agent 0's perspective.
typedef struct {
float perf; // +1 per episode where agent 0's final reward > 0 (a win); loss/draw add 0
float score; // Sum of agent 0's final episode rewards
float episode_return; // Sum of agent 0's rewards (equal to score: one terminal reward per episode)
float episode_length; // Sum of per-episode tick counts (moves attempted by both players)
float n; // Required as the last field - number of completed episodes
} Log;

// Environment struct
typedef struct {
Log log; // Required field
unsigned char* observations; // Required. 10 values per agent (20 total). [0..8] is the canonical board + [9] agent 0's turn flag; [10..18] piece-swapped board + [19] agent 1's turn flag
int* actions; // Required. Board position 0-8 per agent; only the current player's entry is read each step
float* rewards; // Required. Per agent: +1 win, -1 loss or invalid move, 0 otherwise
unsigned char* terminals; // Required. Set for both agents on win/draw/invalid move
int tick; // Number of c_step calls (moves attempted) this episode
int num_moves; // Pieces successfully placed on the board
int current_player; // Which player's turn (0 or 1)
} TicTacToe;

// Fold the finished episode's stats into the running log, recorded from
// agent 0's perspective (a positive final reward counts as a win for perf).
void add_log(TicTacToe* env) {
    float reward0 = env->rewards[0];
    if (reward0 > 0) {
        env->log.perf += 1.0;
    }
    env->log.score += reward0;
    env->log.episode_return += reward0;
    env->log.episode_length += env->tick;
    env->log.n += 1;
}

// Refresh both agents' observation views after a board or turn change.
// Agent 0 reads the canonical board in obs[0..8]; agent 1 reads a
// piece-swapped copy in obs[10..18]. obs[9]/obs[19] flag whose turn it is.
void compute_observations(TicTacToe* env) {
    unsigned char* obs = env->observations;

    obs[9] = (env->current_player == 0);

    for (int i = 0; i < 9; i++) {
        unsigned char cell = obs[i];
        unsigned char swapped = EMPTY;
        if (cell == AGENT) {
            swapped = ENEMY;
        } else if (cell == ENEMY) {
            swapped = AGENT;
        }
        obs[10 + i] = swapped;
    }

    obs[19] = (env->current_player == 1);
}

// Return 1 if `player` occupies any of the 8 winning lines, else 0.
int check_winner(TicTacToe* env, unsigned char player) {
    static const int LINES[8][3] = {
        {0, 1, 2}, {3, 4, 5}, {6, 7, 8}, // rows
        {0, 3, 6}, {1, 4, 7}, {2, 5, 8}, // columns
        {0, 4, 8}, {2, 4, 6},            // diagonals
    };
    const unsigned char* board = env->observations;

    for (int l = 0; l < 8; l++) {
        if (board[LINES[l][0]] == player &&
            board[LINES[l][1]] == player &&
            board[LINES[l][2]] == player) {
            return 1;
        }
    }
    return 0;
}

// Required function. Start a fresh episode: empty board, counters zeroed,
// and a coin flip for which player moves first.
void c_reset(TicTacToe* env) {
    // The board lives in the first 9 cells of the observation buffer.
    for (int i = 0; i < 9; i++) {
        env->observations[i] = EMPTY;
    }
    env->tick = 0;
    env->num_moves = 0;
    env->current_player = rand() % 2;

    // Publish the cleared board and turn flags to both agents.
    compute_observations(env);
}

// Required function. Advance the game by one move from the current player.
// Reads actions[current_player] as a board index. Episodes terminate on a
// win, a draw (full board), or an invalid move (out-of-range index or
// occupied cell, which counts as a loss for the mover); terminal steps log
// stats and immediately reset the board.
void c_step(TicTacToe* env) {
    env->tick++;

    // Only the player whose turn it is acts this step.
    int action = env->actions[env->current_player];

    // Clear last step's transients before computing this step's outcome.
    env->terminals[0] = 0;
    env->terminals[1] = 0;
    env->rewards[0] = 0;
    env->rewards[1] = 0;

    // Invalid move: an out-of-range index (previously indexed the board
    // unchecked — out-of-bounds read/write, undefined behavior) or an
    // already-occupied cell. The offending player loses, the opponent wins.
    if (action < 0 || action > 8 || env->observations[action] != EMPTY) {
        env->terminals[0] = 1;
        env->terminals[1] = 1;
        env->rewards[env->current_player] = -1.0;
        env->rewards[1 - env->current_player] = 1.0;
        add_log(env);
        c_reset(env);
        return;
    }

    // Place the mover's piece: agent 0 plays AGENT, agent 1 plays ENEMY.
    unsigned char player_piece = (env->current_player == 0) ? AGENT : ENEMY;
    env->observations[action] = player_piece;
    env->num_moves++;

    // Win: the mover just completed a line.
    if (check_winner(env, player_piece)) {
        env->terminals[0] = 1;
        env->terminals[1] = 1;
        env->rewards[env->current_player] = 1.0;
        env->rewards[1 - env->current_player] = -1.0;
        add_log(env);
        c_reset(env);
        return;
    }

    // Draw: board full with no winner.
    if (env->num_moves == 9) {
        env->terminals[0] = 1;
        env->terminals[1] = 1;
        env->rewards[0] = 0.0;
        env->rewards[1] = 0.0;
        add_log(env);
        c_reset(env);
        return;
    }

    // Game continues: hand the turn to the other player.
    env->current_player = 1 - env->current_player;
    compute_observations(env);
}

// Required function. Draw the 600x600 board with raylib: blue X for AGENT
// pieces, red O for ENEMY pieces. Lazily opens the window on first call;
// ESC exits the process.
void c_render(TicTacToe* env) {
    if (!IsWindowReady()) {
        InitWindow(600, 600, "PufferLib TicTacToe");
        SetTargetFPS(5);
    }
    if (IsKeyDown(KEY_ESCAPE)) {
        exit(0);
    }

    BeginDrawing();
    ClearBackground((Color){6, 24, 24, 255});

    const int cell_size = 200;

    // Grid lines at 1/3 and 2/3 of the window in both axes.
    for (int k = 1; k < 3; k++) {
        DrawLine(k * cell_size, 0, k * cell_size, 600, WHITE);
        DrawLine(0, k * cell_size, 600, k * cell_size, WHITE);
    }

    // Pieces, one board cell at a time.
    for (int cell = 0; cell < 9; cell++) {
        int x = (cell % 3) * cell_size;
        int y = (cell / 3) * cell_size;
        unsigned char piece = env->observations[cell];

        if (piece == AGENT) {
            // X: two thick diagonal strokes inset by a margin.
            int margin = 40;
            Color x_color = (Color){0, 187, 187, 255};
            DrawLineEx((Vector2){x + margin, y + margin},
                       (Vector2){x + cell_size - margin, y + cell_size - margin},
                       8.0f, x_color);
            DrawLineEx((Vector2){x + cell_size - margin, y + margin},
                       (Vector2){x + margin, y + cell_size - margin},
                       8.0f, x_color);
        } else if (piece == ENEMY) {
            // O: red disc with a background-colored hole.
            DrawCircle(x + cell_size/2, y + cell_size/2, 60, (Color){187, 0, 0, 255});
            DrawCircle(x + cell_size/2, y + cell_size/2, 40, (Color){6, 24, 24, 255});
        }
    }

    EndDrawing();
}

// Required function. Tear down the render window if one was ever opened.
void c_close(TicTacToe* env) {
    if (!IsWindowReady()) {
        return;
    }
    CloseWindow();
}
64 changes: 64 additions & 0 deletions pufferlib/ocean/tictactoe/tictactoe.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
'''TicTacToe: Two-agent self-play tic-tac-toe (C version)'''

import gymnasium
import numpy as np

import pufferlib
from pufferlib.ocean.tictactoe import binding

class TicTacToe(pufferlib.PufferEnv):
    '''Vectorized two-player tic-tac-toe backed by the C binding.

    Each of the `num_envs` boards exposes `num_agents` (2) agents; the flat
    agent count is num_envs * num_agents. Observations are 10 uint8 values
    per agent (9 board cells plus a my-turn flag), actions are board
    positions 0-8.
    '''

    def __init__(self, num_envs=1, num_agents=2, render_mode=None,
            log_interval=128, buf=None, seed=0):
        # 9 cells (0=empty, 1=own piece, 2=opponent piece) + turn flag.
        self.single_observation_space = gymnasium.spaces.Box(
            low=0, high=2, shape=(10,), dtype=np.uint8)
        # One board position per action.
        self.single_action_space = gymnasium.spaces.Discrete(9)
        self.render_mode = render_mode
        self.log_interval = log_interval
        self.num_agents = num_envs * num_agents

        super().__init__(buf)
        self.c_envs = binding.vec_init(
            self.observations, self.actions, self.rewards,
            self.terminals, self.truncations, num_envs, seed)

    def reset(self, seed=0):
        binding.vec_reset(self.c_envs, seed)
        self.tick = 0
        return self.observations, []

    def step(self, actions):
        self.tick += 1
        self.actions[:] = actions
        binding.vec_step(self.c_envs)

        # Logs are aggregated in C; fetch them only every log_interval steps.
        infos = []
        if self.tick % self.log_interval == 0:
            infos.append(binding.vec_log(self.c_envs))

        return (self.observations, self.rewards,
            self.terminals, self.truncations, infos)

    def render(self):
        binding.vec_render(self.c_envs, 0)

    def close(self):
        binding.vec_close(self.c_envs)

if __name__ == '__main__':
    import time

    N = 4096
    CACHE = 1024

    env = TicTacToe(num_envs=N)
    env.reset()

    # Pre-generate random actions so the loop benchmarks the env, not RNG.
    cached_actions = np.random.randint(0, 9, (CACHE, N))

    steps = 0
    iteration = 0
    start = time.time()
    while time.time() - start < 10:
        env.step(cached_actions[iteration % CACHE])
        steps += N
        iteration += 1

    print('TicTacToe SPS:', int(steps / (time.time() - start)))