Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ gymnasium==1.1.1
gymnasium[classic-control]==1.1.1
gymnasium[mujoco]==1.1.1
mujoco==3.2.6
envpool==0.8.4

mediapy==1.1.9
natsort==8.4.0
Expand Down
3 changes: 2 additions & 1 deletion scripts/base_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,8 +136,9 @@ def __init__(
self.logger.info(
f"[SEED {self.train_seed} | {self.eval_seed}] Loading Environment: {self.env_config.gym}"
)

self.env, self.env_eval = self.env_factory.create_environment(
self.env_config, self.alg_config.image_observation
self.env_config, self.alg_config.image_observation, train_seed, eval_seed
)

# Set the seed for everything
Expand Down
Empty file.
90 changes: 90 additions & 0 deletions scripts/environments/atari/atari_environment.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
from functools import cached_property

import cv2
import numpy as np
from environments.gym_environment import GymEnvironment
from gymnasium import spaces
from util.configurations import AtariConfig
import envpool


class AtariEnvironment(GymEnvironment):
    """Atari environment backed by envpool's gymnasium interface.

    Wraps a single (num_envs=1) envpool Atari environment and unbatches the
    vectorised outputs so the rest of the framework sees a single-env API.
    """

    def __init__(self, config: AtariConfig, seed: int, evaluation: bool) -> None:
        super().__init__(config)
        self.env = envpool.make_gymnasium(
            config.task,
            num_envs=1,
            seed=seed,
            img_width=config.frame_width,
            img_height=config.frame_height,
            # Standard Atari practice: episodic life and reward clipping are
            # training-time aids and must be DISABLED during evaluation.
            # (The previous code passed `evaluation` directly, inverting both.)
            episodic_life=not evaluation,
            reward_clip=not evaluation,
            stack_num=config.frames_to_stack,
        )
        # Always set the window name so render() never hits an AttributeError;
        # only create the actual window when display is enabled.
        self.name = f"{config.task}-{seed}"
        if config.display == 1:
            cv2.namedWindow(self.name, cv2.WINDOW_GUI_NORMAL)

        self.reset()

    @cached_property
    def max_action_value(self) -> float:
        # Atari uses Discrete action spaces, which have no `.high`; Box is
        # handled for completeness.
        if isinstance(self.env.action_space, spaces.Box):
            return self.env.action_space.high[0]
        if isinstance(self.env.action_space, spaces.Discrete):
            return self.env.action_space.n - 1
        raise ValueError(
            f"Unhandled action space type: {type(self.env.action_space)}"
        )

    @cached_property
    def min_action_value(self) -> float:
        # See max_action_value: Discrete spaces have no `.low`.
        if isinstance(self.env.action_space, spaces.Box):
            return self.env.action_space.low[0]
        if isinstance(self.env.action_space, spaces.Discrete):
            return 0
        raise ValueError(
            f"Unhandled action space type: {type(self.env.action_space)}"
        )

    @cached_property
    def observation_space(self) -> tuple:
        # envpool reports the already-stacked shape, e.g. (stack_num, H, W).
        return self.env.observation_space.shape

    @cached_property
    def action_num(self) -> int:
        if isinstance(self.env.action_space, spaces.Box):
            return self.env.action_space.shape[0]
        if isinstance(self.env.action_space, spaces.Discrete):
            return self.env.action_space.n
        raise ValueError(
            f"Unhandled action space type: {type(self.env.action_space)}"
        )

    def sample_action(self) -> np.ndarray:
        # Wrapped in a length-1 array because envpool expects batched actions.
        return np.array([self.env.action_space.sample()], dtype=int)

    def set_seed(self, seed: int) -> None:
        # NOTE(review): envpool fixes the environment's RNG seed at
        # construction time (see __init__); its reset() takes no seed here.
        # Only the action/observation spaces can be re-seeded after the fact
        # — confirm this satisfies the reproducibility requirements.
        self.env.reset()
        self.env.action_space.seed(seed)
        self.env.observation_space.seed(seed)

    def reset(self, training: bool = True) -> np.ndarray:
        state, _ = self.env.reset()
        # Unbatch: envpool returns observations with a leading env dimension.
        self.state = state[0]
        return self.state

    def _step(self, action: int) -> tuple:
        state, reward, terminated, truncated, info = self.env.step(action)
        self.state = state[0]
        # Unbatch every field; envpool's info dict is discarded here.
        return state[0], reward[0], terminated[0], truncated[0], {}

    def grab_frame(self, height: int = 232, width: int = 232) -> np.ndarray:
        # NOTE(review): with envpool frame stacking the observation is
        # (stack_num, H, W) for grayscale, so ndim is 3 and the RGB branch
        # below may be unreachable — confirm the RGB observation layout
        # before relying on it.
        if len(self.state.shape) == 4:
            # RGB: take the most recent frame's 3 channels, CHW -> HWC.
            frame = cv2.cvtColor(
                np.moveaxis(self.state[-3:], 0, -1), cv2.COLOR_RGB2BGR
            )
        else:
            # Grayscale: replicate the latest stacked frame across 3 channels.
            frame = self.state[-1]
            frame = np.stack([frame] * 3, axis=-1)
        return cv2.resize(frame, (width, height), interpolation=cv2.INTER_CUBIC)

    def render(self):
        # Safe even when display is disabled: self.name is always set in
        # __init__, and cv2.imshow creates a window on demand.
        frame = self.grab_frame()
        cv2.imshow(self.name, frame)
        cv2.waitKey(1)

    def get_overlay_info(self) -> dict:
        # TODO: Add overlay information for gyms as needed
        return {}
14 changes: 12 additions & 2 deletions scripts/environments/environment_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,11 @@ def __init__(self) -> None:
pass

def create_environment(
self, config: GymEnvironmentConfig, image_observation
self,
config: GymEnvironmentConfig,
image_observation,
train_seed: int,
eval_seed: int,
) -> tuple[
BaseEnvironment | MultiModalWrapper,
BaseEnvironment | MultiModalWrapper,
Expand Down Expand Up @@ -69,8 +73,14 @@ def create_environment(

env = SMAC2Environment(config, evaluation=False)
eval_env = SMAC2Environment(config, evaluation=True)
elif isinstance(config, cfg.AtariConfig):
from environments.atari.atari_environment import AtariEnvironment

env = AtariEnvironment(config, train_seed, evaluation=False)
eval_env = AtariEnvironment(config, eval_seed, evaluation=True)
image_observation = False
else:
raise ValueError(f"Unkown environment: {type(config)}")
raise ValueError(f"Unknown environment: {type(config)}")

if isinstance(env, GymEnvironment) and isinstance(eval_env, GymEnvironment):
env = MultiModalWrapper(config, env) if bool(image_observation) else env
Expand Down
7 changes: 6 additions & 1 deletion scripts/training_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,8 @@ def run_training(self) -> None:
state = self.env.reset()
episode_start = time.time()

train_info: dict = {}

# Main training loop
train_step_counter = self.start_training_step
for train_step_counter in range(
Expand Down Expand Up @@ -351,7 +353,10 @@ def run_training(self) -> None:
# total_reward += intrinsic_reward
# info["intrinsic_reward"] = intrinsic_reward

# entropy = self.agent.get_action_entropy(state)

# Store experience in memory
# self.memory.add(state, normalised_action, total_reward, next_state, done, entropy)
self.memory.add(state, normalised_action, total_reward, next_state, done)

state = next_state
Expand All @@ -370,7 +375,6 @@ def run_training(self) -> None:
episode_stats.get_episode_reward(),
episode_end,
)
info |= train_info

# Evaluate agent periodically
if (train_step_counter + 1) % self.number_steps_per_evaluation == 0:
Expand All @@ -382,6 +386,7 @@ def run_training(self) -> None:
# Handle episode completion
if episode_end:
episode_time = time.time() - episode_start
info |= train_info

info.update(episode_stats.summary())

Expand Down
5 changes: 5 additions & 0 deletions scripts/util/configurations.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,11 @@ def model_dump(self, *args, **kwargs):
return data


class AtariConfig(GymEnvironmentConfig):
    # Configuration for envpool-backed Atari environments.
    # Identifier used by the environment factory to select AtariEnvironment.
    gym: ClassVar[str] = "atari"
    # Number of consecutive frames stacked into one observation
    # (passed to envpool as stack_num).
    frames_to_stack: int = 4


class OpenAIConfig(GymEnvironmentConfig):
gym: ClassVar[str] = "openai"

Expand Down