Draft
Changes from all commits (34 commits)
7c3346b Fixed deep copy bug temp (PKWadsy, Jul 20, 2024)
311d388 ignored added shell script (PKWadsy, Jul 20, 2024)
9b3009e Added fight script (PKWadsy, Jul 21, 2024)
d463495 Made pyboy env use sample action from env (PKWadsy, Jul 21, 2024)
ec014aa Merge branch 'main' into pokemon-p4p-base (PKWadsy, Jul 22, 2024)
c5e2ad8 Added the discrete policy loop (PKWadsy, Jul 23, 2024)
a707796 Added discrete policy loop for pokemon and fight script (PKWadsy, Aug 7, 2024)
77b9ff9 Merge branch 'main' into pokemon-p4p-base (PKWadsy, Aug 13, 2024)
eea9c31 Removed discrete policy loop (PKWadsy, Aug 13, 2024)
efb3ab9 Fixed import error (PKWadsy, Aug 14, 2024)
68220e9 Fixed policy loop (PKWadsy, Aug 14, 2024)
855c924 Added discrete config (PKWadsy, Sep 10, 2024)
e327a9b Added log comments and discretisation (PKWadsy, Sep 12, 2024)
15f753f Merge branch 'main' into pokemon-p4p-base (PKWadsy, Sep 12, 2024)
1277a7c Updated shell scripts (PKWadsy, Sep 16, 2024)
128d0b8 Made changes to gym which allow image overlay (PKWadsy, Oct 2, 2024)
042b3e1 Added more data saving - especially on highest reward (PKWadsy, Oct 3, 2024)
d58e537 HUGE FIX (PKWadsy, Oct 3, 2024)
f8d13d5 renamed brock to flexi (PKWadsy, Oct 3, 2024)
d99c5ec Set frames to stack back to 3 (PKWadsy, Oct 6, 2024)
d20ad84 Updated dockerfile and requirements (PKWadsy, Oct 6, 2024)
bf94e9f Updated dockerfile (PKWadsy, Oct 6, 2024)
3a4fea3 fixed dockerfile (PKWadsy, Oct 6, 2024)
c4b0026 Fixed dockerfile (PKWadsy, Oct 6, 2024)
1c8676c Fixed dockerfile (PKWadsy, Oct 7, 2024)
92d8cef Removed eval and added better video saving (PKWadsy, Oct 10, 2024)
5083be8 merged with main (beardyFace, Oct 10, 2024)
c6ae3c5 Docker file merge with main (beardyFace, Oct 10, 2024)
6cb4625 Merge branch 'p4p-pokemon-docker' into docker-consolidation (SamBoasman, Mar 5, 2025)
be3f0ab Merge remote-tracking branch 'origin/docker' into docker-consolidation (SamBoasman, Mar 5, 2025)
a9d3abe Merge remote-tracking branch 'origin/pokemon-p4p-base' into pokemon-c… (SamBoasman, Mar 6, 2025)
cec78e6 Merge branch 'main' into pokemon-consolidation (SamBoasman, Mar 6, 2025)
25d5f32 Auto-format code 🧹🌟🤖 (Mar 6, 2025)
83cd7c5 Update directory attribute acess for recording (SamBoasman, Mar 6, 2025)
50 changes: 50 additions & 0 deletions Dockerfile
@@ -0,0 +1,50 @@
# For more information, please refer to https://aka.ms/vscode-docker-python
FROM pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime
SHELL [ "/bin/bash", "-c" ]

RUN apt-get update && \
apt-get upgrade -y && \
apt-get install -y git

# Keeps Python from generating .pyc files in the container
ENV PYTHONDONTWRITEBYTECODE=1

# Turns off buffering for easier container logging
ENV PYTHONUNBUFFERED=1

WORKDIR /workspace

# Install Cares Reinforcement Learning
RUN git clone https://github.com/UoA-CARES/cares_reinforcement_learning.git
WORKDIR /workspace/cares_reinforcement_learning
RUN git checkout -t origin/action-info-logging
Member (on lines +19 to +20): the base docker for everything should be off the release versions

RUN pip3 install -r requirements.txt
RUN pip3 install --editable .

WORKDIR /workspace

# Install Pyboy Environments
RUN git clone https://github.com/UoA-CARES/pyboy_environment.git
WORKDIR /workspace/pyboy_environment
RUN git checkout -t origin/lvl-up-task
RUN pip3 install -r requirements.txt
RUN pip3 install --editable .

WORKDIR /workspace

RUN git clone https://github.com/UoA-CARES/gymnasium_envrionments.git
WORKDIR /workspace/gymnasium_envrionments
RUN git checkout -t origin/p4p-pokemon-docker
RUN pip3 install -r requirements.txt

WORKDIR /root
RUN git clone https://github.com/PKWadsy/cares_pokemon_configs.git cares_rl_configs
Contributor (author): Can probably pull from google drive with some reconfiguration.
Member: yea, we should be putting ROMs on github


# We don't have GUI capabilities
RUN pip3 uninstall -y opencv-python
RUN pip3 install opencv-python-headless

# In case someone doesn't mount a volume at runtime
VOLUME /root/cares_rl_logs

WORKDIR /workspace/gymnasium_envrionments
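For context, building and running this image might look like the following; the tag name and mount are illustrative, not part of the PR:

```shell
# Build the training image (tag name is an assumption)
docker build -t cares-pokemon .

# Mount the logs volume declared in the Dockerfile so results survive the container
docker run --rm -v "$HOME/cares_rl_logs:/root/cares_rl_logs" cares-pokemon \
    python ./scripts/train.py run --gym pyboy --domain pokemon --task fight SACD
```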
2 changes: 1 addition & 1 deletion requirements.txt
@@ -6,6 +6,6 @@ numpy==1.26.4
opencv-contrib-python==4.6.0.66
pydantic==1.10.13
torch==2.3.1
pyboy==2.2.1
pyboy==2.2.2
Contributor (author): Test if v2.5.1 is usable.

mediapy==1.1.9
natsort==8.4.0
4 changes: 4 additions & 0 deletions scripts/environments/gym_environment.py
@@ -21,6 +21,10 @@ def render(self):
def min_action_value(self):
raise NotImplementedError("Override this method")

@abc.abstractmethod
def action_as_string(self, action):
raise NotImplementedError("Override this method")

@cached_property
@abc.abstractmethod
def max_action_value(self):
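A concrete environment must now override `action_as_string`; a minimal sketch of what an implementation could look like (the class and button names here are illustrative, not from this PR):

```python
import abc


class GymEnvironment(abc.ABC):
    @abc.abstractmethod
    def action_as_string(self, action) -> str:
        raise NotImplementedError("Override this method")


class PokemonEnvironment(GymEnvironment):
    # Hypothetical discrete action set for a Game Boy style environment
    _BUTTONS = ["up", "down", "left", "right", "a", "b"]

    def action_as_string(self, action) -> str:
        # Map a discrete action index to a human-readable button name
        return self._BUTTONS[int(action)]


env = PokemonEnvironment()
env.action_as_string(4)  # -> "a"
```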
3 changes: 3 additions & 0 deletions scripts/environments/image_wrapper.py
@@ -42,6 +42,9 @@ def min_action_value(self):
def max_action_value(self):
return self.gym.max_action_value

def action_as_string(self, action):
return self.gym.action_as_string(action)

def render(self):
self.gym.render()

10 changes: 7 additions & 3 deletions scripts/environments/pyboy/pyboy_environment.py
@@ -1,6 +1,7 @@
from functools import cached_property

import numpy as np
import logging
from environments.gym_environment import GymEnvironment
from util.configurations import GymEnvironmentConfig

@@ -17,6 +18,7 @@ def __init__(self, config: GymEnvironmentConfig) -> None:
config.act_freq,
config.emulation_speed,
config.headless,
config.discrete,
)

@cached_property
@@ -36,9 +38,10 @@ def action_num(self) -> int:
return self.env.action_num

def sample_action(self):
return np.random.uniform(
self.min_action_value, self.max_action_value, size=self.action_num
)
return self.env.sample_action()

def action_as_string(self, action):
return self.env.action_as_string(action)

def set_seed(self, seed: int) -> None:
self.env.set_seed(seed)
@@ -47,6 +50,7 @@ def reset(self) -> np.ndarray:
return self.env.reset()

def step(self, action: int) -> tuple:
# debug-log logging.info("Logging109")
Contributor (author): Remove with logging import.

return self.env.step(action)

def grab_frame(self, height=240, width=300) -> np.ndarray:
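Delegating `sample_action` to the wrapped env matters here: uniform floats over the action bounds (the previous behaviour) are not valid actions for a discrete task. A sketch of the distinction, with a hypothetical env class:

```python
import numpy as np


class DiscretePyboyEnv:
    """Hypothetical discrete env: actions are button indices, not float vectors."""

    action_num = 6

    def sample_action(self) -> int:
        # Sample a valid discrete action index rather than a float vector
        return int(np.random.randint(self.action_num))


env = DiscretePyboyEnv()
action = env.sample_action()
assert 0 <= action < env.action_num  # always a legal button index
```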
1 change: 1 addition & 0 deletions scripts/run.py
@@ -5,6 +5,7 @@
"""

import logging
import os
Contributor (author): Possible removal?

import sys
from pathlib import Path

120 changes: 104 additions & 16 deletions scripts/train_loops/policy_loop.py
Member: Delete every single change made to policy loop - not happening
Contributor (author): 💀

@@ -1,11 +1,48 @@
import logging
import time
import os
import inspect
import pandas as pd

from cares_reinforcement_learning.util import helpers as hlp
from cares_reinforcement_learning.util.configurations import (
AlgorithmConfig,
TrainingConfig,
)
import cv2
import numpy as np


def overlay_info(image, **kwargs):
# Create a copy of the image to overlay text
output_image = image.copy()

# Define the position for the text (top-left corner)
text_x, text_y = 10, 30

# Set the font, scale, color, and thickness for the text
font = cv2.FONT_HERSHEY_SIMPLEX
font_scale = 0.4 # Smaller font scale
color = (0, 0, 255) # Red color in BGR
thickness = 1  # Text stroke thickness

# Create overlay text from the kwargs dictionary
overlay_text = "\n".join([f"{key}: {value}" for key, value in kwargs.items()])

# Split the overlay text into lines and calculate position for each line
for i, line in enumerate(overlay_text.split("\n")):
cv2.putText(
output_image,
line,
(text_x, text_y + i * 20),
font,
font_scale,
color,
thickness,
cv2.LINE_AA,
)

return output_image


def evaluate_policy_network(
@@ -29,6 +66,7 @@ def evaluate_policy_network(
while not done and not truncated:
episode_timesteps += 1
normalised_action = agent.select_action_from_policy(state, evaluation=True)

denormalised_action = (
hlp.denormalize(
normalised_action, env.max_action_value, env.min_action_value
@@ -73,6 +111,9 @@ def policy_based_train(
display=False,
normalisation=True,
):

start_new_run = True

start_time = time.time()

max_steps_training = alg_config.max_steps_training
@@ -98,14 +139,26 @@

episode_timesteps = 0
episode_reward = 0
highest_reward = float("-inf")
episode_num = 0

state = env.reset()

# Initialize the DataFrame with specified columns
run_data_rows = []

episode_start = time.time()
for total_step_counter in range(int(max_steps_training)):
episode_timesteps += 1

step_data = {}

if start_new_run:
start_new_run = False
frame = env.grab_frame()
record.start_video("temp_train_video", frame)
run_data_rows = []

if total_step_counter < max_steps_exploration:
logging.info(
f"Running Exploration Steps {total_step_counter + 1}/{max_steps_exploration}"
@@ -125,9 +178,19 @@
noise_scale = max(min_noise, noise_scale)

# algorithm range [-1, 1]
normalised_action = agent.select_action_from_policy(
state, noise_scale=noise_scale
)

# Horrible hack so I don't have to change all the algorithms
select_action_from_policy = agent.select_action_from_policy

if "info" in inspect.signature(select_action_from_policy).parameters:
normalised_action = select_action_from_policy(
state, noise_scale=noise_scale, info=step_data
)
else:
normalised_action = select_action_from_policy(
state, noise_scale=noise_scale
)
Member (on lines +182 to +192): Nope
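In isolation, the signature probe works like this; a minimal sketch with hypothetical policy functions (not the CARES agent API):

```python
import inspect


def new_policy(state, noise_scale=0.1, info=None):
    # Newer-style policy that can log extras into `info`
    if info is not None:
        info["noise"] = noise_scale
    return state


def old_policy(state, noise_scale=0.1):
    # Older-style policy without the `info` parameter
    return state


def select(policy, state, noise_scale, step_data):
    # Pass `info` only when the policy's signature accepts it
    if "info" in inspect.signature(policy).parameters:
        return policy(state, noise_scale=noise_scale, info=step_data)
    return policy(state, noise_scale=noise_scale)
```

Both policy styles then share one call path, which is the compatibility the "horrible hack" buys at the cost of a runtime signature inspection.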


# mapping to env range [e.g. -2, 2 for pendulum] - note for DMCS this is redundant but required for openai
if normalisation:
denormalised_action = hlp.denormalize(
@@ -140,6 +203,11 @@
if display:
env.render()

if record is not None:
frame = env.grab_frame()
frame_with_stats = overlay_info(frame, Reward=f"{episode_reward:.1f}")
record.log_video(frame_with_stats)

intrinsic_reward = 0
if intrinsic_on and total_step_counter > max_steps_exploration:
intrinsic_reward = agent.get_intrinsic_reward(
@@ -159,6 +227,12 @@
state = next_state
episode_reward += reward_extrinsic # Note we only track the extrinsic reward for the episode for proper comparison

step_data["action"] = denormalised_action
step_data["reward"] = total_reward
step_data["episode_reward"] = episode_reward

run_data_rows.append(step_data)

info = {}
if (
total_step_counter >= max_steps_exploration
@@ -170,18 +244,6 @@
if intrinsic_on:
info["intrinsic_reward"] = intrinsic_reward

if (total_step_counter + 1) % number_steps_per_evaluation == 0:
logging.info("*************--Evaluation Loop--*************")
evaluate_policy_network(
env_eval,
agent,
train_config,
record=record,
total_steps=total_step_counter,
normalisation=normalisation,
)
logging.info("--------------------------------------------")

if done or truncated:
episode_time = time.time() - episode_start
record.log_train(
@@ -190,11 +252,37 @@
episode_steps=episode_timesteps,
episode_reward=episode_reward,
episode_time=episode_time,
**info,
info=info,
display=True,
)

record.stop_video()
video_dir = os.path.join(record.current_sub_directory, "videos")
data_dir = os.path.join(record.current_sub_directory, "data")

run_csv = os.path.join(data_dir, f"episode_{episode_num}.csv")
pd.DataFrame(run_data_rows).to_csv(run_csv, index=False)

if episode_reward > highest_reward:

highest_reward = episode_reward

new_record_video = os.path.join(
video_dir, f"new_record_episode_{episode_num+1}.mp4"
)
training_video = os.path.join(video_dir, "temp_train_video.mp4")

logging.info(
f"New highest reward of {episode_reward}. Saving video and run data..."
)

try:
os.rename(training_video, new_record_video)
except OSError:
logging.error("An error occurred while renaming the video :/")

# Reset environment
start_new_run = True
state = env.reset()
episode_timesteps = 0
episode_reward = 0
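For reference, the normalisation step in this loop maps the agent's [-1, 1] output into the env's action range; a sketch of what `hlp.denormalize` is assumed to compute (the real helper lives in cares_reinforcement_learning):

```python
def denormalize(action, max_value, min_value):
    # Linearly map [-1, 1] onto [min_value, max_value]
    return (action + 1.0) * 0.5 * (max_value - min_value) + min_value


# Pendulum-style range [-2, 2]
denormalize(0.0, 2.0, -2.0)  # -> 0.0
```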
23 changes: 13 additions & 10 deletions scripts/util/configurations.py
@@ -3,6 +3,7 @@
"""

from pathlib import Path
from typing import Optional

from cares_reinforcement_learning.util.configurations import EnvironmentConfig
from pydantic import Field
@@ -23,21 +24,23 @@ class GymEnvironmentConfig(EnvironmentConfig):
act_freq (int): Action frequency (default: 24)
emulation_speed (int): Emulation speed (default: 0)
headless (bool): Whether to run in headless mode (default: False)
discrete (bool): Whether action space is discrete (default: False)
"""

gym: str = Field(description="Gym Environment <openai, dmcs, pyboy>")
task: str
domain: str = ""
display: int = 0
domain: Optional[str] = ""
display: Optional[int] = 0
Member (on lines +32 to +33): remove the redundant Optional


# image observation configurations
frames_to_stack: int = 3
frame_width: int = 84
frame_height: int = 84
grey_scale: int = 0
frames_to_stack: Optional[int] = 3
frame_width: Optional[int] = 84
frame_height: Optional[int] = 84
grey_scale: Optional[int] = 0

# pyboy configurations TODO move...
rom_path: str = f"{Path.home()}/cares_rl_configs"
act_freq: int = 24
emulation_speed: int = 0
headless: int = 1
rom_path: Optional[str] = f"{Path.home()}/cares_rl_configs"
act_freq: Optional[int] = 24
emulation_speed: Optional[int] = 0
headless: Optional[int] = 1
discrete: Optional[int] = 0
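On the reviewer's point about `Optional`: in pydantic, any field with a default is already optional to supply, so `Optional[int] = 3` and `int = 3` accept the same configs (`Optional` only additionally permits an explicit `None`). A minimal sketch against pydantic v1, matching the pinned `pydantic==1.10.13` (class name is illustrative):

```python
from pydantic import BaseModel


class GymConfigSketch(BaseModel):
    # Defaults alone make these fields optional to supply
    frames_to_stack: int = 3
    headless: int = 1


cfg = GymConfigSketch()  # no arguments required
cfg.frames_to_stack      # -> 3
GymConfigSketch(headless=0).headless  # -> 0
```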
1 change: 1 addition & 0 deletions shell-scripts/catch.sh
@@ -0,0 +1 @@
python ./scripts/train.py run --gym pyboy --number_eval_episodes 1 --plot_frequency 10000000 --headless 1 --discrete 1 --domain pokemon --task catch SACDAE
1 change: 1 addition & 0 deletions shell-scripts/fight-sacae.sh
@@ -0,0 +1 @@
python ./scripts/train.py run --gym pyboy --number_eval_episodes 1 --plot_frequency 10000000 --headless 1 --discrete 0 --domain pokemon --task fight SACAE
1 change: 1 addition & 0 deletions shell-scripts/fight-sacd.sh
@@ -0,0 +1 @@
python ./scripts/train.py run --gym pyboy --number_eval_episodes 1 --plot_frequency 10000000 --headless 1 --discrete 1 --domain pokemon --task fight SACD
1 change: 1 addition & 0 deletions shell-scripts/fight-sacdae.sh
@@ -0,0 +1 @@
python ./scripts/train.py run --gym pyboy --number_eval_episodes 1 --plot_frequency 10000000 --headless 1 --discrete 1 --domain pokemon --task fight NaSATD3
1 change: 1 addition & 0 deletions shell-scripts/flexi.sh
@@ -0,0 +1 @@
python ./scripts/train.py run --gym pyboy --number_eval_episodes 1 --plot_frequency 10000000 --headless 1 --discrete 0 --domain pokemon --task flexi SACAE