diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..f7ed58a --- /dev/null +++ b/Dockerfile @@ -0,0 +1,50 @@ +# For more information, please refer to https://aka.ms/vscode-docker-python +FROM pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime +SHELL [ "/bin/bash", "-c" ] + +RUN apt-get update && \ + apt-get upgrade -y && \ + apt-get install -y git + +# Keeps Python from generating .pyc files in the container +ENV PYTHONDONTWRITEBYTECODE=1 + +# Turns off buffering for easier container logging +ENV PYTHONUNBUFFERED=1 + +WORKDIR /workspace + +# Install Cares Reinforcement Learning +RUN git clone https://github.com/UoA-CARES/cares_reinforcement_learning.git +WORKDIR /workspace/cares_reinforcement_learning +RUN git checkout -t origin/action-info-logging +RUN pip3 install -r requirements.txt +RUN pip3 install --editable . + +WORKDIR /workspace + +# Install Pyboy Environments +RUN git clone https://github.com/UoA-CARES/pyboy_environment.git +WORKDIR /workspace/pyboy_environment +RUN git checkout -t origin/lvl-up-task +RUN pip3 install -r requirements.txt +RUN pip3 install --editable . 
+ +WORKDIR /workspace + +RUN git clone https://github.com/UoA-CARES/gymnasium_envrionments.git +WORKDIR /workspace/gymnasium_envrionments +RUN git checkout -t origin/p4p-pokemon-docker +RUN pip3 install -r requirements.txt + +WORKDIR /root +RUN git clone https://github.com/PKWadsy/cares_pokemon_configs.git cares_rl_configs + +# We don't have GUI capabilities +RUN pip3 uninstall -y opencv-python +RUN pip3 install opencv-python-headless + +# In case someone doesn't mount volume at runtime +VOLUME /root/cares_rl_logs + +WORKDIR /workspace/gymnasium_envrionments \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index ec489ef..5c97769 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,6 +6,6 @@ numpy==1.26.4 opencv-contrib-python==4.6.0.66 pydantic==1.10.13 torch==2.3.1 -pyboy==2.2.1 +pyboy==2.2.2 mediapy==1.1.9 natsort==8.4.0 \ No newline at end of file diff --git a/scripts/environments/gym_environment.py b/scripts/environments/gym_environment.py index a779a60..ed7204c 100644 --- a/scripts/environments/gym_environment.py +++ b/scripts/environments/gym_environment.py @@ -21,6 +21,10 @@ def render(self): def min_action_value(self): raise NotImplementedError("Override this method") + @abc.abstractmethod + def action_as_string(self, action): + raise NotImplementedError("Override this method") + @cached_property @abc.abstractmethod def max_action_value(self): diff --git a/scripts/environments/image_wrapper.py b/scripts/environments/image_wrapper.py index 8900b4b..b1f4b58 100644 --- a/scripts/environments/image_wrapper.py +++ b/scripts/environments/image_wrapper.py @@ -42,6 +42,9 @@ def min_action_value(self): def max_action_value(self): return self.gym.max_action_value + def action_as_string(self, action): + return self.gym.action_as_string(action) + def render(self): self.gym.render() diff --git a/scripts/environments/pyboy/pyboy_environment.py b/scripts/environments/pyboy/pyboy_environment.py index 5159744..6f916ed 100644 ---
a/scripts/environments/pyboy/pyboy_environment.py +++ b/scripts/environments/pyboy/pyboy_environment.py @@ -1,6 +1,7 @@ from functools import cached_property import numpy as np +import logging from environments.gym_environment import GymEnvironment from util.configurations import GymEnvironmentConfig @@ -17,6 +18,7 @@ def __init__(self, config: GymEnvironmentConfig) -> None: config.act_freq, config.emulation_speed, config.headless, + config.discrete, ) @cached_property @@ -36,9 +38,10 @@ def action_num(self) -> int: return self.env.action_num def sample_action(self): - return np.random.uniform( - self.min_action_value, self.max_action_value, size=self.action_num - ) + return self.env.sample_action() + + def action_as_string(self, action): + return self.env.action_as_string(action) def set_seed(self, seed: int) -> None: self.env.set_seed(seed) @@ -47,6 +50,7 @@ def reset(self) -> np.ndarray: return self.env.reset() def step(self, action: int) -> tuple: + # debug-log logging.info("Logging109") return self.env.step(action) def grab_frame(self, height=240, width=300) -> np.ndarray: diff --git a/scripts/run.py b/scripts/run.py index f2ab106..0d13d27 100644 --- a/scripts/run.py +++ b/scripts/run.py @@ -5,6 +5,7 @@ """ import logging +import os import sys from pathlib import Path diff --git a/scripts/train_loops/policy_loop.py b/scripts/train_loops/policy_loop.py index 5dce4fd..d290360 100644 --- a/scripts/train_loops/policy_loop.py +++ b/scripts/train_loops/policy_loop.py @@ -1,11 +1,48 @@ import logging import time +import os +import inspect +import pandas as pd from cares_reinforcement_learning.util import helpers as hlp from cares_reinforcement_learning.util.configurations import ( AlgorithmConfig, TrainingConfig, ) +import cv2 +import numpy as np + + +def overlay_info(image, **kwargs): + # Create a copy of the image to overlay text + output_image = image.copy() + + # Define the position for the text (top-left corner) + text_x, text_y = 10, 30 + + # Set the font, 
scale, color, and thickness for the text + font = cv2.FONT_HERSHEY_SIMPLEX + font_scale = 0.4 # Smaller font scale + color = (0, 0, 255) # Red color in BGR + thickness = 1 # Text thickness + + # Create overlay text from the kwargs dictionary + overlay_text = "\n".join([f"{key}: {value}" for key, value in kwargs.items()]) + + # Split the overlay text into lines and calculate position for each line + for i, line in enumerate(overlay_text.split("\n")): + cv2.putText( + output_image, + line, + (text_x, text_y + i * 20), + font, + font_scale, + color, + thickness, + cv2.LINE_AA, + ) + + return output_image def evaluate_policy_network( @@ -29,6 +66,7 @@ def evaluate_policy_network( while not done and not truncated: episode_timesteps += 1 normalised_action = agent.select_action_from_policy(state, evaluation=True) + denormalised_action = ( hlp.denormalize( normalised_action, env.max_action_value, env.min_action_value @@ -73,6 +111,9 @@ def policy_based_train( display=False, normalisation=True, ): + + start_new_run = True + start_time = time.time() max_steps_training = alg_config.max_steps_training @@ -98,14 +139,26 @@ def policy_based_train( episode_timesteps = 0 episode_reward = 0 + highest_reward = float("-inf") episode_num = 0 state = env.reset() + # Per-step rows, written out as a CSV (via DataFrame) at episode end + run_data_rows = [] + episode_start = time.time() for total_step_counter in range(int(max_steps_training)): episode_timesteps += 1 + step_data = {} + + if start_new_run: + start_new_run = False + frame = env.grab_frame() + record.start_video("temp_train_video", frame) + run_data_rows = [] + if total_step_counter < max_steps_exploration: logging.info( f"Running Exploration Steps {total_step_counter + 1}/{max_steps_exploration}" @@ -125,9 +178,19 @@ def policy_based_train( noise_scale = max(min_noise, noise_scale) # algorithm range [-1, 1] - normalised_action = agent.select_action_from_policy( - state, noise_scale=noise_scale - ) + + # Horrible hack so I don't have to
change all the algorithms + select_action_from_policy = agent.select_action_from_policy + + if "info" in inspect.signature(select_action_from_policy).parameters: + normalised_action = select_action_from_policy( + state, noise_scale=noise_scale, info=step_data + ) + else: + normalised_action = select_action_from_policy( + state, noise_scale=noise_scale + ) + # mapping to env range [e.g. -2 , 2 for pendulum] - note for DMCS this is redudenant but required for openai if normalisation: denormalised_action = hlp.denormalize( @@ -140,6 +203,11 @@ def policy_based_train( if display: env.render() + if record is not None: + frame = env.grab_frame() + frame_with_stats = overlay_info(frame, Reward=f"{episode_reward:.1f}") + record.log_video(frame_with_stats) + intrinsic_reward = 0 if intrinsic_on and total_step_counter > max_steps_exploration: intrinsic_reward = agent.get_intrinsic_reward( @@ -159,6 +227,12 @@ def policy_based_train( state = next_state episode_reward += reward_extrinsic # Note we only track the extrinsic reward for the episode for proper comparison + step_data["action"] = denormalised_action + step_data["reward"] = total_reward + step_data["episode_reward"] = episode_reward + + run_data_rows.append(step_data) + info = {} if ( total_step_counter >= max_steps_exploration @@ -170,18 +244,6 @@ def policy_based_train( if intrinsic_on: info["intrinsic_reward"] = intrinsic_reward - if (total_step_counter + 1) % number_steps_per_evaluation == 0: - logging.info("*************--Evaluation Loop--*************") - evaluate_policy_network( - env_eval, - agent, - train_config, - record=record, - total_steps=total_step_counter, - normalisation=normalisation, - ) - logging.info("--------------------------------------------") - if done or truncated: episode_time = time.time() - episode_start record.log_train( @@ -190,11 +252,37 @@ def policy_based_train( episode_steps=episode_timesteps, episode_reward=episode_reward, episode_time=episode_time, - **info, + info=info, 
display=True, ) + record.stop_video() + video_dir = os.path.join(record.current_sub_directory, "videos") + data_dir = os.path.join(record.current_sub_directory, "data") + + run_csv = os.path.join(data_dir, f"episode_{episode_num}.csv") + pd.DataFrame(run_data_rows).to_csv(run_csv, index=False) + + if episode_reward > highest_reward: + + highest_reward = episode_reward + + new_record_video = os.path.join( + video_dir, f"new_record_episode_{episode_num+1}.mp4" + ) + training_video = os.path.join(video_dir, "temp_train_video.mp4") + + logging.info( + f"New highest reward of {episode_reward}. Saving video and run data..." + ) + + try: + os.rename(training_video, new_record_video) + except OSError: + logging.error("An error renaming the video occurred :/") + # Reset environment + start_new_run = True state = env.reset() episode_timesteps = 0 episode_reward = 0 diff --git a/scripts/util/configurations.py b/scripts/util/configurations.py index a891c90..3e8994e 100755 --- a/scripts/util/configurations.py +++ b/scripts/util/configurations.py @@ -3,6 +3,7 @@ """ from pathlib import Path +from typing import Optional from cares_reinforcement_learning.util.configurations import EnvironmentConfig from pydantic import Field @@ -23,21 +24,23 @@ class GymEnvironmentConfig(EnvironmentConfig): act_freq (int): Action frequency (default: 24) emulation_speed (int): Emulation speed (default: 0) headless (bool): Whether to run in headless mode (default: False) + discrete (bool): Whether action space is discrete (default: False) """ gym: str = Field(description="Gym Environment ") task: str - domain: str = "" - display: int = 0 + domain: Optional[str] = "" + display: Optional[int] = 0 # image observation configurations - frames_to_stack: int = 3 - frame_width: int = 84 - frame_height: int = 84 - grey_scale: int = 0 + frames_to_stack: Optional[int] = 3 + frame_width: Optional[int] = 84 + frame_height: Optional[int] = 84 + grey_scale: Optional[int] = 0 # pyboy configurations TODO move...
- rom_path: str = f"{Path.home()}/cares_rl_configs" - act_freq: int = 24 - emulation_speed: int = 0 - headless: int = 1 + rom_path: Optional[str] = f"{Path.home()}/cares_rl_configs" + act_freq: Optional[int] = 24 + emulation_speed: Optional[int] = 0 + headless: Optional[int] = 1 + discrete: Optional[int] = 0 diff --git a/shell-scripts/catch.sh b/shell-scripts/catch.sh new file mode 100755 index 0000000..6a20a92 --- /dev/null +++ b/shell-scripts/catch.sh @@ -0,0 +1 @@ +python ./scripts/train.py run --gym pyboy --number_eval_episodes 1 --plot_frequency 10000000 --headless 1 --discrete 1 --domain pokemon --task catch SACDAE diff --git a/shell-scripts/fight-sacae.sh b/shell-scripts/fight-sacae.sh new file mode 100755 index 0000000..8164fed --- /dev/null +++ b/shell-scripts/fight-sacae.sh @@ -0,0 +1 @@ +python ./scripts/train.py run --gym pyboy --number_eval_episodes 1 --plot_frequency 10000000 --headless 1 --discrete 0 --domain pokemon --task fight SACAE diff --git a/shell-scripts/fight-sacd.sh b/shell-scripts/fight-sacd.sh new file mode 100755 index 0000000..41ce632 --- /dev/null +++ b/shell-scripts/fight-sacd.sh @@ -0,0 +1 @@ +python ./scripts/train.py run --gym pyboy --number_eval_episodes 1 --plot_frequency 10000000 --headless 1 --discrete 1 --domain pokemon --task fight SACD diff --git a/shell-scripts/fight-sacdae.sh b/shell-scripts/fight-sacdae.sh new file mode 100755 index 0000000..8427404 --- /dev/null +++ b/shell-scripts/fight-sacdae.sh @@ -0,0 +1 @@ +python ./scripts/train.py run --gym pyboy --number_eval_episodes 1 --plot_frequency 10000000 --headless 1 --discrete 1 --domain pokemon --task fight NaSATD3 diff --git a/shell-scripts/flexi.sh b/shell-scripts/flexi.sh new file mode 100755 index 0000000..998b0e5 --- /dev/null +++ b/shell-scripts/flexi.sh @@ -0,0 +1 @@ +python ./scripts/train.py run --gym pyboy --number_eval_episodes 1 --plot_frequency 10000000 --headless 1 --discrete 0 --domain pokemon --task flexi SACAE