From 7c3346bb8c744f49480d34507baf49866ddb5d92 Mon Sep 17 00:00:00 2001 From: PK-and-Sam Date: Sat, 20 Jul 2024 15:32:06 +1200 Subject: [PATCH 01/26] Fixed deep copy bug temp --- scripts/train_loops/policy_loop.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/train_loops/policy_loop.py b/scripts/train_loops/policy_loop.py index e0455be..b472edf 100644 --- a/scripts/train_loops/policy_loop.py +++ b/scripts/train_loops/policy_loop.py @@ -154,7 +154,7 @@ def policy_based_train( if (total_step_counter + 1) % number_steps_per_evaluation == 0: logging.info("*************--Evaluation Loop--*************") evaluate_policy_network( - copy.deepcopy(env), + env, agent, train_config, record=record, From 311d388c744071ccb8ead893517697f40607b26d Mon Sep 17 00:00:00 2001 From: PK-and-Sam Date: Sat, 20 Jul 2024 16:02:16 +1200 Subject: [PATCH 02/26] ignored added shell script --- .gitignore | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 1bd8f1f..e1667b3 100644 --- a/.gitignore +++ b/.gitignore @@ -15,4 +15,6 @@ dist/ .DS_Store rl_logs/ -configs \ No newline at end of file +configs + +shell-scripts \ No newline at end of file From 9b3009ef0e8635fdb524a9e601dbb59b95fcd16d Mon Sep 17 00:00:00 2001 From: PK-and-Sam Date: Sun, 21 Jul 2024 17:53:14 +1200 Subject: [PATCH 03/26] Added fight script --- .gitignore | 4 +--- shell-scripts/fight.sh | 1 + 2 files changed, 2 insertions(+), 3 deletions(-) create mode 100755 shell-scripts/fight.sh diff --git a/.gitignore b/.gitignore index e1667b3..1bd8f1f 100644 --- a/.gitignore +++ b/.gitignore @@ -15,6 +15,4 @@ dist/ .DS_Store rl_logs/ -configs - -shell-scripts \ No newline at end of file +configs \ No newline at end of file diff --git a/shell-scripts/fight.sh b/shell-scripts/fight.sh new file mode 100755 index 0000000..cd44790 --- /dev/null +++ b/shell-scripts/fight.sh @@ -0,0 +1 @@ +python ./scripts/train.py run --gym pyboy --number_eval_episodes 1 --plot_frequency 10000000 --headless 1 --domain pokemon --task fight SACAE From d4634957c7608cd31cbf7e3fc8aff5c6d09a1baa Mon Sep 17 00:00:00 2001 From: PKWadsworth Date: Sun, 21 Jul 2024 18:57:35 +1200 Subject: [PATCH 04/26] Made pyboy env use sample action from env --- scripts/environments/pyboy/pyboy_environment.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/scripts/environments/pyboy/pyboy_environment.py b/scripts/environments/pyboy/pyboy_environment.py index 5159744..00803e7 100644 --- a/scripts/environments/pyboy/pyboy_environment.py +++ b/scripts/environments/pyboy/pyboy_environment.py @@ -36,9 +36,7 @@ def action_num(self) -> int: return self.env.action_num def sample_action(self): - return np.random.uniform( - self.min_action_value, self.max_action_value, size=self.action_num - ) + return self.env.sample_action() def set_seed(self, seed: int) -> None: self.env.set_seed(seed) From c5e2ad8382fb157eca990e10f645705d5b12d130 Mon Sep 17 00:00:00 2001 From: PK-and-Sam Date: Tue, 23 Jul 2024 17:06:39 +1200 Subject: [PATCH 05/26] Added the discrete policy loop --- scripts/train.py | 11 ++ scripts/train_loops/discrete_policy_loop.py | 114 ++++++++++++++++++++ 2 files changed, 125 insertions(+) create mode 100644 scripts/train_loops/discrete_policy_loop.py diff --git a/scripts/train.py b/scripts/train.py index 1af81b4..79be83d 100755 --- a/scripts/train.py +++ b/scripts/train.py @@ -8,6 +8,7 @@ import sys import torch +import train_loops.discrete_policy_loop as dpbe import train_loops.policy_loop as pbe import train_loops.ppo_loop as ppe import train_loops.value_loop as vbe @@ -133,6 +134,16 @@ def main(): alg_config, display=env_config.display, ) + elif agent.type == "discrete_policy": + dpbe.discrete_policy_based_train( + env, + agent, + memory, + record, + training_config, + alg_config, + display=env_config.display, + ) elif agent.type == "policy": pbe.policy_based_train( env, diff --git a/scripts/train_loops/discrete_policy_loop.py b/scripts/train_loops/discrete_policy_loop.py new file mode 100644 index 0000000..773d5e4 --- /dev/null +++ b/scripts/train_loops/discrete_policy_loop.py @@ -0,0 +1,114 @@ +import logging +import time + +from cares_reinforcement_learning.util import helpers as hlp +from cares_reinforcement_learning.util.configurations import ( + AlgorithmConfig, + TrainingConfig, +) + +def discrete_policy_based_train( + env, + agent, + memory, + record, + train_config: TrainingConfig, + alg_config: AlgorithmConfig, + display=False, +): + start_time = time.time() + + max_steps_training = alg_config.max_steps_training + max_steps_exploration = alg_config.max_steps_exploration + number_steps_per_evaluation = train_config.number_steps_per_evaluation + number_steps_per_train_policy = alg_config.number_steps_per_train_policy + + # Algorthm specific attributes - e.g. NaSA-TD3 dd + intrinsic_on = ( + bool(alg_config.intrinsic_on) if hasattr(alg_config, "intrinsic_on") else False + ) + + min_noise = alg_config.min_noise if hasattr(alg_config, "min_noise") else 0 + noise_decay = alg_config.noise_decay if hasattr(alg_config, "noise_decay") else 1.0 + noise_scale = alg_config.noise_scale if hasattr(alg_config, "noise_scale") else 0.1 + + logging.info( + f"Training {max_steps_training} Exploration {max_steps_exploration} Evaluation {number_steps_per_evaluation}" + ) + + batch_size = alg_config.batch_size + G = alg_config.G + + episode_timesteps = 0 + episode_reward = 0 + episode_num = 0 + + state = env.reset() + + episode_start = time.time() + for total_step_counter in range(int(max_steps_training)): + episode_timesteps += 1 + + if total_step_counter < max_steps_exploration: + logging.info( + f"Running Exploration Steps {total_step_counter + 1}/{max_steps_exploration}" + ) + + action = env.sample_action() + + else: + noise_scale *= noise_decay + noise_scale = max(min_noise, noise_scale) + + # algorithm range [-1, 1] + action = agent.select_action_from_policy(state, noise_scale=noise_scale) + + next_state, reward_extrinsic, done, truncated = env.step(action) + if display: + env.render() + + intrinsic_reward = 0 + if intrinsic_on and total_step_counter > max_steps_exploration: + intrinsic_reward = agent.get_intrinsic_reward(state, action, next_state) + + total_reward = reward_extrinsic + intrinsic_reward + + memory.add( + state, + action, + total_reward, + next_state, + done, + ) + + state = next_state + episode_reward += reward_extrinsic # Note we only track the extrinsic reward for the episode for proper comparison + + if ( + total_step_counter >= max_steps_exploration + and total_step_counter % number_steps_per_train_policy == 0 + ): + for _ in range(G): + agent.train_policy(memory, batch_size) + + if done or truncated: + episode_time = time.time() - episode_start + record.log_train( + total_steps=total_step_counter + 1, + episode=episode_num + 1, + episode_steps=episode_timesteps, + episode_reward=episode_reward, + episode_time=episode_time, + display=True, + ) + + # Reset environment + state = env.reset() + episode_timesteps = 0 + episode_reward = 0 + episode_num += 1 + episode_start = time.time() + + end_time = time.time() + elapsed_time = end_time - start_time + print("Training time:", time.strftime("%H:%M:%S", time.gmtime(elapsed_time))) From a7077962d6c494fc1dda2a4a508fc05949683e8f Mon Sep 17 00:00:00 2001 From: PK-and-Sam Date: Wed, 7 Aug 2024 16:47:07 +1200 Subject: [PATCH 06/26] Added discrete policy loop for pokemon and fight script --- requirements.txt | 1 + .../environments/openai/openai_environment.py | 2 + scripts/plot.py | 45 ++++++++++++++ scripts/train.py | 1 - scripts/train_loops/discrete_policy_loop.py | 61 ++++++++++++++++++- shell-scripts/fight.sh | 2 +- 6 files changed, 108 insertions(+), 4 deletions(-) create mode 100644 scripts/plot.py diff --git a/requirements.txt b/requirements.txt index 5f90734..4268b16 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,4 +7,5 @@ opencv-contrib-python==4.6.0.66 pydantic==1.10.13 torch==2.3.1 pyboy==2.2.1 +plotly==5.22.0 mediapy==1.1.9 diff --git a/scripts/environments/openai/openai_environment.py b/scripts/environments/openai/openai_environment.py index 1dabafb..80b9e5a 100644 --- a/scripts/environments/openai/openai_environment.py +++ b/scripts/environments/openai/openai_environment.py @@ -15,10 +15,12 @@ def __init__(self, config: GymEnvironmentConfig) -> None: @cached_property def max_action_value(self) -> float: + return self.env.action_space.n - 1 return self.env.action_space.high[0] @cached_property def min_action_value(self) -> float: + return 0 return self.env.action_space.low[0] @cached_property diff --git a/scripts/plot.py b/scripts/plot.py new file mode 100644 index 0000000..b807d91 --- /dev/null +++ b/scripts/plot.py @@ -0,0 +1,45 @@ +import os +import sys +import pandas as pd +import plotly.express as px +import plotly.graph_objects as go + +def plot_csv_files(directory_path): + # Define file paths + train_file_path = os.path.join(directory_path, 'train.csv') + eval_file_path = os.path.join(directory_path, 'eval.csv') + + # Check if files exist + if not os.path.exists(train_file_path): + print(f"Train file not found at {train_file_path}") + return + if not os.path.exists(eval_file_path): + print(f"Eval file not found at {eval_file_path}") + return + + # Read CSV files + train_df = pd.read_csv(train_file_path) + eval_df = pd.read_csv(eval_file_path) + + # Create subplots for train and eval data + fig = px.make_subplots(rows=2, cols=1, shared_xaxes=True, + subplot_titles=('Train Data', 'Eval Data')) + + # Plot train data + for col in train_df.columns: + fig.add_trace(go.Scatter(x=train_df.index, y=train_df[col], mode='lines', name=f'Train {col}'), row=1, col=1) + + # Plot eval data + for col in eval_df.columns: + fig.add_trace(go.Scatter(x=eval_df.index, y=eval_df[col], mode='lines', name=f'Eval {col}'), row=2, col=1) + + # Update layout + fig.update_layout(height=600, width=800, title_text="Train and Eval Data Plots") + fig.show() + +if __name__ == "__main__": + if len(sys.argv) != 2: + print("Usage: python plot_csv.py ") + else: + directory_path = sys.argv[1] + plot_csv_files(directory_path) diff --git a/scripts/train.py b/scripts/train.py index 79be83d..7363bb6 100755 --- a/scripts/train.py +++ b/scripts/train.py @@ -111,7 +111,6 @@ def main(): ) # create the record class - standardised results tracking record = Record( - glob_log_dir="", log_dir=log_dir, algorithm=alg_config.algorithm, task=env_config.task, diff --git a/scripts/train_loops/discrete_policy_loop.py b/scripts/train_loops/discrete_policy_loop.py index 773d5e4..438ef1b 100644 --- a/scripts/train_loops/discrete_policy_loop.py +++ b/scripts/train_loops/discrete_policy_loop.py @@ -7,6 +7,52 @@ TrainingConfig, ) +def evaluate_policy_network( + env, agent, config: TrainingConfig, record=None, total_steps=0 +): + state = env.reset() + + if record is not None: + frame = env.grab_frame() + record.start_video(total_steps + 1, frame) + + number_eval_episodes = int(config.number_eval_episodes) + + for eval_episode_counter in range(number_eval_episodes): + episode_timesteps = 0 + episode_reward = 0 + episode_num = 0 + done = False + truncated = False + + while not done and not truncated: + episode_timesteps += 1 + action = agent.select_action_from_policy(state, evaluation=True).item() + + state, reward, done, truncated = env.step(action) + episode_reward += reward + + if eval_episode_counter == 0 and record is not None: + frame = env.grab_frame() + record.log_video(frame) + + if done or truncated: + if record is not None: + record.log_eval( + total_steps=total_steps + 1, + episode=eval_episode_counter + 1, + episode_reward=episode_reward, + display=True, + ) + + # Reset environment + state = env.reset() + episode_reward = 0 + episode_timesteps = 0 + episode_num += 1 + + record.stop_video() + def discrete_policy_based_train( env, agent, @@ -53,7 +99,6 @@ def discrete_policy_based_train( logging.info( f"Running Exploration Steps {total_step_counter + 1}/{max_steps_exploration}" ) - action = env.sample_action() else: @@ -61,7 +106,8 @@ def discrete_policy_based_train( noise_scale = max(min_noise, noise_scale) # algorithm range [-1, 1] - action = agent.select_action_from_policy(state, noise_scale=noise_scale) + action_tensor = agent.select_action_from_policy(state, noise_scale=noise_scale) + action = action_tensor.item() next_state, reward_extrinsic, done, truncated = env.step(action) if display: @@ -91,6 +137,17 @@ def discrete_policy_based_train( for _ in range(G): agent.train_policy(memory, batch_size) + if (total_step_counter + 1) % number_steps_per_evaluation == 0: + logging.info("*************--Evaluation Loop--*************") + evaluate_policy_network( + env, + agent, + train_config, + record=record, + total_steps=total_step_counter, + ) + logging.info("--------------------------------------------") + if done or truncated: episode_time = time.time() - episode_start record.log_train( diff --git a/shell-scripts/fight.sh b/shell-scripts/fight.sh index cd44790..3263ceb 100755 --- a/shell-scripts/fight.sh +++ b/shell-scripts/fight.sh @@ -1 +1 @@ -python ./scripts/train.py run --gym pyboy --number_eval_episodes 1 --plot_frequency 10000000 --headless 1 --domain pokemon --task fight SACAE +python ./scripts/train.py run --gym pyboy --number_eval_episodes 1 --plot_frequency 10000000 --headless 1 --domain pokemon --task fight SACD --image_observation 1 From eea9c31c0dbd213c564666318770a470d2cbdf55 Mon Sep 17 00:00:00 2001 From: PKWadsworth Date: Tue, 13 Aug 2024 15:30:46 +1200 Subject: [PATCH 07/26] Removed discrete policy loop --- scripts/train_loops/discrete_policy_loop.py | 171 -------------------- 1 file changed, 171 deletions(-) delete mode 100644 scripts/train_loops/discrete_policy_loop.py diff --git a/scripts/train_loops/discrete_policy_loop.py b/scripts/train_loops/discrete_policy_loop.py deleted file mode 100644 index 438ef1b..0000000 --- a/scripts/train_loops/discrete_policy_loop.py +++ /dev/null @@ -1,171 +0,0 @@ -import logging -import time - -from cares_reinforcement_learning.util import helpers as hlp -from cares_reinforcement_learning.util.configurations import ( - AlgorithmConfig, - TrainingConfig, -) - -def evaluate_policy_network( - env, agent, config: TrainingConfig, record=None, total_steps=0 -): - state = env.reset() - - if record is not None: - frame = env.grab_frame() - record.start_video(total_steps + 1, frame) - - number_eval_episodes = int(config.number_eval_episodes) - - for eval_episode_counter in range(number_eval_episodes): - episode_timesteps = 0 - episode_reward = 0 - episode_num = 0 - done = False - truncated = False - - while not done and not truncated: - episode_timesteps += 1 - action = agent.select_action_from_policy(state, evaluation=True).item() - - state, reward, done, truncated = env.step(action) - episode_reward += reward - - if eval_episode_counter == 0 and record is not None: - frame = env.grab_frame() - record.log_video(frame) - - if done or truncated: - if record is not None: - record.log_eval( - total_steps=total_steps + 1, - episode=eval_episode_counter + 1, - episode_reward=episode_reward, - display=True, - ) - - # Reset environment - state = env.reset() - episode_reward = 0 - episode_timesteps = 0 - episode_num += 1 - - record.stop_video() - -def discrete_policy_based_train( - env, - agent, - memory, - record, - train_config: TrainingConfig, - alg_config: AlgorithmConfig, - display=False, -): - start_time = time.time() - - max_steps_training = alg_config.max_steps_training - max_steps_exploration = alg_config.max_steps_exploration - number_steps_per_evaluation = train_config.number_steps_per_evaluation - number_steps_per_train_policy = alg_config.number_steps_per_train_policy - - # Algorthm specific attributes - e.g. NaSA-TD3 dd - intrinsic_on = ( - bool(alg_config.intrinsic_on) if hasattr(alg_config, "intrinsic_on") else False - ) - - min_noise = alg_config.min_noise if hasattr(alg_config, "min_noise") else 0 - noise_decay = alg_config.noise_decay if hasattr(alg_config, "noise_decay") else 1.0 - noise_scale = alg_config.noise_scale if hasattr(alg_config, "noise_scale") else 0.1 - - logging.info( - f"Training {max_steps_training} Exploration {max_steps_exploration} Evaluation {number_steps_per_evaluation}" - ) - - batch_size = alg_config.batch_size - G = alg_config.G - - episode_timesteps = 0 - episode_reward = 0 - episode_num = 0 - - state = env.reset() - - episode_start = time.time() - for total_step_counter in range(int(max_steps_training)): - episode_timesteps += 1 - - if total_step_counter < max_steps_exploration: - logging.info( - f"Running Exploration Steps {total_step_counter + 1}/{max_steps_exploration}" - ) - action = env.sample_action() - - else: - noise_scale *= noise_decay - noise_scale = max(min_noise, noise_scale) - - # algorithm range [-1, 1] - action_tensor = agent.select_action_from_policy(state, noise_scale=noise_scale) - action = action_tensor.item() - - next_state, reward_extrinsic, done, truncated = env.step(action) - if display: - env.render() - - intrinsic_reward = 0 - if intrinsic_on and total_step_counter > max_steps_exploration: - intrinsic_reward = agent.get_intrinsic_reward(state, action, next_state) - - total_reward = reward_extrinsic + intrinsic_reward - - memory.add( - state, - action, - total_reward, - next_state, - done, - ) - - state = next_state - episode_reward += reward_extrinsic # Note we only track the extrinsic reward for the episode for proper comparison - - if ( - total_step_counter >= max_steps_exploration - and total_step_counter % number_steps_per_train_policy == 0 - ): - for _ in range(G): - agent.train_policy(memory, batch_size) - - if (total_step_counter + 1) % number_steps_per_evaluation == 0: - logging.info("*************--Evaluation Loop--*************") - evaluate_policy_network( - env, - agent, - train_config, - record=record, - total_steps=total_step_counter, - ) - logging.info("--------------------------------------------") - - if done or truncated: - episode_time = time.time() - episode_start - record.log_train( - total_steps=total_step_counter + 1, - episode=episode_num + 1, - episode_steps=episode_timesteps, - episode_reward=episode_reward, - episode_time=episode_time, - display=True, - ) - - # Reset environment - state = env.reset() - episode_timesteps = 0 - episode_reward = 0 - episode_num += 1 - episode_start = time.time() - - end_time = time.time() - elapsed_time = end_time - start_time - print("Training time:", time.strftime("%H:%M:%S", time.gmtime(elapsed_time))) From efb3ab9e8d5e9c65185a49cad6bfd20e104beb6d Mon Sep 17 00:00:00 2001 From: PK-and-Sam Date: Wed, 14 Aug 2024 15:53:59 +1200 Subject: [PATCH 08/26] Fixed import error --- scripts/train.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/train.py b/scripts/train.py index 5448db5..9681037 100755 --- a/scripts/train.py +++ b/scripts/train.py @@ -9,7 +9,6 @@ import os import torch -import train_loops.discrete_policy_loop as dpbe import train_loops.policy_loop as pbe import train_loops.ppo_loop as ppe import train_loops.value_loop as vbe From 68220e9bad5739600070fb37f0086395027dc483 Mon Sep 17 00:00:00 2001 From: PK-and-Sam Date: Wed, 14 Aug 2024 17:08:32 +1200 Subject: [PATCH 09/26] Fixed policy loop --- scripts/train_loops/policy_loop.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/scripts/train_loops/policy_loop.py b/scripts/train_loops/policy_loop.py index d079bf0..9f713cd 100644 --- a/scripts/train_loops/policy_loop.py +++ b/scripts/train_loops/policy_loop.py @@ -10,7 +10,7 @@ def evaluate_policy_network( - env, agent, config: TrainingConfig, record=None, total_steps=0 + env, agent, config: TrainingConfig, record=None, total_steps=0, normalisation=True ): state = env.reset() @@ -29,12 +29,12 @@ def evaluate_policy_network( while not done and not truncated: episode_timesteps += 1 - action = agent.select_action_from_policy(state, evaluation=True) - action_env = hlp.denormalize( - action, env.max_action_value, env.min_action_value - ) + normalised_action = agent.select_action_from_policy(state, evaluation=True) + denormalised_action = hlp.denormalize( + normalised_action, env.max_action_value, env.min_action_value + ) if normalisation else normalised_action - state, reward, done, truncated = env.step(action_env) + state, reward, done, truncated = env.step(denormalised_action) episode_reward += reward if eval_episode_counter == 0 and record is not None: @@ -171,6 +171,7 @@ def policy_based_train( train_config, record=record, total_steps=total_step_counter, + normalisation=normalisation ) logging.info("--------------------------------------------") From 855c924059e953952334b07466b922ef6541ac91 Mon Sep 17 00:00:00 2001 From: PK Wadsworth Date: Tue, 10 Sep 2024 13:10:34 +1200 Subject: [PATCH 10/26] Added discrete config --- scripts/environments/pyboy/pyboy_environment.py | 1 + scripts/util/configurations.py | 1 + shell-scripts/fight.sh | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/environments/pyboy/pyboy_environment.py b/scripts/environments/pyboy/pyboy_environment.py index 00803e7..56d31c0 100644 --- a/scripts/environments/pyboy/pyboy_environment.py +++ b/scripts/environments/pyboy/pyboy_environment.py @@ -17,6 +17,7 @@ def __init__(self, config: GymEnvironmentConfig) -> None: config.act_freq, config.emulation_speed, config.headless, + config.discrete, ) @cached_property diff --git a/scripts/util/configurations.py b/scripts/util/configurations.py index 05c73f3..81f0966 100755 --- a/scripts/util/configurations.py +++ b/scripts/util/configurations.py @@ -43,3 +43,4 @@ class GymEnvironmentConfig(EnvironmentConfig): act_freq: Optional[int] = 24 emulation_speed: Optional[int] = 0 headless: Optional[int] = 0 + discrete: Optional[int] = 0 diff --git a/shell-scripts/fight.sh b/shell-scripts/fight.sh index 3263ceb..bc84cc3 100755 --- a/shell-scripts/fight.sh +++ b/shell-scripts/fight.sh @@ -1 +1 @@ -python ./scripts/train.py run --gym pyboy --number_eval_episodes 1 --plot_frequency 10000000 --headless 1 --domain pokemon --task fight SACD --image_observation 1 +python ./scripts/train.py run --gym pyboy --number_eval_episodes 1 --plot_frequency 10000000 --headless 0 --domain pokemon --task fight SACAE From e327a9b3142682d5d10d802151521ab495507173 Mon Sep 17 00:00:00 2001 From: PK Wadsworth Date: Thu, 12 Sep 2024 16:24:17 +1200 Subject: [PATCH 11/26] Added log comments and discretisation --- scripts/environments/gym_environment.py | 4 ++ .../environments/pyboy/pyboy_environment.py | 2 + scripts/train_loops/policy_loop.py | 44 +++++++++++++++++++ shell-scripts/fight.sh | 2 +- 4 files changed, 51 insertions(+), 1 deletion(-) diff --git a/scripts/environments/gym_environment.py b/scripts/environments/gym_environment.py index a779a60..ce8fbba 100644 --- a/scripts/environments/gym_environment.py +++ b/scripts/environments/gym_environment.py @@ -12,9 +12,13 @@ def __init__(self, config: GymEnvironmentConfig) -> None: self.task = config.task def render(self): + # debug-log logging.info("Logging128") frame = self.grab_frame() + # debug-log logging.info("Logging129") cv2.imshow(f"{self.task}", frame) + # debug-log logging.info("Logging130") cv2.waitKey(10) + # debug-log logging.info("Logging131") @cached_property @abc.abstractmethod diff --git a/scripts/environments/pyboy/pyboy_environment.py b/scripts/environments/pyboy/pyboy_environment.py index 56d31c0..698ba52 100644 --- a/scripts/environments/pyboy/pyboy_environment.py +++ b/scripts/environments/pyboy/pyboy_environment.py @@ -1,6 +1,7 @@ from functools import cached_property import numpy as np +import logging from environments.gym_environment import GymEnvironment from util.configurations import GymEnvironmentConfig @@ -46,6 +47,7 @@ def reset(self) -> np.ndarray: return self.env.reset() def step(self, action: int) -> tuple: + # debug-log logging.info("Logging109") return self.env.step(action) def grab_frame(self, height=240, width=300) -> np.ndarray: diff --git a/scripts/train_loops/policy_loop.py b/scripts/train_loops/policy_loop.py index 9f713cd..ca318ed 100644 --- a/scripts/train_loops/policy_loop.py +++ b/scripts/train_loops/policy_loop.py @@ -12,13 +12,19 @@ def evaluate_policy_network( env, agent, config: TrainingConfig, record=None, total_steps=0, normalisation=True ): + # debug-log logging.info("Logging32") state = env.reset() + # debug-log logging.info("Logging33") if record is not None: + # debug-log logging.info("Logging34") frame = env.grab_frame() record.start_video(total_steps + 1, frame) + # debug-log logging.info("Logging35") + # debug-log logging.info("Logging36") number_eval_episodes = int(config.number_eval_episodes) + # debug-log logging.info("Logging37") for eval_episode_counter in range(number_eval_episodes): episode_timesteps = 0 @@ -27,28 +33,39 @@ def evaluate_policy_network( done = False truncated = False + # debug-log logging.info("Logging38") while not done and not truncated: + # debug-log logging.info("Logging39") episode_timesteps += 1 normalised_action = agent.select_action_from_policy(state, evaluation=True) + # debug-log logging.info("Logging40") denormalised_action = hlp.denormalize( normalised_action, env.max_action_value, env.min_action_value ) if normalisation else normalised_action + # debug-log logging.info("Logging41") state, reward, done, truncated = env.step(denormalised_action) episode_reward += reward + # debug-log logging.info("Logging42") if eval_episode_counter == 0 and record is not None: + # debug-log logging.info("Logging44") frame = env.grab_frame() record.log_video(frame) + # debug-log logging.info("Logging45") + # debug-log logging.info("Logging43") if done or truncated: + # debug-log logging.info("Logging46") if record is not None: + # debug-log logging.info("Logging47") record.log_eval( total_steps=total_steps + 1, episode=eval_episode_counter + 1, episode_reward=episode_reward, display=True, ) + # debug-log logging.info("Logging48") # Reset environment state = env.reset() @@ -56,7 +73,9 @@ def evaluate_policy_network( episode_timesteps = 0 episode_num += 1 + # debug-log logging.info("Logging49") record.stop_video() + # debug-log logging.info("Logging50") def policy_based_train( @@ -70,18 +89,22 @@ def policy_based_train( display=False, normalisation=True, ): + # debug-log logging.info("Logging9") start_time = time.time() + # debug-log logging.info("Logging10") max_steps_training = alg_config.max_steps_training max_steps_exploration = alg_config.max_steps_exploration number_steps_per_evaluation = train_config.number_steps_per_evaluation number_steps_per_train_policy = alg_config.number_steps_per_train_policy + # debug-log logging.info("Logging11") # Algorthm specific attributes - e.g. NaSA-TD3 dd intrinsic_on = ( bool(alg_config.intrinsic_on) if hasattr(alg_config, "intrinsic_on") else False ) + # debug-log logging.info("Logging12") min_noise = alg_config.min_noise if hasattr(alg_config, "min_noise") else 0 noise_decay = alg_config.noise_decay if hasattr(alg_config, "noise_decay") else 1.0 noise_scale = alg_config.noise_scale if hasattr(alg_config, "noise_scale") else 0.1 @@ -99,10 +122,13 @@ def policy_based_train( state = env.reset() + # debug-log logging.info("Logging13") episode_start = time.time() for total_step_counter in range(int(max_steps_training)): + # debug-log logging.info("Logging14") episode_timesteps += 1 + # debug-log logging.info("Logging15") if total_step_counter < max_steps_exploration: logging.info( f"Running Exploration Steps {total_step_counter + 1}/{max_steps_exploration}" @@ -110,6 +136,7 @@ def policy_based_train( denormalised_action = env.sample_action() + # debug-log logging.info("Logging16") # algorithm range [-1, 1] - note for DMCS this is redudenant but required for openai if normalisation: normalised_action = hlp.normalize( @@ -117,14 +144,18 @@ def policy_based_train( ) else: normalised_action = denormalised_action + # debug-log logging.info("Logging17") else: + # debug-log logging.info("Logging18") noise_scale *= noise_decay noise_scale = max(min_noise, noise_scale) + # debug-log logging.info("Logging19") # algorithm range [-1, 1] normalised_action = agent.select_action_from_policy( state, noise_scale=noise_scale ) + # debug-log logging.info("Logging20") # mapping to env range [e.g. -2 , 2 for pendulum] - note for DMCS this is redudenant but required for openai if normalisation: denormalised_action = hlp.denormalize( @@ -132,19 +163,26 @@ def policy_based_train( ) else: denormalised_action = normalised_action + # debug-log logging.info("Logging21") + # debug-log logging.info("Logging22") next_state, reward_extrinsic, done, truncated = env.step(denormalised_action) + # debug-log logging.info("Logging23") if display: + # debug-log logging.info("Logging128") env.render() + # debug-log logging.info("Logging23") intrinsic_reward = 0 if intrinsic_on and total_step_counter > max_steps_exploration: intrinsic_reward = agent.get_intrinsic_reward( state, normalised_action, next_state ) + # debug-log logging.info("Logging24") total_reward = reward_extrinsic + intrinsic_reward + # debug-log logging.info("Logging25") memory.add( state, normalised_action, @@ -152,16 +190,20 @@ def policy_based_train( next_state, done, ) + # debug-log logging.info("Logging26") state = next_state episode_reward += reward_extrinsic # Note we only track the extrinsic reward for the episode for proper comparison + # debug-log logging.info("Logging27") if ( total_step_counter >= max_steps_exploration and total_step_counter % number_steps_per_train_policy == 0 ): + # debug-log logging.info("Logging28") for _ in range(G): agent.train_policy(memory, batch_size) + # debug-log logging.info("Logging29") if (total_step_counter + 1) % number_steps_per_evaluation == 0: logging.info("*************--Evaluation Loop--*************") @@ -177,6 +219,7 @@ def policy_based_train( if done or truncated: episode_time = time.time() - episode_start + # debug-log logging.info("Logging30") record.log_train( total_steps=total_step_counter + 1, episode=episode_num + 1, @@ -185,6 +228,7 @@ def policy_based_train( episode_time=episode_time, display=True, ) + # debug-log logging.info("Logging31") # Reset environment state = env.reset() diff --git a/shell-scripts/fight.sh b/shell-scripts/fight.sh index bc84cc3..cd44790 100755 --- a/shell-scripts/fight.sh +++ b/shell-scripts/fight.sh @@ -1 +1 @@ -python ./scripts/train.py run --gym pyboy --number_eval_episodes 1 --plot_frequency 10000000 --headless 0 --domain pokemon --task fight SACAE +python ./scripts/train.py run --gym pyboy --number_eval_episodes 1 --plot_frequency 10000000 --headless 1 --domain pokemon --task fight SACAE From 1277a7caf5f1f769a16a4adec1500bf53b68d588 Mon Sep 17 00:00:00 2001 From: PK Wadsworth Date: Mon, 16 Sep 2024 16:20:53 +1200 Subject: [PATCH 12/26] Updated shell scripts --- shell-scripts/catch-sacae.sh | 1 + shell-scripts/fight-sacae.sh | 1 + shell-scripts/fight-sacd.sh | 1 + shell-scripts/fight.sh | 1 - 4 files changed, 3 insertions(+), 1 deletion(-) create mode 100755 shell-scripts/catch-sacae.sh create mode 100755 shell-scripts/fight-sacae.sh create mode 100755 shell-scripts/fight-sacd.sh delete mode 100755 shell-scripts/fight.sh diff --git a/shell-scripts/catch-sacae.sh b/shell-scripts/catch-sacae.sh new file mode 100755 index 0000000..8573265 --- /dev/null +++ b/shell-scripts/catch-sacae.sh @@ -0,0 +1 @@ +python ./scripts/train.py run --gym pyboy --number_eval_episodes 1 --plot_frequency 10000000 --headless 1 --discrete 0 --domain pokemon --task catch SACAE diff --git a/shell-scripts/fight-sacae.sh b/shell-scripts/fight-sacae.sh new file mode 100755 index 0000000..8164fed --- /dev/null +++ b/shell-scripts/fight-sacae.sh @@ -0,0 +1 @@ +python ./scripts/train.py run --gym pyboy --number_eval_episodes 1 --plot_frequency 10000000 --headless 1 --discrete 0 --domain pokemon --task fight SACAE diff --git a/shell-scripts/fight-sacd.sh b/shell-scripts/fight-sacd.sh new file mode 100755 index 0000000..41ce632 --- /dev/null +++ b/shell-scripts/fight-sacd.sh @@ -0,0 +1 @@ +python ./scripts/train.py run --gym pyboy --number_eval_episodes 1 --plot_frequency 10000000 --headless 1 --discrete 1 --domain pokemon --task fight SACD diff --git a/shell-scripts/fight.sh b/shell-scripts/fight.sh deleted file mode 100755 index cd44790..0000000 --- a/shell-scripts/fight.sh +++ /dev/null @@ -1 +0,0 @@ -python ./scripts/train.py run --gym pyboy --number_eval_episodes 1 --plot_frequency 10000000 --headless 1 --domain pokemon --task fight SACAE From 128d0b824b6ee8f0f5170bdeec572adfe95f1a9a Mon Sep 17 00:00:00 2001 From: PK Wadsworth Date: Wed, 2 Oct 2024 18:27:05 +1300 Subject: [PATCH 13/26] Made changes to gym which allow image overlay --- scripts/environments/gym_environment.py | 4 + scripts/environments/image_wrapper.py | 3 + .../environments/pyboy/pyboy_environment.py | 3 + scripts/train_loops/policy_loop.py | 86 ++++++++++++++++--- scripts/util/configurations.py | 2 +- shell-scripts/brock.sh | 1 + shell-scripts/catch-sacae.sh | 1 - shell-scripts/catch.sh | 1 + shell-scripts/fight-sacdae.sh | 1 + 9 files changed, 88 insertions(+), 14 deletions(-) create mode 100755 shell-scripts/brock.sh delete mode 100755 shell-scripts/catch-sacae.sh create mode 100755 shell-scripts/catch.sh create mode 100755 shell-scripts/fight-sacdae.sh diff --git a/scripts/environments/gym_environment.py b/scripts/environments/gym_environment.py index ce8fbba..641fc93 100644 --- a/scripts/environments/gym_environment.py +++ b/scripts/environments/gym_environment.py @@ -24,6 +24,10 @@ def render(self): @abc.abstractmethod def min_action_value(self): raise NotImplementedError("Override this method") + + @abc.abstractmethod + def action_as_string(self, action): + raise NotImplemented("Override this method") @cached_property @abc.abstractmethod diff --git a/scripts/environments/image_wrapper.py b/scripts/environments/image_wrapper.py index 51a9b8f..1dd2dd2 100644 --- a/scripts/environments/image_wrapper.py +++ b/scripts/environments/image_wrapper.py @@ -37,6 +37,9 @@ def min_action_value(self): @cached_property def max_action_value(self): return self.gym.max_action_value + + def action_as_string(self, action): + return self.gym.action_as_string(action) def sample_action(self): return self.gym.sample_action() diff --git a/scripts/environments/pyboy/pyboy_environment.py b/scripts/environments/pyboy/pyboy_environment.py index 698ba52..6f916ed 100644 --- a/scripts/environments/pyboy/pyboy_environment.py +++ b/scripts/environments/pyboy/pyboy_environment.py @@ -40,6 +40,9 @@ def action_num(self) -> int: def sample_action(self): return self.env.sample_action() + def action_as_string(self, action): + return self.env.action_as_string(action) + def set_seed(self, seed: int) -> None: self.env.set_seed(seed) diff --git a/scripts/train_loops/policy_loop.py b/scripts/train_loops/policy_loop.py index 453d36f..6834301 100644 --- a/scripts/train_loops/policy_loop.py +++ b/scripts/train_loops/policy_loop.py @@ -1,12 +1,38 @@ import copy import logging import time +import os from cares_reinforcement_learning.util import helpers as hlp from cares_reinforcement_learning.util.configurations import ( AlgorithmConfig, TrainingConfig, ) +import cv2 +import numpy as np + +def overlay_info(image, **kwargs): + # Create a copy of the image to overlay text + output_image = image.copy() + + # Define the position for the text (top-left corner) + text_x, text_y = 10, 30 + + # Set the font, scale, color, and thickness for the text + font = cv2.FONT_HERSHEY_SIMPLEX + font_scale = 0.4 # Smaller font scale + color = (0, 0, 255) # Red color in BGR + thickness = 1 # Thicker text + + # Create overlay text from the kwargs dictionary + overlay_text = "\n".join([f"{key}: {value}" for key, value in kwargs.items()]) + + # Split the overlay text into lines and calculate position for each line + for i, line in enumerate(overlay_text.split('\n')): + cv2.putText(output_image, line, (text_x, text_y + i * 20), + font, font_scale, color, thickness, cv2.LINE_AA) + + return output_image def evaluate_policy_network( @@ -89,6 +115,11 @@ def policy_based_train( display=False, normalisation=True, ): + + + highest_reward = float("-inf") + start_new_video = True + # debug-log logging.info("Logging9") start_time = time.time() @@ -134,6 +165,11 @@ def policy_based_train( f"Running Exploration Steps {total_step_counter + 1}/{max_steps_exploration}" ) + if start_new_video == True: + start_new_video = False + frame = env.grab_frame() + record.start_video("temp_train_video", frame) + denormalised_action = env.sample_action() # debug-log logging.info("Logging16") @@ -172,6 +208,14 @@ def policy_based_train( # debug-log logging.info("Logging128") env.render() + # debug-log logging.info("Logging42") + if record is not None: + # debug-log logging.info("Logging44") + frame = env.grab_frame() + frame_with_stats = overlay_info(frame, Reward=f"{episode_reward:.1f}") + record.log_video(frame_with_stats) + # debug-log logging.info("Logging45") + # debug-log logging.info("Logging23") intrinsic_reward = 0 if intrinsic_on and total_step_counter > max_steps_exploration: @@ -209,18 +253,6 @@ def policy_based_train( if intrinsic_on: info["intrinsic_reward"] = intrinsic_reward - if (total_step_counter + 1) % number_steps_per_evaluation == 0: - logging.info("*************--Evaluation Loop--*************") - evaluate_policy_network( - env_eval, - agent, - train_config, - record=record, - total_steps=total_step_counter, - normalisation=normalisation, - ) - logging.info("--------------------------------------------") - if done or truncated: episode_time = time.time() - episode_start # debug-log logging.info("Logging30") @@ -235,12 +267,42 @@ def policy_based_train( ) # debug-log logging.info("Logging31") + record.stop_video() + + if episode_reward > highest_reward: + highest_reward = episode_reward + + vdir = os.path.join(record.directory, "videos") + highest_reward_video = os.path.join(vdir, "highest_reward.mp4") + training_video = os.path.join(vdir, "temp_train_video.mp4") + + try: + if os.path.exists(highest_reward_video): + os.remove(highest_reward_video) + + os.rename(training_video, highest_reward_video) + except: + logging.error("An error renaming the video occured :/") + # Reset environment + start_new_video = True state = env.reset() episode_timesteps = 0 episode_reward = 0 episode_num += 1 episode_start = time.time() + + if (total_step_counter + 1) % number_steps_per_evaluation == 0: + logging.info("*************--Evaluation Loop--*************") + evaluate_policy_network( + env_eval, + agent, + train_config, + record=record, + total_steps=total_step_counter, + normalisation=normalisation, + ) + logging.info("--------------------------------------------") end_time = time.time() elapsed_time = end_time - start_time diff --git a/scripts/util/configurations.py b/scripts/util/configurations.py index 81f0966..131557f 100755 --- a/scripts/util/configurations.py +++ b/scripts/util/configurations.py @@ -33,7 +33,7 @@ class GymEnvironmentConfig(EnvironmentConfig): display: Optional[int] = 0 # image observation configurations - frames_to_stack: Optional[int] = 3 + frames_to_stack: Optional[int] = 9 frame_width: Optional[int] = 84 frame_height: Optional[int] = 84 grey_scale: Optional[int] = 0 diff --git a/shell-scripts/brock.sh b/shell-scripts/brock.sh new file mode 100755 index 0000000..329b766 --- /dev/null +++ b/shell-scripts/brock.sh @@ -0,0 +1 @@ +python ./scripts/train.py run --gym pyboy --number_eval_episodes 1 --plot_frequency 10000000 --headless 0 --discrete 0 --domain pokemon --task brock SACAE diff --git a/shell-scripts/catch-sacae.sh b/shell-scripts/catch-sacae.sh deleted file mode 100755 index 8573265..0000000 --- a/shell-scripts/catch-sacae.sh +++ /dev/null @@ -1 +0,0 @@ -python ./scripts/train.py run --gym pyboy --number_eval_episodes 1 --plot_frequency 10000000 --headless 1 --discrete 0 --domain pokemon --task catch SACAE diff --git a/shell-scripts/catch.sh b/shell-scripts/catch.sh new file mode 100755 index 0000000..6a20a92 --- /dev/null +++ b/shell-scripts/catch.sh @@ -0,0 +1 @@ +python ./scripts/train.py run --gym pyboy --number_eval_episodes 1 --plot_frequency 10000000 --headless 1 --discrete 1 --domain pokemon --task catch SACDAE diff --git a/shell-scripts/fight-sacdae.sh b/shell-scripts/fight-sacdae.sh new file mode 100755 index 0000000..8427404 --- /dev/null +++ b/shell-scripts/fight-sacdae.sh @@ -0,0 +1 @@ +python ./scripts/train.py run --gym pyboy --number_eval_episodes 1 --plot_frequency 10000000 --headless 1 --discrete 1 --domain pokemon --task fight NaSATD3 From 042b3e17782c95bdef6ebe7d8767e2eb383f91d4 Mon Sep 17 00:00:00 2001 From: PK Wadsworth Date: Thu, 3 Oct 2024 15:29:16 +1300 Subject: [PATCH 14/26] Added more data saving - especially on highest reward --- scripts/train_loops/policy_loop.py | 63 +++++++++++++++++++++++------- 1 file changed, 48 insertions(+), 15 deletions(-) diff --git a/scripts/train_loops/policy_loop.py b/scripts/train_loops/policy_loop.py index 6834301..3bee261 100644 --- a/scripts/train_loops/policy_loop.py +++ b/scripts/train_loops/policy_loop.py @@ -2,6 +2,8 @@ import logging import time import os +import inspect +import pandas as pd from cares_reinforcement_learning.util import helpers as hlp from cares_reinforcement_learning.util.configurations import ( @@ -115,10 +117,9 @@ def policy_based_train( display=False, normalisation=True, ): + - - highest_reward = float("-inf") - start_new_video = True + start_new_run = True # debug-log logging.info("Logging9") start_time = time.time() @@ -149,27 +150,35 @@ def policy_based_train( episode_timesteps = 0 episode_reward = 0 + highest_reward = float("-inf") episode_num = 0 state = env.reset() + # Initialize the DataFrame with specified columns + run_data_rows = [] + # debug-log logging.info("Logging13") episode_start = time.time() for total_step_counter in range(int(max_steps_training)): # debug-log logging.info("Logging14") episode_timesteps += 1 + step_data = {} + + if start_new_run == True: + start_new_run = False + frame = env.grab_frame() + record.start_video("temp_train_video", frame) + run_data_rows = [] + + # debug-log logging.info("Logging15") if total_step_counter < max_steps_exploration: logging.info( f"Running Exploration Steps {total_step_counter + 1}/{max_steps_exploration}" ) - if start_new_video == True: - start_new_video = False - frame = env.grab_frame() - record.start_video("temp_train_video", frame) - denormalised_action = env.sample_action() # debug-log logging.info("Logging16") @@ -188,9 +197,15 @@ def policy_based_train( # debug-log logging.info("Logging19") # algorithm range [-1, 1] - normalised_action = agent.select_action_from_policy( - state, noise_scale=noise_scale - ) + + # Horrible hack so I don't have to change all the algorithms + select_action_from_policy = agent.select_action_from_policy + + if "info" in inspect.signature(select_action_from_policy).parameters: + denormalised_action = select_action_from_policy(state, noise_scale=noise_scale, info=step_data) + else: + denormalised_action = select_action_from_policy(state, noise_scale=noise_scale) + # debug-log logging.info("Logging20") # mapping to env range [e.g. -2 , 2 for pendulum] - note for DMCS this is redudenant but required for openai if normalisation: @@ -240,6 +255,12 @@ def policy_based_train( episode_reward += reward_extrinsic # Note we only track the extrinsic reward for the episode for proper comparison # debug-log logging.info("Logging27") + step_data["action"] = denormalised_action + step_data["reward"] = total_reward + step_data["episode_reward"] = episode_reward + + run_data_rows.append(step_data) + info = {} if ( total_step_counter >= max_steps_exploration @@ -270,22 +291,34 @@ def policy_based_train( record.stop_video() if episode_reward > highest_reward: + + highest_reward = episode_reward - vdir = os.path.join(record.directory, "videos") - highest_reward_video = os.path.join(vdir, "highest_reward.mp4") - training_video = os.path.join(vdir, "temp_train_video.mp4") + video_dir = os.path.join(record.directory, "videos") + data_dir = os.path.join(record.directory, "data") + + highest_reward_video = os.path.join(video_dir, "highest_reward.mp4") + training_video = os.path.join(video_dir, "temp_train_video.mp4") + run_csv = os.path.join(data_dir, "highest_reward.csv") + + logging.info(f"New highest reward of {episode_reward}. Saving video and run data...") + + pd.DataFrame(run_data_rows).to_csv(run_csv, index=False) try: if os.path.exists(highest_reward_video): os.remove(highest_reward_video) + except: + logging.error("An error deleting the highest reward video occured :/") + try: os.rename(training_video, highest_reward_video) except: logging.error("An error renaming the video occured :/") # Reset environment - start_new_video = True + start_new_run = True state = env.reset() episode_timesteps = 0 episode_reward = 0 From d58e5372de646eb40f4667259432c3f2fc3c460d Mon Sep 17 00:00:00 2001 From: PK Wadsworth Date: Thu, 3 Oct 2024 16:08:25 +1300 Subject: [PATCH 15/26] HUGE FIX --- scripts/train_loops/policy_loop.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/scripts/train_loops/policy_loop.py b/scripts/train_loops/policy_loop.py index 3bee261..4c038a2 100644 --- a/scripts/train_loops/policy_loop.py +++ b/scripts/train_loops/policy_loop.py @@ -202,9 +202,9 @@ def policy_based_train( select_action_from_policy = agent.select_action_from_policy if "info" in inspect.signature(select_action_from_policy).parameters: - denormalised_action = select_action_from_policy(state, noise_scale=noise_scale, info=step_data) + normalised_action = select_action_from_policy(state, noise_scale=noise_scale, info=step_data) else: - denormalised_action = select_action_from_policy(state, noise_scale=noise_scale) + normalised_action = select_action_from_policy(state, noise_scale=noise_scale) # debug-log logging.info("Logging20") # mapping to env range [e.g. -2 , 2 for pendulum] - note for DMCS this is redudenant but required for openai @@ -289,22 +289,23 @@ def policy_based_train( # debug-log logging.info("Logging31") record.stop_video() + video_dir = os.path.join(record.directory, "videos") + data_dir = os.path.join(record.directory, "data") + + run_csv = os.path.join(data_dir, f"episode_{episode_num}.csv") + pd.DataFrame(run_data_rows).to_csv(run_csv, index=False) if episode_reward > highest_reward: highest_reward = episode_reward - video_dir = os.path.join(record.directory, "videos") - data_dir = os.path.join(record.directory, "data") highest_reward_video = os.path.join(video_dir, "highest_reward.mp4") training_video = os.path.join(video_dir, "temp_train_video.mp4") - run_csv = os.path.join(data_dir, "highest_reward.csv") logging.info(f"New highest reward of {episode_reward}. Saving video and run data...") - pd.DataFrame(run_data_rows).to_csv(run_csv, index=False) try: if os.path.exists(highest_reward_video): From f8d13d574c86eff909a4f204cbc3c59e9d4e7b0c Mon Sep 17 00:00:00 2001 From: PK Wadsworth Date: Thu, 3 Oct 2024 16:08:31 +1300 Subject: [PATCH 16/26] renamed brock to flexi --- shell-scripts/brock.sh | 1 - shell-scripts/flexi.sh | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) delete mode 100755 shell-scripts/brock.sh create mode 100755 shell-scripts/flexi.sh diff --git a/shell-scripts/brock.sh b/shell-scripts/brock.sh deleted file mode 100755 index 329b766..0000000 --- a/shell-scripts/brock.sh +++ /dev/null @@ -1 +0,0 @@ -python ./scripts/train.py run --gym pyboy --number_eval_episodes 1 --plot_frequency 10000000 --headless 0 --discrete 0 --domain pokemon --task brock SACAE diff --git a/shell-scripts/flexi.sh b/shell-scripts/flexi.sh new file mode 100755 index 0000000..998b0e5 --- /dev/null +++ b/shell-scripts/flexi.sh @@ -0,0 +1 @@ +python ./scripts/train.py run --gym pyboy --number_eval_episodes 1 --plot_frequency 10000000 --headless 1 --discrete 0 --domain pokemon --task flexi SACAE From d99c5ec6da94ddf5fe8f7401c6d63d84460d030b Mon Sep 17 00:00:00 2001 From: PK Wadsworth Date: Mon, 7 Oct 2024 11:00:09 +1300 Subject: [PATCH 17/26] Set frames to stack back to 3 --- scripts/util/configurations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/util/configurations.py b/scripts/util/configurations.py index 131557f..81f0966 100755 --- a/scripts/util/configurations.py +++ b/scripts/util/configurations.py @@ -33,7 +33,7 @@ class GymEnvironmentConfig(EnvironmentConfig): display: Optional[int] = 0 # image observation configurations - frames_to_stack: Optional[int] = 9 + frames_to_stack: Optional[int] = 3 frame_width: Optional[int] = 84 frame_height: Optional[int] = 84 grey_scale: Optional[int] = 0 From d20ad849871f82e7bc313394999f37492c3d6150 Mon Sep 17 00:00:00 2001 From: PK Wadsworth Date: Mon, 7 Oct 2024 11:50:02 +1300 Subject: [PATCH 18/26] Updated dockerfile and requirements --- Dockerfile | 41 +++++++++++++++++++++++++++++++++++++++++ requirements.txt | 2 +- 2 files changed, 42 insertions(+), 1 deletion(-) create mode 100644 Dockerfile diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..efc7487 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,41 @@ + +RUN apt-get update && \ + apt-get upgrade -y && \ + apt-get install -y git + +# Keeps Python from generating .pyc files in the container +ENV PYTHONDONTWRITEBYTECODE=1 + +# Turns off buffering for easier container logging +ENV PYTHONUNBUFFERED=1 + +WORKDIR /workspace + +# Install Cares Reinforcement Learning +RUN git clone https://github.com/UoA-CARES/cares_reinforcement_learning.git +WORKDIR /workspace/cares_reinforcement_learning +RUN pip3 install -r requirements.txt +RUN pip3 install --editable . + +WORKDIR /workspace + +# Install Pyboy Environments +RUN git clone https://github.com/UoA-CARES/pyboy_environment.git +WORKDIR /workspace/pyboy_environment +RUN pip3 install -r requirements.txt +RUN pip3 install --editable . + +WORKDIR /workspace + +RUN git clone https://github.com/UoA-CARES/gymnasium_envrionments.git +WORKDIR /workspace/gymnasium_envrionments +RUN pip3 install -r requirements.txt + +# We don't have GUI capabilities +RUN pip3 uninstall opencv-python +RUN pip3 install opencv-python-headless + +# Incase someone doesn't mount volume at runtime +VOLUME /root/cares_rl_logs + +WORKDIR /workspace/gymnasium_envrionments/scripts \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 4268b16..d695e56 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,6 +6,6 @@ numpy==1.26.4 opencv-contrib-python==4.6.0.66 pydantic==1.10.13 torch==2.3.1 -pyboy==2.2.1 +pyboy==2.2.2 plotly==5.22.0 mediapy==1.1.9 From bf94e9fc8fb090d802b5eeb288d8f03555ceb702 Mon Sep 17 00:00:00 2001 From: PK Wadsworth Date: Mon, 7 Oct 2024 11:56:52 +1300 Subject: [PATCH 19/26] Updated dockerfile --- Dockerfile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Dockerfile b/Dockerfile index efc7487..e44d71e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -14,6 +14,7 @@ WORKDIR /workspace # Install Cares Reinforcement Learning RUN git clone https://github.com/UoA-CARES/cares_reinforcement_learning.git WORKDIR /workspace/cares_reinforcement_learning +RUN git checkout action-info-loggin RUN pip3 install -r requirements.txt RUN pip3 install --editable . @@ -22,6 +23,7 @@ WORKDIR /workspace # Install Pyboy Environments RUN git clone https://github.com/UoA-CARES/pyboy_environment.git WORKDIR /workspace/pyboy_environment +RUN git checkout lvl-up-task RUN pip3 install -r requirements.txt RUN pip3 install --editable . @@ -29,6 +31,7 @@ WORKDIR /workspace RUN git clone https://github.com/UoA-CARES/gymnasium_envrionments.git WORKDIR /workspace/gymnasium_envrionments +RUN git checkout p4p-pokemon-docker RUN pip3 install -r requirements.txt # We don't have GUI capabilities From 3a4fea3d994fc2697d20da9a4eca96fad3d24496 Mon Sep 17 00:00:00 2001 From: PK Wadsworth Date: Mon, 7 Oct 2024 12:02:53 +1300 Subject: [PATCH 20/26] fixed dockerfile --- Dockerfile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Dockerfile b/Dockerfile index e44d71e..3590fd3 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,3 +1,6 @@ +# For more information, please refer to https://aka.ms/vscode-docker-python +FROM pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime +SHELL [ "/bin/bash", "-c" ] RUN apt-get update && \ apt-get upgrade -y && \ From c4b00269d25cc817d0b33acb4b9a5d9a3afbbc80 Mon Sep 17 00:00:00 2001 From: PK Wadsworth Date: Mon, 7 Oct 2024 12:42:23 +1300 Subject: [PATCH 21/26] Fixed dockerfile --- Dockerfile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Dockerfile b/Dockerfile index 3590fd3..fcbba2e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -17,7 +17,7 @@ WORKDIR /workspace # Install Cares Reinforcement Learning RUN git clone https://github.com/UoA-CARES/cares_reinforcement_learning.git WORKDIR /workspace/cares_reinforcement_learning -RUN git checkout action-info-loggin +RUN git checkout -t origin/action-info-logging RUN pip3 install -r requirements.txt RUN pip3 install --editable . @@ -26,7 +26,7 @@ WORKDIR /workspace # Install Pyboy Environments RUN git clone https://github.com/UoA-CARES/pyboy_environment.git WORKDIR /workspace/pyboy_environment -RUN git checkout lvl-up-task +RUN git checkout -t origin/lvl-up-task RUN pip3 install -r requirements.txt RUN pip3 install --editable . @@ -34,7 +34,7 @@ WORKDIR /workspace RUN git clone https://github.com/UoA-CARES/gymnasium_envrionments.git WORKDIR /workspace/gymnasium_envrionments -RUN git checkout p4p-pokemon-docker +RUN git checkout -t origin/p4p-pokemon-docker RUN pip3 install -r requirements.txt # We don't have GUI capabilities @@ -44,4 +44,4 @@ RUN pip3 install opencv-python-headless # Incase someone doesn't mount volume at runtime VOLUME /root/cares_rl_logs -WORKDIR /workspace/gymnasium_envrionments/scripts \ No newline at end of file +WORKDIR /workspace/gymnasium_envrionments \ No newline at end of file From 1c8676c9e5bf1df78e66921ab29c4c5f4dd35b87 Mon Sep 17 00:00:00 2001 From: PK Wadsworth Date: Mon, 7 Oct 2024 13:14:18 +1300 Subject: [PATCH 22/26] Fixed dockerfile --- Dockerfile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Dockerfile b/Dockerfile index fcbba2e..f7ed58a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -37,6 +37,9 @@ WORKDIR /workspace/gymnasium_envrionments RUN git checkout -t origin/p4p-pokemon-docker RUN pip3 install -r requirements.txt +WORKDIR /root +RUN git clone https://github.com/PKWadsy/cares_pokemon_configs.git cares_rl_configs + # We don't have GUI capabilities RUN pip3 uninstall opencv-python RUN pip3 install opencv-python-headless From 92d8cef59e07db00b2ccbafd7797f930003d89d8 Mon Sep 17 00:00:00 2001 From: PK Wadsworth Date: Thu, 10 Oct 2024 20:02:13 +1300 Subject: [PATCH 23/26] Removed eval and added better video saving --- scripts/train_loops/policy_loop.py | 23 ++--------------------- 1 file changed, 2 insertions(+), 21 deletions(-) diff --git a/scripts/train_loops/policy_loop.py b/scripts/train_loops/policy_loop.py index 4c038a2..963ab2c 100644 --- a/scripts/train_loops/policy_loop.py +++ b/scripts/train_loops/policy_loop.py @@ -301,20 +301,13 @@ def policy_based_train( highest_reward = episode_reward - highest_reward_video = os.path.join(video_dir, "highest_reward.mp4") + new_record_video = os.path.join(video_dir, f"new_record_episode_{episode_num+1}.mp4") training_video = os.path.join(video_dir, "temp_train_video.mp4") logging.info(f"New highest reward of {episode_reward}. Saving video and run data...") - - - try: - if os.path.exists(highest_reward_video): - os.remove(highest_reward_video) - except: - logging.error("An error deleting the highest reward video occured :/") try: - os.rename(training_video, highest_reward_video) + os.rename(training_video, new_record_video) except: logging.error("An error renaming the video occured :/") @@ -325,18 +318,6 @@ def policy_based_train( episode_reward = 0 episode_num += 1 episode_start = time.time() - - if (total_step_counter + 1) % number_steps_per_evaluation == 0: - logging.info("*************--Evaluation Loop--*************") - evaluate_policy_network( - env_eval, - agent, - train_config, - record=record, - total_steps=total_step_counter, - normalisation=normalisation, - ) - logging.info("--------------------------------------------") end_time = time.time() elapsed_time = end_time - start_time From c6ae3c5435bb07a4bbe224f665763d0650f30256 Mon Sep 17 00:00:00 2001 From: beardyface Date: Fri, 11 Oct 2024 11:28:27 +1300 Subject: [PATCH 24/26] Docker file merge with main --- scripts/train.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/scripts/train.py b/scripts/train.py index 3d8c439..7ef62cf 100755 --- a/scripts/train.py +++ b/scripts/train.py @@ -5,21 +5,18 @@ """ import logging -import sys import os +import sys -import torch import train_loops.policy_loop as pbe import train_loops.ppo_loop as ppe import train_loops.value_loop as vbe import yaml - -from environments.environment_factory import EnvironmentFactory -from util.configurations import GymEnvironmentConfig - from cares_reinforcement_learning.memory.memory_factory import MemoryFactory from cares_reinforcement_learning.util import NetworkFactory, Record, RLParser from cares_reinforcement_learning.util import helpers as hlp +from environments.environment_factory import EnvironmentFactory +from util.configurations import GymEnvironmentConfig logging.basicConfig(level=logging.INFO) From 25d5f32487904a8ffed0ea05b7e0278fb2a201d7 Mon Sep 17 00:00:00 2001 From: "Formatter [BOT]" Date: Thu, 6 Mar 2025 01:31:45 +0000 Subject: [PATCH 25/26] =?UTF-8?q?Auto-format=20code=20=F0=9F=A7=B9?= =?UTF-8?q?=F0=9F=8C=9F=F0=9F=A4=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- scripts/environments/gym_environment.py | 2 +- scripts/environments/image_wrapper.py | 2 +- scripts/train_loops/policy_loop.py | 55 ++++++++++++++++--------- 3 files changed, 38 insertions(+), 21 deletions(-) diff --git a/scripts/environments/gym_environment.py b/scripts/environments/gym_environment.py index 784e9d3..ed7204c 100644 --- a/scripts/environments/gym_environment.py +++ b/scripts/environments/gym_environment.py @@ -20,7 +20,7 @@ def render(self): @abc.abstractmethod def min_action_value(self): raise NotImplementedError("Override this method") - + @abc.abstractmethod def action_as_string(self, action): raise NotImplemented("Override this method") diff --git a/scripts/environments/image_wrapper.py b/scripts/environments/image_wrapper.py index af7513f..b1f4b58 100644 --- a/scripts/environments/image_wrapper.py +++ b/scripts/environments/image_wrapper.py @@ -41,7 +41,7 @@ def min_action_value(self): @cached_property def max_action_value(self): return self.gym.max_action_value - + def action_as_string(self, action): return self.gym.action_as_string(action) diff --git a/scripts/train_loops/policy_loop.py b/scripts/train_loops/policy_loop.py index 9fd5f51..6aace2f 100644 --- a/scripts/train_loops/policy_loop.py +++ b/scripts/train_loops/policy_loop.py @@ -12,6 +12,7 @@ import cv2 import numpy as np + def overlay_info(image, **kwargs): # Create a copy of the image to overlay text output_image = image.copy() @@ -23,15 +24,23 @@ def overlay_info(image, **kwargs): font = cv2.FONT_HERSHEY_SIMPLEX font_scale = 0.4 # Smaller font scale color = (0, 0, 255) # Red color in BGR - thickness = 1 # Thicker text + thickness = 1 # Thicker text # Create overlay text from the kwargs dictionary overlay_text = "\n".join([f"{key}: {value}" for key, value in kwargs.items()]) # Split the overlay text into lines and calculate position for each line - for i, line in enumerate(overlay_text.split('\n')): - cv2.putText(output_image, line, (text_x, text_y + i * 20), - font, font_scale, color, thickness, cv2.LINE_AA) + for i, line in enumerate(overlay_text.split("\n")): + cv2.putText( + output_image, + line, + (text_x, text_y + i * 20), + font, + font_scale, + color, + thickness, + cv2.LINE_AA, + ) return output_image @@ -57,10 +66,14 @@ def evaluate_policy_network( while not done and not truncated: episode_timesteps += 1 normalised_action = agent.select_action_from_policy(state, evaluation=True) - - denormalised_action = hlp.denormalize( - normalised_action, env.max_action_value, env.min_action_value - ) if normalisation else normalised_action + + denormalised_action = ( + hlp.denormalize( + normalised_action, env.max_action_value, env.min_action_value + ) + if normalisation + else normalised_action + ) state, reward, done, truncated = env.step(denormalised_action) episode_reward += reward @@ -98,7 +111,6 @@ def policy_based_train( display=False, normalisation=True, ): - start_new_run = True @@ -146,8 +158,7 @@ def policy_based_train( frame = env.grab_frame() record.start_video("temp_train_video", frame) run_data_rows = [] - - + if total_step_counter < max_steps_exploration: logging.info( f"Running Exploration Steps {total_step_counter + 1}/{max_steps_exploration}" @@ -172,9 +183,13 @@ def policy_based_train( select_action_from_policy = agent.select_action_from_policy if "info" in inspect.signature(select_action_from_policy).parameters: - normalised_action = select_action_from_policy(state, noise_scale=noise_scale, info=step_data) + normalised_action = select_action_from_policy( + state, noise_scale=noise_scale, info=step_data + ) else: - normalised_action = select_action_from_policy(state, noise_scale=noise_scale) + normalised_action = select_action_from_policy( + state, noise_scale=noise_scale + ) # mapping to env range [e.g. -2 , 2 for pendulum] - note for DMCS this is redudenant but required for openai if normalisation: @@ -228,7 +243,7 @@ def policy_based_train( if intrinsic_on: info["intrinsic_reward"] = intrinsic_reward - + if done or truncated: episode_time = time.time() - episode_start record.log_train( @@ -250,15 +265,17 @@ def policy_based_train( if episode_reward > highest_reward: - highest_reward = episode_reward - - new_record_video = os.path.join(video_dir, f"new_record_episode_{episode_num+1}.mp4") + new_record_video = os.path.join( + video_dir, f"new_record_episode_{episode_num+1}.mp4" + ) training_video = os.path.join(video_dir, "temp_train_video.mp4") - logging.info(f"New highest reward of {episode_reward}. Saving video and run data...") - + logging.info( + f"New highest reward of {episode_reward}. Saving video and run data..." + ) + try: os.rename(training_video, new_record_video) except: From 83cd7c54fe04b37b198af305226cdb74940d3bcb Mon Sep 17 00:00:00 2001 From: Sam Boasman Date: Fri, 7 Mar 2025 10:31:06 +1300 Subject: [PATCH 26/26] Update directory attribute acess for recording --- scripts/train_loops/policy_loop.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/train_loops/policy_loop.py b/scripts/train_loops/policy_loop.py index 6aace2f..d290360 100644 --- a/scripts/train_loops/policy_loop.py +++ b/scripts/train_loops/policy_loop.py @@ -257,8 +257,8 @@ def policy_based_train( ) record.stop_video() - video_dir = os.path.join(record.directory, "videos") - data_dir = os.path.join(record.directory, "data") + video_dir = os.path.join(record.current_sub_directory, "videos") + data_dir = os.path.join(record.current_sub_directory, "data") run_csv = os.path.join(data_dir, f"episode_{episode_num}.csv") pd.DataFrame(run_data_rows).to_csv(run_csv, index=False)