From 7c3346bb8c744f49480d34507baf49866ddb5d92 Mon Sep 17 00:00:00 2001
From: PK-and-Sam <kwad109@aucklanduni.ac.nz>
Date: Sat, 20 Jul 2024 15:32:06 +1200
Subject: [PATCH 01/26] Temporary fix for deep copy bug: evaluate on env directly

---
 scripts/train_loops/policy_loop.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/train_loops/policy_loop.py b/scripts/train_loops/policy_loop.py
index e0455be..b472edf 100644
--- a/scripts/train_loops/policy_loop.py
+++ b/scripts/train_loops/policy_loop.py
@@ -154,7 +154,7 @@ def policy_based_train(
         if (total_step_counter + 1) % number_steps_per_evaluation == 0:
             logging.info("*************--Evaluation Loop--*************")
             evaluate_policy_network(
-                copy.deepcopy(env),
+                env,
                 agent,
                 train_config,
                 record=record,

From 311d388c744071ccb8ead893517697f40607b26d Mon Sep 17 00:00:00 2001
From: PK-and-Sam <kwad109@aucklanduni.ac.nz>
Date: Sat, 20 Jul 2024 16:02:16 +1200
Subject: [PATCH 02/26] Added shell-scripts directory to .gitignore

---
 .gitignore | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 1bd8f1f..e1667b3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,4 +15,6 @@ dist/
 .DS_Store
 rl_logs/
 
-configs
\ No newline at end of file
+configs
+
+shell-scripts
\ No newline at end of file

From 9b3009ef0e8635fdb524a9e601dbb59b95fcd16d Mon Sep 17 00:00:00 2001
From: PK-and-Sam <kwad109@aucklanduni.ac.nz>
Date: Sun, 21 Jul 2024 17:53:14 +1200
Subject: [PATCH 03/26] Added fight script

---
 .gitignore             | 4 +---
 shell-scripts/fight.sh | 1 +
 2 files changed, 2 insertions(+), 3 deletions(-)
 create mode 100755 shell-scripts/fight.sh

diff --git a/.gitignore b/.gitignore
index e1667b3..1bd8f1f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,6 +15,4 @@ dist/
 .DS_Store
 rl_logs/
 
-configs
-
-shell-scripts
\ No newline at end of file
+configs
\ No newline at end of file
diff --git a/shell-scripts/fight.sh b/shell-scripts/fight.sh
new file mode 100755
index 0000000..cd44790
--- /dev/null
+++ b/shell-scripts/fight.sh
@@ -0,0 +1 @@
+python ./scripts/train.py run --gym pyboy --number_eval_episodes 1 --plot_frequency 10000000 --headless 1 --domain pokemon --task fight SACAE

From d4634957c7608cd31cbf7e3fc8aff5c6d09a1baa Mon Sep 17 00:00:00 2001
From: PKWadsworth <pkwads2@gmail.com>
Date: Sun, 21 Jul 2024 18:57:35 +1200
Subject: [PATCH 04/26] Made pyboy env use sample action from env

---
 scripts/environments/pyboy/pyboy_environment.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/scripts/environments/pyboy/pyboy_environment.py b/scripts/environments/pyboy/pyboy_environment.py
index 5159744..00803e7 100644
--- a/scripts/environments/pyboy/pyboy_environment.py
+++ b/scripts/environments/pyboy/pyboy_environment.py
@@ -36,9 +36,7 @@ def action_num(self) -> int:
         return self.env.action_num
 
     def sample_action(self):
-        return np.random.uniform(
-            self.min_action_value, self.max_action_value, size=self.action_num
-        )
+        return self.env.sample_action()
 
     def set_seed(self, seed: int) -> None:
         self.env.set_seed(seed)

From c5e2ad8382fb157eca990e10f645705d5b12d130 Mon Sep 17 00:00:00 2001
From: PK-and-Sam <kwad109@aucklanduni.ac.nz>
Date: Tue, 23 Jul 2024 17:06:39 +1200
Subject: [PATCH 05/26] Added the discrete policy loop

---
 scripts/train.py                            |  11 ++
 scripts/train_loops/discrete_policy_loop.py | 114 ++++++++++++++++++++
 2 files changed, 125 insertions(+)
 create mode 100644 scripts/train_loops/discrete_policy_loop.py

diff --git a/scripts/train.py b/scripts/train.py
index 1af81b4..79be83d 100755
--- a/scripts/train.py
+++ b/scripts/train.py
@@ -8,6 +8,7 @@
 import sys
 
 import torch
+import train_loops.discrete_policy_loop as dpbe
 import train_loops.policy_loop as pbe
 import train_loops.ppo_loop as ppe
 import train_loops.value_loop as vbe
@@ -133,6 +134,16 @@ def main():
                 alg_config,
                 display=env_config.display,
             )
+        elif agent.type == "discrete_policy":
+            dpbe.discrete_policy_based_train(
+                env,
+                agent,
+                memory,
+                record,
+                training_config,
+                alg_config,
+                display=env_config.display,
+            )
         elif agent.type == "policy":
             pbe.policy_based_train(
                 env,
diff --git a/scripts/train_loops/discrete_policy_loop.py b/scripts/train_loops/discrete_policy_loop.py
new file mode 100644
index 0000000..773d5e4
--- /dev/null
+++ b/scripts/train_loops/discrete_policy_loop.py
@@ -0,0 +1,114 @@
+import logging
+import time
+
+from cares_reinforcement_learning.util import helpers as hlp
+from cares_reinforcement_learning.util.configurations import (
+    AlgorithmConfig,
+    TrainingConfig,
+)
+
+def discrete_policy_based_train(
+    env,
+    agent,
+    memory,
+    record,
+    train_config: TrainingConfig,
+    alg_config: AlgorithmConfig,
+    display=False,
+):
+    start_time = time.time()
+
+    max_steps_training = alg_config.max_steps_training
+    max_steps_exploration = alg_config.max_steps_exploration
+    number_steps_per_evaluation = train_config.number_steps_per_evaluation
+    number_steps_per_train_policy = alg_config.number_steps_per_train_policy
+
+    # Algorthm specific attributes - e.g. NaSA-TD3 dd
+    intrinsic_on = (
+        bool(alg_config.intrinsic_on) if hasattr(alg_config, "intrinsic_on") else False
+    )
+
+    min_noise = alg_config.min_noise if hasattr(alg_config, "min_noise") else 0
+    noise_decay = alg_config.noise_decay if hasattr(alg_config, "noise_decay") else 1.0
+    noise_scale = alg_config.noise_scale if hasattr(alg_config, "noise_scale") else 0.1
+
+    logging.info(
+        f"Training {max_steps_training} Exploration {max_steps_exploration} Evaluation {number_steps_per_evaluation}"
+    )
+
+    batch_size = alg_config.batch_size
+    G = alg_config.G
+
+    episode_timesteps = 0
+    episode_reward = 0
+    episode_num = 0
+
+    state = env.reset()
+
+    episode_start = time.time()
+    for total_step_counter in range(int(max_steps_training)):
+        episode_timesteps += 1
+
+        if total_step_counter < max_steps_exploration:
+            logging.info(
+                f"Running Exploration Steps {total_step_counter + 1}/{max_steps_exploration}"
+            )
+
+            action = env.sample_action()
+
+        else:
+            noise_scale *= noise_decay
+            noise_scale = max(min_noise, noise_scale)
+
+            # algorithm range [-1, 1]
+            action = agent.select_action_from_policy(state, noise_scale=noise_scale)
+
+        next_state, reward_extrinsic, done, truncated = env.step(action)
+        if display:
+            env.render()
+
+        intrinsic_reward = 0
+        if intrinsic_on and total_step_counter > max_steps_exploration:
+            intrinsic_reward = agent.get_intrinsic_reward(state, action, next_state)
+
+        total_reward = reward_extrinsic + intrinsic_reward
+
+        memory.add(
+            state,
+            action,
+            total_reward,
+            next_state,
+            done,
+        )
+
+        state = next_state
+        episode_reward += reward_extrinsic  # Note we only track the extrinsic reward for the episode for proper comparison
+
+        if (
+            total_step_counter >= max_steps_exploration
+            and total_step_counter % number_steps_per_train_policy == 0
+        ):
+            for _ in range(G):
+                agent.train_policy(memory, batch_size)
+
+        if done or truncated:
+            episode_time = time.time() - episode_start
+            record.log_train(
+                total_steps=total_step_counter + 1,
+                episode=episode_num + 1,
+                episode_steps=episode_timesteps,
+                episode_reward=episode_reward,
+                episode_time=episode_time,
+                display=True,
+            )
+
+            # Reset environment
+            state = env.reset()
+            episode_timesteps = 0
+            episode_reward = 0
+            episode_num += 1
+            episode_start = time.time()
+
+    end_time = time.time()
+    elapsed_time = end_time - start_time
+    print("Training time:", time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))

From a7077962d6c494fc1dda2a4a508fc05949683e8f Mon Sep 17 00:00:00 2001
From: PK-and-Sam <kwad109@aucklanduni.ac.nz>
Date: Wed, 7 Aug 2024 16:47:07 +1200
Subject: [PATCH 06/26] Added discrete policy loop for pokemon and fight script

---
 requirements.txt                              |  1 +
 .../environments/openai/openai_environment.py |  2 +
 scripts/plot.py                               | 45 ++++++++++++++
 scripts/train.py                              |  1 -
 scripts/train_loops/discrete_policy_loop.py   | 61 ++++++++++++++++++-
 shell-scripts/fight.sh                        |  2 +-
 6 files changed, 108 insertions(+), 4 deletions(-)
 create mode 100644 scripts/plot.py

diff --git a/requirements.txt b/requirements.txt
index 5f90734..4268b16 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,4 +7,5 @@ opencv-contrib-python==4.6.0.66
 pydantic==1.10.13
 torch==2.3.1
 pyboy==2.2.1
+plotly==5.22.0
 mediapy==1.1.9
diff --git a/scripts/environments/openai/openai_environment.py b/scripts/environments/openai/openai_environment.py
index 1dabafb..80b9e5a 100644
--- a/scripts/environments/openai/openai_environment.py
+++ b/scripts/environments/openai/openai_environment.py
@@ -15,10 +15,12 @@ def __init__(self, config: GymEnvironmentConfig) -> None:
 
     @cached_property
     def max_action_value(self) -> float:
+        return self.env.action_space.n - 1
         return self.env.action_space.high[0]
 
     @cached_property
     def min_action_value(self) -> float:
+        return 0
         return self.env.action_space.low[0]
 
     @cached_property
diff --git a/scripts/plot.py b/scripts/plot.py
new file mode 100644
index 0000000..b807d91
--- /dev/null
+++ b/scripts/plot.py
@@ -0,0 +1,45 @@
+import os
+import sys
+import pandas as pd
+import plotly.express as px
+import plotly.graph_objects as go
+
+def plot_csv_files(directory_path):
+    """Plot every column of train.csv and eval.csv found in directory_path.
+
+    Draws one subplot per file (train on top, eval below) with one line per
+    CSV column against the row index, then opens the figure with fig.show().
+    Prints a message and returns None if either file is missing.
+    """
+    # plotly.express has no make_subplots; the factory is in plotly.subplots.
+    from plotly.subplots import make_subplots
+
+    train_file_path = os.path.join(directory_path, 'train.csv')
+    eval_file_path = os.path.join(directory_path, 'eval.csv')
+
+    # Bail out before reading anything if either log is absent.
+    if not os.path.exists(train_file_path):
+        print(f"Train file not found at {train_file_path}")
+        return
+    if not os.path.exists(eval_file_path):
+        print(f"Eval file not found at {eval_file_path}")
+        return
+
+    frames = (('Train', pd.read_csv(train_file_path)), ('Eval', pd.read_csv(eval_file_path)))
+    fig = make_subplots(rows=2, cols=1, shared_xaxes=True,
+                        subplot_titles=('Train Data', 'Eval Data'))
+
+    # Row 1 holds the train traces, row 2 the eval traces.
+    for row, (label, df) in enumerate(frames, start=1):
+        for col in df.columns:
+            fig.add_trace(go.Scatter(x=df.index, y=df[col], mode='lines', name=f'{label} {col}'), row=row, col=1)
+
+    fig.update_layout(height=600, width=800, title_text="Train and Eval Data Plots")
+    fig.show()
+
+if __name__ == "__main__":
+    # Exactly one argument is expected: the directory that holds
+    # train.csv and eval.csv.
+    if len(sys.argv) != 2:
+        sys.exit("Usage: python plot.py <directory_path>")
+    plot_csv_files(sys.argv[1])
diff --git a/scripts/train.py b/scripts/train.py
index 79be83d..7363bb6 100755
--- a/scripts/train.py
+++ b/scripts/train.py
@@ -111,7 +111,6 @@ def main():
         )
         # create the record class - standardised results tracking
         record = Record(
-            glob_log_dir="",
             log_dir=log_dir,
             algorithm=alg_config.algorithm,
             task=env_config.task,
diff --git a/scripts/train_loops/discrete_policy_loop.py b/scripts/train_loops/discrete_policy_loop.py
index 773d5e4..438ef1b 100644
--- a/scripts/train_loops/discrete_policy_loop.py
+++ b/scripts/train_loops/discrete_policy_loop.py
@@ -7,6 +7,52 @@
     TrainingConfig,
 )
 
+def evaluate_policy_network(
+    env, agent, config: TrainingConfig, record=None, total_steps=0
+):
+    state = env.reset()
+
+    if record is not None:
+        frame = env.grab_frame()
+        record.start_video(total_steps + 1, frame)
+
+    number_eval_episodes = int(config.number_eval_episodes)
+
+    for eval_episode_counter in range(number_eval_episodes):
+        episode_timesteps = 0
+        episode_reward = 0
+        episode_num = 0
+        done = False
+        truncated = False
+
+        while not done and not truncated:
+            episode_timesteps += 1
+            action = agent.select_action_from_policy(state, evaluation=True).item()
+
+            state, reward, done, truncated = env.step(action)
+            episode_reward += reward
+
+            if eval_episode_counter == 0 and record is not None:
+                frame = env.grab_frame()
+                record.log_video(frame)
+
+            if done or truncated:
+                if record is not None:
+                    record.log_eval(
+                        total_steps=total_steps + 1,
+                        episode=eval_episode_counter + 1,
+                        episode_reward=episode_reward,
+                        display=True,
+                    )
+
+                # Reset environment
+                state = env.reset()
+                episode_reward = 0
+                episode_timesteps = 0
+                episode_num += 1
+
+    record.stop_video()
+
 def discrete_policy_based_train(
     env,
     agent,
@@ -53,7 +99,6 @@ def discrete_policy_based_train(
             logging.info(
                 f"Running Exploration Steps {total_step_counter + 1}/{max_steps_exploration}"
             )
-
             action = env.sample_action()
 
         else:
@@ -61,7 +106,8 @@ def discrete_policy_based_train(
             noise_scale = max(min_noise, noise_scale)
 
             # algorithm range [-1, 1]
-            action = agent.select_action_from_policy(state, noise_scale=noise_scale)
+            action_tensor = agent.select_action_from_policy(state, noise_scale=noise_scale)
+            action = action_tensor.item()
 
         next_state, reward_extrinsic, done, truncated = env.step(action)
         if display:
@@ -91,6 +137,17 @@ def discrete_policy_based_train(
             for _ in range(G):
                 agent.train_policy(memory, batch_size)
 
+        if (total_step_counter + 1) % number_steps_per_evaluation == 0:
+            logging.info("*************--Evaluation Loop--*************")
+            evaluate_policy_network(
+                env,
+                agent,
+                train_config,
+                record=record,
+                total_steps=total_step_counter,
+            )
+            logging.info("--------------------------------------------")
+
         if done or truncated:
             episode_time = time.time() - episode_start
             record.log_train(
diff --git a/shell-scripts/fight.sh b/shell-scripts/fight.sh
index cd44790..3263ceb 100755
--- a/shell-scripts/fight.sh
+++ b/shell-scripts/fight.sh
@@ -1 +1 @@
-python ./scripts/train.py run --gym pyboy --number_eval_episodes 1 --plot_frequency 10000000 --headless 1 --domain pokemon --task fight SACAE
+python ./scripts/train.py run --gym pyboy --number_eval_episodes 1 --plot_frequency 10000000 --headless 1 --domain pokemon --task fight SACD --image_observation 1

From eea9c31c0dbd213c564666318770a470d2cbdf55 Mon Sep 17 00:00:00 2001
From: PKWadsworth <pkwads2@gmail.com>
Date: Tue, 13 Aug 2024 15:30:46 +1200
Subject: [PATCH 07/26] Removed discrete policy loop

---
 scripts/train_loops/discrete_policy_loop.py | 171 --------------------
 1 file changed, 171 deletions(-)
 delete mode 100644 scripts/train_loops/discrete_policy_loop.py

diff --git a/scripts/train_loops/discrete_policy_loop.py b/scripts/train_loops/discrete_policy_loop.py
deleted file mode 100644
index 438ef1b..0000000
--- a/scripts/train_loops/discrete_policy_loop.py
+++ /dev/null
@@ -1,171 +0,0 @@
-import logging
-import time
-
-from cares_reinforcement_learning.util import helpers as hlp
-from cares_reinforcement_learning.util.configurations import (
-    AlgorithmConfig,
-    TrainingConfig,
-)
-
-def evaluate_policy_network(
-    env, agent, config: TrainingConfig, record=None, total_steps=0
-):
-    state = env.reset()
-
-    if record is not None:
-        frame = env.grab_frame()
-        record.start_video(total_steps + 1, frame)
-
-    number_eval_episodes = int(config.number_eval_episodes)
-
-    for eval_episode_counter in range(number_eval_episodes):
-        episode_timesteps = 0
-        episode_reward = 0
-        episode_num = 0
-        done = False
-        truncated = False
-
-        while not done and not truncated:
-            episode_timesteps += 1
-            action = agent.select_action_from_policy(state, evaluation=True).item()
-
-            state, reward, done, truncated = env.step(action)
-            episode_reward += reward
-
-            if eval_episode_counter == 0 and record is not None:
-                frame = env.grab_frame()
-                record.log_video(frame)
-
-            if done or truncated:
-                if record is not None:
-                    record.log_eval(
-                        total_steps=total_steps + 1,
-                        episode=eval_episode_counter + 1,
-                        episode_reward=episode_reward,
-                        display=True,
-                    )
-
-                # Reset environment
-                state = env.reset()
-                episode_reward = 0
-                episode_timesteps = 0
-                episode_num += 1
-
-    record.stop_video()
-
-def discrete_policy_based_train(
-    env,
-    agent,
-    memory,
-    record,
-    train_config: TrainingConfig,
-    alg_config: AlgorithmConfig,
-    display=False,
-):
-    start_time = time.time()
-
-    max_steps_training = alg_config.max_steps_training
-    max_steps_exploration = alg_config.max_steps_exploration
-    number_steps_per_evaluation = train_config.number_steps_per_evaluation
-    number_steps_per_train_policy = alg_config.number_steps_per_train_policy
-
-    # Algorthm specific attributes - e.g. NaSA-TD3 dd
-    intrinsic_on = (
-        bool(alg_config.intrinsic_on) if hasattr(alg_config, "intrinsic_on") else False
-    )
-
-    min_noise = alg_config.min_noise if hasattr(alg_config, "min_noise") else 0
-    noise_decay = alg_config.noise_decay if hasattr(alg_config, "noise_decay") else 1.0
-    noise_scale = alg_config.noise_scale if hasattr(alg_config, "noise_scale") else 0.1
-
-    logging.info(
-        f"Training {max_steps_training} Exploration {max_steps_exploration} Evaluation {number_steps_per_evaluation}"
-    )
-
-    batch_size = alg_config.batch_size
-    G = alg_config.G
-
-    episode_timesteps = 0
-    episode_reward = 0
-    episode_num = 0
-
-    state = env.reset()
-
-    episode_start = time.time()
-    for total_step_counter in range(int(max_steps_training)):
-        episode_timesteps += 1
-
-        if total_step_counter < max_steps_exploration:
-            logging.info(
-                f"Running Exploration Steps {total_step_counter + 1}/{max_steps_exploration}"
-            )
-            action = env.sample_action()
-
-        else:
-            noise_scale *= noise_decay
-            noise_scale = max(min_noise, noise_scale)
-
-            # algorithm range [-1, 1]
-            action_tensor = agent.select_action_from_policy(state, noise_scale=noise_scale)
-            action = action_tensor.item()
-
-        next_state, reward_extrinsic, done, truncated = env.step(action)
-        if display:
-            env.render()
-
-        intrinsic_reward = 0
-        if intrinsic_on and total_step_counter > max_steps_exploration:
-            intrinsic_reward = agent.get_intrinsic_reward(state, action, next_state)
-
-        total_reward = reward_extrinsic + intrinsic_reward
-
-        memory.add(
-            state,
-            action,
-            total_reward,
-            next_state,
-            done,
-        )
-
-        state = next_state
-        episode_reward += reward_extrinsic  # Note we only track the extrinsic reward for the episode for proper comparison
-
-        if (
-            total_step_counter >= max_steps_exploration
-            and total_step_counter % number_steps_per_train_policy == 0
-        ):
-            for _ in range(G):
-                agent.train_policy(memory, batch_size)
-
-        if (total_step_counter + 1) % number_steps_per_evaluation == 0:
-            logging.info("*************--Evaluation Loop--*************")
-            evaluate_policy_network(
-                env,
-                agent,
-                train_config,
-                record=record,
-                total_steps=total_step_counter,
-            )
-            logging.info("--------------------------------------------")
-
-        if done or truncated:
-            episode_time = time.time() - episode_start
-            record.log_train(
-                total_steps=total_step_counter + 1,
-                episode=episode_num + 1,
-                episode_steps=episode_timesteps,
-                episode_reward=episode_reward,
-                episode_time=episode_time,
-                display=True,
-            )
-
-            # Reset environment
-            state = env.reset()
-            episode_timesteps = 0
-            episode_reward = 0
-            episode_num += 1
-            episode_start = time.time()
-
-    end_time = time.time()
-    elapsed_time = end_time - start_time
-    print("Training time:", time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))

From efb3ab9e8d5e9c65185a49cad6bfd20e104beb6d Mon Sep 17 00:00:00 2001
From: PK-and-Sam <kwad109@aucklanduni.ac.nz>
Date: Wed, 14 Aug 2024 15:53:59 +1200
Subject: [PATCH 08/26] Fixed import error

---
 scripts/train.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/scripts/train.py b/scripts/train.py
index 5448db5..9681037 100755
--- a/scripts/train.py
+++ b/scripts/train.py
@@ -9,7 +9,6 @@
 import os
 
 import torch
-import train_loops.discrete_policy_loop as dpbe
 import train_loops.policy_loop as pbe
 import train_loops.ppo_loop as ppe
 import train_loops.value_loop as vbe

From 68220e9bad5739600070fb37f0086395027dc483 Mon Sep 17 00:00:00 2001
From: PK-and-Sam <kwad109@aucklanduni.ac.nz>
Date: Wed, 14 Aug 2024 17:08:32 +1200
Subject: [PATCH 09/26] Fixed policy loop

---
 scripts/train_loops/policy_loop.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/scripts/train_loops/policy_loop.py b/scripts/train_loops/policy_loop.py
index d079bf0..9f713cd 100644
--- a/scripts/train_loops/policy_loop.py
+++ b/scripts/train_loops/policy_loop.py
@@ -10,7 +10,7 @@
 
 
 def evaluate_policy_network(
-    env, agent, config: TrainingConfig, record=None, total_steps=0
+    env, agent, config: TrainingConfig, record=None, total_steps=0, normalisation=True
 ):
     state = env.reset()
 
@@ -29,12 +29,12 @@ def evaluate_policy_network(
 
         while not done and not truncated:
             episode_timesteps += 1
-            action = agent.select_action_from_policy(state, evaluation=True)
-            action_env = hlp.denormalize(
-                action, env.max_action_value, env.min_action_value
-            )
+            normalised_action = agent.select_action_from_policy(state, evaluation=True)
+            denormalised_action = hlp.denormalize(
+                normalised_action, env.max_action_value, env.min_action_value
+            ) if normalisation else normalised_action
 
-            state, reward, done, truncated = env.step(action_env)
+            state, reward, done, truncated = env.step(denormalised_action)
             episode_reward += reward
 
             if eval_episode_counter == 0 and record is not None:
@@ -171,6 +171,7 @@ def policy_based_train(
                 train_config,
                 record=record,
                 total_steps=total_step_counter,
+                normalisation=normalisation
             )
             logging.info("--------------------------------------------")
 

From 855c924059e953952334b07466b922ef6541ac91 Mon Sep 17 00:00:00 2001
From: PK Wadsworth <pkwads2@gmail.com>
Date: Tue, 10 Sep 2024 13:10:34 +1200
Subject: [PATCH 10/26] Added discrete config

---
 scripts/environments/pyboy/pyboy_environment.py | 1 +
 scripts/util/configurations.py                  | 1 +
 shell-scripts/fight.sh                          | 2 +-
 3 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/scripts/environments/pyboy/pyboy_environment.py b/scripts/environments/pyboy/pyboy_environment.py
index 00803e7..56d31c0 100644
--- a/scripts/environments/pyboy/pyboy_environment.py
+++ b/scripts/environments/pyboy/pyboy_environment.py
@@ -17,6 +17,7 @@ def __init__(self, config: GymEnvironmentConfig) -> None:
             config.act_freq,
             config.emulation_speed,
             config.headless,
+            config.discrete,
         )
 
     @cached_property
diff --git a/scripts/util/configurations.py b/scripts/util/configurations.py
index 05c73f3..81f0966 100755
--- a/scripts/util/configurations.py
+++ b/scripts/util/configurations.py
@@ -43,3 +43,4 @@ class GymEnvironmentConfig(EnvironmentConfig):
     act_freq: Optional[int] = 24
     emulation_speed: Optional[int] = 0
     headless: Optional[int] = 0
+    discrete: Optional[int] = 0
diff --git a/shell-scripts/fight.sh b/shell-scripts/fight.sh
index 3263ceb..bc84cc3 100755
--- a/shell-scripts/fight.sh
+++ b/shell-scripts/fight.sh
@@ -1 +1 @@
-python ./scripts/train.py run --gym pyboy --number_eval_episodes 1 --plot_frequency 10000000 --headless 1 --domain pokemon --task fight SACD --image_observation 1
+python ./scripts/train.py run --gym pyboy --number_eval_episodes 1 --plot_frequency 10000000 --headless 0 --domain pokemon --task fight SACAE

From e327a9b3142682d5d10d802151521ab495507173 Mon Sep 17 00:00:00 2001
From: PK Wadsworth <pkwads2@gmail.com>
Date: Thu, 12 Sep 2024 16:24:17 +1200
Subject: [PATCH 11/26] Added log comments and discretisation

---
 scripts/environments/gym_environment.py       |  4 ++
 .../environments/pyboy/pyboy_environment.py   |  2 +
 scripts/train_loops/policy_loop.py            | 44 +++++++++++++++++++
 shell-scripts/fight.sh                        |  2 +-
 4 files changed, 51 insertions(+), 1 deletion(-)

diff --git a/scripts/environments/gym_environment.py b/scripts/environments/gym_environment.py
index a779a60..ce8fbba 100644
--- a/scripts/environments/gym_environment.py
+++ b/scripts/environments/gym_environment.py
@@ -12,9 +12,13 @@ def __init__(self, config: GymEnvironmentConfig) -> None:
         self.task = config.task
 
     def render(self):
+        # debug-log logging.info("Logging128")
         frame = self.grab_frame()
+        # debug-log logging.info("Logging129")
         cv2.imshow(f"{self.task}", frame)
+        # debug-log logging.info("Logging130")
         cv2.waitKey(10)
+        # debug-log logging.info("Logging131")
 
     @cached_property
     @abc.abstractmethod
diff --git a/scripts/environments/pyboy/pyboy_environment.py b/scripts/environments/pyboy/pyboy_environment.py
index 56d31c0..698ba52 100644
--- a/scripts/environments/pyboy/pyboy_environment.py
+++ b/scripts/environments/pyboy/pyboy_environment.py
@@ -1,6 +1,7 @@
 from functools import cached_property
 
 import numpy as np
+import logging
 from environments.gym_environment import GymEnvironment
 from util.configurations import GymEnvironmentConfig
 
@@ -46,6 +47,7 @@ def reset(self) -> np.ndarray:
         return self.env.reset()
 
     def step(self, action: int) -> tuple:
+        # debug-log logging.info("Logging109")
         return self.env.step(action)
 
     def grab_frame(self, height=240, width=300) -> np.ndarray:
diff --git a/scripts/train_loops/policy_loop.py b/scripts/train_loops/policy_loop.py
index 9f713cd..ca318ed 100644
--- a/scripts/train_loops/policy_loop.py
+++ b/scripts/train_loops/policy_loop.py
@@ -12,13 +12,19 @@
 def evaluate_policy_network(
     env, agent, config: TrainingConfig, record=None, total_steps=0, normalisation=True
 ):
+    # debug-log logging.info("Logging32")
     state = env.reset()
 
+    # debug-log logging.info("Logging33")
     if record is not None:
+        # debug-log logging.info("Logging34")
         frame = env.grab_frame()
         record.start_video(total_steps + 1, frame)
+        # debug-log logging.info("Logging35")
 
+    # debug-log logging.info("Logging36")
     number_eval_episodes = int(config.number_eval_episodes)
+    # debug-log logging.info("Logging37")
 
     for eval_episode_counter in range(number_eval_episodes):
         episode_timesteps = 0
@@ -27,28 +33,39 @@ def evaluate_policy_network(
         done = False
         truncated = False
 
+        # debug-log logging.info("Logging38")
         while not done and not truncated:
+            # debug-log logging.info("Logging39")
             episode_timesteps += 1
             normalised_action = agent.select_action_from_policy(state, evaluation=True)
+            # debug-log logging.info("Logging40")
             denormalised_action = hlp.denormalize(
                 normalised_action, env.max_action_value, env.min_action_value
             ) if normalisation else normalised_action
 
+            # debug-log logging.info("Logging41")
             state, reward, done, truncated = env.step(denormalised_action)
             episode_reward += reward
 
+            # debug-log logging.info("Logging42")
             if eval_episode_counter == 0 and record is not None:
+                # debug-log logging.info("Logging44")
                 frame = env.grab_frame()
                 record.log_video(frame)
+                # debug-log logging.info("Logging45")
 
+            # debug-log logging.info("Logging43")
             if done or truncated:
+                # debug-log logging.info("Logging46")
                 if record is not None:
+                    # debug-log logging.info("Logging47")
                     record.log_eval(
                         total_steps=total_steps + 1,
                         episode=eval_episode_counter + 1,
                         episode_reward=episode_reward,
                         display=True,
                     )
+                    # debug-log logging.info("Logging48")
 
                 # Reset environment
                 state = env.reset()
@@ -56,7 +73,9 @@ def evaluate_policy_network(
                 episode_timesteps = 0
                 episode_num += 1
 
+    # debug-log logging.info("Logging49")
     record.stop_video()
+    # debug-log logging.info("Logging50")
 
 
 def policy_based_train(
@@ -70,18 +89,22 @@ def policy_based_train(
     display=False,
     normalisation=True,
 ):
+    # debug-log logging.info("Logging9")
     start_time = time.time()
 
+    # debug-log logging.info("Logging10")
     max_steps_training = alg_config.max_steps_training
     max_steps_exploration = alg_config.max_steps_exploration
     number_steps_per_evaluation = train_config.number_steps_per_evaluation
     number_steps_per_train_policy = alg_config.number_steps_per_train_policy
 
+    # debug-log logging.info("Logging11")
     # Algorthm specific attributes - e.g. NaSA-TD3 dd
     intrinsic_on = (
         bool(alg_config.intrinsic_on) if hasattr(alg_config, "intrinsic_on") else False
     )
 
+    # debug-log logging.info("Logging12")
     min_noise = alg_config.min_noise if hasattr(alg_config, "min_noise") else 0
     noise_decay = alg_config.noise_decay if hasattr(alg_config, "noise_decay") else 1.0
     noise_scale = alg_config.noise_scale if hasattr(alg_config, "noise_scale") else 0.1
@@ -99,10 +122,13 @@ def policy_based_train(
 
     state = env.reset()
 
+    # debug-log logging.info("Logging13")
     episode_start = time.time()
     for total_step_counter in range(int(max_steps_training)):
+        # debug-log logging.info("Logging14")
         episode_timesteps += 1
 
+        # debug-log logging.info("Logging15")
         if total_step_counter < max_steps_exploration:
             logging.info(
                 f"Running Exploration Steps {total_step_counter + 1}/{max_steps_exploration}"
@@ -110,6 +136,7 @@ def policy_based_train(
 
             denormalised_action = env.sample_action()
 
+            # debug-log logging.info("Logging16")
             # algorithm range [-1, 1] - note for DMCS this is redudenant but required for openai
             if normalisation:
                 normalised_action = hlp.normalize(
@@ -117,14 +144,18 @@ def policy_based_train(
                 )
             else:
                 normalised_action = denormalised_action
+            # debug-log logging.info("Logging17")
         else:
+            # debug-log logging.info("Logging18")
             noise_scale *= noise_decay
             noise_scale = max(min_noise, noise_scale)
 
+            # debug-log logging.info("Logging19")
             # algorithm range [-1, 1]
             normalised_action = agent.select_action_from_policy(
                 state, noise_scale=noise_scale
             )
+            # debug-log logging.info("Logging20")
             # mapping to env range [e.g. -2 , 2 for pendulum] - note for DMCS this is redudenant but required for openai
             if normalisation:
                 denormalised_action = hlp.denormalize(
@@ -132,19 +163,26 @@ def policy_based_train(
                 )
             else:
                 denormalised_action = normalised_action
+            # debug-log logging.info("Logging21")
 
+        # debug-log logging.info("Logging22")
         next_state, reward_extrinsic, done, truncated = env.step(denormalised_action)
+        # debug-log logging.info("Logging23")
         if display:
+            # debug-log logging.info("Logging128")
             env.render()
 
+        # debug-log logging.info("Logging23")
         intrinsic_reward = 0
         if intrinsic_on and total_step_counter > max_steps_exploration:
             intrinsic_reward = agent.get_intrinsic_reward(
                 state, normalised_action, next_state
             )
 
+        # debug-log logging.info("Logging24")
         total_reward = reward_extrinsic + intrinsic_reward
 
+        # debug-log logging.info("Logging25")
         memory.add(
             state,
             normalised_action,
@@ -152,16 +190,20 @@ def policy_based_train(
             next_state,
             done,
         )
+        # debug-log logging.info("Logging26")
 
         state = next_state
         episode_reward += reward_extrinsic  # Note we only track the extrinsic reward for the episode for proper comparison
+        # debug-log logging.info("Logging27")
 
         if (
             total_step_counter >= max_steps_exploration
             and total_step_counter % number_steps_per_train_policy == 0
         ):
+            # debug-log logging.info("Logging28")
             for _ in range(G):
                 agent.train_policy(memory, batch_size)
+            # debug-log logging.info("Logging29")
 
         if (total_step_counter + 1) % number_steps_per_evaluation == 0:
             logging.info("*************--Evaluation Loop--*************")
@@ -177,6 +219,7 @@ def policy_based_train(
 
         if done or truncated:
             episode_time = time.time() - episode_start
+            # debug-log logging.info("Logging30")
             record.log_train(
                 total_steps=total_step_counter + 1,
                 episode=episode_num + 1,
@@ -185,6 +228,7 @@ def policy_based_train(
                 episode_time=episode_time,
                 display=True,
             )
+            # debug-log logging.info("Logging31")
 
             # Reset environment
             state = env.reset()
diff --git a/shell-scripts/fight.sh b/shell-scripts/fight.sh
index bc84cc3..cd44790 100755
--- a/shell-scripts/fight.sh
+++ b/shell-scripts/fight.sh
@@ -1 +1 @@
-python ./scripts/train.py run --gym pyboy --number_eval_episodes 1 --plot_frequency 10000000 --headless 0 --domain pokemon --task fight SACAE
+python ./scripts/train.py run --gym pyboy --number_eval_episodes 1 --plot_frequency 10000000 --headless 1 --domain pokemon --task fight SACAE

From 1277a7caf5f1f769a16a4adec1500bf53b68d588 Mon Sep 17 00:00:00 2001
From: PK Wadsworth <pkwads2@gmail.com>
Date: Mon, 16 Sep 2024 16:20:53 +1200
Subject: [PATCH 12/26] Updated shell scripts

---
 shell-scripts/catch-sacae.sh | 1 +
 shell-scripts/fight-sacae.sh | 1 +
 shell-scripts/fight-sacd.sh  | 1 +
 shell-scripts/fight.sh       | 1 -
 4 files changed, 3 insertions(+), 1 deletion(-)
 create mode 100755 shell-scripts/catch-sacae.sh
 create mode 100755 shell-scripts/fight-sacae.sh
 create mode 100755 shell-scripts/fight-sacd.sh
 delete mode 100755 shell-scripts/fight.sh

diff --git a/shell-scripts/catch-sacae.sh b/shell-scripts/catch-sacae.sh
new file mode 100755
index 0000000..8573265
--- /dev/null
+++ b/shell-scripts/catch-sacae.sh
@@ -0,0 +1 @@
+python ./scripts/train.py run --gym pyboy --number_eval_episodes 1 --plot_frequency 10000000 --headless 1 --discrete 0 --domain pokemon --task catch SACAE
diff --git a/shell-scripts/fight-sacae.sh b/shell-scripts/fight-sacae.sh
new file mode 100755
index 0000000..8164fed
--- /dev/null
+++ b/shell-scripts/fight-sacae.sh
@@ -0,0 +1 @@
+python ./scripts/train.py run --gym pyboy --number_eval_episodes 1 --plot_frequency 10000000 --headless 1 --discrete 0 --domain pokemon --task fight SACAE
diff --git a/shell-scripts/fight-sacd.sh b/shell-scripts/fight-sacd.sh
new file mode 100755
index 0000000..41ce632
--- /dev/null
+++ b/shell-scripts/fight-sacd.sh
@@ -0,0 +1 @@
+python ./scripts/train.py run --gym pyboy --number_eval_episodes 1 --plot_frequency 10000000 --headless 1 --discrete 1 --domain pokemon --task fight SACD
diff --git a/shell-scripts/fight.sh b/shell-scripts/fight.sh
deleted file mode 100755
index cd44790..0000000
--- a/shell-scripts/fight.sh
+++ /dev/null
@@ -1 +0,0 @@
-python ./scripts/train.py run --gym pyboy --number_eval_episodes 1 --plot_frequency 10000000 --headless 1 --domain pokemon --task fight SACAE

From 128d0b824b6ee8f0f5170bdeec572adfe95f1a9a Mon Sep 17 00:00:00 2001
From: PK Wadsworth <pkwads2@gmail.com>
Date: Wed, 2 Oct 2024 18:27:05 +1300
Subject: [PATCH 13/26] Made changes to gym which allow image overlay

---
 scripts/environments/gym_environment.py       |  4 +
 scripts/environments/image_wrapper.py         |  3 +
 .../environments/pyboy/pyboy_environment.py   |  3 +
 scripts/train_loops/policy_loop.py            | 86 ++++++++++++++++---
 scripts/util/configurations.py                |  2 +-
 shell-scripts/brock.sh                        |  1 +
 shell-scripts/catch-sacae.sh                  |  1 -
 shell-scripts/catch.sh                        |  1 +
 shell-scripts/fight-sacdae.sh                 |  1 +
 9 files changed, 88 insertions(+), 14 deletions(-)
 create mode 100755 shell-scripts/brock.sh
 delete mode 100755 shell-scripts/catch-sacae.sh
 create mode 100755 shell-scripts/catch.sh
 create mode 100755 shell-scripts/fight-sacdae.sh

diff --git a/scripts/environments/gym_environment.py b/scripts/environments/gym_environment.py
index ce8fbba..641fc93 100644
--- a/scripts/environments/gym_environment.py
+++ b/scripts/environments/gym_environment.py
@@ -24,6 +24,10 @@ def render(self):
     @abc.abstractmethod
     def min_action_value(self):
         raise NotImplementedError("Override this method")
+    
+    @abc.abstractmethod
+    def action_as_string(self, action):  # abstract hook: concrete environments must override this
+        raise NotImplementedError("Override this method")
 
     @cached_property
     @abc.abstractmethod
diff --git a/scripts/environments/image_wrapper.py b/scripts/environments/image_wrapper.py
index 51a9b8f..1dd2dd2 100644
--- a/scripts/environments/image_wrapper.py
+++ b/scripts/environments/image_wrapper.py
@@ -37,6 +37,9 @@ def min_action_value(self):
     @cached_property
     def max_action_value(self):
         return self.gym.max_action_value
+    
+    def action_as_string(self, action):
+        return self.gym.action_as_string(action)
 
     def sample_action(self):
         return self.gym.sample_action()
diff --git a/scripts/environments/pyboy/pyboy_environment.py b/scripts/environments/pyboy/pyboy_environment.py
index 698ba52..6f916ed 100644
--- a/scripts/environments/pyboy/pyboy_environment.py
+++ b/scripts/environments/pyboy/pyboy_environment.py
@@ -40,6 +40,9 @@ def action_num(self) -> int:
     def sample_action(self):
         return self.env.sample_action()
 
+    def action_as_string(self, action):
+        return self.env.action_as_string(action)
+
     def set_seed(self, seed: int) -> None:
         self.env.set_seed(seed)
 
diff --git a/scripts/train_loops/policy_loop.py b/scripts/train_loops/policy_loop.py
index 453d36f..6834301 100644
--- a/scripts/train_loops/policy_loop.py
+++ b/scripts/train_loops/policy_loop.py
@@ -1,12 +1,38 @@
 import copy
 import logging
 import time
+import os
 
 from cares_reinforcement_learning.util import helpers as hlp
 from cares_reinforcement_learning.util.configurations import (
     AlgorithmConfig,
     TrainingConfig,
 )
+import cv2
+import numpy as np
+
+def overlay_info(image, **kwargs):
+    """Return a copy of `image` with each keyword argument drawn on it as a `key: value` text line."""
+    output_image = image.copy()
+
+    # Anchor of the first text line, near the image's top-left corner
+    text_x, text_y = 10, 30
+
+    # Font, scale, colour and stroke thickness passed to cv2.putText below
+    font = cv2.FONT_HERSHEY_SIMPLEX
+    font_scale = 0.4  # multiplier on the font's base size
+    color = (0, 0, 255)  # red if the frame is BGR (blue if it is RGB) - TODO confirm channel order
+    thickness = 1       # line thickness of 1 (the thinnest stroke)
+
+    # Build one "key: value" line per keyword argument
+    overlay_text = "\n".join([f"{key}: {value}" for key, value in kwargs.items()])
+
+    # Draw the lines top to bottom, each 20 px below the previous one
+    for i, line in enumerate(overlay_text.split('\n')):
+        cv2.putText(output_image, line, (text_x, text_y + i * 20), 
+                    font, font_scale, color, thickness, cv2.LINE_AA)
+
+    return output_image
 
 
 def evaluate_policy_network(
@@ -89,6 +115,11 @@ def policy_based_train(
     display=False,
     normalisation=True,
 ):
+
+
+    highest_reward = float("-inf")
+    start_new_video = True
+
     # debug-log logging.info("Logging9")
     start_time = time.time()
 
@@ -134,6 +165,11 @@ def policy_based_train(
                 f"Running Exploration Steps {total_step_counter + 1}/{max_steps_exploration}"
             )
 
+            if start_new_video == True:
+                start_new_video = False
+                frame = env.grab_frame()
+                record.start_video("temp_train_video", frame)
+
             denormalised_action = env.sample_action()
 
             # debug-log logging.info("Logging16")
@@ -172,6 +208,14 @@ def policy_based_train(
             # debug-log logging.info("Logging128")
             env.render()
 
+        # debug-log logging.info("Logging42")
+        if record is not None:
+            # debug-log logging.info("Logging44")
+            frame = env.grab_frame()
+            frame_with_stats = overlay_info(frame, Reward=f"{episode_reward:.1f}")
+            record.log_video(frame_with_stats)
+            # debug-log logging.info("Logging45")
+
         # debug-log logging.info("Logging23")
         intrinsic_reward = 0
         if intrinsic_on and total_step_counter > max_steps_exploration:
@@ -209,18 +253,6 @@ def policy_based_train(
         if intrinsic_on:
             info["intrinsic_reward"] = intrinsic_reward
 
-        if (total_step_counter + 1) % number_steps_per_evaluation == 0:
-            logging.info("*************--Evaluation Loop--*************")
-            evaluate_policy_network(
-                env_eval,
-                agent,
-                train_config,
-                record=record,
-                total_steps=total_step_counter,
-                normalisation=normalisation,
-            )
-            logging.info("--------------------------------------------")
-
         if done or truncated:
             episode_time = time.time() - episode_start
             # debug-log logging.info("Logging30")
@@ -235,12 +267,42 @@ def policy_based_train(
             )
             # debug-log logging.info("Logging31")
 
+            record.stop_video()
+
+            if episode_reward > highest_reward:
+                highest_reward = episode_reward
+
+                vdir = os.path.join(record.directory, "videos")
+                highest_reward_video = os.path.join(vdir, "highest_reward.mp4")
+                training_video = os.path.join(vdir, "temp_train_video.mp4")
+
+                try:
+                    if os.path.exists(highest_reward_video):
+                        os.remove(highest_reward_video)
+                    
+                    os.rename(training_video, highest_reward_video)
+                except:
+                    logging.error("An error renaming the video occured :/")
+
             # Reset environment
+            start_new_video = True
             state = env.reset()
             episode_timesteps = 0
             episode_reward = 0
             episode_num += 1
             episode_start = time.time()
+        
+        if (total_step_counter + 1) % number_steps_per_evaluation == 0:
+            logging.info("*************--Evaluation Loop--*************")
+            evaluate_policy_network(
+                env_eval,
+                agent,
+                train_config,
+                record=record,
+                total_steps=total_step_counter,
+                normalisation=normalisation,
+            )
+            logging.info("--------------------------------------------")
 
     end_time = time.time()
     elapsed_time = end_time - start_time
diff --git a/scripts/util/configurations.py b/scripts/util/configurations.py
index 81f0966..131557f 100755
--- a/scripts/util/configurations.py
+++ b/scripts/util/configurations.py
@@ -33,7 +33,7 @@ class GymEnvironmentConfig(EnvironmentConfig):
     display: Optional[int] = 0
 
     # image observation configurations
-    frames_to_stack: Optional[int] = 3
+    frames_to_stack: Optional[int] = 9
     frame_width: Optional[int] = 84
     frame_height: Optional[int] = 84
     grey_scale: Optional[int] = 0
diff --git a/shell-scripts/brock.sh b/shell-scripts/brock.sh
new file mode 100755
index 0000000..329b766
--- /dev/null
+++ b/shell-scripts/brock.sh
@@ -0,0 +1 @@
+python ./scripts/train.py run --gym pyboy --number_eval_episodes 1 --plot_frequency 10000000 --headless 0 --discrete 0 --domain pokemon --task brock SACAE
diff --git a/shell-scripts/catch-sacae.sh b/shell-scripts/catch-sacae.sh
deleted file mode 100755
index 8573265..0000000
--- a/shell-scripts/catch-sacae.sh
+++ /dev/null
@@ -1 +0,0 @@
-python ./scripts/train.py run --gym pyboy --number_eval_episodes 1 --plot_frequency 10000000 --headless 1 --discrete 0 --domain pokemon --task catch SACAE
diff --git a/shell-scripts/catch.sh b/shell-scripts/catch.sh
new file mode 100755
index 0000000..6a20a92
--- /dev/null
+++ b/shell-scripts/catch.sh
@@ -0,0 +1 @@
+python ./scripts/train.py run --gym pyboy --number_eval_episodes 1 --plot_frequency 10000000 --headless 1 --discrete 1 --domain pokemon --task catch SACDAE
diff --git a/shell-scripts/fight-sacdae.sh b/shell-scripts/fight-sacdae.sh
new file mode 100755
index 0000000..8427404
--- /dev/null
+++ b/shell-scripts/fight-sacdae.sh
@@ -0,0 +1 @@
+python ./scripts/train.py run --gym pyboy --number_eval_episodes 1 --plot_frequency 10000000 --headless 1 --discrete 1 --domain pokemon --task fight NaSATD3

From 042b3e17782c95bdef6ebe7d8767e2eb383f91d4 Mon Sep 17 00:00:00 2001
From: PK Wadsworth <pkwads2@gmail.com>
Date: Thu, 3 Oct 2024 15:29:16 +1300
Subject: [PATCH 14/26] Added more data saving - especially on highest reward

---
 scripts/train_loops/policy_loop.py | 63 +++++++++++++++++++++++-------
 1 file changed, 48 insertions(+), 15 deletions(-)

diff --git a/scripts/train_loops/policy_loop.py b/scripts/train_loops/policy_loop.py
index 6834301..3bee261 100644
--- a/scripts/train_loops/policy_loop.py
+++ b/scripts/train_loops/policy_loop.py
@@ -2,6 +2,8 @@
 import logging
 import time
 import os
+import inspect
+import pandas as pd
 
 from cares_reinforcement_learning.util import helpers as hlp
 from cares_reinforcement_learning.util.configurations import (
@@ -115,10 +117,9 @@ def policy_based_train(
     display=False,
     normalisation=True,
 ):
+    
 
-
-    highest_reward = float("-inf")
-    start_new_video = True
+    start_new_run = True
 
     # debug-log logging.info("Logging9")
     start_time = time.time()
@@ -149,27 +150,35 @@ def policy_based_train(
 
     episode_timesteps = 0
     episode_reward = 0
+    highest_reward = float("-inf")
     episode_num = 0
 
     state = env.reset()
 
+    # Per-step data rows for the current episode (converted to a DataFrame when saved)
+    run_data_rows = []
+
     # debug-log logging.info("Logging13")
     episode_start = time.time()
     for total_step_counter in range(int(max_steps_training)):
         # debug-log logging.info("Logging14")
         episode_timesteps += 1
 
+        step_data = {}
+
+        if start_new_run == True:
+            start_new_run = False
+            frame = env.grab_frame()
+            record.start_video("temp_train_video", frame)
+            run_data_rows = []
+            
+        
         # debug-log logging.info("Logging15")
         if total_step_counter < max_steps_exploration:
             logging.info(
                 f"Running Exploration Steps {total_step_counter + 1}/{max_steps_exploration}"
             )
 
-            if start_new_video == True:
-                start_new_video = False
-                frame = env.grab_frame()
-                record.start_video("temp_train_video", frame)
-
             denormalised_action = env.sample_action()
 
             # debug-log logging.info("Logging16")
@@ -188,9 +197,15 @@ def policy_based_train(
 
             # debug-log logging.info("Logging19")
             # algorithm range [-1, 1]
-            normalised_action = agent.select_action_from_policy(
-                state, noise_scale=noise_scale
-            )
+
+            # Horrible hack so I don't have to change all the algorithms
+            select_action_from_policy = agent.select_action_from_policy
+
+            if "info" in inspect.signature(select_action_from_policy).parameters:
+                denormalised_action = select_action_from_policy(state, noise_scale=noise_scale, info=step_data)
+            else:
+                denormalised_action = select_action_from_policy(state, noise_scale=noise_scale)
+
             # debug-log logging.info("Logging20")
             # mapping to env range [e.g. -2 , 2 for pendulum] - note for DMCS this is redudenant but required for openai
             if normalisation:
@@ -240,6 +255,12 @@ def policy_based_train(
         episode_reward += reward_extrinsic  # Note we only track the extrinsic reward for the episode for proper comparison
         # debug-log logging.info("Logging27")
 
+        step_data["action"] = denormalised_action
+        step_data["reward"] = total_reward
+        step_data["episode_reward"] = episode_reward
+
+        run_data_rows.append(step_data)
+
         info = {}
         if (
             total_step_counter >= max_steps_exploration
@@ -270,22 +291,34 @@ def policy_based_train(
             record.stop_video()
 
             if episode_reward > highest_reward:
+
+
                 highest_reward = episode_reward
 
-                vdir = os.path.join(record.directory, "videos")
-                highest_reward_video = os.path.join(vdir, "highest_reward.mp4")
-                training_video = os.path.join(vdir, "temp_train_video.mp4")
+                video_dir = os.path.join(record.directory, "videos")
+                data_dir = os.path.join(record.directory, "data")
+
+                highest_reward_video = os.path.join(video_dir, "highest_reward.mp4")
+                training_video = os.path.join(video_dir, "temp_train_video.mp4")
+                run_csv = os.path.join(data_dir, "highest_reward.csv")
+
+                logging.info(f"New highest reward of {episode_reward}. Saving video and run data...")
+
+                pd.DataFrame(run_data_rows).to_csv(run_csv, index=False)
 
                 try:
                     if os.path.exists(highest_reward_video):
                         os.remove(highest_reward_video)
+                except:
+                    logging.error("An error deleting the highest reward video occured :/")
                     
+                try:
                     os.rename(training_video, highest_reward_video)
                 except:
                     logging.error("An error renaming the video occured :/")
 
             # Reset environment
-            start_new_video = True
+            start_new_run = True
             state = env.reset()
             episode_timesteps = 0
             episode_reward = 0

From d58e5372de646eb40f4667259432c3f2fc3c460d Mon Sep 17 00:00:00 2001
From: PK Wadsworth <pkwads2@gmail.com>
Date: Thu, 3 Oct 2024 16:08:25 +1300
Subject: [PATCH 15/26] Fix policy action stored as denormalised_action; save run CSV every episode

---
 scripts/train_loops/policy_loop.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/scripts/train_loops/policy_loop.py b/scripts/train_loops/policy_loop.py
index 3bee261..4c038a2 100644
--- a/scripts/train_loops/policy_loop.py
+++ b/scripts/train_loops/policy_loop.py
@@ -202,9 +202,9 @@ def policy_based_train(
             select_action_from_policy = agent.select_action_from_policy
 
             if "info" in inspect.signature(select_action_from_policy).parameters:
-                denormalised_action = select_action_from_policy(state, noise_scale=noise_scale, info=step_data)
+                normalised_action = select_action_from_policy(state, noise_scale=noise_scale, info=step_data)
             else:
-                denormalised_action = select_action_from_policy(state, noise_scale=noise_scale)
+                normalised_action = select_action_from_policy(state, noise_scale=noise_scale)
 
             # debug-log logging.info("Logging20")
             # mapping to env range [e.g. -2 , 2 for pendulum] - note for DMCS this is redudenant but required for openai
@@ -289,22 +289,23 @@ def policy_based_train(
             # debug-log logging.info("Logging31")
 
             record.stop_video()
+            video_dir = os.path.join(record.directory, "videos")
+            data_dir = os.path.join(record.directory, "data")
+
+            run_csv = os.path.join(data_dir, f"episode_{episode_num}.csv")
+            pd.DataFrame(run_data_rows).to_csv(run_csv, index=False)
 
             if episode_reward > highest_reward:
 
 
                 highest_reward = episode_reward
 
-                video_dir = os.path.join(record.directory, "videos")
-                data_dir = os.path.join(record.directory, "data")
 
                 highest_reward_video = os.path.join(video_dir, "highest_reward.mp4")
                 training_video = os.path.join(video_dir, "temp_train_video.mp4")
-                run_csv = os.path.join(data_dir, "highest_reward.csv")
 
                 logging.info(f"New highest reward of {episode_reward}. Saving video and run data...")
 
-                pd.DataFrame(run_data_rows).to_csv(run_csv, index=False)
 
                 try:
                     if os.path.exists(highest_reward_video):

From f8d13d574c86eff909a4f204cbc3c59e9d4e7b0c Mon Sep 17 00:00:00 2001
From: PK Wadsworth <pkwads2@gmail.com>
Date: Thu, 3 Oct 2024 16:08:31 +1300
Subject: [PATCH 16/26] renamed brock to flexi

---
 shell-scripts/brock.sh | 1 -
 shell-scripts/flexi.sh | 1 +
 2 files changed, 1 insertion(+), 1 deletion(-)
 delete mode 100755 shell-scripts/brock.sh
 create mode 100755 shell-scripts/flexi.sh

diff --git a/shell-scripts/brock.sh b/shell-scripts/brock.sh
deleted file mode 100755
index 329b766..0000000
--- a/shell-scripts/brock.sh
+++ /dev/null
@@ -1 +0,0 @@
-python ./scripts/train.py run --gym pyboy --number_eval_episodes 1 --plot_frequency 10000000 --headless 0 --discrete 0 --domain pokemon --task brock SACAE
diff --git a/shell-scripts/flexi.sh b/shell-scripts/flexi.sh
new file mode 100755
index 0000000..998b0e5
--- /dev/null
+++ b/shell-scripts/flexi.sh
@@ -0,0 +1 @@
+python ./scripts/train.py run --gym pyboy --number_eval_episodes 1 --plot_frequency 10000000 --headless 1 --discrete 0 --domain pokemon --task flexi SACAE

From d99c5ec6da94ddf5fe8f7401c6d63d84460d030b Mon Sep 17 00:00:00 2001
From: PK Wadsworth <pkwads2@gmail.com>
Date: Mon, 7 Oct 2024 11:00:09 +1300
Subject: [PATCH 17/26] Set frames to stack back to 3

---
 scripts/util/configurations.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/util/configurations.py b/scripts/util/configurations.py
index 131557f..81f0966 100755
--- a/scripts/util/configurations.py
+++ b/scripts/util/configurations.py
@@ -33,7 +33,7 @@ class GymEnvironmentConfig(EnvironmentConfig):
     display: Optional[int] = 0
 
     # image observation configurations
-    frames_to_stack: Optional[int] = 9
+    frames_to_stack: Optional[int] = 3
     frame_width: Optional[int] = 84
     frame_height: Optional[int] = 84
     grey_scale: Optional[int] = 0

From d20ad849871f82e7bc313394999f37492c3d6150 Mon Sep 17 00:00:00 2001
From: PK Wadsworth <pkwads2@gmail.com>
Date: Mon, 7 Oct 2024 11:50:02 +1300
Subject: [PATCH 18/26] Updated dockerfile and requirements

---
 Dockerfile       | 41 +++++++++++++++++++++++++++++++++++++++++
 requirements.txt |  2 +-
 2 files changed, 42 insertions(+), 1 deletion(-)
 create mode 100644 Dockerfile

diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..efc7487
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,41 @@
+
+RUN apt-get update && \
+    apt-get upgrade -y && \
+    apt-get install -y git
+
+# Keeps Python from generating .pyc files in the container
+ENV PYTHONDONTWRITEBYTECODE=1
+
+# Turns off buffering for easier container logging
+ENV PYTHONUNBUFFERED=1
+
+WORKDIR /workspace
+
+# Install Cares Reinforcement Learning
+RUN git clone https://github.com/UoA-CARES/cares_reinforcement_learning.git
+WORKDIR /workspace/cares_reinforcement_learning
+RUN pip3 install -r requirements.txt
+RUN pip3 install --editable .
+
+WORKDIR /workspace
+
+# Install Pyboy Environments
+RUN git clone https://github.com/UoA-CARES/pyboy_environment.git
+WORKDIR /workspace/pyboy_environment
+RUN pip3 install -r requirements.txt
+RUN pip3 install --editable .
+
+WORKDIR /workspace
+
+RUN git clone https://github.com/UoA-CARES/gymnasium_envrionments.git
+WORKDIR /workspace/gymnasium_envrionments
+RUN pip3 install -r requirements.txt
+
+# We don't have GUI capabilities
+RUN pip3 uninstall opencv-python
+RUN pip3 install opencv-python-headless
+
+# Incase someone doesn't mount volume at runtime
+VOLUME /root/cares_rl_logs
+
+WORKDIR /workspace/gymnasium_envrionments/scripts
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 4268b16..d695e56 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,6 +6,6 @@ numpy==1.26.4
 opencv-contrib-python==4.6.0.66
 pydantic==1.10.13
 torch==2.3.1
-pyboy==2.2.1
+pyboy==2.2.2
 plotly==5.22.0
 mediapy==1.1.9

From bf94e9fc8fb090d802b5eeb288d8f03555ceb702 Mon Sep 17 00:00:00 2001
From: PK Wadsworth <pkwads2@gmail.com>
Date: Mon, 7 Oct 2024 11:56:52 +1300
Subject: [PATCH 19/26] Updated dockerfile

---
 Dockerfile | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/Dockerfile b/Dockerfile
index efc7487..e44d71e 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -14,6 +14,7 @@ WORKDIR /workspace
 # Install Cares Reinforcement Learning
 RUN git clone https://github.com/UoA-CARES/cares_reinforcement_learning.git
 WORKDIR /workspace/cares_reinforcement_learning
+RUN git checkout action-info-loggin
 RUN pip3 install -r requirements.txt
 RUN pip3 install --editable .
 
@@ -22,6 +23,7 @@ WORKDIR /workspace
 # Install Pyboy Environments
 RUN git clone https://github.com/UoA-CARES/pyboy_environment.git
 WORKDIR /workspace/pyboy_environment
+RUN git checkout lvl-up-task
 RUN pip3 install -r requirements.txt
 RUN pip3 install --editable .
 
@@ -29,6 +31,7 @@ WORKDIR /workspace
 
 RUN git clone https://github.com/UoA-CARES/gymnasium_envrionments.git
 WORKDIR /workspace/gymnasium_envrionments
+RUN git checkout p4p-pokemon-docker
 RUN pip3 install -r requirements.txt
 
 # We don't have GUI capabilities

From 3a4fea3d994fc2697d20da9a4eca96fad3d24496 Mon Sep 17 00:00:00 2001
From: PK Wadsworth <pkwads2@gmail.com>
Date: Mon, 7 Oct 2024 12:02:53 +1300
Subject: [PATCH 20/26] fixed dockerfile

---
 Dockerfile | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/Dockerfile b/Dockerfile
index e44d71e..3590fd3 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,3 +1,6 @@
+# For more information, please refer to https://aka.ms/vscode-docker-python
+FROM pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime
+SHELL [ "/bin/bash", "-c" ]
 
 RUN apt-get update && \
     apt-get upgrade -y && \

From c4b00269d25cc817d0b33acb4b9a5d9a3afbbc80 Mon Sep 17 00:00:00 2001
From: PK Wadsworth <pkwads2@gmail.com>
Date: Mon, 7 Oct 2024 12:42:23 +1300
Subject: [PATCH 21/26] Fixed dockerfile

---
 Dockerfile | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 3590fd3..fcbba2e 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -17,7 +17,7 @@ WORKDIR /workspace
 # Install Cares Reinforcement Learning
 RUN git clone https://github.com/UoA-CARES/cares_reinforcement_learning.git
 WORKDIR /workspace/cares_reinforcement_learning
-RUN git checkout action-info-loggin
+RUN git checkout -t origin/action-info-logging
 RUN pip3 install -r requirements.txt
 RUN pip3 install --editable .
 
@@ -26,7 +26,7 @@ WORKDIR /workspace
 # Install Pyboy Environments
 RUN git clone https://github.com/UoA-CARES/pyboy_environment.git
 WORKDIR /workspace/pyboy_environment
-RUN git checkout lvl-up-task
+RUN git checkout -t origin/lvl-up-task
 RUN pip3 install -r requirements.txt
 RUN pip3 install --editable .
 
@@ -34,7 +34,7 @@ WORKDIR /workspace
 
 RUN git clone https://github.com/UoA-CARES/gymnasium_envrionments.git
 WORKDIR /workspace/gymnasium_envrionments
-RUN git checkout p4p-pokemon-docker
+RUN git checkout -t origin/p4p-pokemon-docker
 RUN pip3 install -r requirements.txt
 
 # We don't have GUI capabilities
@@ -44,4 +44,4 @@ RUN pip3 install opencv-python-headless
 # Incase someone doesn't mount volume at runtime
 VOLUME /root/cares_rl_logs
 
-WORKDIR /workspace/gymnasium_envrionments/scripts
\ No newline at end of file
+WORKDIR /workspace/gymnasium_envrionments
\ No newline at end of file

From 1c8676c9e5bf1df78e66921ab29c4c5f4dd35b87 Mon Sep 17 00:00:00 2001
From: PK Wadsworth <pkwads2@gmail.com>
Date: Mon, 7 Oct 2024 13:14:18 +1300
Subject: [PATCH 22/26] Fixed dockerfile

---
 Dockerfile | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/Dockerfile b/Dockerfile
index fcbba2e..f7ed58a 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -37,6 +37,9 @@ WORKDIR /workspace/gymnasium_envrionments
 RUN git checkout -t origin/p4p-pokemon-docker
 RUN pip3 install -r requirements.txt
 
+WORKDIR /root
+RUN git clone https://github.com/PKWadsy/cares_pokemon_configs.git cares_rl_configs
+
 # We don't have GUI capabilities
 RUN pip3 uninstall opencv-python
 RUN pip3 install opencv-python-headless

From 92d8cef59e07db00b2ccbafd7797f930003d89d8 Mon Sep 17 00:00:00 2001
From: PK Wadsworth <pkwads2@gmail.com>
Date: Thu, 10 Oct 2024 20:02:13 +1300
Subject: [PATCH 23/26] Removed eval and added better video saving

---
 scripts/train_loops/policy_loop.py | 23 ++---------------------
 1 file changed, 2 insertions(+), 21 deletions(-)

diff --git a/scripts/train_loops/policy_loop.py b/scripts/train_loops/policy_loop.py
index 4c038a2..963ab2c 100644
--- a/scripts/train_loops/policy_loop.py
+++ b/scripts/train_loops/policy_loop.py
@@ -301,20 +301,13 @@ def policy_based_train(
                 highest_reward = episode_reward
 
 
-                highest_reward_video = os.path.join(video_dir, "highest_reward.mp4")
+                new_record_video = os.path.join(video_dir, f"new_record_episode_{episode_num+1}.mp4")
                 training_video = os.path.join(video_dir, "temp_train_video.mp4")
 
                 logging.info(f"New highest reward of {episode_reward}. Saving video and run data...")
-
-
-                try:
-                    if os.path.exists(highest_reward_video):
-                        os.remove(highest_reward_video)
-                except:
-                    logging.error("An error deleting the highest reward video occured :/")
                     
                 try:
-                    os.rename(training_video, highest_reward_video)
+                    os.rename(training_video, new_record_video)
                 except:
                     logging.error("An error renaming the video occured :/")
 
@@ -325,18 +318,6 @@ def policy_based_train(
             episode_reward = 0
             episode_num += 1
             episode_start = time.time()
-        
-        if (total_step_counter + 1) % number_steps_per_evaluation == 0:
-            logging.info("*************--Evaluation Loop--*************")
-            evaluate_policy_network(
-                env_eval,
-                agent,
-                train_config,
-                record=record,
-                total_steps=total_step_counter,
-                normalisation=normalisation,
-            )
-            logging.info("--------------------------------------------")
 
     end_time = time.time()
     elapsed_time = end_time - start_time

From c6ae3c5435bb07a4bbe224f665763d0650f30256 Mon Sep 17 00:00:00 2001
From: beardyface <henryamwilliams@gmail.com>
Date: Fri, 11 Oct 2024 11:28:27 +1300
Subject: [PATCH 24/26] Docker file merge with main

---
 scripts/train.py | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/scripts/train.py b/scripts/train.py
index 3d8c439..7ef62cf 100755
--- a/scripts/train.py
+++ b/scripts/train.py
@@ -5,21 +5,18 @@
 """
 
 import logging
-import sys
 import os
+import sys
 
-import torch
 import train_loops.policy_loop as pbe
 import train_loops.ppo_loop as ppe
 import train_loops.value_loop as vbe
 import yaml
-
-from environments.environment_factory import EnvironmentFactory
-from util.configurations import GymEnvironmentConfig
-
 from cares_reinforcement_learning.memory.memory_factory import MemoryFactory
 from cares_reinforcement_learning.util import NetworkFactory, Record, RLParser
 from cares_reinforcement_learning.util import helpers as hlp
+from environments.environment_factory import EnvironmentFactory
+from util.configurations import GymEnvironmentConfig
 
 logging.basicConfig(level=logging.INFO)
 

From 25d5f32487904a8ffed0ea05b7e0278fb2a201d7 Mon Sep 17 00:00:00 2001
From: "Formatter [BOT]"
 <runner@fv-az1922-927.ggmvq42yn0ienksolwgsqeev3b.dx.internal.cloudapp.net>
Date: Thu, 6 Mar 2025 01:31:45 +0000
Subject: [PATCH 25/26] =?UTF-8?q?Auto-format=20code=20=F0=9F=A7=B9?=
 =?UTF-8?q?=F0=9F=8C=9F=F0=9F=A4=96?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 scripts/environments/gym_environment.py |  2 +-
 scripts/environments/image_wrapper.py   |  2 +-
 scripts/train_loops/policy_loop.py      | 55 ++++++++++++++++---------
 3 files changed, 38 insertions(+), 21 deletions(-)

diff --git a/scripts/environments/gym_environment.py b/scripts/environments/gym_environment.py
index 784e9d3..ed7204c 100644
--- a/scripts/environments/gym_environment.py
+++ b/scripts/environments/gym_environment.py
@@ -20,7 +20,7 @@ def render(self):
     @abc.abstractmethod
     def min_action_value(self):
         raise NotImplementedError("Override this method")
-    
+
     @abc.abstractmethod
     def action_as_string(self, action):
         raise NotImplemented("Override this method")
diff --git a/scripts/environments/image_wrapper.py b/scripts/environments/image_wrapper.py
index af7513f..b1f4b58 100644
--- a/scripts/environments/image_wrapper.py
+++ b/scripts/environments/image_wrapper.py
@@ -41,7 +41,7 @@ def min_action_value(self):
     @cached_property
     def max_action_value(self):
         return self.gym.max_action_value
-    
+
     def action_as_string(self, action):
         return self.gym.action_as_string(action)
 
diff --git a/scripts/train_loops/policy_loop.py b/scripts/train_loops/policy_loop.py
index 9fd5f51..6aace2f 100644
--- a/scripts/train_loops/policy_loop.py
+++ b/scripts/train_loops/policy_loop.py
@@ -12,6 +12,7 @@
 import cv2
 import numpy as np
 
+
 def overlay_info(image, **kwargs):
     # Create a copy of the image to overlay text
     output_image = image.copy()
@@ -23,15 +24,23 @@ def overlay_info(image, **kwargs):
     font = cv2.FONT_HERSHEY_SIMPLEX
     font_scale = 0.4  # Smaller font scale
     color = (0, 0, 255)  # Red color in BGR
-    thickness = 1       # Thicker text
+    thickness = 1  # Thicker text
 
     # Create overlay text from the kwargs dictionary
     overlay_text = "\n".join([f"{key}: {value}" for key, value in kwargs.items()])
 
     # Split the overlay text into lines and calculate position for each line
-    for i, line in enumerate(overlay_text.split('\n')):
-        cv2.putText(output_image, line, (text_x, text_y + i * 20), 
-                    font, font_scale, color, thickness, cv2.LINE_AA)
+    for i, line in enumerate(overlay_text.split("\n")):
+        cv2.putText(
+            output_image,
+            line,
+            (text_x, text_y + i * 20),
+            font,
+            font_scale,
+            color,
+            thickness,
+            cv2.LINE_AA,
+        )
 
     return output_image
 
@@ -57,10 +66,14 @@ def evaluate_policy_network(
         while not done and not truncated:
             episode_timesteps += 1
             normalised_action = agent.select_action_from_policy(state, evaluation=True)
-            
-            denormalised_action = hlp.denormalize(
-                normalised_action, env.max_action_value, env.min_action_value
-            ) if normalisation else normalised_action
+
+            denormalised_action = (
+                hlp.denormalize(
+                    normalised_action, env.max_action_value, env.min_action_value
+                )
+                if normalisation
+                else normalised_action
+            )
 
             state, reward, done, truncated = env.step(denormalised_action)
             episode_reward += reward
@@ -98,7 +111,6 @@ def policy_based_train(
     display=False,
     normalisation=True,
 ):
-    
 
     start_new_run = True
 
@@ -146,8 +158,7 @@ def policy_based_train(
             frame = env.grab_frame()
             record.start_video("temp_train_video", frame)
             run_data_rows = []
-            
-        
+
         if total_step_counter < max_steps_exploration:
             logging.info(
                 f"Running Exploration Steps {total_step_counter + 1}/{max_steps_exploration}"
@@ -172,9 +183,13 @@ def policy_based_train(
             select_action_from_policy = agent.select_action_from_policy
 
             if "info" in inspect.signature(select_action_from_policy).parameters:
-                normalised_action = select_action_from_policy(state, noise_scale=noise_scale, info=step_data)
+                normalised_action = select_action_from_policy(
+                    state, noise_scale=noise_scale, info=step_data
+                )
             else:
-                normalised_action = select_action_from_policy(state, noise_scale=noise_scale)
+                normalised_action = select_action_from_policy(
+                    state, noise_scale=noise_scale
+                )
 
             # mapping to env range [e.g. -2 , 2 for pendulum] - note for DMCS this is redudenant but required for openai
             if normalisation:
@@ -228,7 +243,7 @@ def policy_based_train(
 
         if intrinsic_on:
             info["intrinsic_reward"] = intrinsic_reward
-            
+
         if done or truncated:
             episode_time = time.time() - episode_start
             record.log_train(
@@ -250,15 +265,17 @@ def policy_based_train(
 
             if episode_reward > highest_reward:
 
-
                 highest_reward = episode_reward
 
-
-                new_record_video = os.path.join(video_dir, f"new_record_episode_{episode_num+1}.mp4")
+                new_record_video = os.path.join(
+                    video_dir, f"new_record_episode_{episode_num+1}.mp4"
+                )
                 training_video = os.path.join(video_dir, "temp_train_video.mp4")
 
-                logging.info(f"New highest reward of {episode_reward}. Saving video and run data...")
-                    
+                logging.info(
+                    f"New highest reward of {episode_reward}. Saving video and run data..."
+                )
+
                 try:
                     os.rename(training_video, new_record_video)
                 except:

From 83cd7c54fe04b37b198af305226cdb74940d3bcb Mon Sep 17 00:00:00 2001
From: Sam Boasman <samboasman@gmail.com>
Date: Fri, 7 Mar 2025 10:31:06 +1300
Subject: [PATCH 26/26] Update directory attribute access for recording

---
 scripts/train_loops/policy_loop.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/train_loops/policy_loop.py b/scripts/train_loops/policy_loop.py
index 6aace2f..d290360 100644
--- a/scripts/train_loops/policy_loop.py
+++ b/scripts/train_loops/policy_loop.py
@@ -257,8 +257,8 @@ def policy_based_train(
             )
 
             record.stop_video()
-            video_dir = os.path.join(record.directory, "videos")
-            data_dir = os.path.join(record.directory, "data")
+            video_dir = os.path.join(record.current_sub_directory, "videos")
+            data_dir = os.path.join(record.current_sub_directory, "data")
 
             run_csv = os.path.join(data_dir, f"episode_{episode_num}.csv")
             pd.DataFrame(run_data_rows).to_csv(run_csv, index=False)