Skip to content

Commit 23e7a90

Browse files
qi116 (Brian Qi) and a co-author authored
Score Following RL (#195)
* Updated RL requirements.txt. * Tried changing the environment; changes: increased the tracking window to 15 when training, added a -0.5 reward when standing still to encourage movement, and tried randomizing the start location for training. * Changed columns_per_beat to 16, basically manually aligning so that the agent can keep moving forward. * Added VecNormalize support; seems to work well. Changed columns_per_beat to 4. * Included changes for normalization; might need to change tracking_window. --------- Co-authored-by: Brian Qi <brianqi@Brians-MacBook-Pro.local>
1 parent 9499b40 commit 23e7a90

File tree

5 files changed

+64
-18
lines changed

5 files changed

+64
-18
lines changed

reinforcement_learning/gymnasium_env/envs/score_following_env.py

Lines changed: 30 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -55,11 +55,11 @@ def midi_to_piano_roll(midi_path: str, fps: int = 20) -> np.ndarray:
5555

5656

5757
class ScoreFollowingEnv(gym.Env):
58-
def __init__(self, midi_path: str, audio_path: str, bpm: int, alignment: np.ndarray):
58+
def __init__(self, midi_path: str, audio_path: str, bpm: int, alignment: np.ndarray, training=False):
5959
super(ScoreFollowingEnv, self).__init__()
6060

6161
self.alignment = alignment
62-
62+
self.training = training
6363
# Define audio processing parameters
6464
sr = 22050 # Sample rate in Hz
6565
n_fft = 2048 # FFT window size
@@ -90,15 +90,18 @@ def __init__(self, midi_path: str, audio_path: str, bpm: int, alignment: np.ndar
9090

9191
# Define window sizes (in quarter notes)
9292
self.score_window_beats = 10 # Number of beats for score context
93-
columns_per_beat = 1 # Number of columns per beat in the piano roll
93+
self.columns_per_beat = 4 # Number of columns per beat in the piano roll
94+
columns_per_beat = self.columns_per_beat
9495
score_fps = calculate_piano_roll_fps(columns_per_beat, bpm) # Calculate fps based on BPM
9596

9697
# Get the piano roll representation of the MIDI file
9798
# This is the "world" the agent will be navigating
9899
self.piano_roll = midi_to_piano_roll(midi_path, fps=score_fps)
99100
self.size = self.piano_roll.shape[1]
100101

101-
self.tracking_window = 5 # max distance from target to agent before termination
102+
self.tracking_window = 15 if self.training else 5
103+
self.tracking_window *= columns_per_beat # Extend leniency because we grow note sizes?
104+
# max distance from target to agent before termination
102105

103106
# Define dimensions for our fixed-size representations
104107
# Score window length is a fixed number of beats
@@ -202,7 +205,7 @@ def update_target_location(self):
202205
target_index = np.where(note_onsets > live_time)[0]
203206
if target_index.size > 0: # if there are note onsets after the current time
204207
target_index = target_index[0] # get the first one
205-
self._target_location = beats[target_index] # get the corresponding beat
208+
self._target_location = beats[target_index] * self.columns_per_beat # get the corresponding beat
206209
else:
207210
# If no note onsets are found, set target_location to the end of the audio
208211
self._target_location = beats[-1]
@@ -226,9 +229,20 @@ def _get_obs(self):
226229
}
227230

228231
def _get_info(self):
229-
return {"distance": abs(self._agent_location - self._target_location)}
232+
return {"distance": abs(self._agent_location - self._target_location), "target": self._target_location}
230233

231234
def reset(self, seed=None):
235+
super().reset(seed=seed)
236+
237+
# Trying to change starting position during training because otherwise agent never moved.
238+
# if self.training:
239+
# self._agent_location = int(self.np_random.integers(0, self.size))#0
240+
# self._target_location = self._agent_location
241+
# while self._target_location == self._agent_location:
242+
# self._target_location = int(self.np_random.integers(0, self.size))
243+
# self.num_steps = int(self._agent_location)
244+
245+
# else:
232246
self._agent_location = 0
233247
self._target_location = 0
234248
self.num_steps = 0
@@ -243,18 +257,25 @@ def step(self, action):
243257
self._agent_location -= 1
244258
elif action == 1:
245259
self._agent_location += 1
246-
260+
247261
# Clip the agent's location to be within the valid range
248262
self._agent_location = np.clip(self._agent_location, 0, self.size - 1)
249263

250-
offtrack = abs(self._agent_location - self._target_location) > self.tracking_window
264+
offtrack = abs(self._agent_location - self._target_location) > self.tracking_window #
251265
end_of_score = self._agent_location >= self.size
252266
end_of_spectrogram = self.num_steps >= self.spectrogram.shape[1]
253-
terminated = offtrack or end_of_score or end_of_spectrogram
267+
terminated = end_of_score or end_of_spectrogram or offtrack
254268

255269
truncated = False
256270
tracking_error = self._agent_location - self._target_location
271+
257272
reward = 1 - abs(tracking_error) / self.tracking_window # Compute reward based on tracking error
273+
# reward = np.exp(-0.5 * (tracking_error / self.tracking_window)**2) # Gaussian curve
274+
275+
if action == 2 and tracking_error > 0:
276+
# reward -= (abs(tracking_error) / self.tracking_window) * 0.5
277+
reward -= 0.5 #try to discourage staying still
278+
258279
self.num_steps += 1 # Increment the number of steps
259280
self.update_target_location()
260281
observation = self._get_obs()
340 KB
Binary file not shown.

reinforcement_learning/requirements.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
gymnasium==1.1.1
22
imageio==2.37.0
33
librosa==0.11.0
4-
matplotlib==3.10.1
5-
numpy==2.2.4
4+
matplotlib==3.9.4
5+
numpy>=1.23
66
pretty_midi==0.2.10
77
pygame==2.1.3
88
stable_baselines3==2.6.0
Lines changed: 25 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,30 +1,49 @@
11
from gymnasium_env.envs.score_following_env import ScoreFollowingEnv
22
import numpy as np
33
from stable_baselines3 import PPO
4+
from stable_baselines3.common.env_util import make_vec_env
5+
from stable_baselines3.common.vec_env import VecNormalize
46

57
alignment = [(i, 6 / 7 * i) for i in range(0, 64)]
68
alignment = np.array(alignment).T
79

8-
env = ScoreFollowingEnv(midi_path="ode_beg.mid", audio_path="ode_beg.mp3", bpm=70, alignment=alignment)
9-
model = PPO.load("ppo_score_following", env=env)
10+
env_kwargs={
11+
"midi_path": "ode_beg.mid",
12+
"audio_path": "ode_beg.mp3",
13+
"bpm": 70,
14+
"alignment": alignment,
15+
}
16+
vec_env = make_vec_env(lambda: ScoreFollowingEnv(**env_kwargs), n_envs=1)
17+
env = VecNormalize.load("ppo_score_following_env4", vec_env)
18+
# env = ScoreFollowingEnv(midi_path="ode_beg.mid", audio_path="ode_beg.mp3", bpm=70, alignment=alignment, training=False)
19+
20+
model = PPO.load("ppo_score_following4", env=env)
1021

1122
# Reset the environment
12-
obs, info = env.reset()
23+
obs = env.reset()
1324
terminated = False
1425

1526
# Get the initial agent location
1627
agent_location = obs["agent"][0]
1728

29+
total_reward = 0
1830
i = 0
1931
while not terminated:
2032
# Get the action from the model
2133
action, _ = model.predict(obs, deterministic=True)
2234

2335
# Take a step in the environment
24-
obs, reward, terminated, truncated, info = env.step(action)
36+
obs, reward, terminated, truncated = env.step(action)
2537
agent_location, score_window, spectrogram_window = obs["agent"][0], obs["score"], obs["spectrogram"]
2638
env.render(mode="human")
2739

2840
# Print the agent's location and reward
29-
print(f"Step {i}: Agent location: {agent_location}, Reward: {reward}, Info: {info}")
30-
i += 1
41+
current_agent_loc = obs["agent"][0][0] # Adjust indexing based on actual obs structure
42+
current_reward = reward[0]
43+
target_loc = env.get_attr('_target_location')[0]
44+
agent_loc= env.get_attr('_agent_location')[0]
45+
print(f"Step {i}: Action: {action[0]}, Agent loc: {agent_loc:.2f}, target loc: {target_loc:.2f}, Reward: {current_reward:.3f}")
46+
i += 1
47+
total_reward += reward
48+
49+
print(f'Total reward: {total_reward}')

reinforcement_learning/train_agent.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from stable_baselines3 import PPO
33
from gymnasium_env.envs.score_following_env import ScoreFollowingEnv
44
from stable_baselines3.common.env_util import make_vec_env
5+
from stable_baselines3.common.vec_env import VecNormalize
56
from tqdm import tqdm
67

78

@@ -17,14 +18,19 @@
1718
"audio_path": "ode_beg.mp3",
1819
"bpm": 70,
1920
"alignment": alignment,
21+
"training": True
2022
},
2123
)
2224

25+
vec_env = VecNormalize(vec_env, norm_obs=True, norm_reward=False, gamma=0.99)
26+
2327
# Create the PPO model using MultiInputPolicy to handle the Dict observation space.
2428
model = PPO("MultiInputPolicy", vec_env, verbose=1)
2529

2630
# Train the model for a specified number of timesteps.
2731
model.learn(total_timesteps=100_000, progress_bar=tqdm)
2832

2933
# Save the trained model.
30-
model.save("ppo_score_following")
34+
model.save("ppo_score_following4")
35+
vec_env.save("ppo_score_following_env4")
36+
vec_env.close()

0 commit comments

Comments (0)