
Commit 2266028

Merge pull request #5 from reiniscimurs/feature/bounded_policy_gradient
Add Max upper bound Q value loss
2 parents fdcaa8d + 430bdcb commit 2266028

13 files changed, +219 -18 lines changed

robot_nav/models/CNNTD3/CNNTD3.py

Lines changed: 31 additions & 1 deletion
```diff
@@ -7,6 +7,8 @@
 from numpy import inf
 from torch.utils.tensorboard import SummaryWriter
 
+from robot_nav.utils import get_max_bound
+
 
 class Actor(nn.Module):
     def __init__(self, action_dim):
@@ -128,6 +130,8 @@ def __init__(
         save_directory=Path("robot_nav/models/CNNTD3/checkpoint"),
         model_name="CNNTD3",
         load_directory=Path("robot_nav/models/CNNTD3/checkpoint"),
+        use_max_bound=False,
+        bound_weight=0.25,
     ):
         # Initialize the Actor network
         self.device = device
@@ -145,13 +149,15 @@ def __init__(
         self.action_dim = action_dim
         self.max_action = max_action
         self.state_dim = state_dim
-        self.writer = SummaryWriter()
+        self.writer = SummaryWriter(comment=model_name)
         self.iter_count = 0
         if load_model:
             self.load(filename=model_name, directory=load_directory)
         self.save_every = save_every
         self.model_name = model_name
         self.save_directory = save_directory
+        self.use_max_bound = use_max_bound
+        self.bound_weight = bound_weight
 
     def get_action(self, obs, add_noise):
         if add_noise:
@@ -177,6 +183,11 @@ def train(
         policy_noise=0.2,
         noise_clip=0.5,
         policy_freq=2,
+        max_lin_vel=0.5,
+        max_ang_vel=1,
+        goal_reward=100,
+        distance_norm=10,
+        time_step=0.3,
     ):
         av_Q = 0
         max_Q = -inf
@@ -224,6 +235,25 @@ def train(
             # Calculate the loss between the current Q value and the target Q value
             loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)
 
+            if self.use_max_bound:
+                max_bound = get_max_bound(
+                    next_state,
+                    discount,
+                    max_ang_vel,
+                    max_lin_vel,
+                    time_step,
+                    distance_norm,
+                    goal_reward,
+                    reward,
+                    done,
+                    self.device,
+                )
+                max_excess_Q1 = F.relu(current_Q1 - max_bound)
+                max_excess_Q2 = F.relu(current_Q2 - max_bound)
+                max_bound_loss = (max_excess_Q1**2).mean() + (max_excess_Q2**2).mean()
+                # Add loss for Q values exceeding maximum possible upper bound
+                loss += self.bound_weight * max_bound_loss
+
             # Perform the gradient descent
             self.critic_optimizer.zero_grad()
             loss.backward()
```
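
The penalty above relies on `get_max_bound` from `robot_nav.utils`, which this commit imports but whose body is not part of the diffs shown here. Below is a minimal sketch of what such a bound could compute, assuming a velocity-bounded per-step reward plus a terminal `goal_reward`, and assuming the normalized goal distance can be read from the state; the state index, reward model, and all helper logic are assumptions for illustration, not the repository's actual code.

```python
# Hypothetical sketch only -- robot_nav/utils.get_max_bound is added by this
# commit but not shown above. The reward model and the state index used for
# the goal distance are assumptions.
import torch


def get_max_bound(
    next_state, discount, max_ang_vel, max_lin_vel, time_step,
    distance_norm, goal_reward, reward, done, device,
):
    # Assumed layout: one state element holds the goal distance normalized by
    # distance_norm (index 0 is a placeholder, not the repo's actual layout).
    dist_to_goal = next_state[:, 0] * distance_norm

    # Best case: driving straight at max_lin_vel, so at least this many steps
    # remain before the goal can be reached (turning time via max_ang_vel is
    # ignored here for simplicity).
    steps = torch.ceil(dist_to_goal / (max_lin_vel * time_step)).clamp(min=1.0)

    # Per-step reward is assumed to be bounded by the maximum linear velocity.
    max_step_reward = max_lin_vel

    # Discounted best-case return from next_state: a geometric sum of the
    # largest intermediate rewards plus the discounted terminal goal_reward.
    best_return = (
        max_step_reward * (1.0 - discount**steps) / (1.0 - discount)
        + discount**steps * goal_reward
    )

    # Q(s, a) can never exceed r + gamma * best_return(s'); for terminal
    # transitions only the observed reward remains.
    max_bound = reward + (1.0 - done) * discount * best_return.unsqueeze(-1)
    return max_bound.to(device)
```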

robot_nav/models/DDPG/DDPG.py

Lines changed: 30 additions & 1 deletion
```diff
@@ -7,6 +7,8 @@
 from numpy import inf
 from torch.utils.tensorboard import SummaryWriter
 
+from robot_nav.utils import get_max_bound
+
 
 class Actor(nn.Module):
     def __init__(self, state_dim, action_dim):
@@ -65,6 +67,8 @@ def __init__(
         save_directory=Path("robot_nav/models/DDPG/checkpoint"),
         model_name="DDPG",
         load_directory=Path("robot_nav/models/DDPG/checkpoint"),
+        use_max_bound=False,
+        bound_weight=0.25,
     ):
         # Initialize the Actor network
         self.device = device
@@ -82,13 +86,15 @@ def __init__(
         self.action_dim = action_dim
         self.max_action = max_action
         self.state_dim = state_dim
-        self.writer = SummaryWriter()
+        self.writer = SummaryWriter(comment=model_name)
         self.iter_count = 0
         if load_model:
             self.load(filename=model_name, directory=load_directory)
         self.save_every = save_every
         self.model_name = model_name
         self.save_directory = save_directory
+        self.use_max_bound = use_max_bound
+        self.bound_weight = bound_weight
 
     def get_action(self, obs, add_noise):
         if add_noise:
@@ -114,6 +120,11 @@ def train(
         policy_noise=0.2,
         noise_clip=0.5,
         policy_freq=2,
+        max_lin_vel=0.5,
+        max_ang_vel=1,
+        goal_reward=100,
+        distance_norm=10,
+        time_step=0.3,
     ):
         av_Q = 0
         max_Q = -inf
@@ -159,6 +170,24 @@ def train(
             # Calculate the loss between the current Q value and the target Q value
             loss = F.mse_loss(current_Q, target_Q)
 
+            if self.use_max_bound:
+                max_bound = get_max_bound(
+                    next_state,
+                    discount,
+                    max_ang_vel,
+                    max_lin_vel,
+                    time_step,
+                    distance_norm,
+                    goal_reward,
+                    reward,
+                    done,
+                    self.device,
+                )
+                max_excess_Q = F.relu(current_Q - max_bound)
+                max_bound_loss = (max_excess_Q**2).mean()
+                # Add loss for Q values exceeding maximum possible upper bound
+                loss += self.bound_weight * max_bound_loss
+
             # Perform the gradient descent
             self.critic_optimizer.zero_grad()
             loss.backward()
```
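
A quick way to see what the added term does: only Q estimates that exceed the computed bound are penalized, quadratically in the amount of excess. A toy check with made-up numbers (not taken from the repository):

```python
# Toy illustration of the penalty added above: entries below the bound
# contribute nothing, entries above it contribute their squared excess.
import torch
import torch.nn.functional as F

current_Q = torch.tensor([[95.0], [120.0], [80.0]])    # critic estimates
max_bound = torch.tensor([[100.0], [100.0], [100.0]])  # per-sample upper bounds

max_excess_Q = F.relu(current_Q - max_bound)  # -> [[0.], [20.], [0.]]
max_bound_loss = (max_excess_Q**2).mean()     # -> 400 / 3 ≈ 133.33

print(max_bound_loss)  # tensor(133.3333)
```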

robot_nav/models/PPO/PPO.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -157,7 +157,7 @@ def __init__(
             self.load(filename=model_name, directory=load_directory)
 
         self.MseLoss = nn.MSELoss()
-        self.writer = SummaryWriter()
+        self.writer = SummaryWriter(comment=model_name)
 
     def set_action_std(self, new_action_std):
         self.action_std = new_action_std
```

robot_nav/models/RCPG/RCPG.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -189,7 +189,7 @@ def __init__(
         self.action_dim = action_dim
         self.max_action = max_action
         self.state_dim = state_dim
-        self.writer = SummaryWriter()
+        self.writer = SummaryWriter(comment=model_name)
         self.iter_count = 0
         self.model_name = model_name + rnn
         if load_model:
```

robot_nav/models/SAC/SAC.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -112,7 +112,7 @@ def __init__(
         self.actor.train(True)
         self.critic.train(True)
         self.step = 0
-        self.writer = SummaryWriter()
+        self.writer = SummaryWriter(comment=model_name)
 
     def save(self, filename, directory):
         Path(directory).mkdir(parents=True, exist_ok=True)
```

robot_nav/models/TD3/TD3.py

Lines changed: 31 additions & 1 deletion
```diff
@@ -7,6 +7,8 @@
 from numpy import inf
 from torch.utils.tensorboard import SummaryWriter
 
+from robot_nav.utils import get_max_bound
+
 
 class Actor(nn.Module):
     def __init__(self, state_dim, action_dim):
@@ -81,6 +83,8 @@ def __init__(
         save_directory=Path("robot_nav/models/TD3/checkpoint"),
         model_name="TD3",
         load_directory=Path("robot_nav/models/TD3/checkpoint"),
+        use_max_bound=False,
+        bound_weight=0.25,
     ):
         # Initialize the Actor network
         self.device = device
@@ -98,13 +102,15 @@ def __init__(
         self.action_dim = action_dim
         self.max_action = max_action
         self.state_dim = state_dim
-        self.writer = SummaryWriter()
+        self.writer = SummaryWriter(comment=model_name)
         self.iter_count = 0
         if load_model:
             self.load(filename=model_name, directory=load_directory)
         self.save_every = save_every
         self.model_name = model_name
         self.save_directory = save_directory
+        self.use_max_bound = use_max_bound
+        self.bound_weight = bound_weight
 
     def get_action(self, obs, add_noise):
         if add_noise:
@@ -130,6 +136,11 @@ def train(
         policy_noise=0.2,
         noise_clip=0.5,
         policy_freq=2,
+        max_lin_vel=0.5,
+        max_ang_vel=1,
+        goal_reward=100,
+        distance_norm=10,
+        time_step=0.3,
     ):
         av_Q = 0
         max_Q = -inf
@@ -177,6 +188,25 @@ def train(
             # Calculate the loss between the current Q value and the target Q value
             loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)
 
+            if self.use_max_bound:
+                max_bound = get_max_bound(
+                    next_state,
+                    discount,
+                    max_ang_vel,
+                    max_lin_vel,
+                    time_step,
+                    distance_norm,
+                    goal_reward,
+                    reward,
+                    done,
+                    self.device,
+                )
+                max_excess_Q1 = F.relu(current_Q1 - max_bound)
+                max_excess_Q2 = F.relu(current_Q2 - max_bound)
+                max_bound_loss = (max_excess_Q1**2).mean() + (max_excess_Q2**2).mean()
+                # Add loss for Q values exceeding maximum possible upper bound
+                loss += self.bound_weight * max_bound_loss
+
             # Perform the gradient descent
             self.critic_optimizer.zero_grad()
             loss.backward()
```
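
For reference, a hedged usage sketch of the new options on the TD3 model. The dimension values and import path below are placeholders inferred from the file layout and from test.py; only arguments that appear in this commit or in test.py are spelled out, and the remaining train() arguments are elided.

```python
# Usage sketch (assumed values). use_max_bound and bound_weight are the new
# constructor arguments; the kinematic/reward constants are passed to train().
from robot_nav.models.TD3.TD3 import TD3

state_dim, action_dim, max_action = 25, 2, 1  # placeholder dimensions
device = "cpu"

model = TD3(
    state_dim=state_dim,
    action_dim=action_dim,
    max_action=max_action,
    device=device,
    use_max_bound=True,  # enable the upper-bound penalty (off by default)
    bound_weight=0.25,   # weight of the bound-violation term in the critic loss
)

# During training, the constants used to derive the bound are passed through:
# model.train(..., max_lin_vel=0.5, max_ang_vel=1, goal_reward=100,
#             distance_norm=10, time_step=0.3)
```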

robot_nav/robot_world.yaml

Lines changed: 4 additions & 1 deletion
```diff
@@ -31,7 +31,7 @@ robot:
   show_trajectory: True
 
 obstacle:
-  - number: 5
+  - number: 4
     kinematics: {name: 'omni'}
     distribution: {name: 'random', range_low: [0, 0, -3.14], range_high: [10, 10, 3.14]}
     behavior: {name: 'rvo', wander: True, range_low: [0, 0, -3.14], range_high: [10, 10, 3.14], vxmax: 0.2, vymax: 0.2, factor: 1.0}
@@ -40,6 +40,9 @@ obstacle:
     shape:
       - {name: 'circle', radius: 1.0, random_shape: True}
      - {name: 'polygon', random_shape: true, avg_radius_range: [0.5, 1.0], irregularity_range: [0, 0.4], spikeyness_range: [0, 0.4], num_vertices_range: [4, 6]}
+  - shape: {name: 'circle', radius: 0.8} # length, width
+    state: [ 5, 2, 1 ]
+    kinematics: { name: 'static' }
   - shape: { name: 'rectangle', length: 1.0, width: 1.2 } # length, width
     state: [ 8, 5, 1 ]
     kinematics: {name: 'static'}
```

robot_nav/sim.py

Lines changed: 10 additions & 4 deletions
```diff
@@ -4,7 +4,6 @@
 
 import shapely
 from irsim.lib.handler.geometry_handler import GeometryFactory
-from irsim.world import ObjectBase
 
 
 class SIM_ENV:
@@ -35,7 +34,13 @@ def step(self, lin_velocity=0.0, ang_velocity=0.1):
 
         return latest_scan, distance, cos, sin, collision, goal, action, reward
 
-    def reset(self, robot_state=None, robot_goal=None, random_obstacles=True):
+    def reset(
+        self,
+        robot_state=None,
+        robot_goal=None,
+        random_obstacles=True,
+        random_obstacle_ids=None,
+    ):
         if robot_state is None:
             robot_state = [[random.uniform(1, 9)], [random.uniform(1, 9)], [0], [0]]
 
@@ -45,10 +50,12 @@ def reset(self, robot_state=None, robot_goal=None, random_obstacles=True):
         )
 
         if random_obstacles:
+            if random_obstacle_ids is None:
+                random_obstacle_ids = [i + 1 for i in range(7)]
             self.env.random_obstacle_position(
                 range_low=[0, 0, -3.14],
                 range_high=[10, 10, 3.14],
-                ids=[i + 1 for i in range(7)],
+                ids=random_obstacle_ids,
                 non_overlapping=True,
             )
 
@@ -82,7 +89,6 @@ def cossin(vec1, vec2):
         vec2 = vec2 / np.linalg.norm(vec2)
         cos = np.dot(vec1, vec2)
         sin = vec1[0] * vec2[1] - vec1[1] * vec2[0]
-
         return cos, sin
 
     @staticmethod
```
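
The new `random_obstacle_ids` argument makes the previously hard-coded id list configurable. A short usage sketch follows; the import path is inferred from the file layout, the constructor call mirrors test.py, and the id values are illustrative.

```python
# Usage sketch for the extended reset() signature.
from robot_nav.sim import SIM_ENV

sim = SIM_ENV(world_file="eval_world.yaml")

# Unchanged default: obstacles with ids 1..7 are re-randomized.
sim.reset()

# New: re-randomize only obstacles 1-3 and leave the others in place.
sim.reset(random_obstacles=True, random_obstacle_ids=[1, 2, 3])
```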

robot_nav/test.py

Lines changed: 2 additions & 1 deletion
```diff
@@ -22,12 +22,13 @@ def main(args=None):
     epoch = 0  # epoch number
     max_steps = 300  # maximum number of steps in single episode
 
-    model = CNNTD3(
+    model = TD3(
         state_dim=state_dim,
         action_dim=action_dim,
         max_action=max_action,
         device=device,
         load_model=True,
+        model_name="TD3",
     )  # instantiate a model
 
     sim = SIM_ENV(world_file="eval_world.yaml")  # instantiate environment
```

robot_nav/test_random.py

Lines changed: 3 additions & 3 deletions
```diff
@@ -25,17 +25,17 @@ def main(args=None):
     max_steps = 300  # maximum number of steps in single episode
     test_scenarios = 1000
 
-    model = DDPG(
+    model = SAC(
         state_dim=state_dim,
         action_dim=action_dim,
         max_action=max_action,
         device=device,
         load_model=True,
-        model_name="DDPGexp5",
+        model_name="SAC",
     )  # instantiate a model
 
     sim = SIM_ENV(
-        world_file="eval_world.yaml", disable_plotting=True
+        world_file="eval_world.yaml", disable_plotting=False
     )  # instantiate environment
 
     print("..............................................")
```
