
Commit 2266028

Merge pull request #5 from reiniscimurs/feature/bounded_policy_gradient
Add Max upper bound Q value loss
2 parents fdcaa8d + 430bdcb commit 2266028

13 files changed, +219 -18 lines changed

robot_nav/models/CNNTD3/CNNTD3.py

Lines changed: 31 additions & 1 deletion
```diff
@@ -7,6 +7,8 @@
 from numpy import inf
 from torch.utils.tensorboard import SummaryWriter
 
+from robot_nav.utils import get_max_bound
+
 
 class Actor(nn.Module):
     def __init__(self, action_dim):
@@ -128,6 +130,8 @@ def __init__(
         save_directory=Path("robot_nav/models/CNNTD3/checkpoint"),
         model_name="CNNTD3",
         load_directory=Path("robot_nav/models/CNNTD3/checkpoint"),
+        use_max_bound=False,
+        bound_weight=0.25,
     ):
         # Initialize the Actor network
         self.device = device
@@ -145,13 +149,15 @@ def __init__(
         self.action_dim = action_dim
         self.max_action = max_action
         self.state_dim = state_dim
-        self.writer = SummaryWriter()
+        self.writer = SummaryWriter(comment=model_name)
         self.iter_count = 0
         if load_model:
             self.load(filename=model_name, directory=load_directory)
         self.save_every = save_every
         self.model_name = model_name
         self.save_directory = save_directory
+        self.use_max_bound = use_max_bound
+        self.bound_weight = bound_weight
 
     def get_action(self, obs, add_noise):
         if add_noise:
@@ -177,6 +183,11 @@ def train(
         policy_noise=0.2,
         noise_clip=0.5,
         policy_freq=2,
+        max_lin_vel=0.5,
+        max_ang_vel=1,
+        goal_reward=100,
+        distance_norm=10,
+        time_step=0.3,
     ):
         av_Q = 0
         max_Q = -inf
@@ -224,6 +235,25 @@ def train(
             # Calculate the loss between the current Q value and the target Q value
             loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)
 
+            if self.use_max_bound:
+                max_bound = get_max_bound(
+                    next_state,
+                    discount,
+                    max_ang_vel,
+                    max_lin_vel,
+                    time_step,
+                    distance_norm,
+                    goal_reward,
+                    reward,
+                    done,
+                    self.device,
+                )
+                max_excess_Q1 = F.relu(current_Q1 - max_bound)
+                max_excess_Q2 = F.relu(current_Q2 - max_bound)
+                max_bound_loss = (max_excess_Q1**2).mean() + (max_excess_Q2**2).mean()
+                # Add loss for Q values exceeding maximum possible upper bound
+                loss += self.bound_weight * max_bound_loss
+
             # Perform the gradient descent
             self.critic_optimizer.zero_grad()
             loss.backward()
```
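
The penalty above relies on `get_max_bound` from `robot_nav.utils`, which this commit imports but whose body is not part of the diffs shown here. Below is a minimal sketch of what such a bound could compute, assuming a velocity-bounded per-step reward plus a terminal `goal_reward`, and assuming the normalized goal distance can be read from the state; the state index, reward model, and all helper logic are assumptions for illustration, not the repository's actual code.

```python
# Hypothetical sketch only -- robot_nav/utils.get_max_bound is added by this
# commit but not shown above. The reward model and the state index used for
# the goal distance are assumptions.
import torch


def get_max_bound(
    next_state, discount, max_ang_vel, max_lin_vel, time_step,
    distance_norm, goal_reward, reward, done, device,
):
    # Assumed layout: one state element holds the goal distance normalized by
    # distance_norm (index 0 is a placeholder, not the repo's actual layout).
    dist_to_goal = next_state[:, 0] * distance_norm

    # Best case: driving straight at max_lin_vel, so at least this many steps
    # remain before the goal can be reached (turning time via max_ang_vel is
    # ignored here for simplicity).
    steps = torch.ceil(dist_to_goal / (max_lin_vel * time_step)).clamp(min=1.0)

    # Per-step reward is assumed to be bounded by the maximum linear velocity.
    max_step_reward = max_lin_vel

    # Discounted best-case return from next_state: a geometric sum of the
    # largest intermediate rewards plus the discounted terminal goal_reward.
    best_return = (
        max_step_reward * (1.0 - discount**steps) / (1.0 - discount)
        + discount**steps * goal_reward
    )

    # Q(s, a) can never exceed r + gamma * best_return(s'); for terminal
    # transitions only the observed reward remains.
    max_bound = reward + (1.0 - done) * discount * best_return.unsqueeze(-1)
    return max_bound.to(device)
```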

robot_nav/models/DDPG/DDPG.py

Lines changed: 30 additions & 1 deletion
```diff
@@ -7,6 +7,8 @@
 from numpy import inf
 from torch.utils.tensorboard import SummaryWriter
 
+from robot_nav.utils import get_max_bound
+
 
 class Actor(nn.Module):
     def __init__(self, state_dim, action_dim):
@@ -65,6 +67,8 @@ def __init__(
         save_directory=Path("robot_nav/models/DDPG/checkpoint"),
         model_name="DDPG",
         load_directory=Path("robot_nav/models/DDPG/checkpoint"),
+        use_max_bound=False,
+        bound_weight=0.25,
     ):
         # Initialize the Actor network
         self.device = device
@@ -82,13 +86,15 @@ def __init__(
         self.action_dim = action_dim
         self.max_action = max_action
         self.state_dim = state_dim
-        self.writer = SummaryWriter()
+        self.writer = SummaryWriter(comment=model_name)
         self.iter_count = 0
         if load_model:
             self.load(filename=model_name, directory=load_directory)
         self.save_every = save_every
         self.model_name = model_name
         self.save_directory = save_directory
+        self.use_max_bound = use_max_bound
+        self.bound_weight = bound_weight
 
     def get_action(self, obs, add_noise):
         if add_noise:
@@ -114,6 +120,11 @@ def train(
         policy_noise=0.2,
         noise_clip=0.5,
         policy_freq=2,
+        max_lin_vel=0.5,
+        max_ang_vel=1,
+        goal_reward=100,
+        distance_norm=10,
+        time_step=0.3,
     ):
         av_Q = 0
         max_Q = -inf
@@ -159,6 +170,24 @@ def train(
             # Calculate the loss between the current Q value and the target Q value
             loss = F.mse_loss(current_Q, target_Q)
 
+            if self.use_max_bound:
+                max_bound = get_max_bound(
+                    next_state,
+                    discount,
+                    max_ang_vel,
+                    max_lin_vel,
+                    time_step,
+                    distance_norm,
+                    goal_reward,
+                    reward,
+                    done,
+                    self.device,
+                )
+                max_excess_Q = F.relu(current_Q - max_bound)
+                max_bound_loss = (max_excess_Q**2).mean()
+                # Add loss for Q values exceeding maximum possible upper bound
+                loss += self.bound_weight * max_bound_loss
+
             # Perform the gradient descent
             self.critic_optimizer.zero_grad()
             loss.backward()
```
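
A quick way to see what the added term does: only Q estimates that exceed the computed bound are penalized, quadratically in the amount of excess. A toy check with made-up numbers (not taken from the repository):

```python
# Toy illustration of the penalty added above: entries below the bound
# contribute nothing, entries above it contribute their squared excess.
import torch
import torch.nn.functional as F

current_Q = torch.tensor([[95.0], [120.0], [80.0]])    # critic estimates
max_bound = torch.tensor([[100.0], [100.0], [100.0]])  # per-sample upper bounds

max_excess_Q = F.relu(current_Q - max_bound)  # -> [[0.], [20.], [0.]]
max_bound_loss = (max_excess_Q**2).mean()     # -> 400 / 3 ≈ 133.33

print(max_bound_loss)  # tensor(133.3333)
```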

robot_nav/models/PPO/PPO.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -157,7 +157,7 @@ def __init__(
             self.load(filename=model_name, directory=load_directory)
 
         self.MseLoss = nn.MSELoss()
-        self.writer = SummaryWriter()
+        self.writer = SummaryWriter(comment=model_name)
 
     def set_action_std(self, new_action_std):
         self.action_std = new_action_std
```

robot_nav/models/RCPG/RCPG.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -189,7 +189,7 @@ def __init__(
         self.action_dim = action_dim
         self.max_action = max_action
         self.state_dim = state_dim
-        self.writer = SummaryWriter()
+        self.writer = SummaryWriter(comment=model_name)
         self.iter_count = 0
         self.model_name = model_name + rnn
         if load_model:
```

robot_nav/models/SAC/SAC.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -112,7 +112,7 @@ def __init__(
         self.actor.train(True)
         self.critic.train(True)
         self.step = 0
-        self.writer = SummaryWriter()
+        self.writer = SummaryWriter(comment=model_name)
 
     def save(self, filename, directory):
         Path(directory).mkdir(parents=True, exist_ok=True)
```

robot_nav/models/TD3/TD3.py

Lines changed: 31 additions & 1 deletion
```diff
@@ -7,6 +7,8 @@
 from numpy import inf
 from torch.utils.tensorboard import SummaryWriter
 
+from robot_nav.utils import get_max_bound
+
 
 class Actor(nn.Module):
     def __init__(self, state_dim, action_dim):
@@ -81,6 +83,8 @@ def __init__(
         save_directory=Path("robot_nav/models/TD3/checkpoint"),
         model_name="TD3",
         load_directory=Path("robot_nav/models/TD3/checkpoint"),
+        use_max_bound=False,
+        bound_weight=0.25,
     ):
         # Initialize the Actor network
         self.device = device
@@ -98,13 +102,15 @@ def __init__(
         self.action_dim = action_dim
         self.max_action = max_action
         self.state_dim = state_dim
-        self.writer = SummaryWriter()
+        self.writer = SummaryWriter(comment=model_name)
         self.iter_count = 0
         if load_model:
             self.load(filename=model_name, directory=load_directory)
         self.save_every = save_every
         self.model_name = model_name
         self.save_directory = save_directory
+        self.use_max_bound = use_max_bound
+        self.bound_weight = bound_weight
 
     def get_action(self, obs, add_noise):
         if add_noise:
@@ -130,6 +136,11 @@ def train(
         policy_noise=0.2,
         noise_clip=0.5,
         policy_freq=2,
+        max_lin_vel=0.5,
+        max_ang_vel=1,
+        goal_reward=100,
+        distance_norm=10,
+        time_step=0.3,
     ):
         av_Q = 0
         max_Q = -inf
@@ -177,6 +188,25 @@ def train(
             # Calculate the loss between the current Q value and the target Q value
             loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)
 
+            if self.use_max_bound:
+                max_bound = get_max_bound(
+                    next_state,
+                    discount,
+                    max_ang_vel,
+                    max_lin_vel,
+                    time_step,
+                    distance_norm,
+                    goal_reward,
+                    reward,
+                    done,
+                    self.device,
+                )
+                max_excess_Q1 = F.relu(current_Q1 - max_bound)
+                max_excess_Q2 = F.relu(current_Q2 - max_bound)
+                max_bound_loss = (max_excess_Q1**2).mean() + (max_excess_Q2**2).mean()
+                # Add loss for Q values exceeding maximum possible upper bound
+                loss += self.bound_weight * max_bound_loss
+
             # Perform the gradient descent
             self.critic_optimizer.zero_grad()
             loss.backward()
```
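
For reference, a hedged usage sketch of the new options on the TD3 model. The dimension values and import path below are placeholders inferred from the file layout and from test.py; only arguments that appear in this commit or in test.py are spelled out, and the remaining train() arguments are elided.

```python
# Usage sketch (assumed values). use_max_bound and bound_weight are the new
# constructor arguments; the kinematic/reward constants are passed to train().
from robot_nav.models.TD3.TD3 import TD3

state_dim, action_dim, max_action = 25, 2, 1  # placeholder dimensions
device = "cpu"

model = TD3(
    state_dim=state_dim,
    action_dim=action_dim,
    max_action=max_action,
    device=device,
    use_max_bound=True,  # enable the upper-bound penalty (off by default)
    bound_weight=0.25,   # weight of the bound-violation term in the critic loss
)

# During training, the constants used to derive the bound are passed through:
# model.train(..., max_lin_vel=0.5, max_ang_vel=1, goal_reward=100,
#             distance_norm=10, time_step=0.3)
```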

robot_nav/robot_world.yaml

Lines changed: 4 additions & 1 deletion
```diff
@@ -31,7 +31,7 @@ robot:
   show_trajectory: True
 
 obstacle:
-  - number: 5
+  - number: 4
     kinematics: {name: 'omni'}
     distribution: {name: 'random', range_low: [0, 0, -3.14], range_high: [10, 10, 3.14]}
     behavior: {name: 'rvo', wander: True, range_low: [0, 0, -3.14], range_high: [10, 10, 3.14], vxmax: 0.2, vymax: 0.2, factor: 1.0}
@@ -40,6 +40,9 @@ obstacle:
     shape:
       - {name: 'circle', radius: 1.0, random_shape: True}
      - {name: 'polygon', random_shape: true, avg_radius_range: [0.5, 1.0], irregularity_range: [0, 0.4], spikeyness_range: [0, 0.4], num_vertices_range: [4, 6]}
+  - shape: {name: 'circle', radius: 0.8} # length, width
+    state: [ 5, 2, 1 ]
+    kinematics: { name: 'static' }
   - shape: { name: 'rectangle', length: 1.0, width: 1.2 } # length, width
     state: [ 8, 5, 1 ]
     kinematics: {name: 'static'}
```

robot_nav/sim.py

Lines changed: 10 additions & 4 deletions
```diff
@@ -4,7 +4,6 @@
 
 import shapely
 from irsim.lib.handler.geometry_handler import GeometryFactory
-from irsim.world import ObjectBase
 
 
 class SIM_ENV:
@@ -35,7 +34,13 @@ def step(self, lin_velocity=0.0, ang_velocity=0.1):
 
         return latest_scan, distance, cos, sin, collision, goal, action, reward
 
-    def reset(self, robot_state=None, robot_goal=None, random_obstacles=True):
+    def reset(
+        self,
+        robot_state=None,
+        robot_goal=None,
+        random_obstacles=True,
+        random_obstacle_ids=None,
+    ):
         if robot_state is None:
             robot_state = [[random.uniform(1, 9)], [random.uniform(1, 9)], [0], [0]]
 
@@ -45,10 +50,12 @@ def reset(self, robot_state=None, robot_goal=None, random_obstacles=True):
         )
 
         if random_obstacles:
+            if random_obstacle_ids is None:
+                random_obstacle_ids = [i + 1 for i in range(7)]
             self.env.random_obstacle_position(
                 range_low=[0, 0, -3.14],
                 range_high=[10, 10, 3.14],
-                ids=[i + 1 for i in range(7)],
+                ids=random_obstacle_ids,
                 non_overlapping=True,
             )
 
@@ -82,7 +89,6 @@ def cossin(vec1, vec2):
         vec2 = vec2 / np.linalg.norm(vec2)
         cos = np.dot(vec1, vec2)
         sin = vec1[0] * vec2[1] - vec1[1] * vec2[0]
-
         return cos, sin
 
     @staticmethod
```
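
The new `random_obstacle_ids` argument makes the previously hard-coded id list configurable. A short usage sketch follows; the import path is inferred from the file layout, the constructor call mirrors test.py, and the id values are illustrative.

```python
# Usage sketch for the extended reset() signature.
from robot_nav.sim import SIM_ENV

sim = SIM_ENV(world_file="eval_world.yaml")

# Unchanged default: obstacles with ids 1..7 are re-randomized.
sim.reset()

# New: re-randomize only obstacles 1-3 and leave the others in place.
sim.reset(random_obstacles=True, random_obstacle_ids=[1, 2, 3])
```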

robot_nav/test.py

Lines changed: 2 additions & 1 deletion
```diff
@@ -22,12 +22,13 @@ def main(args=None):
     epoch = 0  # epoch number
     max_steps = 300  # maximum number of steps in single episode
 
-    model = CNNTD3(
+    model = TD3(
         state_dim=state_dim,
         action_dim=action_dim,
         max_action=max_action,
         device=device,
         load_model=True,
+        model_name="TD3",
     )  # instantiate a model
 
     sim = SIM_ENV(world_file="eval_world.yaml")  # instantiate environment
```

robot_nav/test_random.py

Lines changed: 3 additions & 3 deletions
```diff
@@ -25,17 +25,17 @@ def main(args=None):
     max_steps = 300  # maximum number of steps in single episode
     test_scenarios = 1000
 
-    model = DDPG(
+    model = SAC(
         state_dim=state_dim,
         action_dim=action_dim,
         max_action=max_action,
         device=device,
         load_model=True,
-        model_name="DDPGexp5",
+        model_name="SAC",
     )  # instantiate a model
 
     sim = SIM_ENV(
-        world_file="eval_world.yaml", disable_plotting=True
+        world_file="eval_world.yaml", disable_plotting=False
     )  # instantiate environment
 
     print("..............................................")
```
