Commit 9805a6e

feat: add draft MaxEntropyDeepIRL class (#5)
* add MaxEntropyDeepIRL
* first MaxEntropyDeepIRL draft
1 parent 2737c78 commit 9805a6e

File tree

12 files changed, +341 -7 lines changed

README.md

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 # IRLwPython

-<img src="logo/IRLwPython.jpg" width="200">
+<img src="logo/IRLwPython.png" width="200">

 Inverse Reinforcement Learning Algorithm implementation with python.

logo/IRLwPython.jpg

-24.5 KB (binary file)

logo/IRLwPython.png

187 KB (binary file)
src/irlwpython/MaxEntropyDeepIRL.py

Lines changed: 185 additions & 0 deletions

@@ -0,0 +1,185 @@
import gym
import numpy as np
import torch
import torch.optim as optim
import torch.nn as nn
import matplotlib.pyplot as plt


# Actor: a small MLP that outputs one Q-value per action for a given state.
class ActorNetwork(nn.Module):
    def __init__(self, num_inputs, num_output, hidden_size):
        super(ActorNetwork, self).__init__()
        self.fc1 = nn.Linear(num_inputs, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, num_output)

    def forward(self, x):
        x = nn.functional.relu(self.fc1(x))
        x = nn.functional.relu(self.fc2(x))
        return self.fc3(x)  # torch.nn.functional.softmax(self.fc3(x))


# Critic: a small MLP that maps a concatenated (state, action) vector to a scalar IRL reward.
class CriticNetwork(nn.Module):
    def __init__(self, num_inputs, hidden_size):
        super(CriticNetwork, self).__init__()
        self.fc1 = nn.Linear(num_inputs, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, 1)

        self.theta_layer = nn.Linear(hidden_size, 3)

    def forward(self, x):
        x_ = nn.functional.relu(self.fc1(x))
        x_ = nn.functional.relu(self.fc2(x_))
        theta_ = self.theta_layer(x_)
        return self.fc3(x_) + torch.matmul(theta_, x)


class MaxEntropyDeepIRL:
    def __init__(self, target, state_dim, action_dim, learning_rate=0.001, gamma=0.99, num_epochs=1000):
        self.target = target
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.learning_rate = learning_rate
        # self.theta = torch.rand(state_dim + 1, requires_grad=True)
        self.gamma = gamma
        self.num_epochs = num_epochs
        self.actor_network = ActorNetwork(state_dim, action_dim, 100)
        self.critic_network = CriticNetwork(state_dim + 1, 100)
        self.optimizer_actor = optim.Adam(self.actor_network.parameters(), lr=learning_rate)
        self.optimizer_critic = optim.Adam(self.critic_network.parameters(), lr=learning_rate)

    def get_reward(self, state, action):
        state_action = list(state) + list([action])
        state_action = torch.Tensor(state_action)
        return self.critic_network(state_action)

    def expert_feature_expectations(self, demonstrations):
        feature_expectations = torch.zeros(self.state_dim)

        for demonstration in demonstrations:
            for state, _, _ in demonstration:
                state_tensor = torch.tensor(state, dtype=torch.float32)
                feature_expectations += state_tensor.squeeze()

        feature_expectations /= demonstrations.shape[0]
        return feature_expectations

    def maxent_irl(self, expert, learner):
        # Update critic network
        self.optimizer_critic.zero_grad()

        # Loss function for critic network
        loss_critic = torch.nn.functional.mse_loss(learner, expert)
        loss_critic.backward()

        self.optimizer_critic.step()

    def update_q_network(self, state_array, action, reward, next_state):
        self.optimizer_actor.zero_grad()

        state_tensor = torch.tensor(state_array, dtype=torch.float32)
        next_state_tensor = torch.tensor(next_state, dtype=torch.float32)

        q_values = self.actor_network(state_tensor)
        # q_1 = self.actor_network(state_tensor)[action]
        # q_2 = reward + self.gamma * max(self.actor_network(next_state_tensor))
        next_q_values = reward + self.gamma * self.actor_network(next_state_tensor)

        loss_actor = nn.functional.mse_loss(q_values, next_q_values)
        loss_actor.backward()
        self.optimizer_actor.step()

    def get_demonstrations(self):
        # Discretize the recorded expert (position, velocity) observations onto a 20 x 20 grid.
        env_low = self.target.observation_space.low
        env_high = self.target.observation_space.high
        env_distance = (env_high - env_low) / 20  # self.one_feature

        raw_demo = np.load(file="expert_demo/expert_demo.npy")
        demonstrations = np.zeros((len(raw_demo), len(raw_demo[0]), 3))
        for x in range(len(raw_demo)):
            for y in range(len(raw_demo[0])):
                position_idx = int((raw_demo[x][y][0] - env_low[0]) / env_distance[0])
                velocity_idx = int((raw_demo[x][y][1] - env_low[1]) / env_distance[1])
                state_idx = position_idx + velocity_idx * 20  # self.one_feature

                demonstrations[x][y][0] = state_idx
                demonstrations[x][y][1] = raw_demo[x][y][2]

        return demonstrations

    def train(self):
        demonstrations = self.get_demonstrations()
        expert = self.expert_feature_expectations(demonstrations)

        learner_feature_expectations = torch.zeros(self.state_dim, requires_grad=True)  # Add requires_grad=True
        episodes, scores = [], []

        for episode in range(self.num_epochs):
            state, info = self.target.reset()
            score = 0

            if (episode != 0 and episode == 10) or (episode > 10 and episode % 5 == 0):
                learner = learner_feature_expectations / episode
                self.maxent_irl(expert, learner)

            while True:
                state_tensor = torch.tensor(state, dtype=torch.float32)

                q_state = self.actor_network(state_tensor)
                action = torch.argmax(q_state).item()
                next_state, reward, done, _, _ = self.target.step(action)

                irl_reward = self.get_reward(state, action)
                self.update_q_network(state, action, irl_reward, next_state)

                print("Q Actor Network", state, q_state)
                print("Reward", reward, "IRL Reward", irl_reward)

                learner_feature_expectations = learner_feature_expectations + state_tensor.squeeze()

                print(expert)
                print(learner_feature_expectations)

                score += reward
                state = next_state
                if done:
                    scores.append(score)
                    episodes.append(episode)
                    break

            if episode % 1 == 0:
                score_avg = np.mean(scores)
                print('{} episode score is {:.2f}'.format(episode, score_avg))
                plt.plot(episodes, scores, 'b')
                plt.savefig("./learning_curves/maxent_30000_network.png")

        torch.save(self.actor_network.state_dict(), "./results/maxent_30000_q_network.pth")

    def test(self):
        episodes, scores = [], []

        for episode in range(10):
            state, _ = self.target.reset()
            score = 0

            while True:
                self.target.render()
                state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0)

                action = torch.argmax(self.actor_network(state_tensor)).item()
                next_state, reward, done, _, _ = self.target.step(action)

                score += reward
                state = next_state

                if done:
                    scores.append(score)
                    episodes.append(episode)
                    plt.plot(episodes, scores, 'b')
                    plt.savefig("./learning_curves/maxent_test_30000_network.png")
                    break

            if episode % 1 == 0:
                print('{} episode score is {:.2f}'.format(episode, score))
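
For reference, the get_demonstrations method above maps a continuous MountainCar observation (position, velocity) onto a 20 x 20 grid and encodes it as a single index. A minimal sketch of that mapping, assuming the standard MountainCar-v0 bounds (position in [-1.2, 0.6], velocity in [-0.07, 0.07]); the example observation values are chosen here for illustration only:

import numpy as np

# Standard MountainCar-v0 observation bounds (assumption).
env_low = np.array([-1.2, -0.07])
env_high = np.array([0.6, 0.07])
env_distance = (env_high - env_low) / 20                         # bin width per dimension

position, velocity = -0.5, 0.03                                  # example observation
position_idx = int((position - env_low[0]) / env_distance[0])    # -> 7
velocity_idx = int((velocity - env_low[1]) / env_distance[1])    # -> 14
state_idx = position_idx + velocity_idx * 20                     # -> 287
print(state_idx)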

src/irlwpython/MaxEntropyIRL.py

Lines changed: 8 additions & 2 deletions
@@ -64,7 +64,7 @@ def maxent_irl(self, expert, learner, learning_rate):

         # Clip theta
         for j in range(len(self.theta)):
-            if self.theta[j] > 0:
+            if self.theta[j] > 0:  # log values
                 self.theta[j] = 0

     def update_q_table(self, state, action, reward, next_state):
@@ -101,9 +101,11 @@ def train(self, theta_learning_rate):
             state = self.target.env_reset()
             score = 0

-            # Mini-Batches ?
+            # Mini-Batches:
             if (episode != 0 and episode == 10000) or (episode > 10000 and episode % 5000 == 0):
+                # calculate density
                 learner = learner_feature_expectations / episode
+                # Maximum Entropy IRL step
                 self.maxent_irl(expert, learner, theta_learning_rate)

             # One Step in environment
@@ -115,12 +117,16 @@ def train(self, theta_learning_rate):
                 # Run one timestep of the environment's dynamics.
                 next_state, reward, done, _, _ = self.target.env_step(action)

+                # Get pseudo-reward and update Q table
                 irl_reward = self.get_reward(self.n_states, state_idx)
                 next_state_idx = self.target.idx_to_state(next_state)
                 self.update_q_table(state_idx, action, irl_reward, next_state_idx)

+                # State counting for density
                 learner_feature_expectations += self.get_feature_matrix()[int(state_idx)]

+                print(reward, irl_reward)
+
                 score += reward
                 state = next_state
                 if done:
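
The "Maximum Entropy IRL step" called above adjusts theta from the gap between the expert and learner feature expectations; only the theta-clipping part of maxent_irl appears in this diff. As a reference, a minimal sketch of the usual tabular update (the standalone function name maxent_irl_step is hypothetical, and the gradient form is the standard MaxEnt IRL one rather than a quote of the repository's method):

import numpy as np

# Hypothetical standalone version; the repository's method operates on self.theta.
def maxent_irl_step(theta, expert, learner, learning_rate):
    # Standard max-entropy IRL gradient: expert feature expectations
    # minus the learner's state-visitation frequencies.
    gradient = expert - learner
    theta += learning_rate * gradient
    # Keep the (log-space) reward weights non-positive,
    # as in the "Clip theta" loop shown above.
    np.clip(theta, None, 0.0, out=theta)
    return theta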

Binary files changed: 5.34 MB, -3.15 KB, -2.76 KB

src/irlwpython/main.py

Lines changed: 15 additions & 4 deletions
@@ -9,6 +9,8 @@

 #from irlwpython import __version__

+import gym
+
 __author__ = "HokageM"
 __copyright__ = "HokageM"
 __license__ = "MIT"
@@ -74,18 +76,27 @@ def main(args):

     gamma = 0.99
     q_learning_rate = 0.03
-    theta_learning_rate = 0.05

+    # Theta works as Critic
+    theta_learning_rate = 0.05
     theta = -(np.random.uniform(size=(n_states,)))

     if args.render:
         car = MountainCar(True, one_feature)
     else:
         car = MountainCar(False, one_feature)

-    #if args.deep:
-    #    deep = MaxEntropyDeepIRL()
-    #    deep.run()
+    if args.deep:
+
+        # Create MountainCar environment
+        env = gym.make('MountainCar-v0', render_mode="human")
+        state_dim = env.observation_space.shape[0]
+        action_dim = env.action_space.n
+
+        # Run MaxEnt Deep IRL using MountainCar environment
+        maxent_deep_irl_agent = MaxEntropyDeepIRL(env, state_dim, action_dim)
+        maxent_deep_irl_agent.train()
+        maxent_deep_irl_agent.test()

     if args.training:
         q_table = np.zeros((n_states, n_actions))
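
For completeness, the new args.deep branch can also be reproduced outside the CLI. A minimal sketch, assuming gym is installed, that the class is importable as irlwpython.MaxEntropyDeepIRL (module path not shown in this diff), and that expert_demo/expert_demo.npy is available in the working directory:

import gym
from irlwpython.MaxEntropyDeepIRL import MaxEntropyDeepIRL  # assumed module path

env = gym.make('MountainCar-v0', render_mode="human")
state_dim = env.observation_space.shape[0]  # 2: position, velocity
action_dim = env.action_space.n             # 3: push left, no push, push right

agent = MaxEntropyDeepIRL(env, state_dim, action_dim)
agent.train()  # reads expert_demo/expert_demo.npy relative to the working directory
agent.test()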

Binary file changed: 0 Bytes

0 commit comments
