
Commit 2737c78

refactor: MaxEntropyIRL and MountainCar (#4)
* refactor MaxEntropyIRL and MountainCar
* add logo
* adjust header
1 parent dfb143f commit 2737c78

File tree

5 files changed (+154, -127 lines)


README.md

Lines changed: 15 additions & 2 deletions
@@ -1,9 +1,22 @@
 # IRLwPython
+
+<img src="logo/IRLwPython.jpg" width="200">
+
 Inverse Reinforcement Learning Algorithm implementation with python.
 
-The implementation is based on: https://github.com/reinforcement-learning-kr/lets-do-irl
+Implemented Algorithms:
+- Maximum Entropy IRL
+- Maximum Entropy Deep IRL
+
+Experiment:
+- Mountaincar: [gym](https://www.gymlibrary.dev/environments/classic_control/mountain_car/)
+
+The implementation of MaxEntropyIRL and MountainCar is based on the implementation of:
+[lets-do-irl](https://github.com/reinforcement-learning-kr/lets-do-irl/tree/master/mountaincar/maxent)
+
+# References
 
-Mountaincar experiment from: https://www.gymlibrary.dev/environments/classic_control/mountain_car/
+...
 
 # Installation

logo/IRLwPython.jpg

24.5 KB

src/irlwpython/MaxEntropyIRL.py

Lines changed: 113 additions & 1 deletion
@@ -1,10 +1,23 @@
+#
+# This file is heavily inspired by the IRL implementation of:
+# https://github.com/reinforcement-learning-kr/lets-do-irl/tree/master/mountaincar/maxent
+# It is a class-based implementation restructured for our use case.
+#
+
 import numpy as np
+import matplotlib.pyplot as plt
 
 
 class MaxEntropyIRL:
-    def __init__(self, feature_matrix, theta):
+    def __init__(self, target, feature_matrix, one_feature, q_table, q_learning_rate, gamma, n_states, theta):
+        self.target = target
         self.feature_matrix = feature_matrix
+        self.one_feature = one_feature
+        self.q_table = q_table
+        self.q_learning_rate = q_learning_rate
         self.theta = theta
+        self.gamma = gamma
+        self.n_states = n_states
 
     def get_feature_matrix(self):
         """
@@ -53,3 +66,102 @@ def maxent_irl(self, expert, learner, learning_rate):
         for j in range(len(self.theta)):
             if self.theta[j] > 0:
                 self.theta[j] = 0
+
+    def update_q_table(self, state, action, reward, next_state):
+        """
+        Updates the Q table for a specified state and action.
+        :param state:
+        :param action:
+        :param reward:
+        :param next_state:
+        :return:
+        """
+        q_1 = self.q_table[state][action]
+        q_2 = reward + self.gamma * max(self.q_table[next_state])
+        self.q_table[state][action] += self.q_learning_rate * (q_2 - q_1)
+
+    def train(self, theta_learning_rate):
+        """
+        Trains a model.
+        :param theta_learning_rate:
+        :return:
+        """
+        demonstrations = self.target.get_demonstrations()
+
+        # Get expert feature expectations
+        expert = self.expert_feature_expectations(demonstrations)
+
+        # Learning
+        learner_feature_expectations = np.zeros(self.n_states)
+        episodes, scores = [], []
+        # For every episode
+        for episode in range(30000):
+            # Resets the environment to an initial state and returns the initial observation.
+            # Start position is in random range of [-0.6, -0.4]
+            state = self.target.env_reset()
+            score = 0
+
+            # Mini-Batches ?
+            if (episode != 0 and episode == 10000) or (episode > 10000 and episode % 5000 == 0):
+                learner = learner_feature_expectations / episode
+                self.maxent_irl(expert, learner, theta_learning_rate)
+
+            # One Step in environment
+            state = state[0]
+            while True:
+                state_idx = self.target.idx_to_state(state)
+                action = np.argmax(self.q_table[state_idx])
+
+                # Run one timestep of the environment's dynamics.
+                next_state, reward, done, _, _ = self.target.env_step(action)
+
+                irl_reward = self.get_reward(self.n_states, state_idx)
+                next_state_idx = self.target.idx_to_state(next_state)
+                self.update_q_table(state_idx, action, irl_reward, next_state_idx)
+
+                learner_feature_expectations += self.get_feature_matrix()[int(state_idx)]
+
+                score += reward
+                state = next_state
+                if done:
+                    scores.append(score)
+                    episodes.append(episode)
+                    break
+
+            if episode % 1000 == 0:
+                score_avg = np.mean(scores)
+                print('{} episode score is {:.2f}'.format(episode, score_avg))
+                plt.plot(episodes, scores, 'b')
+                plt.savefig("./learning_curves/maxent_30000.png")
+                np.save("./results/maxent_30000_table", arr=self.q_table)
+
+    def test(self):
+        """
+        Tests the previously trained model.
+        :return:
+        """
+        episodes, scores = [], []
+
+        for episode in range(10):
+            state = self.target.env_reset()
+            score = 0
+
+            state = state[0]
+            while True:
+                self.target.env_render()
+                state_idx = self.target.idx_to_state(state)
+                action = np.argmax(self.q_table[state_idx])
+                next_state, reward, done, _, _ = self.target.env_step(action)
+
+                score += reward
+                state = next_state
+
+                if done:
+                    scores.append(score)
+                    episodes.append(episode)
+                    plt.plot(episodes, scores, 'b')
+                    plt.savefig("./learning_curves/maxent_test_30000.png")
+                    break
+
+            if episode % 1 == 0:
+                print('{} episode score is {:.2f}'.format(episode, score))
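The Q-table update that moves into MaxEntropyIRL above is the standard tabular Q-learning rule: Q(s, a) += alpha * (r + gamma * max_a' Q(s', a') - Q(s, a)). Below is a minimal standalone sketch of that step; the gamma and learning-rate values are illustrative placeholders (this commit does not show them), while the (400, 3) table shape matches the Q-table used in main.py.

```python
import numpy as np

# Illustrative sizes and hyperparameters (placeholders, not from this commit):
# 400 discretized states, 3 MountainCar actions.
n_states, n_actions = 400, 3
gamma, q_learning_rate = 0.99, 0.03

q_table = np.zeros((n_states, n_actions))

def update_q_table(q_table, state, action, reward, next_state):
    """One tabular Q-learning step, mirroring MaxEntropyIRL.update_q_table."""
    q_1 = q_table[state][action]
    q_2 = reward + gamma * max(q_table[next_state])
    q_table[state][action] += q_learning_rate * (q_2 - q_1)

# Example: the agent took action 2 in state 7, received the IRL reward -1.0,
# and landed in state 8.
update_q_table(q_table, state=7, action=2, reward=-1.0, next_state=8)
print(q_table[7][2])  # -0.03
```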

src/irlwpython/MountainCar.py

Lines changed: 14 additions & 115 deletions
@@ -1,32 +1,23 @@
+#
+# This file is heavily inspired by the IRL implementation of:
+# https://github.com/reinforcement-learning-kr/lets-do-irl/tree/master/mountaincar/maxent
+# It is a class-based implementation restructured for our use case.
+#
+
 import gym
 import numpy as np
-import matplotlib.pyplot as plt
 
 
 class MountainCar:
 
-    def __init__(self, animation, feature_matrix, one_feature, q_learning_rate, gamma, n_states, trainer):
+    def __init__(self, animation, one_feature):
         if animation:
             self.env = gym.make('MountainCar-v0', render_mode="human")
         else:
             self.env = gym.make('MountainCar-v0')
-        self.feature_matrix = feature_matrix
         self.one_feature = one_feature
-        self.q_table = None
-        self.q_learning_rate = q_learning_rate
-        self.gamma = gamma
-        self.n_states = n_states
-        self.trainer = trainer
-
-    def set_q_table(self, table):
-        """
-        Sets the
-        :param table:
-        :return:
-        """
-        self.q_table = table
 
-    def get_demonstrations(self, one_feature):
+    def get_demonstrations(self):
         """
         Parses the demonstrations and returns the demonstrations.
         :param one_feature:
@@ -42,7 +33,7 @@ def get_demonstrations(self, one_feature):
             for y in range(len(raw_demo[0])):
                 position_idx = int((raw_demo[x][y][0] - env_low[0]) / env_distance[0])
                 velocity_idx = int((raw_demo[x][y][1] - env_low[1]) / env_distance[1])
-                state_idx = position_idx + velocity_idx * one_feature
+                state_idx = position_idx + velocity_idx * self.one_feature
 
                 demonstrations[x][y][0] = state_idx
                 demonstrations[x][y][1] = raw_demo[x][y][2]
@@ -64,18 +55,11 @@ def idx_to_state(self, state):
         state_idx = position_idx + velocity_idx * self.one_feature
         return state_idx
 
-    def update_q_table(self, state, action, reward, next_state):
-        """
-        Updates the Q table for a specified state and action.
-        :param state:
-        :param action:
-        :param reward:
-        :param next_state:
-        :return:
-        """
-        q_1 = self.q_table[state][action]
-        q_2 = reward + self.gamma * max(self.q_table[next_state])
-        self.q_table[state][action] += self.q_learning_rate * (q_2 - q_1)
+    def env_action_space(self):
+        return self.env.action_space
+
+    def env_observation_space(self):
+        return self.env.observation_space
 
     def env_render(self):
         """
@@ -99,88 +83,3 @@ def env_step(self, action):
         :return:
         """
         return self.env.step(action)
-
-    def train(self, theta_learning_rate):
-        """
-        Trains a model.
-        :param theta_learning_rate:
-        :return:
-        """
-        demonstrations = self.get_demonstrations(self.one_feature)
-
-        # Get expert feature expectations
-        expert = self.trainer.expert_feature_expectations(demonstrations)
-
-        # Learning
-        learner_feature_expectations = np.zeros(self.n_states)
-        episodes, scores = [], []
-        # For every episode
-        for episode in range(30000):
-            # Resets the environment to an initial state and returns the initial observation.
-            # Start position is in random range of [-0.6, -0.4]
-            state = self.env_reset()
-            score = 0
-
-            # Mini-Batches ?
-            if (episode != 0 and episode == 10000) or (episode > 10000 and episode % 5000 == 0):
-                learner = learner_feature_expectations / episode
-                self.trainer.maxent_irl(expert, learner, theta_learning_rate)
-
-            # One Step in environment
-            state = state[0]
-            while True:
-                state_idx = self.idx_to_state(state)
-                action = np.argmax(self.q_table[state_idx])
-                # Run one timestep of the environment's dynamics.
-                next_state, reward, done, _, _ = self.env_step(action)
-
-                irl_reward = self.trainer.get_reward(self.n_states, state_idx)
-                next_state_idx = self.idx_to_state(next_state)
-                self.update_q_table(state_idx, action, irl_reward, next_state_idx)
-
-                learner_feature_expectations += self.trainer.get_feature_matrix()[int(state_idx)]
-
-                score += reward
-                state = next_state
-                if done:
-                    scores.append(score)
-                    episodes.append(episode)
-                    break
-
-            if episode % 1000 == 0:
-                score_avg = np.mean(scores)
-                print('{} episode score is {:.2f}'.format(episode, score_avg))
-                plt.plot(episodes, scores, 'b')
-                plt.savefig("./learning_curves/maxent_30000.png")
-                np.save("./results/maxent_30000_table", arr=self.q_table)
-
-    def test(self):
-        """
-        Tests the previous trained model
-        :return:
-        """
-        episodes, scores = [], []
-
-        for episode in range(10):
-            state = self.env_reset()
-            score = 0
-
-            state = state[0]
-            while True:
-                self.env_render()
-                state_idx = self.idx_to_state(state)
-                action = np.argmax(self.q_table[state_idx])
-                next_state, reward, done, _, _ = self.env_step(action)
-
-                score += reward
-                state = next_state
-
-                if done:
-                    scores.append(score)
-                    episodes.append(episode)
-                    plt.plot(episodes, scores, 'b')
-                    plt.savefig("./learning_curves/maxent_test_30000.png")
-                    break
-
-            if episode % 1 == 0:
-                print('{} episode score is {:.2f}'.format(episode, score))
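After this change, MountainCar is a thin wrapper around the gym environment plus the state discretization in idx_to_state, which folds a position bin and a velocity bin into a single table index via state_idx = position_idx + velocity_idx * self.one_feature. A rough standalone sketch of that mapping follows; the bin count of 20 and the use of the raw MountainCar-v0 observation bounds are assumptions for illustration, since the commit only shows the final formula.

```python
import numpy as np

one_feature = 20  # assumed bins per dimension (20 * 20 = 400 discretized states)

# MountainCar-v0 observation bounds: position in [-1.2, 0.6], velocity in [-0.07, 0.07].
env_low = np.array([-1.2, -0.07])
env_high = np.array([0.6, 0.07])
env_distance = (env_high - env_low) / one_feature  # width of one bin per dimension

def idx_to_state(state):
    """Map a continuous (position, velocity) observation to a single table index."""
    position_idx = int((state[0] - env_low[0]) / env_distance[0])
    velocity_idx = int((state[1] - env_low[1]) / env_distance[1])
    return position_idx + velocity_idx * one_feature

print(idx_to_state(np.array([-0.5, 0.01])))  # 227: position bin 7, velocity bin 11
```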

src/irlwpython/main.py

Lines changed: 12 additions & 9 deletions
@@ -5,6 +5,7 @@
 
 from MountainCar import MountainCar
 from MaxEntropyIRL import MaxEntropyIRL
+from MaxEntropyDeepIRL import MaxEntropyDeepIRL
 
 #from irlwpython import __version__
 
@@ -33,6 +34,7 @@ def parse_args(args):
         action="version",
         # version=f"IRLwPython {__version__}",
     )
+    parser.add_argument('--deep', action='store_true', help="Uses Max Entropy Deep IRL.")
     parser.add_argument('--training', action='store_true', help="Enables training of model.")
     parser.add_argument('--testing', action='store_true',
                         help="Enables testing of previously created model.")
@@ -75,24 +77,25 @@ def main(args):
     theta_learning_rate = 0.05
 
     theta = -(np.random.uniform(size=(n_states,)))
-    trainer = MaxEntropyIRL(feature_matrix, theta)
 
     if args.render:
-        car = MountainCar(True, feature_matrix, one_feature, q_learning_rate, gamma, n_states, trainer)
+        car = MountainCar(True, one_feature)
     else:
-        car = MountainCar(False, feature_matrix, one_feature, q_learning_rate, gamma, n_states, trainer)
+        car = MountainCar(False, one_feature)
+
+    #if args.deep:
+    #    deep = MaxEntropyDeepIRL()
+    #    deep.run()
 
     if args.training:
         q_table = np.zeros((n_states, n_actions))
-        car.set_q_table(q_table)
-
-        car.train(theta_learning_rate)
+        trainer = MaxEntropyIRL(car, feature_matrix, one_feature, q_table, q_learning_rate, gamma, n_states, theta)
+        trainer.train(theta_learning_rate)
 
     if args.testing:
         q_table = np.load(file="./results/maxent_q_table.npy")  # (400, 3)
-        car.set_q_table(q_table)
-
-        car.test()
+        trainer = MaxEntropyIRL(car, feature_matrix, one_feature, q_table, q_learning_rate, gamma, n_states, theta)
+        trainer.test()
 
     _logger.info("Script ends here")
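With this refactor, main.py builds the environment wrapper first and hands it to the agent, so the training and testing loops now live entirely in MaxEntropyIRL. A rough sketch of the resulting wiring is below; the feature matrix, bin count, gamma, and Q-learning rate are assumed placeholders, while the constructor signatures, the theta initialization, theta_learning_rate = 0.05, and the (400, 3) Q-table shape come from the diff.

```python
import numpy as np
from MountainCar import MountainCar
from MaxEntropyIRL import MaxEntropyIRL

# Assumed placeholder hyperparameters; main.py defines the real values.
n_states, n_actions, one_feature = 400, 3, 20
gamma, q_learning_rate = 0.99, 0.03
theta_learning_rate = 0.05

feature_matrix = np.eye(n_states)   # assumed one-hot state features
theta = -(np.random.uniform(size=(n_states,)))
q_table = np.zeros((n_states, n_actions))

car = MountainCar(False, one_feature)  # False: run without rendering
trainer = MaxEntropyIRL(car, feature_matrix, one_feature, q_table,
                        q_learning_rate, gamma, n_states, theta)
trainer.train(theta_learning_rate)
# For evaluation, load a saved Q-table instead and call trainer.test().
```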
