
Commit 9c02156

Merge pull request #3 from HokageM/feat/pytorch_irl
doc: add docu
2 parents 3529a8e + 646ae24

File tree

6 files changed: +84, -21 lines


src/irlwpytorch/MaxEntropyIRL.py

Lines changed: 22 additions & 6 deletions
@@ -6,20 +6,29 @@ def __init__(self, feature_matrix, theta):
         self.feature_matrix = feature_matrix
         self.theta = theta
 
-    def __enter__(self):
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        pass
-
     def get_feature_matrix(self):
+        """
+        Returns the feature matrix.
+        :return:
+        """
         return self.feature_matrix
 
     def get_reward(self, n_states, state_idx):
+        """
+        Returns the achieved reward.
+        :param n_states:
+        :param state_idx:
+        :return:
+        """
         irl_rewards = self.feature_matrix.dot(self.theta).reshape((n_states,))
         return irl_rewards[state_idx]
 
     def expert_feature_expectations(self, demonstrations):
+        """
+        Returns the feature expectations.
+        :param demonstrations:
+        :return:
+        """
         feature_expectations = np.zeros(self.feature_matrix.shape[0])
 
         for demonstration in demonstrations:
@@ -30,6 +39,13 @@ def expert_feature_expectations(self, demonstrations):
         return feature_expectations
 
     def maxent_irl(self, expert, learner, learning_rate):
+        """
+        Max Entropy Learning step.
+        :param expert:
+        :param learner:
+        :param learning_rate:
+        :return:
+        """
         gradient = expert - learner
         self.theta += learning_rate * gradient

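Taken together, the documented methods implement the core of tabular MaxEnt IRL: the reward of a state is its feature row dotted with theta, expert feature expectations are accumulated from the demonstrations, and maxent_irl() moves theta along the difference between expert and learner expectations. Below is a minimal NumPy sketch of that update; the expert and learner vectors and the learning rate are synthetic stand-ins, not values from the commit.

import numpy as np

# Shapes follow main.py in this commit: 400 discretized states, one-hot features.
n_states = 400
feature_matrix = np.eye(n_states)   # each state gets its own reward parameter
theta = np.zeros(n_states)          # reward weights (illustrative initialization)

# Stand-ins for expert_feature_expectations(demonstrations) and the learner's
# state-visit counts accumulated during Q-learning.
expert = np.random.dirichlet(np.ones(n_states))
learner = np.random.dirichlet(np.ones(n_states))

# maxent_irl(): one gradient step toward the expert's feature expectations.
learning_rate = 0.05
theta += learning_rate * (expert - learner)

# get_reward(): the reward of a state is feature_matrix.dot(theta) at its index.
irl_rewards = feature_matrix.dot(theta).reshape((n_states,))
print(irl_rewards[7])
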
src/irlwpytorch/MountainCar.py

Lines changed: 60 additions & 13 deletions
@@ -18,23 +18,26 @@ def __init__(self, animation, feature_matrix, one_feature, q_learning_rate, gamm
         self.n_states = n_states
         self.trainer = trainer
 
-    def __enter__(self):
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        pass
-
     def set_q_table(self, table):
+        """
+        Sets the Q-table.
+        :param table:
+        :return:
+        """
         self.q_table = table
 
-    def idx_demo(self, one_feature):
+    def get_demonstrations(self, one_feature):
+        """
+        Parses the expert demonstrations and returns them.
+        :param one_feature:
+        :return:
+        """
         env_low = self.env.observation_space.low
         env_high = self.env.observation_space.high
         env_distance = (env_high - env_low) / self.one_feature
 
         raw_demo = np.load(file="expert_demo/expert_demo.npy")
         demonstrations = np.zeros((len(raw_demo), len(raw_demo[0]), 3))
-
         for x in range(len(raw_demo)):
             for y in range(len(raw_demo[0])):
                 position_idx = int((raw_demo[x][y][0] - env_low[0]) / env_distance[0])
@@ -47,7 +50,12 @@ def idx_demo(self, one_feature):
         return demonstrations
 
     def idx_to_state(self, state):
-        """ Convert pos and vel about mounting car environment to the integer value"""
+        """
+        Converts state (pos, vel) to the integer value using the mountain car environment.
+        :param state:
+        :return:
+        """
+        """ """
         env_low = self.env.observation_space.low
         env_high = self.env.observation_space.high
         env_distance = (env_high - env_low) / self.one_feature
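
Both get_demonstrations() and idx_to_state() rely on the same discretization: the continuous (position, velocity) observation is binned into a 20 x 20 grid via env_distance. The hunk does not show how the two bin indices are combined into a single state id, so the combination below is an assumption; the bounds are the standard MountainCar-v0 observation limits.

import numpy as np

one_feature = 20                                  # bins per dimension, as in main.py
env_low = np.array([-1.2, -0.07])                 # MountainCar-v0 observation_space.low
env_high = np.array([0.6, 0.07])                  # MountainCar-v0 observation_space.high
env_distance = (env_high - env_low) / one_feature

def idx_to_state(state):
    # Bin position and velocity separately, then combine into one index in [0, 400).
    position_idx = int((state[0] - env_low[0]) / env_distance[0])
    velocity_idx = int((state[1] - env_low[1]) / env_distance[1])
    return position_idx + velocity_idx * one_feature   # combination assumed, not shown in this diff

print(idx_to_state(np.array([-0.5, 0.0])))        # prints an index between 0 and 399
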
@@ -57,38 +65,73 @@ def idx_to_state(self, state):
         return state_idx
 
     def update_q_table(self, state, action, reward, next_state):
+        """
+        Updates the Q table for a specified state and action.
+        :param state:
+        :param action:
+        :param reward:
+        :param next_state:
+        :return:
+        """
         q_1 = self.q_table[state][action]
         q_2 = reward + self.gamma * max(self.q_table[next_state])
         self.q_table[state][action] += self.q_learning_rate * (q_2 - q_1)
 
     def env_render(self):
+        """
+        Computes the render frames as specified by render_mode attribute during initialization of the environment.
+        :return:
+        """
         self.env.render()
 
     def env_reset(self):
+        """
+        Resets the environment to an initial state and returns the initial observation.
+        Start position is in random range of [-0.6, -0.4].
+        :return:
+        """
         return self.env.reset()
 
     def env_step(self, action):
+        """
+        Runs one timestep of the environment's dynamics.
+        :param action:
+        :return:
+        """
         return self.env.step(action)
 
     def train(self, theta_learning_rate):
-        demonstrations = self.idx_demo(self.one_feature)
-
+        """
+        Trains a model.
+        :param theta_learning_rate:
+        :return:
+        """
+        demonstrations = self.get_demonstrations(self.one_feature)
+
+        # Get expert feature expectations
         expert = self.trainer.expert_feature_expectations(demonstrations)
+
+        # Learning
         learner_feature_expectations = np.zeros(self.n_states)
         episodes, scores = [], []
-
+        # For every episode
         for episode in range(30000):
+            # Resets the environment to an initial state and returns the initial observation.
+            # Start position is in random range of [-0.6, -0.4]
             state = self.env_reset()
             score = 0
 
+            # Mini-Batches ?
             if (episode != 0 and episode == 10000) or (episode > 10000 and episode % 5000 == 0):
                 learner = learner_feature_expectations / episode
                 self.trainer.maxent_irl(expert, learner, theta_learning_rate)
 
+            # One Step in environment
             state = state[0]
             while True:
                 state_idx = self.idx_to_state(state)
                 action = np.argmax(self.q_table[state_idx])
+                # Run one timestep of the environment's dynamics.
                 next_state, reward, done, _, _ = self.env_step(action)
 
                 irl_reward = self.trainer.get_reward(self.n_states, state_idx)
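
The training loop wires tabular Q-learning to the IRL reward: the agent acts greedily on its Q-table, but update_q_table() backs up the reward produced by get_reward() rather than the environment reward. A condensed sketch of one such step; gamma and q_learning_rate mirror the constructor arguments, and their values here are placeholders.

import numpy as np

n_states, n_actions = 400, 3
q_table = np.zeros((n_states, n_actions))
gamma, q_learning_rate = 0.99, 0.03             # placeholder hyperparameters

def update_q_table(state, action, reward, next_state):
    # One-step Q-learning backup, as in MountainCar.update_q_table().
    q_1 = q_table[state][action]
    q_2 = reward + gamma * max(q_table[next_state])
    q_table[state][action] += q_learning_rate * (q_2 - q_1)

# Inside the episode loop, the environment reward is replaced by the IRL reward:
state_idx = 207                                  # from idx_to_state(state)
action = int(np.argmax(q_table[state_idx]))
irl_reward = -1.0                                # stand-in for trainer.get_reward(n_states, state_idx)
next_state_idx = 208                             # from idx_to_state(next_state)
update_q_table(state_idx, action, irl_reward, next_state_idx)
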
@@ -104,14 +147,18 @@ def train(self, theta_learning_rate):
                     episodes.append(episode)
                     break
 
-            if episode % 100 == 0:
+            if episode % 1000 == 0:
                 score_avg = np.mean(scores)
                 print('{} episode score is {:.2f}'.format(episode, score_avg))
                 plt.plot(episodes, scores, 'b')
                 plt.savefig("./learning_curves/maxent_30000.png")
                 np.save("./results/maxent_30000_table", arr=self.q_table)
 
     def test(self):
+        """
+        Tests the previously trained model.
+        :return:
+        """
         episodes, scores = [], []
 
         for episode in range(10):
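
train() periodically writes the learning curve and the Q-table to disk, and test() replays the greedy policy for ten episodes. Below is a rough stand-alone sketch of reloading that table and running one greedy rollout; the .npy suffix appended by np.save, the render_mode keyword, and the index combination are assumptions about the surrounding setup.

import gym
import numpy as np

one_feature = 20
env = gym.make("MountainCar-v0", render_mode="human")
env_low, env_high = env.observation_space.low, env.observation_space.high
env_distance = (env_high - env_low) / one_feature

q_table = np.load("./results/maxent_30000_table.npy")   # written by train() via np.save

state, _ = env.reset()        # start position is random in [-0.6, -0.4]
score = 0
while True:
    # Discretize as in the idx_to_state() sketch above.
    position_idx = int((state[0] - env_low[0]) / env_distance[0])
    velocity_idx = int((state[1] - env_low[1]) / env_distance[1])
    state_idx = position_idx + velocity_idx * one_feature

    action = int(np.argmax(q_table[state_idx]))
    state, reward, done, truncated, _ = env.step(action)
    score += reward
    if done or truncated:
        print("episode score:", score)
        break
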
Two further changed files (2.61 KB and 3.73 KB) are not rendered in this view.

src/irlwpytorch/main.py

Lines changed: 2 additions & 2 deletions
@@ -65,8 +65,8 @@ def main(args):
     args = parse_args(args)
     _logger.debug("Starting crazy calculations...")
 
-    n_states = 400 # position - 20, velocity - 20
-    n_actions = 3
+    n_states = 400 # position - 20, velocity - 20 -> 20*20
+    n_actions = 3 # Accelerate to the left: 0, Don’t accelerate: 1, Accelerate to the right: 2
     one_feature = 20 # number of state per one feature
     feature_matrix = np.eye(n_states) # (400, 400)

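The expanded comments make the encoding explicit: the 400 states are 20 position bins times 20 velocity bins, the three actions are push left (0), no push (1), and push right (2), and np.eye(400) gives every discretized state its own indicator feature, so theta holds one reward weight per state. A short check of that last point:

import numpy as np

one_feature = 20
n_states = one_feature * one_feature      # 20 position bins * 20 velocity bins = 400
feature_matrix = np.eye(n_states)         # (400, 400)
theta = np.random.uniform(size=n_states)

# With one-hot features, the reward of state i is exactly theta[i].
state_idx = 123
assert feature_matrix[state_idx].dot(theta) == theta[state_idx]
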
Binary file (0 bytes) not shown.
