-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
36 lines (33 loc) · 1.21 KB
/
main.py
File metadata and controls
36 lines (33 loc) · 1.21 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import numpy as np
import value_iteration as vi
import gym
from gym import wrappers as wrapper
# --- Configuration --------------------------------------------------------
simulation_length = 100           # number of evaluation episodes to run
environment = 'FrozenLake8x8-v0'
directory = 'VI-frozenlake8x8-1'  # Monitor output dir (videos + stats)
env = gym.envs.make(environment)
record_video_every = 10           # record a video every N episodes
env = wrapper.Monitor(env, directory=directory,
                      video_callable=lambda count: count % record_video_every == 0,
                      resume=True)

# Find optimal value function and the optimal policy using value iteration
gamma = 1.0  # undiscounted; FrozenLake pays a single terminal reward
optimalPol, optimalV = vi.value_iteration(env, discount_factor=gamma)

# Initialise container for reward record
reward_for_av = []
for episode in range(simulation_length):
    observation = env.reset()
    for t in range(10000):  # hard step cap per episode
        env.render()
        # Act greedily according to the policy from value iteration.
        action = optimalPol[observation]
        observation, reward, done, info = env.step(action)
        if done:
            # FrozenLake yields reward 1.0 on reaching the goal, else 0.0.
            if reward == 0.0:
                print("LOSE")
            else:
                print("WIN")
            print("Episode finished after {} timesteps".format(t + 1))
            break
    reward_for_av.append(reward)
    # print average reward every 1000 steps
    # BUG FIX: original tested `simulation_length % 1000`, a constant that is
    # never 0 here, so this line could never print; use the episode counter.
    if (episode + 1) % 1000 == 0:
        print('Current average reward: %f' % np.mean(reward_for_av))

# BUG FIX: close was commented out; Monitor must be closed to flush and
# finalize its recorded videos and statistics files.
env.close()