-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
36 lines (33 loc) · 1.21 KB
/
main.py
File metadata and controls
36 lines (33 loc) · 1.21 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import numpy as np
import value_iteration as vi
import gym
from gym import wrappers as wrapper
# --- Configuration --------------------------------------------------------
simulation_length = 100           # number of evaluation episodes to run
environment = 'FrozenLake8x8-v0'
directory = 'VI-frozenlake8x8-1'  # Monitor output dir (videos + stats)
env = gym.envs.make(environment)
record_video_every = 10           # record a video every N episodes
env = wrapper.Monitor(env, directory=directory,
                      video_callable=lambda count: count % record_video_every == 0,
                      resume=True)

# Find optimal value function and the optimal policy using value iteration
gamma = 1.0  # undiscounted; FrozenLake pays a single terminal reward
optimalPol, optimalV = vi.value_iteration(env, discount_factor=gamma)

# Initialise container for reward record
reward_for_av = []
for episode in range(simulation_length):
    observation = env.reset()
    for t in range(10000):  # hard step cap per episode
        env.render()
        # Act greedily according to the policy from value iteration.
        action = optimalPol[observation]
        observation, reward, done, info = env.step(action)
        if done:
            # FrozenLake yields reward 1.0 on reaching the goal, else 0.0.
            if reward == 0.0:
                print("LOSE")
            else:
                print("WIN")
            print("Episode finished after {} timesteps".format(t + 1))
            break
    reward_for_av.append(reward)
    # print average reward every 1000 steps
    # BUG FIX: original tested `simulation_length % 1000`, a constant that is
    # never 0 here, so this line could never print; use the episode counter.
    if (episode + 1) % 1000 == 0:
        print('Current average reward: %f' % np.mean(reward_for_av))

# BUG FIX: close was commented out; Monitor must be closed to flush and
# finalize its recorded videos and statistics files.
env.close()