import argparse
import multiprocessing
from multiprocessing import Pool
+ import os
+ os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
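+ # note: KMP_DUPLICATE_LIB_OK='True' only suppresses the "OpenMP runtime already initialized" abort that occurs when two copies of libiomp5 get loaded; it is a workaround, not a fix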
+ import copy


import tensorflow as tf
print(args)


- from tensorflow.keras.models import Sequential, Model
- from tensorflow.keras.layers import Dense
- from tensorflow.python.keras.layers import deserialize, serialize
- from tensorflow.python.keras.saving import saving_utils
-
-
- def unpack(model, training_config, weights):
-     restored_model = deserialize(model)
-     if training_config is not None:
-         restored_model.compile(
-             **saving_utils.compile_args_from_training_config(
-                 training_config
-             )
-         )
-     restored_model.set_weights(weights)
-     return restored_model
-
- # Hotfix function
- def make_keras_picklable():
-
-     def __reduce__(self):
-         model_metadata = saving_utils.model_metadata(self)
-         training_config = model_metadata.get("training_config", None)
-         model = serialize(self)
-         weights = self.get_weights()
-         return (unpack, (model, training_config, weights))
-
-     cls = Model
-     cls.__reduce__ = __reduce__
-
- # Run the function
- make_keras_picklable()
-
-
-
physical_devices = tf.config.experimental.list_physical_devices('GPU')
if len(physical_devices) > 0:
    tf.config.experimental.set_memory_growth(physical_devices[0], True)


-
-
env = gym.make("Pong-v0", frameskip=5, difficulty=0)
env.seed(1)  # for reproducibility

@@ -78,27 +44,30 @@ def __reduce__(self):
# observation: observation which is fed as input to the model
# Returns:
# action: choice of agent action
- def choose_action(model, observation):
+ def choose_action(model, observations):
    # add batch dimension to the observation
-     observation = np.expand_dims(observation, axis=0)
+     # observation = np.expand_dims(observation, axis=0)
    '''TODO: feed the observations through the model to predict the log probabilities of each possible action.'''
-     logits = model.predict(observation)  # TODO
+
+     logits = model.predict(observations)  # TODO
    # logits = model.predict('''TODO''')

    # pass the log probabilities through a softmax to compute true probabilities
-     prob_weights = tf.nn.softmax(logits).numpy()
+     prob_weights = tf.nn.softmax(logits)
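+     # note: prob_weights is no longer used below; the sampling step now draws directly from the logits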
    '''TODO: randomly sample from the prob_weights to pick an action.
    Hint: carefully consider the dimensionality of the input probabilities (vector) and the output action (scalar)'''
-     action = np.random.choice(
-         n_actions, size=1, p=prob_weights.flatten())[0]  # TODO
+
+     action = tf.random.categorical(logits, 1)[:, 0].numpy()
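+     # note: tf.random.categorical treats each row of logits as unnormalized log-probabilities, so this returns one sampled action per observation in the batch rather than a single scalar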
+
+     # action = np.random.choice(
+     # n_actions, size=1, p=prob_weights.flatten())[0] # TODO
    # action = np.random.choice('''TODO''', size=1, p=''''TODO''')['''TODO''']

    return action


### Reward function ###

-
# Helper function that normalizes an np.array x
def normalize(x):
    x -= np.mean(x)
@@ -109,7 +78,6 @@ def normalize(x):

### Agent Memory ###

-
class Memory:
    def __init__(self):
        self.clear()
@@ -258,7 +226,6 @@ def fix(img):
# Model and optimizer
pong_model = create_pong_model()
pong_model.build((None, 40, 40, 1))
- pong_model.save("model.h5")

optimizer = tf.keras.optimizers.Adam(learning_rate)

@@ -267,100 +234,109 @@ def fix(img):
smoothed_reward.append(-21)  # start the reward at the minimum (0-21) for baseline comparison
# plotter = mdl.util.PeriodicPlotter(
# sec=5, xlabel='Iterations', ylabel='Rewards')
- memory = Memory()
batch_size = args.batch_size
- batches = 0


- def run_episode(env, model):
-     ("running episode")
-     memory = Memory()
-     observation = env.reset()
-     previous_frame = fix(mdl.lab3.preprocess_pong(observation))
-     done = False
-     while not done:
-         # Pre-process image
-         current_frame = fix(mdl.lab3.preprocess_pong(observation))
-         obs_change = current_frame - previous_frame  # TODO
-         # obs_change = # TODO
-         action = choose_action(model, obs_change)  # TODO
-         # action = # TODO
-         # Take the chosen action
-         next_observation, reward, done, info = env.step(action)
-         memory.add_to_memory(obs_change, action, reward)  # TODO
-         observation = next_observation
-         previous_frame = current_frame
-     return memory
+ # def run_episode(env, model):
+ # print("running episode")
+ # memory = Memory()
+ # observation = env.reset()
+ # previous_frame = fix(mdl.lab3.preprocess_pong(observation))
+ # done = False
+ # while not done:
+ # # Pre-process image
+ # current_frame = fix(mdl.lab3.preprocess_pong(observation))
+ # obs_change = current_frame - previous_frame # TODO
+ #
+ # # obs_change = # TODO
+ # tic = time.time()
+ # action = choose_action(model, obs_change) # TODO
+ #
+ # # action = # TODO
+ # # Take the chosen action
+ # tic = time.time()
+ # next_observation, reward, done, info = env.step(action)
+ #
+ # memory.add_to_memory(obs_change, action, reward) # TODO
+ #
+ # observation = next_observation
+ # previous_frame = current_frame
+ # return memory


+ envs = [copy.deepcopy(env) for _ in range(batch_size)]
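+ # note: this assumes the gym environment can be duplicated with copy.deepcopy; keeping one independent copy per batch element lets a whole batch of episodes be stepped in lockstep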

for i_episode in range(MAX_ITERS):

-     # plotter.plot(smoothed_reward.get())
+     tic = time.time()
+     memories = [Memory() for _ in range(batch_size)]
+     next_observations = [single_env.reset() for single_env in envs]
+     previous_frames = [obs for obs in next_observations]
+     done = [False] * batch_size
+     actions = [0] * batch_size
+     rewards = [0] * batch_size
+     print("reiniting", time.time()-tic)

-     # # Restart the environment
-     # observation = env.reset()
-     # previous_frame = fix(mdl.lab3.preprocess_pong(observation))
-     # tic = time.time()
-     # while True:
-     # # Pre-process image
-     # current_frame = fix(mdl.lab3.preprocess_pong(observation))
-     # '''TODO: determine the observation change
-     # Hint: this is the difference between the past two frames'''
-     # obs_change = current_frame - previous_frame # TODO
-     #
-     #
-     #
-     # # obs_change = # TODO
-     # '''TODO: choose an action for the pong model, using the frame difference, and evaluate'''
-     # action = choose_action(pong_model, obs_change) # TODO
-     # # action = # TODO
-     # # Take the chosen action
-     # next_observation, reward, done, info = env.step(action)
-     # '''TODO: save the observed frame difference, the action that was taken, and the resulting reward!'''
-     # memory.add_to_memory(obs_change, action, reward) # TODO
-     #
-     # if len(memory.actions) % 3 == 0 and args.draw:
-     # z = obs_change
-     # z = (z-z.min())/ (z.max()-z.min()+1e-6)
-     # cv2.imshow('hi', cv2.resize(z, (256, 256)))
-     # cv2.waitKey(1)

+     tic = time.time()
+     while True:
+
+         current_frames = [obs for obs in next_observations]
+         diff_frames = [mdl.lab3.pong_change(prev, curr) for (prev, curr) in zip(previous_frames, current_frames)]
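+         # note: mdl.lab3.pong_change computes the change between two Pong frames, standing in for the manual fix(preprocess_pong(...)) subtraction used in the removed code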
+
+         diff_frames_not_done = [diff_frames[b] for b in range(batch_size) if not done[b]]
+         actions_not_done = choose_action(pong_model, np.array(diff_frames_not_done))
+
+         actions = [None] * batch_size
+         ind_not_done = 0
+         for b in range(batch_size):
+             if not done[b]:
+                 actions[b] = actions_not_done[ind_not_done]
+                 ind_not_done += 1
+
+         for b in range(batch_size):
+             if done[b]:
+                 continue
+             next_observations[b], rewards[b], done[b], info = envs[b].step(actions[b])
+             previous_frames[b] = current_frames[b]
+             memories[b].add_to_memory(diff_frames[b], actions[b], rewards[b])


-     import copy
+         if all(done):
+             break
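+         # note: the rollout loop exits only after every environment in the batch has finished its episode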

-     def parallel_episode(new_model):
-         print("insdie paralel")
-         # new_model = tf.keras.models.load_model('model.h5')
-         print(new_model)
-         return run_episode(env=copy.deepcopy(env), model=new_model)

+     # def parallel_episode(i):
+     # return run_episode(env=copy.deepcopy(env), model=pong_model)
+     #
    # tic = time.time()
    # memories = [parallel_episode(batch) for batch in range(batch_size)]
    # print(time.time()-tic)

-     models = [tf.keras.models.load_model('model.h5') for b in range(batch_size)]
-     tic = time.time()
-     with Pool(processes=batch_size) as pool:
-         memories = pool.map(parallel_episode, models)  # range(batch_size))
+     # models = [tf.keras.models.load_model('model.h5') for b in range(batch_size)]
+     # tic = time.time()
+     # with Pool(processes=batch_size) as pool:
+     # memories = pool.map(parallel_episode, models)#range(batch_size))
+     # print(time.time()-tic)
+
    print(time.time()-tic)

    batch_memory = Memory()
    for memory in memories:
        for step in zip(memory.observations, memory.actions, memory.rewards):
            batch_memory.add_to_memory(*step)

+
+
    def play(memory):
        for o in memory.observations:
            cv2.imshow('hi', cv2.resize(o, (500, 500)))
            cv2.waitKey(20)

-     # import pdb; pdb.set_trace()


    ### Train with this batch!!!
@@ -380,9 +356,6 @@ def play(memory):
    last_smoothed_reward = smoothed_reward.get()[-1]
    print(f"{iters}\t{round(last_smoothed_reward, 3)}")

-     tf.keras.backend.clear_session()
-     pong_model = tf.keras.models.load_model('model.h5')
-
    # begin training
    train_step(
        pong_model,
@@ -391,9 +364,6 @@ def play(memory):
        actions=np.array(batch_memory.actions),
        discounted_rewards=discount_rewards(batch_memory.rewards))

-     tf.keras.backend.clear_session()
-     del pong_model
-
    batch_memory.clear()
    # break
