
Commit 8279509

finalizing lab3 2021

1 parent e055a14
File tree

1 file changed (+59 / -36 lines)


lab3/solutions/RL_Solution.ipynb

Lines changed: 59 additions & 36 deletions
@@ -3,7 +3,7 @@
 "nbformat_minor": 0,
 "metadata": {
 "colab": {
-"name": "RL_Solution_APS.ipynb",
+"name": "RL_Solution.ipynb",
 "provenance": [],
 "collapsed_sections": []
 },
@@ -109,7 +109,7 @@
 "import base64, io, time, gym\n",
 "import IPython, functools\n",
 "import matplotlib.pyplot as plt\n",
-"import copy\n",
+"import time\n",
 "from tqdm import tqdm\n",
 "\n",
 "# !pip install mitdeeplearning\n",
@@ -624,7 +624,9 @@
 "id": "lbYHLr66i15n"
 },
 "source": [
-"env = gym.make(\"Pong-v0\", frameskip=5)\n",
+"def create_pong_env(): \n",
+" return gym.make(\"Pong-v0\", frameskip=5)\n",
+"env = create_pong_env()\n",
 "env.seed(1); # for reproducibility"
 ],
 "execution_count": null,
@@ -819,7 +821,9 @@
 "id": "YBLVfdpv7ajG"
 },
 "source": [
-"Let's also consider the fact that, unlike CartPole, the Pong environment is a *dynamic* one -- that is, the environment is changing over time, based on the actions we take and the actions of the opponent, which result in motion of the ball and motion of the paddles. Therefore, to capture the dynamics, we also consider how the environment changes by looking at the difference between a previous observation (image frame) and the current observation (image frame). We've implemented a helper function, `pong_change`, that pre-processes two frames, calculates the change between the two, and then re-normalizes the values. Let's inspect this to visualize how the environment can change:"
+"Let's also consider the fact that, unlike CartPole, the Pong environment has an additional element of uncertainty -- regardless of what action the agent takes, we don't know how the opponent will play. That is, the environment is changing over time, based on *both* the actions we take and the actions of the opponent, which result in motion of the ball and motion of the paddles.\r\n",
+"\r\n",
+"Therefore, to capture the dynamics, we also consider how the environment changes by looking at the difference between a previous observation (image frame) and the current observation (image frame). We've implemented a helper function, `pong_change`, that pre-processes two frames, calculates the change between the two, and then re-normalizes the values. Let's inspect this to visualize how the environment can change:"
 ]
 },
 {
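The `pong_change` helper referenced above ships with the mitdeeplearning package, so its exact implementation is not shown in this diff. A rough, hypothetical sketch of the idea it describes (pre-process two frames, take their difference, re-normalize); the crop region and grayscale conversion are assumptions, not the actual mdl.lab3 code:

    import numpy as np

    def preprocess_frame(frame):
        # Hypothetical pre-processing: crop away the scoreboard and convert RGB to grayscale.
        cropped = frame[35:195]                      # crop region is an assumed choice
        gray = cropped.mean(axis=-1, keepdims=True)  # RGB -> single channel
        return gray.astype(np.float32) / 255.0       # scale to [0, 1]

    def pong_change_sketch(prev_frame, curr_frame):
        # Difference of the two pre-processed frames, re-normalized to [0, 1].
        diff = preprocess_frame(curr_frame) - preprocess_frame(prev_frame)
        lo, hi = diff.min(), diff.max()
        return (diff - lo) / (hi - lo + 1e-8)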
@@ -837,7 +841,7 @@
 " a.axis(\"off\")\r\n",
 "ax[0].imshow(observation); ax[0].set_title('Previous Frame');\r\n",
 "ax[1].imshow(next_observation); ax[1].set_title('Current Frame');\r\n",
-"ax[2].imshow(np.squeeze(diff)); ax[2].set_title('Difference');"
+"ax[2].imshow(np.squeeze(diff)); ax[2].set_title('Difference (Model Input)');"
 ],
 "execution_count": null,
 "outputs": []
@@ -986,55 +990,72 @@
 "Let's run the code block to train our Pong agent. Note that, even with parallelization, completing training and getting stable behavior will take quite a bit of time (estimated at least a couple of hours). We will again visualize the evolution of the total reward as a function of training to get a sense of how the agent is learning."
 ]
 },
+{
+"cell_type": "code",
+"metadata": {
+"id": "FaEHTMRVMRXP"
+},
+"source": [
+"### Hyperparameters and setup for training ###\r\n",
+"# Rerun this cell if you want to re-initialize the training process\r\n",
+"# (i.e., create new model, reset loss, etc)\r\n",
+"\r\n",
+"# Hyperparameters\r\n",
+"learning_rate = 1e-3\r\n",
+"MAX_ITERS = 1000 # increase the maximum to train longer\r\n",
+"batch_size = 5 # number of batches to run\r\n",
+"\r\n",
+"# Model, optimizer\r\n",
+"pong_model = create_pong_model()\r\n",
+"optimizer = tf.keras.optimizers.Adam(learning_rate)\r\n",
+"iteration = 0 # counter for training steps\r\n",
+"\r\n",
+"# Plotting\r\n",
+"smoothed_reward = mdl.util.LossHistory(smoothing_factor=0.9)\r\n",
+"smoothed_reward.append(0) # start the reward at zero for baseline comparison\r\n",
+"plotter = mdl.util.PeriodicPlotter(sec=15, xlabel='Iterations', ylabel='Win Percentage (%)')\r\n",
+"\r\n",
+"# Batches and environment\r\n",
+"# To parallelize batches, we need to make multiple copies of the environment.\r\n",
+"envs = [create_pong_env() for _ in range(batch_size)] # For parallelization"
+],
+"execution_count": null,
+"outputs": []
+},
 {
 "cell_type": "code",
 "metadata": {
 "id": "xCwyQQrPnkZG"
 },
 "source": [
 "### Training Pong ###\n",
-"\n",
-"# Hyperparameters\n",
-"learning_rate = 1e-3\n",
-"MAX_ITERS = 1000 # increase the maximum to train longer\n",
+"# You can run this cell and stop it anytime in the middle of training to save \n",
+"# a progress video (see next codeblock). To continue training, simply run this \n",
+"# cell again, your model will pick up right where it left off. To reset training,\n",
+"# you need to run the cell above. \n",
 "\n",
 "games_to_win_episode = 21 # this is set by OpenAI gym and cannot be changed.\n",
 "\n",
-"# Model, optimizer\n",
-"pong_model = create_pong_model()\n",
-"optimizer = tf.keras.optimizers.Adam(learning_rate)\n",
-"\n",
-"# Plotting\n",
-"smoothed_reward = mdl.util.LossHistory(smoothing_factor=0.9)\n",
-"smoothed_reward.append(0) # start the reward at zero for baseline comparison\n",
-"plotter = mdl.util.PeriodicPlotter(sec=20, xlabel='Iterations', ylabel='Win Percentage (%)')\n",
-"\n",
-"# Batches and environment\n",
-"batch_size = 5 # number of batches to run\n",
-"# To parallelize batches, we need to make multiple copies of the environment.\n",
-"envs = [copy.deepcopy(env) for _ in range(batch_size)] # For parallelization\n",
-"\n",
 "# Main training loop\n",
-"for i_episode in range(MAX_ITERS):\n",
+"while iteration < MAX_ITERS:\n",
 "\n",
 " plotter.plot(smoothed_reward.get())\n",
 "\n",
+" tic = time.time()\n",
 " # RL agent algorithm. By default, uses serial batch processing.\n",
 " # memories = collect_rollout(batch_size, env, pong_model, choose_action)\n",
 "\n",
 " # Parallelized version. Uncomment line below (and comment out line above) to parallelize\n",
 " memories = mdl.lab3.parallelized_collect_rollout(batch_size, envs, pong_model, choose_action)\n",
+" print(time.time()-tic)\n",
 "\n",
 " # Aggregate memories from multiple batches\n",
 " batch_memory = aggregate_memories(memories)\n",
 "\n",
-" # Determine total reward and track reported as win percentage\n",
-" # net_score = sum(batch_memory.rewards) / batch_size\n",
-" # win_rate = abs(net_score) / games_to_win_episode\n",
+" # Track performance based on win percentage (calculated from rewards)\n",
 " total_wins = sum(np.array(batch_memory.rewards) == 1)\n",
 " total_games = sum(np.abs(np.array(batch_memory.rewards)))\n",
 " win_rate = total_wins / total_games\n",
-"\n",
 " smoothed_reward.append(100 * win_rate)\n",
 " \n",
 " # Training!\n",
@@ -1047,9 +1068,11 @@
 " )\n",
 "\n",
 " # Save a video of progress -- this can be played back later\n",
-" if i_episode % 500 == 0:\n",
+" if iteration % 100 == 0:\n",
 " mdl.lab3.save_video_of_model(pong_model, \"Pong-v0\", \n",
-" suffix=\"_\"+str(i_episode))\n"
+" suffix=\"_\"+str(iteration))\n",
+" \n",
+" iteration += 1 # Mark next episode"
 ],
 "execution_count": null,
 "outputs": []
@@ -1069,9 +1092,9 @@
 "id": "TvHXbkL0tR6M"
 },
 "source": [
-"final_pong = mdl.lab3.save_video_of_model(\n",
-" pong_model, \"Pong-v0\", suffix=\"final\")\n",
-"mdl.lab3.play_video(final_pong)"
+"latest_pong = mdl.lab3.save_video_of_model(\n",
+" pong_model, \"Pong-v0\", suffix=\"latest\")\n",
+"mdl.lab3.play_video(latest_pong)"
 ],
 "execution_count": null,
 "outputs": []
@@ -1088,11 +1111,11 @@
 "\n",
 "* How does the agent perform?\n",
 "* Could you train it for shorter amounts of time and still perform well?\n",
-"* Do you think that training longer would help even more? \n",
-"* How does the complexity of Pong relative to Cartpole alter the rate at which the agent learns and its performance? \n",
+"* What are some limitations of the current representation i.e., difference of current and previous frames? How is this reflected in the agent's behavior? What could be done to generate an improved representation?\n",
+"* How does the complexity of Pong relative to CartPole alter the rate at which the agent learns and its performance? \n",
 "* What are some things you could change about the agent or the learning process to potentially improve performance?\n",
 "\n",
-"Try to optimize your model to achieve improved performance. **MIT students and affiliates will be eligible for prizes during the IAP offering.** To enter the competition, please [email us](mailto:[email protected]) with your name and the following:\n",
+"Try to optimize your **Pong** model and algorithm to achieve improved performance. **MIT students and affiliates will be eligible for prizes during the IAP offering.** To enter the competition, please [email us](mailto:[email protected]) with your name and the following:\n",
 "\n",
 "* Jupyter notebook with the code you used to generate your results, **with the Pong training executed**;\n",
 "* saved video of your Pong agent competing;\n",
