|
161 | 161 | "source": [
|
162 | 162 | "%%bash\n",
|
163 | 163 | "# Install additional packages for visualization\n",
|
164 | | - "sudo apt-get install -y xvfb python-opengl > /dev/null 2>&1\n", |
165 | | - "pip install pyvirtualdisplay > /dev/null 2>&1\n", |
| 164 | + "sudo apt-get install -y python-opengl > /dev/null 2>&1\n", |
166 | 165 | "pip install git+https://github.com/tensorflow/docs > /dev/null 2>&1"
|
167 | 166 | ]
|
168 | 167 | },
|
|
187 | 186 | "\n",
|
188 | 187 | "\n",
|
189 | 188 | "# Create the environment\n",
|
190 | | - "env = gym.make(\"CartPole-v0\")\n", |
| 189 | + "env = gym.make(\"CartPole-v1\")\n", |
191 | 190 | "\n",
|
192 | 191 | "# Set seed for experiment reproducibility\n",
|
193 | 192 | "seed = 42\n",
|
194 | | - "env.seed(seed)\n", |
195 | 193 | "tf.random.set_seed(seed)\n",
|
196 | 194 | "np.random.seed(seed)\n",
|
197 | 195 | "\n",
|
|
307 | 305 | "def env_step(action: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:\n",
|
308 | 306 | " \"\"\"Returns state, reward and done flag given an action.\"\"\"\n",
|
309 | 307 | "\n",
|
310 | | - " state, reward, done, _ = env.step(action)\n", |
| 308 | + " state, reward, done, truncated, info = env.step(action)\n", |
311 | 309 | " return (state.astype(np.float32), \n",
|
312 | 310 | " np.array(reward, np.int32), \n",
|
313 | 311 | " np.array(done, np.int32))\n",
|
|
431 | 429 | {
|
432 | 430 | "cell_type": "markdown",
|
433 | 431 | "metadata": {
|
434 | | - "id": "1hrPLrgGxlvb" |
| 432 | + "id": "qhr50_Czxazw" |
435 | 433 | },
|
436 | 434 | "source": [
|
437 | 435 | "### 3. The actor-critic loss\n",
|
438 | 436 | "\n",
|
439 | 437 | "Since a hybrid actor-critic model is used, the chosen loss function is a combination of actor and critic losses for training, as shown below:\n",
|
440 | 438 | "\n",
|
441 | | - "$$L = L_{actor} + L_{critic}$$\n", |
442 | | - "\n", |
| 439 | + "$$L = L_{actor} + L_{critic}$$" |
| 440 | + ] |
| 441 | + }, |
| 442 | + { |
| 443 | + "cell_type": "markdown", |
| 444 | + "metadata": { |
| 445 | + "id": "nOQIJuG1xdTH" |
| 446 | + }, |
| 447 | + "source": [ |
443 | 448 | "#### Actor loss\n",
|
444 | 449 | "\n",
|
445 | 450 | "The actor loss is based on [policy gradients with the critic as a state dependent baseline](https://www.youtube.com/watch?v=EKqxumCuAAY&t=62m23s) and computed with single-sample (per-episode) estimates.\n",
|
|
456 | 461 | "\n",
|
457 | 462 | "A negative term is added to the sum since the idea is to maximize the probabilities of actions yielding higher rewards by minimizing the combined loss.\n",
|
458 | 463 | "\n",
|
459 | | - "<br>\n", |
460 | | - "\n", |
| 464 | + "<br>" |
| 465 | + ] |
| 466 | + }, |
| 467 | + { |
| 468 | + "cell_type": "markdown", |
| 469 | + "metadata": { |
| 470 | + "id": "Y304O4OAxiAv" |
| 471 | + }, |
| 472 | + "source": [ |
461 | 473 | "##### Advantage\n",
|
462 | 474 | "\n",
|
463 | 475 | "The $G - V$ term in our $L_{actor}$ formulation is called the [advantage](https://spinningup.openai.com/en/latest/spinningup/rl_intro.html#advantage-functions), which indicates how much better an action is given a particular state over a random action selected according to the policy $\\pi$ for that state.\n",
|
|
468 | 480 | "\n",
|
469 | 481 | "For instance, suppose that two actions for a given state would yield the same expected return. Without the critic, the algorithm would try to raise the probability of these actions based on the objective $J$. With the critic, it may turn out that there's no advantage ($G - V = 0$) and thus no benefit gained in increasing the actions' probabilities and the algorithm would set the gradients to zero.\n",
|
470 | 482 | "\n",
|
471 | | - "<br>\n", |
472 | | - "\n", |
| 483 | + "<br>" |
| 484 | + ] |
| 485 | + }, |
| 486 | + { |
| 487 | + "cell_type": "markdown", |
| 488 | + "metadata": { |
| 489 | + "id": "1hrPLrgGxlvb" |
| 490 | + }, |
| 491 | + "source": [ |
473 | 492 | "#### Critic loss\n",
|
474 | 493 | "\n",
|
475 | 494 | "Training $V$ to be as close possible to $G$ can be set up as a regression problem with the following loss function:\n",
|
|
596 | 615 | "\n",
|
597 | 616 | "min_episodes_criterion = 100\n",
|
598 | 617 | "max_episodes = 10000\n",
|
599 | | - "max_steps_per_episode = 1000\n", |
| 618 | + "max_steps_per_episode = 500\n", |
600 | 619 | "\n",
|
601 | | - "# Cartpole-v0 is considered solved if average reward is >= 195 over 100 \n", |
| 620 | + "# Cartpole-v1 is considered solved if average reward is >= 475 over 500 \n", |
602 | 621 | "# consecutive trials\n",
|
603 | | - "reward_threshold = 195\n", |
| 622 | + "reward_threshold = 475\n", |
604 | 623 | "running_reward = 0\n",
|
605 | 624 | "\n",
|
606 | 625 | "# Discount factor for future rewards\n",
|
|
609 | 628 | "# Keep last episodes reward\n",
|
610 | 629 | "episodes_reward: collections.deque = collections.deque(maxlen=min_episodes_criterion)\n",
|
611 | 630 | "\n",
|
612 | | - "with tqdm.trange(max_episodes) as t:\n", |
613 | | - " for i in t:\n", |
614 | | - " initial_state = tf.constant(env.reset(), dtype=tf.float32)\n", |
| 631 | + "t = tqdm.trange(max_episodes)\n", |
| 632 | + "for i in t:\n", |
| 633 | + " initial_state, info = env.reset()\n", |
| 634 | + " initial_state = tf.constant(initial_state, dtype=tf.float32)\n", |
615 | 635 | " episode_reward = int(train_step(\n",
|
616 | 636 | " initial_state, model, optimizer, gamma, max_steps_per_episode))\n",
|
617 | 637 | " \n",
|
618 | 638 | " episodes_reward.append(episode_reward)\n",
|
619 | 639 | " running_reward = statistics.mean(episodes_reward)\n",
|
620 | 640 | " \n",
|
621 | | - " t.set_description(f'Episode {i}')\n", |
| 641 | + "\n", |
622 | 642 | " t.set_postfix(\n",
|
623 | 643 | " episode_reward=episode_reward, running_reward=running_reward)\n",
|
624 | 644 | " \n",
|
|
655 | 675 | "\n",
|
656 | 676 | "from IPython import display as ipythondisplay\n",
|
657 | 677 | "from PIL import Image\n",
|
658 | | - "from pyvirtualdisplay import Display\n", |
659 | | - "\n", |
660 | | - "\n", |
661 | | - "display = Display(visible=0, size=(400, 300))\n", |
662 | | - "display.start()\n", |
663 | 678 | "\n",
|
| 679 | + "render_env = gym.make(\"CartPole-v1\", render_mode='rgb_array')\n", |
664 | 680 | "\n",
|
665 | 681 | "def render_episode(env: gym.Env, model: tf.keras.Model, max_steps: int): \n",
|
666 | | - " screen = env.render(mode='rgb_array')\n", |
667 | | - " im = Image.fromarray(screen)\n", |
668 | | - "\n", |
669 | | - " images = [im]\n", |
670 | | - " \n", |
671 | | - " state = tf.constant(env.reset(), dtype=tf.float32)\n", |
| 682 | + " state, info = render_env.reset()\n", |
| 683 | + " state = tf.constant(state, dtype=tf.float32)\n", |
| 684 | + " screen = render_env.render()\n", |
| 685 | + " images = [Image.fromarray(screen)]\n", |
| 686 | + " \n", |
672 | 687 | " for i in range(1, max_steps + 1):\n",
|
673 | 688 | " state = tf.expand_dims(state, 0)\n",
|
674 | 689 | " action_probs, _ = model(state)\n",
|
675 | 690 | " action = np.argmax(np.squeeze(action_probs))\n",
|
676 | 691 | "\n",
|
677 | | - " state, _, done, _ = env.step(action)\n", |
| 692 | + " state, reward, done, truncated, info = render_env.step(action)\n", |
678 | 693 | " state = tf.constant(state, dtype=tf.float32)\n",
|
679 | 694 | "\n",
|
680 | 695 | " # Render screen every 10 steps\n",
|
681 | 696 | " if i % 10 == 0:\n",
|
682 | | - " screen = env.render(mode='rgb_array')\n", |
| 697 | + " screen = render_env.render()\n", |
683 | 698 | " images.append(Image.fromarray(screen))\n",
|
684 | 699 | " \n",
|
685 | 700 | " if done:\n",
|
|
690 | 705 | "\n",
|
691 | 706 | "# Save GIF image\n",
|
692 | 707 | "images = render_episode(env, model, max_steps_per_episode)\n",
|
693 | | - "image_file = 'cartpole-v0.gif'\n", |
| 708 | + "image_file = 'cartpole-v1.gif'\n", |
694 | 709 | "# loop=0: loop forever, duration=1: play each frame for 1ms\n",
|
695 | 710 | "images[0].save(\n",
|
696 | 711 | " image_file, save_all=True, append_images=images[1:], loop=0, duration=1)"
|
|