-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import argparse
-import gym
-
+import parl
 import numpy as np
-
 import torch
-import torch.nn as nn
-import torch.optim as optim
-import torch.nn.functional as F
-
-import parl
+from parl.utils.scheduler import LinearDecayScheduler


 class AtariAgent(parl.Agent):
29- """Base class of the Agent .
22+ """Agent of Atari env .
3023
3124 Args:
32- algorithm (object): Algorithm used by this agent.
33- args (argparse.Namespace): Model configurations.
34- device (torch.device): use cpu or gpu.
25+ algorithm (`parl.Algorithm`): algorithm to be used in this agent.
26+ act_dim (int): action space dimension
27+ total_step (int): total epsilon decay steps
28+ start_lr (float): initial learning rate
29+ update_target_step (int): target network update frequency
3530 """
3631
37- def __init__ (self , algorithm , act_dim ):
38- assert isinstance (act_dim , int )
39- super (AtariAgent , self ).__init__ (algorithm )
32+ def __init__ (self , algorithm , act_dim , start_lr , total_step ,
33+ update_target_step ):
34+ super ().__init__ (algorithm )
35+ self .global_update_step = 0
36+ self .update_target_step = update_target_step
4037 self .act_dim = act_dim
41- self .exploration = 1
42- self .global_step = 0
43- self .update_target_steps = 10000 // 4
44-
38+ self .curr_ep = 1
39+ self .ep_end = 0.1
40+ self .lr_end = 0.00001
4541 self .device = torch .device ('cuda' if torch .cuda .
4642 is_available () else 'cpu' )

-    def save(self, filepath):
-        state = {
-            'model': self.alg.model.state_dict(),
-            'target_model': self.alg.target_model.state_dict(),
-            'optimizer': self.alg.optimizer.state_dict(),
-            'scheduler': self.alg.scheduler.state_dict(),
-            'exploration': self.exploration,
-        }
-        torch.save(state, filepath)
-
-    def restore(self, filepath):
-        checkpoint = torch.load(filepath)
-        self.exploration = checkpoint['exploration']
-        self.alg.model.load_state_dict(checkpoint['model'])
-        self.alg.target_model.load_state_dict(checkpoint['target_model'])
-        self.alg.optimizer.load_state_dict(checkpoint['optimizer'])
-        self.alg.scheduler.load_state_dict(checkpoint['scheduler'])
+        self.ep_scheduler = LinearDecayScheduler(1, total_step)
+        self.lr_scheduler = LinearDecayScheduler(start_lr, total_step)

     def sample(self, obs):
-        sample = np.random.random()
-        if sample < self.exploration:
+        """Sample an action for the given observation. Based on the current
+        epsilon value, either a greedy or a random action is returned.
+
+        Args:
+            obs (np.float32): current observation, with shape (3, 84, 84) or (1, 3, 84, 84).
+
+        Returns:
+            act (int): action
+        """
+        explore = np.random.choice([True, False],
+                                   p=[self.curr_ep, 1 - self.curr_ep])
+        if explore:
             act = np.random.randint(self.act_dim)
         else:
-            if np.random.random() < 0.01:
-                act = np.random.randint(self.act_dim)
-            else:
-                act = self.predict(obs)
-        self.exploration = max(0.1, self.exploration - 1e-6)
+            act = self.predict(obs)
+
+        self.curr_ep = max(self.ep_scheduler.step(1), self.ep_end)
         return act

     def predict(self, obs):
-        obs = np.expand_dims(obs, 0)
+        """Predict an action for the given observation; a greedy action is returned.
+
+        Args:
+            obs (np.float32): current observation, with shape (3, 84, 84) or (1, 3, 84, 84).
+
+        Returns:
+            act (int): action
+        """
+        if obs.ndim == 3:  # expand a single observation to a batch of size 1
+            obs = np.expand_dims(obs, axis=0)
+
         obs = torch.tensor(obs, dtype=torch.float, device=self.device)
-        pred_q = self.alg.predict(obs)
-        action = pred_q.max(1)[1].item()
-        return action
+        pred_q = self.alg.predict(obs).cpu().detach().numpy().squeeze()
+
+        best_actions = np.where(pred_q == pred_q.max())[0]  # break ties among equally valued actions
+        act = np.random.choice(best_actions)
+        return act

     def learn(self, obs, act, reward, next_obs, terminal):
-        if self.global_step % self.update_target_steps == 0:
+        """Update the model with a batch of sampled transitions.
+
+        Args:
+            obs (np.float32): shape (batch_size, obs_dim)
+            act (np.int32): shape (batch_size)
+            reward (np.float32): shape (batch_size)
+            next_obs (np.float32): shape (batch_size, obs_dim)
+            terminal (np.float32): shape (batch_size)
+
+        Returns:
+            loss (float)
+        """
+        if self.global_update_step % self.update_target_step == 0:
             self.alg.sync_target()
-        self.global_step += 1

-        act = np.expand_dims(act, -1)
-        terminal = np.expand_dims(terminal, -1)
-        reward = np.expand_dims(reward, -1)
+        self.global_update_step += 1
+
         reward = np.clip(reward, -1, 1)
+        act = np.expand_dims(act, axis=-1)
+        reward = np.expand_dims(reward, axis=-1)
+        terminal = np.expand_dims(terminal, axis=-1)

         obs = torch.tensor(obs, dtype=torch.float, device=self.device)
         next_obs = torch.tensor(
@@ -100,5 +114,10 @@ def learn(self, obs, act, reward, next_obs, terminal):
         terminal = torch.tensor(
             terminal, dtype=torch.float, device=self.device)

-        cost = self.alg.learn(obs, act, reward, next_obs, terminal)
-        return cost
+        loss = self.alg.learn(obs, act, reward, next_obs, terminal)
+
+        # learning rate decay
+        for param_group in self.alg.optimizer.param_groups:
+            param_group['lr'] = max(self.lr_scheduler.step(1), self.lr_end)
+
+        return loss
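
Below is a minimal usage sketch showing how the reworked agent would be constructed and driven. The `AtariModel` class, the module paths, the `DQN(model, gamma, lr)` constructor form, and every hyperparameter value here are illustrative assumptions rather than part of this commit; treat it as an outline, not as the repository's actual training script.

# Hypothetical usage sketch -- names and values below are assumptions, not part of this commit.
import parl
from atari_model import AtariModel  # hypothetical model module
from atari_agent import AtariAgent  # the module changed in this diff (path assumed)

ACT_DIM = 4              # e.g. number of actions in the chosen Atari game (assumed)
TOTAL_STEP = int(1e6)    # decay horizon shared by the epsilon and lr schedulers (assumed)
START_LR = 3e-4          # initial learning rate (assumed)

model = AtariModel(act_dim=ACT_DIM)
algorithm = parl.algorithms.DQN(model, gamma=0.99, lr=START_LR)  # constructor arguments assumed
agent = AtariAgent(
    algorithm,
    act_dim=ACT_DIM,
    start_lr=START_LR,
    total_step=TOTAL_STEP,
    update_target_step=3000)

# Inside the training loop: epsilon-greedy action selection per environment step,
# then a gradient update from a replay-buffer batch of numpy arrays.
#   act = agent.sample(obs)
#   loss = agent.learn(batch_obs, batch_act, batch_reward, batch_next_obs, batch_terminal)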