
Commit 01b1a8b

feature(zjow): impala policy for continuous action space (#551)
* Add continuous impala
* Add config file.
* polish config
* rm matrix sigma
* polish
* polish
* add unittest
* add unittest
* polish
* polish config
* polish policy
1 parent 75d8644 commit 01b1a8b
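
The commit message mentions an added config file for the continuous-action variant, but that file is not among the diffs shown below. As a rough sketch only (the config name, environment fields, and hyperparameter values here are illustrative assumptions, not the committed file), a continuous IMPALA experiment would mainly need `action_space='continuous'` in both the policy and the model:

```python
# Hypothetical sketch of a continuous-action IMPALA config; names and values are
# illustrative assumptions, not the config file added by this commit.
from easydict import EasyDict

pendulum_impala_config = EasyDict(
    dict(
        exp_name='pendulum_impala_seed0',
        env=dict(collector_env_num=8, evaluator_env_num=5, act_scale=True),
        policy=dict(
            cuda=False,
            # Selects the new continuous-action code path in IMPALAPolicy.
            action_space='continuous',
            # (int) the trajectory length to calculate v-trace target
            unroll_len=32,
            model=dict(
                obs_shape=3,
                action_shape=1,
                # The model is expected to emit logit as a dict with 'mu' and 'sigma'.
                action_space='continuous',
            ),
            learn=dict(update_per_collect=4, batch_size=16, learning_rate=3e-4),
            collect=dict(n_sample=16),
        ),
    )
)
```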

10 files changed: +265 additions, -39 deletions

ding/hpc_rl/tests/test_vtrace.py

Lines changed: 3 additions & 3 deletions
```diff
@@ -1,7 +1,7 @@
 import time
 import torch
 import torch.nn.functional as F
-from hpc_rll.origin.vtrace import vtrace_error, vtrace_data
+from hpc_rll.origin.vtrace import vtrace_error_discrete_action, vtrace_data
 from hpc_rll.rl_utils.vtrace import VTrace
 from testbase import mean_relative_error, times
 
@@ -48,7 +48,7 @@ def vtrace_val():
 
     ori_target_output.requires_grad_(True)
     ori_value.requires_grad_(True)
-    ori_loss = vtrace_error(
+    ori_loss = vtrace_error_discrete_action(
         vtrace_data(ori_target_output, ori_behaviour_output, ori_action, ori_value, ori_reward, None)
     )
     ori_loss = sum(ori_loss)
@@ -114,7 +114,7 @@ def vtrace_perf():
     ori_value.requires_grad_(True)
     for i in range(times):
         t = time.time()
-        ori_loss = vtrace_error(
+        ori_loss = vtrace_error_discrete_action(
            vtrace_data(ori_target_output, ori_behaviour_output, ori_action, ori_value, ori_reward, None)
        )
        ori_loss = sum(ori_loss)
```

ding/hpc_rl/wrapper.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -69,7 +69,7 @@ def register_runtime_fn(fn_name, runtime_name, shape):
         'ScatterConnection': ['hpc_rll.torch_utils.network.scatter_connection', 'ScatterConnection'],
         'td_lambda_error': ['hpc_rll.rl_utils.td', 'TDLambda'],
         'upgo_loss': ['hpc_rll.rl_utils.upgo', 'UPGO'],
-        'vtrace_error': ['hpc_rll.rl_utils.vtrace', 'VTrace'],
+        'vtrace_error_discrete_action': ['hpc_rll.rl_utils.vtrace', 'VTrace'],
     }
     fn_str = fn_name_mapping[fn_name]
     cls = getattr(importlib.import_module(fn_str[0]), fn_str[1])
```

ding/model/wrapper/model_wrappers.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -2,11 +2,11 @@
 from abc import ABC
 import numpy as np
 import torch
+import torch.nn.functional as F
+from torch.distributions import Categorical, Independent, Normal
 from ding.torch_utils import get_tensor_data
 from ding.rl_utils import create_noise_generator
-from torch.distributions import Categorical, Independent, Normal
 from ding.utils.data import default_collate
-import torch.nn.functional as F
 
 
 class IModelWrapper(ABC):
```

ding/policy/impala.py

Lines changed: 50 additions & 11 deletions
```diff
@@ -4,7 +4,7 @@
 import torch
 
 from ding.model import model_wrap
-from ding.rl_utils import vtrace_data, vtrace_error, get_train_sample
+from ding.rl_utils import vtrace_data, vtrace_error_discrete_action, vtrace_error_continuous_action, get_train_sample
 from ding.torch_utils import Adam, RMSprop, to_device
 from ding.utils import POLICY_REGISTRY
 from ding.utils.data import default_collate, default_decollate
@@ -48,6 +48,8 @@ class IMPALAPolicy(Policy):
         priority=False,
         # (bool) Whether use Importance Sampling Weight to correct biased update. If True, priority must be True.
         priority_IS_weight=False,
+        # (str) Which kind of action space used in IMPALAPolicy, ['discrete', 'continuous']
+        action_space='discrete',
         # (int) the trajectory length to calculate v-trace target
         unroll_len=32,
         # (bool) Whether to need policy data in process transition
@@ -97,6 +99,8 @@ def _init_learn(self) -> None:
             Learn mode init method. Called by ``self.__init__``.
             Initialize the optimizer, algorithm config and main model.
         """
+        assert self._cfg.action_space in ["continuous", "discrete"]
+        self._action_space = self._cfg.action_space
         # Optimizer
         grad_clip_type = self._cfg.learn.get("grad_clip_type", None)
         clip_value = self._cfg.learn.get("clip_value", None)
@@ -165,10 +169,21 @@ def _data_preprocess_learn(self, data: List[Dict[str, Any]]):
         else:
             data['weight'] = data.get('weight', None)
         data['obs_plus_1'] = torch.cat((data['obs'] + data['next_obs'][-1:]), dim=0)  # shape (T+1)*B,env_obs_shape
-        data['logit'] = torch.cat(
-            data['logit'], dim=0
-        ).reshape(self._unroll_len, -1, self._action_shape)  # shape T,B,env_action_shape
-        data['action'] = torch.cat(data['action'], dim=0).reshape(self._unroll_len, -1)  # shape T,B,
+        if self._action_space == 'continuous':
+            data['logit']['mu'] = torch.cat(
+                data['logit']['mu'], dim=0
+            ).reshape(self._unroll_len, -1, self._action_shape)  # shape T,B,env_action_shape
+            data['logit']['sigma'] = torch.cat(
+                data['logit']['sigma'], dim=0
+            ).reshape(self._unroll_len, -1, self._action_shape)  # shape T,B,env_action_shape
+            data['action'] = torch.cat(
+                data['action'], dim=0
+            ).reshape(self._unroll_len, -1, self._action_shape)  # shape T,B,env_action_shape
+        elif self._action_space == 'discrete':
+            data['logit'] = torch.cat(
+                data['logit'], dim=0
+            ).reshape(self._unroll_len, -1, self._action_shape)  # shape T,B,env_action_shape
+            data['action'] = torch.cat(data['action'], dim=0).reshape(self._unroll_len, -1)  # shape T,B,
         data['done'] = torch.cat(data['done'], dim=0).reshape(self._unroll_len, -1).float()  # shape T,B,
         data['reward'] = torch.cat(data['reward'], dim=0).reshape(self._unroll_len, -1)  # shape T,B,
         data['weight'] = torch.cat(
@@ -204,7 +219,11 @@ def _forward_learn(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
         # Calculate vtrace error
         data = vtrace_data(target_logit, behaviour_logit, actions, values, rewards, weights)
         g, l, r, c, rg = self._gamma, self._lambda, self._rho_clip_ratio, self._c_clip_ratio, self._rho_pg_clip_ratio
-        vtrace_loss = vtrace_error(data, g, l, r, c, rg)
+        if self._action_space == 'continuous':
+            vtrace_loss = vtrace_error_continuous_action(data, g, l, r, c, rg)
+        elif self._action_space == 'discrete':
+            vtrace_loss = vtrace_error_discrete_action(data, g, l, r, c, rg)
+
         wv, we = self._value_weight, self._entropy_weight
         total_loss = vtrace_loss.policy_loss + wv * vtrace_loss.value_loss - we * vtrace_loss.entropy_loss
         # ====================
@@ -244,10 +263,18 @@ def _reshape_data(self, output: Dict[str, Any], data: Dict[str, Any]) -> Tuple[A
             - rewards (:obj:`torch.FloatTensor`): :math:`(T, B)`
             - weights (:obj:`torch.FloatTensor`): :math:`(T, B)`
         """
-        target_logit = output['logit'].reshape(self._unroll_len + 1, -1,
-                                               self._action_shape)[:-1]  # shape (T+1),B,env_obs_shape
+        if self._action_space == 'continuous':
+            target_logit = {}
+            target_logit['mu'] = output['logit']['mu'].reshape(self._unroll_len + 1, -1,
+                                                               self._action_shape)[:-1
+                                                               ]  # shape (T+1),B,env_action_shape
+            target_logit['sigma'] = output['logit']['sigma'].reshape(self._unroll_len + 1, -1, self._action_shape
+                                                                     )[:-1]  # shape (T+1),B,env_action_shape
+        elif self._action_space == 'discrete':
+            target_logit = output['logit'].reshape(self._unroll_len + 1, -1,
+                                                   self._action_shape)[:-1]  # shape (T+1),B,env_action_shape
         behaviour_logit = data['logit']  # shape T,B
-        actions = data['action']  # shape T,B
+        actions = data['action']  # shape T,B for discrete # shape T,B,env_action_shape for continuous
         values = output['value'].reshape(self._unroll_len + 1, -1)  # shape T+1,B,env_action_shape
         rewards = data['reward']  # shape T,B
         weights_ = 1 - data['done']  # shape T,B
@@ -289,7 +316,13 @@ def _init_collect(self) -> None:
             Collect mode init method. Called by ``self.__init__``, initialize algorithm arguments and collect_model.
             Use multinomial_sample to choose action.
         """
-        self._collect_model = model_wrap(self._model, wrapper_name='multinomial_sample')
+        assert self._cfg.action_space in ["continuous", "discrete"]
+        self._action_space = self._cfg.action_space
+        if self._action_space == 'continuous':
+            self._collect_model = model_wrap(self._model, wrapper_name='reparam_sample')
+        elif self._action_space == 'discrete':
+            self._collect_model = model_wrap(self._model, wrapper_name='multinomial_sample')
+
         self._collect_model.reset()
 
     def _forward_collect(self, data: Dict[int, Any]) -> Dict[int, Dict[str, Any]]:
@@ -364,7 +397,13 @@ def _init_eval(self) -> None:
             Evaluate mode init method. Called by ``self.__init__``, initialize eval_model,
             and use argmax_sample to choose action.
         """
-        self._eval_model = model_wrap(self._model, wrapper_name='argmax_sample')
+        assert self._cfg.action_space in ["continuous", "discrete"]
+        self._action_space = self._cfg.action_space
+        if self._action_space == 'continuous':
+            self._eval_model = model_wrap(self._model, wrapper_name='deterministic_sample')
+        elif self._action_space == 'discrete':
+            self._eval_model = model_wrap(self._model, wrapper_name='argmax_sample')
+
         self._eval_model.reset()
 
     def _forward_eval(self, data: Dict[int, Any]) -> Dict[int, Any]:
```
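
For the continuous branch above, `_data_preprocess_learn` and `_reshape_data` expect `data['logit']` to be a dict of Gaussian parameters rather than a single logit tensor, and actions carry an extra action-dimension axis. A minimal sketch of the two layouts (placeholder tensors, illustrative sizes only):

```python
import torch

T, B, N = 32, 4, 6  # unroll length, batch size, action dim (illustrative sizes)

# Discrete action space: one logit tensor and integer action indices.
discrete_logit = torch.randn(T, B, N)           # shape T, B, env_action_shape
discrete_action = torch.randint(0, N, (T, B))   # shape T, B

# Continuous action space: Gaussian parameters per action dimension.
continuous_logit = {
    'mu': torch.randn(T, B, N),    # shape T, B, env_action_shape
    'sigma': torch.rand(T, B, N),  # positive std, shape T, B, env_action_shape
}
continuous_action = torch.randn(T, B, N)        # shape T, B, env_action_shape
```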

ding/policy/ppo.py

Lines changed: 6 additions & 0 deletions
```diff
@@ -92,6 +92,7 @@ def _init_learn(self) -> None:
         self._priority_IS_weight = self._cfg.priority_IS_weight
         assert not self._priority and not self._priority_IS_weight, "Priority is not implemented in PPO"
 
+        assert self._cfg.action_space in ["continuous", "discrete", "hybrid"]
         self._action_space = self._cfg.action_space
         if self._cfg.learn.ppo_param_init:
             for n, m in self._model.named_modules():
@@ -287,6 +288,7 @@ def _init_collect(self) -> None:
             Init traj and unroll length, collect model.
         """
         self._unroll_len = self._cfg.collect.unroll_len
+        assert self._cfg.action_space in ["continuous", "discrete", "hybrid"]
         self._action_space = self._cfg.action_space
         if self._action_space == 'continuous':
             self._collect_model = model_wrap(self._model, wrapper_name='reparam_sample')
@@ -399,6 +401,7 @@ def _init_eval(self) -> None:
             Evaluate mode init method. Called by ``self.__init__``.
             Init eval model with argmax strategy.
         """
+        assert self._cfg.action_space in ["continuous", "discrete", "hybrid"]
         self._action_space = self._cfg.action_space
         if self._action_space == 'continuous':
             self._eval_model = model_wrap(self._model, wrapper_name='deterministic_sample')
@@ -511,6 +514,7 @@ def default_model(self) -> Tuple[str, List[str]]:
         return 'pg', ['ding.model.template.pg']
 
     def _init_learn(self) -> None:
+        assert self._cfg.action_space in ["continuous", "discrete", "hybrid"]
         self._action_space = self._cfg.action_space
         if self._cfg.learn.ppo_param_init:
             for n, m in self._model.named_modules():
@@ -586,6 +590,7 @@ def _forward_learn(self, data: Dict[str, Any]) -> Dict[str, Any]:
         return return_infos
 
     def _init_collect(self) -> None:
+        assert self._cfg.action_space in ["continuous", "discrete", "hybrid"]
         self._action_space = self._cfg.action_space
         self._unroll_len = self._cfg.collect.unroll_len
         if self._action_space == 'continuous':
@@ -632,6 +637,7 @@ def _get_train_sample(self, data: list) -> Union[None, List[Any]]:
         return get_train_sample(data, self._unroll_len)
 
     def _init_eval(self) -> None:
+        assert self._cfg.action_space in ["continuous", "discrete", "hybrid"]
         self._action_space = self._cfg.action_space
         if self._action_space == 'continuous':
             self._eval_model = model_wrap(self._model, wrapper_name='deterministic_sample')
```

ding/rl_utils/__init__.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -16,7 +16,7 @@
 from .upgo import upgo_loss
 from .adder import get_gae, get_gae_with_default_last_value, get_nstep_return_data, get_train_sample
 from .value_rescale import value_transform, value_inv_transform
-from .vtrace import vtrace_data, vtrace_error
+from .vtrace import vtrace_data, vtrace_error_discrete_action, vtrace_error_continuous_action
 from .beta_function import beta_function_map
 from .retrace import compute_q_retraces
 from .acer import acer_policy_error, acer_value_error, acer_trust_region_update
```

ding/rl_utils/isw.py

Lines changed: 34 additions & 13 deletions
```diff
@@ -1,33 +1,54 @@
+from typing import Union
 import torch
+from torch.distributions import Categorical, Independent, Normal
 
 
-def compute_importance_weights(target_output, behaviour_output, action, requires_grad=False):
+def compute_importance_weights(
+        target_output: Union[torch.Tensor, dict],
+        behaviour_output: Union[torch.Tensor, dict],
+        action: torch.Tensor,
+        action_space_type: str = 'discrete',
+        requires_grad: bool = False
+):
     """
     Overview:
         Computing importance sampling weight with given output and action
     Arguments:
-        - target_output (:obj:`torch.Tensor`): the output taking the action by the current policy network,\
-            usually this output is network output logit
-        - behaviour_output (:obj:`torch.Tensor`): the output taking the action by the behaviour policy network,\
-            usually this output is network output logit, which is used to produce the trajectory(collector)
+        - target_output (:obj:`Union[torch.Tensor,dict]`): the output taking the action \
+            by the current policy network, \
+            usually this output is network output logit if action space is discrete, \
+            or is a dict containing parameters of action distribution if action space is continuous.
+        - behaviour_output (:obj:`Union[torch.Tensor,dict]`): the output taking the action \
+            by the behaviour policy network,\
+            usually this output is network output logit, if action space is discrete, \
+            or is a dict containing parameters of action distribution if action space is continuous.
         - action (:obj:`torch.Tensor`): the chosen action(index for the discrete action space) in trajectory,\
            i.e.: behaviour_action
+        - action_space_type (:obj:`str`): action space types in ['discrete', 'continuous']
         - requires_grad (:obj:`bool`): whether requires grad computation
     Returns:
         - rhos (:obj:`torch.Tensor`): Importance sampling weight
     Shapes:
-        - target_output (:obj:`torch.FloatTensor`): :math:`(T, B, N)`, where T is timestep, B is batch size and\
-            N is action dim
-        - behaviour_output (:obj:`torch.FloatTensor`): :math:`(T, B, N)`
+        - target_output (:obj:`Union[torch.FloatTensor,dict]`): :math:`(T, B, N)`, \
+            where T is timestep, B is batch size and N is action dim
+        - behaviour_output (:obj:`Union[torch.FloatTensor,dict]`): :math:`(T, B, N)`
         - action (:obj:`torch.LongTensor`): :math:`(T, B)`
         - rhos (:obj:`torch.FloatTensor`): :math:`(T, B)`
     """
     grad_context = torch.enable_grad() if requires_grad else torch.no_grad()
     assert isinstance(action, torch.Tensor)
+    assert action_space_type in ['discrete', 'continuous']
 
     with grad_context:
-        dist_target = torch.distributions.Categorical(logits=target_output)
-        dist_behaviour = torch.distributions.Categorical(logits=behaviour_output)
-        rhos = dist_target.log_prob(action) - dist_behaviour.log_prob(action)
-        rhos = torch.exp(rhos)
-        return rhos
+        if action_space_type == 'continuous':
+            dist_target = Independent(Normal(loc=target_output['mu'], scale=target_output['sigma']), 1)
+            dist_behaviour = Independent(Normal(loc=behaviour_output['mu'], scale=behaviour_output['sigma']), 1)
+            rhos = dist_target.log_prob(action) - dist_behaviour.log_prob(action)
+            rhos = torch.exp(rhos)
+            return rhos
+        elif action_space_type == 'discrete':
+            dist_target = Categorical(logits=target_output)
+            dist_behaviour = Categorical(logits=behaviour_output)
+            rhos = dist_target.log_prob(action) - dist_behaviour.log_prob(action)
+            rhos = torch.exp(rhos)
+            return rhos
```
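
A small self-contained check of the continuous branch of `compute_importance_weights` (a sketch with random tensors; shapes follow the docstring above, and the import path assumes the module location shown in this diff):

```python
import torch
from ding.rl_utils.isw import compute_importance_weights

T, B, N = 4, 8, 3  # timestep, batch size, action dim
target_output = {'mu': torch.randn(T, B, N), 'sigma': torch.rand(T, B, N) + 0.1}
behaviour_output = {'mu': torch.randn(T, B, N), 'sigma': torch.rand(T, B, N) + 0.1}
action = torch.randn(T, B, N)

# rho = pi_target(a|s) / pi_behaviour(a|s), computed under Independent(Normal(mu, sigma), 1),
# so the last dimension is treated as the event dimension and log-probs reduce over it.
rhos = compute_importance_weights(target_output, behaviour_output, action, action_space_type='continuous')
assert rhos.shape == (T, B)
```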

ding/rl_utils/tests/test_vtrace.py

Lines changed: 28 additions & 3 deletions
```diff
@@ -1,22 +1,47 @@
 import pytest
 import torch
-from ding.rl_utils import vtrace_data, vtrace_error
+from ding.rl_utils import vtrace_data, vtrace_error_discrete_action, vtrace_error_continuous_action
 
 
 @pytest.mark.unittest
-def test_vtrace():
+def test_vtrace_discrete_action():
     T, B, N = 4, 8, 16
     value = torch.randn(T + 1, B).requires_grad_(True)
     reward = torch.rand(T, B)
     target_output = torch.randn(T, B, N).requires_grad_(True)
     behaviour_output = torch.randn(T, B, N)
     action = torch.randint(0, N, size=(T, B))
     data = vtrace_data(target_output, behaviour_output, action, value, reward, None)
-    loss = vtrace_error(data, rho_clip_ratio=1.1)
+    loss = vtrace_error_discrete_action(data, rho_clip_ratio=1.1)
     assert all([l.shape == tuple() for l in loss])
     assert target_output.grad is None
     assert value.grad is None
     loss = sum(loss)
     loss.backward()
     assert isinstance(target_output, torch.Tensor)
     assert isinstance(value, torch.Tensor)
+
+
+@pytest.mark.unittest
+def test_vtrace_continuous_action():
+    T, B, N = 4, 8, 16
+    value = torch.randn(T + 1, B).requires_grad_(True)
+    reward = torch.rand(T, B)
+    target_output = {}
+    target_output['mu'] = torch.randn(T, B, N).requires_grad_(True)
+    target_output['sigma'] = torch.exp(torch.randn(T, B, N).requires_grad_(True))
+    behaviour_output = {}
+    behaviour_output['mu'] = torch.randn(T, B, N)
+    behaviour_output['sigma'] = torch.exp(torch.randn(T, B, N))
+    action = torch.randn((T, B, N))
+    data = vtrace_data(target_output, behaviour_output, action, value, reward, None)
+    loss = vtrace_error_continuous_action(data, rho_clip_ratio=1.1)
+    assert all([l.shape == tuple() for l in loss])
+    assert target_output['mu'].grad is None
+    assert target_output['sigma'].grad is None
+    assert value.grad is None
+    loss = sum(loss)
+    loss.backward()
+    assert isinstance(target_output['mu'], torch.Tensor)
+    assert isinstance(target_output['sigma'], torch.Tensor)
+    assert isinstance(value, torch.Tensor)
```
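
The new unit tests can be exercised through pytest's Python API as well as from the command line (a sketch; the `-sv` flags and the repository-root working directory are assumptions):

```python
# Minimal runner for the updated v-trace tests; assumes execution from the
# repository root so the relative path resolves.
import pytest

if __name__ == '__main__':
    pytest.main(['-sv', '-m', 'unittest', 'ding/rl_utils/tests/test_vtrace.py'])
```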
