Commit dfae2cc

feature(nyz): add ppof cuda

1 parent 3a9f213

File tree

4 files changed: +9 -1 lines changed

ding/framework/middleware/collector.py

Lines changed: 3 additions & 0 deletions
@@ -105,6 +105,7 @@ def __call__(self, ctx: "OnlineRLContext") -> None:
         Input of ctx:
             - env_step (:obj:`int`): The env steps which will increase during collection.
         """
+        device = self.policy._device
         old = ctx.env_step
         target_size = self.n_sample * self.unroll_len

@@ -113,7 +114,9 @@ def __call__(self, ctx: "OnlineRLContext") -> None:

         while True:
             obs = ttorch.as_tensor(self.env.ready_obs).to(dtype=ttorch.float32)
+            obs = obs.to(device)
             inference_output = self.policy.collect(obs, **ctx.collect_kwargs)
+            inference_output = inference_output.cpu()
             action = inference_output.action.numpy()
             timesteps = self.env.step(action)
             ctx.env_step += len(timesteps)
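
Both hunks implement one pattern: observations come off the environment on CPU, must be moved to the policy's device before inference, and the inference output must return to CPU before `.numpy()` is called, because PyTorch refuses to convert a CUDA tensor directly to a numpy array. A minimal, self-contained sketch of that round trip, using plain torch stand-ins rather than DI-engine's actual collector:

import torch

# Stand-ins: in the real collector, `net` is the policy model and
# `device` comes from self.policy._device.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
net = torch.nn.Linear(4, 2).to(device)

# The env produces CPU data; cast and move it to the policy device.
obs = torch.as_tensor([[0.1, 0.2, 0.3, 0.4]], dtype=torch.float32).to(device)

with torch.no_grad():
    logits = net(obs)

# .numpy() only works on CPU tensors, hence the explicit .cpu() first.
action = logits.argmax(dim=-1).cpu().numpy()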

ding/framework/middleware/functional/evaluator.py

Lines changed: 3 additions & 0 deletions
@@ -343,11 +343,14 @@ def _evaluate(ctx: "OnlineRLContext"):
         else:
             env.reset()
             policy.reset()
+        device = policy._device
         eval_monitor = VectorEvalMonitor(env.env_num, n_evaluator_episode)

         while not eval_monitor.is_finished():
             obs = ttorch.as_tensor(env.ready_obs).to(dtype=ttorch.float32)
+            obs = obs.to(device)
             inference_output = policy.eval(obs)
+            inference_output = inference_output.cpu()
             if render:
                 eval_monitor.update_video(env.ready_imgs)
             eval_monitor.update_output(inference_output)
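
The evaluator needs the same round trip as the collector. One detail that makes the single `obs.to(device)` call sufficient here: `obs` is a treetensor, and treetensor methods such as `.to(...)` and `.cpu()` apply to every leaf tensor of the nested structure at once. A sketch of that behavior, assuming the DI-treetensor package is installed; the `image`/`vector` keys are made up for illustration:

import torch
import treetensor.torch as ttorch

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# A nested observation, as a vectorized env with dict observations might return.
obs = ttorch.as_tensor({
    'image': torch.zeros(2, 3, 4),
    'vector': torch.zeros(2, 8),
}).to(dtype=ttorch.float32)

obs = obs.to(device)  # moves every leaf tensor in the tree
obs = obs.cpu()       # .cpu() is likewise applied leaf-wise
print(obs.image.device, obs.vector.device)  # both report cpu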

ding/framework/middleware/functional/trainer.py

Lines changed: 2 additions & 1 deletion
@@ -71,7 +71,8 @@ def _train(ctx: Union["OnlineRLContext", "OfflineRLContext"]):

         if ctx.train_data is None:  # no enough data from data fetcher
             return
-        train_output = policy.forward(ctx.train_data)
+        data = ctx.train_data.to(policy._device)
+        train_output = policy.forward(data)
         nonlocal last_log_iter
         if ctx.train_iter - last_log_iter >= log_freq:
             loss = np.mean([o['total_loss'] for o in train_output])
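
Training is the mirror image: the fetched batch lives on CPU while the model's parameters may live on CUDA, and a forward pass mixing the two raises a device-mismatch RuntimeError. Moving the batch once, before `policy.forward`, keeps the rest of the training step device-agnostic. A minimal illustration with plain torch stand-ins, not the actual trainer middleware:

import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = torch.nn.Linear(8, 1).to(device)

train_data = torch.randn(32, 8)     # batch fetched on CPU
train_data = train_data.to(device)  # mirrors ctx.train_data.to(policy._device)

loss = model(train_data).mean()
loss.backward()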

ding/policy/ppof.py

Lines changed: 1 addition & 0 deletions
@@ -64,6 +64,7 @@ def __init__(self, cfg: "EasyDict", model: torch.nn.Module, enable_mode: List[str]
         self._model = model
         if self._cfg.cuda and torch.cuda.is_available():
             self._device = 'cuda'
+            self._model.cuda()
         else:
             self._device = 'cpu'
         assert self._cfg.action_space in ["continuous", "discrete", "hybrid", 'multi_discrete']
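
This one-line addition is the heart of the commit: the policy already recorded `self._device = 'cuda'`, but without `self._model.cuda()` the parameters stayed on CPU while the middleware above started sending CUDA inputs. Reduced to a standalone skeleton (a hypothetical class, with `cfg` standing in for the EasyDict config):

import torch

class PolicySkeleton:
    """Hypothetical skeleton of the device-selection idiom in ppof.py."""

    def __init__(self, cfg, model: torch.nn.Module) -> None:
        self._model = model
        # Falling back to CPU when CUDA is absent lets the same config
        # run unchanged on machines without a GPU.
        if cfg.cuda and torch.cuda.is_available():
            self._device = 'cuda'
            self._model.cuda()  # move the parameters, not just the label
        else:
            self._device = 'cpu'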
