
Commit 87d6c9c

Ensure tensors are created on default device
Updated tensor creation in optimizers, reward providers, and network normalization to explicitly use the configured default_device. Removed redundant set_torch_config call in trainer_controller to avoid interfering with PyTorch's global device context. These changes improve device consistency and prevent device mismatch errors in multi-threaded or multi-device training scenarios.
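For reference, a minimal sketch of the pattern this commit applies (the shape below is illustrative; torch and default_device are the real imports from mlagents.torch_utils):

from mlagents.torch_utils import torch, default_device

# Before: a bare torch.zeros(...) allocates on the CPU even when training runs on
# a GPU, which can trigger "expected all tensors to be on the same device" errors.
cpu_memory = torch.zeros((1, 1, 256))

# After: the tensor is created directly on the device ML-Agents was configured with.
memory = torch.zeros((1, 1, 256), device=default_device())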
1 parent 4e7e0b8 commit 87d6c9c

File tree

6 files changed: +20 / -17 lines

ml-agents/mlagents/trainers/optimizer/torch_optimizer.py

Lines changed: 2 additions & 2 deletions
@@ -1,5 +1,5 @@
 from typing import Dict, Optional, Tuple, List
-from mlagents.torch_utils import torch
+from mlagents.torch_utils import torch, default_device
 import numpy as np
 from collections import defaultdict

@@ -162,7 +162,7 @@ def get_trajectory_value_estimates(
             memory = self.critic_memory_dict[agent_id]
         else:
             memory = (
-                torch.zeros((1, 1, self.critic.memory_size))
+                torch.zeros((1, 1, self.critic.memory_size), device=default_device())
                 if self.policy.use_recurrent
                 else None
             )

ml-agents/mlagents/trainers/poca/optimizer_torch.py

Lines changed: 2 additions & 2 deletions
@@ -608,12 +608,12 @@ def get_trajectory_and_baseline_value_estimates(
             _init_baseline_mem = self.baseline_memory_dict[agent_id]
         else:
             _init_value_mem = (
-                torch.zeros((1, 1, self.critic.memory_size))
+                torch.zeros((1, 1, self.critic.memory_size), device=default_device())
                 if self.policy.use_recurrent
                 else None
             )
             _init_baseline_mem = (
-                torch.zeros((1, 1, self.critic.memory_size))
+                torch.zeros((1, 1, self.critic.memory_size), device=default_device())
                 if self.policy.use_recurrent
                 else None
             )

ml-agents/mlagents/trainers/policy/torch_policy.py

Lines changed: 1 addition & 0 deletions
@@ -109,6 +109,7 @@ def evaluate(
         if "log_probs" in run_out:
             run_out["log_probs"] = run_out["log_probs"].to_log_probs_tuple()
         if "entropy" in run_out:
+            # Ensure entropy is detached and moved to CPU before NumPy conversion
             run_out["entropy"] = ModelUtils.to_numpy(run_out["entropy"])
         if self.use_recurrent:
             run_out["memory_out"] = ModelUtils.to_numpy(memories).squeeze(0)

ml-agents/mlagents/trainers/torch_entities/components/reward_providers/gail_reward_provider.py

Lines changed: 7 additions & 7 deletions
@@ -143,7 +143,7 @@ def compute_estimate(
         if self._settings.use_actions:
             actions = self.get_action_input(mini_batch)
             dones = torch.as_tensor(
-                mini_batch[BufferKey.DONE], dtype=torch.float
+                mini_batch[BufferKey.DONE], dtype=torch.float, device=default_device()
             ).unsqueeze(1)
             action_inputs = torch.cat([actions, dones], dim=1)
             hidden, _ = self.encoder(inputs, action_inputs)

@@ -162,7 +162,7 @@ def compute_loss(
         """
         Given a policy mini_batch and an expert mini_batch, computes the loss of the discriminator.
         """
-        total_loss = torch.zeros(1)
+        total_loss = torch.zeros(1, device=default_device())
         stats_dict: Dict[str, np.ndarray] = {}
         policy_estimate, policy_mu = self.compute_estimate(
             policy_batch, use_vail_noise=True

@@ -219,21 +219,21 @@ def compute_gradient_magnitude(
         expert_inputs = self.get_state_inputs(expert_batch)
         interp_inputs = []
         for policy_input, expert_input in zip(policy_inputs, expert_inputs):
-            obs_epsilon = torch.rand(policy_input.shape)
+            obs_epsilon = torch.rand(policy_input.shape, device=policy_input.device)
             interp_input = obs_epsilon * policy_input + (1 - obs_epsilon) * expert_input
             interp_input.requires_grad = True  # For gradient calculation
             interp_inputs.append(interp_input)
         if self._settings.use_actions:
             policy_action = self.get_action_input(policy_batch)
             expert_action = self.get_action_input(expert_batch)
-            action_epsilon = torch.rand(policy_action.shape)
+            action_epsilon = torch.rand(policy_action.shape, device=policy_action.device)
             policy_dones = torch.as_tensor(
-                policy_batch[BufferKey.DONE], dtype=torch.float
+                policy_batch[BufferKey.DONE], dtype=torch.float, device=default_device()
             ).unsqueeze(1)
             expert_dones = torch.as_tensor(
-                expert_batch[BufferKey.DONE], dtype=torch.float
+                expert_batch[BufferKey.DONE], dtype=torch.float, device=default_device()
             ).unsqueeze(1)
-            dones_epsilon = torch.rand(policy_dones.shape)
+            dones_epsilon = torch.rand(policy_dones.shape, device=policy_dones.device)
             action_inputs = torch.cat(
                 [
                     action_epsilon * policy_action
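As a side note, here is a self-contained sketch (not the ML-Agents implementation; the discriminator and shapes are placeholders) of why the interpolation epsilon in a WGAN-style gradient penalty must live on the same device as the inputs it mixes:

import torch

def gradient_penalty(discriminator, policy_obs, expert_obs):
    # Drawing epsilon on policy_obs.device keeps the elementwise mix on one device;
    # a bare torch.rand(shape) would allocate on the CPU and break GPU batches.
    epsilon = torch.rand(policy_obs.shape, device=policy_obs.device)
    interp = epsilon * policy_obs + (1 - epsilon) * expert_obs
    interp.requires_grad_(True)
    estimate = discriminator(interp).sum()
    grad = torch.autograd.grad(estimate, interp, create_graph=True)[0]
    return torch.mean((torch.norm(grad, dim=-1) - 1) ** 2)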

ml-agents/mlagents/trainers/torch_entities/networks.py

Lines changed: 5 additions & 3 deletions
@@ -1,7 +1,7 @@
 from typing import Callable, List, Dict, Tuple, Optional, Union, Any
 import abc

-from mlagents.torch_utils import torch, nn
+from mlagents.torch_utils import torch, nn, default_device

 from mlagents_envs.base_env import ActionSpec, ObservationSpec, ObservationType
 from mlagents.trainers.torch_entities.action_model import ActionModel

@@ -86,8 +86,10 @@ def total_goal_enc_size(self) -> int:
     def update_normalization(self, buffer: AgentBuffer) -> None:
         obs = ObsUtil.from_buffer(buffer, len(self.processors))
         for vec_input, enc in zip(obs, self.processors):
-            if isinstance(enc, VectorInput):
-                enc.update_normalization(torch.as_tensor(vec_input.to_ndarray()))
+            if isinstance(enc, VectorInput):
+                enc.update_normalization(
+                    torch.as_tensor(vec_input.to_ndarray(), device=default_device())
+                )

     def copy_normalization(self, other_encoder: "ObservationEncoder") -> None:
         if self.normalize:

ml-agents/mlagents/trainers/trainer_controller.py

Lines changed: 3 additions & 3 deletions
@@ -293,9 +293,9 @@ def join_threads(self, timeout_seconds: float = 1.0) -> None:
             merge_gauges(thread_timer_stack.gauges)

     def trainer_update_func(self, trainer: Trainer) -> None:
-        torch_utils.set_torch_config(
-            TorchSettings(device=str(torch_utils.default_device()))
-        )
+        # Note: Avoid calling torch.set_default_device in worker threads; it can
+        # interfere with PyTorch's global device context manager. The policy and
+        # optimizer code explicitly places tensors on the configured default_device().
         while not self.kill_trainers:
             with hierarchical_timer("trainer_advance"):
                 trainer.advance()
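To illustrate the convention described in the comment above, a simplified sketch (the thread body and shape are illustrative; torch and default_device are the real imports): worker threads no longer reset a process-wide default device, and any tensor a trainer thread needs is placed explicitly.

import threading
from mlagents.torch_utils import torch, default_device

def trainer_update_func():
    # No global default-device mutation here; tensors are placed explicitly, so the
    # result does not depend on which thread configured PyTorch's default device.
    scratch = torch.zeros((1, 1, 64), device=default_device())
    # ... the real loop would call trainer.advance() until kill_trainers is set ...

threading.Thread(target=trainer_update_func, daemon=True).start()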
