update to fix comment

yulangz · yulangz · commit 9ae69d2995e4 · 2025-10-11T11:46:16.000+08:00
diff --git a/areal/engine/fsdp_engine.py b/areal/engine/fsdp_engine.py
@@ -121,6 +121,8 @@ def create_process_group(self, parallel_strategy: ParallelStrategy | None = None
         self.dp_head = int(self.world_mesh["sp_tp"].mesh[0].item())
         self.dp_rank = dist.get_rank(self.dp_group)
 
+        self.world_size = int(os.environ["WORLD_SIZE"])
+
         self.logger.info(f"Data parallel head {self.dp_head} and rank {self.dp_rank}")
 
     def initialize(
@@ -137,8 +139,6 @@ def initialize(
             "torch", "2.4.0"
         ), f"areal only supports FSDP2, which requires torch>=2.4.0"
 
-        self.world_size = int(os.environ["WORLD_SIZE"])
-
         # Create device model
         self.create_device_model()
 
diff --git a/areal/scheduler/rpc/rpc_client.py b/areal/scheduler/rpc/rpc_client.py
@@ -28,7 +28,6 @@ def create_engine(
         self,
         worker_id: str,
         engine_obj: Union[InferenceEngine, TrainEngine],
-        # init_config: Union[InferenceEngineConfig, TrainEngineConfig],
         *args,
         **kwargs,
     ) -> None:
diff --git a/areal/scheduler/rpc/rpc_server.py b/areal/scheduler/rpc/rpc_server.py
@@ -4,22 +4,44 @@
 import traceback
 from http import HTTPStatus
 from http.server import BaseHTTPRequestHandler, HTTPServer
-from typing import AnyStr
+from typing import Any, AnyStr, Dict, List
 
 import cloudpickle
+import torch
 from tensordict import TensorDict
 
 from areal.api.controller_api import DistributedBatch
 from areal.api.engine_api import InferenceEngine
 from areal.controller.batch import DistributedBatchMemory
 from areal.utils import logging
-from areal.utils.data import (
-    tensor_container_to,
-)
 
 logger = logging.getLogger("RPCServer")
 
 
+def tensor_container_to_safe(
+    d: Dict[str, Any] | torch.Tensor | List[torch.Tensor], *args, **kwargs
+):
+    """Apply `t.to(*args, **kwargs)` to a tensor or to all tensors in a nested dict/list.
+    Non-tensor values, including an unsupported top-level `d`, are returned unchanged.
+    """
+    new_dict = {}  # filled and returned only by the dict branch below
+    if torch.is_tensor(d):
+        return d.to(*args, **kwargs)
+    elif isinstance(d, list):
+        return [tensor_container_to_safe(v, *args, **kwargs) for v in d]
+    elif isinstance(d, dict):
+        for key, value in d.items():
+            if isinstance(value, dict) or isinstance(value, list):
+                new_dict[key] = tensor_container_to_safe(value, *args, **kwargs)
+            elif torch.is_tensor(value):
+                new_dict[key] = value.to(*args, **kwargs)
+            else:
+                new_dict[key] = value  # non-tensor leaf: keep the value as-is
+        return new_dict
+    else:
+        return d  # neither tensor, list nor dict: pass through unchanged
+
+
 def process_input_to_distributed_batch(to_device, *args, **kwargs):
     for i in range(len(args)):
         if isinstance(args[i], DistributedBatch):
@@ -31,14 +53,14 @@ def process_input_to_distributed_batch(to_device, *args, **kwargs):
         if isinstance(kwargs[k], DistributedBatch):
             kwargs[k] = kwargs[k].get_data()
 
-    args = tuple(tensor_container_to(list(args), to_device))
-    kwargs = tensor_container_to(kwargs, to_device)
+    args = tuple(tensor_container_to_safe(list(args), to_device))
+    kwargs = tensor_container_to_safe(kwargs, to_device)
 
     return args, kwargs
 
 
 def process_output_to_distributed_batch(result):
-    result = tensor_container_to(result, "cpu")
+    result = tensor_container_to_safe(result, "cpu")
     if isinstance(result, dict):
         return DistributedBatchMemory.from_dict(result)
     elif isinstance(result, TensorDict):
diff --git a/areal/utils/data.py b/areal/utils/data.py
@@ -351,10 +351,7 @@ def tensor_container_to(
                 new_dict[key] = value
         return new_dict
     else:
-        logger.warning(
-            f"Unsupported type in tensor_container_to: {type(d)}, returning original."
-        )
-        return d
+        raise ValueError(f"Unsupported type: {type(d)}")
 
 
 @dataclass