@@ -364,12 +364,11 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
 
 class HFLanguageRepresentationNetwork(nn.Module):
     def __init__(self,
-                 model_path: str = 'google-bert/bert-base-uncased',
-                 embedding_size: int = 768,
-                 group_size: int = 8,
-                 norm_type: str = "simnorm",
-                 # norm_type: str = "layernorm",  # TODO: Why does nan appear in the first step of training?
-                 tokenizer=None):
+                 model_path: str = 'google-bert/bert-base-uncased',
+                 embedding_size: int = 768,
+                 group_size: int = 8,
+                 final_norm_option_in_encoder: str = "layernorm",
+                 tokenizer=None):
         """
         Overview:
             This class defines a language representation network that utilizes a pretrained Hugging Face model.
@@ -379,7 +378,7 @@ def __init__(self,
             - model_path (str): The path to the pretrained Hugging Face model. Default is 'google-bert/bert-base-uncased'.
             - embedding_size (int): The dimension of the output embeddings. Default is 768.
             - group_size (int): The group size for SimNorm when using normalization.
-            - norm_type (str): The type of normalization to use ("simnorm" or "layernorm"). Default is "layernorm".
+            - final_norm_option_in_encoder (str): The type of normalization to use ("simnorm" or "layernorm"). Default is "layernorm".
             - tokenizer (Optional): An instance of a tokenizer. If None, the tokenizer will be loaded from the pretrained model.
         """
         super().__init__()
@@ -389,12 +388,13 @@ def __init__(self,
 
         # In distributed training, only the rank 0 process downloads the model, and other processes load from cache to speed up startup.
         if get_rank() == 0:
-            self.model = AutoModel.from_pretrained(model_path)
+            self.pretrained_model = AutoModel.from_pretrained(model_path)
+
         if get_world_size() > 1:
             # Wait for rank 0 to finish loading the model.
             torch.distributed.barrier()
         if get_rank() != 0:
-            self.model = AutoModel.from_pretrained(model_path)
+            self.pretrained_model = AutoModel.from_pretrained(model_path)
 
         if tokenizer is None:
             # Only rank 0 downloads the tokenizer, and then other processes load it from cache.
@@ -409,15 +409,15 @@ def __init__(self,
 
         # Set the embedding dimension. A linear projection is added (the dimension remains unchanged here but can be extended for other mappings).
         self.embedding_size = embedding_size
-        self.embed_proj_head = nn.Linear(self.model.config.hidden_size, self.embedding_size)
+        self.embed_proj_head = nn.Linear(self.pretrained_model.config.hidden_size, self.embedding_size)
 
-        # Select the normalization method based on the norm_type parameter.
-        if norm_type.lower() == "simnorm":
+        # Select the normalization method based on the final_norm_option_in_encoder parameter.
+        if final_norm_option_in_encoder.lower() == "simnorm":
             self.norm = SimNorm(simnorm_dim=group_size)
-        elif norm_type.lower() == "layernorm":
+        elif final_norm_option_in_encoder.lower() == "layernorm":
             self.norm = nn.LayerNorm(embedding_size)
         else:
-            raise NotImplementedError(f"Normalization type '{norm_type}' is not implemented. "
+            raise NotImplementedError(f"Normalization type '{final_norm_option_in_encoder}' is not implemented. "
                                       f"Choose 'simnorm' or 'layernorm'.")
 
     def forward(self, x: torch.Tensor, no_grad: bool = True) -> torch.Tensor:
@@ -433,26 +433,27 @@ def forward(self, x: torch.Tensor, no_grad: bool = True) -> torch.Tensor:
         Returns:
             - torch.Tensor: The processed language embedding with shape [batch_size, embedding_size].
         """
+
         # Construct the attention mask to exclude padding tokens.
         attention_mask = x != self.tokenizer.pad_token_id
 
         # Use no_grad context if specified to disable gradient computation.
         if no_grad:
             with torch.no_grad():
                 x = x.long()  # Ensure the input tensor is of type long.
-                outputs = self.model(x, attention_mask=attention_mask)
+                outputs = self.pretrained_model(x, attention_mask=attention_mask)
                 # Get the hidden state from the last layer and select the output corresponding to the [CLS] token.
                 cls_embedding = outputs.last_hidden_state[:, 0, :]
         else:
             x = x.long()
-            outputs = self.model(x, attention_mask=attention_mask)
+            outputs = self.pretrained_model(x, attention_mask=attention_mask)
             cls_embedding = outputs.last_hidden_state[:, 0, :]
 
         # Apply linear projection to obtain the desired output dimension.
         cls_embedding = self.embed_proj_head(cls_embedding)
         # Normalize the embeddings using the selected normalization layer (SimNorm or LayerNorm) to ensure training stability.
         cls_embedding = self.norm(cls_embedding)
-
+
         return cls_embedding
 
 
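
For reference, a minimal usage sketch of the renamed interface. This is illustrative only and not part of the commit: the class name, constructor arguments, and forward signature come from the diff above; everything else is assumed standard Hugging Face usage.

```python
# Illustrative usage sketch (not part of the commit). Assumes `transformers`
# is installed and a single-process (non-distributed) setup, so get_rank() == 0
# and get_world_size() == 1 inside __init__.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('google-bert/bert-base-uncased')
encoder = HFLanguageRepresentationNetwork(
    model_path='google-bert/bert-base-uncased',
    embedding_size=768,
    final_norm_option_in_encoder='layernorm',  # or 'simnorm'
    tokenizer=tokenizer,
)

batch = tokenizer(['go to the red door', 'pick up the key'],
                  return_tensors='pt', padding=True)
embedding = encoder(batch['input_ids'])  # gradients disabled by default (no_grad=True)
print(embedding.shape)  # torch.Size([2, 768])
```
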
@@ -468,6 +469,7 @@ def __init__(
             norm_type: str = 'BN',
             embedding_dim: int = 256,
             group_size: int = 8,
+            final_norm_option_in_encoder: str = 'LayerNorm',  # TODO
     ) -> None:
         """
         Overview:
@@ -486,6 +488,8 @@ def __init__(
             - norm_type (:obj:`str`): The type of normalization in networks, defaults to 'BN'.
             - embedding_dim (:obj:`int`): The dimension of the latent state.
             - group_size (:obj:`int`): The dimension for simplicial normalization.
+            - final_norm_option_in_encoder (:obj:`str`): The normalization option for the final layer, defaults to 'LayerNorm'. \
+                Options are 'SimNorm' and 'LayerNorm'.
         """
         super().__init__()
         assert norm_type in ['BN', 'LN'], "norm_type must be in ['BN', 'LN']"
@@ -530,7 +534,14 @@ def __init__(
         elif self.observation_shape[1] in [84, 96]:
             self.last_linear = nn.Linear(64 * 6 * 6, self.embedding_dim, bias=False)
 
-        self.sim_norm = SimNorm(simnorm_dim=group_size)
+        self.final_norm_option_in_encoder = final_norm_option_in_encoder
+        if self.final_norm_option_in_encoder == 'LayerNorm':
+            self.final_norm = nn.LayerNorm(self.embedding_dim, eps=1e-5)
+        elif self.final_norm_option_in_encoder == 'SimNorm':
+            self.final_norm = SimNorm(simnorm_dim=group_size)
+        else:
+            raise ValueError(f"Unsupported final_norm_option_in_encoder: {self.final_norm_option_in_encoder}")
+
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         """
@@ -557,7 +568,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         x = x.view(-1, self.embedding_dim)
 
         # NOTE: very important for training stability.
-        x = self.sim_norm(x)
+        x = self.final_norm(x)
 
         return x
 
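
Both the language encoder and this convolutional encoder now switch between the same two final normalizations. As a rough illustration of what is being selected, here is a sketch under the assumption that SimNorm is the group-wise softmax (simplicial normalization) used elsewhere in UniZero; the repository's actual SimNorm class may differ in details.

```python
# Minimal sketch of the two final-norm options (illustration only; the
# repository's own SimNorm implementation may differ in naming and details).
import torch
import torch.nn as nn
import torch.nn.functional as F


class SimNormSketch(nn.Module):
    """Simplicial normalization: softmax over fixed-size groups of features."""

    def __init__(self, simnorm_dim: int = 8):
        super().__init__()
        self.simnorm_dim = simnorm_dim

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        shp = x.shape
        # Reshape (B, D) -> (B, D // G, G) and apply softmax within each group,
        # so every group of G features is non-negative and sums to 1.
        x = x.view(*shp[:-1], -1, self.simnorm_dim)
        x = F.softmax(x, dim=-1)
        return x.view(*shp)


embedding = torch.randn(4, 256)
print(SimNormSketch(8)(embedding).shape)             # torch.Size([4, 256])
print(nn.LayerNorm(256, eps=1e-5)(embedding).shape)  # torch.Size([4, 256])
```
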
@@ -670,6 +681,7 @@ def __init__(
             activation: nn.Module = nn.GELU(approximate='tanh'),
             norm_type: Optional[str] = 'BN',
             group_size: int = 8,
+            final_norm_option_in_encoder: str = 'LayerNorm',  # TODO
     ) -> torch.Tensor:
         """
         Overview:
@@ -700,7 +712,15 @@ def __init__(
             # last_linear_layer_init_zero=True is beneficial for convergence speed.
             last_linear_layer_init_zero=True,
         )
-        self.sim_norm = SimNorm(simnorm_dim=group_size)
+
+        # Select the normalization method based on the final_norm_option_in_encoder parameter.
+        if final_norm_option_in_encoder.lower() == "simnorm":
+            self.norm = SimNorm(simnorm_dim=group_size)
+        elif final_norm_option_in_encoder.lower() == "layernorm":
+            self.norm = nn.LayerNorm(hidden_channels)
+        else:
+            raise NotImplementedError(f"Normalization type '{final_norm_option_in_encoder}' is not implemented. "
+                                      f"Choose 'simnorm' or 'layernorm'.")
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         """
@@ -709,8 +729,8 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
             - output (:obj:`torch.Tensor`): :math:`(B, hidden_channels)`, where B is batch size.
         """
         x = self.fc_representation(x)
-        # TODO
-        x = self.sim_norm(x)
+        x = self.norm(x)
+
         return x
 
 
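
The same simnorm/layernorm selection now appears in three encoders, with slightly different spellings ('LayerNorm'/'SimNorm' compared case-sensitively in the convolutional encoder, lower-cased 'layernorm'/'simnorm' in the other two). A small hypothetical helper like the one below, which is not part of this commit and assumes the SimNorm class already defined in this file, could keep the three copies consistent.

```python
# Hypothetical helper (not in the commit): one place to build the final encoder
# norm from final_norm_option_in_encoder, reusable by all three encoders.
import torch.nn as nn


def build_final_norm(final_norm_option_in_encoder: str, out_dim: int, group_size: int = 8) -> nn.Module:
    option = final_norm_option_in_encoder.lower()
    if option == "simnorm":
        return SimNorm(simnorm_dim=group_size)  # SimNorm as defined in this module
    if option == "layernorm":
        return nn.LayerNorm(out_dim, eps=1e-5)
    raise NotImplementedError(
        f"Normalization type '{final_norm_option_in_encoder}' is not implemented. "
        f"Choose 'simnorm' or 'layernorm'."
    )
```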