51 commits
da4cdba  feature(pu): add unizero/muzero multitask pipeline and net plasticity… (Apr 25, 2025)
a6eed25  fix(pu): fix some adaptation bug (Apr 25, 2025)
67a0e9a  feature(pu): add unizero multitask balance pipeline for atari and dmc (Apr 29, 2025)
f083096  fix(pu): fix some adaptation bug (Apr 29, 2025)
37eb118  feature(pu): add vit encoder for unizero (Apr 29, 2025)
f32d63e  polish(pu): polish moe layer in transformer (May 1, 2025)
c0aa747  feature(pu): add eval norm mean/medium for atari (May 5, 2025)
8b3cff6  fix(pu): fix atari norm mean/median, fix collect in balance pipeline (May 7, 2025)
f2c158b  polish(pu): polish config (May 7, 2025)
20b42f7  fix(pu): fix dmc multitask to be compatiable with timestep (which is … (May 7, 2025)
39ee55e  polish(pu): polish config (May 13, 2025)
e85c449  fix(pu): fix task_id bug in balance pipeline, and polish benchmark_na… (May 14, 2025)
c16d564  fix(pu): fix benchmark_name option (May 14, 2025)
474b81c  polish(pu): fix norm score computation, adapt config to aliyun (May 21, 2025)
50e367e  polish(pu): polish unizero_mt balance pipeline use CurriculumControll… (May 23, 2025)
9171c3e  tmp (May 30, 2025)
bc5003a  Merge branch 'dev-multitask-balance-clean' of https://github.com/open… (May 30, 2025)
158e4a0  tmp (Jun 1, 2025)
d66b986  tmp (Jun 4, 2025)
0d5ede0  test(pu): add vit moe test (Jun 5, 2025)
ca6ddb6  polish(pu): add adapter_scales to tb (Jun 11, 2025)
7dd6c04  feature(pu): add atari uz balance config (Jun 12, 2025)
c8e7cb8  polish(pu): add stable_adaptor_scale (Jun 19, 2025)
0313335  tmp (Jun 23, 2025)
ef170fd  sync code (Jun 25, 2025)
bbec353  polish(pu): use freeze_non_lora_parameters in transformer, not use Le… (zjowowen, Jul 30, 2025)
20648d5  feature(pu): add vit-encoder lora in balance pipeline (zjowowen, Jul 30, 2025)
db6032a  polish(pu): fix reanalyze index bug, fix global_solved bug, add apply… (Aug 5, 2025)
f63b544  polish(pu): add collect/eval_num_simulations option (Aug 5, 2025)
bbbe505  polish(pu): polish comments and style in entry of scalezero (puyuan1996, Sep 28, 2025)
bf9f965  polish(pu): polish comments and style of ctree/tree_search/buffer/com… (puyuan1996, Sep 28, 2025)
fb04c7a  polish(pu): polish comments and style of files in lzero.model (puyuan1996, Sep 28, 2025)
06148e7  polish(pu): polish comments and style of files in lzero.model.unizero… (puyuan1996, Sep 28, 2025)
471ae6a  polish(pu): polish comments and style of unizero_world_models (puyuan1996, Sep 28, 2025)
07933a5  polish(pu): polish comments and style of files in policy/ (puyuan1996, Sep 28, 2025)
df3b644  polish(pu): polish comments and style of files in worker (puyuan1996, Sep 28, 2025)
4f89dcc  polish(pu): polish comments and style of files in configs (puyuan1996, Sep 28, 2025)
e7a8796  Merge remote-tracking branch 'origin/main' into dev-multitask-balance… (puyuan1996, Sep 28, 2025)
ab746d1  fix(pu): fix some merge typo (tAnGjIa520, Sep 28, 2025)
0476aca  fix(pu): fix ln norm_type, fix kv_cache rewrite bug, add value_priori… (tAnGjIa520, Sep 28, 2025)
2c0a965  fix(pu): fix unizero_mt (tAnGjIa520, Sep 28, 2025)
84e6094  polish(pu): add LN in head, polish init_weight, polish adamw (tAnGjIa520, Sep 29, 2025)
05da638  fix(pu): fix configure_optimizer_unizero in unizero_mt (tAnGjIa520, Oct 2, 2025)
06ad080  feature(pu): add encoder-clip, label smooth, analyze_latent_represent… (tAnGjIa520, Oct 9, 2025)
9f69f5a  feature(pu): add encoder-clip, label smooth option in unizero_multit… (tAnGjIa520, Oct 9, 2025)
af99278  fix(pu): fix tb log when gpu_num<task_num, fix total_loss += bug, polish (tAnGjIa520, Oct 9, 2025)
bf91ca2  polish(pu):polish config (tAnGjIa520, Oct 9, 2025)
b18f892  fix(pu): fix encoder-clip bug and num_channel/res bug (tAnGjIa520, Oct 11, 2025)
bf3cd12  polish(pu): polish scale_factor in DPS (tAnGjIa520, Oct 12, 2025)
b1efa60  tmp (tAnGjIa520, Oct 18, 2025)
c2f9817  feature(pu): add some analysis metrics in tensorboard for unizero and… (tAnGjIa520, Oct 23, 2025)
2 changes: 1 addition & 1 deletion .gitignore
@@ -1450,4 +1450,4 @@ events.*
!/assets/pooltool/**
lzero/mcts/ctree/ctree_alphazero/pybind11

zoo/jericho/envs/z-machine-games-master
zoo/jericho/envs/z-machine-games-master
6 changes: 5 additions & 1 deletion lzero/entry/__init__.py
@@ -10,4 +10,8 @@
from .train_rezero import train_rezero
from .train_unizero import train_unizero
from .train_unizero_segment import train_unizero_segment
from .utils import *
from .train_muzero_multitask_segment_ddp import train_muzero_multitask_segment_ddp
from .train_unizero_multitask_segment_ddp import train_unizero_multitask_segment_ddp
from .train_unizero_multitask_segment_eval import train_unizero_multitask_segment_eval
from .train_unizero_multitask_balance_segment_ddp import train_unizero_multitask_balance_segment_ddp
from .utils import *
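
The hunk above re-exports the new multitask entry points alongside the existing single-task ones, so they can be imported directly from lzero.entry. A minimal sketch of the resulting imports (only the names are taken from this diff; the functions' signatures are not shown here, so no call arguments are assumed):

# Sketch: the multitask entry points added in this PR, importable from the
# package root after the __init__.py change above.
from lzero.entry import (
    train_muzero_multitask_segment_ddp,
    train_unizero_multitask_segment_ddp,
    train_unizero_multitask_segment_eval,
    train_unizero_multitask_balance_segment_ddp,
)
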
563 changes: 563 additions & 0 deletions lzero/entry/train_muzero_multitask_segment_ddp.py

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions lzero/entry/train_unizero.py
@@ -136,6 +136,9 @@ def train_unizero(
else:
world_size = 1
rank = 0
# TODO: for visualize
# stop, reward = evaluator.eval(learner.save_checkpoint, learner.train_iter, collector.envstep)
# import sys; sys.exit(0)

while True:
# Log memory usage of the replay buffer
548 changes: 548 additions & 0 deletions lzero/entry/train_unizero_multitask_balance_segment_ddp.py

Large diffs are not rendered by default.

890 changes: 890 additions & 0 deletions lzero/entry/train_unizero_multitask_segment_ddp.py

Large diffs are not rendered by default.

408 changes: 408 additions & 0 deletions lzero/entry/train_unizero_multitask_segment_eval.py

Large diffs are not rendered by default.

4 changes: 3 additions & 1 deletion lzero/entry/train_unizero_segment.py
@@ -154,7 +154,9 @@ def train_unizero_segment(
collect_kwargs['epsilon'] = epsilon_greedy_fn(collector.envstep)

# Evaluate policy performance
if evaluator.should_eval(learner.train_iter):
# if learner.train_iter == 0 or evaluator.should_eval(learner.train_iter):
if learner.train_iter > 0 and evaluator.should_eval(learner.train_iter):

stop, reward = evaluator.eval(learner.save_checkpoint, learner.train_iter, collector.envstep)
if stop:
break
845 changes: 706 additions & 139 deletions lzero/entry/utils.py

Large diffs are not rendered by default.

293 changes: 170 additions & 123 deletions lzero/mcts/buffer/game_buffer.py

Large diffs are not rendered by default.

90 changes: 59 additions & 31 deletions lzero/mcts/buffer/game_buffer_muzero.py
@@ -61,6 +61,18 @@ def __init__(self, cfg: dict):
self.sample_times = 0
self.active_root_num = 0

if hasattr(self._cfg, 'task_id'):
self.task_id = self._cfg.task_id
print(f"Task ID is set to {self.task_id}.")
try:
self.action_space_size = self._cfg.model.action_space_size_list[self.task_id]
except Exception as e:
self.action_space_size = self._cfg.model.action_space_size

else:
self.task_id = None
print("No task_id found in configuration. Task ID is set to None.")
self.action_space_size = self._cfg.model.action_space_size
self.value_support = DiscreteSupport(*self._cfg.model.value_support_range)
self.reward_support = DiscreteSupport(*self._cfg.model.reward_support_range)
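
The added __init__ block above resolves a per-task action space when the config carries a task_id, and falls back to the shared scalar otherwise. A compact sketch of that lookup under the same assumptions (the helper name and the getattr-based fallback are illustrative; the PR itself uses a try/except at this point):

# Sketch of the per-task action-space resolution in the hunk above.
def resolve_action_space_size(model_cfg, task_id=None):
    # Prefer the per-task list when a task_id and the list are both present.
    sizes = getattr(model_cfg, 'action_space_size_list', None)
    if task_id is not None and sizes is not None:
        return sizes[task_id]
    # Otherwise fall back to the single shared action space size.
    return model_cfg.action_space_size
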

@@ -149,7 +161,7 @@ def sample(
self.compute_target_re_time += self._compute_target_timer.value

batch_target_policies_non_re = self._compute_target_policy_non_reanalyzed(
policy_non_re_context, self._cfg.model.action_space_size
policy_non_re_context, self.action_space_size
)

# fusion of batch_target_policies_re and batch_target_policies_non_re to batch_target_policies
@@ -469,17 +481,21 @@ def _compute_target_reward_value(self, reward_value_context: List[Any], model: A
end_index = self._cfg.mini_infer_size * (i + 1)
m_obs = torch.from_numpy(value_obs_list[beg_index:end_index]).to(self._cfg.device)
# calculate the target value
m_output = model.initial_inference(m_obs)

if not model.training:
# if not in training, obtain the scalars of the value/reward
[m_output.latent_state, m_output.value, m_output.policy_logits] = to_detach_cpu_numpy(
[
m_output.latent_state,
inverse_scalar_transform(m_output.value, self.value_support),
m_output.policy_logits
]
)
if self.task_id is not None:
m_output = model.initial_inference(m_obs, task_id=self.task_id)
else:
m_output = model.initial_inference(m_obs)


# if not model.training:
# if not in training, obtain the scalars of the value/reward
[m_output.latent_state, m_output.value, m_output.policy_logits] = to_detach_cpu_numpy(
[
m_output.latent_state,
inverse_scalar_transform(m_output.value, self._cfg.model.support_scale),
m_output.policy_logits
]
)

network_output.append(m_output)
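
This hunk and the policy-reanalyze hunk below both branch on task_id before calling initial_inference. A small sketch of that dispatch written once as a helper (the helper itself is an assumption; the PR uses an explicit if/else at each call site):

# Sketch: forward task_id to initial_inference only when it is set.
def run_initial_inference(model, obs, task_id=None):
    if task_id is not None:
        return model.initial_inference(obs, task_id=task_id)
    return model.initial_inference(obs)
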

@@ -594,25 +610,28 @@ def _compute_target_policy_reanalyzed(self, policy_re_context: List[Any], model:
beg_index = self._cfg.mini_infer_size * i
end_index = self._cfg.mini_infer_size * (i + 1)
m_obs = torch.from_numpy(policy_obs_list[beg_index:end_index]).to(self._cfg.device)
m_output = model.initial_inference(m_obs)

if not model.training:
# if not in training, obtain the scalars of the value/reward
[m_output.latent_state, m_output.value, m_output.policy_logits] = to_detach_cpu_numpy(
[
m_output.latent_state,
inverse_scalar_transform(m_output.value, self.value_support),
m_output.policy_logits
]
)
if self.task_id is not None:
m_output = model.initial_inference(m_obs, task_id=self.task_id)
else:
m_output = model.initial_inference(m_obs)

# if not model.training:
# if not in training, obtain the scalars of the value/reward
[m_output.latent_state, m_output.value, m_output.policy_logits] = to_detach_cpu_numpy(
[
m_output.latent_state,
inverse_scalar_transform(m_output.value, self._cfg.model.support_scale),
m_output.policy_logits
]
)

network_output.append(m_output)

_, reward_pool, policy_logits_pool, latent_state_roots = concat_output(network_output, data_type='muzero')
reward_pool = reward_pool.squeeze().tolist()
policy_logits_pool = policy_logits_pool.tolist()
noises = [
np.random.dirichlet([self._cfg.root_dirichlet_alpha] * self._cfg.model.action_space_size
np.random.dirichlet([self._cfg.root_dirichlet_alpha] * self.action_space_size
).astype(np.float32).tolist() for _ in range(transition_batch_size)
]
if self._cfg.mcts_ctree:
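
The noise generation above now uses the per-task action space. As a worked illustration (alpha = 0.3 and a 6-action space are hypothetical values, not taken from this diff), each root receives an independent Dirichlet sample that is later mixed into its prior policy:

import numpy as np

# Sketch: root exploration noise for two roots of a hypothetical 6-action task.
alpha, action_space_size, num_roots = 0.3, 6, 2
noises = [
    np.random.dirichlet([alpha] * action_space_size).astype(np.float32).tolist()
    for _ in range(num_roots)
]
# Each sample is a length-6 probability vector (sums to 1).
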
@@ -624,7 +643,11 @@ def _compute_target_policy_reanalyzed(self, policy_re_context: List[Any], model:
roots.prepare_no_noise(reward_pool, policy_logits_pool, to_play)
# do MCTS for a new policy with the recent target model
with self._origin_search_timer:
MCTSCtree(self._cfg).search(roots, model, latent_state_roots, to_play)
if self.task_id is not None:
MCTSCtree(self._cfg).search(roots, model, latent_state_roots, to_play, task_id=self.task_id)
else:
MCTSCtree(self._cfg).search(roots, model, latent_state_roots, to_play)

self.origin_search_time += self._origin_search_timer.value
else:
# python mcts_tree
@@ -634,7 +657,11 @@ def _compute_target_policy_reanalyzed(self, policy_re_context: List[Any], model:
else:
roots.prepare_no_noise(reward_pool, policy_logits_pool, to_play)
# do MCTS for a new policy with the recent target model
MCTSPtree(self._cfg).search(roots, model, latent_state_roots, to_play)
if self.task_id is not None:
MCTSPtree(self._cfg).search(roots, model, latent_state_roots, to_play, task_id=self.task_id)
else:
MCTSPtree(self._cfg).search(roots, model, latent_state_roots, to_play)


roots_legal_actions_list = legal_actions
roots_distributions = roots.get_distributions()
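
The ctree and ptree branches above repeat the same task_id check around the MCTSCtree/MCTSPtree search calls. An alternative sketch that forwards the optional keyword from one place (a suggestion, not the PR's code; it assumes both search implementations accept the same optional task_id keyword, as the calls above indicate):

# Sketch: generic wrapper so both ctree and ptree call sites share one path.
def search_with_optional_task_id(searcher, roots, model, latent_roots, to_play, task_id=None):
    kwargs = {'task_id': task_id} if task_id is not None else {}
    return searcher.search(roots, model, latent_roots, to_play, **kwargs)
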
@@ -650,7 +677,7 @@ def _compute_target_policy_reanalyzed(self, policy_re_context: List[Any], model:

if policy_mask[policy_index] == 0:
# NOTE: the invalid padding target policy, O is to make sure the corresponding cross_entropy_loss=0
target_policies.append([0 for _ in range(self._cfg.model.action_space_size)])
target_policies.append([0 for _ in range(self.action_space_size)])
else:
# NOTE: It is very important to use the latest MCTS visit count distribution.
sum_visits = sum(distributions)
@@ -659,7 +686,7 @@ def _compute_target_policy_reanalyzed(self, policy_re_context: List[Any], model:
if distributions is None:
# if at some obs, the legal_action is None, add the fake target_policy
target_policies.append(
list(np.ones(self._cfg.model.action_space_size) / self._cfg.model.action_space_size)
list(np.ones(self.action_space_size) / self.action_space_size)
)
else:
# Update the data in game segment:
@@ -676,7 +703,7 @@ def _compute_target_policy_reanalyzed(self, policy_re_context: List[Any], model:
target_policies.append(policy)
else:
# for board games that have two players and legal_actions is dy
policy_tmp = [0 for _ in range(self._cfg.model.action_space_size)]
policy_tmp = [0 for _ in range(self.action_space_size)]
# to make sure target_policies have the same dimension
sum_visits = sum(distributions)
policy = [visit_count / sum_visits for visit_count in distributions]
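
The branches above produce three kinds of target policies: an all-zero vector for invalid padded positions (so the corresponding cross-entropy loss is 0), a uniform distribution when MCTS visit counts are unavailable, and the normalized visit counts otherwise, all sized by the per-task action space. A compact sketch of those three cases (the helper is illustrative, not the PR's code):

import numpy as np

# Sketch of the three target-policy cases handled in the hunk above.
def make_target_policy(distributions, action_space_size, valid):
    if not valid:
        # Padded position: zeros make the cross-entropy term vanish.
        return [0.0] * action_space_size
    if distributions is None:
        # No MCTS result: fall back to a uniform policy.
        return list(np.ones(action_space_size) / action_space_size)
    # Normal case: normalize the visit counts from the reanalyzed search.
    sum_visits = sum(distributions)
    return [count / sum_visits for count in distributions]
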
@@ -705,7 +732,7 @@ def _compute_target_policy_non_reanalyzed(
- game_segment_lens
- action_mask_segment
- to_play_segment
- policy_shape: self._cfg.model.action_space_size
- policy_shape: self.action_space_size
Returns:
- batch_target_policies_non_re
"""
@@ -728,7 +755,7 @@ def _compute_target_policy_non_reanalyzed(
]
# NOTE: in continuous action space env: we set all legal_actions as -1
legal_actions = [
[-1 for _ in range(self._cfg.model.action_space_size)] for _ in range(transition_batch_size)
[-1 for _ in range(self.action_space_size)] for _ in range(transition_batch_size)
]
else:
legal_actions = [[i for i, x in enumerate(action_mask[j]) if x == 1] for j in range(transition_batch_size)]
@@ -778,6 +805,7 @@ def update_priority(self, train_data: List[np.ndarray], batch_priorities: Any) -
NOTE:
train_data = [current_batch, target_batch]
current_batch = [obs_list, action_list, improved_policy_list(only in Gumbel MuZero), mask_list, batch_index_list, weights, make_time_list]
target_batch = [batch_rewards, batch_target_values, batch_target_policies]
"""
indices = train_data[0][-3]
metas = {'make_time': train_data[0][-1], 'batch_priorities': batch_priorities}
Expand Down