
Commit 2e98102

puyuan1996 and puyuan authored
fix(pu): adapt atari and dmc2gym env to support shared_memory (#345)
* fix(pu): fix atari and dmc2gym env to support shared_memory
* tmp
* fix(pu): fix frame_stack_num default cfg in atari env

---------

Co-authored-by: puyuan <[email protected]>
1 parent 68a0c38 commit 2e98102

File tree

4 files changed: +66 −18 lines changed


.gitignore

Lines changed: 2 additions & 0 deletions
@@ -1449,3 +1449,5 @@ events.*
 # pooltool-specific stuff
 !/assets/pooltool/**
 lzero/mcts/ctree/ctree_alphazero/pybind11
+
+zoo/jericho/envs/z-machine-games-master

zoo/atari/config/atari_muzero_config.py

Lines changed: 6 additions & 7 deletions
@@ -14,7 +14,7 @@
 update_per_collect = None
 replay_ratio = 0.25
 batch_size = 256
-max_env_step = int(2e5)
+max_env_step = int(5e5)
 reanalyze_ratio = 0.
 
 # =========== for debug ===========
@@ -33,13 +33,13 @@
     env=dict(
         stop_value=int(1e6),
         env_id=env_id,
-        observation_shape=(4, 64, 64),  # (4, 96, 96)
+        observation_shape=(4, 64, 64),
         frame_stack_num=4,
         gray_scale=True,
         collector_env_num=collector_env_num,
         evaluator_env_num=evaluator_env_num,
         n_evaluator_episode=evaluator_env_num,
-        manager=dict(shared_memory=False, ),
+        manager=dict(shared_memory=True, ),
         # TODO: debug
         # collect_max_episode_steps=int(50),
         # eval_max_episode_steps=int(50),
@@ -48,17 +48,16 @@
         analysis_sim_norm=False,
         cal_dormant_ratio=False,
         model=dict(
-            observation_shape=(4, 64, 64),  # (4, 96, 96)
-            image_channel=1,
+            observation_shape=(4, 64, 64),
             frame_stack_num=4,
+            image_channel=1,
             gray_scale=True,
             action_space_size=action_space_size,
             downsample=True,
-            self_supervised_learning_loss=True,  # default is False
+            self_supervised_learning_loss=True,
             discrete_action_encoding_type='one_hot',
             norm_type='BN',
             use_sim_norm=True,
-            use_sim_norm_kl_loss=False,
             model_type='conv'
         ),
         cuda=True,
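The key functional change in this config is flipping manager=dict(shared_memory=True, ). With a subprocess env manager, shared_memory typically preallocates one buffer per observation field from the env's declared observation space, so every field needs a fixed shape and dtype; that is what the env-side patches below provide. The following is an illustrative sketch only (not DI-engine's actual implementation) of how such buffers might be sized from a gym.spaces.Dict, using space values mirroring the Atari settings above.

# Illustrative sketch: sizing one shared buffer per Dict key is only
# possible when every sub-space declares a concrete shape and dtype.
import numpy as np
import gym
from multiprocessing import shared_memory

obs_space = gym.spaces.Dict({
    'observation': gym.spaces.Box(low=0, high=1, shape=(1, 64, 64), dtype=np.float32),
    'action_mask': gym.spaces.Box(low=0, high=1, shape=(6,), dtype=np.int8),
    'to_play': gym.spaces.Box(low=-1, high=2, shape=(), dtype=np.int8),
    'timestep': gym.spaces.Box(low=0, high=int(1e5), shape=(), dtype=np.int32),
})

buffers = {}
for key, space in obs_space.spaces.items():
    n_items = int(np.prod(space.shape)) if space.shape else 1  # scalar Box -> 1 element
    buffers[key] = shared_memory.SharedMemory(create=True, size=n_items * space.dtype.itemsize)

# Worker processes would attach to these blocks by name and write ndarray
# views into them, which relies on each field having a fixed shape and dtype.
for buf in buffers.values():
    buf.close()
    buf.unlink()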

zoo/atari/envs/atari_lightzero_env.py

Lines changed: 28 additions & 3 deletions
@@ -2,7 +2,6 @@
 from ditk import logging
 from typing import List
 
-# import gymnasium as gym
 import gym
 import numpy as np
 from ding.envs import BaseEnv, BaseEnvTimestep
@@ -50,6 +49,9 @@ class AtariEnvLightZero(BaseEnv):
         replay_path=None,
         # (bool) If set to True, the game screen is converted to grayscale, reducing the complexity of the observation space.
         gray_scale=True,
+        # (int) Specifies the number of consecutive frames to stack after collecting environment data.
+        # The stacking process is applied within the collector and evaluator modules.
+        frame_stack_num=1,
         # (int) The number of frames to skip between each action. Higher values result in faster simulation.
         frame_skip=4,
         # (bool) If True, the game ends when the agent loses a life, otherwise, the game only ends when all lives are lost.
@@ -112,7 +114,28 @@ def reset(self) -> dict:
         if not self._init_flag:
             # Create and return the wrapped environment for Atari LightZero.
             self._env = wrap_lightzero(self.cfg, episode_life=self.cfg.episode_life, clip_rewards=self.cfg.clip_rewards)
-            self._observation_space = self._env.env.observation_space
+
+            observation_space_before_stack = (
+                int(self.cfg.observation_shape[0] / self.cfg.frame_stack_num),
+                self.cfg.observation_shape[1],
+                self.cfg.observation_shape[2]
+            )
+
+            self._observation_space = gym.spaces.Dict({
+                'observation': gym.spaces.Box(
+                    low=0, high=1, shape=observation_space_before_stack, dtype=np.float32
+                ),
+                'action_mask': gym.spaces.Box(
+                    low=0, high=1, shape=(self._env.env.action_space.n,), dtype=np.int8
+                ),
+                'to_play': gym.spaces.Box(
+                    low=-1, high=2, shape=(), dtype=np.int8
+                ),
+                'timestep': gym.spaces.Box(
+                    low=0, high=self.cfg.collect_max_episode_steps, shape=(), dtype=np.int32
+                ),
+            })
+
             self._action_space = self._env.env.action_space
             self._reward_space = gym.spaces.Box(
                 low=self._env.env.reward_range[0], high=self._env.env.reward_range[1], shape=(1,), dtype=np.float32
@@ -174,8 +197,10 @@ def observe(self) -> dict:
         observation = np.transpose(observation, (2, 0, 1))
 
         action_mask = np.ones(self._action_space.n, 'int8')
-        return {'observation': observation, 'action_mask': action_mask, 'to_play': -1, 'timestep': self._timestep}
 
+        return {'observation': observation, 'action_mask': action_mask, 'to_play': np.array(-1), 'timestep': np.array(self._timestep)}
+
+
     @property
     def legal_actions(self):
         return np.arange(self._action_space.n)
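Two details of this patch are worth spelling out. First, frame stacking happens in the collector/evaluator rather than inside the env, so the declared per-step observation shape divides the stacked channel count by frame_stack_num: with observation_shape=(4, 64, 64) and frame_stack_num=4, the env itself emits (1, 64, 64) frames. Second, to_play and timestep are now returned as 0-d np.array values instead of bare Python ints, so each field has a concrete shape that a shared-memory slot can be sized for. A small illustrative check, assuming the Atari config values above (the dtypes are pinned explicitly here for the asserts; the patch itself uses plain np.array):

import numpy as np

# Per-step shape emitted by the env before the collector stacks frames.
observation_shape = (4, 64, 64)   # stacked shape from the config
frame_stack_num = 4
per_frame_shape = (observation_shape[0] // frame_stack_num, *observation_shape[1:])
assert per_frame_shape == (1, 64, 64)

# Wrapping scalars as 0-d arrays gives them a concrete shape (and dtype),
# unlike bare Python ints.
to_play = np.array(-1, dtype=np.int8)
timestep = np.array(0, dtype=np.int32)
assert to_play.shape == () and timestep.shape == ()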

zoo/dmc2gym/envs/dmc2gym_lightzero_env.py

Lines changed: 30 additions & 8 deletions
@@ -5,7 +5,8 @@
 from typing import Optional
 
 import dmc2gym
-import gymnasium as gym
+# import gymnasium as gym
+import gym
 import matplotlib.pyplot as plt
 import numpy as np
 from ding.envs import BaseEnv, BaseEnvTimestep, WarpFrameWrapper, ScaledFloatFrameWrapper, \
@@ -255,7 +256,7 @@ def __init__(self, cfg: dict = {}) -> None:
         self._init_flag = False
         self._replay_path = self._cfg.replay_path
 
-        self._observation_space = dmc2gym_env_info[self._cfg.domain_name][self._cfg.task_name]["observation_space"](
+        self._observation_space_origin = dmc2gym_env_info[self._cfg.domain_name][self._cfg.task_name]["observation_space"](
            from_pixels=self._cfg["from_pixels"],
            height=self._cfg["height"],
            width=self._cfg["width"],
@@ -300,7 +301,28 @@ def reset(self) -> Dict[str, np.ndarray]:
             self._env = FrameStackWrapper(self._env, self._cfg['frame_stack'])
 
             # set the obs, action space of wrapped env
-            self._observation_space = self._env.observation_space
+            self._observation_space = gym.spaces.Dict({
+                'observation': self._observation_space_origin,
+                'action_mask': gym.spaces.Box(
+                    low=0,
+                    high=1,
+                    shape=(1,),
+                    dtype=np.int8
+                ),
+                'to_play': gym.spaces.Box(
+                    low=-1,
+                    high=2,
+                    shape=(),
+                    dtype=np.int8
+                ),
+                'timestep': gym.spaces.Box(
+                    low=0,
+                    high=self._cfg.collect_max_episode_steps,
+                    shape=(),
+                    dtype=np.int32
+                ),
+            })
+
             self._action_space = self._env.action_space
 
             if self._replay_path is not None:
@@ -330,13 +352,13 @@ def reset(self) -> Dict[str, np.ndarray]:
             obs = obs['state']
 
         obs = to_ndarray(obs).astype(np.float32)
-        action_mask = None
 
         self._timestep = 0
         if self._save_replay_gif:
             self._frames = []
-
-        obs = {'observation': obs, 'action_mask': action_mask, 'to_play': -1, 'timestep': self._timestep}
+
+        action_mask = -1
+        obs = {'observation': obs, 'action_mask': np.array(action_mask), 'to_play': np.array(-1), 'timestep': np.array(self._timestep)}
 
         return obs
 
@@ -406,8 +428,8 @@ def step(self, action: Union[int, np.ndarray]) -> BaseEnvTimestep:
                 print(f'save episode {self._save_replay_count} in {self._replay_path_gif}!')
                 self._save_replay_count += 1
 
-        action_mask = None
-        obs = {'observation': obs, 'action_mask': action_mask, 'to_play': -1, 'timestep': self._timestep}
+        action_mask = -1
+        obs = {'observation': obs, 'action_mask': np.array(action_mask), 'to_play': np.array(-1), 'timestep': np.array(self._timestep)}
 
         return BaseEnvTimestep(obs, rew, done, info)
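The dmc2gym changes mirror the Atari ones: the observation space becomes a fully typed gym.spaces.Dict (wrapping the original space, now kept in self._observation_space_origin), and action_mask switches from None to np.array(-1), presumably a fixed-size placeholder since DMC control tasks use continuous actions and have no real mask. A rough, illustrative-only sketch of why None is problematic for a shared-memory layout while a 0-d array is not:

import numpy as np

slot = np.empty((), dtype=np.int8)   # a preallocated scalar slot in a shared buffer
slot[...] = np.array(-1)             # a 0-d array has a shape and dtype to copy from
assert int(slot) == -1

try:
    slot[...] = None                 # None has neither, so it cannot be stored
except (TypeError, ValueError):
    print("None cannot be written into a typed buffer slot")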
