
Commit 0c48c08

refactor trex config file
1 parent f099cac commit 0c48c08

17 files changed: +87 −84 lines changed
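
Every config touched here follows the same pattern: the older TREX entry points (serial_pipeline_preference_based_irl, serial_pipeline_reward_model_trex, serial_pipeline_trex) are replaced by serial_pipeline_reward_model_offpolicy called with pretrain_reward=True, cooptrain_reward=False, placeholder expert_model_path values are filled with the expert run's exp_name, and reward_model=dict(type='trex') is added to each create config. Assembled from the hunks below, the launch block at the bottom of each file now reads roughly as follows (a sketch: main_config and create_config are the module-level configs defined earlier in each file, and the block sits under the original if __name__ == '__main__': guard):

import argparse
import torch
from ding.entry import trex_collecting_data
from ding.entry import serial_pipeline_reward_model_offpolicy

parser = argparse.ArgumentParser()
parser.add_argument('--cfg', type=str, default='please enter abs path for this file')
parser.add_argument('--seed', type=int, default=0)
parser.add_argument('--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu')
args = parser.parse_args()
# Collect episodic demonstration data from the expert checkpoints for the TREX reward model.
trex_collecting_data(args)
# Pretrain the reward model once (pretrain_reward=True) and keep it fixed during RL (cooptrain_reward=False).
serial_pipeline_reward_model_offpolicy((main_config, create_config), pretrain_reward=True, cooptrain_reward=False)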

dizoo/atari/config/serial/pong/pong_trex_offppo_config.py

Lines changed: 5 additions & 3 deletions
@@ -13,6 +13,7 @@
     ),
     reward_model=dict(
         type='trex',
+        exp_name='pong_trex_offppo_seed0',
         min_snippet_length=50,
         max_snippet_length=100,
         checkpoint_min=0,
@@ -24,7 +25,7 @@
         # Absolute path is recommended.
         # In DI-engine, it is ``exp_name/ckpt/ckpt_best.pth.tar``.
         # However, here in ``expert_model_path``, it is ``exp_name`` of the expert config.
-        expert_model_path='model_path_placeholder',
+        expert_model_path='pong_ppo_seed0',
         hidden_size_list=[512, 64, 1],
         obs_shape=[4, 84, 84],
         action_shape=6,
@@ -76,6 +77,7 @@
     ),
     env_manager=dict(type='subprocess'),
     policy=dict(type='ppo_offpolicy'),
+    reward_model=dict(type='trex'),
 )
 pong_trex_ppo_create_config = EasyDict(pong_trex_ppo_create_config)
 create_config = pong_trex_ppo_create_config
@@ -87,12 +89,12 @@
     import argparse
     import torch
     from ding.entry import trex_collecting_data
-    from ding.entry import serial_pipeline_preference_based_irl
+    from ding.entry import serial_pipeline_reward_model_offpolicy
     parser = argparse.ArgumentParser()
     parser.add_argument('--cfg', type=str, default='please enter abs path for this file')
     parser.add_argument('--seed', type=int, default=0)
     parser.add_argument('--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu')
     args = parser.parse_args()
     # The function ``trex_collecting_data`` below is to collect episodic data for training the reward model in trex.
     trex_collecting_data(args)
-    serial_pipeline_preference_based_irl((main_config, create_config))
+    serial_pipeline_reward_model_offpolicy((main_config, create_config), pretrain_reward=True, cooptrain_reward=False)

dizoo/atari/config/serial/pong/pong_trex_sql_config.py

Lines changed: 4 additions & 3 deletions
@@ -25,7 +25,7 @@
         # Absolute path is recommended.
         # In DI-engine, it is ``exp_name/ckpt/ckpt_best.pth.tar``.
         # However, here in ``expert_model_path``, it is ``exp_name`` of the expert config.
-        expert_model_path='model_path_placeholder',
+        expert_model_path='pong_sql_seed0',
         hidden_size_list=[512, 64, 1],
         obs_shape=[4, 84, 84],
         action_shape=6,
@@ -62,6 +62,7 @@
     ),
     env_manager=dict(type='subprocess'),
     policy=dict(type='sql'),
+    reward_model=dict(type='trex'),
 )
 pong_trex_sql_create_config = EasyDict(pong_trex_sql_create_config)
 create_config = pong_trex_sql_create_config
@@ -73,12 +74,12 @@
     import argparse
     import torch
     from ding.entry import trex_collecting_data
-    from ding.entry import serial_pipeline_preference_based_irl
+    from ding.entry import serial_pipeline_reward_model_offpolicy
     parser = argparse.ArgumentParser()
     parser.add_argument('--cfg', type=str, default='please enter abs path for this file')
     parser.add_argument('--seed', type=int, default=0)
     parser.add_argument('--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu')
     args = parser.parse_args()
     # The function ``trex_collecting_data`` below is to collect episodic data for training the reward model in trex.
     trex_collecting_data(args)
-    serial_pipeline_preference_based_irl((main_config, create_config))
+    serial_pipeline_reward_model_offpolicy((main_config, create_config), pretrain_reward=True, cooptrain_reward=False)

dizoo/atari/config/serial/qbert/qbert_trex_dqn_config.py

Lines changed: 4 additions & 3 deletions
@@ -21,7 +21,7 @@
         checkpoint_step=100,
         learning_rate=1e-5,
         update_per_collect=1,
-        expert_model_path='abs model path',
+        expert_model_path='qbert_dqn_seed0',
         hidden_size_list=[512, 64, 1],
         obs_shape=[4, 84, 84],
         action_shape=6,
@@ -64,6 +64,7 @@
     ),
     env_manager=dict(type='base'),
     policy=dict(type='dqn'),
+    reward_model=dict(type='trex'),
 )
 qbert_trex_dqn_create_config = EasyDict(qbert_trex_dqn_create_config)
 create_config = qbert_trex_dqn_create_config
@@ -76,7 +77,7 @@
     import argparse
     import torch
     from ding.entry import trex_collecting_data
-    from ding.entry import serial_pipeline_reward_model_trex
+    from ding.entry import serial_pipeline_reward_model_offpolicy
 
     parser = argparse.ArgumentParser()
     parser.add_argument('--cfg', type=str, default='please enter abs path for this file')
@@ -85,4 +86,4 @@
     args = parser.parse_args()
     # The function ``trex_collecting_data`` below is to collect episodic data for training the reward model in trex.
     trex_collecting_data(args)
-    serial_pipeline_reward_model_trex((main_config, create_config))
+    serial_pipeline_reward_model_offpolicy((main_config, create_config), pretrain_reward=True, cooptrain_reward=False)

dizoo/atari/config/serial/qbert/qbert_trex_offppo_config.py

Lines changed: 4 additions & 3 deletions
@@ -21,7 +21,7 @@
         checkpoint_step=100,
         learning_rate=1e-5,
         update_per_collect=1,
-        expert_model_path='abs model path',
+        expert_model_path='qbert_ppo_seed0',
         hidden_size_list=[512, 64, 1],
         obs_shape=[4, 84, 84],
         action_shape=6,
@@ -71,6 +71,7 @@
     ),
     env_manager=dict(type='subprocess'),
     policy=dict(type='ppo_offpolicy'),
+    reward_model=dict(type='trex'),
 )
 create_config = EasyDict(qbert_trex_ppo_create_config)
 
@@ -82,7 +83,7 @@
     import argparse
     import torch
     from ding.entry import trex_collecting_data
-    from ding.entry import serial_pipeline_reward_model_trex
+    from ding.entry import serial_pipeline_reward_model_offpolicy
 
     parser = argparse.ArgumentParser()
     parser.add_argument('--cfg', type=str, default='please enter abs path for this file')
@@ -91,4 +92,4 @@
     args = parser.parse_args()
     # The function ``trex_collecting_data`` below is to collect episodic data for training the reward model in trex.
     trex_collecting_data(args)
-    serial_pipeline_reward_model_trex((main_config, create_config))
+    serial_pipeline_reward_model_offpolicy((main_config, create_config), pretrain_reward=True, cooptrain_reward=False)

dizoo/atari/config/serial/spaceinvaders/spaceinvaders_trex_dqn_config.py

Lines changed: 8 additions & 13 deletions
@@ -15,6 +15,7 @@
     ),
     reward_model=dict(
         type='trex',
+        exp_name='spaceinvaders_trex_dqn_seed0',
         min_snippet_length=50,
         max_snippet_length=100,
         checkpoint_min=10000,
@@ -28,17 +29,10 @@
         # Absolute path is recommended.
         # In DI-engine, it is ``exp_name``.
         # For example, if you want to use dqn to generate demos, you can use ``spaceinvaders_dqn``
-        expert_model_path='model_path_placeholder',
-        # path to save reward model
-        # Users should add their own model path here.
-        # Absolute path is recommended.
-        # For example, if you use ``spaceinvaders_drex``, then the reward model will be saved in this directory.
-        reward_model_path='model_path_placeholder + ./spaceinvaders.params',
-        # path to save generated observations.
-        # Users should add their own model path here.
-        # Absolute path is recommended.
-        # For example, if you use ``spaceinvaders_drex``, then all the generated data will be saved in this directory.
-        offline_data_path='data_path_placeholder',
+        expert_model_path='spaceinvaders_dqn_seed0',
+        hidden_size_list=[512, 64, 1],
+        obs_shape=[4, 84, 84],
+        action_shape=6,
     ),
     policy=dict(
         cuda=True,
@@ -78,6 +72,7 @@
     ),
     env_manager=dict(type='subprocess'),
     policy=dict(type='dqn'),
+    reward_model=dict(type='trex'),
 )
 spaceinvaders_trex_dqn_create_config = EasyDict(spaceinvaders_trex_dqn_create_config)
 create_config = spaceinvaders_trex_dqn_create_config
@@ -89,12 +84,12 @@
     import argparse
     import torch
     from ding.entry import trex_collecting_data
-    from ding.entry import serial_pipeline_trex
+    from ding.entry import serial_pipeline_reward_model_offpolicy
     parser = argparse.ArgumentParser()
     parser.add_argument('--cfg', type=str, default='please enter abs path for this file')
     parser.add_argument('--seed', type=int, default=0)
     parser.add_argument('--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu')
     args = parser.parse_args()
     # The function ``trex_collecting_data`` below is to collect episodic data for training the reward model in trex.
     trex_collecting_data(args)
-    serial_pipeline_trex([main_config, create_config])
+    serial_pipeline_reward_model_offpolicy([main_config, create_config], pretrain_reward=True, cooptrain_reward=False)
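
For reference, the reward_model block of spaceinvaders_trex_dqn_config.py after this change reads roughly as sketched below; keys that the diff does not show are omitted, and the variable name wrapping the dict is illustrative only. The D-REX style reward_model_path and offline_data_path entries are gone, replaced by the reward network shape keys (hidden_size_list, obs_shape, action_shape):

# Sketch of the refactored reward_model block (keys not shown in the diff are omitted).
spaceinvaders_trex_dqn_reward_model = dict(
    type='trex',
    exp_name='spaceinvaders_trex_dqn_seed0',
    min_snippet_length=50,
    max_snippet_length=100,
    checkpoint_min=10000,
    # ``expert_model_path`` is the ``exp_name`` of the expert run that generated the demos.
    expert_model_path='spaceinvaders_dqn_seed0',
    hidden_size_list=[512, 64, 1],
    obs_shape=[4, 84, 84],
    action_shape=6,
)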

dizoo/atari/config/serial/spaceinvaders/spaceinvaders_trex_offppo_config.py

Lines changed: 8 additions & 13 deletions
@@ -15,6 +15,7 @@
     ),
     reward_model=dict(
         type='trex',
+        exp_name='spaceinvaders_trex_offppo_seed0',
         min_snippet_length=30,
         max_snippet_length=100,
         checkpoint_min=0,
@@ -27,17 +28,10 @@
         # Absolute path is recommended.
         # In DI-engine, it is ``exp_name``.
         # For example, if you want to use dqn to generate demos, you can use ``spaceinvaders_dqn``
-        expert_model_path='model_path_placeholder',
-        # path to save reward model
-        # Users should add their own model path here.
-        # Absolute path is recommended.
-        # For example, if you use ``spaceinvaders_drex``, then the reward model will be saved in this directory.
-        reward_model_path='model_path_placeholder + ./spaceinvaders.params',
-        # path to save generated observations.
-        # Users should add their own model path here.
-        # Absolute path is recommended.
-        # For example, if you use ``spaceinvaders_drex``, then all the generated data will be saved in this directory.
-        offline_data_path='data_path_placeholder',
+        expert_model_path='spaceinvaders_ppo_seed0',
+        hidden_size_list=[512, 64, 1],
+        obs_shape=[4, 84, 84],
+        action_shape=6,
     ),
     policy=dict(
         cuda=True,
@@ -85,6 +79,7 @@
     ),
     env_manager=dict(type='subprocess'),
     policy=dict(type='ppo_offpolicy'),
+    reward_model=dict(type='trex'),
 )
 spaceinvaders_trex_ppo_create_config = EasyDict(spaceinvaders_trex_ppo_create_config)
 create_config = spaceinvaders_trex_ppo_create_config
@@ -96,12 +91,12 @@
     import argparse
     import torch
     from ding.entry import trex_collecting_data
-    from ding.entry import serial_pipeline_trex
+    from ding.entry import serial_pipeline_reward_model_offpolicy
     parser = argparse.ArgumentParser()
     parser.add_argument('--cfg', type=str, default='please enter abs path for this file')
     parser.add_argument('--seed', type=int, default=0)
     parser.add_argument('--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu')
     args = parser.parse_args()
     # The function ``trex_collecting_data`` below is to collect episodic data for training the reward model in trex.
     trex_collecting_data(args)
-    serial_pipeline_trex([main_config, create_config])
+    serial_pipeline_reward_model_offpolicy([main_config, create_config], pretrain_reward=True, cooptrain_reward=False)

dizoo/box2d/lunarlander/config/lunarlander_trex_dqn_config.py

Lines changed: 4 additions & 3 deletions
@@ -26,7 +26,7 @@
         # Users should add their own model path here. Model path should lead to a model.
         # Absolute path is recommended.
         # In DI-engine, it is ``exp_name/ckpt/ckpt_best.pth.tar``.
-        expert_model_path='model_path_placeholder',
+        expert_model_path='lunarlander_dqn_seed0',
         hidden_size_list=[512, 64, 1],
         obs_shape=8,
         action_shape=4,
@@ -85,6 +85,7 @@
     ),
     env_manager=dict(type='subprocess'),
     policy=dict(type='dqn'),
+    reward_model=dict(type='trex'),
 )
 lunarlander_trex_dqn_create_config = EasyDict(lunarlander_trex_dqn_create_config)
 create_config = lunarlander_trex_dqn_create_config
@@ -96,12 +97,12 @@
     import argparse
     import torch
     from ding.entry import trex_collecting_data
-    from ding.entry import serial_pipeline_trex
+    from ding.entry import serial_pipeline_reward_model_offpolicy
     parser = argparse.ArgumentParser()
     parser.add_argument('--cfg', type=str, default='please enter abs path for this file')
     parser.add_argument('--seed', type=int, default=0)
     parser.add_argument('--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu')
     args = parser.parse_args()
     # The function ``trex_collecting_data`` below is to collect episodic data for training the reward model in trex.
     trex_collecting_data(args)
-    serial_pipeline_trex([main_config, create_config])
+    serial_pipeline_reward_model_offpolicy([main_config, create_config], pretrain_reward=True, cooptrain_reward=False)

dizoo/box2d/lunarlander/config/lunarlander_trex_offppo_config.py

Lines changed: 3 additions & 3 deletions
@@ -23,7 +23,7 @@
         # Absolute path is recommended.
         # In DI-engine, it is ``exp_name/ckpt/ckpt_best.pth.tar``.
         # However, here in ``expert_model_path``, it is ``exp_name`` of the expert config.
-        expert_model_path='model_path_placeholder',
+        expert_model_path='lunarlander_offppo_seed0',
         hidden_size_list=[512, 64, 1],
         obs_shape=8,
         action_shape=4,
@@ -73,12 +73,12 @@
     import argparse
     import torch
     from ding.entry import trex_collecting_data
-    from ding.entry import serial_pipeline_trex
+    from ding.entry import serial_pipeline_reward_model_offpolicy
     parser = argparse.ArgumentParser()
     parser.add_argument('--cfg', type=str, default='please enter abs path for this file')
     parser.add_argument('--seed', type=int, default=0)
     parser.add_argument('--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu')
     args = parser.parse_args()
     # The function ``trex_collecting_data`` below is to collect episodic data for training the reward model in trex.
     trex_collecting_data(args)
-    serial_pipeline_trex([main_config, create_config])
+    serial_pipeline_reward_model_offpolicy([main_config, create_config], pretrain_reward=True, cooptrain_reward=False)

dizoo/classic_control/cartpole/config/cartpole_trex_dqn_config.py

Lines changed: 6 additions & 2 deletions
@@ -10,6 +10,7 @@
     ),
     reward_model=dict(
         type='trex',
+        exp_name='cartpole_trex_dqn_seed0',
         min_snippet_length=5,
         max_snippet_length=100,
         checkpoint_min=0,
@@ -61,6 +62,7 @@
     ),
     env_manager=dict(type='base'),
     policy=dict(type='dqn'),
+    reward_model=dict(type='trex'),
 )
 cartpole_trex_dqn_create_config = EasyDict(cartpole_trex_dqn_create_config)
 create_config = cartpole_trex_dqn_create_config
@@ -69,15 +71,17 @@
     # Users should first run ``cartpole_dqn_config.py`` to save models (or checkpoints).
     # Note: Users should check that the checkpoints generated should include iteration_'checkpoint_min'.pth.tar, iteration_'checkpoint_max'.pth.tar with the interval checkpoint_step
     # where checkpoint_max, checkpoint_min, checkpoint_step are specified above.
+    # example of running this file:
+    # python cartpole_trex_dqn_config.py --cfg cartpole_trex_dqn_config.py --seed 0 --device cpu
     import argparse
     import torch
     from ding.entry import trex_collecting_data
-    from ding.entry import serial_pipeline_reward_model_trex
+    from ding.entry import serial_pipeline_reward_model_offpolicy
     parser = argparse.ArgumentParser()
     parser.add_argument('--cfg', type=str, default='please enter abs path for this file')
     parser.add_argument('--seed', type=int, default=0)
     parser.add_argument('--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu')
     args = parser.parse_args()
     # The function ``trex_collecting_data`` below is to collect episodic data for training the reward model in trex.
     trex_collecting_data(args)
-    serial_pipeline_reward_model_trex((main_config, create_config))
+    serial_pipeline_reward_model_offpolicy((main_config, create_config),pretrain_reward=True, cooptrain_reward=False)

dizoo/classic_control/cartpole/config/cartpole_trex_offppo_config.py

Lines changed: 5 additions & 3 deletions
@@ -18,7 +18,7 @@
         checkpoint_step=100,
         learning_rate=1e-5,
         update_per_collect=1,
-        expert_model_path='abs model path',
+        expert_model_path='cartpole_ppo_seed0', # expert model experiment directory path
         hidden_size_list=[512, 64, 1],
         obs_shape=4,
         action_shape=2,
@@ -68,15 +68,17 @@
     # Users should first run ``cartpole_offppo_config.py`` to save models (or checkpoints).
     # Note: Users should check that the checkpoints generated should include iteration_'checkpoint_min'.pth.tar, iteration_'checkpoint_max'.pth.tar with the interval checkpoint_step
     # where checkpoint_max, checkpoint_min, checkpoint_step are specified above.
+    # example:
+    # python cartpole_trex_offppo_config.py --cfg cartpole_trex_offppo_config.py --seed 0 --device cpu
     import argparse
     import torch
     from ding.entry import trex_collecting_data
-    from ding.entry import serial_pipeline_reward_model_trex
+    from ding.entry import serial_pipeline_reward_model_offpolicy
     parser = argparse.ArgumentParser()
     parser.add_argument('--cfg', type=str, default='please enter abs path for this file')
     parser.add_argument('--seed', type=int, default=0)
     parser.add_argument('--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu')
     args = parser.parse_args()
     # The function ``trex_collecting_data`` below is to collect episodic data for training the reward model in trex.
     trex_collecting_data(args)
-    serial_pipeline_reward_model_trex((main_config, create_config))
+    serial_pipeline_reward_model_offpolicy((main_config, create_config), pretrain_reward=True, cooptrain_reward=False)
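
The comments in the two cartpole configs above require that the expert run already contains checkpoints iteration_'checkpoint_min'.pth.tar through iteration_'checkpoint_max'.pth.tar spaced by checkpoint_step. A small hypothetical helper (not part of this commit; it assumes the checkpoints sit under the exp_name/ckpt/ directory mentioned in the config comments) can verify that before launching:

import os

def missing_trex_checkpoints(expert_exp_name, checkpoint_min, checkpoint_max, checkpoint_step):
    # Return the iteration checkpoints TREX expects but cannot find under <expert_exp_name>/ckpt/.
    ckpt_dir = os.path.join(expert_exp_name, 'ckpt')
    required = [
        'iteration_{}.pth.tar'.format(k)
        for k in range(checkpoint_min, checkpoint_max + 1, checkpoint_step)
    ]
    return [name for name in required if not os.path.exists(os.path.join(ckpt_dir, name))]

# Usage (values are illustrative, not taken from the configs above):
# print(missing_trex_checkpoints('cartpole_dqn_seed0', 0, 1000, 100))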
