@@ -14,8 +14,8 @@
     get_template_config,
     get_unittest_dataset_config,
 )
-from trinity.cli.launcher import bench, both
-from trinity.common.constants import MonitorType, SyncMethod
+from trinity.cli.launcher import bench, both, train
+from trinity.common.constants import AlgorithmType, MonitorType, SyncMethod


 class BaseTrainerCase(RayUnittestBase):
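The import changes above track the new tests below: train joins bench and both as launcher entry points, and AlgorithmType is pulled in to switch between GRPO and DPO. A minimal sketch (not part of this patch) of how an entry point could be picked from the config; the dispatch helper run_for_test is purely hypothetical:

from trinity.cli.launcher import both, train

def run_for_test(config):
    # The GSM8K tests below drive explorer + trainer together via both();
    # the DPO test sets config.mode = "train" and calls train() directly.
    if getattr(config, "mode", "both") == "train":
        train(config)
    else:
        both(config)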
@@ -109,3 +109,106 @@ def test_trainer(self):
     def tearDown(self):
         # remove dir only when the test passed
         shutil.rmtree(self.config.checkpoint_job_dir)
+
+
+class TestTrainerGSM8K(BaseTrainerCase):
+    def test_trainer(self):
+        """Test GSM8K."""
+        # test both mode
+        self.config.algorithm.algorithm_type = AlgorithmType.GRPO
+        self.config.algorithm.repeat_times = 8
+        self.config.algorithm.advantage_fn_type = "grpo_adv_fn"
+        self.config.algorithm.advantage_fn_args = {}
+        # self.config.buffer.batch_size = 96  # TODO: used for real testing
+        self.config.buffer.explorer_input.taskset = get_unittest_dataset_config("gsm8k")
+        self.config.check_and_update()
+        self.config.trainer.trainer_config.trainer.total_training_steps = 4
+        self.config.trainer.trainer_config.trainer.max_actor_ckpt_to_keep = 2
+        self.config.trainer.trainer_config.actor_rollout_ref.actor.optim.lr = 1e-5
+        both(self.config)
+        parser = TensorBoardParser(os.path.join(self.config.monitor.cache_dir, "tensorboard"))
+        rollout_metrics = parser.metric_list("rollout")
+        self.assertTrue(len(rollout_metrics) > 0)
+        self.assertEqual(parser.metric_max_step(rollout_metrics[0]), 4)
+        actor_metrics = parser.metric_list("actor")
+        self.assertTrue(len(actor_metrics) > 0)
+        self.assertEqual(parser.metric_max_step(actor_metrics[0]), 4)
+        response_metrics = parser.metric_list("response_length")
+        self.assertTrue(len(response_metrics) > 0)
+        self.assertEqual(parser.metric_max_step(response_metrics[0]), 4)
+        # TODO: used for real testing
+        # rewards = parser.metric_values("critic/rewards/mean")
+        # self.assertTrue(0.4 < rewards[0] < 0.55)
+        # self.assertTrue(0.4 < rewards[1] < 0.55)
+        # self.assertTrue(0.6 < rewards[2] < 0.7)
+        # self.assertTrue(0.6 < rewards[3] < 0.7)
+        ray.shutdown(_exiting_interpreter=True)
+        # check checkpoint
+
+    def tearDown(self):
+        # remove dir only when the test passed
+        shutil.rmtree(self.config.checkpoint_job_dir)
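The assertions above go through the metric_list, metric_max_step and metric_values helpers of TensorBoardParser from the project's test utilities. A minimal sketch of what such a parser could look like, assuming it wraps tensorboard's EventAccumulator and matches tags by prefix (both assumptions; the real helper may differ):

from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

class TensorBoardParserSketch:
    def __init__(self, logdir: str):
        # Load all scalar events written under the tensorboard cache directory.
        self._acc = EventAccumulator(logdir)
        self._acc.Reload()

    def metric_list(self, prefix: str) -> list:
        # Scalar tags whose name starts with the given prefix, e.g. "actor" or "rollout".
        return [tag for tag in self._acc.Tags()["scalars"] if tag.startswith(prefix)]

    def metric_max_step(self, tag: str) -> int:
        # Largest global step logged for a tag; the tests compare it to total_training_steps.
        return max(event.step for event in self._acc.Scalars(tag))

    def metric_values(self, tag: str) -> list:
        # Scalar values in logged order, used by the commented-out reward-range checks.
        return [event.value for event in self._acc.Scalars(tag)]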
+
+
+class TestTrainerGSM8KWithSFT(BaseTrainerCase):
+    def test_trainer(self):
+        """Test GSM8K With SFT."""
+        # test both mode
+        self.config.algorithm.algorithm_type = AlgorithmType.GRPO
+        self.config.algorithm.repeat_times = 8
+        self.config.algorithm.advantage_fn_type = "grpo_adv_fn"
+        self.config.algorithm.advantage_fn_args = {}
+        self.config.buffer.explorer_input.taskset = get_unittest_dataset_config("gsm8k")
+        self.config.buffer.trainer_input.sft_warmup_steps = 2
+        self.config.buffer.trainer_input.sft_warmup_dataset = get_unittest_dataset_config(
+            "sft_for_gsm8k"
+        )
+        self.config.check_and_update()
+        self.config.trainer.trainer_config.trainer.total_training_steps = 4
+        self.config.trainer.trainer_config.trainer.max_actor_ckpt_to_keep = 2
+        self.config.trainer.trainer_config.actor_rollout_ref.actor.optim.lr = 1e-5
+        both(self.config)
+        parser = TensorBoardParser(os.path.join(self.config.monitor.cache_dir, "tensorboard"))
+        rollout_metrics = parser.metric_list("rollout")
+        self.assertTrue(len(rollout_metrics) > 0)
+        self.assertEqual(parser.metric_max_step(rollout_metrics[0]), 2)
+        actor_metrics = parser.metric_list("actor")
+        self.assertTrue(len(actor_metrics) > 0)
+        self.assertEqual(parser.metric_max_step(actor_metrics[0]), 2)  # SFT
+        self.assertEqual(parser.metric_max_step(actor_metrics[-1]), 4)  # RFT
+        response_metrics = parser.metric_list("response_length")
+        self.assertTrue(len(response_metrics) > 0)
+        self.assertEqual(parser.metric_max_step(response_metrics[0]), 4)
+        ray.shutdown(_exiting_interpreter=True)
+        # check checkpoint
+
+    def tearDown(self):
+        # remove dir only when the test passed
+        shutil.rmtree(self.config.checkpoint_job_dir)
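The split expectation on the actor metrics above follows from the warmup arithmetic: with sft_warmup_steps = 2 out of total_training_steps = 4, the first two trainer steps consume the SFT dataset and the remaining two train on rollout data, so an SFT actor metric should stop at step 2 while an RFT actor metric reaches step 4 (the [0]/[-1] indexing assumes metric_list returns the SFT tag before the RFT tag). The same arithmetic would also account for the rollout metric topping out at step 2, if the explorer only runs during the non-warmup steps. In plain numbers:

total_training_steps = 4
sft_warmup_steps = 2
rft_steps = total_training_steps - sft_warmup_steps
assert sft_warmup_steps == 2       # expected max step of the SFT actor metric
assert total_training_steps == 4   # expected max step of the RFT actor metric
assert rft_steps == 2              # would match the rollout metric's max step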
+
+
+class TestTrainerDPO(BaseTrainerCase):
+    def test_trainer(self):
+        """Test DPO."""
+        # test train mode
+        self.config.mode = "train"
+        self.config.algorithm.algorithm_type = AlgorithmType.DPO
+        self.config.algorithm.policy_loss_fn = "dpo"
+        self.config.algorithm.policy_loss_fn_args = {}
+        # self.config.buffer.batch_size = 32
+        self.config.buffer.trainer_input.experience_buffer = get_unittest_dataset_config("dpo")
+        self.config.check_and_update()
+        self.config.trainer.trainer_config.trainer.total_training_steps = 4
+        self.config.trainer.trainer_config.trainer.max_actor_ckpt_to_keep = 2
+        self.config.trainer.trainer_config.actor_rollout_ref.actor.optim.lr = 5e-7
+        train(self.config)
+        parser = TensorBoardParser(os.path.join(self.config.monitor.cache_dir, "tensorboard"))
+        actor_metrics = parser.metric_list("actor")
+        self.assertTrue(len(actor_metrics) > 0)
+        self.assertEqual(parser.metric_max_step(actor_metrics[0]), 4)
+        ray.shutdown(_exiting_interpreter=True)
+        # check checkpoint
+
+    def tearDown(self):
+        # remove dir only when the test passed
+        shutil.rmtree(self.config.checkpoint_job_dir)
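All three tests leave the "# check checkpoint" step as a TODO. A hypothetical sketch of such a check, assuming only that checkpoints land somewhere under config.checkpoint_job_dir (the directory the tearDown methods remove); the actual layout is not specified by this diff:

import os

def assert_checkpoint_written(checkpoint_job_dir: str) -> None:
    # The job directory should exist and be non-empty after training; stricter checks
    # (per-step subdirectories, max_actor_ckpt_to_keep pruning) would need knowledge
    # of the real checkpoint layout.
    assert os.path.isdir(checkpoint_job_dir), f"missing checkpoint dir: {checkpoint_job_dir}"
    assert os.listdir(checkpoint_job_dir), f"no checkpoints written under: {checkpoint_job_dir}"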