Skip to content

Add config_validator.py and refactor config #1392

Add config_validator.py and refactor config

Add config_validator.py and refactor config #1392

Triggered via issue January 22, 2026 11:29
Status Failure
Total duration 1h 33m 11s
Artifacts 1

unittest.yaml

on: issue_comment
Fit to window
Zoom out
Zoom in

Annotations

3 errors
unittest
Process completed with exit code 1.
Failed Test: tests/trainer/trainer_test.py::TestFullyAsyncMode_2_megatron::test_fully_async_mode
tests/trainer/trainer_test.py::TestFullyAsyncMode_2_megatron::test_fully_async_mode: The test failed in the call phase.

self = <tests.trainer.trainer_test.TestFullyAsyncMode_2_megatron testMethod=test_fully_async_mode>

    def test_fully_async_mode(self):
        config = get_template_config()
        config.project = "unittest"
        config.name = f"fully_async_{datetime.now().strftime('%Y%m%d%H%M%S')}"
        config.checkpoint_root_dir = get_checkpoint_path()
        config.buffer.total_epochs = 1
        config.buffer.batch_size = 4
        config.cluster.gpu_per_node = 2
        config.cluster.node_num = 1
        config.model.model_path = get_model_path()
        config.buffer.explorer_input.taskset = get_unittest_dataset_config("countdown")
        config.buffer.trainer_input.experience_buffer = ExperienceBufferConfig(
            name="exp_buffer",
            storage_type=StorageType.QUEUE.value,
        )
        config.buffer.trainer_input.experience_buffer.replay_buffer.enable = self.use_priority_queue
        config.synchronizer.sync_method = SyncMethod.CHECKPOINT
        config.synchronizer.sync_style = SyncStyle.DYNAMIC_BY_EXPLORER
        config.synchronizer.sync_interval = 8
        config.monitor.monitor_type = "tensorboard"
        trainer_config = deepcopy(config)
        trainer_config.mode = "train"
        trainer_config.buffer.train_batch_size = 4
        if self.strategy == "megatron":
            trainer_config.trainer.trainer_strategy = "megatron"
        trainer_config.check_and_update()
        if self.strategy == "megatron":
            _trainer_config = trainer_config.trainer.trainer_config
            _trainer_config.actor_rollout_ref.actor.megatron.tensor_model_parallel_size = 2
            _trainer_config.actor_rollout_ref.ref.megatron.tensor_model_parallel_size = 2
            _trainer_config.critic.strategy = "megatron"
            _trainer_config.critic.megatron.tensor_model_parallel_size = 2
        explorer1_config = deepcopy(config)
        explorer1_config.trainer = deepcopy(trainer_config.trainer)
        explorer1_config.mode = "explore"
        explorer1_config.explorer.name = "explorer1"
        config.cluster.gpu_per_node = 1
        config.cluster.node_num = 1
        explorer1_config.explorer.rollout_model.engine_num = 1
        explorer1_config.explorer.rollout_model.tensor_parallel_size = 1
        explorer1_config.buffer.trainer_input.experience_buffer = ExperienceBufferConfig(
            name="exp_buffer",
            storage_type=StorageType.QUEUE.value,
        )
        explorer2_config = deepcopy(explorer1_config)
        explorer2_config.trainer = deepcopy(trainer_config.trainer)
>       explorer1_config.check_and_update()

tests/trainer/trainer_test.py:625:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
trinity/common/config.py:892: in check_and_update
    validator.validate(self)
trinity/common/config_validator.py:1084: in validate
    config.trainer.trainer_config.synchronize_config(config)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

self = veRLConfig(data=Data(train_batch_size=4, trust_remote_code=False), actor_rollout_ref=ActorRolloutRef(hybrid_engine=Tru...timeout=1200, wait_for_checkpoint=False, explorer_world_size=0, ray_namespace='trinity_unittest'), enable_preview=True)
config = Config(mode='explore', project='unittest', group='', name='fully_async_20260122124607', checkpoint_root_dir='/mnt/chec...fig(level='INFO', group_by_node=False, save_dir='/mnt/checkpoints/unittest/fully_async_20260122124607/log'), stages=[])

    def synchronize_config(self, config: Config) -> None:  # noqa: C901
        """Synchronize config."""
        self.trainer.nnodes = config.cluster.trainer_node_num
        self.trainer.n_gpus_per_node = config.cluster.trainer_gpu_num_per_node
        world_size = config.cluster.trainer_gpu_num
>       if config.buffer.train_batch_size % world_size != 0:
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
E       ZeroDivisionError: integer modulo by zero
unittest
Process completed with exit code 1.

Artifacts

Produced during runtime
Name Size Digest
pytest-results
11.1 KB
sha256:c910ab60802effbcb6e650a6545b4d1ef7026a82768ade32069ad8a9f4236b5b