Add config_validator.py and refactor config
#1392
Triggered via issue
January 22, 2026 11:29
Status
Failure
Total duration
1h 33m 11s
Artifacts
1
Annotations
3 errors
|
unittest
Process completed with exit code 1.
|
|
Failed Test: tests/trainer/trainer_test.py::TestFullyAsyncMode_2_megatron::test_fully_async_mode
tests/trainer/trainer_test.py::TestFullyAsyncMode_2_megatron::test_fully_async_mode: The test failed in the call phase - self = <tests.trainer.trainer_test.TestFullyAsyncMode_2_megatron testMethod=test_fully_async_mode>
def test_fully_async_mode(self):
config = get_template_config()
config.project = "unittest"
config.name = f"fully_async_{datetime.now().strftime('%Y%m%d%H%M%S')}"
config.checkpoint_root_dir = get_checkpoint_path()
config.buffer.total_epochs = 1
config.buffer.batch_size = 4
config.cluster.gpu_per_node = 2
config.cluster.node_num = 1
config.model.model_path = get_model_path()
config.buffer.explorer_input.taskset = get_unittest_dataset_config("countdown")
config.buffer.trainer_input.experience_buffer = ExperienceBufferConfig(
name="exp_buffer",
storage_type=StorageType.QUEUE.value,
)
config.buffer.trainer_input.experience_buffer.replay_buffer.enable = self.use_priority_queue
config.synchronizer.sync_method = SyncMethod.CHECKPOINT
config.synchronizer.sync_style = SyncStyle.DYNAMIC_BY_EXPLORER
config.synchronizer.sync_interval = 8
config.monitor.monitor_type = "tensorboard"
trainer_config = deepcopy(config)
trainer_config.mode = "train"
trainer_config.buffer.train_batch_size = 4
if self.strategy == "megatron":
trainer_config.trainer.trainer_strategy = "megatron"
trainer_config.check_and_update()
if self.strategy == "megatron":
_trainer_config = trainer_config.trainer.trainer_config
_trainer_config.actor_rollout_ref.actor.megatron.tensor_model_parallel_size = 2
_trainer_config.actor_rollout_ref.ref.megatron.tensor_model_parallel_size = 2
_trainer_config.critic.strategy = "megatron"
_trainer_config.critic.megatron.tensor_model_parallel_size = 2
explorer1_config = deepcopy(config)
explorer1_config.trainer = deepcopy(trainer_config.trainer)
explorer1_config.mode = "explore"
explorer1_config.explorer.name = "explorer1"
config.cluster.gpu_per_node = 1
config.cluster.node_num = 1
explorer1_config.explorer.rollout_model.engine_num = 1
explorer1_config.explorer.rollout_model.tensor_parallel_size = 1
explorer1_config.buffer.trainer_input.experience_buffer = ExperienceBufferConfig(
name="exp_buffer",
storage_type=StorageType.QUEUE.value,
)
explorer2_config = deepcopy(explorer1_config)
explorer2_config.trainer = deepcopy(trainer_config.trainer)
> explorer1_config.check_and_update()
tests/trainer/trainer_test.py:625:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
trinity/common/config.py:892: in check_and_update
validator.validate(self)
trinity/common/config_validator.py:1084: in validate
config.trainer.trainer_config.synchronize_config(config)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = veRLConfig(data=Data(train_batch_size=4, trust_remote_code=False), actor_rollout_ref=ActorRolloutRef(hybrid_engine=Tru...timeout=1200, wait_for_checkpoint=False, explorer_world_size=0, ray_namespace='trinity_unittest'), enable_preview=True)
config = Config(mode='explore', project='unittest', group='', name='fully_async_20260122124607', checkpoint_root_dir='/mnt/chec...fig(level='INFO', group_by_node=False, save_dir='/mnt/checkpoints/unittest/fully_async_20260122124607/log'), stages=[])
def synchronize_config(self, config: Config) -> None: # noqa: C901
"""Synchronize config."""
self.trainer.nnodes = config.cluster.trainer_node_num
self.trainer.n_gpus_per_node = config.cluster.trainer_gpu_num_per_node
world_size = config.cluster.trainer_gpu_num
> if config.buffer.train_batch_size % world_size != 0:
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
E ZeroDivisionError: integer modulo by zero
|
|
unittest
Process completed with exit code 1.
|
Artifacts
Produced during runtime
| Name | Size | Digest | |
|---|---|---|---|
| pytest-results | 11.1 KB | sha256:c910ab60802effbcb6e650a6545b4d1ef7026a82768ade32069ad8a9f4236b5b | |