-
Notifications
You must be signed in to change notification settings - Fork 749
Description
Dear authors,
When I did not set `trust_remote_code=True`, the error message told me to set it. But after I set it (by passing `data.trust_remote_code=True`), I still get the following error:
Error executing job with overrides: ['algorithm.adv_estimator=grpo', 'data.trust_remote_code=True', 'data.train_files=/data1/yyy25/datasets/geo3k/train.parquet', 'data.val_files=/data1/yyy25/datasets/geo3k/test.parquet', 'data.train_batch_size=128', 'data.max_prompt_length=1024', 'data.max_response_length=1536', 'data.filter_overlong_prompts=True', 'data.truncation=error', 'data.image_key=images', 'actor_rollout_ref.model.path=/data1/yyy25/datasets/InternVL3-1B', 'actor_rollout_ref.actor.optim.lr=1e-6', 'actor_rollout_ref.model.use_remove_padding=True', 'actor_rollout_ref.actor.ppo_mini_batch_size=64', 'actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=8', 'actor_rollout_ref.actor.use_kl_loss=True', 'actor_rollout_ref.actor.kl_loss_coef=0.01', 'actor_rollout_ref.actor.kl_loss_type=low_var_kl', 'actor_rollout_ref.actor.entropy_coeff=0', 'actor_rollout_ref.model.enable_gradient_checkpointing=True', 'actor_rollout_ref.actor.fsdp_config.param_offload=False', 'actor_rollout_ref.actor.fsdp_config.optimizer_offload=False', 'actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16', 'actor_rollout_ref.rollout.tensor_model_parallel_size=2', 'actor_rollout_ref.rollout.name=vllm', 'actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True', 'actor_rollout_ref.rollout.gpu_memory_utilization=0.6', 'actor_rollout_ref.rollout.enable_chunked_prefill=False', 'actor_rollout_ref.rollout.enforce_eager=False', 'actor_rollout_ref.rollout.free_cache_engine=True', 'actor_rollout_ref.rollout.n=4', 'actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=20', 'actor_rollout_ref.ref.fsdp_config.param_offload=True', 'algorithm.use_kl_in_reward=False', 'trainer.critic_warmup=0', 'trainer.logger=[console,wandb]', 'trainer.project_name=verl_grpo_example_geo3k', 'trainer.experiment_name=qwen2_5_vl_7b_function_rm', 'trainer.n_gpus_per_node=2', 'trainer.nnodes=1', 'trainer.save_freq=20', 'trainer.test_freq=5', 'trainer.total_epochs=15']
Traceback (most recent call last):
File "/home/yyy25/miniconda3/envs/azr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/home/yyy25/miniconda3/envs/azr/lib/python3.10/runpy.py", line 86, in _run_code
exec(code, run_globals)
File "/data1/yyy25/verl-internvl/verl/trainer/main_ppo.py", line 289, in
main()
File "/home/yyy25/miniconda3/envs/azr/lib/python3.10/site-packages/hydra/main.py", line 94, in decorated_main
_run_hydra(
File "/home/yyy25/miniconda3/envs/azr/lib/python3.10/site-packages/hydra/_internal/utils.py", line 394, in _run_hydra
_run_app(
File "/home/yyy25/miniconda3/envs/azr/lib/python3.10/site-packages/hydra/_internal/utils.py", line 457, in _run_app
run_and_report(
File "/home/yyy25/miniconda3/envs/azr/lib/python3.10/site-packages/hydra/_internal/utils.py", line 223, in run_and_report
raise ex
File "/home/yyy25/miniconda3/envs/azr/lib/python3.10/site-packages/hydra/_internal/utils.py", line 220, in run_and_report
return func()
File "/home/yyy25/miniconda3/envs/azr/lib/python3.10/site-packages/hydra/_internal/utils.py", line 458, in
lambda: hydra.run(
File "/home/yyy25/miniconda3/envs/azr/lib/python3.10/site-packages/hydra/_internal/hydra.py", line 132, in run
_ = ret.return_value
File "/home/yyy25/miniconda3/envs/azr/lib/python3.10/site-packages/hydra/core/utils.py", line 260, in return_value
raise self._return_value
File "/home/yyy25/miniconda3/envs/azr/lib/python3.10/site-packages/hydra/core/utils.py", line 186, in run_job
ret.return_value = task_function(task_cfg)
File "/data1/yyy25/verl-internvl/verl/trainer/main_ppo.py", line 31, in main
run_ppo(config)
File "/data1/yyy25/verl-internvl/verl/trainer/main_ppo.py", line 64, in run_ppo
ray.get(runner.run.remote(config))
File "/home/yyy25/miniconda3/envs/azr/lib/python3.10/site-packages/ray/_private/auto_init_hook.py", line 21, in auto_init_wrapper
return fn(*args, **kwargs)
File "/home/yyy25/miniconda3/envs/azr/lib/python3.10/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
return func(*args, **kwargs)
File "/home/yyy25/miniconda3/envs/azr/lib/python3.10/site-packages/ray/_private/worker.py", line 2822, in get
values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
File "/home/yyy25/miniconda3/envs/azr/lib/python3.10/site-packages/ray/_private/worker.py", line 930, in get_objects
raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(ValueError): ray::TaskRunner.run() (pid=3845197, ip=10.103.13.91, actor_id=214531f2ad269345a3be17bf01000000, repr=<main_ppo.TaskRunner object at 0x7fbc3e6676a0>)
File "/data1/yyy25/verl-internvl/verl/trainer/main_ppo.py", line 212, in run
trainer.init_workers()
File "/data1/yyy25/verl-internvl/verl/trainer/ppo/ray_trainer.py", line 883, in init_workers
self.ref_policy_wg.init_model()
File "/data1/yyy25/verl-internvl/verl/single_controller/ray/base.py", line 51, in call
output = ray.get(output)
ray.exceptions.RayTaskError(ValueError): ray::WorkerDict.ref_init_model() (pid=3852278, ip=10.103.13.91, actor_id=33d0ff4be57f360c22d46b9f01000000, repr=<verl.single_controller.ray.base.WorkerDict object at 0x7fb61a4245b0>)
File "/data1/yyy25/verl-internvl/verl/single_controller/ray/base.py", line 710, in func
return getattr(self.worker_dict[key], name)(*args, **kwargs)
File "/data1/yyy25/verl-internvl/verl/single_controller/base/decorator.py", line 549, in inner
return func(*args, **kwargs)
File "/data1/yyy25/verl-internvl/verl/workers/fsdp_workers.py", line 654, in init_model
self.ref_module_fsdp = self._build_model_optimizer(
File "/data1/yyy25/verl-internvl/verl/workers/fsdp_workers.py", line 237, in _build_model_optimizer
self.tokenizer = hf_tokenizer(local_path, trust_remote_code=trust_remote_code)
File "/data1/yyy25/verl-internvl/verl/utils/tokenizer.py", line 64, in hf_tokenizer
config = AutoConfig.from_pretrained(name_or_path, trust_remote_code=trust_remote_code)
File "/home/yyy25/miniconda3/envs/azr/lib/python3.10/site-packages/transformers/models/auto/configuration_auto.py", line 1156, in from_pretrained
trust_remote_code = resolve_trust_remote_code(
File "/home/yyy25/miniconda3/envs/azr/lib/python3.10/site-packages/transformers/dynamic_module_utils.py", line 731, in resolve_trust_remote_code
raise ValueError(
ValueError: The repository /data1/yyy25/datasets/InternVL3-1B contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co//data1/yyy25/datasets/InternVL3-1B.
Please pass the argument trust_remote_code=True to allow custom code to be run.
(TaskRunner pid=3845197) Unhandled error (suppress with 'RAY_IGNORE_UNHANDLED_ERRORS=1'): ray::WorkerDict.ref_init_model() (pid=3851024, ip=10.103.13.91, actor_id=e26ca310b1e1ea5900c441f501000000, repr=<verl.single_controller.ray.base.WorkerDict object at 0x7f736b4b46d0>)
(TaskRunner pid=3845197) File "/data1/yyy25/verl-internvl/verl/single_controller/ray/base.py", line 710, in func
(TaskRunner pid=3845197) return getattr(self.worker_dict[key], name)(*args, **kwargs)
(TaskRunner pid=3845197) File "/data1/yyy25/verl-internvl/verl/single_controller/base/decorator.py", line 549, in inner
(TaskRunner pid=3845197) return func(*args, **kwargs)
(TaskRunner pid=3845197) File "/data1/yyy25/verl-internvl/verl/workers/fsdp_workers.py", line 654, in init_model
(TaskRunner pid=3845197) self.ref_module_fsdp = self._build_model_optimizer(
(TaskRunner pid=3845197) File "/data1/yyy25/verl-internvl/verl/workers/fsdp_workers.py", line 237, in _build_model_optimizer
(TaskRunner pid=3845197) self.tokenizer = hf_tokenizer(local_path, trust_remote_code=trust_remote_code)
(TaskRunner pid=3845197) File "/data1/yyy25/verl-internvl/verl/utils/tokenizer.py", line 64, in hf_tokenizer
(TaskRunner pid=3845197) config = AutoConfig.from_pretrained(name_or_path, trust_remote_code=trust_remote_code)
(TaskRunner pid=3845197) File "/home/yyy25/miniconda3/envs/azr/lib/python3.10/site-packages/transformers/models/auto/configuration_auto.py", line 1156, in from_pretrained
(TaskRunner pid=3845197) trust_remote_code = resolve_trust_remote_code(
(TaskRunner pid=3845197) File "/home/yyy25/miniconda3/envs/azr/lib/python3.10/site-packages/transformers/dynamic_module_utils.py", line 731, in resolve_trust_remote_code
(TaskRunner pid=3845197) raise ValueError(
(TaskRunner pid=3845197) ValueError: The repository /data1/yyy25/datasets/InternVL3-1B contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co//data1/yyy25/datasets/InternVL3-1B.
(TaskRunner pid=3845197) Please pass the argument trust_remote_code=True to allow custom code to be run.
Do you have any ideas on how to solve this problem? Thank you.