Skip to content

Commit 28586eb

Browse files
committed
apply suggestions and remove to(dtype) in fsdp_workers
1 parent ee72f88 commit 28586eb

File tree

2 files changed

+3
-9
lines changed

2 files changed

+3
-9
lines changed

trinity/trainer/verl/fsdp_workers.py

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -326,9 +326,6 @@ def _build_model_optimizer( # noqa: C901
326326
fused_kernels_backend=fused_kernels_backend,
327327
)
328328

329-
# some parameters may not in torch_dtype. TODO(zhangchi.usc1992) remove this after we switch to fsdp2
330-
actor_module.to(torch_dtype)
331-
332329
if enable_gradient_checkpointing:
333330
actor_module.gradient_checkpointing_enable(
334331
gradient_checkpointing_kwargs={"use_reentrant": False}
@@ -1060,9 +1057,6 @@ def _build_critic_model_optimizer(self, config): # noqa: C901
10601057
ulysses_sp_size=self.ulysses_sequence_parallel_size,
10611058
)
10621059

1063-
# some parameters may not in torch_dtype
1064-
critic_module.to(torch_dtype)
1065-
10661060
if config.model.get("enable_gradient_checkpointing", False):
10671061
critic_module.gradient_checkpointing_enable(
10681062
gradient_checkpointing_kwargs={"use_reentrant": False}

trinity/trainer/verl/megatron_checkpoint_manager.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -273,11 +273,11 @@ def save_checkpoint( # noqa: C901
273273
logger=logger,
274274
log_only_rank_0=True,
275275
)
276-
except Exception as e:
276+
except Exception:
277277
logger.error(
278-
f"Failed to save Huggingface model to {local_path}, you can try to set `use_mbridge=true` to save it."
278+
f"Failed to save Huggingface model to {local_path}, you can try to set `use_mbridge=true` to save it.",
279+
exc_info=True,
279280
)
280-
logger.error(e)
281281

282282
ray.get(
283283
self.checkpoint_monitor.register_thread_count.remote(

0 commit comments

Comments (0)