Skip to content

Commit 28586eb

Browse files
committed
apply suggestions and remove to(dtype) in fsdp_workers
1 parent ee72f88 commit 28586eb

File tree

2 files changed

+3
-9
lines changed

2 files changed

+3
-9
lines changed

trinity/trainer/verl/fsdp_workers.py

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -326,9 +326,6 @@ def _build_model_optimizer( # noqa: C901
326326
fused_kernels_backend=fused_kernels_backend,
327327
)
328328

329-
# some parameters may not in torch_dtype. TODO(zhangchi.usc1992) remove this after we switch to fsdp2
330-
actor_module.to(torch_dtype)
331-
332329
if enable_gradient_checkpointing:
333330
actor_module.gradient_checkpointing_enable(
334331
gradient_checkpointing_kwargs={"use_reentrant": False}
@@ -1060,9 +1057,6 @@ def _build_critic_model_optimizer(self, config): # noqa: C901
10601057
ulysses_sp_size=self.ulysses_sequence_parallel_size,
10611058
)
10621059

1063-
# some parameters may not in torch_dtype
1064-
critic_module.to(torch_dtype)
1065-
10661060
if config.model.get("enable_gradient_checkpointing", False):
10671061
critic_module.gradient_checkpointing_enable(
10681062
gradient_checkpointing_kwargs={"use_reentrant": False}

trinity/trainer/verl/megatron_checkpoint_manager.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -273,11 +273,11 @@ def save_checkpoint( # noqa: C901
273273
logger=logger,
274274
log_only_rank_0=True,
275275
)
276-
except Exception as e:
276+
except Exception:
277277
logger.error(
278-
f"Failed to save Huggingface model to {local_path}, you can try to set `use_mbridge=true` to save it."
278+
f"Failed to save Huggingface model to {local_path}, you can try to set `use_mbridge=true` to save it.",
279+
exc_info=True,
279280
)
280-
logger.error(e)
281281

282282
ray.get(
283283
self.checkpoint_monitor.register_thread_count.remote(

0 commit comments

Comments (0)