Commit 8683de8

[Fix] Make BaseModel._resolve_compile_cfg work. (#1481)
* [Fix] BaseModel._resolve_compile_cfg is not correct when FSDPConfig.torch_compile is the default value (False).
* [CI] Fix CI compile option usage (except test_resolve_compile).
* [Fix] Add warning for the deprecation of FSDPConfig.torch_compile.
1 parent bc35383 commit 8683de8

File tree

10 files changed (+18 -24 lines)


tests/engine/test_dense_train_engine.py

Lines changed: 0 additions & 2 deletions
@@ -40,7 +40,6 @@ def test_dense_engine_train(self, device, tp_size, sp_size):
         optim_cfg: AdamWConfig = AdamWConfig()
         lr_cfg: LRConfig = LRConfig()
         fsdp_cfg: FSDPConfig = FSDPConfig(
-            torch_compile=True,
             cpu_offload=False,
             tp_size=tp_size,
             # hsdp_sharding_size=hsdp_sharding_size,
@@ -125,7 +124,6 @@ def test_save_and_load(self, device, tp_size, hsdp_sharding_size):
         moe_cfg = Qwen3Dense8BConfig()
         optim_cfg: AdamWConfig = AdamWConfig()
         fsdp_cfg: FSDPConfig = FSDPConfig(
-            torch_compile=True,
             cpu_offload=False,
             tp_size=tp_size,
             hsdp_sharding_size=hsdp_sharding_size,

tests/engine/test_moe_train_engine.py

Lines changed: 4 additions & 4 deletions
@@ -48,11 +48,11 @@ def test_moe_engine_train(self, device, ep_size, sp_size):
             ep_size=ep_size,
             balancing_loss_cfg=BalancingLossConfig(),
             z_loss_cfg=ZLossConfig(),
+            compile_cfg=False,
         )
         optim_cfg: AdamWConfig = AdamWConfig()
         lr_cfg: LRConfig = LRConfig()
         fsdp_cfg: FSDPConfig = FSDPConfig(
-            torch_compile=False,
             cpu_offload=False,
             ep_size=ep_size,
             # hsdp_sharding_size=hsdp_sharding_size,
@@ -129,11 +129,11 @@ def test_moe_engine_train_freeze_routers(self, device, ep_size, sp_size):
             balancing_loss_cfg=BalancingLossConfig(),
             z_loss_cfg=ZLossConfig(),
             freeze_routers=True,
+            compile_cfg=False,
         )
         optim_cfg: AdamWConfig = AdamWConfig()
         lr_cfg: LRConfig = LRConfig()
         fsdp_cfg: FSDPConfig = FSDPConfig(
-            torch_compile=False,
             cpu_offload=False,
             ep_size=ep_size,
             # hsdp_sharding_size=hsdp_sharding_size,
@@ -232,10 +232,10 @@ def test_save_and_load(self, device, ep_size, hsdp_sharding_size):
             ep_size=ep_size,
             balancing_loss_cfg=BalancingLossConfig(),
             z_loss_cfg=ZLossConfig(),
+            compile_cfg=False,
         )
         optim_cfg: AdamWConfig = AdamWConfig()
         fsdp_cfg: FSDPConfig = FSDPConfig(
-            torch_compile=False,
             cpu_offload=False,
             ep_size=ep_size,
             hsdp_sharding_size=hsdp_sharding_size,
@@ -447,12 +447,12 @@ def create_engine_from_hf(load_from: Path, dispatcher: str | None, ep_size: int,
     moe_cfg : Qwen3MoEConfig = get_model_config_from_hf(load_from)
     moe_cfg.dispatcher = dispatcher
     moe_cfg.ep_size = ep_size
+    moe_cfg.compile_cfg = False
     if tiny:
         moe_cfg.num_hidden_layers = 2

     optim_cfg: AdamWConfig = AdamWConfig()
     fsdp_cfg: FSDPConfig = FSDPConfig(
-        torch_compile=False,
         cpu_offload=False,
         ep_size=ep_size,
     )

tests/engine/test_moe_train_engine_float8.py

Lines changed: 0 additions & 4 deletions
@@ -49,7 +49,6 @@ def test_tile_wise_fp8(self, device, ep_size, hsdp_sharding_size):
         optim_cfg: AdamWConfig = AdamWConfig()
         lr_cfg: LRConfig = LRConfig()
         fsdp_cfg: FSDPConfig = FSDPConfig(
-            torch_compile=True,
             cpu_offload=False,
             ep_size=ep_size,
             # hsdp_sharding_size=8,
@@ -130,7 +129,6 @@ def test_tensor_wise_fp8(self, device, ep_size, hsdp_sharding_size):
         optim_cfg: AdamWConfig = AdamWConfig()
         lr_cfg: LRConfig = LRConfig()
         fsdp_cfg: FSDPConfig = FSDPConfig(
-            torch_compile=True,
             cpu_offload=False,
             ep_size=ep_size,
             # hsdp_sharding_size=hsdp_sharding_size,
@@ -217,7 +215,6 @@ def test_save_and_load(self, device, ep_size, hsdp_sharding_size):
         optim_cfg: AdamWConfig = AdamWConfig()
         lr_cfg: LRConfig = LRConfig()
         fsdp_cfg: FSDPConfig = FSDPConfig(
-            torch_compile=True,
             cpu_offload=False,
             ep_size=ep_size,
             # hsdp_sharding_size=hsdp_sharding_size,
@@ -323,7 +320,6 @@ def test_save_and_load1(self, device, ep_size, hsdp_sharding_size):
         )
         optim_cfg: AdamWConfig = AdamWConfig()
         fsdp_cfg: FSDPConfig = FSDPConfig(
-            torch_compile=True,
             cpu_offload=False,
             ep_size=ep_size,
             hsdp_sharding_size=hsdp_sharding_size,

tests/model/test_qwen3_tile_embedding.py

Lines changed: 0 additions & 2 deletions
@@ -45,7 +45,6 @@ def test_tie_embedding(self, device, tp_size):
         optim_cfg: AdamWConfig = AdamWConfig()
         lr_cfg: LRConfig = LRConfig(lr_min=1e-3)
         fsdp_cfg: FSDPConfig = FSDPConfig(
-            torch_compile=True,
             cpu_offload=False,
             tp_size=tp_size
         )
@@ -114,7 +113,6 @@ def test_qwen3vl_tie_embedding(self, device, tp_size):
         optim_cfg: AdamWConfig = AdamWConfig()
         lr_cfg: LRConfig = LRConfig(lr_min=1e-3)
         fsdp_cfg: FSDPConfig = FSDPConfig(
-            torch_compile=True,
             cpu_offload=False,
             tp_size=tp_size
         )

tests/model/test_qwen3_vl.py

Lines changed: 2 additions & 8 deletions
@@ -206,16 +206,10 @@ def test_fsdp_qwen3_run(self, device, sp_size, compile, tol):
         patch_hf_rms_norm(hf_model)

         with torch.device("meta"):
-            model_cfg = Qwen3VLDense4BConfig()
-            if compile is False:
-                model_cfg.compile_cfg = False
+            model_cfg = Qwen3VLDense4BConfig(compile_cfg=compile)
             qwen3vl_model = model_cfg.build().to(torch.bfloat16)

-        fsdp_config = FSDPConfig(
-            cpu_offload=False,
-            torch_compile=compile
-        )
-
+        fsdp_config = FSDPConfig(cpu_offload=False)
         fsdp_mesh = init_world_mesh()
         qwen3vl_model.vision_tower.fsdp_mesh = fsdp_mesh
         qwen3vl_model.vision_tower.fsdp_config = fsdp_config

tests/ray/test_rl_train_with_sft.py

Lines changed: 0 additions & 1 deletion
@@ -80,7 +80,6 @@ def build_train_controller(self):
         model_cfg = Qwen3Dense8BConfig()
         optim_cfg: AdamWConfig = AdamWConfig(lr=5e-7, foreach=False)
         fsdp_cfg: FSDPConfig = FSDPConfig(
-            torch_compile=True,
             cpu_offload=False,
             ep_size=1,
         )

tests/ray/test_rl_trainer.py

Lines changed: 2 additions & 1 deletion
@@ -47,6 +47,7 @@ def tearDownClass(cls):

     def init_traine_worker_config(self, train_optimizer_steps, pack_max_length):
         model_cfg = get_model_config_from_hf(Path(MODEL_PATH))
+        model_cfg.compile_cfg = False
         optim_cfg = AdamWConfig(lr=1e-6, betas=(0.9, 0.999), max_grad_norm=1.0, weight_decay=0.1, foreach=False)
         loss_cfg = GRPOLossConfig(
             policy_loss_cfg=dict(
@@ -65,7 +66,7 @@ def init_traine_worker_config(self, train_optimizer_steps, pack_max_length):
             chunk_size=512,
         )
         lr_cfg = LRConfig(lr_type="constant", warmup_ratio=0, lr_min=1e-6)
-        fsdp_cfg = FSDPConfig(torch_compile=False, cpu_offload=False, ep_size=1)
+        fsdp_cfg = FSDPConfig(cpu_offload=False, ep_size=1)
         train_worker_cfg: WorkerConfig = WorkerConfig(
             model_cfg=model_cfg,
             load_from=MODEL_PATH,

tests/train/test_trainer.py

Lines changed: 1 addition & 1 deletion
@@ -408,7 +408,7 @@ def prepare(self):

         self.optim_cfg = AdamWConfig(lr=0.1, weight_decay=0.1)
         self.lr_cfg = LRConfig(lr_type="cosine", lr_min=0.001, warmup_ratio=0.03)
-        self.fsdp_cfg = FSDPConfig(torch_compile=True)
+        self.fsdp_cfg = FSDPConfig()
         temp_dir = tempfile.TemporaryDirectory()
         if dist.get_rank() == 0:
             temp_dir = [temp_dir.name]
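
Taken together, the test updates above apply one pattern: stop toggling compilation through FSDPConfig.torch_compile and control it through the model config's compile_cfg instead. A minimal before/after sketch of that pattern; the FSDPConfig import path is taken from this commit's file layout, while the model-config import is hypothetical and only illustrative:

    # Sketch only: the model-config import is assumed, not shown in this diff.
    from xtuner.v1.config.fsdp import FSDPConfig
    # from xtuner.v1.model import Qwen3Dense8BConfig  # hypothetical import path

    # Before: compilation was toggled on the sharding config.
    # fsdp_cfg = FSDPConfig(torch_compile=False, cpu_offload=False)

    # After: compilation is controlled by the model config; FSDPConfig no longer mentions it.
    model_cfg = Qwen3Dense8BConfig()
    model_cfg.compile_cfg = False             # or Qwen3VLDense4BConfig(compile_cfg=...) as in test_qwen3_vl.py
    fsdp_cfg = FSDPConfig(cpu_offload=False)  # no torch_compile argument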

xtuner/v1/config/fsdp.py

Lines changed: 2 additions & 1 deletion
@@ -22,7 +22,8 @@ class FSDPConfig(BaseModel):
     # TODO: (caoweihan) Convert `torch.dtype` to `Annotated` for compatibility with cyclopts
     param_dtype: Annotated[torch.dtype, Parameter(help="Data type for model parameters")] = torch.bfloat16
     reduce_dtype: Annotated[torch.dtype, Parameter(help="Data type for reduction operations")] = torch.bfloat16
-    torch_compile: Annotated[bool, Parameter(help="Enable model compilation for faster inference")] = False
+    # TODO: deprecate `torch_compile` in favor of `compile_cfg` in XTunerBaseModelConfig
+    torch_compile: Annotated[bool, Parameter(help="Enable model compilation for faster inference")] = True
     mesh_prefix: Annotated[str, Parameter(help="Prefix for device mesh configuration in distributed training")] = (
         "default"
     )
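
Note the default flip: torch_compile now defaults to True, so an FSDPConfig that never mentions compilation no longer forces the model's compile_cfg off in the resolver shown below, which was the bug described in the commit message. A minimal check, assuming the import path from this commit:

    from xtuner.v1.config.fsdp import FSDPConfig

    cfg = FSDPConfig(cpu_offload=False)
    assert cfg.torch_compile is True  # default value no longer disables compilation downstream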

xtuner/v1/train/trainer.py

Lines changed: 7 additions & 0 deletions
@@ -1845,5 +1845,12 @@ def _print_training_config(self):
         logger.info(f"Training config: {config_str}")

     def _resolve_deprecate_compile_cfg(self, model_cfg: XTunerBaseModelConfig, fsdp_cfg: FSDPConfig):
+        if self.rank == 0:
+            logger.warning(
+                "FSDPConfig.torch_compile is deprecated, and will be removed in version 1.1.0. "
+                "Please use XTunerBaseModelConfig.compile_cfg to control whether to use torch.compile for the model"
+            )
         if not fsdp_cfg.torch_compile:
+            if self.rank == 0:
+                logger.warning("FSDPConfig.torch_compile is set to False, setting model_cfg.compile_cfg to False.")
             model_cfg.compile_cfg = False
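
The resolution rule added here is small: an explicit torch_compile=False on the FSDP config still wins and forces compile_cfg off, so existing configs keep working, but the trainer now warns on rank 0 that the option is deprecated. A self-contained sketch of that rule using plain stand-in classes (not the xtuner types), assuming only what the hunk above shows:

    from dataclasses import dataclass

    @dataclass
    class _ModelCfg:    # stand-in for XTunerBaseModelConfig; compile_cfg may be a richer object in xtuner
        compile_cfg: object = True

    @dataclass
    class _FSDPCfg:     # stand-in for FSDPConfig, using the new default of True
        torch_compile: bool = True

    def resolve_compile_cfg(model_cfg: _ModelCfg, fsdp_cfg: _FSDPCfg) -> _ModelCfg:
        # Deprecated torch_compile=False still takes precedence and disables compilation on the model config.
        if not fsdp_cfg.torch_compile:
            model_cfg.compile_cfg = False
        return model_cfg

    assert resolve_compile_cfg(_ModelCfg(), _FSDPCfg(torch_compile=False)).compile_cfg is False
    assert resolve_compile_cfg(_ModelCfg(), _FSDPCfg()).compile_cfg is True  # default no longer overrides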
