Allow specifying the file name used to save the job config (#1921)

CptGit · Zhiqiang Zang · web-flow · commit 7b96efc8ece1 · 2025-10-21T15:04:43.000-07:00
Building on #1904, this change allows specifying the file name under which the job config is saved. The modified argument: ``` --job.save-config-file {None}|STR Path to save job config into (default: None) ``` Example output JSON file: ``` { "job": { "config_file": "./torchtitan/models/llama3/train_configs/debug_model.toml", "dump_folder": "./outputs", "description": "Llama 3 debug training", "print_config": false, "config_save_path": "job_config.json", "custom_config_module": "" }, "profiling": { "enable_profiling": false, "save_traces_folder": "profile_trace", "profile_freq": 10, "profiler_active": 1, "profiler_warmup": 3, "enable_memory_snapshot": false, "save_memory_snapshot_folder": "memory_snapshot" }, "metrics": { "log_freq": 1, "enable_tensorboard": false, "disable_color_printing": false, "save_tb_folder": "tb", "save_for_all_ranks": false, "enable_wandb": false }, "model": { "name": "llama3", "flavor": "debugmodel", "hf_assets_path": "./tests/assets/tokenizer", "tokenizer_path": null, "converters": [], "print_after_conversion": false }, "optimizer": { "name": "AdamW", "lr": 0.0008, "beta1": 0.9, "beta2": 0.95, "eps": 1e-08, "weight_decay": 0.1, "implementation": "fused", "early_step_in_backward": false }, "lr_scheduler": { "warmup_steps": 2, "decay_ratio": 0.8, "decay_type": "linear", "min_lr_factor": 0.0 }, "training": { "dataset": "c4_test", "dataset_path": null, "local_batch_size": 8, "global_batch_size": -1, "seq_len": 2048, "max_norm": 1.0, "steps": 10, "enable_cpu_offload": false, "dtype": "float32", "mixed_precision_param": "bfloat16", "mixed_precision_reduce": "float32", "gc_freq": 50, "gc_debug": false, "seed": null, "deterministic": false, "debug_moe_force_load_balance": false }, "parallelism": { "data_parallel_replicate_degree": 1, "enable_compiled_autograd": false, "data_parallel_shard_degree": -1, "fsdp_reshard_after_forward": "default", "tensor_parallel_degree": 1, "disable_loss_parallel": false, "enable_async_tensor_parallel": false, "pipeline_parallel_degree": 1, 
"module_fqns_per_model_part": null, "pipeline_parallel_first_stage_less_layers": 1, "pipeline_parallel_last_stage_less_layers": 1, "pipeline_parallel_layers_per_stage": null, "pipeline_parallel_schedule": "Interleaved1F1B", "pipeline_parallel_schedule_csv": "", "pipeline_parallel_microbatch_size": 1, "context_parallel_degree": 1, "context_parallel_rotate_method": "allgather", "expert_parallel_degree": 1, "expert_tensor_parallel_degree": 1 }, "checkpoint": { "enable": false, "enable_ft_dataloader_checkpoints": true, "folder": "checkpoint", "interval": 10, "initial_load_path": null, "initial_load_model_only": true, "initial_load_in_hf": false, "initial_load_in_hf_quantized": false, "last_save_model_only": false, "last_save_in_hf": false, "export_dtype": "float32", "async_mode": "disabled", "keep_latest_k": 10, "load_step": -1, "exclude_from_loading": [], "enable_first_step_checkpoint": false, "create_seed_checkpoint": false, "load_only": false }, "activation_checkpoint": { "mode": "selective", "selective_ac_option": "2", "per_op_sac_force_recompute_mm_shapes_by_fqns": [ "moe.router.gate" ], "early_stop": false, "memory_budget": 0.5, "visualize_memory_budget_pareto": false }, "compile": { "enable": false, "components": [ "model", "loss" ], "backend": "inductor" }, "quantize": { "linear": { "float8": { "enable_fsdp_float8_all_gather": false, "precompute_float8_dynamic_scale_for_fsdp": false, "recipe_name": null, "filter_fqns": [ "output" ], "emulate": false }, "mx": { "mxfp8_dim1_cast_kernel_choice": "triton", "recipe_name": "mxfp8_cublas", "filter_fqns": [ "output" ] } }, "grouped_mm": { "float8": { "fqns": [] }, "mx": { "recipe_name": "mxfp8", "fqns": [] } } }, "comm": { "init_timeout_seconds": 300, "train_timeout_seconds": 100, "trace_buf_size": 20000, "save_traces_folder": "comm_traces", "save_traces_file_prefix": "rank_" }, "memory_estimation": { "enable": false, "disable_fake_mode": false }, "fault_tolerance": { "enable": false, "process_group": "gloo", 
"process_group_timeout_ms": 10000, "replica_id": 0, "group_size": 0, "min_replica_size": 1, "semi_sync_method": null }, "experimental": { "custom_import": "", "custom_args_module": "" }, "validation": { "enable": false, "dataset": "c4_validation", "dataset_path": null, "local_batch_size": 8, "seq_len": 2048, "freq": 5, "steps": 10 } } ``` --------- Co-authored-by: Zhiqiang Zang <zzq@fb.com>
diff --git a/torchtitan/config/job_config.py b/torchtitan/config/job_config.py
@@ -29,8 +29,8 @@ class Job:
     print_config: bool = False
     """Print the job configs to terminal"""
 
-    save_config_folder: str | None = None
-    """Folder to save a job_config.json file"""
+    save_config_file: str | None = None
+    """Path to save job config into"""
 
     custom_config_module: str = ""
     """
@@ -923,10 +923,8 @@ def maybe_log(self) -> None:
         if self.job.print_config:
             logger.info(f"Running with configs: {self.to_dict()}")
 
-        if self.job.save_config_folder is not None:
-            config_file = os.path.join(
-                self.job.dump_folder, self.job.save_config_folder, "job_config.json"
-            )
+        if self.job.save_config_file is not None:
+            config_file = os.path.join(self.job.dump_folder, self.job.save_config_file)
             if torch.distributed.is_initialized():
                 if torch.distributed.get_rank() == 0:
                     os.makedirs(os.path.dirname(config_file), exist_ok=True)