
Commit 755ce8f

allow saving config to file (#1904)
Here's an example output JSON file (nested sections compacted onto one line each):

```json
{
  "job": {"config_file": "./torchtitan/models/llama3/train_configs/debug_model.toml", "dump_folder": "./outputs", "description": "Llama 3 debug training", "print_config": false, "save_config_folder": "config", "custom_config_module": ""},
  "profiling": {"enable_profiling": false, "save_traces_folder": "profile_trace", "profile_freq": 10, "profiler_active": 1, "profiler_warmup": 3, "enable_memory_snapshot": false, "save_memory_snapshot_folder": "memory_snapshot"},
  "metrics": {"log_freq": 1, "enable_tensorboard": false, "disable_color_printing": false, "save_tb_folder": "tb", "save_for_all_ranks": false, "enable_wandb": false},
  "model": {"name": "llama3", "flavor": "debugmodel", "hf_assets_path": "./tests/assets/tokenizer", "tokenizer_path": null, "converters": [], "print_after_conversion": false},
  "optimizer": {"name": "AdamW", "lr": 0.0008, "beta1": 0.9, "beta2": 0.95, "eps": 1e-08, "weight_decay": 0.1, "implementation": "fused", "early_step_in_backward": false},
  "lr_scheduler": {"warmup_steps": 2, "decay_ratio": 0.8, "decay_type": "linear", "min_lr_factor": 0.0},
  "training": {"dataset": "c4_test", "dataset_path": null, "local_batch_size": 8, "global_batch_size": -1, "seq_len": 2048, "max_norm": 1.0, "steps": 10, "enable_cpu_offload": false, "dtype": "float32", "mixed_precision_param": "bfloat16", "mixed_precision_reduce": "float32", "gc_freq": 50, "gc_debug": false, "seed": null, "deterministic": false, "debug_moe_force_load_balance": false},
  "parallelism": {"data_parallel_replicate_degree": 1, "enable_compiled_autograd": false, "data_parallel_shard_degree": -1, "fsdp_reshard_after_forward": "default", "tensor_parallel_degree": 1, "disable_loss_parallel": false, "enable_async_tensor_parallel": false, "pipeline_parallel_degree": 1, "module_fqns_per_model_part": null, "pipeline_parallel_first_stage_less_layers": 1, "pipeline_parallel_last_stage_less_layers": 1, "pipeline_parallel_layers_per_stage": null, "pipeline_parallel_schedule": "Interleaved1F1B", "pipeline_parallel_schedule_csv": "", "pipeline_parallel_microbatch_size": 1, "context_parallel_degree": 1, "context_parallel_rotate_method": "allgather", "expert_parallel_degree": 1, "expert_tensor_parallel_degree": 1},
  "checkpoint": {"enable": false, "enable_ft_dataloader_checkpoints": true, "folder": "checkpoint", "interval": 10, "initial_load_path": null, "initial_load_model_only": true, "initial_load_in_hf": false, "initial_load_in_hf_quantized": false, "last_save_model_only": false, "last_save_in_hf": false, "export_dtype": "float32", "async_mode": "disabled", "keep_latest_k": 10, "load_step": -1, "exclude_from_loading": [], "enable_first_step_checkpoint": false, "create_seed_checkpoint": false, "load_only": false},
  "activation_checkpoint": {"mode": "selective", "selective_ac_option": "2", "per_op_sac_force_recompute_mm_shapes_by_fqns": ["moe.router.gate"], "early_stop": false, "memory_budget": 0.5, "visualize_memory_budget_pareto": false},
  "compile": {"enable": false, "components": ["model", "loss"], "backend": "inductor"},
  "quantize": {
    "linear": {
      "float8": {"enable_fsdp_float8_all_gather": false, "precompute_float8_dynamic_scale_for_fsdp": false, "recipe_name": null, "filter_fqns": ["output"], "emulate": false},
      "mx": {"mxfp8_dim1_cast_kernel_choice": "triton", "recipe_name": "mxfp8_cublas", "filter_fqns": ["output"]}
    },
    "grouped_mm": {
      "float8": {"fqns": []},
      "mx": {"recipe_name": "mxfp8", "fqns": []}
    }
  },
  "comm": {"init_timeout_seconds": 300, "train_timeout_seconds": 100, "trace_buf_size": 20000, "save_traces_folder": "comm_traces", "save_traces_file_prefix": "rank_"},
  "memory_estimation": {"enable": false, "disable_fake_mode": false},
  "fault_tolerance": {"enable": false, "process_group": "gloo", "process_group_timeout_ms": 10000, "replica_id": 0, "group_size": 0, "min_replica_size": 1, "semi_sync_method": null},
  "experimental": {"custom_import": "", "custom_args_module": ""},
  "validation": {"enable": false, "dataset": "c4_validation", "dataset_path": null, "local_batch_size": 8, "seq_len": 2048, "freq": 5, "steps": 10}
}
```
1 parent: 75d4e4d · commit: 755ce8f

File tree

2 files changed: +35 −6 lines changed

torchtitan/config/job_config.py

Lines changed: 32 additions & 3 deletions
```diff
@@ -4,23 +4,33 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+import json
+
+import os
 from dataclasses import asdict, dataclass, field
 from typing import Any, Literal
 
+import torch
+
+from torchtitan.tools.logging import logger
+
 
 @dataclass
 class Job:
     config_file: str | None = None
-    """Job config file"""
+    """File to read job configs from"""
 
-    dump_folder: str = "./torchtitan/outputs"
+    dump_folder: str = "./outputs"
     """Folder to dump job outputs"""
 
     description: str = "default job"
     """Description of the job"""
 
     print_config: bool = False
-    """Print the configs to terminal"""
+    """Print the job configs to terminal"""
+
+    save_config_folder: str | None = None
+    """Folder to save a job_config.json file"""
 
     custom_config_module: str = ""
     """
@@ -908,3 +918,22 @@ class JobConfig:
 
     def to_dict(self) -> dict[str, Any]:
         return asdict(self)
+
+    def maybe_log(self) -> None:
+        if self.job.print_config:
+            logger.info(f"Running with configs: {self.to_dict()}")
+
+        if self.job.save_config_folder is not None:
+            config_file = os.path.join(
+                self.job.dump_folder, self.job.save_config_folder, "job_config.json"
+            )
+            if torch.distributed.is_initialized():
+                if torch.distributed.get_rank() == 0:
+                    os.makedirs(os.path.dirname(config_file), exist_ok=True)
+                    with open(config_file, "w") as f:
+                        json.dump(self.to_dict(), f, indent=2)
+                    logger.info(f"Saved job configs to {config_file}")
+            else:
+                logger.warning(
+                    "Job configs logging is disabled due to torch.distributed not initialized."
+                )
```
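Stripped of the torchtitan specifics, the save branch of `maybe_log` is a rank-0 write-once pattern: every rank computes the path, but only rank 0 creates the folder and writes the JSON. A standalone sketch of that pattern; `DemoConfig` here is an illustrative stand-in, not torchtitan's `JobConfig`:

```python
# Standalone sketch of the rank-0 config-dump pattern used in maybe_log.
# DemoConfig is illustrative; torchtitan serializes its full JobConfig.
import json
import os
from dataclasses import asdict, dataclass

import torch.distributed as dist


@dataclass
class DemoConfig:
    dump_folder: str = "./outputs"
    save_config_folder: str = "config"


def save_config(cfg: DemoConfig) -> None:
    config_file = os.path.join(
        cfg.dump_folder, cfg.save_config_folder, "job_config.json"
    )
    if dist.is_initialized():
        # Only rank 0 writes, so all ranks share one authoritative dump.
        if dist.get_rank() == 0:
            os.makedirs(os.path.dirname(config_file), exist_ok=True)
            with open(config_file, "w") as f:
                json.dump(asdict(cfg), f, indent=2)
    else:
        print("skipped: torch.distributed is not initialized")
```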

torchtitan/train.py

Lines changed: 3 additions & 3 deletions
```diff
@@ -78,9 +78,6 @@ def __init__(self, job_config: JobConfig):
         if job_config.experimental.custom_import:
             importlib.import_module(job_config.experimental.custom_import)
 
-        if job_config.job.print_config:
-            logger.info(f"Running with args: {job_config.to_dict()}")
-
         device_module, device_type = utils.device_module, utils.device_type
         self.device = torch.device(f"{device_type}:{int(os.environ['LOCAL_RANK'])}")
         # Device has to be set before creating TorchFT manager.
@@ -92,6 +89,9 @@ def __init__(self, job_config: JobConfig):
             enable_cpu_backend=job_config.training.enable_cpu_offload,
             base_folder=job_config.job.dump_folder,
         )
+
+        job_config.maybe_log()
+
         world_size = int(os.environ["WORLD_SIZE"])
         parallelism_config = job_config.parallelism
         self.parallel_dims = parallel_dims = self._create_parallel_dims(
```
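Note the placement: `job_config.maybe_log()` is called only after the call that sets up distributed state, which matches the guard inside `maybe_log`; it needs `torch.distributed.is_initialized()` and `get_rank()` to restrict the write to rank 0. Invoking it at the old location, before process-group initialization, would always fall into the warning branch and never save the file.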
