Commit 7b96efc
Allow specifying the file name used to save the config (#1921)
Building on #1904, this change allows specifying the file name that the job config is saved to.
The modified argument:
```
--job.save-config-file {None}|STR
Path to save job config into (default: None)
```
Example output JSON file (in this sample the save path appears under the `config_save_path` key of the `job` section):
```
{
"job": {
"config_file": "./torchtitan/models/llama3/train_configs/debug_model.toml",
"dump_folder": "./outputs",
"description": "Llama 3 debug training",
"print_config": false,
"config_save_path": "job_config.json",
"custom_config_module": ""
},
"profiling": {
"enable_profiling": false,
"save_traces_folder": "profile_trace",
"profile_freq": 10,
"profiler_active": 1,
"profiler_warmup": 3,
"enable_memory_snapshot": false,
"save_memory_snapshot_folder": "memory_snapshot"
},
"metrics": {
"log_freq": 1,
"enable_tensorboard": false,
"disable_color_printing": false,
"save_tb_folder": "tb",
"save_for_all_ranks": false,
"enable_wandb": false
},
"model": {
"name": "llama3",
"flavor": "debugmodel",
"hf_assets_path": "./tests/assets/tokenizer",
"tokenizer_path": null,
"converters": [],
"print_after_conversion": false
},
"optimizer": {
"name": "AdamW",
"lr": 0.0008,
"beta1": 0.9,
"beta2": 0.95,
"eps": 1e-08,
"weight_decay": 0.1,
"implementation": "fused",
"early_step_in_backward": false
},
"lr_scheduler": {
"warmup_steps": 2,
"decay_ratio": 0.8,
"decay_type": "linear",
"min_lr_factor": 0.0
},
"training": {
"dataset": "c4_test",
"dataset_path": null,
"local_batch_size": 8,
"global_batch_size": -1,
"seq_len": 2048,
"max_norm": 1.0,
"steps": 10,
"enable_cpu_offload": false,
"dtype": "float32",
"mixed_precision_param": "bfloat16",
"mixed_precision_reduce": "float32",
"gc_freq": 50,
"gc_debug": false,
"seed": null,
"deterministic": false,
"debug_moe_force_load_balance": false
},
"parallelism": {
"data_parallel_replicate_degree": 1,
"enable_compiled_autograd": false,
"data_parallel_shard_degree": -1,
"fsdp_reshard_after_forward": "default",
"tensor_parallel_degree": 1,
"disable_loss_parallel": false,
"enable_async_tensor_parallel": false,
"pipeline_parallel_degree": 1,
"module_fqns_per_model_part": null,
"pipeline_parallel_first_stage_less_layers": 1,
"pipeline_parallel_last_stage_less_layers": 1,
"pipeline_parallel_layers_per_stage": null,
"pipeline_parallel_schedule": "Interleaved1F1B",
"pipeline_parallel_schedule_csv": "",
"pipeline_parallel_microbatch_size": 1,
"context_parallel_degree": 1,
"context_parallel_rotate_method": "allgather",
"expert_parallel_degree": 1,
"expert_tensor_parallel_degree": 1
},
"checkpoint": {
"enable": false,
"enable_ft_dataloader_checkpoints": true,
"folder": "checkpoint",
"interval": 10,
"initial_load_path": null,
"initial_load_model_only": true,
"initial_load_in_hf": false,
"initial_load_in_hf_quantized": false,
"last_save_model_only": false,
"last_save_in_hf": false,
"export_dtype": "float32",
"async_mode": "disabled",
"keep_latest_k": 10,
"load_step": -1,
"exclude_from_loading": [],
"enable_first_step_checkpoint": false,
"create_seed_checkpoint": false,
"load_only": false
},
"activation_checkpoint": {
"mode": "selective",
"selective_ac_option": "2",
"per_op_sac_force_recompute_mm_shapes_by_fqns": [
"moe.router.gate"
],
"early_stop": false,
"memory_budget": 0.5,
"visualize_memory_budget_pareto": false
},
"compile": {
"enable": false,
"components": [
"model",
"loss"
],
"backend": "inductor"
},
"quantize": {
"linear": {
"float8": {
"enable_fsdp_float8_all_gather": false,
"precompute_float8_dynamic_scale_for_fsdp": false,
"recipe_name": null,
"filter_fqns": [
"output"
],
"emulate": false
},
"mx": {
"mxfp8_dim1_cast_kernel_choice": "triton",
"recipe_name": "mxfp8_cublas",
"filter_fqns": [
"output"
]
}
},
"grouped_mm": {
"float8": {
"fqns": []
},
"mx": {
"recipe_name": "mxfp8",
"fqns": []
}
}
},
"comm": {
"init_timeout_seconds": 300,
"train_timeout_seconds": 100,
"trace_buf_size": 20000,
"save_traces_folder": "comm_traces",
"save_traces_file_prefix": "rank_"
},
"memory_estimation": {
"enable": false,
"disable_fake_mode": false
},
"fault_tolerance": {
"enable": false,
"process_group": "gloo",
"process_group_timeout_ms": 10000,
"replica_id": 0,
"group_size": 0,
"min_replica_size": 1,
"semi_sync_method": null
},
"experimental": {
"custom_import": "",
"custom_args_module": ""
},
"validation": {
"enable": false,
"dataset": "c4_validation",
"dataset_path": null,
"local_batch_size": 8,
"seq_len": 2048,
"freq": 5,
"steps": 10
}
}
```
---------
Co-authored-by: Zhiqiang Zang <[email protected]>

1 parent d6928f1 commit 7b96efc
1 file changed: +4 -6 lines changed

| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
29 | 29 | | |
30 | 30 | | |
31 | 31 | | |
32 | | - | |
33 | | - | |
| 32 | + | |
| 33 | + | |
34 | 34 | | |
35 | 35 | | |
36 | 36 | | |
| |||
923 | 923 | | |
924 | 924 | | |
925 | 925 | | |
926 | | - | |
927 | | - | |
928 | | - | |
929 | | - | |
| 926 | + | |
| 927 | + | |
930 | 928 | | |
931 | 929 | | |
932 | 930 | | |
| |||
0 commit comments