Commit 755ce8f
authored
allow saving config to file (#1904)
Here's an example of the JSON file that is written out:
```json
{
"job": {
"config_file": "./torchtitan/models/llama3/train_configs/debug_model.toml",
"dump_folder": "./outputs",
"description": "Llama 3 debug training",
"print_config": false,
"save_config_folder": "config",
"custom_config_module": ""
},
"profiling": {
"enable_profiling": false,
"save_traces_folder": "profile_trace",
"profile_freq": 10,
"profiler_active": 1,
"profiler_warmup": 3,
"enable_memory_snapshot": false,
"save_memory_snapshot_folder": "memory_snapshot"
},
"metrics": {
"log_freq": 1,
"enable_tensorboard": false,
"disable_color_printing": false,
"save_tb_folder": "tb",
"save_for_all_ranks": false,
"enable_wandb": false
},
"model": {
"name": "llama3",
"flavor": "debugmodel",
"hf_assets_path": "./tests/assets/tokenizer",
"tokenizer_path": null,
"converters": [],
"print_after_conversion": false
},
"optimizer": {
"name": "AdamW",
"lr": 0.0008,
"beta1": 0.9,
"beta2": 0.95,
"eps": 1e-08,
"weight_decay": 0.1,
"implementation": "fused",
"early_step_in_backward": false
},
"lr_scheduler": {
"warmup_steps": 2,
"decay_ratio": 0.8,
"decay_type": "linear",
"min_lr_factor": 0.0
},
"training": {
"dataset": "c4_test",
"dataset_path": null,
"local_batch_size": 8,
"global_batch_size": -1,
"seq_len": 2048,
"max_norm": 1.0,
"steps": 10,
"enable_cpu_offload": false,
"dtype": "float32",
"mixed_precision_param": "bfloat16",
"mixed_precision_reduce": "float32",
"gc_freq": 50,
"gc_debug": false,
"seed": null,
"deterministic": false,
"debug_moe_force_load_balance": false
},
"parallelism": {
"data_parallel_replicate_degree": 1,
"enable_compiled_autograd": false,
"data_parallel_shard_degree": -1,
"fsdp_reshard_after_forward": "default",
"tensor_parallel_degree": 1,
"disable_loss_parallel": false,
"enable_async_tensor_parallel": false,
"pipeline_parallel_degree": 1,
"module_fqns_per_model_part": null,
"pipeline_parallel_first_stage_less_layers": 1,
"pipeline_parallel_last_stage_less_layers": 1,
"pipeline_parallel_layers_per_stage": null,
"pipeline_parallel_schedule": "Interleaved1F1B",
"pipeline_parallel_schedule_csv": "",
"pipeline_parallel_microbatch_size": 1,
"context_parallel_degree": 1,
"context_parallel_rotate_method": "allgather",
"expert_parallel_degree": 1,
"expert_tensor_parallel_degree": 1
},
"checkpoint": {
"enable": false,
"enable_ft_dataloader_checkpoints": true,
"folder": "checkpoint",
"interval": 10,
"initial_load_path": null,
"initial_load_model_only": true,
"initial_load_in_hf": false,
"initial_load_in_hf_quantized": false,
"last_save_model_only": false,
"last_save_in_hf": false,
"export_dtype": "float32",
"async_mode": "disabled",
"keep_latest_k": 10,
"load_step": -1,
"exclude_from_loading": [],
"enable_first_step_checkpoint": false,
"create_seed_checkpoint": false,
"load_only": false
},
"activation_checkpoint": {
"mode": "selective",
"selective_ac_option": "2",
"per_op_sac_force_recompute_mm_shapes_by_fqns": [
"moe.router.gate"
],
"early_stop": false,
"memory_budget": 0.5,
"visualize_memory_budget_pareto": false
},
"compile": {
"enable": false,
"components": [
"model",
"loss"
],
"backend": "inductor"
},
"quantize": {
"linear": {
"float8": {
"enable_fsdp_float8_all_gather": false,
"precompute_float8_dynamic_scale_for_fsdp": false,
"recipe_name": null,
"filter_fqns": [
"output"
],
"emulate": false
},
"mx": {
"mxfp8_dim1_cast_kernel_choice": "triton",
"recipe_name": "mxfp8_cublas",
"filter_fqns": [
"output"
]
}
},
"grouped_mm": {
"float8": {
"fqns": []
},
"mx": {
"recipe_name": "mxfp8",
"fqns": []
}
}
},
"comm": {
"init_timeout_seconds": 300,
"train_timeout_seconds": 100,
"trace_buf_size": 20000,
"save_traces_folder": "comm_traces",
"save_traces_file_prefix": "rank_"
},
"memory_estimation": {
"enable": false,
"disable_fake_mode": false
},
"fault_tolerance": {
"enable": false,
"process_group": "gloo",
"process_group_timeout_ms": 10000,
"replica_id": 0,
"group_size": 0,
"min_replica_size": 1,
"semi_sync_method": null
},
"experimental": {
"custom_import": "",
"custom_args_module": ""
},
"validation": {
"enable": false,
"dataset": "c4_validation",
"dataset_path": null,
"local_batch_size": 8,
"seq_len": 2048,
"freq": 5,
"steps": 10
}
}
```

1 parent 75d4e4d commit 755ce8f
2 files changed
+35
-6
lines changed

| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
4 | 4 | | |
5 | 5 | | |
6 | 6 | | |
| 7 | + | |
| 8 | + | |
| 9 | + | |
7 | 10 | | |
8 | 11 | | |
9 | 12 | | |
| 13 | + | |
| 14 | + | |
| 15 | + | |
| 16 | + | |
10 | 17 | | |
11 | 18 | | |
12 | 19 | | |
13 | 20 | | |
14 | | - | |
| 21 | + | |
15 | 22 | | |
16 | | - | |
| 23 | + | |
17 | 24 | | |
18 | 25 | | |
19 | 26 | | |
20 | 27 | | |
21 | 28 | | |
22 | 29 | | |
23 | | - | |
| 30 | + | |
| 31 | + | |
| 32 | + | |
| 33 | + | |
24 | 34 | | |
25 | 35 | | |
26 | 36 | | |
| |||
908 | 918 | | |
909 | 919 | | |
910 | 920 | | |
| 921 | + | |
| 922 | + | |
| 923 | + | |
| 924 | + | |
| 925 | + | |
| 926 | + | |
| 927 | + | |
| 928 | + | |
| 929 | + | |
| 930 | + | |
| 931 | + | |
| 932 | + | |
| 933 | + | |
| 934 | + | |
| 935 | + | |
| 936 | + | |
| 937 | + | |
| 938 | + | |
| 939 | + | |
| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
78 | 78 | | |
79 | 79 | | |
80 | 80 | | |
81 | | - | |
82 | | - | |
83 | | - | |
84 | 81 | | |
85 | 82 | | |
86 | 83 | | |
| |||
92 | 89 | | |
93 | 90 | | |
94 | 91 | | |
| 92 | + | |
| 93 | + | |
| 94 | + | |
95 | 95 | | |
96 | 96 | | |
97 | 97 | | |
| |||
0 commit comments