Commit 56ecd09
File tree
495 files changed
+46234
-32536
lines changed- .github
- actions
- workflows
- docker/common
- docs
- api-guide
- user-guide/features
- examples
- inference/gpt
- multimodal
- radio
- post_training/modelopt
- megatron
- core
- dist_checkpointing
- strategies
- distributed
- fsdp
- src
- megatron_fsdp
- extensions
- inference
- communication/torch_symm_triton
- contexts
- attention_context
- triton
- engines
- model_inference_wrappers
- moe
- quantization
- text_generation_controllers
- text_generation_server/dynamic_text_gen_server
- endpoints
- models
- T5
- bert
- common
- gpt
- heterogeneous
- mamba
- mimo
- config
- model
- submodules
- multimodal
- optimizer
- pipeline_parallel
- post_training/modelopt
- resharding
- copy_services
- ssm
- ops
- tensor_parallel
- transformer
- custom_layers
- experimental_attention_variant
- heterogeneous
- moe
- inference
- legacy
- fused_kernels
- tests
- model
- rl
- agent
- inference
- server/agent
- training
- config
- datasets
- tests
- functional_tests/test_cases
- bert
- bert_mcore_tp1_pp2
- bert_mcore_tp1_pp4_vp2
- bert_mcore_tp2_pp2_frozen_resume_torch_dist
- bert_mcore_tp2_pp2_local_spec
- bert_mcore_tp2_pp2_resume_torch_dist_local_spec
- bert_mcore_tp2_pp2_resume_torch_dist
- bert_mcore_tp2_pp2
- bert_mcore_tp4_pp1
- bert_release_sm
- bert_release
- gpt
- gpt3_15b_8t_release_gb200
- gpt3_15b_8t_release_sm_gb200
- gpt3_15b_8t_release_sm
- gpt3_15b_8t_release
- gpt3_7b_tp1_pp4_memory_speed
- gpt3_7b_tp4_pp1_memory_speed
- gpt3_mcore_reruns_disable
- gpt3_mcore_reruns_enable
- gpt3_mcore_reruns_persistent_1
- gpt3_mcore_reruns_persistent_2
- gpt3_mcore_reruns_reshard
- gpt3_mcore_reruns_resume_check_grads
- gpt3_mcore_reruns_resume
- gpt3_mcore_reruns_transient
- gpt3_mcore_te_tp1_pp1_dist_optimizer_fim_dataset
- gpt3_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files
- gpt3_mcore_te_tp1_pp1_frozen_resume_torch_dist_dist_optimizer
- gpt3_mcore_te_tp1_pp1_mup
- gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_no_mmap_bin_files
- gpt3_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer
- gpt3_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute
- gpt3_mcore_te_tp1_pp1_uniform_full_recompute
- gpt3_mcore_te_tp1_pp2_cp4_a2a_p2p_nondeterministic
- gpt3_mcore_te_tp1_pp2_resume_torch_dist_cp4_a2a_p2p_nondeterministic
- gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion
- gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings
- gpt3_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion
- gpt3_mcore_te_tp1_pp2_rope_embeddings
- gpt3_mcore_te_tp1_pp4_disable_bias_linear
- gpt3_mcore_te_tp1_pp4_frozen_resume_torch_dist_swiglu
- gpt3_mcore_te_tp1_pp4_persistent_ckpt_disable_bias_linear
- gpt3_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear
- gpt3_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear
- gpt3_mcore_te_tp1_pp4_resume_torch_dist_sequence_parallel
- gpt3_mcore_te_tp1_pp4_resume_torch_dist_swiglu
- gpt3_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs
- gpt3_mcore_te_tp1_pp4_sequence_parallel
- gpt3_mcore_te_tp1_pp4_swiglu
- gpt3_mcore_te_tp1_pp4_untie_embeddings_and_outputs
- gpt3_mcore_te_tp1_pp4_vp1_calculate_per_token_loss
- gpt3_mcore_te_tp1_pp4_vp1_decoupled_lr
- gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer
- gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather
- gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied
- gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce
- gpt3_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr
- gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss
- gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather
- gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied
- gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce
- gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap
- gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist
- gpt3_mcore_te_tp1_pp4_vp1_tunable_overlap
- gpt3_mcore_te_tp1_pp4_vp1_uneven_pipeline
- gpt3_mcore_te_tp1_pp4_vp1
- gpt3_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split
- gpt3_mcore_te_tp2_pp1_cp2_nondeterministic
- gpt3_mcore_te_tp2_pp1_frozen_resume_torch_dist_cp2_nondeterministic
- gpt3_mcore_te_tp2_pp1_fsdp2_resume_torch_dist
- gpt3_mcore_te_tp2_pp1_gdn
- gpt3_mcore_te_tp2_pp1_modelopt_distill_resume
- gpt3_mcore_te_tp2_pp1_multi_dist_optimizer_instances
- gpt3_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic
- gpt3_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances
- gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic
- gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss
- gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last
- gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last
- gpt3_mcore_te_tp2_pp2_cp2_etp4_dp_last
- gpt3_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last
- gpt3_mcore_te_tp2_pp2_cp2_nondeterministic
- gpt3_mcore_te_tp2_pp2_cp2
- gpt3_mcore_te_tp2_pp2_cross_entropy_loss_fusion
- gpt3_mcore_te_tp2_pp2_ddp_average_in_collective
- gpt3_mcore_te_tp2_pp2_defer_embedding_wgrad_compute
- gpt3_mcore_te_tp2_pp2_mla
- gpt3_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader
- gpt3_mcore_te_tp2_pp2_no_mmap_bin_files
- gpt3_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic
- gpt3_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion
- gpt3_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective
- gpt3_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute
- gpt3_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader
- gpt3_mcore_te_tp2_pp2_resume_torch_dist_no_mmap_bin_files
- gpt3_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone
- gpt3_mcore_te_tp2_pp2_resume_torch_dist
- gpt3_mcore_te_tp2_pp2
- gpt3_mcore_te_tp2_zp_z3_resume_fsdp_dtensor
- gpt3_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather
- gpt3_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce
- gpt3_mcore_te_tp4_pp1_qk_layernorm_test_mode
- gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather
- gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce
- gpt3_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode
- gpt3_mcore_te_tp4_pp2_frozen_resume_torch_dist_reshard_8x1xNone
- gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone
- gpt3_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather
- gpt3_mcore_tp1_pp1_fsdp2_resume_torch_dist_te
- gpt3_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather
- gpt3_mcore_tp1_pp2_fp16
- gpt3_mcore_tp1_pp2_resume_torch_dist
- gpt3_mcore_tp1_pp2
- gpt3_mcore_tp1_pp4_resume_torch_dist
- gpt3_mcore_tp1_pp4
- gpt3_mcore_tp2_pp2_resume_torch_dist_uninstall_te
- gpt3_mcore_tp2_pp2_uninstall_te
- gpt3_mcore_tp4_pp1_resume_torch_dist
- gpt3_mcore_tp4_pp1_resume_torch
- gpt3_mcore_tp4_pp1
- gpt3_weekly_mcore_tp2_pp2_current_scaling_native_fp8_tp_pp_sp_tp_overlap
- gpt3_weekly_mcore_tp4_cp2_current_scaling_native_fp8_tp_sp_cp_tp_overlap
- gpt_grpo_tp4_pp1_dp2_8b_cudagraphs_throughput
- gpt_grpo_tp4_pp1_dp2_8b_throughput
- hybrid
- hybrid_dynamic_inference_tp1_pp1_dp8_583m_chunked_prefill
- hybrid_dynamic_inference_tp1_pp1_dp8_583m
- hybrid_mr_mcore_te_tp1_pp1_cp1_dgx_a100_1N8G
- hybrid_mr_mcore_te_tp1_pp2_vpp2_cp1_dgx_a100_1N8G
- hybrid_mr_mcore_te_tp1_pp4_cp1_dgx_a100_1N8G
- hybrid_mr_mcore_te_tp2_pp1_cp1_dgx_a100_1N8G
- hybrid_mr_mcore_te_tp2_pp1_cp4_dgx_a100_1N8G
- moe
- deepseek_proxy_fsdp_ep2_fsdp2_1node
- gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last
- gpt3_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic
- gpt3_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer
- gpt3_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_dist_optimizer
- gpt3_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_groupedGEMM
- gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer
- gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_groupedGEMM
- gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances
- gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_top2router
- gpt3_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective
- gpt3_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer
- gpt3_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM
- gpt3_mcore_te_tp2_pp1_te_8experts2parallel_top2router
- gpt3_mcore_te_tp2_pp1_te_8experts_etp1_ep4
- gpt3_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4
- gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed
- gpt3_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8
- gpt3_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph
- gpt3_mcore_te_tp2_pp2_ep4_etp1_selective_recompute_experimental
- gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router
- gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last
- gpt3_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel
- gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last
- gpt3_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel
- gpt3_mcore_tp2_pp2_ep2_resume_torch_dist_te_4experts2parallel
- gpt3_mcore_tp2_pp2_ep2_te_4experts2parallel
- gpt3_mcore_tp2_pp2_resume_torch_dist_te_2experts
- gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_muon
- gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_optimizer
- gpt3_moe_mcore_te_ep8_resume_torch_dist_muon
- gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading
- gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading
- gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer
- gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_scoped_cudagraph
- gpt_grpo_tp8tp4_pp1_ep8ep2_dp8_throughputtest
- t5
- t5_11b_mcore_tp4_pp1
- t5_mcore_te_tp1_pp1_vp1_resume_torch
- t5_mcore_te_tp2_pp1_vp1_sequence_parallel
- t5_mcore_te_tp2_pp1_vp1
- t5_mcore_te_tp4_pp1_resume_torch_dist
- t5_mcore_te_tp4_pp1
- t5_mcore_tp1_pp1_vp1_resume_torch
- t5_mcore_tp1_pp1_vp1
- t5_mcore_tp2_pp1_vp1
- t5_mcore_tp4_pp1_resume_torch_dist
- t5_mcore_tp4_pp1
- t5_release_sm
- t5_release
- test_utils
- python_scripts
- recipes
- gb200
- h100
- unit_tests
- a2a_overlap
- dist_checkpointing
- models
- distributed/megatron_fsdp
- extension
- inference
- contexts
- attention_metadata
- engines
- text_generation_controllers
- models
- pipeline_parallel
- resharding
- rl
- ssm
- ops
- transformer
- moe
- tools
- checkpoint
- common_pile_dataset
Some content is hidden
Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.
495 files changed
+46234
-32536
lines changed| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
39 | 39 | | |
40 | 40 | | |
41 | 41 | | |
| 42 | + | |
| 43 | + | |
| 44 | + | |
42 | 45 | | |
43 | 46 | | |
44 | 47 | | |
| |||
| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
45 | 45 | | |
46 | 46 | | |
47 | 47 | | |
48 | | - | |
49 | | - | |
50 | | - | |
51 | | - | |
52 | | - | |
53 | | - | |
| 48 | + | |
| 49 | + | |
| 50 | + | |
| 51 | + | |
| 52 | + | |
| 53 | + | |
| 54 | + | |
| 55 | + | |
| 56 | + | |
| 57 | + | |
| 58 | + | |
| 59 | + | |
| 60 | + | |
| 61 | + | |
| 62 | + | |
| 63 | + | |
54 | 64 | | |
55 | 65 | | |
56 | 66 | | |
| |||
96 | 106 | | |
97 | 107 | | |
98 | 108 | | |
99 | | - | |
| 109 | + | |
100 | 110 | | |
101 | 111 | | |
102 | 112 | | |
| |||
106 | 116 | | |
107 | 117 | | |
108 | 118 | | |
109 | | - | |
110 | | - | |
111 | | - | |
112 | | - | |
113 | | - | |
114 | | - | |
115 | | - | |
116 | | - | |
117 | | - | |
118 | | - | |
119 | | - | |
120 | | - | |
121 | | - | |
122 | | - | |
123 | | - | |
124 | | - | |
125 | | - | |
126 | | - | |
127 | | - | |
128 | | - | |
129 | | - | |
130 | | - | |
131 | | - | |
132 | | - | |
133 | | - | |
134 | | - | |
135 | | - | |
136 | | - | |
137 | | - | |
138 | | - | |
139 | | - | |
140 | | - | |
141 | | - | |
142 | 119 | | |
143 | 120 | | |
144 | 121 | | |
| |||
150 | 127 | | |
151 | 128 | | |
152 | 129 | | |
153 | | - | |
154 | | - | |
155 | | - | |
156 | | - | |
157 | | - | |
158 | | - | |
159 | | - | |
160 | | - | |
161 | | - | |
162 | | - | |
163 | | - | |
164 | | - | |
165 | | - | |
166 | | - | |
167 | | - | |
168 | | - | |
169 | | - | |
170 | | - | |
171 | | - | |
172 | | - | |
173 | | - | |
| 130 | + | |
| 131 | + | |
| 132 | + | |
| 133 | + | |
| 134 | + | |
| 135 | + | |
174 | 136 | | |
175 | 137 | | |
176 | 138 | | |
| |||
184 | 146 | | |
185 | 147 | | |
186 | 148 | | |
187 | | - | |
| 149 | + | |
188 | 150 | | |
189 | 151 | | |
190 | 152 | | |
| |||
| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
1 | 1 | | |
2 | 2 | | |
3 | 3 | | |
4 | | - | |
| 4 | + | |
| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
1 | 1 | | |
2 | | - | |
3 | | - | |
4 | | - | |
5 | | - | |
6 | | - | |
7 | | - | |
8 | | - | |
9 | | - | |
10 | 2 | | |
11 | 3 | | |
12 | 4 | | |
| |||
46 | 38 | | |
47 | 39 | | |
48 | 40 | | |
| 41 | + | |
| 42 | + | |
| 43 | + | |
| 44 | + | |
| 45 | + | |
| 46 | + | |
| 47 | + | |
| 48 | + | |
49 | 49 | | |
50 | 50 | | |
| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
486 | 486 | | |
487 | 487 | | |
488 | 488 | | |
489 | | - | |
| 489 | + | |
490 | 490 | | |
491 | | - | |
492 | | - | |
493 | | - | |
494 | | - | |
495 | | - | |
496 | 491 | | |
497 | 492 | | |
498 | 493 | | |
| |||
505 | 500 | | |
506 | 501 | | |
507 | 502 | | |
508 | | - | |
| 503 | + | |
509 | 504 | | |
510 | | - | |
511 | | - | |
| 505 | + | |
| 506 | + | |
512 | 507 | | |
513 | 508 | | |
514 | 509 | | |
| |||
| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
32 | 32 | | |
33 | 33 | | |
34 | 34 | | |
35 | | - | |
| 35 | + | |
36 | 36 | | |
37 | 37 | | |
38 | 38 | | |
| |||
| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
64 | 64 | | |
65 | 65 | | |
66 | 66 | | |
67 | | - | |
| 67 | + | |
68 | 68 | | |
69 | | - | |
| 69 | + | |
70 | 70 | | |
71 | 71 | | |
72 | | - | |
| 72 | + | |
73 | 73 | | |
74 | 74 | | |
75 | | - | |
| 75 | + | |
76 | 76 | | |
77 | | - | |
| 77 | + | |
78 | 78 | | |
79 | 79 | | |
| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
55 | 55 | | |
56 | 56 | | |
57 | 57 | | |
58 | | - | |
| 58 | + | |
59 | 59 | | |
60 | 60 | | |
61 | 61 | | |
| |||
71 | 71 | | |
72 | 72 | | |
73 | 73 | | |
74 | | - | |
| 74 | + | |
| 75 | + | |
75 | 76 | | |
76 | 77 | | |
77 | 78 | | |
| |||
132 | 133 | | |
133 | 134 | | |
134 | 135 | | |
| 136 | + | |
| 137 | + | |
| 138 | + | |
| 139 | + | |
| 140 | + | |
| 141 | + | |
| 142 | + | |
| 143 | + | |
135 | 144 | | |
136 | 145 | | |
137 | 146 | | |
| |||
160 | 169 | | |
161 | 170 | | |
162 | 171 | | |
163 | | - | |
164 | | - | |
165 | | - | |
166 | | - | |
| 172 | + | |
| 173 | + | |
| 174 | + | |
| 175 | + | |
| 176 | + | |
| 177 | + | |
| 178 | + | |
| 179 | + | |
| 180 | + | |
| 181 | + | |
167 | 182 | | |
| 183 | + | |
| 184 | + | |
168 | 185 | | |
169 | 186 | | |
170 | 187 | | |
| |||
0 commit comments