@@ -6,8 +6,6 @@
 
 from dataclasses import dataclass, field
 from pathlib import Path
-from typing import Optional
-import textwrap
 
 
 @dataclass
@@ -109,94 +107,3 @@ def generate_train_args(self, hf_model_path: str, checkpoints_path: Path, data_p
     --apply-chat-template --rollout-shuffle --rm-type math
     """
 
-# GLM-4.7 (358B MoE) model architecture args
-# Based on: https://huggingface.co/zai-org/GLM-4.7/blob/main/config.json
-GLM_4_7_MODEL_ARGS = """
-    --num-layers 92 --hidden-size 5120 --ffn-hidden-size 12288
-    --num-attention-heads 96 --group-query-attention --num-query-groups 8
-    --kv-channels 128 --vocab-size 151552
-    --normalization RMSNorm --norm-epsilon 1e-5 --swiglu
-    --add-qkv-bias --qk-layernorm
-    --untie-embeddings-and-output-weights
-    --use-rotary-position-embeddings --rotary-base 1000000
-    --num-experts 160
-    --moe-layer-freq "[0]*3+[1]*89"
-    --moe-shared-expert-intermediate-size 1536
-    --moe-router-topk 8
-    --moe-grouped-gemm --moe-permute-fusion
-    --moe-ffn-hidden-size 1536
-    --moe-router-score-function sigmoid
-    --moe-router-pre-softmax
-    --moe-router-enable-expert-bias
-    --moe-router-bias-update-rate 0
-    --moe-router-load-balancing-type seq_aux_loss
-    --moe-router-topk-scaling-factor 2.5
-    --moe-aux-loss-coeff 0
-    --moe-router-dtype fp32
-    --moe-token-dispatcher-type flex
-    --moe-enable-deepep
-"""
-
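As a quick sanity check on the removed block (a sketch, assuming Megatron-LM's usual behavior of evaluating the `--moe-layer-freq` string as a Python list expression):

# Sketch, not part of the diff: verify the MoE layer pattern arithmetic.
pattern = eval("[0]*3+[1]*89")  # 3 dense layers followed by 89 MoE layers
assert len(pattern) == 92       # matches --num-layers 92
assert sum(pattern) == 89       # 89 MoE layers, each routing top-8 of 160 experts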
-# GLM-4.7-Flash (30B MoE with MLA) model architecture args
-# Based on: scripts/models/glm4.7-30B-A3B.sh
-GLM_4_7_FLASH_MODEL_ARGS = """
-    --num-layers 47 --hidden-size 2048 --ffn-hidden-size 10240
-    --num-attention-heads 20 --vocab-size 154880
-    --make-vocab-size-divisible-by 64
-    --normalization RMSNorm --norm-epsilon 1e-5 --swiglu
-    --disable-bias-linear --add-qkv-bias --qk-layernorm
-    --untie-embeddings-and-output-weights
-    --position-embedding-type rope --no-position-embedding
-    --use-rotary-position-embeddings --rotary-base 1000000 --no-rope-fusion
-    --multi-latent-attention
-    --q-lora-rank 768 --kv-lora-rank 512
-    --qk-head-dim 192 --v-head-dim 256 --kv-channels 192
-    --qk-pos-emb-head-dim 64
-    --num-experts 64
-    --moe-layer-freq "[0]*1+[1]*46"
-    --moe-shared-expert-intermediate-size 1536
-    --moe-router-topk 4
-    --moe-grouped-gemm --moe-permute-fusion
-    --moe-ffn-hidden-size 1536
-    --moe-router-score-function sigmoid
-    --moe-router-pre-softmax
-    --moe-router-enable-expert-bias
-    --moe-router-bias-update-rate 0
-    --moe-router-load-balancing-type aux_loss
-    --moe-router-topk-scaling-factor 1.8
-    --moe-aux-loss-coeff 0
-    --moe-router-dtype fp32
-    --moe-token-dispatcher-type flex
-    --moe-enable-deepep
-"""
-
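The Flash pattern checks out the same way (same eval assumption as above):

# Sketch: 1 dense layer followed by 46 MoE layers.
pattern = eval("[0]*1+[1]*46")
assert len(pattern) == 47  # matches --num-layers 47
# Per MoE layer a token activates top-4 of 64 routed experts plus the shared
# expert; both routed and shared experts use ffn-hidden-size 1536.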
-# GLM training args with MoE parallelism
-GLM_4_7_TRAINING_ARGS = """
-    --tensor-model-parallel-size 8 --pipeline-model-parallel-size 4
-    --context-parallel-size 2
-    --expert-model-parallel-size 16 --expert-tensor-parallel-size 1
-    --sequence-parallel
-    --decoder-last-pipeline-num-layers 23
-    --recompute-granularity full --recompute-method uniform --recompute-num-layers 1
-    --use-dynamic-batch-size --max-tokens-per-gpu 16384
-    --megatron-to-hf-mode bridge
-    --attention-dropout 0.0 --hidden-dropout 0.0
-    --attention-backend flash
-    --optimizer-cpu-offload --overlap-cpu-optimizer-d2h-h2d
-    --use-precision-aware-optimizer
-"""
-
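For a sense of scale, the parallelism flags above imply the following per-replica arithmetic (a rough sketch; the exact Megatron process-group layout may differ):

tp, pp, cp, ep = 8, 4, 2, 16
gpus_per_model_replica = tp * pp * cp   # 8 * 4 * 2 = 64 GPUs
routed_experts_per_ep_rank = 160 // ep  # 10 of the 160 experts per EP rank
layers_per_pp_stage = 92 // pp          # 23, matching --decoder-last-pipeline-num-layers 23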
-GLM_4_7_FLASH_TRAINING_ARGS = """
-    --tensor-model-parallel-size 4 --pipeline-model-parallel-size 2
-    --context-parallel-size 2
-    --expert-model-parallel-size 8 --expert-tensor-parallel-size 1
-    --sequence-parallel
-    --decoder-last-pipeline-num-layers 23
-    --recompute-granularity full --recompute-method uniform --recompute-num-layers 1
-    --use-dynamic-batch-size --max-tokens-per-gpu 32768
-    --megatron-to-hf-mode bridge
-    --attention-dropout 0.0 --hidden-dropout 0.0
-    --attention-backend flash
-    --optimizer-cpu-offload --overlap-cpu-optimizer-d2h-h2d
-    --use-precision-aware-optimizer
-"""
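The Flash variant is sized for a much smaller slice (same caveats as the sketch above):

tp, pp, cp, ep = 4, 2, 2, 8
gpus_per_model_replica = tp * pp * cp  # 4 * 2 * 2 = 16 GPUs
routed_experts_per_ep_rank = 64 // ep  # 8 of the 64 experts per EP rank
# With 47 layers and pp = 2 the stages split 24 + 23, hence
# --decoder-last-pipeline-num-layers 23.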