Commit 276ad84

🔥 Remove GLM

1 parent 11475a2 commit 276ad84

4 files changed (+2 / -271 lines)

slime/configs/base.py

Lines changed: 0 additions & 93 deletions
@@ -6,8 +6,6 @@
 
 from dataclasses import dataclass, field
 from pathlib import Path
-from typing import Optional
-import textwrap
 
 
 @dataclass
@@ -109,94 +107,3 @@ def generate_train_args(self, hf_model_path: str, checkpoints_path: Path, data_p
 --apply-chat-template --rollout-shuffle --rm-type math
 """
 
-
-# GLM-4.7 (358B MoE) model architecture args
-# Based on: https://huggingface.co/zai-org/GLM-4.7/blob/main/config.json
-GLM_4_7_MODEL_ARGS = """
---num-layers 92 --hidden-size 5120 --ffn-hidden-size 12288
---num-attention-heads 96 --group-query-attention --num-query-groups 8
---kv-channels 128 --vocab-size 151552
---normalization RMSNorm --norm-epsilon 1e-5 --swiglu
---add-qkv-bias --qk-layernorm
---untie-embeddings-and-output-weights
---use-rotary-position-embeddings --rotary-base 1000000
---num-experts 160
---moe-layer-freq "[0]*3+[1]*89"
---moe-shared-expert-intermediate-size 1536
---moe-router-topk 8
---moe-grouped-gemm --moe-permute-fusion
---moe-ffn-hidden-size 1536
---moe-router-score-function sigmoid
---moe-router-pre-softmax
---moe-router-enable-expert-bias
---moe-router-bias-update-rate 0
---moe-router-load-balancing-type seq_aux_loss
---moe-router-topk-scaling-factor 2.5
---moe-aux-loss-coeff 0
---moe-router-dtype fp32
---moe-token-dispatcher-type flex
---moe-enable-deepep
-"""
-
-# GLM-4.7-Flash (30B MoE with MLA) model architecture args
-# Based on: scripts/models/glm4.7-30B-A3B.sh
-GLM_4_7_FLASH_MODEL_ARGS = """
---num-layers 47 --hidden-size 2048 --ffn-hidden-size 10240
---num-attention-heads 20 --vocab-size 154880
---make-vocab-size-divisible-by 64
---normalization RMSNorm --norm-epsilon 1e-5 --swiglu
---disable-bias-linear --add-qkv-bias --qk-layernorm
---untie-embeddings-and-output-weights
---position-embedding-type rope --no-position-embedding
---use-rotary-position-embeddings --rotary-base 1000000 --no-rope-fusion
---multi-latent-attention
---q-lora-rank 768 --kv-lora-rank 512
---qk-head-dim 192 --v-head-dim 256 --kv-channels 192
---qk-pos-emb-head-dim 64
---num-experts 64
---moe-layer-freq "[0]*1+[1]*46"
---moe-shared-expert-intermediate-size 1536
---moe-router-topk 4
---moe-grouped-gemm --moe-permute-fusion
---moe-ffn-hidden-size 1536
---moe-router-score-function sigmoid
---moe-router-pre-softmax
---moe-router-enable-expert-bias
---moe-router-bias-update-rate 0
---moe-router-load-balancing-type aux_loss
---moe-router-topk-scaling-factor 1.8
---moe-aux-loss-coeff 0
---moe-router-dtype fp32
---moe-token-dispatcher-type flex
---moe-enable-deepep
-"""
-
-# GLM training args with MoE parallelism
-GLM_4_7_TRAINING_ARGS = """
---tensor-model-parallel-size 8 --pipeline-model-parallel-size 4
---context-parallel-size 2
---expert-model-parallel-size 16 --expert-tensor-parallel-size 1
---sequence-parallel
---decoder-last-pipeline-num-layers 23
---recompute-granularity full --recompute-method uniform --recompute-num-layers 1
---use-dynamic-batch-size --max-tokens-per-gpu 16384
---megatron-to-hf-mode bridge
---attention-dropout 0.0 --hidden-dropout 0.0
---attention-backend flash
---optimizer-cpu-offload --overlap-cpu-optimizer-d2h-h2d
---use-precision-aware-optimizer
-"""
-
-GLM_4_7_FLASH_TRAINING_ARGS = """
---tensor-model-parallel-size 4 --pipeline-model-parallel-size 2
---context-parallel-size 2
---expert-model-parallel-size 8 --expert-tensor-parallel-size 1
---sequence-parallel
---decoder-last-pipeline-num-layers 23
---recompute-granularity full --recompute-method uniform --recompute-num-layers 1
---use-dynamic-batch-size --max-tokens-per-gpu 32768
---megatron-to-hf-mode bridge
---attention-dropout 0.0 --hidden-dropout 0.0
---attention-backend flash
---optimizer-cpu-offload --overlap-cpu-optimizer-d2h-h2d
---use-precision-aware-optimizer
-"""

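For context, the removed constants are plain multi-line strings of Megatron-style CLI flags, the kind that is typically consumed by splitting it into argv tokens. Below is a minimal sketch of that pattern, not code from this repo: the build_argv helper, the shlex-based splitting, and the GLM_4_7_EXCERPT string are illustrative assumptions. It also sanity-checks that the removed --moe-layer-freq patterns match the corresponding --num-layers values.

import shlex

# Hypothetical helper (an assumption, not slime's actual API): flatten a
# multi-line flag string like the removed GLM_4_7_MODEL_ARGS into an argv
# list, keeping quoted values such as --moe-layer-freq "[0]*3+[1]*89" intact.
def build_argv(arg_string: str) -> list[str]:
    return shlex.split(arg_string)

# Small excerpt of the removed GLM-4.7 args, reused only to show the format.
GLM_4_7_EXCERPT = """
--num-layers 92 --hidden-size 5120
--moe-layer-freq "[0]*3+[1]*89"
"""

print(build_argv(GLM_4_7_EXCERPT))
# ['--num-layers', '92', '--hidden-size', '5120',
#  '--moe-layer-freq', '[0]*3+[1]*89']

# Arithmetic check on the removed configs: the MoE layer-frequency pattern
# [0]*3 + [1]*89 has 3 + 89 = 92 entries, matching --num-layers 92 for
# GLM-4.7, and [0]*1 + [1]*46 has 47 entries, matching --num-layers 47 for
# GLM-4.7-Flash (dense layers marked 0, MoE layers marked 1).
assert len([0] * 3 + [1] * 89) == 92
assert len([0] * 1 + [1] * 46) == 47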
slime/configs/glm_4_7.py

Lines changed: 0 additions & 87 deletions
This file was deleted.

slime/configs/glm_4_7_flash.py

Lines changed: 0 additions & 89 deletions
This file was deleted.

slime/modal_train.py

Lines changed: 2 additions & 2 deletions
@@ -346,7 +346,7 @@ def list_available_configs():
 
 @app.function(
     image=image,
-    gpu="H200:8", # GLM-4.7 needs H200s for memory
+    gpu="H200:8",
     volumes={
         HF_CACHE_PATH.as_posix(): hf_cache_volume,
         CHECKPOINTS_PATH.as_posix(): checkpoints_volume,
@@ -362,7 +362,7 @@ def list_available_configs():
 )
 @modal.experimental.clustered(
     4, rdma=True
-) # 12 nodes for GLM-4.7 (8 train + 4 rollout)
+)
 async def train_multi_node(config: str = "qwen-0.5b-sync"):
     """Main entry point for multi-node GRPO training on Modal.
 
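Pieced together from the two hunks above, the decorator stack around the Modal entry point now reads roughly as below. The app, image, path, and volume objects here are stand-ins (assumptions) so the sketch is self-contained; in the commit itself only the two trailing GLM comments were removed, while the gpu="H200:8" value and the clustered(4, rdma=True) call are unchanged.

from pathlib import Path

import modal

# Stand-in definitions (assumptions); the real ones live elsewhere in
# slime/modal_train.py.
app = modal.App("slime-train")
image = modal.Image.debian_slim()
HF_CACHE_PATH = Path("/hf-cache")
CHECKPOINTS_PATH = Path("/checkpoints")
hf_cache_volume = modal.Volume.from_name("slime-hf-cache", create_if_missing=True)
checkpoints_volume = modal.Volume.from_name("slime-checkpoints", create_if_missing=True)


@app.function(
    image=image,
    gpu="H200:8",  # value unchanged; only the "GLM-4.7 needs H200s" comment was dropped
    volumes={
        HF_CACHE_PATH.as_posix(): hf_cache_volume,
        CHECKPOINTS_PATH.as_posix(): checkpoints_volume,
    },
)
@modal.experimental.clustered(
    4, rdma=True
)  # the "12 nodes for GLM-4.7 (8 train + 4 rollout)" comment was dropped here
async def train_multi_node(config: str = "qwen-0.5b-sync"):
    """Main entry point for multi-node GRPO training on Modal."""
    ...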