|
| 1 | +experiment_name: mini-model |
| 2 | +trial_name: on-policy |
| 3 | +allocation_mode: "sglang:d8t4p1+d8t1p4" |
| 4 | +seed: 42 |
| 5 | +total_train_epochs: 10 |
| 6 | +total_train_steps: 1145 |
| 7 | +weight_update_type: "astate" |
| 8 | +enable_colocate_mode: true |
| 9 | + |
| 10 | +storage_prefix: "/storage/openpsi" |
| 11 | + |
| 12 | +tokenizer_path: "/storage/xukuan.xk/repos/antnlp/personal/pretrained_models/ring-moe-v2-sft-general700w_longcot200w_0725/hf_ckpts/28869_kz" |
| 13 | +train_dataset: |
| 14 | + path: "/storage/dataset/nlp/areal/moe_lite_math_0527_merge_train_areal.jsonl" |
| 15 | + shuffle: true |
| 16 | + max_length: 1024 |
| 17 | + batch_size: 64 |
| 18 | + type: "rl" |
| 19 | + |
| 20 | +scheduler: |
| 21 | + endpoint: "http://asystem-scheduler.asystem-my001-swift.svc.sigma-my001.ml01.sgp-ml.local:8081" |
| 22 | + functioncall_service_domain: "http://110.75.237.19:8080" |
| 23 | + reward_model_path: "/storage/jiulin.jl/Skywork-Reward-V2-Qwen3-8B" |
| 24 | + reward_model_service_url: "http://reward-model-service.asystem-test.svc.sigma-my001.ml01.sgp-ml.local:30000/classify" |
| 25 | + |
| 26 | +stats_logger: |
| 27 | + experiment_name: ${experiment_name} |
| 28 | + trial_name: ${trial_name} |
| 29 | + fileroot: "${storage_prefix}/experiments" |
| 30 | + wandb: |
| 31 | + mode: "online" |
| 32 | + wandb_base_url: "https://slurm.alipay.com" |
| 33 | + wandb_api_key: "local-3bca3d5f00a980f3075b3e8ff2e16adc4ef43ffe" |
| 34 | + tensorboard: |
| 35 | + path: "/home/admin/logs/tfevent/asystem" |
| 36 | + |
| 37 | +gconfig: |
| 38 | + n_samples: 8 |
| 39 | + min_new_tokens: 0 |
| 40 | + # NOTE!! |
| 41 | + # Due to a limitation of sglang, max_new_tokens + max_prompt_len must be strictly less than the model's context_len (set in the model's config.json); |
| 42 | + # the sum cannot be equal to it. See https://github.com/sgl-project/sglang/blob/f98366604b23e331422bf3c62d4e7410ae4fab87/python/sglang/srt/managers/tokenizer_manager.py#L638C9-L638C11 |
| 43 | + max_new_tokens: 15360 |
| 44 | + greedy: false |
| 45 | + temperature: 1.0 |
| 46 | + top_k: 1000000 |
| 47 | + top_p: 1.0 |
| 48 | + |
| 49 | +rollout: |
| 50 | + experiment_name: ${experiment_name} |
| 51 | + trial_name: ${trial_name} |
| 52 | + model_path: ${tokenizer_path} |
| 53 | + storage_path: "${storage_prefix}/checkpoints" |
| 54 | + seed: ${seed} |
| 55 | + engine_config: |
| 56 | + attention_backend: "triton" |
| 57 | + disable_custom_all_reduce: true |
| 58 | + enable_metrics: true |
| 59 | + mem_fraction_static: 0.7 |
| 60 | + triton_attention_num_kv_splits: 16 |
| 61 | + tokenizer_mode: "auto" |
| 62 | + load_format: "auto" |
| 63 | + is_embedding: false |
| 64 | + kv_cache_dtype: "auto" |
| 65 | + max_prefill_tokens: 32768 |
| 66 | + schedule_policy: "fcfs" |
| 67 | + schedule_conservativeness: 1.0 |
| 68 | + disable_cuda_graph: false |
| 69 | + disable_radix_cache: true |
| 70 | + disable_cuda_graph_padding: false |
| 71 | + enable_nccl_nvls: false |
| 72 | + disable_outlines_disk_cache: false |
| 73 | + disable_overlap_schedule: false |
| 74 | + enable_mixed_chunk: false |
| 75 | + enable_dp_attention: false |
| 76 | + enable_ep_moe: false |
| 77 | + enable_torch_compile: false |
| 78 | + torch_compile_max_bs: 32 |
| 79 | + triton_attention_reduce_in_fp32: false |
| 80 | + cuda_graph_bs: [ 1, 2, 4, 8, 16, 32, 64, 128, 256, 384, 512 ] |
| 81 | + num_continuous_decode_steps: 1 |
| 82 | + enable_nan_detection: false |
| 83 | + allow_auto_truncate: false |
| 84 | + enable_p2p_check: false |
| 85 | + enable_memory_saver: false |
| 86 | + chunked_prefill_size: null |
| 87 | + context_length: null |
| 88 | + cpu_offload_gb: 0 |
| 89 | + dp_size: 1 |
| 90 | + dtype: "auto" |
| 91 | + sampling_backend: "pytorch" |
| 92 | + log_level: "info" |
| 93 | + log_level_http: null |
| 94 | + log_requests: false |
| 95 | + log_requests_level: 0 |
| 96 | + max_running_requests: null |
| 97 | + show_time_cost: false |
| 98 | + scheduling_specs: |
| 99 | + - type: worker |
| 100 | + image: /storage/openpsi/images/areal-25.01-sglang-bf16-editable-metrics-xccl-20250716.sif |
| 101 | + - type: engine |
| 102 | + gpu: 1 |
| 103 | + # you can customize environment variables here |
| 104 | + env_vars: |
| 105 | + # if using ling max v2, USE_MAX_V2 must be set to 1 |
| 106 | + USE_MAX_V2: 1 |
| 107 | + image: /storage/openpsi/images/hybrid-engine-13680179-20250923154343.sif |
| 108 | + |
| 109 | +actor: &actor_ref |
| 110 | + experiment_name: ${experiment_name} |
| 111 | + trial_name: ${trial_name} |
| 112 | + hybrid_engine: |
| 113 | + experiment_name: ${experiment_name} |
| 114 | + trial_name: ${trial_name} |
| 115 | + group_size: ${gconfig.n_samples} |
| 116 | + train_bs_n_seqs: ${train_dataset.batch_size} |
| 117 | + max_tokens_per_mb: 16384 |
| 118 | + wrap_policy: |
| 119 | + n_minibatches: 1 |
| 120 | + kl_ctl: 0.0 |
| 121 | + recompute_logp: false |
| 122 | + adv_norm: false |
| 123 | + discount: 1.0 |
| 124 | + gae_lambda: 1.0 |
| 125 | + eps_clip: 0.2 |
| 126 | + clip_ratio_low: 0.2 |
| 127 | + clip_ratio_high: 0.28 |
| 128 | + c_clip: null |
| 129 | + value_eps_clip: 0.2 |
| 130 | + max_reward_clip: 5.0 |
| 131 | + disable_value: true |
| 132 | + early_stop_kl: null |
| 133 | + early_stop_imp_ratio: null |
| 134 | + adaptive_kl_ctl: false |
| 135 | + adaptive_kl_target: 6 |
| 136 | + adaptive_kl_horizon: 10000 |
| 137 | + enable_save: true |
| 138 | + value_norm: true |
| 139 | + value_norm_type: "exp" |
| 140 | + value_norm_beta: 0.99995 |
| 141 | + value_norm_eps: 1e-5 |
| 142 | + group_size: 8 |
| 143 | + generation_size: null |
| 144 | + mask_no_eos_with_zero: false |
| 145 | + group_adv_norm: true |
| 146 | + mask_too_long: false |
| 147 | + use_dense_reward: false |
| 148 | + reward_delta: true |
| 149 | + token_normalize_scope: "global" |
| 150 | + sample_reuse: 1 |
| 151 | + temperature: 1.0 |
| 152 | + reward_output_scaling: 0.5 |
| 153 | + reward_output_bias: -1.0 |
| 154 | + remote_megatron_config: |
| 155 | + adam_beta1: 0.9 |
| 156 | + adam_beta2: 0.999 |
| 157 | + adam_eps: 1.0e-08 |
| 158 | + adaptive_layer_bias_update_strategy: sqrt |
| 159 | + add_bias_linear: false |
| 160 | + add_position_embedding: true |
| 161 | + apply_rope_fusion: true |
| 162 | + async_save: false |
| 163 | + attention_backend: "flash" |
| 164 | + attention_dropout: 0.0 |
| 165 | + attention_softmax_in_fp32: true |
| 166 | + auto_detect_ckpt_format: true |
| 167 | + bf16: true |
| 168 | + clip_grad: 1.0 |
| 169 | + context_parallel_size: 1 |
| 170 | + cp_comm_type: "p2p" |
| 171 | + cross_entropy_loss_fusion: false |
| 172 | + distributed_backend: "nccl" |
| 173 | + distributed_timeout_minutes: 600 |
| 174 | + enable_one_logger: false |
| 175 | + expert_model_parallel_size: 8 |
| 176 | + ffn_hidden_size: 5120 |
| 177 | + first_k_dense_replace: 1 |
| 178 | + global_batch_size: 512 |
| 179 | + gradient_accumulation_fusion: true |
| 180 | + group_query_attention: true |
| 181 | + hidden_dropout: 0.0 |
| 182 | + hidden_size: 2048 |
| 183 | + init_method_std: 0.006 |
| 184 | + load: /storage/xukuan.xk/repos/antnlp/personal/pretrained_models/ring-moe-v2-sft-general700w_longcot200w_0725/iter_0028869 |
| 185 | + log_loss_scale_to_tensorboard: false |
| 186 | + log_num_zeros_in_grad: true |
| 187 | + log_params_norm: true |
| 188 | + log_throughput: true |
| 189 | + log_timers_to_tensorboard: true |
| 190 | + log_validation_ppl_to_tensorboard: true |
| 191 | + lr: 3.0e-06 |
| 192 | + lr_decay_style: constant |
| 193 | + lr_warmup_iters: 10 |
| 194 | + make_vocab_size_divisible_by: 128 |
| 195 | + masked_softmax_fusion: true |
| 196 | + max_position_embeddings: 16384 |
| 197 | + micro_batch_size: 1 |
| 198 | + moe_ffn_hidden_size: 512 |
| 199 | + moe_grouped_gemm: true |
| 200 | + moe_layer_freq: |
| 201 | + - 0 |
| 202 | + - 1 |
| 203 | + - 1 |
| 204 | + - 1 |
| 205 | + - 1 |
| 206 | + - 1 |
| 207 | + - 1 |
| 208 | + - 1 |
| 209 | + - 1 |
| 210 | + - 1 |
| 211 | + - 1 |
| 212 | + - 1 |
| 213 | + - 1 |
| 214 | + - 1 |
| 215 | + - 1 |
| 216 | + - 1 |
| 217 | + - 1 |
| 218 | + - 1 |
| 219 | + - 1 |
| 220 | + - 1 |
| 221 | + moe_per_layer_logging: true |
| 222 | + moe_permute_fusion: true |
| 223 | + moe_router_bias_update_rate: 0.00 |
| 224 | + moe_router_dtype: fp32 |
| 225 | + moe_router_enable_expert_bias: true |
| 226 | + moe_router_group_topk: 4 |
| 227 | + moe_router_num_groups: 8 |
| 228 | + moe_router_score_function: sigmoid |
| 229 | + moe_router_topk: 8 |
| 230 | + moe_router_topk_scaling_factor: 2.5 |
| 231 | + moe_shared_expert_intermediate_size: 512 |
| 232 | + moe_shared_expert_overlap: true |
| 233 | + moe_token_dispatcher_type: alltoall |
| 234 | + norm_epsilon: 1.0e-06 |
| 235 | + normalization: "RMSNorm" |
| 236 | + num_attention_heads: 16 |
| 237 | + num_experts: 256 |
| 238 | + num_layers: 20 |
| 239 | + num_query_groups: 4 |
| 240 | + optim_normhead_fwd_alltoall: true |
| 241 | + optimizer: "adam" |
| 242 | + overlap_grad_reduce: true |
| 243 | + overlap_p2p_comm: true |
| 244 | + overlap_param_gather: false |
| 245 | + pipeline_model_parallel_size: 4 |
| 246 | + position_embedding_type: "rope" |
| 247 | + qk_layernorm: true |
| 248 | + recompute_granularity: "full" |
| 249 | + recompute_method: "uniform" |
| 250 | + recompute_num_layers: 5 |
| 251 | + rotary_base: 600000 |
| 252 | + rotary_percent: 0.5 |
| 253 | + save: /mnt/asystem-s3/common/users/senlin.zsl/experiments/2025-07-19_14-32-43/experiments/models/mcore_ckpt_32/asystem_moe_mini |
| 254 | + save_interval: 1 |
| 255 | + seed: 42 |
| 256 | + seq_length: 16384 |
| 257 | + sequence_parallel: true |
| 258 | + skip_casting_dtype_for_param_pattern: '^expert_bias$|.+\.expert_bias$' |
| 259 | + swiglu: true |
| 260 | + tensor_model_parallel_size: 1 |
| 261 | + tensorboard_log_interval: 1 |
| 262 | + tokenizer_model: ${tokenizer_path} |
| 263 | + tokenizer_type: "HuggingFaceTokenizer" |
| 264 | + train_iters: 100000 |
| 265 | + transformer_xl: false |
| 266 | + unidirectional: true |
| 267 | + untie_embeddings_and_output_weights: true |
| 268 | + use_distributed_optimizer: true |
| 269 | + use_flash_attn: true |
| 270 | + use_init_chunk: true |
| 271 | + use_mcore_models: true |
| 272 | + use_norm_head: false |
| 273 | + use_pack_lazy_loader: true |
| 274 | + use_random_logits: true |
| 275 | + use_rotary_position_embeddings: true |
| 276 | + vocab_size: 157184 |
| 277 | + weight_decay: 0.01 |
| 278 | + loss_configs: |
| 279 | + kl_ctl: 0.0 |
| 280 | + scheduling_specs: |
| 281 | + - type: worker |
| 282 | + image: /storage/openpsi/images/areal-25.01-sglang-bf16-editable-metrics-xccl-20250716.sif |
| 283 | + - type: engine |
| 284 | + gpu: 1 |
| 285 | + # you can customize environment variables here |
| 286 | + env_vars: |
| 287 | + # if CUDA_LAUNCH_BLOCKING is not set to 1, the megatron engine will hang in the train phase |
| 288 | + CUDA_LAUNCH_BLOCKING: 1 |
| 289 | + # if using ling max v2, USE_MAX_V2 must be set to 1 |
| 290 | + USE_MAX_V2: 1 |
| 291 | + image: /storage/openpsi/images/hybrid-engine-13680179-20250923154343.sif |
| 292 | + |
| 293 | +ref: |
| 294 | + <<: *actor_ref |
| 295 | + |
| 296 | +recover: |
| 297 | + experiment_name: ${experiment_name} |
| 298 | + trial_name: ${trial_name} |
| 299 | + recover_meta_info_path: "" |
| 300 | + enable_recover: true |
| 301 | + latest_disable_save_hf: true |
| 302 | + periodic_disable_save_hf: false |
| 303 | + latest_save_interval: 1 |
| 304 | + periodic_save_interval: 20 |
| 305 | + fileroot: "${storage_prefix}/experiments" |
0 commit comments