Megatron-MoE-ModelZoo/runtime_configs/benchmarking/runtime.conf at main · yanring/Megatron-MoE-ModelZoo · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
#! /bin/bash

# Benchmark presets, keyed by model name.
# Each value is a single row of whitespace-separated fields in the column
# order shown in the header line below; the bracketed PP_FIRST / PP_LAST
# columns are optional and only present on some rows.
declare -A BENCHMARKING_MODEL_CONFIGS=(
    # MODEL:                  TP PP EP CP VPP MBS GBS LAYERS DISPATCHER GROUPED_GEMM NNODES RUN_TIME PRETRAIN SLEN    DATASET   [PP_FIRST PP_LAST]
    ['Mixtral-8x2B']='        1  1  8  1  1   2   256   24    alltoall     false      8     00:20:00    1     4096  CustomDataset'
    ['Mixtral-8x7B']='        1  4  8  1  8   1   256   32    alltoall     true       8     00:20:00    0     4096  CustomDataset'
    ['Mixtral-8x22B']='       2  8  8  1  7   1   256   56    alltoall     true       16    00:20:00    0     4096  CustomDataset'
    ['DeepSeek-V2']='         1  16 8  1  2   1   1024  60    alltoall     true       32    00:20:00    0     4096  CustomDataset    2  2'
    ['DeepSeek-V2-Lite']='    1  1  8  1  1   1   512   27    alltoall     true       1     00:20:00    0     4096  CustomDataset'
    ['DeepSeek-V3']='         1  16 64 1  1   1   8192  61    flex         true       128   00:40:00    0     4096  CustomDataset    4  1'
    ['DeepSeek-V3-Proxy']='   1  4  16 1  1   1   512   14    flex         true       8     00:20:00    0     4096  CustomDataset    2  2'
    # Qwen3-235B-A22B: the layer count accounts for the loss and embedding.
    ['Qwen3-235B-A22B']='     2  8  32 1  4   1   256   96    flex         true       32    00:20:00    0     4096  CustomDataset'
    ['Qwen3-30B-A3B']='       1  1  8  1  1   1   256   48    alltoall     true       4     00:20:00    0     4096  CustomDataset'
    ['Qwen3-Next-80B-A3B']='  1  4  8  1  3   1   256   48    flex         true       8     00:20:00    0     4096  CustomDataset'
)

#######################################
# Export the runtime settings for one model preset.
#
# Looks up the row stored under the given model name in an associative
# array, splits it into whitespace-separated fields, and exports one
# environment variable per field. A variable that is already set to a
# non-empty value keeps that value; the table only supplies defaults.
#
# Field order within a row:
#    0 TP        1 PP       2 EP        3 CP          4 VPP
#    5 MBS       6 GBS      7 NUM_LAYERS
#    8 MOE_TOKEN_DISPATCHER             9 MOE_GROUPED_GEMM
#   10 NNODES   11 RUN_TIME 12 PRETRAIN 13 SEQ_LEN   14 DATASET
#   15 PP_FIRST (optional)  16 PP_LAST (optional)
#
# Derived values:
#   LAYERS_PER_VP - NUM_LAYERS / PP / VPP (integer division) when VPP > 1,
#                   unless already set; the literal "false" otherwise.
#   PP_FIRST,     - fields 15/16 when NUM_LAYERS is not a multiple of PP,
#   PP_LAST         unless already set; the literal "false" otherwise.
#
# Arguments:
#   $1 - name of the associative array holding the presets. It is bound via
#        a nameref, so it must not itself be called `model_configs`.
#   $2 - model name (a key of that array).
# Outputs:
#   Writes an error message to stderr when the model name is not a key.
# Returns:
#   0 on success, 1 when the model name is empty or unknown.
#######################################
set_model_configs() {
    local -n model_configs=$1
    local model_name=${2:-}
    local -a config=()

    # Reject an empty name before touching the array (an empty associative
    # subscript is an error in bash), then test key existence with ${...+set}
    # so the user-supplied name is never re-evaluated as a subscript
    # expression the way `[[ -v arr[$key] ]]` can.
    if [[ -z $model_name ]] || [[ -z ${model_configs[$model_name]+set} ]]; then
        printf 'Error: Invalid model %s.\n' "$model_name" >&2
        return 1
    fi

    # Split the row on blanks/newlines with an explicit IFS: this does not
    # depend on the caller's IFS and, unlike an unquoted expansion, never
    # glob-expands a field. With -d '' read consumes the whole value and
    # reports EOF as a non-zero status, hence the `|| true`.
    IFS=$' \t\n' read -r -d '' -a config <<< "${model_configs[$model_name]}" || true

    # Parallelism configurations
    export TP="${TP:-${config[0]:-}}"
    export PP="${PP:-${config[1]:-}}"
    export EP="${EP:-${config[2]:-}}"
    export CP="${CP:-${config[3]:-}}"
    export VPP="${VPP:-${config[4]:-}}"

    # Batch size configurations
    export MBS="${MBS:-${config[5]:-}}"
    export GBS="${GBS:-${config[6]:-}}"

    # Model architecture configurations
    export NUM_LAYERS="${NUM_LAYERS:-${config[7]:-}}"

    # MoE configurations
    export MOE_TOKEN_DISPATCHER="${MOE_TOKEN_DISPATCHER:-${config[8]:-}}"
    export MOE_GROUPED_GEMM="${MOE_GROUPED_GEMM:-${config[9]:-}}"

    # Training configurations
    export NNODES="${NNODES:-${config[10]:-}}"
    export RUN_TIME="${RUN_TIME:-${config[11]:-}}"
    export PRETRAIN="${PRETRAIN:-${config[12]:-}}"

    # Data configurations
    export SEQ_LEN="${SEQ_LEN:-${config[13]:-}}"
    export DATASET="${DATASET:-${config[14]:-}}"

    # Virtual pipeline parallelism configurations
    if [[ $VPP -gt 1 ]]; then
        export LAYERS_PER_VP="${LAYERS_PER_VP:-$((NUM_LAYERS / PP / VPP))}"
    else
        # Any pre-set LAYERS_PER_VP is deliberately overwritten here.
        export LAYERS_PER_VP=false
    fi

    # Uneven pipeline parallelism configurations. Fields 15/16 are optional,
    # so they default to empty when the row does not carry them.
    if [[ $((NUM_LAYERS % PP)) -ne 0 ]]; then
        export PP_FIRST="${PP_FIRST:-${config[15]:-}}"
        export PP_LAST="${PP_LAST:-${config[16]:-}}"
    else
        export PP_FIRST=false
        export PP_LAST=false
    fi
}

# Set model configurations based on MODEL.
# MODEL is quoted so that a value containing whitespace or glob characters
# reaches the function as one argument; an unset MODEL is passed as an empty
# name instead of silently dropping the argument.
set_model_configs BENCHMARKING_MODEL_CONFIGS "${MODEL:-}"