# workload_card_template.yaml
version: 1
description: > # Description of the workload.
hf_url: # Hugging Face model link, e.g., https://huggingface.co/meta-llama/Llama-3.1-8B
trace_url: # Trace URL or file path, e.g., https://drive.google.com/file/d/1EK_YROAho2sBvYVn8idF6DbnaG3SZ0he/view
contributor: # Your name, please.
contact: # Your email, please.
workload:
  model:
    phase: training # Training or inference phase, e.g., training, inference, serving.
    moe: false # Mixture of Experts (MoE) enabled or not, e.g., true, false.
    granularity: model_fwd_bwd_pass # Granularity level, e.g., "model_fwd_bwd_pass", "model_fwd", "layer", "kernel".
    model_family: llama-3.1-8b # Model family name, e.g., "llama-3.1-8b".
    precision: bf16 # Precision type, e.g., "fp32", "bf16".
    epochs: 1 # Number of epochs (may not apply to inference), e.g., 1, 3, 5.
    iteration: 5 # Number of iterations per epoch, e.g., 10, 500, 1000.
    model_arch:
      num_params: 16380544000 # Total parameter count.
      num_params_embedding: 204800000 # vocab_size(100000) * hidden_size(2048).
      num_layers: 28 # L: transformer decoder layers.
      num_heads: 16 # Q: number of attention heads.
      head_dim: 128 # H: per-head dimension (hidden_size / num_heads).
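      # Sanity check (derived from the example values above, not part of the
      # schema): hidden_size = num_heads * head_dim = 16 * 128 = 2048, which
      # matches the hidden_size(2048) used in the num_params_embedding comment.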
  data:
    batch_size: 4 # Global batch size, e.g., 4, 8, 16.
    seq_len: 8192 # Sequence length, e.g., 512, 1024, 8192.
    input_len: 1024 # For inference.
    output_len: 128 # For inference.
    dataset: c4 # Dataset name, e.g., "c4", "wikitext".
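    # Worked example (assumption: batch_size counts sequences, not tokens):
    # tokens per global batch = batch_size * seq_len = 4 * 8192 = 32768.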
hardware:
  network_topo:
    topology: slingshot # Network topology, e.g., "slingshot", "fat-tree".
    bandwidth_gbps:
      - 200 # Scale-out bandwidth in Gbps, e.g., 100, 200.
      - 2000 # Scale-up bandwidth in Gbps, e.g., 1000, 2000.
  xpu_spec:
    type: GPU # Processing unit type, e.g., "GPU", "TPU".
    model: nvidia_a100 # Processing unit model, e.g., "nvidia_a100", "tpu_v4".
    total_count: 16 # Total number of processing units, e.g., 8, 16.
    count_per_node: 4 # Number of processing units per node, e.g., 4, 8.
    driver_version: cuda_12.4 # Driver version, e.g., "cuda_12.4", "rocm_5.6".
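    # Implied cluster shape (arithmetic on the example values above):
    # nodes = total_count / count_per_node = 16 / 4 = 4.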
model_executor:
  framework:
    name: torchtitan # Framework name, e.g., "torchtitan", "deepspeed".
    compiler_tool_selection: plain_pytorch # Compiler tool, e.g., "plain_pytorch", "triton".
  model_plan_parallelization:
    dp_replicate: 1 # Data parallel replication factor, e.g., 1, 2.
    dp_shard: 2 # Data parallel sharding factor, e.g., 1, 2.
    tp: 4 # Tensor parallelism factor, e.g., 2, 4.
    pp: 2 # Pipeline parallelism factor, e.g., 1, 2.
    cp: 1 # Context parallelism factor, e.g., 1, 2.
    ep: 1 # Expert parallelism factor (for MoE models), e.g., 1, 2.
    pp_mb: 1 # Pipeline model parallel micro-batch size, e.g., 1, 2.
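    # Consistency check (assumption: cluster size equals the product of the
    # parallelism degrees): dp_replicate * dp_shard * tp * pp * cp
    # = 1 * 2 * 4 * 2 * 1 = 16, matching xpu_spec.total_count above.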
  communication_library:
    name: # Communication library name, e.g., "NCCL", "Gloo".
    version: # Communication library version, e.g., "2.14.3", "1.10.0".
    env:
      NCCL_IB_QPS_PER_CONNECTION: # NCCL environment variable, e.g., 8, 16.
  protocol_selection:
    - rocev2 # Communication protocol for scale-out, e.g., "tcp", "rocev2".
    - p2p # Communication protocol for scale-up, e.g., "p2p", "memcpy".
metric_source:
  traces:
    - # Trace type, e.g., "nsys", "json".
  metrics_specific_trace:
    - # Metric-specific trace type, e.g., "memory_trace", "accuracy_trace".
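# Example of a filled-in metric_source (hypothetical values for illustration,
# drawn from the "e.g." lists above):
# metric_source:
#   traces:
#     - nsys
#   metrics_specific_trace:
#     - memory_trace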