# workload_card_template.yaml
version: 1
description: > # Description of the workload.
hf_url: # Hugging Face model link, e.g., https://huggingface.co/meta-llama/Llama-3.1-8B
trace_url: # Trace URL or file path, e.g., https://drive.google.com/file/d/1EK_YROAho2sBvYVn8idF6DbnaG3SZ0he/view
contributor: # Your name, please.
contact: # Your email, please.
workload:
  model:
    phase: training # Training or inference phase, e.g., training, inference, serving.
    moe: false # Mixture of Experts (MoE) enabled or not, e.g., true, false.
    granularity: model_fwd_bwd_pass # Granularity level, e.g., "model_fwd_bwd_pass", "model_fwd", "layer", "kernel".
    model_family: llama-3.1-8b # Model family name, e.g., "llama-3.1-8b".
    precision: bf16 # Precision type, e.g., "fp32", "bf16".
    epochs: 1 # Number of epochs (may not apply to inference), e.g., 1, 3, 5.
    iteration: 5 # Number of iterations per epoch, e.g., 10, 500, 1000.
    model_arch:
      num_params: 16380544000 # Total parameter count.
      num_params_embedding: 204800000 # vocab_size(100000) * hidden_size(2048).
      num_layers: 28 # L: transformer decoder layers.
      num_heads: 16 # Q: number of attention heads.
      head_dim: 128 # H: per-head dimension (hidden_size / num_heads).
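      # Sanity check (derived from the example values above, not part of the
      # schema): hidden_size = num_heads * head_dim = 16 * 128 = 2048, which
      # matches the hidden_size(2048) used in the num_params_embedding comment.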
  data:
    batch_size: 4 # Global batch size, e.g., 4, 8, 16.
    seq_len: 8192 # Sequence length, e.g., 512, 1024, 8192.
    input_len: 1024 # For inference.
    output_len: 128 # For inference.
    dataset: c4 # Dataset name, e.g., "c4", "wikitext".
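    # Worked example (assumption: batch_size counts sequences, not tokens):
    # tokens per global batch = batch_size * seq_len = 4 * 8192 = 32768.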
hardware:
  network_topo:
    topology: slingshot # Network topology, e.g., "slingshot", "fat-tree".
    bandwidth_gbps:
      - 200 # Scale-out bandwidth in Gbps, e.g., 100, 200.
      - 2000 # Scale-up bandwidth in Gbps, e.g., 1000, 2000.
  xpu_spec:
    type: GPU # Processing unit type, e.g., "GPU", "TPU".
    model: nvidia_a100 # Processing unit model, e.g., "nvidia_a100", "tpu_v4".
    total_count: 16 # Total number of processing units, e.g., 8, 16.
    count_per_node: 4 # Number of processing units per node, e.g., 4, 8.
    driver_version: cuda_12.4 # Driver version, e.g., "cuda_12.4", "rocm_5.6".
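    # Implied cluster shape (arithmetic on the example values above):
    # nodes = total_count / count_per_node = 16 / 4 = 4.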
model_executor:
  framework:
    name: torchtitan # Framework name, e.g., "torchtitan", "deepspeed".
    compiler_tool_selection: plain_pytorch # Compiler tool, e.g., "plain_pytorch", "triton".
  model_plan_parallelization:
    dp_replicate: 1 # Data parallel replication factor, e.g., 1, 2.
    dp_shard: 2 # Data parallel sharding factor, e.g., 1, 2.
    tp: 4 # Tensor parallelism factor, e.g., 2, 4.
    pp: 2 # Pipeline parallelism factor, e.g., 1, 2.
    cp: 1 # Context parallelism factor, e.g., 1, 2.
    ep: 1 # Expert parallelism factor (for MoE models), e.g., 1, 2.
    pp_mb: 1 # Pipeline model parallel micro-batch size, e.g., 1, 2.
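    # Consistency check (assumption: cluster size equals the product of the
    # parallelism degrees): dp_replicate * dp_shard * tp * pp * cp
    # = 1 * 2 * 4 * 2 * 1 = 16, matching xpu_spec.total_count above.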
  communication_library:
    name: # Communication library name, e.g., "NCCL", "Gloo".
    version: # Communication library version, e.g., "2.14.3", "1.10.0".
    env:
      NCCL_IB_QPS_PER_CONNECTION: # NCCL environment variable, e.g., 8, 16.
  protocol_selection:
    - rocev2 # Communication protocol for scale-out, e.g., "tcp", "rocev2".
    - p2p # Communication protocol for scale-up, e.g., "p2p", "memcpy".
metric_source:
  traces:
    - # Trace type, e.g., "nsys", "json".
  metrics_specific_trace:
    - # Metric-specific trace type, e.g., "memory_trace", "accuracy_trace".
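# Example of a filled-in metric_source (hypothetical values for illustration,
# drawn from the "e.g." lists above):
# metric_source:
#   traces:
#     - nsys
#   metrics_specific_trace:
#     - memory_trace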