Skip to content

Commit 4950f85

Browse files
authored
Add config for H100 94GB (#225)
* Update 2x H100 superbench config * Add H100 94 GB config
1 parent 96934a9 commit 4950f85

File tree

3 files changed

+232
-2
lines changed

3 files changed

+232
-2
lines changed

.pipelines/azure-pipelines-linux.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ resources:
1616
options: --entrypoint=""
1717

1818
variables:
19-
VcVersion : 1.13.5
19+
VcVersion : 1.13.6
2020
ROOT: $(Build.SourcesDirectory)
2121
CDP_DEFINITION_BUILD_COUNT: $[counter('', 0)] # needed for onebranch.pipeline.version task https://aka.ms/obpipelines/versioning
2222
ENABLE_PRS_DELAYSIGN: 1

.pipelines/azure-pipelines.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ pool:
1818
vmImage: windows-latest
1919

2020
variables:
21-
VcVersion : 1.13.5
21+
VcVersion : 1.13.6
2222
ROOT: $(Build.SourcesDirectory)
2323
CDP_DEFINITION_BUILD_COUNT: $[counter('', 0)] # needed for onebranch.pipeline.version task https://aka.ms/obpipelines/versioning
2424
ENABLE_PRS_DELAYSIGN: 1
Lines changed: 230 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,230 @@
1+
version: v0.8
2+
superbench:
3+
enable:
4+
# microbenchmark - computation
5+
- kernel-launch
6+
- gemm-flops
7+
- cublaslt-gemm
8+
# cublas-function
9+
- matmul
10+
- gpu-burn
11+
# microbenchmark - communication
12+
- cpu-memory-bw-latency
13+
- mem-bw
14+
- gpu-copy-bw:perf
15+
- gpu-copy-bw:correctness
16+
- nccl-bw:nvlink
17+
# microbenchmark - comput-comm. overlap
18+
- computation-communication-overlap
19+
- sharding-matmul
20+
# microbenchmark - storage
21+
# model benchmark - inference
22+
- ort-inference
23+
# model benchmark - training
24+
- model-benchmarks:gpt
25+
# model-benchmarks:bert
26+
- model-benchmarks:lstm
27+
- model-benchmarks:resnet
28+
- model-benchmarks:densenet
29+
- model-benchmarks:vgg
30+
- model-benchmarks:stress
31+
monitor:
32+
enable: false
33+
var:
34+
default_timeout: &default_timeout 600
35+
default_local_mode: &default_local_mode
36+
modes:
37+
- name: local
38+
proc_num: 2
39+
prefix: CUDA_VISIBLE_DEVICES={proc_rank}
40+
parallel: yes
41+
default_pytorch_mode: &default_pytorch_mode
42+
modes:
43+
- name: torch.distributed
44+
proc_num: 2
45+
node_num: 1
46+
frameworks: [pytorch]
47+
model_ddp_parameter: &model_ddp_param
48+
duration: 0
49+
num_warmup: 64
50+
num_steps: 2048
51+
sample_count: 8192
52+
batch_size: 32
53+
precision: [float32, float16]
54+
model_action: [train]
55+
pin_memory: yes
56+
nccl_parameter: &nccl_param
57+
minbytes: 1K
58+
maxbytes: 16G
59+
stepfactor: 2
60+
check: 1
61+
warmup_iters: 20
62+
iters: 100
63+
benchmarks:
64+
# microbenchmark - computation
65+
kernel-launch:
66+
<<: *default_local_mode
67+
timeout: *default_timeout
68+
gemm-flops:
69+
<<: *default_local_mode
70+
timeout: 3600
71+
cublaslt-gemm:
72+
<<: *default_local_mode
73+
timeout: *default_timeout
74+
parameters:
75+
shapes:
76+
- 4096,4096,4096
77+
- 8192,8192,8192
78+
- 16384,16384,16384
79+
- 12608,1024,1024
80+
- 12608,4096,1024
81+
- 12608,1024,3072
82+
- 12608,1024,4096
83+
- 12608,3072,1024
84+
cublas-function:
85+
<<: *default_local_mode
86+
timeout: 1200
87+
matmul:
88+
<<: *default_local_mode
89+
timeout: *default_timeout
90+
frameworks: [pytorch]
91+
gpu-burn:
92+
timeout: 1800
93+
modes:
94+
- name: local
95+
parallel: no
96+
parameters:
97+
time: 900
98+
doubles: true
99+
tensor_core: true
100+
# microbenchmark - communication
101+
cpu-memory-bw-latency:
102+
timeout: *default_timeout
103+
modes:
104+
- name: local
105+
parallel: no
106+
parameters:
107+
tests:
108+
- bandwidth_matrix
109+
- latency_matrix
110+
- max_bandwidth
111+
mem-bw:
112+
timeout: *default_timeout
113+
modes:
114+
- name: local
115+
proc_num: 2
116+
prefix: CUDA_VISIBLE_DEVICES={proc_rank} numactl -N {proc_rank}
117+
parallel: no
118+
gpu-copy-bw:perf:
119+
timeout: 1200
120+
modes:
121+
- name: local
122+
parallel: no
123+
parameters:
124+
mem_type: [htod, dtoh, dtod]
125+
copy_type: [sm, dma]
126+
gpu-copy-bw:correctness:
127+
timeout: *default_timeout
128+
modes:
129+
- name: local
130+
parallel: no
131+
parameters:
132+
mem_type: [htod, dtoh, dtod]
133+
copy_type: [sm, dma]
134+
size: 4096
135+
num_warm_up: 0
136+
num_loops: 1
137+
check_data: true
138+
nccl-bw:nvlink:
139+
timeout: *default_timeout
140+
modes:
141+
- name: mpi
142+
proc_num: 2
143+
node_num: 1
144+
parameters:
145+
<<: *nccl_param
146+
# microbenchmark - comput-comm. overlap
147+
computation-communication-overlap:
148+
<<: *default_pytorch_mode
149+
timeout: *default_timeout
150+
sharding-matmul:
151+
<<: *default_pytorch_mode
152+
timeout: *default_timeout
153+
# model benchmark - inference
154+
ort-inference:
155+
<<: *default_local_mode
156+
timeout: *default_timeout
157+
# model benchmark - training
158+
model-benchmarks:gpt:
159+
<<: *default_pytorch_mode
160+
timeout: 1800
161+
models:
162+
- gpt2-small
163+
- gpt2-large
164+
parameters:
165+
<<: *model_ddp_param
166+
batch_size: 8
167+
seq_len: 224
168+
model-benchmarks:bert:
169+
<<: *default_pytorch_mode
170+
timeout: 4800
171+
models:
172+
- bert-base
173+
- bert-large
174+
parameters:
175+
<<: *model_ddp_param
176+
precision: [float32, float16, fp8_hybrid]
177+
seq_len: 224
178+
model-benchmarks:lstm:
179+
<<: *default_pytorch_mode
180+
timeout: *default_timeout
181+
models:
182+
- lstm
183+
parameters:
184+
<<: *model_ddp_param
185+
batch_size: 224
186+
input_size: 224
187+
hidden_size: 1000
188+
seq_len: 32
189+
pin_memory: no
190+
model-benchmarks:resnet:
191+
<<: *default_pytorch_mode
192+
timeout: 1800
193+
models:
194+
- resnet50
195+
- resnet101
196+
- resnet152
197+
parameters:
198+
<<: *model_ddp_param
199+
batch_size: 192
200+
num_steps: 512
201+
model-benchmarks:densenet:
202+
<<: *default_pytorch_mode
203+
timeout: 1800
204+
models:
205+
- densenet169
206+
- densenet201
207+
parameters:
208+
<<: *model_ddp_param
209+
pin_memory: no
210+
model-benchmarks:vgg:
211+
<<: *default_pytorch_mode
212+
timeout: 1800
213+
models:
214+
- vgg11
215+
- vgg13
216+
- vgg16
217+
- vgg19
218+
parameters:
219+
<<: *model_ddp_param
220+
pin_memory: no
221+
model-benchmarks:stress:
222+
<<: *default_pytorch_mode
223+
timeout: 7200
224+
models:
225+
- bert-large
226+
parameters:
227+
<<: *model_ddp_param
228+
seq_len: 224
229+
duration: 1800
230+
num_steps: -100

0 commit comments

Comments
 (0)