Skip to content

Commit 2b2ceb8

Browse files
authored
Merge pull request #73 from NVIDIA/wip-fio-support-125K-io-size
Add support for 125KB IO workload
2 parents d21e2d3 + b479378 commit 2b2ceb8

File tree

7 files changed

+225
-10
lines changed

7 files changed

+225
-10
lines changed

.gitlab-ci.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,5 +41,5 @@ test:
4141
- python setup.py bdist_wheel sdist
4242
- pip install dist/nvidia_bobber-*-none-any.whl
4343
- bobber cast /raid
44-
- bobber run-all --ssh-iface enp2s0f0 --iterations 2 --batch-size-sm 512 --batch-size-lg 256 --gpus 4 --bw-threads 16 --iops-threads 200 test_results localhost
44+
- bobber run-all --ssh-iface enp2s0f0 --iterations 2 --batch-size-sm 512 --batch-size-lg 256 --gpus 4 --bw-threads 16 --125k-threads 32 --iops-threads 96 test_results localhost
4545
- bobber parse-results --compare-baseline single-dgx-station-baseline test_results/

bobber/bobber.py

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
RUN_NCCL,
2020
RUN_STG_BW,
2121
RUN_STG_IOPS,
22+
RUN_STG_125K,
2223
RUN_STG_META,
2324
SYSTEMS
2425
)
@@ -129,6 +130,9 @@ def parse_args(version: str) -> Namespace:
129130
commands_parent.add_argument('--bw-threads', help='Maximum number of '
130131
'threads to use for bandwidth tests',
131132
type=int)
133+
commands_parent.add_argument('--125k-threads', dest='stg_125k_threads',
134+
help='Maximum number of threads to use for '
135+
'125K IO size tests', type=int)
132136
commands_parent.add_argument('--iops-threads', help='Maximum number of '
133137
'threads to use for iops tests', type=int)
134138
commands_parent.add_argument('--iterations', help='Number of iterations to'
@@ -143,11 +147,12 @@ def parse_args(version: str) -> Namespace:
143147
'would result in tests for 1, 2, and 3 '
144148
'systems)', action='store_true')
145149
commands_parent.add_argument('--system', help='If system is specified, '
146-
'iops-threads, bw-threads, gpus, batch size, '
147-
'and network interface names are given '
148-
'default values - override by specifying the '
149-
'flags you\'d prefer to override, ignore the '
150-
'flags you are ok with using defaults for '
150+
'iops-threads, 125k-threads, bw-threads, '
151+
'gpus, batch size, and network interface '
152+
'names are given default values - override '
153+
'by specifying the flags you\'d prefer to '
154+
'override, ignore the flags you are ok with '
155+
'using defaults for '
151156
'supported systems: dgx-a100-single, '
152157
'dgx-a100-dual, and dgx-2 for now. -single '
153158
'is used for a system with a single storage '
@@ -170,11 +175,13 @@ def parse_args(version: str) -> Namespace:
170175
parents=[commands_parent])
171176
commands.add_parser(RUN_NCCL, help='Run NCCL tests only',
172177
parents=[commands_parent])
173-
commands.add_parser(RUN_STG_BW, help='Run storage bandwdith tests only',
178+
commands.add_parser(RUN_STG_BW, help='Run storage bandwidth test only',
174179
parents=[commands_parent])
175-
commands.add_parser(RUN_STG_IOPS, help='Run storage IOPS tests only',
180+
commands.add_parser(RUN_STG_125K, help='Run storage 125K IO size test only',
176181
parents=[commands_parent])
177-
commands.add_parser(RUN_STG_META, help='Run storage metadata tests only',
182+
commands.add_parser(RUN_STG_IOPS, help='Run storage IOPS test only',
183+
parents=[commands_parent])
184+
commands.add_parser(RUN_STG_META, help='Run storage metadata test only',
178185
parents=[commands_parent])
179186

180187
# Options specific to exporting the containers

bobber/lib/analysis/aggregate_results.py

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,18 @@ class AggregateResults:
7070
write_iops_params : dict
7171
A ``dictionary`` of the parameters used during the fio write iops
7272
tests.
73+
read_125k_bw : dict
74+
A ``dictionary`` containing all of the fio 125k read bandwidth results
75+
for N-systems.
76+
write_125k_bw : dict
77+
A ``dictionary`` containing all of the fio 125k write bandwidth results
78+
for N-systems.
79+
read_125k_bw_params : dict
80+
A ``dictionary`` of the parameters used during the fio 125k read
81+
bandwidth tests.
82+
write_125k_bw_params : dict
83+
A ``dictionary`` of the parameters used during the fio 125k write
84+
bandwidth tests.
7385
max_bw : dict
7486
A ``dictionary`` of the maximum bus bandwidth achieved from NCCL tests.
7587
bytes_sizes : dict
@@ -93,6 +105,10 @@ def __init__(self,
93105
write_iops: dict,
94106
read_iops_params: dict,
95107
write_iops_params: dict,
108+
read_125k_bw: dict,
109+
write_125k_bw: dict,
110+
read_125k_bw_params: dict,
111+
write_125k_bw_params: dict,
96112
max_bw: dict,
97113
bytes_sizes: dict,
98114
dali_results: dict,
@@ -102,10 +118,14 @@ def __init__(self,
102118
self._read_bw_params = read_bw_params
103119
self._read_iops = read_iops
104120
self._read_iops_params = read_iops_params
121+
self._125k_read_bw = read_125k_bw
122+
self._125k_read_bw_params = read_125k_bw_params
105123
self._write_bw = write_bw
106124
self._write_bw_params = write_bw_params
107125
self._write_iops = write_iops
108126
self._write_iops_params = write_iops_params
127+
self._125k_write_bw = write_125k_bw
128+
self._125k_write_bw_params = write_125k_bw_params
109129
self._max_bw = max_bw
110130
self._bytes_sizes = bytes_sizes
111131
self._dali_results = dali_results
@@ -124,6 +144,8 @@ def __str__(self) -> str:
124144
Aggregate Write Bandwidth: 1.232 GB/s
125145
Aggregate Read IOPS: 136.5 k IOPS
126146
Aggregate Write IOPS: 135.0 k IOPS
147+
Aggregate 125k Read Bandwidth: 1.595 GB/s
148+
Aggregate 125k Write Bandwidth: 1.232 GB/s
127149
NCCL Max Bus Bandwidth: 79.865 at 512.0 MB
128150
Mdtest
129151
Directory creation: 71406.29550000001 ops
@@ -159,6 +181,10 @@ def __str__(self) -> str:
159181
['Systems tested:', self._num_systems, ''],
160182
['Aggregate Read Bandwidth:', self.average_read_bw, ' GB/s'],
161183
['Aggregate Write Bandwidth:', self.average_write_bw, ' GB/s'],
184+
['Aggregate 125k Read Bandwidth:', self.average_125k_read_bw,
185+
' GB/s'],
186+
['Aggregate 125k Write Bandwidth:', self.average_125k_write_bw,
187+
' GB/s'],
162188
['Aggregate Read IOPS:', self.average_read_iops, 'k IOPS'],
163189
['Aggregate Write IOPS:', self.average_write_iops, 'k IOPS'],
164190
]
@@ -275,6 +301,15 @@ def json(self) -> dict:
275301
'write': self._write_iops_params
276302
}
277303
},
304+
'125k_bandwidth': {
305+
'read': self._average_125k_read_bw(),
306+
'write': self._average_125k_write_bw(),
307+
'unit': 'bytes/second',
308+
'parameters': {
309+
'read': self._125k_read_bw_params,
310+
'write': self._125k_write_bw_params
311+
}
312+
},
278313
'nccl': {
279314
'max_bus_bw': self.max_bus_bandwidth,
280315
'max_bus_bytes': self.max_bus_bytes,
@@ -325,6 +360,44 @@ def average_write_bw(self) -> float:
325360
"""
326361
return round(self._average_write_bw() * 1e-9, 3)
327362

363+
@average_decorator
def _average_125k_read_bw(self) -> float:
    """
    Fetch the 125k read bandwidth results for the current system count.

    The entry stored in ``self._125k_read_bw`` under ``self._num_systems``
    is handed back unchanged. ``average_decorator`` wraps this method and
    is presumably what reduces the entry to a single average in B/s (its
    implementation is not visible here - confirm).

    Returns
    -------
    float
        The 125k read bandwidth entry for ``self._num_systems``, or
        ``0.0`` when no entry exists for that system count.
    """
    try:
        per_system = self._125k_read_bw[self._num_systems]
    except KeyError:
        # No 125k read results were captured for this system count.
        return 0.0
    return per_system
373+
374+
@property
def average_125k_read_bw(self) -> float:
    """
    The average 125k read bandwidth expressed in GB/s.

    Returns
    -------
    float
        The result of ``self._average_125k_read_bw()`` (B/s) scaled by
        ``1e-9`` and rounded to three decimal places.
    """
    bytes_per_second = self._average_125k_read_bw()
    return round(bytes_per_second * 1e-9, 3)
381+
382+
@average_decorator
def _average_125k_write_bw(self) -> float:
    """
    Fetch the 125k write bandwidth results for the current system count.

    The entry stored in ``self._125k_write_bw`` under ``self._num_systems``
    is handed back unchanged. ``average_decorator`` wraps this method and
    is presumably what reduces the entry to a single average in B/s (its
    implementation is not visible here - confirm).

    Returns
    -------
    float
        The 125k write bandwidth entry for ``self._num_systems``, or
        ``0.0`` when no entry exists for that system count.
    """
    try:
        per_system = self._125k_write_bw[self._num_systems]
    except KeyError:
        # No 125k write results were captured for this system count.
        return 0.0
    return per_system
392+
393+
@property
def average_125k_write_bw(self) -> float:
    """
    The average 125k write bandwidth expressed in GB/s.

    Returns
    -------
    float
        The result of ``self._average_125k_write_bw()`` (B/s) scaled by
        ``1e-9`` and rounded to three decimal places.
    """
    bytes_per_second = self._average_125k_write_bw()
    return round(bytes_per_second * 1e-9, 3)
400+
328401
@average_decorator
329402
def _average_read_iops(self) -> float:
330403
"""

bobber/lib/analysis/parse_results.py

Lines changed: 49 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,42 @@ def parse_fio_iops(log_files: list) -> Tuple[dict, dict, dict, dict]:
110110
return read_sys_results, write_sys_results, read_params, write_params
111111

112112

113+
def parse_fio_125k_bw(log_files: list) -> Tuple[dict, dict, dict, dict]:
    """
    Collect the FIO 125k bandwidth results from the given log files.

    The logs are grouped by system count using the ``stg_125k_iteration``
    marker, and every group is passed to ``parse_fio_bw_file`` together with
    the read and write results accumulated so far.

    Parameters
    ----------
    log_files : list
        A ``list`` of ``strings`` of the paths to each log file in the
        results directory.

    Returns
    -------
    tuple
        A ``tuple`` of the 125k read results, 125k write results, 125k read
        parameters, and 125k write parameters. Both parameter entries stay
        ``None`` when no matching log group is found.
    """
    reads = defaultdict(list)
    writes = defaultdict(list)
    read_params = write_params = None

    grouped_logs = divide_logs_by_systems(log_files, 'stg_125k_iteration')

    for system_count, logs in grouped_logs.items():
        # Each call receives the running results and returns the updated
        # ones, so the accumulators are rebound on every pass.
        parsed = parse_fio_bw_file(logs, system_count, reads, writes)
        reads, writes, read_params, write_params = parsed

    return reads, writes, read_params, write_params
147+
148+
113149
def parse_nccl(log_files: list) -> Tuple[dict, dict]:
114150
"""
115151
Parse all NCCL logs.
@@ -251,6 +287,10 @@ def save_yaml_baseline(final_dictionary_output: dict,
251287
# FIO IOPS speed in ops/second
252288
read: {results.get('iops', {}).get('read', 0)}
253289
write: {results.get('iops', {}).get('write', 0)}
290+
125k_bandwidth:
291+
# FIO 125k BW speed in bytes/second
292+
read: {results.get('125k_bandwidth', {}).get('read', 0)}
293+
write: {results.get('125k_bandwidth', {}).get('write', 0)}
254294
nccl:
255295
# NCCL maximum bus bandwidth in GB/s
256296
max_bus_bw: {results.get('nccl', {}).get('max_bus_bw', 0)}
@@ -315,6 +355,9 @@ def main(directory: str,
315355
override_version_check)
316356
bw_results = parse_fio_bw(log_files)
317357
read_bw, write_bw, read_bw_params, write_bw_params = bw_results
358+
bw_125k_results = parse_fio_125k_bw(log_files)
359+
read_125k_bw, write_125k_bw, read_125k_bw_params, write_125k_bw_params = \
360+
bw_125k_results
318361
iops_results = parse_fio_iops(log_files)
319362
read_iops, write_iops, read_iops_params, write_iops_params = iops_results
320363
metadata = parse_meta(log_files)
@@ -323,7 +366,8 @@ def main(directory: str,
323366
total_systems = 0
324367
systems = []
325368

326-
for result in [read_bw, read_iops, max_bw, dali_results, metadata]:
369+
for result in [read_bw, read_iops, read_125k_bw, max_bw, dali_results,
370+
metadata]:
327371
try:
328372
total_systems = max(result.keys())
329373
systems = sorted(result.keys())
@@ -341,6 +385,10 @@ def main(directory: str,
341385
write_iops,
342386
read_iops_params,
343387
write_iops_params,
388+
read_125k_bw,
389+
write_125k_bw,
390+
read_125k_bw_params,
391+
write_125k_bw_params,
344392
max_bw,
345393
bytes_sizes,
346394
dali_results,

bobber/lib/analysis/table.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010
FIO_WRITE_BW = f'{bcolors.BOLD}FIO Write (GB/s) - 1MB BS{bcolors.ENDC}'
1111
FIO_READ_IOP = f'{bcolors.BOLD}FIO Read (k IOPS) - 4K BS{bcolors.ENDC}'
1212
FIO_WRITE_IOP = f'{bcolors.BOLD}FIO Write (k IOPS) - 4K BS{bcolors.ENDC}'
13+
FIO_125K_READ_BW = f'{bcolors.BOLD}FIO Read (GB/s) - 125K BS{bcolors.ENDC}'
14+
FIO_125K_WRITE_BW = f'{bcolors.BOLD}FIO Write (GB/s) - 125K BS{bcolors.ENDC}'
1315
NCCL = f'{bcolors.BOLD}NCCL Max BW (GB/s){bcolors.ENDC}'
1416
DALI_IMG_SM = (f'{bcolors.BOLD}DALI Standard 800x600 throughput '
1517
f'(images/second){bcolors.ENDC}')
@@ -146,6 +148,38 @@ def fio_iops(results: list) -> Tuple[list, list]:
146148
return [read, write]
147149

148150

151+
def fio_125k_bw(results: list) -> Tuple[list, list]:
    """
    Build the table rows for the FIO 125k bandwidth results.

    Each row starts with its column header, followed by one value per entry
    in ``results``, taken from ``entry[1]['125k_bandwidth']`` and passed
    through ``bytes_to_gb``.

    Parameters
    ----------
    results : list
        A ``list`` of entries whose second element is a ``dictionary``
        containing all results from the tests.

    Returns
    -------
    tuple
        The read row and the write row, in that order, or an empty ``list``
        if a ``KeyError`` is raised while the rows are being built.
    """
    try:
        # All read values are gathered before any write value is touched.
        read_row = [FIO_125K_READ_BW]
        for entry in results:
            section = entry[1]['125k_bandwidth']
            read_row.append(bytes_to_gb(section['read']))

        write_row = [FIO_125K_WRITE_BW]
        for entry in results:
            section = entry[1]['125k_bandwidth']
            write_row.append(bytes_to_gb(section['write']))
    except KeyError:
        # Results without 125k bandwidth data contribute no rows.
        return []
    return [read_row, write_row]
181+
182+
149183
def nccl(results: list) -> list:
150184
"""
151185
Save the NCCL results.
@@ -288,6 +322,7 @@ def display_table(json_results: dict) -> NoReturn:
288322

289323
data += fio_bw(results)
290324
data += fio_iops(results)
325+
data += fio_125k_bw(results)
291326
data += nccl(results)
292327
data += dali(results)
293328

bobber/lib/constants.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,13 @@
99
RUN_NCCL = 'run-nccl'
1010
RUN_STG_BW = 'run-stg-bw'
1111
RUN_STG_IOPS = 'run-stg-iops'
12+
RUN_STG_125K = 'run-stg-125k'
1213
RUN_STG_META = 'run-stg-meta'
1314

1415
DGX_A100_SINGLE = {
1516
'gpus': 8,
1617
'bw_threads': 16,
18+
'stg_125k_threads': 16,
1719
'iops_threads': 200,
1820
'batch_size_sm': 512,
1921
'batch_size_lg': 256,
@@ -25,6 +27,7 @@
2527
DGX_A100_DUAL = {
2628
'gpus': 8,
2729
'bw_threads': 16,
30+
'stg_125k_threads': 16,
2831
'iops_threads': 200,
2932
'batch_size_sm': 512,
3033
'batch_size_lg': 256,
@@ -36,6 +39,7 @@
3639
DGX_2 = {
3740
'gpus': 16,
3841
'bw-threads': 16,
42+
'stg_125k_threads': 16,
3943
'batch-size-sm': 150,
4044
'batch-size-lg': 75,
4145
'iops-threads': 80,

0 commit comments

Comments
 (0)