Skip to content

Commit 0954ef3

Browse files
ppwwyyxx and facebook-github-bot
authored and committed
more detailed dataloader benchmark
Reviewed By: zhanghang1989 Differential Revision: D29766279 fbshipit-source-id: d1481523a468cc8d42a133cfc769b182c9d71f10
1 parent 82a57ce commit 0954ef3

File tree

5 files changed

+274
-39
lines changed

5 files changed

+274
-39
lines changed

detectron2/config/config.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -194,6 +194,7 @@ def wrapped(*args, **kwargs):
194194
else:
195195
return orig_func(*args, **kwargs)
196196

197+
wrapped.from_config = from_config
197198
return wrapped
198199

199200
return wrapper

detectron2/data/benchmark.py

Lines changed: 224 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,224 @@
1+
import logging
2+
import numpy as np
3+
from itertools import count
4+
from typing import List, Tuple
5+
import torch
6+
import tqdm
7+
from fvcore.common.timer import Timer
8+
9+
from detectron2.utils import comm
10+
11+
from .build import build_batch_data_loader
12+
from .common import DatasetFromList, MapDataset
13+
from .samplers import TrainingSampler
14+
15+
logger = logging.getLogger(__name__)
16+
17+
18+
class _EmptyMapDataset(torch.utils.data.Dataset):
19+
"""
20+
Map anything to emptiness.
21+
"""
22+
23+
def __init__(self, dataset):
24+
self.ds = dataset
25+
26+
def __len__(self):
27+
return len(self.ds)
28+
29+
def __getitem__(self, idx):
30+
_ = self.ds[idx]
31+
return [0]
32+
33+
34+
def iter_benchmark(
    iterator, num_iter: int, warmup: int = 5, max_time_seconds: float = 60
) -> Tuple[float, List[float]]:
    """
    Benchmark an iterator/iterable for `num_iter` iterations with an extra
    `warmup` iterations of warmup.
    End early if `max_time_seconds` time is spent on iterations.

    Args:
        iterator: any iterator or iterable; `iter()` is called on it.
        num_iter (int): number of timed iterations to attempt.
        warmup (int): number of untimed iterations consumed before timing starts.
        max_time_seconds (float): time budget for the timed iterations. The budget is
            checked before each iteration, so one iteration may run past it.

    Returns:
        float: average time (seconds) per iteration. NaN if no timed iteration
            completed (e.g. `num_iter` is 0 or the time budget is not positive).
        list[float]: time spent on each iteration. Sometimes useful for further analysis.

    Raises:
        StopIteration: if the iterator is exhausted during warmup or timing.
    """
    num_iter, warmup = int(num_iter), int(warmup)

    iterator = iter(iterator)
    # Warmup iterations are consumed before the timer is created, so they are not timed.
    for _ in range(warmup):
        next(iterator)

    timer = Timer()
    all_times = []
    for _ in tqdm.trange(num_iter):
        start = timer.seconds()
        if start > max_time_seconds:
            # Out of budget: keep what was measured so far.
            break
        next(iterator)
        all_times.append(timer.seconds() - start)

    # Average over the iterations that actually ran, not the number requested.
    completed = len(all_times)
    if completed == 0:
        # Nothing was measured; avoid dividing by zero.
        return float("nan"), all_times
    return timer.seconds() / completed, all_times
62+
63+
64+
class DataLoaderBenchmark:
    """
    Some common benchmarks that help understand perf bottleneck of a standard dataloader
    made of dataset, mapper and sampler.
    """

    def __init__(
        self,
        dataset,
        *,
        mapper,
        sampler=None,
        total_batch_size,
        num_workers=0,
        max_time_seconds: int = 90,
    ):
        """
        Args:
            max_time_seconds (int): maximum time to spend on each benchmark
            other args: same as in `build.py:build_detection_train_loader`
        """
        if isinstance(dataset, list):
            dataset = DatasetFromList(dataset, copy=False, serialize=True)
        if sampler is None:
            sampler = TrainingSampler(len(dataset))

        self.dataset = dataset
        self.mapper = mapper
        self.sampler = sampler
        self.total_batch_size = total_batch_size
        self.num_workers = num_workers
        self.per_gpu_batch_size = self.total_batch_size // comm.get_world_size()

        self.max_time_seconds = max_time_seconds

    @staticmethod
    def _percentiles(all_times, ks=(1, 5, 95, 99)):
        """
        Return the nearest-rank percentiles `ks` of `all_times` as a list.

        An empty `all_times` yields NaNs instead of raising, so that a benchmark
        which measured nothing can still be reported.
        """
        if len(all_times) == 0:
            return [float("nan")] * len(ks)
        try:
            # NumPy >= 1.22 names this keyword `method`.
            return [np.percentile(all_times, k, method="nearest") for k in ks]
        except TypeError:
            # Older NumPy only knows the (now deprecated) `interpolation` keyword.
            return [np.percentile(all_times, k, interpolation="nearest") for k in ks]

    @staticmethod
    def _format_stats(msg, avg, percentiles):
        """
        Format one log line: throughput (it/s) derived from `avg` seconds per
        iteration, followed by the p1/p5/p95/p99 iteration times in seconds.
        """
        # `avg` may be 0 or NaN when nothing was measured; report NaN rather than crash.
        throughput = 1.0 / avg if avg > 0 else float("nan")
        return (
            f"{msg}: avg={throughput:.1f} it/s, "
            f"p1={percentiles[0]:.2g}s, p5={percentiles[1]:.2g}s, "
            f"p95={percentiles[2]:.2g}s, p99={percentiles[3]:.2g}s."
        )

    def _benchmark(self, iterator, num_iter, warmup, msg=None):
        """
        Run `iter_benchmark` with this object's time budget and, if `msg` is given,
        log the result under that label.

        Returns:
            the `(avg, all_times)` pair returned by `iter_benchmark`.
        """
        avg, all_times = iter_benchmark(iterator, num_iter, warmup, self.max_time_seconds)
        if msg is not None:
            self._log_time(msg, avg, all_times)
        return avg, all_times

    def _log_time(self, msg, avg, all_times, distributed=False):
        """
        Log throughput and iteration-time percentiles.

        Args:
            msg (str): label put at the start of the log line.
            avg (float): average seconds per iteration.
            all_times (list[float]): seconds spent on each iteration.
            distributed (bool): if True, gather the numbers from all ranks and
                log one line per rank, on rank 0 only.
        """
        percentiles = self._percentiles(all_times)
        if not distributed:
            logger.info(self._format_stats(msg, avg, percentiles))
            return
        # Gather before the rank check so that every rank takes part in both gathers.
        avg_per_gpu = comm.all_gather(avg)
        percentiles_per_gpu = comm.all_gather(percentiles)
        if comm.get_rank() > 0:
            return
        for idx, (gpu_avg, gpu_percentiles) in enumerate(zip(avg_per_gpu, percentiles_per_gpu)):
            logger.info(f"GPU{idx} " + self._format_stats(msg, gpu_avg, gpu_percentiles))

    def benchmark_dataset(self, num_iter, warmup=5):
        """
        Benchmark the speed of taking raw samples from the dataset.
        """

        def loader():
            # Restart the sampler whenever it runs out so the benchmark never starves.
            while True:
                for k in self.sampler:
                    yield self.dataset[k]

        self._benchmark(loader(), num_iter, warmup, "Dataset Alone")

    def benchmark_mapper(self, num_iter, warmup=5):
        """
        Benchmark the speed of taking raw samples from the dataset and map
        them in a single process.
        """

        def loader():
            # Restart the sampler whenever it runs out so the benchmark never starves.
            while True:
                for k in self.sampler:
                    yield self.mapper(self.dataset[k])

        self._benchmark(loader(), num_iter, warmup, "Single Process Mapper (sec/sample)")

    def benchmark_workers(self, num_iter, warmup=10):
        """
        Benchmark the dataloader by tuning num_workers to [0, 1, self.num_workers].
        """
        candidates = [0, 1]
        if self.num_workers not in candidates:
            candidates.append(self.num_workers)

        dataset = MapDataset(self.dataset, self.mapper)
        for n in candidates:
            loader = build_batch_data_loader(
                dataset,
                self.sampler,
                self.total_batch_size,
                num_workers=n,
            )
            # Iteration counts are scaled by the worker count (at least 1).
            self._benchmark(
                iter(loader),
                num_iter * max(n, 1),
                warmup * max(n, 1),
                f"DataLoader ({n} workers, bs={self.per_gpu_batch_size})",
            )
            # Drop this loader before building the next candidate.
            del loader

    def benchmark_IPC(self, num_iter, warmup=10):
        """
        Benchmark the dataloader where each worker outputs nothing. This
        eliminates the IPC overhead compared to the regular dataloader.

        PyTorch multiprocessing's IPC only optimizes for torch tensors.
        Large numpy arrays or other data structure may incur large IPC overhead.
        """
        n = self.num_workers
        # The mapper still runs on every sample; only the returned payload is emptied.
        dataset = _EmptyMapDataset(MapDataset(self.dataset, self.mapper))
        loader = build_batch_data_loader(
            dataset, self.sampler, self.total_batch_size, num_workers=n
        )
        self._benchmark(
            iter(loader),
            num_iter * max(n, 1),
            warmup * max(n, 1),
            f"DataLoader ({n} workers, bs={self.per_gpu_batch_size}) w/o comm",
        )

    def benchmark_distributed(self, num_iter, warmup=10):
        """
        Benchmark the dataloader in each distributed worker, and log results of
        all workers. This helps understand the final performance as well as
        the variances among workers.

        It also prints startup time (first iter) of the dataloader.
        """
        gpu = comm.get_world_size()
        dataset = MapDataset(self.dataset, self.mapper)
        n = self.num_workers
        loader = build_batch_data_loader(
            dataset, self.sampler, self.total_batch_size, num_workers=n
        )

        # Startup time = creating the iterator plus fetching the first batch.
        timer = Timer()
        loader = iter(loader)
        next(loader)
        startup_time = timer.seconds()
        logger.info("Dataloader startup time: {:.2f} seconds".format(startup_time))

        # Synchronize once every rank has its first batch, before the timed run.
        comm.synchronize()

        # No `msg` here: results are logged below in distributed mode instead.
        avg, all_times = self._benchmark(loader, num_iter * max(n, 1), warmup * max(n, 1))
        del loader
        self._log_time(
            f"DataLoader ({gpu} GPUs x {n} workers, total bs={self.total_batch_size})",
            avg,
            all_times,
            True,
        )

tests/config/test_yacs_config.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -259,6 +259,8 @@ def testFuncWithCfg(self):
259259
self.assertEqual(_test_func(cfg, arg1=100, arg2=20), (100, 20, 30, 4))
260260
self.assertEqual(_test_func(cfg, arg1=100, arg2=20, arg4=40), (100, 20, 30, 40))
261261

262+
self.assertTrue(callable(_test_func.from_config))
263+
262264
def testOmegaConf(self):
263265
cfg = model_zoo.get_config("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml")
264266
cfg = OmegaConf.create(cfg.dump())

tools/benchmark.py

Lines changed: 46 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,13 @@
1515
from torch.nn.parallel import DistributedDataParallel
1616

1717
from detectron2.checkpoint import DetectionCheckpointer
18-
from detectron2.config import get_cfg
18+
from detectron2.config import LazyConfig, get_cfg, instantiate
1919
from detectron2.data import (
2020
DatasetFromList,
2121
build_detection_test_loader,
2222
build_detection_train_loader,
2323
)
24+
from detectron2.data.benchmark import DataLoaderBenchmark
2425
from detectron2.engine import AMPTrainer, SimpleTrainer, default_argument_parser, hooks, launch
2526
from detectron2.modeling import build_model
2627
from detectron2.solver import build_optimizer
@@ -33,15 +34,31 @@
3334

3435

3536
def setup(args):
36-
cfg = get_cfg()
37-
cfg.merge_from_file(args.config_file)
38-
cfg.SOLVER.BASE_LR = 0.001 # Avoid NaNs. Not useful in this script anyway.
39-
cfg.merge_from_list(args.opts)
40-
cfg.freeze()
37+
if args.config_file.endswith(".yaml"):
38+
cfg = get_cfg()
39+
cfg.merge_from_file(args.config_file)
40+
cfg.SOLVER.BASE_LR = 0.001 # Avoid NaNs. Not useful in this script anyway.
41+
cfg.merge_from_list(args.opts)
42+
cfg.freeze()
43+
else:
44+
cfg = LazyConfig.load(args.config_file)
45+
cfg = LazyConfig.apply_overrides(cfg, args.opts)
4146
setup_logger(distributed_rank=comm.get_rank())
4247
return cfg
4348

4449

50+
def create_data_benchmark(cfg, args):
    """
    Build a `DataLoaderBenchmark` from the training dataloader configuration.

    For a `.py` config file the `cfg.dataloader.train` node is retargeted to
    `DataLoaderBenchmark`. For any other config file the keyword arguments come
    from `build_detection_train_loader.from_config(cfg)`.
    """
    uses_python_config = args.config_file.endswith(".py")
    if not uses_python_config:
        loader_kwargs = build_detection_train_loader.from_config(cfg)
        # Dropped if present before the kwargs are handed to DataLoaderBenchmark.
        loader_kwargs.pop("aspect_ratio_grouping", None)
        loader_kwargs["_target_"] = DataLoaderBenchmark
        return instantiate(loader_kwargs)

    # Reuse the train dataloader node, but instantiate the benchmark class instead.
    train_loader_cfg = cfg.dataloader.train
    train_loader_cfg._target_ = DataLoaderBenchmark
    return instantiate(train_loader_cfg)
60+
61+
4562
def RAM_msg():
4663
vram = psutil.virtual_memory()
4764
return "RAM Usage: {:.2f}/{:.2f} GB".format(
@@ -51,41 +68,29 @@ def RAM_msg():
5168

5269
def benchmark_data(args):
5370
cfg = setup(args)
54-
5571
logger.info("After spawning " + RAM_msg())
56-
timer = Timer()
57-
dataloader = build_detection_train_loader(cfg)
58-
logger.info("Initialize loader using {} seconds.".format(timer.seconds()))
59-
60-
timer.reset()
61-
itr = iter(dataloader)
62-
for i in range(10): # warmup
63-
next(itr)
64-
if i == 0:
65-
startup_time = timer.seconds()
66-
logger.info("Startup time: {} seconds".format(startup_time))
67-
timer = Timer()
68-
max_iter = 1000
69-
for _ in tqdm.trange(max_iter):
70-
next(itr)
71-
logger.info(
72-
"{} iters ({} images) in {} seconds.".format(
73-
max_iter, max_iter * cfg.SOLVER.IMS_PER_BATCH, timer.seconds()
74-
)
75-
)
7672

73+
benchmark = create_data_benchmark(cfg, args)
74+
benchmark.benchmark_distributed(250, 10)
7775
# test for a few more rounds
7876
for k in range(10):
7977
logger.info(f"Iteration {k} " + RAM_msg())
80-
timer = Timer()
81-
max_iter = 1000
82-
for _ in tqdm.trange(max_iter):
83-
next(itr)
84-
logger.info(
85-
"{} iters ({} images) in {} seconds.".format(
86-
max_iter, max_iter * cfg.SOLVER.IMS_PER_BATCH, timer.seconds()
87-
)
88-
)
78+
benchmark.benchmark_distributed(250, 1)
79+
80+
81+
def benchmark_data_advanced(args):
    """
    Benchmark the dataloader with more details.

    Rank 0 runs the dataset, mapper, worker-count and IPC benchmarks. When more
    than one process is present, every rank then runs the distributed benchmark
    twice, with a "Rerun ..." log line in between.
    """
    cfg = setup(args)
    benchmark = create_data_benchmark(cfg, args)

    on_main_rank = comm.get_rank() == 0
    if on_main_rank:
        benchmark.benchmark_dataset(100)
        benchmark.benchmark_mapper(100)
        benchmark.benchmark_workers(100, warmup=10)
        benchmark.benchmark_IPC(100, warmup=10)

    if comm.get_world_size() <= 1:
        return
    # Not restricted to rank 0: the distributed benchmark gathers results across ranks.
    benchmark.benchmark_distributed(100)
    logger.info("Rerun ...")
    benchmark.benchmark_distributed(100)
8994

9095

9196
def benchmark_train(args):
@@ -157,14 +162,17 @@ def f():
157162

158163
if __name__ == "__main__":
159164
parser = default_argument_parser()
160-
parser.add_argument("--task", choices=["train", "eval", "data"], required=True)
165+
parser.add_argument("--task", choices=["train", "eval", "data", "data_advanced"], required=True)
161166
args = parser.parse_args()
162167
assert not args.eval_only
163168

164169
logger.info("Environment info:\n" + collect_env_info())
170+
if "data" in args.task:
171+
print("Initial " + RAM_msg())
165172
if args.task == "data":
166173
f = benchmark_data
167-
print("Initial " + RAM_msg())
174+
if args.task == "data_advanced":
175+
f = benchmark_data_advanced
168176
elif args.task == "train":
169177
"""
170178
Note: training speed may not be representative.

tools/lazyconfig_train_net.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ def do_train(args, cfg):
5050
dataloader.evaluator: instantiate to evaluator for test set
5151
optimizer: instantaite to an optimizer
5252
lr_multiplier: instantiate to a fvcore scheduler
53-
train: other misc config defined in `common_train.py`, including:
53+
train: other misc config defined in `configs/common/train.py`, including:
5454
output_dir (str)
5555
init_checkpoint (str)
5656
amp.enabled (bool)

0 commit comments

Comments
 (0)