-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathbenchmark_runtime_detailed.py
More file actions
53 lines (41 loc) · 1.58 KB
/
benchmark_runtime_detailed.py
File metadata and controls
53 lines (41 loc) · 1.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import time
import statistics as stats
import pandas as pd
import bib_dedupe.bib_dedupe as bd
from pathlib import Path
# Root directory of the benchmark datasets (one sub-directory per dataset).
# The path is relative, so it is resolved against the current working
# directory at run time, not against this file's location.
BENCHMARK_DIR = Path("../tests/ldd-full-benchmark")
def timed(label, fn, *args, **kwargs):
    """Call ``fn(*args, **kwargs)`` and measure how long the call takes.

    Args:
        label: Name of the step being timed. Accepted so call sites read
            clearly; this function does not use it.
        fn: Callable to invoke.
        *args: Positional arguments forwarded to ``fn``.
        **kwargs: Keyword arguments forwarded to ``fn``.

    Returns:
        A ``(result, elapsed)`` tuple: the value returned by ``fn`` and the
        elapsed time in seconds as measured by ``time.perf_counter``.

    Any exception raised by ``fn`` propagates unchanged; no timing is
    returned in that case.
    """
    start = time.perf_counter()
    result = fn(*args, **kwargs)
    elapsed = time.perf_counter() - start
    return result, elapsed
def benchmark_pipeline(records_df, *, cpu=-1, repeats=5, warmup=1):
    """Time the prep -> block -> match stages over several runs.

    Each run calls ``bd.prep`` on ``records_df``, feeds its result to
    ``bd.block``, and feeds that result to ``bd.match``. Every stage is
    called with ``verbosity_level=0`` and the given ``cpu`` value.

    Args:
        records_df: Input passed to ``bd.prep``; the same object is reused
            for every warmup and timed run.
        cpu: Forwarded unchanged as the ``cpu`` keyword of each stage.
        repeats: Number of timed runs; must be at least 1.
        warmup: Number of untimed full pipeline runs executed before the
            timed runs.

    Returns:
        A dict with the keys ``"prep"``, ``"block"`` and ``"match_total"``.
        Each value is a dict with ``"n"``, ``"mean_s"``, ``"median_s"``,
        ``"min_s"`` and ``"max_s"`` summarising that stage's timings.

    Raises:
        ValueError: If ``repeats`` is less than 1. Checked up front so that
            no warmup work is done before the call fails.
    """
    if repeats < 1:
        raise ValueError(f"repeats must be at least 1, got {repeats}")

    # warmup (important for caches, process pools, etc.)
    for _ in range(warmup):
        prepped = bd.prep(records_df, verbosity_level=0, cpu=cpu)
        pairs = bd.block(prepped, verbosity_level=0, cpu=cpu)
        bd.match(pairs, verbosity_level=0, cpu=cpu)

    prep_times, block_times, match_times = [], [], []
    for _ in range(repeats):
        prepped, t_prep = timed(
            "prep", bd.prep, records_df, verbosity_level=0, cpu=cpu
        )
        pairs, t_block = timed(
            "block", bd.block, prepped, verbosity_level=0, cpu=cpu
        )
        # Only the duration of the match stage is needed; its output is dropped.
        _, t_match = timed(
            "match", bd.match, pairs, verbosity_level=0, cpu=cpu
        )
        prep_times.append(t_prep)
        block_times.append(t_block)
        match_times.append(t_match)

    def summ(xs):
        """Summarise a non-empty list of durations."""
        return {
            "n": len(xs),
            "mean_s": stats.mean(xs),
            "median_s": stats.median(xs),
            "min_s": min(xs),
            "max_s": max(xs),
        }

    return {
        "prep": summ(prep_times),
        "block": summ(block_times),
        "match_total": summ(match_times),
    }
# Script entry: load the pre-merged records of one benchmark dataset and
# print the per-stage timing summary returned by benchmark_pipeline.
dataset = "cardiac"
records_csv = BENCHMARK_DIR / dataset / "records_pre_merged.csv"
df = pd.read_csv(records_csv)
summary = benchmark_pipeline(df, cpu=-1, repeats=10, warmup=2)
print(summary)