
Commit f228c96

chore: add more benchmarks
1 parent 4d8c569 commit f228c96

File tree

20 files changed: +1,625 −0 lines changed

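The four scripts shown below all benchmark against a Stardog triplestore: they hard-code a SPARQL endpoint at 10.2.35.70:5820 and a helper service at 10.2.35.70:5000 whose /restart_stardog route resets the server between configurations, so each measurement starts from a cold store.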

benchmarks/cache/entities.py

Lines changed: 73 additions & 0 deletions
import itertools
import os
import sys
import time

import numpy as np
import pandas as pd
import requests
from cachetools import TTLCache
from tqdm import tqdm

sys.path.insert(0, os.path.dirname("../pyrdf2vec"))

from pyrdf2vec.graphs import KG
from pyrdf2vec.walkers import RandomWalker

dcemd_to_avg_stdev = {}  # type: ignore

# Sweep the number of entities (25, 50, 100) at a fixed max_depth (4) and
# max_walks (500), with the TTL cache enabled, for each database.
for db, is_cache, entities, max_depth, max_walks in itertools.product(
    ["mutag", "am", "dbpedia"],
    [True],
    [25, 50, 100],
    [4],
    [500],
):
    label = "bond"
    if db == "am":
        label = "proxy"
    elif db == "dbpedia":
        label = "DBpedia_URL"

    e = list(pd.read_csv(f"res/{db}.tsv", sep="\t")[label])[:entities]

    # Restart the triplestore so every configuration starts cold.
    requests.get("http://10.2.35.70:5000/restart_stardog")

    times = []

    for _ in tqdm(range(10)):
        cache = TTLCache(maxsize=1024, ttl=1200) if is_cache else None
        kg = KG(
            f"http://10.2.35.70:5820/{db}",
            cache=cache,
            skip_verify=True,
            mul_req=False,
        )
        tic = time.perf_counter()
        entity_walks = RandomWalker(max_depth, max_walks).extract(kg, e)
        toc = time.perf_counter()
        times.append(toc - tic)

    avg_stdev = [
        np.round(np.mean(times), 2),
        np.round(np.std(times), 2),
    ]

    num_walks = sum(len(e_walk) for e_walk in entity_walks)
    print(
        f"(db={db},is_cache={is_cache},entities={len(e)},"
        + f"max_depth={max_depth},max_walks={max_walks}) = "
        + f"{avg_stdev[0]} +/- {avg_stdev[1]} > {num_walks} walks"
    )
    dcemd_to_avg_stdev[
        (db, is_cache, entities, max_depth, max_walks, num_walks)
    ] = avg_stdev

for k, v in dcemd_to_avg_stdev.items():
    print(
        f"(db={k[0]},is_cache={k[1]},entities={k[2]},"
        + f"max_depth={k[3]},max_walks={k[4]}) = "
        + f"{v[0]} +/- {v[1]} > {k[5]} walks"
    )
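For context, the only knob the cache benchmarks toggle is the cachetools TTLCache handed to KG. A minimal standalone sketch of the cache's behavior, using the same maxsize/ttl values as the benchmark (the key and value shown are hypothetical, not the format pyrdf2vec actually uses internally):

from cachetools import TTLCache

# Same settings as the benchmark: at most 1024 entries, each evicted
# 1200 seconds after insertion.
cache = TTLCache(maxsize=1024, ttl=1200)

# Hypothetical key/value standing in for a cached SPARQL hop lookup.
cache["http://example.org/entity#42"] = ["hop1", "hop2"]
print("http://example.org/entity#42" in cache)  # True until the TTL expires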

benchmarks/cache/max-depth.py

Lines changed: 73 additions & 0 deletions
import itertools
import os
import sys
import time

import numpy as np
import pandas as pd
import requests
from cachetools import TTLCache
from tqdm import tqdm

sys.path.insert(0, os.path.dirname("../pyrdf2vec"))

from pyrdf2vec.graphs import KG
from pyrdf2vec.walkers import RandomWalker

dcemd_to_avg_stdev = {}  # type: ignore

# Same protocol as entities.py, but sweeping max_depth (2, 4, 6) at a
# fixed 50 entities and 500 walks per entity.
for db, is_cache, entities, max_depth, max_walks in itertools.product(
    ["mutag", "am", "dbpedia"],
    [True],
    [50],
    [2, 4, 6],
    [500],
):
    label = "bond"
    if db == "am":
        label = "proxy"
    elif db == "dbpedia":
        label = "DBpedia_URL"

    e = list(pd.read_csv(f"res/{db}.tsv", sep="\t")[label])[:entities]

    # Restart the triplestore so every configuration starts cold.
    requests.get("http://10.2.35.70:5000/restart_stardog")

    times = []

    for _ in tqdm(range(10)):
        cache = TTLCache(maxsize=1024, ttl=1200) if is_cache else None
        kg = KG(
            f"http://10.2.35.70:5820/{db}",
            cache=cache,
            skip_verify=True,
            mul_req=False,
        )
        tic = time.perf_counter()
        entity_walks = RandomWalker(max_depth, max_walks).extract(kg, e)
        toc = time.perf_counter()
        times.append(toc - tic)

    avg_stdev = [
        np.round(np.mean(times), 2),
        np.round(np.std(times), 2),
    ]

    num_walks = sum(len(e_walk) for e_walk in entity_walks)
    print(
        f"(db={db},is_cache={is_cache},entities={len(e)},"
        + f"max_depth={max_depth},max_walks={max_walks}) = "
        + f"{avg_stdev[0]} +/- {avg_stdev[1]} > {num_walks} walks"
    )
    dcemd_to_avg_stdev[
        (db, is_cache, entities, max_depth, max_walks, num_walks)
    ] = avg_stdev

for k, v in dcemd_to_avg_stdev.items():
    print(
        f"(db={k[0]},is_cache={k[1]},entities={k[2]},"
        + f"max_depth={k[3]},max_walks={k[4]}) = "
        + f"{v[0]} +/- {v[1]} > {k[5]} walks"
    )

benchmarks/cache/max-walks.py

Lines changed: 73 additions & 0 deletions
import itertools
import os
import sys
import time

import numpy as np
import pandas as pd
import requests
from cachetools import TTLCache
from tqdm import tqdm

sys.path.insert(0, os.path.dirname("../pyrdf2vec"))

from pyrdf2vec.graphs import KG
from pyrdf2vec.walkers import RandomWalker

dcemd_to_avg_stdev = {}  # type: ignore

# Same protocol as entities.py, but sweeping max_walks (100, 500, 1000)
# at a fixed 50 entities and max_depth of 4.
for db, is_cache, entities, max_depth, max_walks in itertools.product(
    ["mutag", "am", "dbpedia"],
    [True],
    [50],
    [4],
    [100, 500, 1000],
):
    label = "bond"
    if db == "am":
        label = "proxy"
    elif db == "dbpedia":
        label = "DBpedia_URL"

    e = list(pd.read_csv(f"res/{db}.tsv", sep="\t")[label])[:entities]

    # Restart the triplestore so every configuration starts cold.
    requests.get("http://10.2.35.70:5000/restart_stardog")

    times = []

    for _ in tqdm(range(10)):
        cache = TTLCache(maxsize=1024, ttl=1200) if is_cache else None
        kg = KG(
            f"http://10.2.35.70:5820/{db}",
            cache=cache,
            skip_verify=True,
            mul_req=False,
        )
        tic = time.perf_counter()
        entity_walks = RandomWalker(max_depth, max_walks).extract(kg, e)
        toc = time.perf_counter()
        times.append(toc - tic)

    avg_stdev = [
        np.round(np.mean(times), 2),
        np.round(np.std(times), 2),
    ]

    num_walks = sum(len(e_walk) for e_walk in entity_walks)
    print(
        f"(db={db},is_cache={is_cache},entities={len(e)},"
        + f"max_depth={max_depth},max_walks={max_walks}) = "
        + f"{avg_stdev[0]} +/- {avg_stdev[1]} > {num_walks} walks"
    )
    dcemd_to_avg_stdev[
        (db, is_cache, entities, max_depth, max_walks, num_walks)
    ] = avg_stdev

for k, v in dcemd_to_avg_stdev.items():
    print(
        f"(db={k[0]},is_cache={k[1]},entities={k[2]},"
        + f"max_depth={k[3]},max_walks={k[4]}) = "
        + f"{v[0]} +/- {v[1]} > {k[5]} walks"
    )

benchmarks/embedders/max_depth.py

Lines changed: 116 additions & 0 deletions
import itertools
import os
import sys

import numpy as np
import pandas as pd
import requests
from tqdm import tqdm

sys.path.insert(0, os.path.dirname("../pyrdf2vec"))

from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

from pyrdf2vec import RDF2VecTransformer
from pyrdf2vec.embedders import FastText, Word2Vec
from pyrdf2vec.graphs import KG
from pyrdf2vec.samplers import UniformSampler
from pyrdf2vec.walkers import RandomWalker

dcemd_to_avg_stdev = {}
RANDOM_STATE = 22

# Compare Word2Vec and FastText embeddings while sweeping max_depth
# (1, 2, 4) at a fixed budget of 250 walks per entity.
for db, embedder_name, max_depth, max_walks in itertools.product(
    ["mutag", "am", "dbpedia"],
    [
        "Word2Vec",
        "FastText",
    ],
    [1, 2, 4],
    [250],
):
    test_data = pd.read_csv(f"res/{db}/test.tsv", sep="\t")
    train_data = pd.read_csv(f"res/{db}/train.tsv", sep="\t")

    label = "bond"
    if db == "am":
        label = "proxy"
        train_labels = list(train_data["label_category"])
        test_labels = list(test_data["label_category"])
    elif db == "dbpedia":
        label = "DBpedia_URL"
        train_labels = list(train_data["label"])
        test_labels = list(test_data["label"])
    else:
        train_labels = list(train_data["label_mutagenic"])
        test_labels = list(test_data["label_mutagenic"])

    train_entities = [entity for entity in train_data[label]]
    test_entities = [entity for entity in test_data[label]]
    entities = train_entities + test_entities

    embedder = Word2Vec(workers=1)
    if embedder_name == "FastText":
        embedder = FastText(workers=1)

    # Skip the predicates that would leak the target label into the walks.
    skip_preds = set()
    if db == "mutag":
        skip_preds = {"http://dl-learner.org/carcinogenesis#isMutagenic"}
    elif db == "am":
        skip_preds = {
            "http://purl.org/collections/nl/am/objectCategory",
            "http://purl.org/collections/nl/am/material",
        }

    requests.get("http://10.2.35.70:5000/restart_stardog")
    accuracies = []

    for _ in tqdm(range(10)):
        embeddings, _ = RDF2VecTransformer(
            embedder,
            walkers=[
                RandomWalker(max_depth, max_walks, UniformSampler(), n_jobs=4)
            ],
        ).fit_transform(
            KG(
                f"http://10.2.35.70:5820/{db}",
                skip_predicates=skip_preds,
                skip_verify=True,
                mul_req=True,
            ),
            entities,
        )

        train_embeddings = embeddings[: len(train_entities)]
        test_embeddings = embeddings[len(train_entities) :]

        # Fit an SVM on the training embeddings, tuning C by grid search.
        clf = GridSearchCV(
            SVC(random_state=RANDOM_STATE),
            {"C": [10 ** i for i in range(-3, 4)]},
        )
        clf.fit(train_embeddings, train_labels)
        predictions = clf.predict(test_embeddings)

        score = accuracy_score(test_labels, predictions)
        accuracies.append(score)
        print(f"{embedder_name},accuracy={score * 100 :.2f} %")

    # Aggregate the accuracy across the ten runs.
    avg_stdev = [
        np.round(np.mean(accuracies), 4),
        np.round(np.std(accuracies), 4),
    ]
    dcemd_to_avg_stdev[
        (db, embedder_name, max_depth, max_walks)
    ] = avg_stdev

for k, v in dcemd_to_avg_stdev.items():
    print(
        f"(db={k[0]},embedder={k[1]},"
        + f"max_depth={k[2]},max_walks={k[3]}) = "
        + f"{v[0] * 100 :.2f} % +/- {v[1]}"
    )
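The downstream evaluation in this script is a plain SVC grid search over C ∈ {10^-3, ..., 10^3}. A self-contained sketch of that protocol on synthetic stand-in embeddings (everything here is illustrative and independent of pyrdf2vec):

from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.svm import SVC

# Synthetic stand-ins for the RDF2Vec embeddings and their labels.
X, y = make_classification(n_samples=200, n_features=100, random_state=22)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=22)

# Same protocol as the benchmark: sweep C over 10^-3 .. 10^3.
clf = GridSearchCV(SVC(random_state=22), {"C": [10 ** i for i in range(-3, 4)]})
clf.fit(X_train, y_train)
print(accuracy_score(y_test, clf.predict(X_test)))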

0 commit comments
