Skip to content

Commit 3e1ccdc

Browse files
committed
eval pattern generator canonicalization parallelized
1 parent a7250af commit 3e1ccdc

File tree

1 file changed

+39
-14
lines changed

1 file changed

+39
-14
lines changed

eval.py

Lines changed: 39 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,10 @@
1616
from rdflib import Variable
1717
from scipy.special import binom
1818
from scipy.misc import comb
19+
from scoop.futures import map as parallel_map
20+
import scoop
21+
import scoop.futures
22+
from splendid import chunker
1923

2024
from logging_config import logging
2125
from graph_pattern import SOURCE_VAR
@@ -28,7 +32,7 @@
2832
logger.info('init')
2933

3034

31-
DEBUG = True
35+
DEBUG = False
3236
HOLE = sys.maxint # placeholder for holes in partial patterns
3337

3438
# debug logging in this module is actually quite expensive (> 30 % of time). In
@@ -372,23 +376,44 @@ def main():
372376
length = 5
373377
canonical = True
374378

375-
gen_patterns = []
379+
_patterns = set()
376380
n = 0
377381
i = 0
378-
for n, (i, pattern) in enumerate(patterns(
379-
length,
380-
loops=False,
381-
node_edge_joint=False,
382-
p_only_connected=False,
383-
source_target_edges=False,
384-
exclude_isomorphic=canonical,
385-
count_candidates_only=False,
386-
)):
387-
print('%d: Pattern id %d: %s' % (n, i, pattern))
388-
gen_patterns.append((i, pattern))
382+
383+
pg = patterns(
384+
length,
385+
loops=False,
386+
node_edge_joint=False,
387+
p_only_connected=False,
388+
source_target_edges=False,
389+
exclude_isomorphic=canonical and not scoop.IS_RUNNING,
390+
count_candidates_only=False,
391+
)
392+
393+
if canonical and scoop.IS_RUNNING:
394+
# Graph pattern isomorphism checking is what takes by far the longest.
395+
# run canonicalization in parallel
396+
# chunks are used for efficiency and to prevent parallel_map from trying to
397+
# eat up all candidates first
398+
for chunk in chunker(pg, 10000):
399+
cgps = parallel_map(
400+
lambda res: (res[0], canonicalize(res[1]) if res[1] else None),
401+
chunk
402+
)
403+
for i, pattern in cgps:
404+
if pattern not in _patterns:
405+
print('%d: Pattern id %d: %s' % (n, i, pattern))
406+
_patterns.add(pattern)
407+
n += 1
408+
else:
409+
# run potential canonicalization inline
410+
for n, (i, pattern) in enumerate(pg):
411+
print('%d: Pattern id %d: %s' % (n, i, pattern))
412+
_patterns.add(pattern)
413+
# the last result yielded by pg is (i, None), so drop that None entry
414+
_patterns.remove(None)
389415
print('Number of pattern candidates: %d' % i)
390416
print('Number of patterns: %d' % n)
391-
_patterns = set(gp for pid, gp in gen_patterns[:-1])
392417

393418
# testing flipped edges (only works if we're working with canonicals)
394419
if canonical:

0 commit comments

Comments
 (0)