|
16 | 16 | from rdflib import Variable
|
17 | 17 | from scipy.special import binom
|
18 | 18 | from scipy.misc import comb
|
| 19 | +from scoop.futures import map as parallel_map |
| 20 | +import scoop |
| 21 | +import scoop.futures |
| 22 | +from splendid import chunker |
19 | 23 |
|
20 | 24 | from logging_config import logging
|
21 | 25 | from graph_pattern import SOURCE_VAR
|
|
28 | 32 | logger.info('init')
|
29 | 33 |
|
30 | 34 |
|
31 |
| -DEBUG = True |
| 35 | +DEBUG = False |
32 | 36 | HOLE = sys.maxint # placeholder for holes in partial patterns
|
33 | 37 |
|
34 | 38 | # debug logging in this module is actually quite expensive (> 30 % of time). In
|
@@ -372,23 +376,44 @@ def main():
|
372 | 376 | length = 5
|
373 | 377 | canonical = True
|
374 | 378 |
|
375 |
| - gen_patterns = [] |
| 379 | + _patterns = set() |
376 | 380 | n = 0
|
377 | 381 | i = 0
|
378 |
| - for n, (i, pattern) in enumerate(patterns( |
379 |
| - length, |
380 |
| - loops=False, |
381 |
| - node_edge_joint=False, |
382 |
| - p_only_connected=False, |
383 |
| - source_target_edges=False, |
384 |
| - exclude_isomorphic=canonical, |
385 |
| - count_candidates_only=False, |
386 |
| - )): |
387 |
| - print('%d: Pattern id %d: %s' % (n, i, pattern)) |
388 |
| - gen_patterns.append((i, pattern)) |
| 382 | + |
| 383 | + pg = patterns( |
| 384 | + length, |
| 385 | + loops=False, |
| 386 | + node_edge_joint=False, |
| 387 | + p_only_connected=False, |
| 388 | + source_target_edges=False, |
| 389 | + exclude_isomorphic=canonical and not scoop.IS_RUNNING, |
| 390 | + count_candidates_only=False, |
| 391 | + ) |
| 392 | + |
| 393 | + if canonical and scoop.IS_RUNNING: |
| 394 | + # Graph pattern isomorphism checking is what takes by far the longest,
| 395 | + # so run the canonicalization in parallel.
| 396 | + # Chunks are used for efficiency and to keep parallel_map from trying to
| 397 | + # consume all candidates up front.
| 398 | + for chunk in chunker(pg, 10000): |
| 399 | + cgps = parallel_map( |
| 400 | + lambda res: (res[0], canonicalize(res[1]) if res[1] else None), |
| 401 | + chunk |
| 402 | + ) |
| 403 | + for i, pattern in cgps: |
| 404 | + if pattern not in _patterns: |
| 405 | + print('%d: Pattern id %d: %s' % (n, i, pattern)) |
| 406 | + _patterns.add(pattern) |
| 407 | + n += 1 |
| 408 | + else: |
| 409 | + # run potential canonicalization inline |
| 410 | + for n, (i, pattern) in enumerate(pg): |
| 411 | + print('%d: Pattern id %d: %s' % (n, i, pattern)) |
| 412 | + _patterns.add(pattern) |
| 413 | + # the last result of pg is (i, None), so drop that None entry again
| 414 | + _patterns.remove(None) |
389 | 415 | print('Number of pattern candidates: %d' % i)
|
390 | 416 | print('Number of patterns: %d' % n)
|
391 |
| - _patterns = set(gp for pid, gp in gen_patterns[:-1]) |
392 | 417 |
|
393 | 418 | # testing flipped edges (only works if we're working with canonicals)
|
394 | 419 | if canonical:
|
|
0 commit comments