Skip to content

Commit 28c6c18

Browse files
committed
WIP: eval pattern generator massively improved by numerical recursive enumeration and early termination
1 parent 5d15b53 commit 28c6c18

File tree

1 file changed

+186
-8
lines changed

1 file changed

+186
-8
lines changed

eval.py

Lines changed: 186 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,16 @@
55
from __future__ import print_function
66
from __future__ import unicode_literals
77

8+
from copy import deepcopy
89
from itertools import combinations
910
from itertools import combinations_with_replacement
1011
from itertools import permutations
1112
from itertools import product
13+
import sys
1214

1315
from rdflib import Variable
1416
from scipy.special import binom
17+
from scipy.misc import comb
1518

1619
from logging_config import logging
1720
from graph_pattern import SOURCE_VAR
@@ -24,6 +27,7 @@
2427

2528

2629
DEBUG = False
# Placeholder for not-yet-assigned slots ("holes") in partial patterns. It has
# to compare larger than every real variable number so that list comparison of
# a partially filled triple against a complete one sorts the partial one last.
# ``sys.maxsize`` exists on Python 2.6+ and 3.x (``sys.maxint`` is gone in 3).
HOLE = sys.maxsize
2731

2832
# debug logging in this module is actually quite expensive (> 30 % of time). In
2933
# case it's undesired the following removes that overhead.
@@ -34,6 +38,173 @@ def quick_skip_debug_log(*args, **kwds):
3438
logger.debug = quick_skip_debug_log
3539

3640

41+
def numerical_patterns(
        length,
        _partial_pattern=None,
        _pos=None,
        _var=1,
):
    """Recursively enumerate numerical patterns with `length` triples.

    A pattern is a tuple of 3 tuples of variables, so for example the following
    is a pattern of length 2:
        ((?source, ?v3, ?target), (?target, ?v3, ?v4))

    For brevity, we can write the same as:
        'acb bcd' or numerical as '132 231'

    In the short version we could map ?source to 'a' or '1', ?target to 'b' or
    '2' and the other variables to the following letters / numbers.

    During generation we take care not to generate a whole lot of unnecessary
    duplicates (so patterns that are obviously invalid or isomorphic to
    previous ones).

    A pattern is valid if:
     - its triples are sorted
       NO: 221 112 --> YES: 112 221
     - its triples are pairwise distinct
       NO: 112 112
     - each triple is connected to (shares a variable with) an earlier triple
       NO: 123 456
       YES: 123 345
       YES: 123 132
     - the used variables don't skip a variable
       NO: 124 456 --> YES: 123 345
     - variables aren't unnecessarily high
       NO: 124 334 --> YES: 123 443
       NO: 421 534 --> YES: 123 451
       YES: 312 411
     - it uses between 2 (source and target) and 2n + 1 vars (3 + 2 + 2 + ...)

    :param length: number of triples per pattern. Values < 1 yield nothing.
    :param _partial_pattern: internal, the partially filled pattern (list of
        3-element lists, unfilled slots contain HOLE) of the current branch.
    :param _pos: internal, (triple index, slot index) to fill in this call.
    :param _var: internal, the variable number to put at `_pos`.
    :return: generator of patterns, each a list of `length` lists of 3 ints.
        Every yielded pattern is an independent copy.
    """
    if length < 1:
        # there is no slot to fill, so there are no patterns (previously this
        # ran into an IndexError on the first assignment below)
        return

    if not _partial_pattern:
        _partial_pattern = [[HOLE, HOLE, HOLE] for _ in range(length)]
        _pos = (0, 0)

    i, j = _pos
    # Work on a private copy so sibling branches (and consumers mutating a
    # yielded pattern) can't influence each other. Triples only hold ints, so
    # copying each triple is sufficient and a lot cheaper than deepcopy.
    _partial_pattern = [list(t) for t in _partial_pattern]
    _partial_pattern[i][j] = _var

    if i >= 1 and _partial_pattern[i - 1] >= _partial_pattern[i]:
        # current triple must be larger than previous one for sorting and to
        # exclude multiple equivalent triples
        return

    if i >= 1 and j == 2:
        # we just completed a triple, check that it's connected
        triple = _partial_pattern[i]
        connected = any(
            v in prev_triple
            for prev_triple in _partial_pattern[:i]
            for v in triple
        )
        if not connected:
            # we're not connected, early terminate this
            # This is safe as a later triple can't reconnect us anymore without
            # an isomorphic, lower enumeration that would've been encountered
            # before:
            # say we have
            #   abc xyz uvw
            # with xyz not being connected yet and uvw or any later part
            # connecting xyz back to abc. We can just use a breadth first search
            # from abc via those connecting triples and re-label all encountered
            # vars by breadth first search encountering. That re-labeling is
            # guaranteed to forward connect and it will generate a smaller
            # labelling than the current one.
            return

    if i >= length - 1 and j >= 2:
        # we're at the end of the pattern
        yield _partial_pattern
        return

    # advance to next position (row-major: slot first, then triple)
    j += 1
    if j > 2:
        j = 0
        i += 1

    # everything before the new position in row-major order
    prev_vars = [v for t in _partial_pattern for v in t][:3 * i + j]
    prev_max_var = max(v for v in prev_vars if v != HOLE)
    # NOTE: starting the range at the previous triple's value in the same slot
    # (instead of 1) was tried and does not hold, so always start at 1.
    end_var = min(
        prev_max_var + 1,  # can't skip a var
        3 + 2 * i,  # vars in triple i can't exceed this, otherwise not sorted
    )
    for v in range(1, end_var + 1):
        for pattern in numerical_patterns(
                length,
                _partial_pattern=_partial_pattern,
                _pos=(i, j),
                _var=v,
        ):
            yield pattern
145+
146+
147+
def patterns(
        length,
        exclude_isomorphic=True,
        count_candidates_only=False,
):
    """Generate graph patterns with `length` triples via numerical patterns.

    Enumerates numerical_patterns(length) and, for each of them, every ordered
    pair (s, t) of distinct used variable numbers: s is mapped to SOURCE_VAR,
    t to TARGET_VAR and the remaining numbers to Variable('v0'), 'v1', ... .

    :param length: number of triples per pattern.
    :param exclude_isomorphic: if True, each candidate is passed through
        canonicalize() and candidates whose canonical form was seen before are
        skipped; the canonical form is yielded instead of the raw pattern.
    :param count_candidates_only: if True, no graph patterns are built, only
        the candidate ids are advanced (with an occasional progress log line).
        Can't be combined with exclude_isomorphic.
    :return: generator of (pid, graph_pattern) tuples, where pid is the running
        candidate id. The last item is always (pid + 1, None), so its first
        element is the total number of candidates.
    :raises AssertionError: on first iteration, if both exclude_isomorphic and
        count_candidates_only are set.
    """
    if count_candidates_only and exclude_isomorphic:
        # raised explicitly (not via assert) so the check survives python -O
        raise AssertionError(
            'count_candidates_only cannot be used with isomorphism check')

    # canonical pattern --> (numerical pattern, s, t, first pattern seen)
    canonicalized_patterns = {}

    pid = -1
    for c, num_pat in enumerate(numerical_patterns(length)):
        numbers = sorted(set(v for t in num_pat for v in t))
        if count_candidates_only:
            n_vars = len(numbers)
            # number of ordered (source, target) pairs of distinct vars
            perms = n_vars * (n_vars - 1)
            pid += perms
            # yield pid, None  # way slower, rather show progress from here:
            if c % 100000 == 0:
                logger.info(
                    'pattern id: %d, vars: %d, permutations: %d',
                    pid, n_vars, perms
                )
            continue

        for s, t in permutations(numbers, 2):
            pid += 1
            leftover_numbers = [n for n in numbers if n != s and n != t]
            var_map = {n: Variable('v%d' % i)
                       for i, n in enumerate(leftover_numbers)}
            var_map[s] = SOURCE_VAR
            var_map[t] = TARGET_VAR
            gp = GraphPattern(
                tuple([tuple([var_map[n] for n in trip]) for trip in num_pat]))

            # exclude patterns which are isomorphic to already generated ones
            if exclude_isomorphic:
                cgp = canonicalize(gp)
                if cgp in canonicalized_patterns:
                    igp_numpat, igp_s, igp_t, igp_gp = \
                        canonicalized_patterns[cgp]
                    logger.debug(
                        'excluded isomorphic %s with ?s=%d, ?t=%d:\n'
                        'isomorphic to %s with ?s=%d, ?t=%d:\n'
                        '%sand\n%s',
                        num_pat, s, t,
                        igp_numpat, igp_s, igp_t,
                        gp, igp_gp,
                    )
                    continue
                canonicalized_patterns[cgp] = (num_pat, s, t, gp)
                gp = cgp
            yield pid, gp
    # sentinel: total number of candidate ids handed out
    yield pid + 1, None
206+
207+
37208
def pattern_generator(
38209
length,
39210
loops=True,
@@ -119,24 +290,31 @@ def pattern_generator(
119290

120291

121292
def main():
    """Ad-hoc driver: enumerate patterns of a fixed length and print them.

    Prints every (index, pattern id, pattern) item produced by patterns(),
    then the last pattern id, and finally checks that flipping any edge of a
    generated pattern again results in a generated pattern.
    """
    length = 1
    # len | pcon | nej | pcon, nej    | candidates     | candidates  |
    #     |      |     | (canonical)  | (old method)   | (numerical) |
    # ----+------+-----+--------------+----------------+-------------+
    #   1 |    8 |  12 |           12 |             27 |          12 |
    #   2 |  146 | 469 |          693 |           7750 |        1314 |
    #   3 |      |     |        47478 |        6666891 |      151534 |
    #   4 |      |     |              |    11671285626 |    20884300 |
    #   5 |      |     |              | 34549552710596 |  3461471628 |

    gen_patterns = []
    i = 0
    # NOTE(review): called with count_candidates_only=True, so patterns() only
    # yields its final (total, None) item and the flipped-edge check below runs
    # over an empty set; switch the flags to actually exercise it.
    for n, (i, pattern) in enumerate(patterns(length, False, True)):
        print('%d: Pattern id %d: %s' % (n, i, pattern))
        gen_patterns.append((i, pattern))
    print(i)
    # the last item is the (total, None) sentinel, not a pattern
    _patterns = set(gp for pid, gp in gen_patterns[:-1])

    # testing flipped edges
    for gp in _patterns:
        for edge_idx in range(length):
            mod_gp = gp.flip_edge(edge_idx)
            # can happen that flipped edge was there already
            if len(mod_gp) == length:
                assert canonicalize(mod_gp) in _patterns
140318

141319

142320
if __name__ == '__main__':

0 commit comments

Comments
 (0)