5
5
from __future__ import print_function
6
6
from __future__ import unicode_literals
7
7
8
+ from copy import deepcopy
8
9
from itertools import combinations
9
10
from itertools import combinations_with_replacement
10
11
from itertools import permutations
11
12
from itertools import product
13
+ import sys
12
14
13
15
from rdflib import Variable
14
16
from scipy .special import binom
17
+ from scipy .misc import comb
15
18
16
19
from logging_config import logging
17
20
from graph_pattern import SOURCE_VAR
24
27
25
28
26
29
DEBUG = False
# Placeholder for positions of a partial pattern that aren't filled yet. It
# only needs to be larger than any variable number that can occur, so that
# triples containing holes compare greater than completed ones.
# sys.maxsize is used as sys.maxint doesn't exist on Python 3.
HOLE = sys.maxsize
27
31
28
32
# debug logging in this module is actually quite expensive (> 30 % of time). In
29
33
# case it's undesired the following removes that overhead.
@@ -34,6 +38,173 @@ def quick_skip_debug_log(*args, **kwds):
34
38
logger .debug = quick_skip_debug_log
35
39
36
40
41
def numerical_patterns(
        length,
        _partial_pattern=None,
        _pos=None,
        _var=1,
):
    """Numerical pattern generator.

    A pattern is a tuple of 3 tuples of variables, so for example the following
    is a pattern of length 2:
        ((?source, ?v3, ?target), (?target, ?v3, ?v4))

    For brevity, we can write the same as:
        'acb bcd' or numerical as '132 231'

    In the short version we could map ?source to 'a' or '1', ?target to 'b' or
    '2' and the other variables to the following letters / numbers.

    During generation we should take care that we don't generate a whole lot of
    unnecessary duplicates (so patterns that are obviously invalid or isomorphic
    to previous ones).

    A pattern is valid if:
     - its triples are sorted
         NO: 221 112 --> YES: 112 221
     - its triples are pairwise distinct
         NO: 112 112
     - its triples are pairwise connected
         NO: 123 456
         YES: 123 345
         YES: 123 132
     - the used variables don't skip a variable
         NO: 124 456 --> YES: 123 345
     - variables aren't unnecessarily high
         NO: 124 334 --> YES: 123 443
         NO: 421 534 --> YES: 123 451
         YES: 312 411
     - it uses between 2 (source and target) and 2n + 1 vars (3 + 2 + 2 + ...)

    :param length: number of triples per pattern. For length < 1 nothing is
        generated.
    :param _partial_pattern: internal, the pattern filled so far (list of
        3-element lists, unfilled positions are HOLE).
    :param _pos: internal, (triple index, position in triple) to fill next.
    :param _var: internal, the variable number to put at _pos.
    :return: generator yielding each candidate as a list of `length` lists of
        3 ints. Each yielded list is a fresh copy.
    """
    if length < 1:
        # no triples to fill, so there are no patterns (previously this ended
        # in an IndexError on the empty partial pattern)
        return

    if not _partial_pattern:
        _partial_pattern = [[HOLE, HOLE, HOLE] for _ in range(length)]
        _pos = (0, 0)

    i, j = _pos
    # Copy so that sibling branches of the recursion don't see our change.
    # Rows only contain ints, so copying each row is equivalent to (and much
    # cheaper than) a deepcopy.
    _partial_pattern = [list(t) for t in _partial_pattern]
    _partial_pattern[i][j] = _var

    if i >= 1 and _partial_pattern[i - 1] >= _partial_pattern[i]:
        # current triple must be larger than previous one for sorting and to
        # exclude multiple equivalent triples
        return

    if i >= 1 and j == 2:
        # we just completed a triple, check that it's connected
        t = _partial_pattern[i]
        if not any(v in pt for pt in _partial_pattern[:i] for v in t):
            # we're not connected, early terminate this
            # This is safe as a later triple can't reconnect us anymore without
            # an isomorphic, lower enumeration that would've been encountered
            # before:
            # say we have
            #   abc xyz uvw
            # with xyz not being connected yet and uvw or any later part
            # connecting xyz back to abc. We can just use a breadth first search
            # from abc via those connecting triples and re-label all encountered
            # vars by breadth first search encountering. That re-labeling is
            # guaranteed to forward connect and it will generate a smaller
            # labelling than the current one.
            return

    if i >= length - 1 and j >= 2:
        # we're at the end of the pattern
        yield _partial_pattern
        return

    # advance to next position
    j += 1
    if j > 2:
        j = 0
        i += 1

    # everything before the new position is already filled
    prev_vars = [v for t in _partial_pattern for v in t][:3 * i + j]
    prev_max_var = max(v for v in prev_vars if v != HOLE)
    # NOTE: starting at _partial_pattern[i - 1][j] instead of 1 was tried as
    # a further restriction, but doesn't seem to hold.
    end_var = min(
        prev_max_var + 1,  # can't skip a var
        3 + 2 * i,  # vars in triple i can't exceed this, otherwise not sorted
    )
    for v in range(1, end_var + 1):
        for pattern in numerical_patterns(
                length,
                _partial_pattern=_partial_pattern,
                _pos=(i, j),
                _var=v
        ):
            yield pattern
145
+
146
+
147
def patterns(
        length,
        exclude_isomorphic=True,
        count_candidates_only=False,
):
    """Generates graph patterns of the given length from numerical patterns.

    Iterates over numerical_patterns(length) and, for each of them, over all
    ordered pairs (s, t) of distinct variable numbers used in it. s is mapped
    to SOURCE_VAR, t to TARGET_VAR and the remaining numbers to Variables
    v0, v1, ... (in ascending order of their number).

    :param length: number of triples per pattern.
    :param exclude_isomorphic: if True each pattern is passed through
        canonicalize() and only the first pattern per canonical form is
        yielded (as its canonical form), later ones are skipped.
    :param count_candidates_only: if True no patterns are built, only the
        candidates are counted (with an info log every 100000 numerical
        patterns). Can't be combined with exclude_isomorphic.
    :return: generator of (pid, graph pattern) tuples, followed by one final
        (pid + 1, None) tuple. With count_candidates_only that final tuple is
        the only one and its first item is the number of candidates.
    """
    assert not count_candidates_only or not exclude_isomorphic, \
        'count_candidates_only cannot be used with isomorphism check'

    # canonical pattern --> (num_pat, s, t, gp) of its first occurrence
    seen = {}
    pid = -1

    for c, num_pat in enumerate(numerical_patterns(length)):
        numbers = sorted({v for trip in num_pat for v in trip})

        if count_candidates_only:
            # every ordered pair of distinct vars is one candidate
            n_vars = len(numbers)
            perms = n_vars * (n_vars - 1)
            pid += perms
            # yielding each count is way slower, rather show progress here
            if c % 100000 == 0:
                logger.info(
                    'pattern id: %d, vars: %d, permutations: %d',
                    pid, n_vars, perms
                )
            continue

        for s, t in permutations(numbers, 2):
            pid += 1
            others = [n for n in numbers if n != s and n != t]
            var_map = {}
            for idx, n in enumerate(others):
                var_map[n] = Variable('v%d' % idx)
            var_map[s] = SOURCE_VAR
            var_map[t] = TARGET_VAR
            gp = GraphPattern(tuple(
                tuple(var_map[n] for n in trip) for trip in num_pat
            ))

            if exclude_isomorphic:
                # exclude patterns which are isomorphic to already generated
                # ones
                cgp = canonicalize(gp)
                if cgp in seen:
                    igp_numpat, igp_s, igp_t, igp_gp = seen[cgp]
                    logger.debug(
                        'excluded isomorphic %s with ?s=%d, ?t=%d:\n '
                        'isomorphic to %s with ?s=%d, ?t=%d:\n '
                        '%sand\n %s',
                        num_pat, s, t,
                        igp_numpat, igp_s, igp_t,
                        gp, igp_gp,
                    )
                    continue
                seen[cgp] = (num_pat, s, t, gp)
                gp = cgp

            yield pid, gp

    yield pid + 1, None
206
+
207
+
37
208
def pattern_generator (
38
209
length ,
39
210
loops = True ,
@@ -119,24 +290,31 @@ def pattern_generator(
119
290
120
291
121
292
def main():
    """Prints the generated patterns and checks them against edge flips.

    All (pid, pattern) tuples of patterns() are printed and collected. For
    every collected pattern (all but the final tuple) each edge is flipped and,
    if the result still has `length` triples, its canonical form is asserted to
    be among the collected patterns.
    """
    length = 1
    # len | pcon | nej | pcon, nej    | candidates     | candidates  |
    #     |      |     | (canonical)  | (old method)   | (numerical) |
    # ----+------+-----+--------------+----------------+-------------+
    #   1 |    8 |  12 |           12 |             27 |          12 |
    #   2 |  146 | 469 |          693 |           7750 |        1314 |
    #   3 |      |     |        47478 |        6666891 |      151534 |
    #   4 |      |     |              |    11671285626 |    20884300 |
    #   5 |      |     |              | 34549552710596 |  3461471628 |

    gen_patterns = []
    last_pid = 0
    for n, (last_pid, pattern) in enumerate(patterns(length, False, True)):
        print('%d: Pattern id %d: %s' % (n, last_pid, pattern))
        gen_patterns.append((last_pid, pattern))
    print(last_pid)

    # the last tuple only carries the total, not a pattern
    unique_gps = {gp for _, gp in gen_patterns[:-1]}

    # testing flipped edges
    for gp in unique_gps:
        for edge_idx in range(length):
            flipped = gp.flip_edge(edge_idx)
            if len(flipped) != length:
                # can happen that flipped edge was there already
                continue
            assert canonicalize(flipped) in unique_gps
140
318
141
319
142
320
if __name__ == '__main__' :
0 commit comments