Skip to content

Commit cb7f01e

Browse files
committed
Merge branch 'enhance_docstrings'
* enhance_docstrings: enhanced mutation docstrings, especially for mutate_fix_var; renamed remaining_gain_sample_gtps' parameter n to the more appropriate max_n; added docstrings
2 parents 4622592 + d11a609 commit cb7f01e

File tree

4 files changed

+96
-23
lines changed

4 files changed

+96
-23
lines changed

gp_learner.py

Lines changed: 80 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -386,6 +386,7 @@ def _mutate_merge_var_helper(vars_):
386386

387387

388388
def mutate_merge_var_mix(child):
389+
"""Merges two variables into one, potentially merging node and edge vars."""
389390
vars_ = child.vars_in_graph
390391
rand_vars, merge_able_vars = _mutate_merge_var_helper(vars_)
391392

@@ -399,6 +400,11 @@ def mutate_merge_var_mix(child):
399400

400401

401402
def mutate_merge_var_sep(child):
403+
"""Merges two variables into one, won't merge node and edge vars.
404+
405+
Considers the node variables and edge variables separately.
406+
Depending on availability either merges 2 node variables or 2 edge variables.
407+
"""
402408
node_vars = {n for n in child.nodes if isinstance(n, Variable)}
403409
rand_node_vars, merge_able_node_vars = _mutate_merge_var_helper(node_vars)
404410

@@ -436,6 +442,14 @@ def mutate_del_triple(child):
436442

437443

438444
def mutate_expand_node(child, pb_en_out_link):
445+
"""Expands a random node by adding a new var-only triple to it.
446+
447+
Randomly selects a node. Then (depending on the probability pb_en_out_link)
448+
adds an outgoing or incoming triple with two new vars to it.
449+
450+
:arg pb_en_out_link: Probability to create an outgoing triple.
451+
:return: A child with the added outgoing/incoming triple.
452+
"""
439453
# TODO: can maybe be improved by sparqling
440454
nodes = list(child.nodes)
441455
node = random.choice(nodes)
@@ -449,6 +463,13 @@ def mutate_expand_node(child, pb_en_out_link):
449463

450464

451465
def mutate_add_edge(child):
466+
"""Adds an edge between 2 randomly selected nodes.
467+
468+
Randomly selects two nodes, then adds a new triple (n1, e, n2), where e is
469+
a new variable.
470+
471+
:return: A child with the added edge.
472+
"""
452473
# TODO: can maybe be improved by sparqling
453474
nodes = list(child.nodes)
454475
if len(nodes) < 2:
@@ -460,6 +481,13 @@ def mutate_add_edge(child):
460481

461482

462483
def mutate_increase_dist(child):
484+
"""Increases the distance between ?source and ?target by one hop.
485+
486+
Randomly adds a var only triple to the ?source or ?target var. Then swaps
487+
the new node with ?source/?target to increase the distance by one hop.
488+
489+
:return: A child with increased distance between ?source and ?target.
490+
"""
463491
if not child.complete():
464492
return child
465493
var_node = gen_random_var()
@@ -477,6 +505,13 @@ def mutate_increase_dist(child):
477505

478506

479507
def mutate_fix_var_filter(item_counts):
508+
"""Filters results for fix var mutation.
509+
510+
Excludes:
511+
- too long literals
512+
- URIs with encoding errors (real world!)
513+
- BNode results (they will not be fixed but stay SPARQL vars)
514+
"""
480515
assert isinstance(item_counts, Counter)
481516
for i in list(item_counts.keys()):
482517
if isinstance(i, Literal):
@@ -514,21 +549,59 @@ def mutate_fix_var(
514549
timeout,
515550
gtp_scores,
516551
child,
517-
gtp_sample_n=config.MUTPB_FV_RGTP_SAMPLE_N,
552+
gtp_sample_max_n=config.MUTPB_FV_RGTP_SAMPLE_N,
518553
rand_var=None,
519-
sample_n=config.MUTPB_FV_SAMPLE_MAXN,
554+
sample_max_n=config.MUTPB_FV_SAMPLE_MAXN,
520555
limit=config.MUTPB_FV_QUERY_LIMIT,
521556
):
557+
"""Finds possible fixations for a randomly selected variable of the pattern.
558+
559+
This is a very important mutation of the gp learner, as it is the main
560+
source of actually gaining information from the SPARQL endpoint.
561+
562+
The outline of the mutation is as follows:
563+
- If not passed in, randomly selects a variable (rand_var) of the pattern
564+
(node or edge var, excluding ?source and ?target).
565+
- Randomly selects a subset of up to gtp_sample_max_n GTPs with
566+
probabilities according to their remaining gains. The number of GTPs
567+
picked is randomized (see below).
568+
- Issues SPARQL queries to find possible fixations for the selected variable
569+
under the previously selected GTPs subset. Counts the fixation's
570+
occurrences wrt. the GTPs and sorts the result descending by these counts.
571+
- Limits the result rows to deal with potential long-tails.
572+
- Filters the resulting rows with mutate_fix_var_filter.
573+
- From the limited, filtered result rows randomly selects up to sample_max_n
574+
candidate fixations with probabilities according to their counts.
575+
- For each candidate fixation returns a child in which rand_var is replaced
576+
with the candidate fixation.
577+
578+
The reasons for fixing rand_var based on a randomly sized subset of GTPs
579+
are efficiency and shadowing problems with common long-tails. Due to the
580+
later imposed limit (which is vital in real world use-cases),
581+
a few remaining GTPs that share more than `limit` potential fixations (so
582+
have a common long-tail) could otherwise hide solutions for other
583+
remaining GTPs. This can be the case if these common fixations have low
584+
fitness. By randomizing the subset size, we will eventually (and more
585+
likely) select other combinations of remaining GTPs.
586+
587+
:param gtp_sample_max_n: Maximum GTPs subset size to base fixations on.
588+
:param rand_var: If given uses this variable instead of a random one.
589+
:param sample_max_n: Maximum number of children.
590+
:param limit: SPARQL limit for the top-k result rows.
591+
:return: A list of children in which the selected variable is substituted
592+
with fixation candidates wrt. GTPs.
593+
"""
522594
assert isinstance(child, GraphPattern)
523595
assert isinstance(gtp_scores, GTPScores)
524596

525597
# The further we get, the less gtps are remaining. Sampling too many (all)
526598
# of them might hurt as common substitutions (> limit ones) which are dead
527599
# ends could cover less common ones that could actually help
528-
gtp_sample_n = min(gtp_sample_n, int(gtp_scores.remaining_gain))
529-
gtp_sample_n = random.randint(1, gtp_sample_n)
600+
gtp_sample_max_n = min(gtp_sample_max_n, int(gtp_scores.remaining_gain))
601+
gtp_sample_max_n = random.randint(1, gtp_sample_max_n)
530602

531-
ground_truth_pairs = gtp_scores.remaining_gain_sample_gtps(n=gtp_sample_n)
603+
ground_truth_pairs = gtp_scores.remaining_gain_sample_gtps(
604+
max_n=gtp_sample_max_n)
532605
rand_vars = child.vars_in_graph - {SOURCE_VAR, TARGET_VAR}
533606
if len(rand_vars) < 1:
534607
return [child]
@@ -549,13 +622,13 @@ def mutate_fix_var(
549622
return [child]
550623
# randomly pick n of the substitutions with a prob ~ to their counts
551624
items, counts = zip(*substitution_counts.most_common())
552-
substs = sample_from_list(items, counts, sample_n)
625+
substs = sample_from_list(items, counts, sample_max_n)
553626
logger.info(
554627
'fixed variable %s in %sto:\n %s\n<%d out of:\n%s\n',
555628
rand_var.n3(),
556629
child,
557630
'\n '.join([subst.n3() for subst in substs]),
558-
sample_n,
631+
sample_max_n,
559632
'\n'.join([' %d: %s' % (c, v.n3())
560633
for v, c in substitution_counts.most_common()]),
561634
)

gtp_scores.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -52,19 +52,19 @@ def update_with_gps(self, gps):
5252
self.gtp_max_precisions[gtp] = precision
5353
return precision_gain
5454

55-
def remaining_gain_sample_gtps(self, n=None):
55+
def remaining_gain_sample_gtps(self, max_n=None):
5656
"""Sample ground truth pairs according to remaining gains.
5757
58-
This method draws up to n ground truth pairs using their remaining gains
59-
as sample probabilities. If less than n probabilities are > 0 it draws
60-
less gtps.
58+
This method draws up to max_n ground truth pairs using their remaining
59+
gains as sample probabilities. GTPs with remaining gain of 0 are never
60+
returned, so if fewer than max_n probabilities are > 0 it draws fewer gtps.
6161
62-
:param n: Up to n items to sample.
62+
:param max_n: Up to max_n items to sample.
6363
:return: list of ground truth pairs sampled according to their remaining
6464
gains in gtp_scores with max length of n.
6565
"""
6666
gtps, gains = zip(*self.get_remaining_gains().items())
67-
return sample_from_list(gtps, gains, n)
67+
return sample_from_list(gtps, gains, max_n)
6868

6969
def __sub__(self, other):
7070
if not isinstance(other, GTPScores):

tests/test_gp_learner_offline.py

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -169,14 +169,13 @@ def test_simplify_pattern():
169169
assert res == gp, 'not simplified:\n%s' % res.to_sparql_select_query()
170170

171171
# counter example of an advanced but restricting pattern:
172-
gp = gp + [
172+
gp += [
173173
(SOURCE_VAR, Variable('v3'), Variable('v4')),
174174
(Variable('v5'), Variable('v6'), Variable('v4')),
175175
(Variable('v4'), Variable('v7'), Variable('v8')),
176176
(TARGET_VAR, Variable('v3'), SOURCE_VAR),
177177
(dbp['City'], Variable('v6'), dbp['Country']),
178178
(dbp['Country'], Variable('v8'), dbp['City']),
179-
180179
]
181180
res = mutate_simplify_pattern(gp)
182181
assert res == gp, 'was simplified (bad):\n%s' % res.to_sparql_select_query()
@@ -221,22 +220,22 @@ def test_simplify_pattern():
221220

222221
def test_remaining_gain_sample_gtps():
223222
n = len(ground_truth_pairs)
224-
gtps = sorted(gtp_scores.remaining_gain_sample_gtps(n=n))
223+
gtps = sorted(gtp_scores.remaining_gain_sample_gtps(max_n=n))
225224
assert len(gtps) == n
226225
# if we draw everything the results should always be everything
227-
assert gtps == sorted(gtp_scores.remaining_gain_sample_gtps(n=n))
226+
assert gtps == sorted(gtp_scores.remaining_gain_sample_gtps(max_n=n))
228227
# if we don't draw everything it's quite unlikely we get the same result
229-
gtps = gtp_scores.remaining_gain_sample_gtps(n=5)
228+
gtps = gtp_scores.remaining_gain_sample_gtps(max_n=5)
230229
assert len(gtps) == 5
231-
assert gtps != gtp_scores.remaining_gain_sample_gtps(n=5)
230+
assert gtps != gtp_scores.remaining_gain_sample_gtps(max_n=5)
232231

233232
# make sure we never get items that are fully covered already
234233
gtp_scores.gtp_max_precisions[ground_truth_pairs[0]] = 1
235234
c = Counter()
236235
k = 100
237236
n = 128
238237
for i in range(k):
239-
c.update(gtp_scores.remaining_gain_sample_gtps(n=n))
238+
c.update(gtp_scores.remaining_gain_sample_gtps(max_n=n))
240239
assert ground_truth_pairs[0] not in c
241240
assert sum(c.values()) == k * n
242241
# count how many aren't in gtps
@@ -260,7 +259,7 @@ def test_remaining_gain_sample_gtps():
260259
assert gtpe_scores.remaining_gain == 1
261260
c = Counter()
262261
for i in range(100):
263-
c.update(gtpe_scores.remaining_gain_sample_gtps(n=1))
262+
c.update(gtpe_scores.remaining_gain_sample_gtps(max_n=1))
264263
assert len(c) == 2
265264
assert sum(c.values()) == 100
266265
assert (binom.pmf(c[high_prob], 100, .9) > 0.001 and

utils.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -206,8 +206,9 @@ def sample_from_list(l, probs, max_n=None):
206206
"""Sample list according to probs.
207207
208208
This method draws up to max_n items from l using the given list of probs as
209-
sample probabilities. max_n defaults to len(l) if not specified. If less
210-
than max_n probabilities are > 0 only those items are returned.
209+
sample probabilities. max_n defaults to len(l) if not specified. Items with
210+
probability 0 are never sampled, so if fewer than max_n probabilities are > 0
211+
only those items are returned.
211212
212213
:param l: list from which to draw items.
213214
:param probs: List of probabilities to draw items. Normalized by sum(probs).

0 commit comments

Comments
 (0)