Skip to content

Commit d11a609

Browse files
committed
enhanced mutation docstrings, especially mutate_fix_var
1 parent 8065067 commit d11a609

File tree

1 file changed

+62
-24
lines changed

1 file changed

+62
-24
lines changed

gp_learner.py

Lines changed: 62 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -364,12 +364,7 @@ def _mutate_merge_var_helper(vars_):
364364

365365

366366
def mutate_merge_var_mix(child):
367-
"""Merges two variables into one.
368-
369-
Considers both node variables and edge variables together.
370-
It is possible to merge an edge and a node too.
371-
Randomly chooses a variable to replace and a variable to merge into.
372-
"""
367+
"""Merges two variables into one, potentially merging node and edge vars."""
373368
vars_ = child.vars_in_graph
374369
rand_vars, merge_able_vars = _mutate_merge_var_helper(vars_)
375370

@@ -383,11 +378,10 @@ def mutate_merge_var_mix(child):
383378

384379

385380
def mutate_merge_var_sep(child):
386-
"""Merges two variables into one.
381+
"""Merges two variables into one, won't merge node and edge vars.
387382
388383
Considers the node variables and edge variables separately.
389-
Either merges 2 node variables or 2 edge variable, depending on a random
390-
choice.Randomly chooses a variable to replace and a variable to merge into.
384+
Depending on availability either merges 2 node variables or 2 edge variables.
391385
"""
392386
node_vars = {n for n in child.nodes if isinstance(n, Variable)}
393387
rand_node_vars, merge_able_node_vars = _mutate_merge_var_helper(node_vars)
@@ -426,12 +420,13 @@ def mutate_del_triple(child):
426420

427421

428422
def mutate_expand_node(child, pb_en_out_link):
429-
"""Expands a random node of the pattern by adding a new triple to it.
423+
"""Expands a random node by adding a new var-only triple to it.
430424
431-
The variables to be attached to this node, to form a triple, are chosen
432-
randomly.Depending on the probability, makes it an outgoing edge or an
433-
incoming edge.
434-
:return: The modified child, with the added triple.
425+
Randomly selects a node. Then (depending on the probability pb_en_out_link)
426+
adds an outgoing or incoming triple with two new vars to it.
427+
428+
:arg pb_en_out_link: Probability to create an outgoing triple.
429+
:return: A child with the added outgoing/incoming triple.
435430
"""
436431
# TODO: can maybe be improved by sparqling
437432
nodes = list(child.nodes)
@@ -446,10 +441,12 @@ def mutate_expand_node(child, pb_en_out_link):
446441

447442

448443
def mutate_add_edge(child):
449-
"""Chooses any 2 nodes from the pattern, and adds an edge between them.
444+
"""Adds an edge between 2 randomly selected nodes.
445+
446+
Randomly selects two nodes, then adds a new triple (n1, e, n2), where e is
447+
a new variable.
450448
451-
The edge is labeled with a new randomly chosen variable.
452-
:return: Modified child, with the new edge
449+
:return: A child with the added edge.
453450
"""
454451
# TODO: can maybe be improved by sparqling
455452
nodes = list(child.nodes)
@@ -462,11 +459,12 @@ def mutate_add_edge(child):
462459

463460

464461
def mutate_increase_dist(child):
465-
"""increases distance between source and target by one hop.
462+
"""Increases the distance between ?source and ?target by one hop.
466463
467-
Adds a triple, to either the source var or the target var.
468-
Interchange the new node with source/target variable to increase distance.
469-
:return: The modified child, with the new triple.
464+
Randomly adds a var-only triple to the ?source or ?target var. Then swaps
465+
the new node with ?source/?target to increase the distance by one hop.
466+
467+
:return: A child with increased distance between ?source and ?target.
470468
"""
471469
if not child.complete():
472470
return child
@@ -485,6 +483,13 @@ def mutate_increase_dist(child):
485483

486484

487485
def mutate_fix_var_filter(item_counts):
486+
"""Filters results for fix var mutation.
487+
488+
Excludes:
489+
- too long literals
490+
- URIs with encoding errors (real world!)
491+
- BNode results (they will not be fixed but stay SPARQL vars)
492+
"""
488493
assert isinstance(item_counts, Counter)
489494
for i in list(item_counts.keys()):
490495
if isinstance(i, Literal):
@@ -527,9 +532,42 @@ def mutate_fix_var(
527532
sample_max_n=config.MUTPB_FV_SAMPLE_MAXN,
528533
limit=config.MUTPB_FV_QUERY_LIMIT,
529534
):
530-
"""Chooses a random variable from the pattern(node or edge).
531-
532-
Substitutes it with all possible fixed variables.
535+
"""Finds possible fixations for a randomly selected variable of the pattern.
536+
537+
This is a very important mutation of the gp learner, as it is the main
538+
source of actually gaining information from the SPARQL endpoint.
539+
540+
The outline of the mutation is as follows:
541+
- If not passed in, randomly selects a variable (rand_var) of the pattern
542+
(node or edge var, excluding ?source and ?target).
543+
- Randomly selects a subset of up to gtp_sample_max_n GTPs with
544+
probabilities according to their remaining gains. The number of GTPs
545+
picked is randomized (see below).
546+
- Issues SPARQL queries to find possible fixations for the selected variable
547+
under the previously selected GTPs subset. Counts the fixations'
548+
occurrences wrt. the GTPs and sorts the result descending by these counts.
549+
- Limits the result rows to deal with potential long-tails.
550+
- Filters the resulting rows with mutate_fix_var_filter.
551+
- From the limited, filtered result rows randomly selects up to sample_max_n
552+
candidate fixations with probabilities according to their counts.
553+
- For each candidate fixation returns a child in which rand_var is replaced
554+
with the candidate fixation.
555+
556+
The reasons for fixing rand_var based on a randomly sized subset of GTPs
557+
are efficiency and shadowing problems with common long-tails. Due to the
558+
later imposed limit (which is vital in real world use-cases),
559+
a few remaining GTPs that share more than `limit` potential fixations (so
560+
have a common long-tail) could otherwise hide solutions for other
561+
remaining GTPs. This can be the case if these common fixations have low
562+
fitness. By randomizing the subset size, we will eventually (and more
563+
likely) select other combinations of remaining GTPs.
564+
565+
:param gtp_sample_max_n: Maximum GTPs subset size to base fixations on.
566+
:param rand_var: If given uses this variable instead of a random one.
567+
:param sample_max_n: Maximum number of children.
568+
:param limit: SPARQL limit for the top-k result rows.
569+
:return: A list of children in which the selected variable is substituted
570+
with fixation candidates wrt. GTPs.
533571
"""
534572
assert isinstance(child, GraphPattern)
535573
assert isinstance(gtp_scores, GTPScores)

0 commit comments

Comments
 (0)