@@ -364,12 +364,7 @@ def _mutate_merge_var_helper(vars_):
364
364
365
365
366
366
def mutate_merge_var_mix (child ):
367
- """Merges two variables into one.
368
-
369
- Considers both node variables and edge variables together.
370
- It is possible to merge an edge and a node too.
371
- Randomly chooses a variable to replace and a variable to merge into.
372
- """
367
+ """Merges two variables into one, potentially merging node and edge vars."""
373
368
vars_ = child .vars_in_graph
374
369
rand_vars , merge_able_vars = _mutate_merge_var_helper (vars_ )
375
370
@@ -383,11 +378,10 @@ def mutate_merge_var_mix(child):
383
378
384
379
385
380
def mutate_merge_var_sep (child ):
386
- """Merges two variables into one.
381
+ """Merges two variables into one, won't merge node and edge vars.
387
382
388
383
Considers the node variables and edge variables separately.
389
- Either merges 2 node variables or 2 edge variable, depending on a random
390
- choice.Randomly chooses a variable to replace and a variable to merge into.
384
+ Depending on availability either merges 2 node or 2 edge variables.
391
385
"""
392
386
node_vars = {n for n in child .nodes if isinstance (n , Variable )}
393
387
rand_node_vars , merge_able_node_vars = _mutate_merge_var_helper (node_vars )
@@ -426,12 +420,13 @@ def mutate_del_triple(child):
426
420
427
421
428
422
def mutate_expand_node (child , pb_en_out_link ):
429
- """Expands a random node of the pattern by adding a new triple to it.
423
+ """Expands a random node by adding a new var-only triple to it.
430
424
431
- The variables to be attached to this node, to form a triple, are chosen
432
- randomly.Depending on the probability, makes it an outgoing edge or an
433
- incoming edge.
434
- :return: The modified child, with the added triple.
425
+ Randomly selects a node. Then (depending on the probability pb_en_out_link)
426
+ adds an outgoing or incoming triple with two new vars to it.
427
+
428
+ :arg pb_en_out_link: Probability to create an outgoing triple.
429
+ :return: A child with the added outgoing/incoming triple.
435
430
"""
436
431
# TODO: can maybe be improved by sparqling
437
432
nodes = list (child .nodes )
@@ -446,10 +441,12 @@ def mutate_expand_node(child, pb_en_out_link):
446
441
447
442
448
443
def mutate_add_edge (child ):
449
- """Chooses any 2 nodes from the pattern, and adds an edge between them.
444
+ """Adds an edge between 2 randomly selected nodes.
445
+
446
+ Randomly selects two nodes, then adds a new triple (n1, e, n2), where e is
447
+ a new variable.
450
448
451
- The edge is labeled with a new randomly chosen variable.
452
- :return: Modified child, with the new edge
449
+ :return: A child with the added edge.
453
450
"""
454
451
# TODO: can maybe be improved by sparqling
455
452
nodes = list (child .nodes )
@@ -462,11 +459,12 @@ def mutate_add_edge(child):
462
459
463
460
464
461
def mutate_increase_dist (child ):
465
- """increases distance between source and target by one hop.
462
+ """Increases the distance between ?source and ?target by one hop.
466
463
467
- Adds a triple, to either the source var or the target var.
468
- Interchange the new node with source/target variable to increase distance.
469
- :return: The modified child, with the new triple.
464
+ Randomly adds a var-only triple to the ?source or ?target var. Then swaps
465
+ the new node with ?source/?target to increase the distance by one hop.
466
+
467
+ :return: A child with increased distance between ?source and ?target.
470
468
"""
471
469
if not child .complete ():
472
470
return child
@@ -485,6 +483,13 @@ def mutate_increase_dist(child):
485
483
486
484
487
485
def mutate_fix_var_filter (item_counts ):
486
+ """Filters results for fix var mutation.
487
+
488
+ Excludes:
489
+ - too long literals
490
+ - URIs with encoding errors (real world!)
491
+ - BNode results (they will not be fixed but stay SPARQL vars)
492
+ """
488
493
assert isinstance (item_counts , Counter )
489
494
for i in list (item_counts .keys ()):
490
495
if isinstance (i , Literal ):
@@ -527,9 +532,42 @@ def mutate_fix_var(
527
532
sample_max_n = config .MUTPB_FV_SAMPLE_MAXN ,
528
533
limit = config .MUTPB_FV_QUERY_LIMIT ,
529
534
):
530
- """Chooses a random variable from the pattern(node or edge).
531
-
532
- Substitutes it with all possible fixed variables.
535
+ """Finds possible fixations for a randomly selected variable of the pattern.
536
+
537
+ This is a very important mutation of the gp learner, as it is the main
538
+ source of actually gaining information from the SPARQL endpoint.
539
+
540
+ The outline of the mutation is as follows:
541
+ - If not passed in, randomly selects a variable (rand_var) of the pattern
542
+ (node or edge var, excluding ?source and ?target).
543
+ - Randomly selects a subset of up to gtp_sample_max_n GTPs with
544
+ probabilities according to their remaining gains. The number of GTPs
545
+ picked is randomized (see below).
546
+ - Issues SPARQL queries to find possible fixations for the selected variable
547
+ under the previously selected GTPs subset. Counts the fixation's
548
+ occurrences wrt. the GTPs and sorts the result descending by these counts.
549
+ - Limits the result rows to deal with potential long-tails.
550
+ - Filters the resulting rows with mutate_fix_var_filter.
551
+ - From the limited, filtered result rows randomly selects up to sample_max_n
552
+ candidate fixations with probabilities according to their counts.
553
+ - For each candidate fixation returns a child in which rand_var is replaced
554
+ with the candidate fixation.
555
+
556
+ The reasons for fixing rand_var based on a randomly sized subset of GTPs
557
+ are efficiency and shadowing problems with common long-tails. Due to the
558
+ later imposed limit (which is vital in real world use-cases),
559
+ a few remaining GTPs that share more than `limit` potential fixations (so
560
+ have a common long-tail) could otherwise hide solutions for other
561
+ remaining GTPs. This can be the case if these common fixations have low
562
+ fitness. By randomizing the subset size, we will eventually (and more
563
+ likely) select other combinations of remaining GTPs.
564
+
565
+ :param gtp_sample_max_n: Maximum GTPs subset size to base fixations on.
566
+ :param rand_var: If given uses this variable instead of a random one.
567
+ :param sample_max_n: Maximum number of children.
568
+ :param limit: SPARQL limit for the top-k result rows.
569
+ :return: A list of children in which the selected variable is substituted
570
+ with fixation candidates wrt. GTPs.
533
571
"""
534
572
assert isinstance (child , GraphPattern )
535
573
assert isinstance (gtp_scores , GTPScores )
0 commit comments