@@ -386,6 +386,7 @@ def _mutate_merge_var_helper(vars_):
386
386
387
387
388
388
def mutate_merge_var_mix (child ):
389
+ """Merges two variables into one, potentially merging node and edge vars."""
389
390
vars_ = child .vars_in_graph
390
391
rand_vars , merge_able_vars = _mutate_merge_var_helper (vars_ )
391
392
@@ -399,6 +400,11 @@ def mutate_merge_var_mix(child):
399
400
400
401
401
402
def mutate_merge_var_sep (child ):
403
+ """Merges two variables into one, won't merge node and edge vars.
404
+
405
+ Considers the node variables and edge variables separately.
406
+ Depending on availability either merges 2 node variables or 2 edge variables.
407
+ """
402
408
node_vars = {n for n in child .nodes if isinstance (n , Variable )}
403
409
rand_node_vars , merge_able_node_vars = _mutate_merge_var_helper (node_vars )
404
410
@@ -436,6 +442,14 @@ def mutate_del_triple(child):
436
442
437
443
438
444
def mutate_expand_node (child , pb_en_out_link ):
445
+ """Expands a random node by adding a new var-only triple to it.
446
+
447
+ Randomly selects a node. Then (depending on the probability pb_en_out_link)
448
+ adds an outgoing or incoming triple with two new vars to it.
449
+
450
+ :arg pb_en_out_link: Probability to create an outgoing triple.
451
+ :return: A child with the added outgoing/incoming triple.
452
+ """
439
453
# TODO: can maybe be improved by sparqling
440
454
nodes = list (child .nodes )
441
455
node = random .choice (nodes )
@@ -449,6 +463,13 @@ def mutate_expand_node(child, pb_en_out_link):
449
463
450
464
451
465
def mutate_add_edge (child ):
466
+ """Adds an edge between 2 randomly selected nodes.
467
+
468
+ Randomly selects two nodes, then adds a new triple (n1, e, n2), where e is
469
+ a new variable.
470
+
471
+ :return: A child with the added edge.
472
+ """
452
473
# TODO: can maybe be improved by sparqling
453
474
nodes = list (child .nodes )
454
475
if len (nodes ) < 2 :
@@ -460,6 +481,13 @@ def mutate_add_edge(child):
460
481
461
482
462
483
def mutate_increase_dist (child ):
484
+ """Increases the distance between ?source and ?target by one hop.
485
+
486
+ Randomly adds a var-only triple to the ?source or ?target var. Then swaps
487
+ the new node with ?source/?target to increase the distance by one hop.
488
+
489
+ :return: A child with increased distance between ?source and ?target.
490
+ """
463
491
if not child .complete ():
464
492
return child
465
493
var_node = gen_random_var ()
@@ -477,6 +505,13 @@ def mutate_increase_dist(child):
477
505
478
506
479
507
def mutate_fix_var_filter (item_counts ):
508
+ """Filters results for fix var mutation.
509
+
510
+ Excludes:
511
+ - too long literals
512
+ - URIs with encoding errors (real world!)
513
+ - BNode results (they will not be fixed but stay SPARQL vars)
514
+ """
480
515
assert isinstance (item_counts , Counter )
481
516
for i in list (item_counts .keys ()):
482
517
if isinstance (i , Literal ):
@@ -514,21 +549,59 @@ def mutate_fix_var(
514
549
timeout ,
515
550
gtp_scores ,
516
551
child ,
517
- gtp_sample_n = config .MUTPB_FV_RGTP_SAMPLE_N ,
552
+ gtp_sample_max_n = config .MUTPB_FV_RGTP_SAMPLE_N ,
518
553
rand_var = None ,
519
- sample_n = config .MUTPB_FV_SAMPLE_MAXN ,
554
+ sample_max_n = config .MUTPB_FV_SAMPLE_MAXN ,
520
555
limit = config .MUTPB_FV_QUERY_LIMIT ,
521
556
):
557
+ """Finds possible fixations for a randomly selected variable of the pattern.
558
+
559
+ This is a very important mutation of the gp learner, as it is the main
560
+ source of actually gaining information from the SPARQL endpoint.
561
+
562
+ The outline of the mutation is as follows:
563
+ - If not passed in, randomly selects a variable (rand_var) of the pattern
564
+ (node or edge var, excluding ?source and ?target).
565
+ - Randomly selects a subset of up to gtp_sample_max_n GTPs with
566
+ probabilities according to their remaining gains. The number of GTPs
567
+ picked is randomized (see below).
568
+ - Issues SPARQL queries to find possible fixations for the selected variable
569
+ under the previously selected GTPs subset. Counts the fixation's
570
+ occurrences wrt. the GTPs and sorts the result descending by these counts.
571
+ - Limits the result rows to deal with potential long-tails.
572
+ - Filters the resulting rows with mutate_fix_var_filter.
573
+ - From the limited, filtered result rows randomly selects up to sample_max_n
574
+ candidate fixations with probabilities according to their counts.
575
+ - For each candidate fixation returns a child in which rand_var is replaced
576
+ with the candidate fixation.
577
+
578
+ The reasons for fixing rand_var based on a randomly sized subset of GTPs
579
+ are efficiency and shadowing problems with common long-tails. Due to the
580
+ later imposed limit (which is vital in real world use-cases),
581
+ a few remaining GTPs that share more than `limit` potential fixations (so
582
+ have a common long-tail) could otherwise hide solutions for other
583
+ remaining GTPs. This can be the case if these common fixations have low
584
+ fitness. By randomizing the subset size, we will eventually (and more
585
+ likely) select other combinations of remaining GTPs.
586
+
587
+ :param gtp_sample_max_n: Maximum GTPs subset size to base fixations on.
588
+ :param rand_var: If given uses this variable instead of a random one.
589
+ :param sample_max_n: Maximum number of children.
590
+ :param limit: SPARQL limit for the top-k result rows.
591
+ :return: A list of children in which the selected variable is substituted
592
+ with fixation candidates wrt. GTPs.
593
+ """
522
594
assert isinstance (child , GraphPattern )
523
595
assert isinstance (gtp_scores , GTPScores )
524
596
525
597
# The further we get, the fewer gtps are remaining. Sampling too many (all)
526
598
# of them might hurt as common substitutions (> limit ones) which are dead
527
599
# ends could cover less common ones that could actually help
528
- gtp_sample_n = min (gtp_sample_n , int (gtp_scores .remaining_gain ))
529
- gtp_sample_n = random .randint (1 , gtp_sample_n )
600
+ gtp_sample_max_n = min (gtp_sample_max_n , int (gtp_scores .remaining_gain ))
601
+ gtp_sample_max_n = random .randint (1 , gtp_sample_max_n )
530
602
531
- ground_truth_pairs = gtp_scores .remaining_gain_sample_gtps (n = gtp_sample_n )
603
+ ground_truth_pairs = gtp_scores .remaining_gain_sample_gtps (
604
+ max_n = gtp_sample_max_n )
532
605
rand_vars = child .vars_in_graph - {SOURCE_VAR , TARGET_VAR }
533
606
if len (rand_vars ) < 1 :
534
607
return [child ]
@@ -549,13 +622,13 @@ def mutate_fix_var(
549
622
return [child ]
550
623
# randomly pick n of the substitutions with a prob ~ to their counts
551
624
items , counts = zip (* substitution_counts .most_common ())
552
- substs = sample_from_list (items , counts , sample_n )
625
+ substs = sample_from_list (items , counts , sample_max_n )
553
626
logger .info (
554
627
'fixed variable %s in %sto:\n %s\n <%d out of:\n %s\n ' ,
555
628
rand_var .n3 (),
556
629
child ,
557
630
'\n ' .join ([subst .n3 () for subst in substs ]),
558
- sample_n ,
631
+ sample_max_n ,
559
632
'\n ' .join ([' %d: %s' % (c , v .n3 ())
560
633
for v , c in substitution_counts .most_common ()]),
561
634
)
0 commit comments