remaining_gain_sample_gtps n renamed to more appropriate max_n

joernhees · joernhees · commit 8065067418f7 · 2017-02-06T11:17:59.000+01:00
diff --git a/gp_learner.py b/gp_learner.py
@@ -522,9 +522,9 @@ def mutate_fix_var(
         timeout,
         gtp_scores,
         child,
-        gtp_sample_n=config.MUTPB_FV_RGTP_SAMPLE_N,
+        gtp_sample_max_n=config.MUTPB_FV_RGTP_SAMPLE_N,
         rand_var=None,
-        sample_n=config.MUTPB_FV_SAMPLE_MAXN,
+        sample_max_n=config.MUTPB_FV_SAMPLE_MAXN,
         limit=config.MUTPB_FV_QUERY_LIMIT,
 ):
     """Chooses a random variable from the pattern(node or edge).
@@ -537,10 +537,11 @@ def mutate_fix_var(
     # The further we get, the less gtps are remaining. Sampling too many (all)
     # of them might hurt as common substitutions (> limit ones) which are dead
     # ends could cover less common ones that could actually help
-    gtp_sample_n = min(gtp_sample_n, int(gtp_scores.remaining_gain))
-    gtp_sample_n = random.randint(1, gtp_sample_n)
+    gtp_sample_max_n = min(gtp_sample_max_n, int(gtp_scores.remaining_gain))
+    gtp_sample_max_n = random.randint(1, gtp_sample_max_n)
 
-    ground_truth_pairs = gtp_scores.remaining_gain_sample_gtps(n=gtp_sample_n)
+    ground_truth_pairs = gtp_scores.remaining_gain_sample_gtps(
+        max_n=gtp_sample_max_n)
     rand_vars = child.vars_in_graph - {SOURCE_VAR, TARGET_VAR}
     if len(rand_vars) < 1:
         return [child]
@@ -561,13 +562,13 @@ def mutate_fix_var(
         return [child]
     # randomly pick n of the substitutions with a prob ~ to their counts
     items, counts = zip(*substitution_counts.most_common())
-    substs = sample_from_list(items, counts, sample_n)
+    substs = sample_from_list(items, counts, sample_max_n)
     logger.info(
         'fixed variable %s in %sto:\n %s\n<%d out of:\n%s\n',
         rand_var.n3(),
         child,
         '\n '.join([subst.n3() for subst in substs]),
-        sample_n,
+        sample_max_n,
         '\n'.join([' %d: %s' % (c, v.n3())
                    for v, c in substitution_counts.most_common()]),
     )
diff --git a/gtp_scores.py b/gtp_scores.py
@@ -52,19 +52,19 @@ def update_with_gps(self, gps):
                     self.gtp_max_precisions[gtp] = precision
         return precision_gain
 
-    def remaining_gain_sample_gtps(self, n=None):
+    def remaining_gain_sample_gtps(self, max_n=None):
         """Sample ground truth pairs according to remaining gains.
 
-        This method draws up to n ground truth pairs using their remaining gains
-        as sample probabilities. If less than n probabilities are > 0 it draws
-        less gtps.
+        This method draws up to max_n ground truth pairs using their remaining
+        gains as sample probabilities. GTPs with remaining gain of 0 are never
+        returned, so if fewer than max_n gains are > 0 it draws fewer gtps.
 
-        :param n: Up to n items to sample.
+        :param max_n: Maximum number of items to sample.
         :return: list of ground truth pairs sampled according to their remaining
             gains in gtp_scores with max length of n.
         """
         gtps, gains = zip(*self.get_remaining_gains().items())
-        return sample_from_list(gtps, gains, n)
+        return sample_from_list(gtps, gains, max_n)
 
     def __sub__(self, other):
         if not isinstance(other, GTPScores):
diff --git a/tests/test_gp_learner_offline.py b/tests/test_gp_learner_offline.py
@@ -169,14 +169,13 @@ def test_simplify_pattern():
     assert res == gp, 'not simplified:\n%s' % res.to_sparql_select_query()
 
     # counter example of an advanced but restricting pattern:
-    gp = gp + [
+    gp += [
         (SOURCE_VAR, Variable('v3'), Variable('v4')),
         (Variable('v5'), Variable('v6'), Variable('v4')),
         (Variable('v4'), Variable('v7'), Variable('v8')),
         (TARGET_VAR, Variable('v3'), SOURCE_VAR),
         (dbp['City'], Variable('v6'), dbp['Country']),
         (dbp['Country'], Variable('v8'), dbp['City']),
-
     ]
     res = mutate_simplify_pattern(gp)
     assert res == gp, 'was simplified (bad):\n%s' % res.to_sparql_select_query()
@@ -221,22 +220,22 @@ def test_simplify_pattern():
 
 def test_remaining_gain_sample_gtps():
     n = len(ground_truth_pairs)
-    gtps = sorted(gtp_scores.remaining_gain_sample_gtps(n=n))
+    gtps = sorted(gtp_scores.remaining_gain_sample_gtps(max_n=n))
     assert len(gtps) == n
     # if we draw everything the results should always be everything
-    assert gtps == sorted(gtp_scores.remaining_gain_sample_gtps(n=n))
+    assert gtps == sorted(gtp_scores.remaining_gain_sample_gtps(max_n=n))
     # if we don't draw everything it's quite unlikely we get the same result
-    gtps = gtp_scores.remaining_gain_sample_gtps(n=5)
+    gtps = gtp_scores.remaining_gain_sample_gtps(max_n=5)
     assert len(gtps) == 5
-    assert gtps != gtp_scores.remaining_gain_sample_gtps(n=5)
+    assert gtps != gtp_scores.remaining_gain_sample_gtps(max_n=5)
 
     # make sure we never get items that are fully covered already
     gtp_scores.gtp_max_precisions[ground_truth_pairs[0]] = 1
     c = Counter()
     k = 100
     n = 128
     for i in range(k):
-        c.update(gtp_scores.remaining_gain_sample_gtps(n=n))
+        c.update(gtp_scores.remaining_gain_sample_gtps(max_n=n))
     assert ground_truth_pairs[0] not in c
     assert sum(c.values()) == k * n
     # count how many aren't in gtps
@@ -260,7 +259,7 @@ def test_remaining_gain_sample_gtps():
     assert gtpe_scores.remaining_gain == 1
     c = Counter()
     for i in range(100):
-        c.update(gtpe_scores.remaining_gain_sample_gtps(n=1))
+        c.update(gtpe_scores.remaining_gain_sample_gtps(max_n=1))
     assert len(c) == 2
     assert sum(c.values()) == 100
     assert (binom.pmf(c[high_prob], 100, .9) > 0.001 and
diff --git a/utils.py b/utils.py
@@ -247,8 +247,9 @@ def sample_from_list(l, probs, max_n=None):
     """Sample list according to probs.
 
     This method draws up to max_n items from l using the given list of probs as
-    sample probabilities. max_n defaults to len(l) if not specified. If less
-    than max_n probabilities are > 0 only those items are returned.
+    sample probabilities. max_n defaults to len(l) if not specified. Items with
+    probability 0 are never sampled, so if less than max_n probabilities are > 0
+    only those items are returned.
 
     :param l: list from which to draw items.
     :param probs: List of probabilities to draw items. Normalized by sum(probs).