1
1
from collections import defaultdict
2
2
from copy import deepcopy
3
3
from math import hypot
4
+ from numbers import Number
5
+ from typing import Dict , List , Sequence , Tuple , Union
4
6
5
7
import numpy as np
6
8
import scipy .stats
10
12
from adaptive .learner .learner1D import Learner1D , _get_intervals
11
13
from adaptive .notebook_integration import ensure_holoviews
12
14
15
+ Point = Tuple [int , Number ]
16
+ Points = List [Point ]
17
+ Value = Union [Number , Sequence [Number ]]
18
+
13
19
14
20
class AverageLearner1D (Learner1D ):
15
21
"""Learns and predicts a noisy function 'f:ℝ → ℝ^N'.
@@ -77,7 +83,7 @@ def __init__(
77
83
self .neighbor_sampling = neighbor_sampling
78
84
79
85
# Contains all samples f(x) for each
80
- # point x in the form {x0:[ f_0(x0), f_1(x0), ...] , ...}
86
+ # point x in the form {x0: {0: f_0(x0), 1: f_1(x0), ...} , ...}
81
87
self ._data_samples = SortedDict ()
82
88
# Contains the number of samples taken
83
89
# at each point x in the form {x0: n0, x1: n1, ...}
@@ -95,17 +101,17 @@ def __init__(
95
101
self .rescaled_error = decreasing_dict ()
96
102
97
103
@property
def nsamples(self) -> int:
    """Total number of samples taken, summed over all points."""
    total = 0
    for count in self._number_samples.values():
        total += count
    return total
101
107
102
108
@property
def min_samples_per_point(self) -> int:
    """Smallest number of samples taken at any single point (0 when no samples exist)."""
    return min(self._number_samples.values(), default=0)
107
113
108
- def ask (self , n , tell_pending = True ):
114
+ def ask (self , n : int , tell_pending : bool = True ) -> Tuple [ Points , List [ float ]] :
109
115
"""Return 'n' points that are expected to maximally reduce the loss."""
110
116
# If some point is undersampled, resample it
111
117
if len (self ._undersampled_points ):
@@ -133,32 +139,34 @@ def ask(self, n, tell_pending=True):
133
139
134
140
return points , loss_improvements
135
141
136
def _ask_for_more_samples(self, x: Number, n: int) -> Tuple[Points, List[float]]:
    """Return ``n`` (seed, x) points that all resample the existing point ``x``.

    Since in general n << min_samples, the same point is returned ``n`` times
    so it accumulates the many resamples it will need.  Seeds continue from
    the number of samples already taken at ``x``.
    """
    offset = self._number_samples.get(x, 0)
    points = []
    for i in range(n):
        points.append((offset + i, x))
    # Resamples are assigned a loss improvement of 0.
    loss_improvements = n * [0]
    return points, loss_improvements
143
151
144
def _ask_for_new_point(self, n: int) -> Tuple[Points, List[float]]:
    """Return ``n`` (seed, x) points that all sample a single *new* x.

    Since in general n << min_samples, one new point is returned ``n`` times:
    it will need to be resampled many more times anyway.
    """
    raw_points, loss_improvements = self._ask_points_without_adding(1)
    (new_x,) = raw_points
    points = [(seed, new_x) for seed in range(n)]
    # Only the first sample carries a loss improvement; the resamples get 0.
    loss_improvements = loss_improvements + [0] * (n - 1)
    return points, loss_improvements
152
160
153
def tell_pending(self, seed_x: Point) -> None:
    """Register the (seed, x) pair as pending.

    For an x that has no data yet, also update the combined neighbor
    structure and the (non-real) losses.
    """
    _, x = seed_x
    self.pending_points.add(seed_x)
    if x in self.data:
        return
    self._update_neighbors(x, self.neighbors_combined)
    self._update_losses(x, real=False)
160
167
161
- def tell (self , x , y ):
168
+ def tell (self , seed_x : Point , y : Value ) -> None :
169
+ seed , x = seed_x
162
170
if y is None :
163
171
raise TypeError (
164
172
"Y-value may not be None, use learner.tell_pending(x)"
@@ -170,13 +178,13 @@ def tell(self, x, y):
170
178
171
179
if x not in self .data :
172
180
self ._update_data (x , y , "new" )
173
- self ._update_data_structures (x , y , "new" )
174
- else :
181
+ self ._update_data_structures (seed_x , y , "new" )
182
+ elif seed not in self . _data_samples [ x ]: # check if the seed is new
175
183
self ._update_data (x , y , "resampled" )
176
- self ._update_data_structures (x , y , "resampled" )
177
- self .pending_points .discard (x )
184
+ self ._update_data_structures (seed_x , y , "resampled" )
185
+ self .pending_points .discard (seed_x )
178
186
179
- def _update_rescaled_error_in_mean (self , x , point_type : str ) -> None :
187
+ def _update_rescaled_error_in_mean (self , x : Number , point_type : str ) -> None :
180
188
"""Updates ``self.rescaled_error``.
181
189
182
190
Parameters
@@ -213,17 +221,18 @@ def _update_rescaled_error_in_mean(self, x, point_type: str) -> None:
213
221
norm = min (d_left , d_right )
214
222
self .rescaled_error [x ] = self .error [x ] / norm
215
223
216
def _update_data(self, x: Number, y: Value, point_type: str) -> None:
    """Store ``y`` for a new ``x`` or fold it into the running mean.

    ``point_type`` is either ``"new"`` (first sample at x) or
    ``"resampled"`` (additional sample at an existing x).
    """
    if point_type == "new":
        self.data[x] = y
    elif point_type == "resampled":
        count = len(self._data_samples[x])
        # Incremental mean: reweight the old mean and add the new sample.
        self.data[x] = self.data[x] * count / (count + 1) + y / (count + 1)
223
231
224
- def _update_data_structures (self , x , y , point_type : str ):
232
+ def _update_data_structures (self , seed_x : Point , y : Value , point_type : str ) -> None :
233
+ seed , x = seed_x
225
234
if point_type == "new" :
226
- self ._data_samples [x ] = [ y ]
235
+ self ._data_samples [x ] = { seed : y }
227
236
228
237
if not self .bounds [0 ] <= x <= self .bounds [1 ]:
229
238
return
@@ -247,7 +256,7 @@ def _update_data_structures(self, x, y, point_type: str):
247
256
self ._update_rescaled_error_in_mean (x , "new" )
248
257
249
258
elif point_type == "resampled" :
250
- self ._data_samples [x ]. append ( y )
259
+ self ._data_samples [x ][ seed ] = y
251
260
ns = self ._number_samples
252
261
ns [x ] += 1
253
262
n = ns [x ]
@@ -268,7 +277,7 @@ def _update_data_structures(self, x, y, point_type: str):
268
277
# the std of the mean multiplied by a t-Student factor to ensure that
269
278
# the mean value lies within the correct interval of confidence
270
279
y_avg = self .data [x ]
271
- ys = self ._data_samples [x ]
280
+ ys = self ._data_samples [x ]. values ()
272
281
self .error [x ] = self ._calc_error_in_mean (ys , y_avg , n )
273
282
self ._update_distances (x )
274
283
self ._update_rescaled_error_in_mean (x , "resampled" )
@@ -288,15 +297,15 @@ def _update_data_structures(self, x, y, point_type: str):
288
297
self ._update_interpolated_loss_in_interval (* interval )
289
298
self ._oldscale = deepcopy (self ._scale )
290
299
291
def _update_distances(self, x: Number) -> None:
    """Refresh the Euclidean distances between ``x`` and its real neighbors."""
    left, right = self.neighbors[x]
    y = self.data[x]
    if left is not None:
        self._distances[left] = hypot(x - left, y - self.data[left])
    if right is not None:
        self._distances[x] = hypot(right - x, self.data[right] - y)
298
307
299
- def _update_losses_resampling (self , x , real = True ):
308
+ def _update_losses_resampling (self , x : Number , real = True ) -> None :
300
309
"""Update all losses that depend on x, whenever the new point is a re-sampled point."""
301
310
# (x_left, x_right) are the "real" neighbors of 'x'.
302
311
x_left , x_right = self ._find_neighbors (x , self .neighbors )
@@ -325,42 +334,43 @@ def _update_losses_resampling(self, x, real=True):
325
334
if (b is not None ) and right_loss_is_unknown :
326
335
self .losses_combined [x , b ] = float ("inf" )
327
336
328
def _calc_error_in_mean(self, ys: Sequence[Value], y_avg: Value, n: int) -> float:
    """Return the error of the mean of ``ys`` (std of the mean scaled by a
    Student-t factor at confidence level ``1 - self.alpha``)."""
    total_sq_dev = 0
    for y in ys:
        total_sq_dev += (y - y_avg) ** 2
    variance_in_mean = total_sq_dev / (n - 1)
    # Student-t factor corrects the small-sample variance estimate.
    t_student = scipy.stats.t.ppf(1 - self.alpha, df=n - 1)
    return t_student * (variance_in_mean / n) ** 0.5
332
341
333
def tell_many(self, xs: Points, ys: Sequence[Value]) -> None:
    """Tell the learner about many samples at once.

    Parameters
    ----------
    xs : list of (seed, x) tuples
        Sampled locations, each identified by a seed and an x-value.
    ys : sequence of values
        The corresponding samples, one per (seed, x) pair.

    Raises
    ------
    ValueError
        If any x lies outside ``self.bounds``.
    """
    # Check that all x are within the bounds.
    # (all() short-circuits; np.prod over a bool list built the whole list
    # and returned a float.)
    if not all(self.bounds[0] <= x <= self.bounds[1] for _, x in xs):
        raise ValueError(
            "x value out of bounds, "
            "remove x or enlarge the bounds of the learner"
        )

    # Group the samples per point: {x: {seed: y, ...}, ...}.
    mapping: Dict[Number, Dict[int, Value]] = defaultdict(dict)
    for (seed, x), y in zip(xs, ys):
        mapping[x][seed] = y

    for x, seed_y_mapping in mapping.items():
        if len(seed_y_mapping) == 1:
            seed, y = next(iter(seed_y_mapping.items()))
            self.tell((seed, x), y)
        elif len(seed_y_mapping) > 1:
            # If we stored more than 1 y-value for this x, use a more
            # efficient routine to tell many samples simultaneously,
            # before we move on to a new x.
            self.tell_many_at_point(x, seed_y_mapping)
354
364
355
- def tell_many_at_point (self , x , ys ) :
365
+ def tell_many_at_point (self , x : float , seed_y_mapping : Dict [ int , Value ]) -> None :
356
366
"""Tell the learner about many samples at a certain location x.
357
367
358
368
Parameters
359
369
----------
360
370
x : float
361
371
Value from the function domain.
362
- ys : List[float ]
363
- List of data samples at ``x``.
372
+ seed_y_mapping : Dict[int, Value ]
373
+ Dictionary of ``seed`` -> ``y`` at ``x``.
364
374
"""
365
375
# Check x is within the bounds
366
376
if not np .prod (x >= self .bounds [0 ] and x <= self .bounds [1 ]):
@@ -369,16 +379,20 @@ def tell_many_at_point(self, x, ys):
369
379
"remove x or enlarge the bounds of the learner"
370
380
)
371
381
372
- ys = list (ys ) # cast to list *and* make a copy
373
382
# If x is a new point:
374
383
if x not in self .data :
375
- y = ys .pop (0 )
384
+ # we make a copy because we don't want to modify the original dict
385
+ seed_y_mapping = seed_y_mapping .copy ()
386
+ seed = next (iter (seed_y_mapping ))
387
+ y = seed_y_mapping .pop (seed )
376
388
self ._update_data (x , y , "new" )
377
- self ._update_data_structures (x , y , "new" )
389
+ self ._update_data_structures ((seed , x ), y , "new" )
390
+
391
+ ys = list (seed_y_mapping .values ()) # cast to list *and* make a copy
378
392
379
393
# If x is not a new point or if there were more than 1 sample in ys:
380
394
if len (ys ) > 0 :
381
- self ._data_samples [x ].extend ( ys )
395
+ self ._data_samples [x ].update ( seed_y_mapping )
382
396
n = len (ys ) + self ._number_samples [x ]
383
397
self .data [x ] = (
384
398
np .mean (ys ) * len (ys ) + self .data [x ] * self ._number_samples [x ]
@@ -390,24 +404,24 @@ def tell_many_at_point(self, x, ys):
390
404
if n > self .min_samples :
391
405
self ._undersampled_points .discard (x )
392
406
self .error [x ] = self ._calc_error_in_mean (
393
- self ._data_samples [x ], self .data [x ], n
407
+ self ._data_samples [x ]. values () , self .data [x ], n
394
408
)
395
409
self ._update_distances (x )
396
410
self ._update_rescaled_error_in_mean (x , "resampled" )
397
411
if self .error [x ] <= self .min_error or n >= self .max_samples :
398
412
self .rescaled_error .pop (x , None )
399
- self ._update_scale (x , min (self ._data_samples [x ]))
400
- self ._update_scale (x , max (self ._data_samples [x ]))
413
+ self ._update_scale (x , min (self ._data_samples [x ]. values () ))
414
+ self ._update_scale (x , max (self ._data_samples [x ]. values () ))
401
415
self ._update_losses_resampling (x , real = True )
402
416
if self ._scale [1 ] > self ._recompute_losses_factor * self ._oldscale [1 ]:
403
417
for interval in reversed (self .losses ):
404
418
self ._update_interpolated_loss_in_interval (* interval )
405
419
self ._oldscale = deepcopy (self ._scale )
406
420
407
def _get_data(self) -> SortedDict:
    """Return the raw samples, ``{x: {seed: y, ...}, ...}``."""
    return self._data_samples
409
423
410
def _set_data(self, data: SortedDict) -> None:
    """Load previously saved samples (``{x: {seed: y, ...}, ...}``) back in."""
    if not data:
        return
    for x, samples in data.items():
        self.tell_many_at_point(x, samples)
0 commit comments