changed default to sample with replacement and adapt tutorial

ValentinGebhart · ValentinGebhart · commit d0c18190f16c · 2025-07-22T17:11:38.000+02:00
diff --git a/climada/util/yearsets.py b/climada/util/yearsets.py
@@ -25,7 +25,7 @@
 
 
 def impact_yearset(
-    imp, sampled_years, lam=None, correction_fac=True, with_replacement=False, seed=None
+    imp, sampled_years, lam=None, correction_fac=False, with_replacement=True, seed=None
 ):
     """Create a yearset of impacts (yimp) containing a probabilistic impact for each year
     in the sampled_years list by sampling events from the impact received as input with a
@@ -41,9 +41,9 @@ def impact_yearset(
         sampled_years : list
             A list of years that shall be covered by the resulting yimp.
         with_replacement : bool, optional
-            If False and all frequencies of freqs_orig are constant, events are sampled
-            without replacement. Otherwise, events are sampled with replacement.
-            Defaults to False.
+            If True, impact events are sampled with replacement. If False, events are sampled
+            without replacement. Sampling without replacement can yield distorted samples if
+            frequencies of different events are unequal. Defaults to True.
         seed : Any, optional
             seed for the default bit generator
             default: None
@@ -87,7 +87,7 @@ def impact_yearset(
 
 
 def impact_yearset_from_sampling_vect(
-    imp, sampled_years, sampling_vect, correction_fac=True
+    imp, sampled_years, sampling_vect, correction_fac=False
 ):
     """Create a yearset of impacts (yimp) containing a probabilistic impact for each year
     in the sampled_years list by sampling events from the impact received as input following
@@ -171,7 +171,7 @@ def sample_from_poisson(n_sampled_years, lam, seed=None):
     return np.round(np.random.poisson(lam=lam, size=n_sampled_years)).astype("int")
 
 
-def sample_events(events_per_year, freqs_orig, with_replacement=False, seed=None):
+def sample_events(events_per_year, freqs_orig, with_replacement=True, seed=None):
     """Sample events uniformely from an array (indices_orig) without replacement
     (if sum(events_per_year) > n_input_events the input events are repeated
     (tot_n_events/n_input_events) times, by ensuring that the same events doens't
@@ -184,9 +184,9 @@ def sample_events(events_per_year, freqs_orig, with_replacement=False, seed=None
         freqs_orig : np.ndarray
             Frequency of each input event
         with_replacement : bool, optional
-            If False and all frequencies of freqs_orig are constant, events are sampled
-            without replacement. Otherwise, events are sampled with replacement.
-            Defaults to False.
+            If True, impact events are sampled with replacement. If False, events are sampled
+            without replacement. Sampling without replacement can yield distorted samples if
+            frequencies of different events are unequal. Defaults to True.
         seed : Any, optional
             seed for the default bit generator.
             Default: None
@@ -201,13 +201,20 @@ def sample_events(events_per_year, freqs_orig, with_replacement=False, seed=None
 
     sampling_vect = []
     indices_orig = np.arange(len(freqs_orig))
+    rng = default_rng(seed)
 
-    # this is the previous way of sampling
-    # (without replacement, works well if frequencies are constant)
-    if np.unique(freqs_orig).size == 1 and not with_replacement:
+    # sample without replacement, works well if event frequencies are equal
+    if with_replacement is False:
+        # warn if frequencies of different events are not equal
+        if np.unique(freqs_orig).size != 1:
+            LOGGER.warning(
+                "The frequencies of the different events are not equal. This can lead to "
+                "distorted sampling if the frequencies vary significantly. To avoid this, "
+                "please set with_replacement=True to sample with replacement instead."
+            )
 
         indices = indices_orig
-        rng = default_rng(seed)
+        freqs = freqs_orig
 
         # sample events for each sampled year
         for amount_events in events_per_year:
@@ -222,31 +229,27 @@ def sample_events(events_per_year, freqs_orig, with_replacement=False, seed=None
             # if not enough events remaining, use original events
             if len(indices) < amount_events or len(indices) == 0:
                 indices = indices_orig
+                freqs = freqs_orig
 
             # sample events
+            probab_dis = freqs / sum(freqs)
             selected_events = rng.choice(
-                indices, size=amount_events, replace=False
+                indices, size=amount_events, replace=False, p=probab_dis
             ).astype("int")
 
             # determine used events to remove them from sampling pool
             idx_to_remove = [
                 np.where(indices == event)[0][0] for event in selected_events
             ]
             indices = np.delete(indices, idx_to_remove)
+            freqs = np.delete(freqs, idx_to_remove)
 
             # save sampled events in sampling vector
             sampling_vect.append(selected_events)
 
     else:
         # easier method if we allow for replacement sample with replacement
-        if with_replacement is False:
-            LOGGER.warning(
-                "Sampling without replacement not implemented for events with varying "
-                "frequencies. Events are sampled with replacement."
-            )
-
         probab_dis = freqs_orig / sum(freqs_orig)
-        rng = default_rng(seed)
 
         # sample events for each sampled year
         selected_events = rng.choice(
diff --git a/doc/user-guide/climada_util_yearsets.ipynb b/doc/user-guide/climada_util_yearsets.ipynb
@@ -11,9 +11,9 @@
     "The function `impact_yearset` performs all these computational steps, taking an `imp` and the list of sampled_years (`sampled_years`) as input. The output of the function is the `yimp` object and the `sampling_vect`.\n",
     "Moreover, a `sampling_vect` (generated in a previous run) can be provided as optional input and the user can custom-define the Poisson parameter `lam`. Reapplying the same sampling_vect does not only allow to reproduce the generated `yimp`, but also for a physically consistent way of sampling impacts caused by different hazards. \n",
     "\n",
-    "*Sampling options.* Per default, impact events are sampled without replacement (if the original impact object contains enough events), such that the different yearly impacts stem from different events. This is only implemented if the event frequencies of the original impact object (`imp.frequency`) are constant. If the events differ in their frequency, they are sampled with replacement. This sampling behaviour can also be achieved by setting `with_replacement=True`. \n",
+    "*Sampling options.* Per default, impact events are sampled with replacement. When setting `with_replacement=False`, the impact events are sampled without replacement (given that the original impact object contains enough events). Note that sampling without replacement can lead to distorted sampling if the frequencies of the different impacts (`imp.frequency`) are not equal.\n",
     "\n",
-    "*Correction factor.* Per default, a correction factor is applied uniformly to all yearly impacts, such that the final `yimp` object has the same average annual impact than `imp`, the original impact object. When setting `correction_fac=False`, the correction factor is not applied.\n",
+    "*Correction factor.* By setting `correction_fac=True`, a correction factor is applied uniformly to all yearly impacts, such that the final `yimp` object has the same average annual impact as `imp`, the original impact object. Per default (`correction_fac=False`), no correction factor is applied.\n",
     "\n"
    ]
   },
@@ -64,7 +64,7 @@
     {
      "data": {
       "text/plain": [
-       "array([1, 3, 0, 0, 2, 0, 1, 2, 1, 3])"
+       "array([1, 2, 1, 0, 1, 0, 0, 2, 1, 2])"
       ]
      },
      "execution_count": 2,
@@ -91,15 +91,15 @@
      "data": {
       "text/plain": [
        "[array([2]),\n",
-       " array([0, 4, 3]),\n",
+       " array([1, 0]),\n",
+       " array([2]),\n",
        " array([], dtype=int64),\n",
+       " array([2]),\n",
        " array([], dtype=int64),\n",
-       " array([5, 1]),\n",
        " array([], dtype=int64),\n",
-       " array([3]),\n",
-       " array([5, 1]),\n",
-       " array([0]),\n",
-       " array([2, 1, 5])]"
+       " array([1, 4]),\n",
+       " array([2]),\n",
+       " array([4, 4])]"
       ]
      },
      "execution_count": 3,
@@ -121,7 +121,7 @@
     {
      "data": {
       "text/plain": [
-       "array([ 4, 66,  0,  0, 64,  0,  6, 64,  0, 68])"
+       "array([  4,   2,   4,   0,   4,   0,   0,  62,   4, 120])"
       ]
      },
      "execution_count": 4,
@@ -143,7 +143,7 @@
     {
      "data": {
       "text/plain": [
-       "0.9852941176470589"
+       "1.34"
       ]
      },
      "execution_count": 5,
@@ -180,9 +180,9 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "yimp.at_event =  [ 60   0   0   8 128   0   0 130 122   8]\n",
-      "imp_per_year =  [ 60   0   0   8 128   0   0 130 122   8]\n",
-      "The expected annual impact 45.6 differs from the one of the original impact (26.8).\n"
+      "yimp.at_event =  [  2   0   0 122  14   0   0 246   4 124]\n",
+      "imp_per_year =  [  2   0   0 122  14   0   0 246   4 124]\n",
+      "The expected annual impact 51.2 differs from the one of the original impact (26.8).\n"
      ]
     }
    ],
@@ -228,9 +228,9 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "2025-07-17 14:56:28,995 - climada.util.yearsets - INFO - The correction factor is 0.5877192982456141.\n",
-      "yimp.at_event =  [35.26  0.    0.    4.7  75.23  0.    0.   76.4  71.7   4.7 ]\n",
-      "imp_per_year =  [35.26  0.    0.    4.7  75.23  0.    0.   76.4  71.7   4.7 ]\n",
+      "2025-07-22 17:10:20,134 - climada.util.yearsets - INFO - The correction factor is 0.5234375.\n",
+      "yimp.at_event =  [  1.05   0.     0.    63.86   7.33   0.     0.   128.77   2.09  64.91]\n",
+      "imp_per_year =  [  1.05   0.     0.    63.86   7.33   0.     0.   128.77   2.09  64.91]\n",
       "The expected annual impact 26.8 is equal to the one of the original impact (26.8).\n"
      ]
     }