Skip to content

Commit aab0634

Browse files
authored
Merge pull request #2431 from NNPDF/make_replicas_colibri
Make replicas Colibri compatible
2 parents b91ba01 + ce74fbc commit aab0634

File tree

6 files changed

+164
-83
lines changed

6 files changed

+164
-83
lines changed

.github/workflows/fitbot.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ env:
1212
POSTFIT_NREP: 15 # requested minimum replicas for postfit
1313
# IMPORTANT
1414
# WHEN CHANGING THE REFERENCE SET, THE NEW REFERENCE MUST BE MANUALLY UPLOADED TO THE SERVER
15-
REFERENCE_SET: NNNBOT-1a81255f3-2026-01-27 # reference set for exact results
15+
REFERENCE_SET: NNBOT-351efa9ef-2026-03-11 # reference set for exact results
1616
STABLE_REFERENCE_SET: NNBOT-99108504e-2025-11-22 # reference set for last tag
1717
PYTHONHASHSEED: "0"
1818

validphys2/src/validphys/n3fit_data.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,34 @@ def replica_luxseed(replica, luxseed):
110110
return replica_nnseed(replica, luxseed)
111111

112112

113+
def group_replica_mcseed(replica_mcseed, groups_dataset_inputs_loaded_cd_with_cuts, genrep=True):
    """Generates the ``mcseed`` for a group of datasets. This is done by hashing the names
    of the datasets in the group and adding the hash to the ``replica_mcseed``.

    Parameters
    ----------
    replica_mcseed: int
        Per-replica Monte Carlo seed to which the dataset-name hash is added.
    groups_dataset_inputs_loaded_cd_with_cuts: list[:py:class:`nnpdf_data.coredata.CommonData`]
        List of CommonData objects which stores information about systematic errors,
        their treatment and description, for each dataset.
    genrep: bool
        Whether pseudodata replicas are to be generated; when False no seed is
        needed and ``None`` is returned.

    Returns
    -------
    int or None
        The salted seed, or ``None`` when ``genrep`` is False.
    """
    if not genrep:
        return None
    names_for_salt = []
    # Try to use the new dataset name, but make older runs reproducible by keeping the old names.
    # WARNING: don't rely on this behaviour, this might be removed in future releases
    for loaded_cd in groups_dataset_inputs_loaded_cd_with_cuts:
        if loaded_cd.legacy_names is None:
            names_for_salt.append(loaded_cd.setname)
        else:
            names_for_salt.append(loaded_cd.legacy_names[0])
    name_salt = "-".join(names_for_salt)

    # Reduce the hash to 8 digits so the salted seed stays small and reproducible
    name_seed = int(hashlib.sha256(name_salt.encode()).hexdigest(), 16) % 10**8
    return name_seed + replica_mcseed
139+
140+
113141
class _Masks(TupleComp):
114142
"""Class holding the training validation mask for a group of datasets
115143
If the same group of dataset receives the same trvlseed then the mask

validphys2/src/validphys/pseudodata.py

Lines changed: 112 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -121,16 +121,17 @@ def read_replica_pseudodata(fit, context_index, replica):
121121

122122

123123
def make_replica(
    central_values_array,
    group_replica_mcseed,
    dataset_inputs_sampling_covmat,
    group_multiplicative_errors=None,
    group_positivity_mask=None,
    sep_mult=False,
    genrep=True,
    max_tries=int(1e6),
):
    """Function that takes in a central value array and a covariance matrix
    and returns a pseudodata replica accounting for
    possible correlations between systematic uncertainties.

    The function loops until pseudodata satisfying the positivity mask is
    generated, at most ``max_tries`` times.

    Parameters
    ----------
    central_values_array: np.array
        Numpy array which is N_dat (where N_dat is the combined number of data points after cuts)
        containing the central values of the data.
    group_replica_mcseed: int
        Seed used to initialise the numpy random number generator.
    dataset_inputs_sampling_covmat: np.array
        Full covmat to be used. It can be either only experimental or also theoretical.
    group_multiplicative_errors: dict, optional
        Dictionary containing the multiplicative uncertainties contribution to the
        pseudodata replica, with keys ``"nonspecial_mult"`` and ``"special_mult"``.
        Required when ``sep_mult`` is True.
    group_positivity_mask: np.array, optional
        Boolean array of shape (N_dat,) indicating which data points should be positive.
        If ``None``, no positivity requirement is enforced.
    sep_mult: bool
        Specifies whether computing the shifts with the full covmat
        or whether multiplicative errors should be separated.
    genrep: bool
        If False the central values are returned unchanged (no fluctuation).
    max_tries: int
        If after max_tries (default=1e6) no physical configuration is found,
        it will raise a :py:class:`ReplicaGenerationError`.

    Returns
    -------
    pseudodata: np.array
        Replica of the central values fluctuated according to the covariance matrix
        (and, when ``sep_mult`` is True, the multiplicative errors).
    """
    if not genrep:
        return central_values_array

    # Set random seed
    rng = np.random.default_rng(seed=group_replica_mcseed)
    # construct covmat
    covmat = dataset_inputs_sampling_covmat
    covmat_sqrt = sqrt_covmat(covmat)

    # Points outside the mask are never rejected by the positivity check below
    full_mask = (
        group_positivity_mask
        if group_positivity_mask is not None
        else np.zeros_like(central_values_array, dtype=bool)
    )
    # Loop until a pseudodata replica passing the positivity check is produced
    for _ in range(max_tries):
        mult_shifts = []
        # Prepare the per-dataset multiplicative shifts
        if group_multiplicative_errors is not None:
            for mult_uncorr_errors, mult_corr_errors in group_multiplicative_errors[
                "nonspecial_mult"
            ]:
                # convert from percent to fraction
                mult_shift = (
                    1 + mult_uncorr_errors * rng.normal(size=mult_uncorr_errors.shape) / 100
                ).prod(axis=1)

                mult_shift *= (
                    1 + mult_corr_errors * rng.normal(size=(1, mult_corr_errors.shape[1])) / 100
                ).prod(axis=1)

                mult_shifts.append(mult_shift)

        # If sep_mult is true then the multiplicative shifts were not included in the covmat
        shifts = covmat_sqrt @ rng.normal(size=covmat.shape[1])
        mult_part = 1.0
        if sep_mult:
            # NOTE(review): sep_mult=True requires group_multiplicative_errors to be provided
            special_mult_errors = group_multiplicative_errors["special_mult"]
            special_mult = (
                1 + special_mult_errors * rng.normal(size=(1, special_mult_errors.shape[1])) / 100
            ).prod(axis=1)
            mult_part = np.concatenate(mult_shifts, axis=0) * special_mult
        # Shifting pseudodata
        shifted_pseudodata = (central_values_array + shifts) * mult_part
        # positivity control
        if np.all(shifted_pseudodata[full_mask] >= 0):
            return shifted_pseudodata

    # Find which dataset index corresponds to the negative points, and print it out
    # for debugging purposes. The comparison must be parenthesised: "&" binds tighter
    # than "<", so without parentheses this evaluated "shifted_pseudodata < (0 & full_mask)"
    # and ignored the mask entirely.
    negative_mask = (shifted_pseudodata < 0) & full_mask
    negative_indices = np.where(negative_mask)[0]

    raise ReplicaGenerationError(
        f"No valid replica found after {max_tries} attempts. "
        f"Negative global indices: {negative_indices.tolist()}"
    )
249+
250+
251+
def central_values_array(groups_dataset_inputs_loaded_cd_with_cuts):
    """Concatenate the central values of a list of
    :py:class:`nnpdf_data.coredata.CommonData` objects into a single array.
    """
    return np.concatenate(
        [cd.central_values.to_numpy() for cd in groups_dataset_inputs_loaded_cd_with_cuts],
        axis=0,
    )
259+
260+
261+
def group_multiplicative_errors(groups_dataset_inputs_loaded_cd_with_cuts, sep_mult=False):
    """Function that takes in a list of :py:class:`nnpdf_data.coredata.CommonData`
    and returns the multiplicative uncertainties contribution to the pseudodata replica.

    Parameters
    ----------
    groups_dataset_inputs_loaded_cd_with_cuts: list[:py:class:`nnpdf_data.coredata.CommonData`]
        List of CommonData objects holding the systematic errors of each dataset.
    sep_mult: bool
        Whether multiplicative errors are treated separately from the covmat.

    Returns
    -------
    dict or None
        ``None`` when ``sep_mult`` is False; otherwise a dictionary with keys
        ``"nonspecial_mult"`` — a list of ``(uncorrelated, correlated)`` error-array
        pairs, one per dataset — and ``"special_mult"`` — the errors whose names are
        not in ``INTRA_DATASET_SYS_NAME``, concatenated over all datasets.
    """
    if not sep_mult:
        return None

    # After the early return above sep_mult is always True, so the original
    # per-iteration "if sep_mult" re-checks were dead code and are dropped.
    nonspecial_mult = []
    special_mult = []
    for cd in groups_dataset_inputs_loaded_cd_with_cuts:
        mult_errors = cd.multiplicative_errors
        mult_uncorr_errors = mult_errors.loc[:, mult_errors.columns == "UNCORR"].to_numpy()
        mult_corr_errors = mult_errors.loc[:, mult_errors.columns == "CORR"].to_numpy()
        nonspecial_mult.append((mult_uncorr_errors, mult_corr_errors))
        special_mult.append(mult_errors.loc[:, ~mult_errors.columns.isin(INTRA_DATASET_SYS_NAME)])

    # concatenating special multiplicative errors
    special_mult_errors = pd.concat(special_mult, axis=0, sort=True).fillna(0).to_numpy()

    return {"nonspecial_mult": nonspecial_mult, "special_mult": special_mult_errors}
291+
292+
293+
def group_positivity_mask(
    groups_dataset_inputs_loaded_cd_with_cuts, resample_negative_pseudodata=False
):
    """Build a boolean mask, one entry per data point, marking which points are
    required to be positive, given a list of
    :py:class:`nnpdf_data.coredata.CommonData` objects.

    Returns ``None`` when ``resample_negative_pseudodata`` is False. Datasets whose
    process contains "ASY" or ends in "_POL" may legitimately be negative, so their
    points are excluded from the positivity requirement.
    """
    if not resample_negative_pseudodata:
        return None

    per_dataset_masks = []
    for cd in groups_dataset_inputs_loaded_cd_with_cuts:
        values = cd.central_values.to_numpy()
        may_be_negative = "ASY" in cd.commondataproc or cd.commondataproc.endswith("_POL")
        per_dataset_masks.append(
            np.zeros_like(values, dtype=bool)
            if may_be_negative
            else np.ones_like(values, dtype=bool)
        )
    return np.concatenate(per_dataset_masks, axis=0)
274309

275310

276311
def indexed_make_replica(groups_index, make_replica):
@@ -397,8 +432,11 @@ def make_level1_data(data, level0_commondata_wc, filterseed, data_index, sep_mul
397432
)
398433

399434
# ================== generation of Level1 data ======================#
435+
central_vals= central_values_array(level0_commondata_wc)
436+
group_mult_errs = group_multiplicative_errors(level0_commondata_wc, sep_mult=sep_mult)
437+
group_pos_mask = group_positivity_mask(level0_commondata_wc)
400438
level1_data = make_replica(
401-
level0_commondata_wc, filterseed, covmat, sep_mult=sep_mult, genrep=True
439+
central_vals, filterseed, covmat, group_multiplicative_errors=group_mult_errs, group_positivity_mask=group_pos_mask, sep_mult=sep_mult, genrep=True,
402440
)
403441

404442
indexed_level1_data = indexed_make_replica(data_index, level1_data)

validphys2/src/validphys/tests/conftest.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -107,15 +107,16 @@ def data_internal_cuts_config(data_config):
107107

108108

109109
@pytest.fixture(scope='module')
def data_internal_cuts_closure_config(data_internal_cuts_config):
    # Filterseed is not added so that it is changed by the tests
    return {**data_internal_cuts_config, "fakepdf": PDF}
114115

115116

116117
@pytest.fixture(scope='module')
117-
def data_fromfit_cuts_config(data_internal_cuts_new_theory_config):
118-
config = dict(data_internal_cuts_new_theory_config)
118+
def data_fromfit_cuts_config(data_internal_cuts_config):
119+
config = dict(data_internal_cuts_config)
119120
config.update(use_cuts="fromfit")
120121
return config
121122

validphys2/src/validphys/tests/test_datafiles.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
"""
2-
Test all datafiles
2+
Test all datafiles
33
4-
The checks in ``test_all_datasets`` are run for each dataset independently so that one gets one
5-
failure per dataset in case of problems
4+
The checks in ``test_all_datasets`` are run for each dataset independently so that one gets one
5+
failure per dataset in case of problems
66
"""
77

88
import pytest
@@ -24,7 +24,7 @@ def _load_main_and_variants(dataset_name):
2424

2525

2626
@pytest.mark.parametrize("dataset_name", all_datasets)
27-
def test_all_datasets(dataset_name, data_internal_cuts_new_theory_config):
27+
def test_all_datasets(dataset_name):
2828
"""Checks that a dataset can be loaded (together with its variants),
2929
that the kinematics, uncertainties and data can be read.
3030

validphys2/src/validphys/tests/test_pseudodata.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
recreation.
99
"""
1010

11+
import numpy as np
1112
from numpy.testing import assert_allclose
1213
import pandas as pd
1314
import pytest
@@ -115,3 +116,16 @@ def test_level0_commondata_wc():
115116
assert_allclose(
116117
dataset_t0_predictions(t0dataset=datasetspec, t0set=t0set), l0_vals, rtol=1e-07, atol=0
117118
)
119+
120+
121+
def test_level1_commondata(data_internal_cuts_closure_config):
    """
    check whether level 1 commondata can be generated and that it is seed-dependent.
    """
    # Two different filterseeds must yield different fluctuated central values
    replica_a = API.make_level1_data(**data_internal_cuts_closure_config, filterseed=2)
    replica_b = API.make_level1_data(**data_internal_cuts_closure_config, filterseed=42)

    values_a = pd.concat(cd.central_values for cd in replica_a)
    values_b = pd.concat(cd.central_values for cd in replica_b)

    assert not np.allclose(values_a, values_b)

0 commit comments

Comments
 (0)