Skip to content
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions validphys2/src/validphys/n3fit_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,34 @@ def replica_luxseed(replica, luxseed):
return replica_nnseed(replica, luxseed)


def group_replica_mcseed(replica_mcseed, groups_dataset_inputs_loaded_cd_with_cuts, genrep=True):
    """Generate the ``mcseed`` for a group of datasets.

    This is done by hashing the names of the datasets in the group and adding
    the resulting salt to ``replica_mcseed``, so that groups made of different
    datasets draw independent pseudodata even for the same replica seed.

    Parameters
    ----------
    replica_mcseed: int or None
        Base Monte Carlo seed for this replica. If ``None``, ``None`` is
        returned so that the numpy RNG is later seeded with its default
        (random) behaviour.
    groups_dataset_inputs_loaded_cd_with_cuts: list[:py:class:`nnpdf_data.coredata.CommonData`]
        List of CommonData objects which stores information about systematic errors,
        their treatment and description, for each dataset.
    genrep: bool
        Whether pseudodata replicas are being generated at all; when ``False``
        no seed is needed and ``None`` is returned.

    Returns
    -------
    int or None
        The salted seed, or ``None`` when no seeding is required.
    """
    # No seed is needed when replicas are not generated, and a None base seed
    # is passed through so the RNG falls back to numpy's default seeding.
    if not genrep or replica_mcseed is None:
        return None
    names_for_salt = []
    # Try to use the new dataset name, but make older runs reproducible by keeping the old names.
    # WARNING: don't rely on this behaviour, this might be removed in future releases

    for loaded_cd in groups_dataset_inputs_loaded_cd_with_cuts:
        if loaded_cd.legacy_names is None:
            names_for_salt.append(loaded_cd.setname)
        else:
            names_for_salt.append(loaded_cd.legacy_names[0])
    name_salt = "-".join(names_for_salt)

    # Reduce the hash to a small integer so the salted seed stays in a sane range
    name_seed = int(hashlib.sha256(name_salt.encode()).hexdigest(), 16) % 10**8
    return name_seed + replica_mcseed


class _Masks(TupleComp):
"""Class holding the training validation mask for a group of datasets
If the same group of dataset receives the same trvlseed then the mask
Expand Down
184 changes: 110 additions & 74 deletions validphys2/src/validphys/pseudodata.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,16 +121,17 @@ def read_replica_pseudodata(fit, context_index, replica):


def make_replica(
groups_dataset_inputs_loaded_cd_with_cuts,
replica_mcseed,
central_values_array,
group_replica_mcseed,
dataset_inputs_sampling_covmat,
group_multiplicative_errors=None,
group_positivity_mask=None,
sep_mult=False,
genrep=True,
max_tries=int(1e6),
resample_negative_pseudodata=False,
):
"""Function that takes in a list of :py:class:`nnpdf_data.coredata.CommonData`
objects and returns a pseudodata replica accounting for
"""Function that takes in a central value array and a covariance matrix
and returns a pseudodata replica accounting for
possible correlations between systematic uncertainties.

The function loops until positive definite pseudodata is generated for any
Expand All @@ -139,17 +140,22 @@ def make_replica(

Parameters
---------
groups_dataset_inputs_loaded_cd_with_cuts: list[:py:class:`nnpdf_data.coredata.CommonData`]
List of CommonData objects which stores information about systematic errors,
their treatment and description, for each dataset.
central_values_array: np.array
Numpy array which is N_dat (where N_dat is the combined number of data points after cuts)
containing the central values of the data.

replica_mcseed: int, None
Seed used to initialise the numpy random number generator. If ``None`` then a random seed is
allocated using the default numpy behaviour.
group_replica_mcseed: int
Seed used to initialise the numpy random number generator.

dataset_inputs_sampling_covmat: np.array
Full covmat to be used. It can be either only experimental or also theoretical.

group_multiplicative_errors: dict
Dictionary containing the multiplicative uncertainties contribution to the pseudodata replica.

group_positivity_mask: np.array
Boolean array of shape (N_dat,) indicating which data points should be positive.

sep_mult: bool
Specifies whether computing the shifts with the full covmat
or whether multiplicative errors should be separated
Expand All @@ -162,9 +168,6 @@ def make_replica(
If after max_tries (default=1e6) no physical configuration is found,
it will raise a :py:class:`ReplicaGenerationError`

resample_negative_pseudodata: bool
When True, replicas that produce negative predictions will be resampled for ``max_tries``
until all points are positive (default: False)
Returns
-------
pseudodata: np.array
Expand All @@ -187,90 +190,123 @@ def make_replica(
0.34206012, 0.31866286, 0.2790856 , 0.33257621, 0.33680007,
"""
if not genrep:
return np.concatenate(
[cd.central_values for cd in groups_dataset_inputs_loaded_cd_with_cuts]
)
# Seed the numpy RNG with the seed and the name of the datasets in this run

# TODO: to be simplified after the reader is merged, together with an update of the regression tests
# this is necessary to reproduce exactly the results due to the replicas being generated with a hash
# Only when the sets are legacy (or coming from a legacy runcard) this shall be used
names_for_salt = []
for loaded_cd in groups_dataset_inputs_loaded_cd_with_cuts:
if loaded_cd.legacy_names is None:
names_for_salt.append(loaded_cd.setname)
else:
names_for_salt.append(loaded_cd.legacy_names[0])
name_salt = "-".join(names_for_salt)

name_seed = int(hashlib.sha256(name_salt.encode()).hexdigest(), 16) % 10**8
rng = np.random.default_rng(seed=replica_mcseed + name_seed)
return central_values_array

# Set random seed
rng = np.random.default_rng(seed=group_replica_mcseed)
# construct covmat
covmat = dataset_inputs_sampling_covmat
covmat_sqrt = sqrt_covmat(covmat)
# Loading the data
pseudodatas = []
check_positive_masks = []
nonspecial_mult = []
special_mult = []
for cd in groups_dataset_inputs_loaded_cd_with_cuts:
# copy here to avoid mutating the central values.
pseudodata = cd.central_values.to_numpy()

pseudodatas.append(pseudodata)
# Separation of multiplicative errors. If sep_mult is True also the exp_covmat is produced
# without multiplicative errors
if sep_mult:
mult_errors = cd.multiplicative_errors
mult_uncorr_errors = mult_errors.loc[:, mult_errors.columns == "UNCORR"].to_numpy()
mult_corr_errors = mult_errors.loc[:, mult_errors.columns == "CORR"].to_numpy()
nonspecial_mult.append((mult_uncorr_errors, mult_corr_errors))
special_mult.append(
mult_errors.loc[:, ~mult_errors.columns.isin(INTRA_DATASET_SYS_NAME)]
)
if "ASY" in cd.commondataproc or cd.commondataproc.endswith("_POL"):
check_positive_masks.append(np.zeros_like(pseudodata, dtype=bool))
else:
check_positive_masks.append(np.ones_like(pseudodata, dtype=bool))
# concatenating special multiplicative errors, pseudodatas and positive mask
if sep_mult:
special_mult_errors = pd.concat(special_mult, axis=0, sort=True).fillna(0).to_numpy()
all_pseudodata = np.concatenate(pseudodatas, axis=0)
full_mask = np.concatenate(check_positive_masks, axis=0)
full_mask = (
group_positivity_mask
if group_positivity_mask is not None
else np.zeros_like(central_values_array, dtype=bool)
)
# The inner while True loop is for ensuring a positive definite
# pseudodata replica
for _ in range(max_tries):
mult_shifts = []
# Prepare the per-dataset multiplicative shifts
for mult_uncorr_errors, mult_corr_errors in nonspecial_mult:
# convert to from percent to fraction
mult_shift = (
1 + mult_uncorr_errors * rng.normal(size=mult_uncorr_errors.shape) / 100
).prod(axis=1)
if group_multiplicative_errors is not None:
for mult_uncorr_errors, mult_corr_errors in group_multiplicative_errors[
"nonspecial_mult"
]:
# convert to from percent to fraction
mult_shift = (
1 + mult_uncorr_errors * rng.normal(size=mult_uncorr_errors.shape) / 100
).prod(axis=1)

mult_shift *= (
1 + mult_corr_errors * rng.normal(size=(1, mult_corr_errors.shape[1])) / 100
).prod(axis=1)
mult_shift *= (
1 + mult_corr_errors * rng.normal(size=(1, mult_corr_errors.shape[1])) / 100
).prod(axis=1)

mult_shifts.append(mult_shift)
mult_shifts.append(mult_shift)

# If sep_mult is true then the multiplicative shifts were not included in the covmat
shifts = covmat_sqrt @ rng.normal(size=covmat.shape[1])
mult_part = 1.0
if sep_mult:
special_mult_errors = group_multiplicative_errors["special_mult"]
special_mult = (
1 + special_mult_errors * rng.normal(size=(1, special_mult_errors.shape[1])) / 100
).prod(axis=1)
mult_part = np.concatenate(mult_shifts, axis=0) * special_mult
# Shifting pseudodata
shifted_pseudodata = (all_pseudodata + shifts) * mult_part
shifted_pseudodata = (central_values_array + shifts) * mult_part
# positivity control
if np.all(shifted_pseudodata[full_mask] >= 0) or not resample_negative_pseudodata:
if np.all(shifted_pseudodata[full_mask] >= 0):
return shifted_pseudodata

dfail = " ".join(i.setname for i in groups_dataset_inputs_loaded_cd_with_cuts)
log.error(f"Error generating replicas for the group: {dfail}")
raise ReplicaGenerationError(f"No valid replica found after {max_tries} attempts")
# Find which dataset index corresponds to the negative points, and print it out for debugging purposes
negative_mask = shifted_pseudodata < 0 & full_mask
negative_indices = np.where(negative_mask)[0]

raise ReplicaGenerationError(
f"No valid replica found after {max_tries} attempts. "
f"Negative global indices: {negative_indices.tolist()}"
)


def central_values_array(groups_dataset_inputs_loaded_cd_with_cuts):
    """Concatenate the central values of a list of
    :py:class:`nnpdf_data.coredata.CommonData` objects into a single 1-D array.
    """
    return np.concatenate(
        [cd.central_values.to_numpy() for cd in groups_dataset_inputs_loaded_cd_with_cuts],
        axis=0,
    )


def group_multiplicative_errors(groups_dataset_inputs_loaded_cd_with_cuts, sep_mult):
    """Function that takes in a list of :py:class:`nnpdf_data.coredata.CommonData`
    and returns the multiplicative uncertainties contribution to the pseudodata
    replica in the form of a dictionary separating the intra-dataset
    ("nonspecial") systematics from the special, possibly cross-dataset, ones.

    Parameters
    ----------
    groups_dataset_inputs_loaded_cd_with_cuts: list[:py:class:`nnpdf_data.coredata.CommonData`]
        List of CommonData objects which stores information about systematic errors,
        their treatment and description, for each dataset.
    sep_mult: bool
        Whether multiplicative errors are to be treated separately from the
        sampling covariance matrix. When ``False`` no separation is needed and
        ``None`` is returned.

    Returns
    -------
    multiplicative_errors: dict or None
        ``{"nonspecial_mult": list[tuple], "special_mult": np.array}`` where
        each tuple holds the per-dataset (UNCORR, CORR) percentage errors, or
        ``None`` when ``sep_mult`` is ``False``.
    """
    if not sep_mult:
        return None

    nonspecial_mult = []
    special_mult = []
    for cd in groups_dataset_inputs_loaded_cd_with_cuts:
        mult_errors = cd.multiplicative_errors
        mult_uncorr_errors = mult_errors.loc[:, mult_errors.columns == "UNCORR"].to_numpy()
        mult_corr_errors = mult_errors.loc[:, mult_errors.columns == "CORR"].to_numpy()
        nonspecial_mult.append((mult_uncorr_errors, mult_corr_errors))
        # Anything that is not an intra-dataset systematic is "special", i.e.
        # potentially correlated across datasets, and must be kept together.
        special_mult.append(
            mult_errors.loc[:, ~mult_errors.columns.isin(INTRA_DATASET_SYS_NAME)]
        )

    # Concatenate the special multiplicative errors over all datasets, filling
    # with zeros where a dataset does not carry a given special systematic.
    special_mult_errors = pd.concat(special_mult, axis=0, sort=True).fillna(0).to_numpy()

    multiplicative_errors = {
        "nonspecial_mult": nonspecial_mult,
        "special_mult": special_mult_errors,
    }

    return multiplicative_errors


def group_positivity_mask(
    groups_dataset_inputs_loaded_cd_with_cuts, resample_negative_pseudodata=False
):
    """Function that takes in a list of :py:class:`nnpdf_data.coredata.CommonData`
    and returns a boolean mask indicating which data points should be positive.
    Returns ``None`` when negative pseudodata is not being resampled.
    """
    if not resample_negative_pseudodata:
        return None
    per_dataset_masks = []
    for cd in groups_dataset_inputs_loaded_cd_with_cuts:
        npoints = cd.central_values.to_numpy().shape[0]
        # Asymmetry and polarised observables may legitimately be negative,
        # so their points are excluded from the positivity check.
        may_be_negative = "ASY" in cd.commondataproc or cd.commondataproc.endswith("_POL")
        per_dataset_masks.append(np.full(npoints, not may_be_negative, dtype=bool))
    return np.concatenate(per_dataset_masks, axis=0)


def indexed_make_replica(groups_index, make_replica):
Expand Down
Loading