Skip to content
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions validphys2/src/validphys/n3fit_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,34 @@ def replica_luxseed(replica, luxseed):
return replica_nnseed(replica, luxseed)


def group_replica_mcseed(replica_mcseed, groups_dataset_inputs_loaded_cd_with_cuts, genrep=True):
    """Generate the ``mcseed`` for a group of datasets.

    This is done by hashing the names of the datasets in the group and adding
    the resulting salt to ``replica_mcseed``, so that groups made of different
    datasets draw independent pseudodata even for the same replica seed.

    Parameters
    ----------
    replica_mcseed: int or None
        Base Monte Carlo seed for this replica. If ``None``, ``None`` is
        returned so that the numpy RNG is later seeded with its default
        (random) behaviour.
    groups_dataset_inputs_loaded_cd_with_cuts: list[:py:class:`nnpdf_data.coredata.CommonData`]
        List of CommonData objects which stores information about systematic errors,
        their treatment and description, for each dataset.
    genrep: bool
        Whether pseudodata replicas are being generated at all; when ``False``
        no seed is needed and ``None`` is returned.

    Returns
    -------
    int or None
        The salted seed, or ``None`` when no seeding is required.
    """
    # No seed is needed when replicas are not generated, and a None base seed
    # is passed through so the RNG falls back to numpy's default seeding.
    if not genrep or replica_mcseed is None:
        return None
    names_for_salt = []
    # Try to use the new dataset name, but make older runs reproducible by keeping the old names.
    # WARNING: don't rely on this behaviour, this might be removed in future releases

    for loaded_cd in groups_dataset_inputs_loaded_cd_with_cuts:
        if loaded_cd.legacy_names is None:
            names_for_salt.append(loaded_cd.setname)
        else:
            names_for_salt.append(loaded_cd.legacy_names[0])
    name_salt = "-".join(names_for_salt)

    # Reduce the hash to a small integer so the salted seed stays in a sane range
    name_seed = int(hashlib.sha256(name_salt.encode()).hexdigest(), 16) % 10**8
    return name_seed + replica_mcseed


class _Masks(TupleComp):
"""Class holding the training validation mask for a group of datasets
If the same group of dataset receives the same trvlseed then the mask
Expand Down
184 changes: 110 additions & 74 deletions validphys2/src/validphys/pseudodata.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,16 +121,17 @@ def read_replica_pseudodata(fit, context_index, replica):


def make_replica(
groups_dataset_inputs_loaded_cd_with_cuts,
replica_mcseed,
central_values_array,
group_replica_mcseed,
dataset_inputs_sampling_covmat,
group_multiplicative_errors=None,
group_positivity_mask=None,
sep_mult=False,
genrep=True,
max_tries=int(1e6),
resample_negative_pseudodata=False,
):
"""Function that takes in a list of :py:class:`nnpdf_data.coredata.CommonData`
objects and returns a pseudodata replica accounting for
"""Function that takes in a central value array and a covariance matrix
and returns a pseudodata replica accounting for
possible correlations between systematic uncertainties.

The function loops until positive definite pseudodata is generated for any
Expand All @@ -139,17 +140,22 @@ def make_replica(

Parameters
---------
groups_dataset_inputs_loaded_cd_with_cuts: list[:py:class:`nnpdf_data.coredata.CommonData`]
List of CommonData objects which stores information about systematic errors,
their treatment and description, for each dataset.
central_values_array: np.array
Numpy array which is N_dat (where N_dat is the combined number of data points after cuts)
containing the central values of the data.

replica_mcseed: int, None
Seed used to initialise the numpy random number generator. If ``None`` then a random seed is
allocated using the default numpy behaviour.
group_replica_mcseed: int
Seed used to initialise the numpy random number generator.

dataset_inputs_sampling_covmat: np.array
Full covmat to be used. It can be either only experimental or also theoretical.

group_multiplicative_errors: dict
Dictionary containing the multiplicative uncertainties contribution to the pseudodata replica.

group_positivity_mask: np.array
Boolean array of shape (N_dat,) indicating which data points should be positive.

sep_mult: bool
Specifies whether computing the shifts with the full covmat
or whether multiplicative errors should be separated
Expand All @@ -162,9 +168,6 @@ def make_replica(
If after max_tries (default=1e6) no physical configuration is found,
it will raise a :py:class:`ReplicaGenerationError`

resample_negative_pseudodata: bool
When True, replicas that produce negative predictions will be resampled for ``max_tries``
until all points are positive (default: False)
Returns
-------
pseudodata: np.array
Expand All @@ -187,90 +190,123 @@ def make_replica(
0.34206012, 0.31866286, 0.2790856 , 0.33257621, 0.33680007,
"""
if not genrep:
return np.concatenate(
[cd.central_values for cd in groups_dataset_inputs_loaded_cd_with_cuts]
)
# Seed the numpy RNG with the seed and the name of the datasets in this run

# TODO: to be simplified after the reader is merged, together with an update of the regression tests
# this is necessary to reproduce exactly the results due to the replicas being generated with a hash
# Only when the sets are legacy (or coming from a legacy runcard) this shall be used
names_for_salt = []
for loaded_cd in groups_dataset_inputs_loaded_cd_with_cuts:
if loaded_cd.legacy_names is None:
names_for_salt.append(loaded_cd.setname)
else:
names_for_salt.append(loaded_cd.legacy_names[0])
name_salt = "-".join(names_for_salt)

name_seed = int(hashlib.sha256(name_salt.encode()).hexdigest(), 16) % 10**8
rng = np.random.default_rng(seed=replica_mcseed + name_seed)
return central_values_array

# Set random seed
rng = np.random.default_rng(seed=group_replica_mcseed)
# construct covmat
covmat = dataset_inputs_sampling_covmat
covmat_sqrt = sqrt_covmat(covmat)
# Loading the data
pseudodatas = []
check_positive_masks = []
nonspecial_mult = []
special_mult = []
for cd in groups_dataset_inputs_loaded_cd_with_cuts:
# copy here to avoid mutating the central values.
pseudodata = cd.central_values.to_numpy()

pseudodatas.append(pseudodata)
# Separation of multiplicative errors. If sep_mult is True also the exp_covmat is produced
# without multiplicative errors
if sep_mult:
mult_errors = cd.multiplicative_errors
mult_uncorr_errors = mult_errors.loc[:, mult_errors.columns == "UNCORR"].to_numpy()
mult_corr_errors = mult_errors.loc[:, mult_errors.columns == "CORR"].to_numpy()
nonspecial_mult.append((mult_uncorr_errors, mult_corr_errors))
special_mult.append(
mult_errors.loc[:, ~mult_errors.columns.isin(INTRA_DATASET_SYS_NAME)]
)
if "ASY" in cd.commondataproc or cd.commondataproc.endswith("_POL"):
check_positive_masks.append(np.zeros_like(pseudodata, dtype=bool))
else:
check_positive_masks.append(np.ones_like(pseudodata, dtype=bool))
# concatenating special multiplicative errors, pseudodatas and positive mask
if sep_mult:
special_mult_errors = pd.concat(special_mult, axis=0, sort=True).fillna(0).to_numpy()
all_pseudodata = np.concatenate(pseudodatas, axis=0)
full_mask = np.concatenate(check_positive_masks, axis=0)
full_mask = (
group_positivity_mask
if group_positivity_mask is not None
else np.zeros_like(central_values_array, dtype=bool)
)
# The inner while True loop is for ensuring a positive definite
# pseudodata replica
for _ in range(max_tries):
mult_shifts = []
# Prepare the per-dataset multiplicative shifts
for mult_uncorr_errors, mult_corr_errors in nonspecial_mult:
# convert to from percent to fraction
mult_shift = (
1 + mult_uncorr_errors * rng.normal(size=mult_uncorr_errors.shape) / 100
).prod(axis=1)
if group_multiplicative_errors is not None:
for mult_uncorr_errors, mult_corr_errors in group_multiplicative_errors[
"nonspecial_mult"
]:
# convert to from percent to fraction
mult_shift = (
1 + mult_uncorr_errors * rng.normal(size=mult_uncorr_errors.shape) / 100
).prod(axis=1)

mult_shift *= (
1 + mult_corr_errors * rng.normal(size=(1, mult_corr_errors.shape[1])) / 100
).prod(axis=1)
mult_shift *= (
1 + mult_corr_errors * rng.normal(size=(1, mult_corr_errors.shape[1])) / 100
).prod(axis=1)

mult_shifts.append(mult_shift)
mult_shifts.append(mult_shift)

# If sep_mult is true then the multiplicative shifts were not included in the covmat
shifts = covmat_sqrt @ rng.normal(size=covmat.shape[1])
mult_part = 1.0
if sep_mult:
special_mult_errors = group_multiplicative_errors["special_mult"]
special_mult = (
1 + special_mult_errors * rng.normal(size=(1, special_mult_errors.shape[1])) / 100
).prod(axis=1)
mult_part = np.concatenate(mult_shifts, axis=0) * special_mult
# Shifting pseudodata
shifted_pseudodata = (all_pseudodata + shifts) * mult_part
shifted_pseudodata = (central_values_array + shifts) * mult_part
# positivity control
if np.all(shifted_pseudodata[full_mask] >= 0) or not resample_negative_pseudodata:
if np.all(shifted_pseudodata[full_mask] >= 0):
return shifted_pseudodata

dfail = " ".join(i.setname for i in groups_dataset_inputs_loaded_cd_with_cuts)
log.error(f"Error generating replicas for the group: {dfail}")
raise ReplicaGenerationError(f"No valid replica found after {max_tries} attempts")
# Find which dataset index corresponds to the negative points, and print it out for debugging purposes
negative_mask = shifted_pseudodata < 0 & full_mask
negative_indices = np.where(negative_mask)[0]

raise ReplicaGenerationError(
f"No valid replica found after {max_tries} attempts. "
f"Negative global indices: {negative_indices.tolist()}"
)


def central_values_array(groups_dataset_inputs_loaded_cd_with_cuts):
    """Concatenate the central values of a list of
    :py:class:`nnpdf_data.coredata.CommonData` objects into a single 1-D array.
    """
    return np.concatenate(
        [cd.central_values.to_numpy() for cd in groups_dataset_inputs_loaded_cd_with_cuts],
        axis=0,
    )


def group_multiplicative_errors(groups_dataset_inputs_loaded_cd_with_cuts, sep_mult):
    """Function that takes in a list of :py:class:`nnpdf_data.coredata.CommonData`
    and returns the multiplicative uncertainties contribution to the pseudodata
    replica in the form of a dictionary separating the intra-dataset
    ("nonspecial") systematics from the special, possibly cross-dataset, ones.

    Parameters
    ----------
    groups_dataset_inputs_loaded_cd_with_cuts: list[:py:class:`nnpdf_data.coredata.CommonData`]
        List of CommonData objects which stores information about systematic errors,
        their treatment and description, for each dataset.
    sep_mult: bool
        Whether multiplicative errors are to be treated separately from the
        sampling covariance matrix. When ``False`` no separation is needed and
        ``None`` is returned.

    Returns
    -------
    multiplicative_errors: dict or None
        ``{"nonspecial_mult": list[tuple], "special_mult": np.array}`` where
        each tuple holds the per-dataset (UNCORR, CORR) percentage errors, or
        ``None`` when ``sep_mult`` is ``False``.
    """
    if not sep_mult:
        return None

    nonspecial_mult = []
    special_mult = []
    for cd in groups_dataset_inputs_loaded_cd_with_cuts:
        mult_errors = cd.multiplicative_errors
        mult_uncorr_errors = mult_errors.loc[:, mult_errors.columns == "UNCORR"].to_numpy()
        mult_corr_errors = mult_errors.loc[:, mult_errors.columns == "CORR"].to_numpy()
        nonspecial_mult.append((mult_uncorr_errors, mult_corr_errors))
        # Anything that is not an intra-dataset systematic is "special", i.e.
        # potentially correlated across datasets, and must be kept together.
        special_mult.append(
            mult_errors.loc[:, ~mult_errors.columns.isin(INTRA_DATASET_SYS_NAME)]
        )

    # Concatenate the special multiplicative errors over all datasets, filling
    # with zeros where a dataset does not carry a given special systematic.
    special_mult_errors = pd.concat(special_mult, axis=0, sort=True).fillna(0).to_numpy()

    multiplicative_errors = {
        "nonspecial_mult": nonspecial_mult,
        "special_mult": special_mult_errors,
    }

    return multiplicative_errors


def group_positivity_mask(
    groups_dataset_inputs_loaded_cd_with_cuts, resample_negative_pseudodata=False
):
    """Function that takes in a list of :py:class:`nnpdf_data.coredata.CommonData`
    and returns a boolean mask indicating which data points should be positive.
    Returns ``None`` when negative pseudodata is not being resampled.
    """
    if not resample_negative_pseudodata:
        return None
    per_dataset_masks = []
    for cd in groups_dataset_inputs_loaded_cd_with_cuts:
        npoints = cd.central_values.to_numpy().shape[0]
        # Asymmetry and polarised observables may legitimately be negative,
        # so their points are excluded from the positivity check.
        may_be_negative = "ASY" in cd.commondataproc or cd.commondataproc.endswith("_POL")
        per_dataset_masks.append(np.full(npoints, not may_be_negative, dtype=bool))
    return np.concatenate(per_dataset_masks, axis=0)


def indexed_make_replica(groups_index, make_replica):
Expand Down
Loading