Skip to content

Commit aab0634

Browse files
authored
Merge pull request #2431 from NNPDF/make_replicas_colibri
Make replicas Colibri compatible
2 parents b91ba01 + ce74fbc commit aab0634

File tree

6 files changed

+164
-83
lines changed

6 files changed

+164
-83
lines changed

.github/workflows/fitbot.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ env:
1212
POSTFIT_NREP: 15 # requested minimum replicas for postfit
1313
# IMPORTANT
1414
# WHEN CHANGING THE REFERENCE SET, THE NEW REFERENCE MUST BE MANUALLY UPLOADED TO THE SERVER
15-
REFERENCE_SET: NNNBOT-1a81255f3-2026-01-27 # reference set for exact results
15+
REFERENCE_SET: NNBOT-351efa9ef-2026-03-11 # reference set for exact results
1616
STABLE_REFERENCE_SET: NNBOT-99108504e-2025-11-22 # reference set for last tag
1717
PYTHONHASHSEED: "0"
1818

validphys2/src/validphys/n3fit_data.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,34 @@ def replica_luxseed(replica, luxseed):
110110
return replica_nnseed(replica, luxseed)
111111

112112

113+
def group_replica_mcseed(replica_mcseed, groups_dataset_inputs_loaded_cd_with_cuts, genrep=True):
    """Generates the ``mcseed`` for a group of datasets. This is done by hashing the names
    of the datasets in the group and adding the hash to the ``replica_mcseed``.

    Parameters
    ----------
    replica_mcseed: int
        Per-replica Monte Carlo seed to which the dataset-name hash is added.
    groups_dataset_inputs_loaded_cd_with_cuts: list[:py:class:`nnpdf_data.coredata.CommonData`]
        List of CommonData objects which stores information about systematic errors,
        their treatment and description, for each dataset.
    genrep: bool
        Whether pseudodata replicas are to be generated; when False no seed is
        needed and ``None`` is returned.

    Returns
    -------
    int or None
        The salted seed, or ``None`` when ``genrep`` is False.
    """
    if not genrep:
        return None
    names_for_salt = []
    # Try to use the new dataset name, but make older runs reproducible by keeping the old names.
    # WARNING: don't rely on this behaviour, this might be removed in future releases
    for loaded_cd in groups_dataset_inputs_loaded_cd_with_cuts:
        if loaded_cd.legacy_names is None:
            names_for_salt.append(loaded_cd.setname)
        else:
            names_for_salt.append(loaded_cd.legacy_names[0])
    name_salt = "-".join(names_for_salt)

    # Reduce the hash to 8 digits so the salted seed stays small and reproducible
    name_seed = int(hashlib.sha256(name_salt.encode()).hexdigest(), 16) % 10**8
    return name_seed + replica_mcseed
139+
140+
113141
class _Masks(TupleComp):
114142
"""Class holding the training validation mask for a group of datasets
115143
If the same group of dataset receives the same trvlseed then the mask

validphys2/src/validphys/pseudodata.py

Lines changed: 112 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -121,16 +121,17 @@ def read_replica_pseudodata(fit, context_index, replica):
121121

122122

123123
def make_replica(
    central_values_array,
    group_replica_mcseed,
    dataset_inputs_sampling_covmat,
    group_multiplicative_errors=None,
    group_positivity_mask=None,
    sep_mult=False,
    genrep=True,
    max_tries=int(1e6),
):
    """Function that takes in a central value array and a covariance matrix
    and returns a pseudodata replica accounting for
    possible correlations between systematic uncertainties.

    The function loops until pseudodata satisfying the positivity mask is
    generated, at most ``max_tries`` times.

    Parameters
    ----------
    central_values_array: np.array
        Numpy array which is N_dat (where N_dat is the combined number of data points after cuts)
        containing the central values of the data.
    group_replica_mcseed: int
        Seed used to initialise the numpy random number generator.
    dataset_inputs_sampling_covmat: np.array
        Full covmat to be used. It can be either only experimental or also theoretical.
    group_multiplicative_errors: dict, optional
        Dictionary containing the multiplicative uncertainties contribution to the
        pseudodata replica, with keys ``"nonspecial_mult"`` and ``"special_mult"``.
        Required when ``sep_mult`` is True.
    group_positivity_mask: np.array, optional
        Boolean array of shape (N_dat,) indicating which data points should be positive.
        If ``None``, no positivity requirement is enforced.
    sep_mult: bool
        Specifies whether computing the shifts with the full covmat
        or whether multiplicative errors should be separated.
    genrep: bool
        If False the central values are returned unchanged (no fluctuation).
    max_tries: int
        If after max_tries (default=1e6) no physical configuration is found,
        it will raise a :py:class:`ReplicaGenerationError`.

    Returns
    -------
    pseudodata: np.array
        Replica of the central values fluctuated according to the covariance matrix
        (and, when ``sep_mult`` is True, the multiplicative errors).
    """
    if not genrep:
        return central_values_array

    # Set random seed
    rng = np.random.default_rng(seed=group_replica_mcseed)
    # construct covmat
    covmat = dataset_inputs_sampling_covmat
    covmat_sqrt = sqrt_covmat(covmat)

    # Points outside the mask are never rejected by the positivity check below
    full_mask = (
        group_positivity_mask
        if group_positivity_mask is not None
        else np.zeros_like(central_values_array, dtype=bool)
    )
    # Loop until a pseudodata replica passing the positivity check is produced
    for _ in range(max_tries):
        mult_shifts = []
        # Prepare the per-dataset multiplicative shifts
        if group_multiplicative_errors is not None:
            for mult_uncorr_errors, mult_corr_errors in group_multiplicative_errors[
                "nonspecial_mult"
            ]:
                # convert from percent to fraction
                mult_shift = (
                    1 + mult_uncorr_errors * rng.normal(size=mult_uncorr_errors.shape) / 100
                ).prod(axis=1)

                mult_shift *= (
                    1 + mult_corr_errors * rng.normal(size=(1, mult_corr_errors.shape[1])) / 100
                ).prod(axis=1)

                mult_shifts.append(mult_shift)

        # If sep_mult is true then the multiplicative shifts were not included in the covmat
        shifts = covmat_sqrt @ rng.normal(size=covmat.shape[1])
        mult_part = 1.0
        if sep_mult:
            # NOTE(review): sep_mult=True requires group_multiplicative_errors to be provided
            special_mult_errors = group_multiplicative_errors["special_mult"]
            special_mult = (
                1 + special_mult_errors * rng.normal(size=(1, special_mult_errors.shape[1])) / 100
            ).prod(axis=1)
            mult_part = np.concatenate(mult_shifts, axis=0) * special_mult
        # Shifting pseudodata
        shifted_pseudodata = (central_values_array + shifts) * mult_part
        # positivity control
        if np.all(shifted_pseudodata[full_mask] >= 0):
            return shifted_pseudodata

    # Find which dataset index corresponds to the negative points, and print it out
    # for debugging purposes. The comparison must be parenthesised: "&" binds tighter
    # than "<", so without parentheses this evaluated "shifted_pseudodata < (0 & full_mask)"
    # and ignored the mask entirely.
    negative_mask = (shifted_pseudodata < 0) & full_mask
    negative_indices = np.where(negative_mask)[0]

    raise ReplicaGenerationError(
        f"No valid replica found after {max_tries} attempts. "
        f"Negative global indices: {negative_indices.tolist()}"
    )
249+
250+
251+
def central_values_array(groups_dataset_inputs_loaded_cd_with_cuts):
    """Concatenate the central values of a list of
    :py:class:`nnpdf_data.coredata.CommonData` objects into a single array.
    """
    return np.concatenate(
        [cd.central_values.to_numpy() for cd in groups_dataset_inputs_loaded_cd_with_cuts],
        axis=0,
    )
259+
260+
261+
def group_multiplicative_errors(groups_dataset_inputs_loaded_cd_with_cuts, sep_mult=False):
    """Function that takes in a list of :py:class:`nnpdf_data.coredata.CommonData`
    and returns the multiplicative uncertainties contribution to the pseudodata replica.

    Parameters
    ----------
    groups_dataset_inputs_loaded_cd_with_cuts: list[:py:class:`nnpdf_data.coredata.CommonData`]
        List of CommonData objects holding the systematic errors of each dataset.
    sep_mult: bool
        Whether multiplicative errors are treated separately from the covmat.

    Returns
    -------
    dict or None
        ``None`` when ``sep_mult`` is False; otherwise a dictionary with keys
        ``"nonspecial_mult"`` — a list of ``(uncorrelated, correlated)`` error-array
        pairs, one per dataset — and ``"special_mult"`` — the errors whose names are
        not in ``INTRA_DATASET_SYS_NAME``, concatenated over all datasets.
    """
    if not sep_mult:
        return None

    # After the early return above sep_mult is always True, so the original
    # per-iteration "if sep_mult" re-checks were dead code and are dropped.
    nonspecial_mult = []
    special_mult = []
    for cd in groups_dataset_inputs_loaded_cd_with_cuts:
        mult_errors = cd.multiplicative_errors
        mult_uncorr_errors = mult_errors.loc[:, mult_errors.columns == "UNCORR"].to_numpy()
        mult_corr_errors = mult_errors.loc[:, mult_errors.columns == "CORR"].to_numpy()
        nonspecial_mult.append((mult_uncorr_errors, mult_corr_errors))
        special_mult.append(mult_errors.loc[:, ~mult_errors.columns.isin(INTRA_DATASET_SYS_NAME)])

    # concatenating special multiplicative errors
    special_mult_errors = pd.concat(special_mult, axis=0, sort=True).fillna(0).to_numpy()

    return {"nonspecial_mult": nonspecial_mult, "special_mult": special_mult_errors}
291+
292+
293+
def group_positivity_mask(
    groups_dataset_inputs_loaded_cd_with_cuts, resample_negative_pseudodata=False
):
    """Build a boolean mask, one entry per data point, marking which points are
    required to be positive, given a list of
    :py:class:`nnpdf_data.coredata.CommonData` objects.

    Returns ``None`` when ``resample_negative_pseudodata`` is False. Datasets whose
    process contains "ASY" or ends in "_POL" may legitimately be negative, so their
    points are excluded from the positivity requirement.
    """
    if not resample_negative_pseudodata:
        return None

    per_dataset_masks = []
    for cd in groups_dataset_inputs_loaded_cd_with_cuts:
        values = cd.central_values.to_numpy()
        may_be_negative = "ASY" in cd.commondataproc or cd.commondataproc.endswith("_POL")
        per_dataset_masks.append(
            np.zeros_like(values, dtype=bool)
            if may_be_negative
            else np.ones_like(values, dtype=bool)
        )
    return np.concatenate(per_dataset_masks, axis=0)
274309

275310

276311
def indexed_make_replica(groups_index, make_replica):
@@ -397,8 +432,11 @@ def make_level1_data(data, level0_commondata_wc, filterseed, data_index, sep_mul
397432
)
398433

399434
# ================== generation of Level1 data ======================#
435+
central_vals= central_values_array(level0_commondata_wc)
436+
group_mult_errs = group_multiplicative_errors(level0_commondata_wc, sep_mult=sep_mult)
437+
group_pos_mask = group_positivity_mask(level0_commondata_wc)
400438
level1_data = make_replica(
401-
level0_commondata_wc, filterseed, covmat, sep_mult=sep_mult, genrep=True
439+
central_vals, filterseed, covmat, group_multiplicative_errors=group_mult_errs, group_positivity_mask=group_pos_mask, sep_mult=sep_mult, genrep=True,
402440
)
403441

404442
indexed_level1_data = indexed_make_replica(data_index, level1_data)

validphys2/src/validphys/tests/conftest.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -107,15 +107,16 @@ def data_internal_cuts_config(data_config):
107107

108108

109109
@pytest.fixture(scope='module')
def data_internal_cuts_closure_config(data_internal_cuts_config):
    # Filterseed is not added so that it is changed by the tests
    return {**data_internal_cuts_config, "fakepdf": PDF}
114115

115116

116117
@pytest.fixture(scope='module')
117-
def data_fromfit_cuts_config(data_internal_cuts_new_theory_config):
118-
config = dict(data_internal_cuts_new_theory_config)
118+
def data_fromfit_cuts_config(data_internal_cuts_config):
119+
config = dict(data_internal_cuts_config)
119120
config.update(use_cuts="fromfit")
120121
return config
121122

validphys2/src/validphys/tests/test_datafiles.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
"""
2-
Test all datafiles
2+
Test all datafiles
33
4-
The checks in ``test_all_datasets`` are run for each dataset independently so that one gets one
5-
failure per dataset in case of problems
4+
The checks in ``test_all_datasets`` are run for each dataset independently so that one gets one
5+
failure per dataset in case of problems
66
"""
77

88
import pytest
@@ -24,7 +24,7 @@ def _load_main_and_variants(dataset_name):
2424

2525

2626
@pytest.mark.parametrize("dataset_name", all_datasets)
27-
def test_all_datasets(dataset_name, data_internal_cuts_new_theory_config):
27+
def test_all_datasets(dataset_name):
2828
"""Checks that a dataset can be loaded (together with its variants),
2929
that the kinematics, uncertainties and data can be read.
3030

validphys2/src/validphys/tests/test_pseudodata.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
recreation.
99
"""
1010

11+
import numpy as np
1112
from numpy.testing import assert_allclose
1213
import pandas as pd
1314
import pytest
@@ -115,3 +116,16 @@ def test_level0_commondata_wc():
115116
assert_allclose(
116117
dataset_t0_predictions(t0dataset=datasetspec, t0set=t0set), l0_vals, rtol=1e-07, atol=0
117118
)
119+
120+
121+
def test_level1_commondata(data_internal_cuts_closure_config):
    """
    check whether level 1 commondata can be generated and that it is seed-dependent.
    """
    # Two different filterseeds must yield different fluctuated central values
    replica_a = API.make_level1_data(**data_internal_cuts_closure_config, filterseed=2)
    replica_b = API.make_level1_data(**data_internal_cuts_closure_config, filterseed=42)

    values_a = pd.concat(cd.central_values for cd in replica_a)
    values_b = pd.concat(cd.central_values for cd in replica_b)

    assert not np.allclose(values_a, values_b)

0 commit comments

Comments
 (0)