diff --git a/.github/workflows/fitbot.yml b/.github/workflows/fitbot.yml index 4db180f7f3..dacd7f78dd 100644 --- a/.github/workflows/fitbot.yml +++ b/.github/workflows/fitbot.yml @@ -12,7 +12,7 @@ env: POSTFIT_NREP: 15 # requested minimum replicas for postfit # IMPORTANT # WHEN CHANGING THE REFERENCE SET, THE NEW REFERENCE MUST BE MANUALLY UPLOADED TO THE SERVER - REFERENCE_SET: NNNBOT-1a81255f3-2026-01-27 # reference set for exact results + REFERENCE_SET: NNBOT-351efa9ef-2026-03-11 # reference set for exact results STABLE_REFERENCE_SET: NNBOT-99108504e-2025-11-22 # reference set for last tag PYTHONHASHSEED: "0" diff --git a/n3fit/src/n3fit/backends/keras_backend/callbacks.py b/n3fit/src/n3fit/backends/keras_backend/callbacks.py index 8dc6bbee48..c0eee11e00 100644 --- a/n3fit/src/n3fit/backends/keras_backend/callbacks.py +++ b/n3fit/src/n3fit/backends/keras_backend/callbacks.py @@ -16,7 +16,7 @@ from time import time from keras import backend as K -from keras.callbacks import Callback, TensorBoard +from keras import callbacks import numpy as np from .operations import decorator_compiler @@ -24,7 +24,7 @@ log = logging.getLogger(__name__) -class CallbackStep(Callback): +class CallbackStep(callbacks.Callback): """ Wrapper around the keras Callback that keeps track of how the steps are divided between epochs and batches. 
@@ -214,7 +214,7 @@ def gen_tensorboard_callback(log_dir, profiling=False, histogram_freq=0): Whether or not to save profiling information (default False) """ profile_batch = 1 if profiling else 0 - clb = TensorBoard( + clb = callbacks.TensorBoard( log_dir=log_dir, histogram_freq=histogram_freq, write_graph=True, diff --git a/pyproject.toml b/pyproject.toml index 261c28b207..56a35b2cf5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -73,6 +73,7 @@ packaging = "*" psutil = "*" tensorflow = "*" keras = "^3.1" +tensorboard = "*" # keras requires tensorboard but some versions are missing the dependency eko = "^0.15.1" joblib = "*" seaborn = "*" diff --git a/validphys2/src/validphys/n3fit_data.py b/validphys2/src/validphys/n3fit_data.py index 2d23307365..d49df31f07 100644 --- a/validphys2/src/validphys/n3fit_data.py +++ b/validphys2/src/validphys/n3fit_data.py @@ -110,6 +110,34 @@ def replica_luxseed(replica, luxseed): return replica_nnseed(replica, luxseed) +def group_replica_mcseed(replica_mcseed, groups_dataset_inputs_loaded_cd_with_cuts, genrep=True): + """Generates the ``mcseed`` for a group of datasets. This is done by hashing the names + of the datasets in the group and adding it to the ``replica_mcseed``. + Parameters + --------- + groups_dataset_inputs_loaded_cd_with_cuts: list[:py:class:`nnpdf_data.coredata.CommonData`] + List of CommonData objects which stores information about systematic errors, + their treatment and description, for each dataset. + replica_mcseed: int + """ + if not genrep: + return None + names_for_salt = [] + # Try to use the new dataset name, but make older runs reproducible by keeping the old names. 
+ # WARNING: don't rely on this behaviour, this might be removed in future releases + + for loaded_cd in groups_dataset_inputs_loaded_cd_with_cuts: + if loaded_cd.legacy_names is None: + names_for_salt.append(loaded_cd.setname) + else: + names_for_salt.append(loaded_cd.legacy_names[0]) + name_salt = "-".join(names_for_salt) + + name_seed = int(hashlib.sha256(name_salt.encode()).hexdigest(), 16) % 10**8 + res = name_seed + replica_mcseed + return res + + class _Masks(TupleComp): """Class holding the training validation mask for a group of datasets If the same group of dataset receives the same trvlseed then the mask diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index 08a4f7103e..de7dd36c69 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -121,16 +121,17 @@ def read_replica_pseudodata(fit, context_index, replica): def make_replica( - groups_dataset_inputs_loaded_cd_with_cuts, - replica_mcseed, + central_values_array, + group_replica_mcseed, dataset_inputs_sampling_covmat, + group_multiplicative_errors=None, + group_positivity_mask=None, sep_mult=False, genrep=True, max_tries=int(1e6), - resample_negative_pseudodata=False, ): - """Function that takes in a list of :py:class:`nnpdf_data.coredata.CommonData` - objects and returns a pseudodata replica accounting for + """Function that takes in a central value array and a covariance matrix + and returns a pseudodata replica accounting for possible correlations between systematic uncertainties. The function loops until positive definite pseudodata is generated for any @@ -139,17 +140,22 @@ def make_replica( Parameters --------- - groups_dataset_inputs_loaded_cd_with_cuts: list[:py:class:`nnpdf_data.coredata.CommonData`] - List of CommonData objects which stores information about systematic errors, - their treatment and description, for each dataset. 
+ central_values_array: np.array + Numpy array which is N_dat (where N_dat is the combined number of data points after cuts) + containing the central values of the data. - replica_mcseed: int, None - Seed used to initialise the numpy random number generator. If ``None`` then a random seed is - allocated using the default numpy behaviour. + group_replica_mcseed: int + Seed used to initialise the numpy random number generator. dataset_inputs_sampling_covmat: np.array Full covmat to be used. It can be either only experimental or also theoretical. + group_multiplicative_errors: dict + Dictionary containing the multiplicative uncertainties contribution to the pseudodata replica. + + group_positivity_mask: np.array + Boolean array of shape (N_dat,) indicating which data points should be positive. + sep_mult: bool Specifies whether computing the shifts with the full covmat or whether multiplicative errors should be separated @@ -162,9 +168,6 @@ def make_replica( If after max_tries (default=1e6) no physical configuration is found, it will raise a :py:class:`ReplicaGenerationError` - resample_negative_pseudodata: bool - When True, replicas that produce negative predictions will be resampled for ``max_tries`` - until all points are positive (default: False) Returns ------- pseudodata: np.array @@ -187,90 +190,122 @@ def make_replica( 0.34206012, 0.31866286, 0.2790856 , 0.33257621, 0.33680007, """ if not genrep: - return np.concatenate( - [cd.central_values for cd in groups_dataset_inputs_loaded_cd_with_cuts] - ) - # Seed the numpy RNG with the seed and the name of the datasets in this run - - # TODO: to be simplified after the reader is merged, together with an update of the regression tests - # this is necessary to reproduce exactly the results due to the replicas being generated with a hash - # Only when the sets are legacy (or coming from a legacy runcard) this shall be used - names_for_salt = [] - for loaded_cd in groups_dataset_inputs_loaded_cd_with_cuts: - if 
loaded_cd.legacy_names is None: - names_for_salt.append(loaded_cd.setname) - else: - names_for_salt.append(loaded_cd.legacy_names[0]) - name_salt = "-".join(names_for_salt) + return central_values_array - name_seed = int(hashlib.sha256(name_salt.encode()).hexdigest(), 16) % 10**8 - rng = np.random.default_rng(seed=replica_mcseed + name_seed) + # Set random seed + rng = np.random.default_rng(seed=group_replica_mcseed) # construct covmat covmat = dataset_inputs_sampling_covmat covmat_sqrt = sqrt_covmat(covmat) - # Loading the data - pseudodatas = [] - check_positive_masks = [] - nonspecial_mult = [] - special_mult = [] - for cd in groups_dataset_inputs_loaded_cd_with_cuts: - # copy here to avoid mutating the central values. - pseudodata = cd.central_values.to_numpy() - pseudodatas.append(pseudodata) - # Separation of multiplicative errors. If sep_mult is True also the exp_covmat is produced - # without multiplicative errors - if sep_mult: - mult_errors = cd.multiplicative_errors - mult_uncorr_errors = mult_errors.loc[:, mult_errors.columns == "UNCORR"].to_numpy() - mult_corr_errors = mult_errors.loc[:, mult_errors.columns == "CORR"].to_numpy() - nonspecial_mult.append((mult_uncorr_errors, mult_corr_errors)) - special_mult.append( - mult_errors.loc[:, ~mult_errors.columns.isin(INTRA_DATASET_SYS_NAME)] - ) - if "ASY" in cd.commondataproc or cd.commondataproc.endswith("_POL"): - check_positive_masks.append(np.zeros_like(pseudodata, dtype=bool)) - else: - check_positive_masks.append(np.ones_like(pseudodata, dtype=bool)) - # concatenating special multiplicative errors, pseudodatas and positive mask - if sep_mult: - special_mult_errors = pd.concat(special_mult, axis=0, sort=True).fillna(0).to_numpy() - all_pseudodata = np.concatenate(pseudodatas, axis=0) - full_mask = np.concatenate(check_positive_masks, axis=0) + full_mask = ( + group_positivity_mask + if group_positivity_mask is not None + else np.zeros_like(central_values_array, dtype=bool) + ) # The inner while True 
loop is for ensuring a positive definite # pseudodata replica for _ in range(max_tries): mult_shifts = [] # Prepare the per-dataset multiplicative shifts - for mult_uncorr_errors, mult_corr_errors in nonspecial_mult: - # convert to from percent to fraction - mult_shift = ( - 1 + mult_uncorr_errors * rng.normal(size=mult_uncorr_errors.shape) / 100 - ).prod(axis=1) + if group_multiplicative_errors is not None: + for mult_uncorr_errors, mult_corr_errors in group_multiplicative_errors[ + "nonspecial_mult" + ]: + # convert to from percent to fraction + mult_shift = ( + 1 + mult_uncorr_errors * rng.normal(size=mult_uncorr_errors.shape) / 100 + ).prod(axis=1) - mult_shift *= ( - 1 + mult_corr_errors * rng.normal(size=(1, mult_corr_errors.shape[1])) / 100 - ).prod(axis=1) + mult_shift *= ( + 1 + mult_corr_errors * rng.normal(size=(1, mult_corr_errors.shape[1])) / 100 + ).prod(axis=1) - mult_shifts.append(mult_shift) + mult_shifts.append(mult_shift) # If sep_mult is true then the multiplicative shifts were not included in the covmat shifts = covmat_sqrt @ rng.normal(size=covmat.shape[1]) mult_part = 1.0 if sep_mult: + special_mult_errors = group_multiplicative_errors["special_mult"] special_mult = ( 1 + special_mult_errors * rng.normal(size=(1, special_mult_errors.shape[1])) / 100 ).prod(axis=1) mult_part = np.concatenate(mult_shifts, axis=0) * special_mult # Shifting pseudodata - shifted_pseudodata = (all_pseudodata + shifts) * mult_part + shifted_pseudodata = (central_values_array + shifts) * mult_part # positivity control - if np.all(shifted_pseudodata[full_mask] >= 0) or not resample_negative_pseudodata: + if np.all(shifted_pseudodata[full_mask] >= 0): return shifted_pseudodata - dfail = " ".join(i.setname for i in groups_dataset_inputs_loaded_cd_with_cuts) - log.error(f"Error generating replicas for the group: {dfail}") - raise ReplicaGenerationError(f"No valid replica found after {max_tries} attempts") + # Find which dataset index corresponds to the negative points, 
and print it out for debugging purposes + negative_mask = (shifted_pseudodata < 0) & full_mask + negative_indices = np.where(negative_mask)[0] + + raise ReplicaGenerationError( + f"No valid replica found after {max_tries} attempts. " + f"Negative global indices: {negative_indices.tolist()}" + ) + + +def central_values_array(groups_dataset_inputs_loaded_cd_with_cuts): + """Function that takes in a list of :py:class:`nnpdf_data.coredata.CommonData` + and returns the central values concatenated in a single array. + """ + central_values = [] + for cd in groups_dataset_inputs_loaded_cd_with_cuts: + central_values.append(cd.central_values.to_numpy()) + return np.concatenate(central_values, axis=0) + + +def group_multiplicative_errors(groups_dataset_inputs_loaded_cd_with_cuts, sep_mult=False): + """Function that takes in a list of :py:class:`nnpdf_data.coredata.CommonData` + and returns the multiplicative uncertainties contribution to the pseudodata replica. + """ + if not sep_mult: + return None + + nonspecial_mult = [] + special_mult = [] + special_mult_errors = [] + for cd in groups_dataset_inputs_loaded_cd_with_cuts: + if sep_mult: + mult_errors = cd.multiplicative_errors + mult_uncorr_errors = mult_errors.loc[:, mult_errors.columns == "UNCORR"].to_numpy() + mult_corr_errors = mult_errors.loc[:, mult_errors.columns == "CORR"].to_numpy() + nonspecial_mult.append((mult_uncorr_errors, mult_corr_errors)) + special_mult.append( + mult_errors.loc[:, ~mult_errors.columns.isin(INTRA_DATASET_SYS_NAME)] + ) + + # concatenating special multiplicative errors + if sep_mult: + special_mult_errors = pd.concat(special_mult, axis=0, sort=True).fillna(0).to_numpy() + + multiplicative_errors = { + "nonspecial_mult": nonspecial_mult, + "special_mult": special_mult_errors, + } + + return multiplicative_errors + + +def group_positivity_mask( + groups_dataset_inputs_loaded_cd_with_cuts, resample_negative_pseudodata=False +): + """Function that takes in a list of 
:py:class:`nnpdf_data.coredata.CommonData` + and returns a boolean mask indicating which data points should be positive. + """ + if not resample_negative_pseudodata: + return None + check_positive_masks = [] + for cd in groups_dataset_inputs_loaded_cd_with_cuts: + if "ASY" in cd.commondataproc or cd.commondataproc.endswith("_POL"): + check_positive_masks.append(np.zeros_like(cd.central_values.to_numpy(), dtype=bool)) + else: + check_positive_masks.append(np.ones_like(cd.central_values.to_numpy(), dtype=bool)) + full_mask = np.concatenate(check_positive_masks, axis=0) + return full_mask def indexed_make_replica(groups_index, make_replica): @@ -397,8 +432,11 @@ def make_level1_data(data, level0_commondata_wc, filterseed, data_index, sep_mul ) # ================== generation of Level1 data ======================# + central_vals= central_values_array(level0_commondata_wc) + group_mult_errs = group_multiplicative_errors(level0_commondata_wc, sep_mult=sep_mult) + group_pos_mask = group_positivity_mask(level0_commondata_wc) level1_data = make_replica( - level0_commondata_wc, filterseed, covmat, sep_mult=sep_mult, genrep=True + central_vals, filterseed, covmat, group_multiplicative_errors=group_mult_errs, group_positivity_mask=group_pos_mask, sep_mult=sep_mult, genrep=True, ) indexed_level1_data = indexed_make_replica(data_index, level1_data) diff --git a/validphys2/src/validphys/tests/conftest.py b/validphys2/src/validphys/tests/conftest.py index a25bfea89b..528786722b 100644 --- a/validphys2/src/validphys/tests/conftest.py +++ b/validphys2/src/validphys/tests/conftest.py @@ -107,15 +107,16 @@ def data_internal_cuts_config(data_config): @pytest.fixture(scope='module') -def data_internal_cuts_new_theory_config(data_internal_cuts_config): +def data_internal_cuts_closure_config(data_internal_cuts_config): + # Filterseed is not added so that it is changed by the tests config = dict(data_internal_cuts_config) - config["theoryid"] = THEORYID + config["fakepdf"] = PDF return 
config @pytest.fixture(scope='module') -def data_fromfit_cuts_config(data_internal_cuts_new_theory_config): - config = dict(data_internal_cuts_new_theory_config) +def data_fromfit_cuts_config(data_internal_cuts_config): + config = dict(data_internal_cuts_config) config.update(use_cuts="fromfit") return config diff --git a/validphys2/src/validphys/tests/test_datafiles.py b/validphys2/src/validphys/tests/test_datafiles.py index b24e0a15ad..6de728e3ea 100644 --- a/validphys2/src/validphys/tests/test_datafiles.py +++ b/validphys2/src/validphys/tests/test_datafiles.py @@ -1,8 +1,8 @@ """ - Test all datafiles +Test all datafiles - The checks in ``test_all_datasets`` are run for each dataset independently so that one gets one - failure per dataset in case of problems +The checks in ``test_all_datasets`` are run for each dataset independently so that one gets one +failure per dataset in case of problems """ import pytest @@ -24,7 +24,7 @@ def _load_main_and_variants(dataset_name): @pytest.mark.parametrize("dataset_name", all_datasets) -def test_all_datasets(dataset_name, data_internal_cuts_new_theory_config): +def test_all_datasets(dataset_name): """Checks that a dataset can be loaded (together with its variants), that the kinematics, uncertainties and data can be read. diff --git a/validphys2/src/validphys/tests/test_pseudodata.py b/validphys2/src/validphys/tests/test_pseudodata.py index 74568171ac..cbb1cc8b8d 100644 --- a/validphys2/src/validphys/tests/test_pseudodata.py +++ b/validphys2/src/validphys/tests/test_pseudodata.py @@ -8,6 +8,7 @@ recreation. """ +import numpy as np from numpy.testing import assert_allclose import pandas as pd import pytest @@ -115,3 +116,16 @@ def test_level0_commondata_wc(): assert_allclose( dataset_t0_predictions(t0dataset=datasetspec, t0set=t0set), l0_vals, rtol=1e-07, atol=0 ) + + +def test_level1_commondata(data_internal_cuts_closure_config): + """ + check whether level 1 commondata can be generated and that it is seed-dependent. 
+ """ + mod_data1 = API.make_level1_data(**data_internal_cuts_closure_config, filterseed=2) + mod_data2 = API.make_level1_data(**data_internal_cuts_closure_config, filterseed=42) + + arr1 = pd.concat(i.central_values for i in mod_data1) + arr2 = pd.concat(i.central_values for i in mod_data2) + + assert not np.allclose(arr1, arr2)