Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/fitbot.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ env:
POSTFIT_NREP: 15 # requested minimum replicas for postfit
# IMPORTANT
# WHEN CHANGING THE REFERENCE SET, THE NEW REFERENCE MUST BE MANUALLY UPLOADED TO THE SERVER
REFERENCE_SET: NNNBOT-1a81255f3-2026-01-27 # reference set for exact results
REFERENCE_SET: NNBOT-351efa9ef-2026-03-11 # reference set for exact results
STABLE_REFERENCE_SET: NNBOT-99108504e-2025-11-22 # reference set for last tag
PYTHONHASHSEED: "0"

Expand Down
6 changes: 3 additions & 3 deletions n3fit/src/n3fit/backends/keras_backend/callbacks.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,15 @@
from time import time

from keras import backend as K
from keras.callbacks import Callback, TensorBoard
from keras import callbacks
import numpy as np

from .operations import decorator_compiler

log = logging.getLogger(__name__)


class CallbackStep(Callback):
class CallbackStep(callbacks.Callback):
"""
Wrapper around the keras Callback that keeps track of how the steps are divided
between epochs and batches.
Expand Down Expand Up @@ -214,7 +214,7 @@ def gen_tensorboard_callback(log_dir, profiling=False, histogram_freq=0):
Whether or not to save profiling information (default False)
"""
profile_batch = 1 if profiling else 0
clb = TensorBoard(
clb = callbacks.TensorBoard(
log_dir=log_dir,
histogram_freq=histogram_freq,
write_graph=True,
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ packaging = "*"
psutil = "*"
tensorflow = "*"
keras = "^3.1"
tensorboard = "*" # keras requires tensorboard but some versions are missing the dependency
eko = "^0.15.1"
joblib = "*"
seaborn = "*"
Expand Down
28 changes: 28 additions & 0 deletions validphys2/src/validphys/n3fit_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,34 @@ def replica_luxseed(replica, luxseed):
return replica_nnseed(replica, luxseed)


def group_replica_mcseed(replica_mcseed, groups_dataset_inputs_loaded_cd_with_cuts, genrep=True):
    """Generate the ``mcseed`` for a group of datasets.

    The seed is obtained by hashing the names of the datasets in the group
    and adding the result to ``replica_mcseed``.

    Parameters
    ----------
    replica_mcseed: int
        Per-replica Monte Carlo seed to which the name-based salt is added.
    groups_dataset_inputs_loaded_cd_with_cuts: list[:py:class:`nnpdf_data.coredata.CommonData`]
        List of CommonData objects which stores information about systematic errors,
        their treatment and description, for each dataset.
    genrep: bool
        Whether pseudodata replicas are being generated; when ``False`` no seed
        is needed and ``None`` is returned.

    Returns
    -------
    int or None
        The salted seed, or ``None`` when ``genrep`` is ``False``.
    """
    if not genrep:
        return None
    # Try to use the new dataset name, but make older runs reproducible by keeping the old names.
    # WARNING: don't rely on this behaviour, this might be removed in future releases
    names_for_salt = []
    for loaded_cd in groups_dataset_inputs_loaded_cd_with_cuts:
        if loaded_cd.legacy_names is None:
            names_for_salt.append(loaded_cd.setname)
        else:
            names_for_salt.append(loaded_cd.legacy_names[0])
    name_salt = "-".join(names_for_salt)

    # Reduce the hash to 8 digits so the final seed stays in a manageable integer range
    name_seed = int(hashlib.sha256(name_salt.encode()).hexdigest(), 16) % 10**8
    res = name_seed + replica_mcseed
    return res


class _Masks(TupleComp):
"""Class holding the training validation mask for a group of datasets
If the same group of dataset receives the same trvlseed then the mask
Expand Down
186 changes: 112 additions & 74 deletions validphys2/src/validphys/pseudodata.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,16 +121,17 @@ def read_replica_pseudodata(fit, context_index, replica):


def make_replica(
groups_dataset_inputs_loaded_cd_with_cuts,
replica_mcseed,
central_values_array,
group_replica_mcseed,
dataset_inputs_sampling_covmat,
group_multiplicative_errors=None,
group_positivity_mask=None,
sep_mult=False,
genrep=True,
max_tries=int(1e6),
resample_negative_pseudodata=False,
):
"""Function that takes in a list of :py:class:`nnpdf_data.coredata.CommonData`
objects and returns a pseudodata replica accounting for
"""Function that takes in a central value array and a covariance matrix
and returns a pseudodata replica accounting for
possible correlations between systematic uncertainties.

The function loops until positive definite pseudodata is generated for any
Expand All @@ -139,17 +140,22 @@ def make_replica(

Parameters
---------
groups_dataset_inputs_loaded_cd_with_cuts: list[:py:class:`nnpdf_data.coredata.CommonData`]
List of CommonData objects which stores information about systematic errors,
their treatment and description, for each dataset.
central_values_array: np.array
Numpy array which is N_dat (where N_dat is the combined number of data points after cuts)
containing the central values of the data.

replica_mcseed: int, None
Seed used to initialise the numpy random number generator. If ``None`` then a random seed is
allocated using the default numpy behaviour.
group_replica_mcseed: int
Seed used to initialise the numpy random number generator.

dataset_inputs_sampling_covmat: np.array
Full covmat to be used. It can be either only experimental or also theoretical.

group_multiplicative_errors: dict
Dictionary containing the multiplicative uncertainties contribution to the pseudodata replica.

group_positivity_mask: np.array
Boolean array of shape (N_dat,) indicating which data points should be positive.

sep_mult: bool
Specifies whether computing the shifts with the full covmat
or whether multiplicative errors should be separated
Expand All @@ -162,9 +168,6 @@ def make_replica(
If after max_tries (default=1e6) no physical configuration is found,
it will raise a :py:class:`ReplicaGenerationError`

resample_negative_pseudodata: bool
When True, replicas that produce negative predictions will be resampled for ``max_tries``
until all points are positive (default: False)
Returns
-------
pseudodata: np.array
Expand All @@ -187,90 +190,122 @@ def make_replica(
0.34206012, 0.31866286, 0.2790856 , 0.33257621, 0.33680007,
"""
if not genrep:
return np.concatenate(
[cd.central_values for cd in groups_dataset_inputs_loaded_cd_with_cuts]
)
# Seed the numpy RNG with the seed and the name of the datasets in this run

# TODO: to be simplified after the reader is merged, together with an update of the regression tests
# this is necessary to reproduce exactly the results due to the replicas being generated with a hash
# Only when the sets are legacy (or coming from a legacy runcard) this shall be used
names_for_salt = []
for loaded_cd in groups_dataset_inputs_loaded_cd_with_cuts:
if loaded_cd.legacy_names is None:
names_for_salt.append(loaded_cd.setname)
else:
names_for_salt.append(loaded_cd.legacy_names[0])
name_salt = "-".join(names_for_salt)
return central_values_array

name_seed = int(hashlib.sha256(name_salt.encode()).hexdigest(), 16) % 10**8
rng = np.random.default_rng(seed=replica_mcseed + name_seed)
# Set random seed
rng = np.random.default_rng(seed=group_replica_mcseed)
# construct covmat
covmat = dataset_inputs_sampling_covmat
covmat_sqrt = sqrt_covmat(covmat)
# Loading the data
pseudodatas = []
check_positive_masks = []
nonspecial_mult = []
special_mult = []
for cd in groups_dataset_inputs_loaded_cd_with_cuts:
# copy here to avoid mutating the central values.
pseudodata = cd.central_values.to_numpy()

pseudodatas.append(pseudodata)
# Separation of multiplicative errors. If sep_mult is True also the exp_covmat is produced
# without multiplicative errors
if sep_mult:
mult_errors = cd.multiplicative_errors
mult_uncorr_errors = mult_errors.loc[:, mult_errors.columns == "UNCORR"].to_numpy()
mult_corr_errors = mult_errors.loc[:, mult_errors.columns == "CORR"].to_numpy()
nonspecial_mult.append((mult_uncorr_errors, mult_corr_errors))
special_mult.append(
mult_errors.loc[:, ~mult_errors.columns.isin(INTRA_DATASET_SYS_NAME)]
)
if "ASY" in cd.commondataproc or cd.commondataproc.endswith("_POL"):
check_positive_masks.append(np.zeros_like(pseudodata, dtype=bool))
else:
check_positive_masks.append(np.ones_like(pseudodata, dtype=bool))
# concatenating special multiplicative errors, pseudodatas and positive mask
if sep_mult:
special_mult_errors = pd.concat(special_mult, axis=0, sort=True).fillna(0).to_numpy()
all_pseudodata = np.concatenate(pseudodatas, axis=0)
full_mask = np.concatenate(check_positive_masks, axis=0)
full_mask = (
group_positivity_mask
if group_positivity_mask is not None
else np.zeros_like(central_values_array, dtype=bool)
)
# The inner while True loop is for ensuring a positive definite
# pseudodata replica
for _ in range(max_tries):
mult_shifts = []
# Prepare the per-dataset multiplicative shifts
for mult_uncorr_errors, mult_corr_errors in nonspecial_mult:
# convert to from percent to fraction
mult_shift = (
1 + mult_uncorr_errors * rng.normal(size=mult_uncorr_errors.shape) / 100
).prod(axis=1)
if group_multiplicative_errors is not None:
for mult_uncorr_errors, mult_corr_errors in group_multiplicative_errors[
"nonspecial_mult"
]:
# convert to from percent to fraction
mult_shift = (
1 + mult_uncorr_errors * rng.normal(size=mult_uncorr_errors.shape) / 100
).prod(axis=1)

mult_shift *= (
1 + mult_corr_errors * rng.normal(size=(1, mult_corr_errors.shape[1])) / 100
).prod(axis=1)
mult_shift *= (
1 + mult_corr_errors * rng.normal(size=(1, mult_corr_errors.shape[1])) / 100
).prod(axis=1)

mult_shifts.append(mult_shift)
mult_shifts.append(mult_shift)

# If sep_mult is true then the multiplicative shifts were not included in the covmat
shifts = covmat_sqrt @ rng.normal(size=covmat.shape[1])
mult_part = 1.0
if sep_mult:
special_mult_errors = group_multiplicative_errors["special_mult"]
special_mult = (
1 + special_mult_errors * rng.normal(size=(1, special_mult_errors.shape[1])) / 100
).prod(axis=1)
mult_part = np.concatenate(mult_shifts, axis=0) * special_mult
# Shifting pseudodata
shifted_pseudodata = (all_pseudodata + shifts) * mult_part
shifted_pseudodata = (central_values_array + shifts) * mult_part
# positivity control
if np.all(shifted_pseudodata[full_mask] >= 0) or not resample_negative_pseudodata:
if np.all(shifted_pseudodata[full_mask] >= 0):
return shifted_pseudodata

dfail = " ".join(i.setname for i in groups_dataset_inputs_loaded_cd_with_cuts)
log.error(f"Error generating replicas for the group: {dfail}")
raise ReplicaGenerationError(f"No valid replica found after {max_tries} attempts")
# Find which dataset index corresponds to the negative points, and print it out for debugging purposes
negative_mask = shifted_pseudodata < 0 & full_mask
negative_indices = np.where(negative_mask)[0]

raise ReplicaGenerationError(
f"No valid replica found after {max_tries} attempts. "
f"Negative global indices: {negative_indices.tolist()}"
)


def central_values_array(groups_dataset_inputs_loaded_cd_with_cuts):
    """Concatenate the central values of a list of
    :py:class:`nnpdf_data.coredata.CommonData` objects into a single 1-D array.
    """
    return np.concatenate(
        [cd.central_values.to_numpy() for cd in groups_dataset_inputs_loaded_cd_with_cuts],
        axis=0,
    )


def group_multiplicative_errors(groups_dataset_inputs_loaded_cd_with_cuts, sep_mult=False):
    """Function that takes in a list of :py:class:`nnpdf_data.coredata.CommonData`
    and returns the multiplicative uncertainties contribution to the pseudodata replica
    in the form of a dictionary separating the special and non-special correlations.

    Parameters
    ----------
    groups_dataset_inputs_loaded_cd_with_cuts: list[:py:class:`nnpdf_data.coredata.CommonData`]
        List of CommonData objects which stores information about systematic errors,
        their treatment and description, for each dataset.
    sep_mult: bool
        Whether multiplicative errors are to be treated separately from the
        covariance matrix; when ``False`` no separation is needed and ``None``
        is returned.

    Returns
    -------
    dict or None
        ``{"nonspecial_mult": list, "special_mult": np.ndarray}`` where
        ``nonspecial_mult`` holds per-dataset ``(UNCORR, CORR)`` error arrays
        and ``special_mult`` the concatenated remaining (special) errors.
    """
    if not sep_mult:
        return None

    nonspecial_mult = []
    special_mult = []
    for cd in groups_dataset_inputs_loaded_cd_with_cuts:
        mult_errors = cd.multiplicative_errors
        mult_uncorr_errors = mult_errors.loc[:, mult_errors.columns == "UNCORR"].to_numpy()
        mult_corr_errors = mult_errors.loc[:, mult_errors.columns == "CORR"].to_numpy()
        nonspecial_mult.append((mult_uncorr_errors, mult_corr_errors))
        # Columns not listed in INTRA_DATASET_SYS_NAME are correlated across
        # datasets ("special") and must be concatenated over the whole group
        special_mult.append(
            mult_errors.loc[:, ~mult_errors.columns.isin(INTRA_DATASET_SYS_NAME)]
        )

    # Concatenate the special multiplicative errors; columns missing for a given
    # dataset are filled with 0 so every dataset shares the same column layout
    special_mult_errors = pd.concat(special_mult, axis=0, sort=True).fillna(0).to_numpy()

    multiplicative_errors = {
        "nonspecial_mult": nonspecial_mult,
        "special_mult": special_mult_errors,
    }

    return multiplicative_errors


def group_positivity_mask(
    groups_dataset_inputs_loaded_cd_with_cuts, resample_negative_pseudodata=False
):
    """Function that takes in a list of :py:class:`nnpdf_data.coredata.CommonData`
    and returns a boolean mask indicating which data points should be positive.
    """
    if not resample_negative_pseudodata:
        return None
    per_dataset_masks = []
    for cd in groups_dataset_inputs_loaded_cd_with_cuts:
        process = cd.commondataproc
        # Asymmetry and polarised observables may legitimately be negative,
        # so their points are excluded from the positivity check
        exempt = "ASY" in process or process.endswith("_POL")
        values = cd.central_values.to_numpy()
        per_dataset_masks.append(np.full(values.shape, not exempt, dtype=bool))
    return np.concatenate(per_dataset_masks, axis=0)


def indexed_make_replica(groups_index, make_replica):
Expand Down Expand Up @@ -397,8 +432,11 @@ def make_level1_data(data, level0_commondata_wc, filterseed, data_index, sep_mul
)

# ================== generation of Level1 data ======================#
central_vals= central_values_array(level0_commondata_wc)
group_mult_errs = group_multiplicative_errors(level0_commondata_wc, sep_mult=sep_mult)
group_pos_mask = group_positivity_mask(level0_commondata_wc)
level1_data = make_replica(
level0_commondata_wc, filterseed, covmat, sep_mult=sep_mult, genrep=True
central_vals, filterseed, covmat, group_multiplicative_errors=group_mult_errs, group_positivity_mask=group_pos_mask, sep_mult=sep_mult, genrep=True,
)

indexed_level1_data = indexed_make_replica(data_index, level1_data)
Expand Down
9 changes: 5 additions & 4 deletions validphys2/src/validphys/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,15 +107,16 @@ def data_internal_cuts_config(data_config):


@pytest.fixture(scope='module')
def data_internal_cuts_new_theory_config(data_internal_cuts_config):
def data_internal_cuts_closure_config(data_internal_cuts_config):
# Filterseed is not added so that it is changed by the tests
config = dict(data_internal_cuts_config)
config["theoryid"] = THEORYID
config["fakepdf"] = PDF
return config


@pytest.fixture(scope='module')
def data_fromfit_cuts_config(data_internal_cuts_new_theory_config):
config = dict(data_internal_cuts_new_theory_config)
def data_fromfit_cuts_config(data_internal_cuts_config):
config = dict(data_internal_cuts_config)
config.update(use_cuts="fromfit")
return config

Expand Down
8 changes: 4 additions & 4 deletions validphys2/src/validphys/tests/test_datafiles.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
"""
Test all datafiles
Test all datafiles

The checks in ``test_all_datasets`` are run for each dataset independently so that one gets one
failure per dataset in case of problems
The checks in ``test_all_datasets`` are run for each dataset independently so that one gets one
failure per dataset in case of problems
"""

import pytest
Expand All @@ -24,7 +24,7 @@ def _load_main_and_variants(dataset_name):


@pytest.mark.parametrize("dataset_name", all_datasets)
def test_all_datasets(dataset_name, data_internal_cuts_new_theory_config):
def test_all_datasets(dataset_name):
"""Checks that a dataset can be loaded (together with its variants),
that the kinematics, uncertainties and data can be read.

Expand Down
Loading
Loading