@@ -121,16 +121,17 @@ def read_replica_pseudodata(fit, context_index, replica):
121121
122122
123123def make_replica (
124- groups_dataset_inputs_loaded_cd_with_cuts ,
125- replica_mcseed ,
124+ central_values_array ,
125+ group_replica_mcseed ,
126126 dataset_inputs_sampling_covmat ,
127+ group_multiplicative_errors = None ,
128+ group_positivity_mask = None ,
127129 sep_mult = False ,
128130 genrep = True ,
129131 max_tries = int (1e6 ),
130- resample_negative_pseudodata = False ,
131132):
132- """Function that takes in a list of :py:class:`nnpdf_data.coredata.CommonData`
133- objects and returns a pseudodata replica accounting for
133+ """Function that takes in a central value array and a covariance matrix
134+ and returns a pseudodata replica accounting for
134135 possible correlations between systematic uncertainties.
135136
136137 The function loops until positive definite pseudodata is generated for any
@@ -139,17 +140,22 @@ def make_replica(
139140
140141 Parameters
141142 ---------
142- groups_dataset_inputs_loaded_cd_with_cuts: list[:py:class:`nnpdf_data.coredata.CommonData`]
143- List of CommonData objects which stores information about systematic errors,
144- their treatment and description, for each dataset .
143+ central_values_array: np.array
144+ Numpy array which is N_dat (where N_dat is the combined number of data points after cuts)
145+ containing the central values of the data .
145146
146- replica_mcseed: int, None
147- Seed used to initialise the numpy random number generator. If ``None`` then a random seed is
148- allocated using the default numpy behaviour.
147+ group_replica_mcseed: int
148+ Seed used to initialise the numpy random number generator.
149149
150150 dataset_inputs_sampling_covmat: np.array
151151 Full covmat to be used. It can be either only experimental or also theoretical.
152152
153+ group_multiplicative_errors: dict
154+ Dictionary containing the multiplicative uncertainties contribution to the pseudodata replica.
155+
156+ group_positivity_mask: np.array
157+ Boolean array of shape (N_dat,) indicating which data points should be positive.
158+
153159 sep_mult: bool
154160 Specifies whether computing the shifts with the full covmat
155161 or whether multiplicative errors should be separated
@@ -162,9 +168,6 @@ def make_replica(
162168 If after max_tries (default=1e6) no physical configuration is found,
163169 it will raise a :py:class:`ReplicaGenerationError`
164170
165- resample_negative_pseudodata: bool
166- When True, replicas that produce negative predictions will be resampled for ``max_tries``
167- until all points are positive (default: False)
168171 Returns
169172 -------
170173 pseudodata: np.array
@@ -187,90 +190,122 @@ def make_replica(
187190 0.34206012, 0.31866286, 0.2790856 , 0.33257621, 0.33680007,
188191 """
189192 if not genrep :
190- return np .concatenate (
191- [cd .central_values for cd in groups_dataset_inputs_loaded_cd_with_cuts ]
192- )
193- # Seed the numpy RNG with the seed and the name of the datasets in this run
194-
195- # TODO: to be simplified after the reader is merged, together with an update of the regression tests
196- # this is necessary to reproduce exactly the results due to the replicas being generated with a hash
197- # Only when the sets are legacy (or coming from a legacy runcard) this shall be used
198- names_for_salt = []
199- for loaded_cd in groups_dataset_inputs_loaded_cd_with_cuts :
200- if loaded_cd .legacy_names is None :
201- names_for_salt .append (loaded_cd .setname )
202- else :
203- names_for_salt .append (loaded_cd .legacy_names [0 ])
204- name_salt = "-" .join (names_for_salt )
193+ return central_values_array
205194
206- name_seed = int ( hashlib . sha256 ( name_salt . encode ()). hexdigest (), 16 ) % 10 ** 8
207- rng = np .random .default_rng (seed = replica_mcseed + name_seed )
195+ # Set random seed
196+ rng = np .random .default_rng (seed = group_replica_mcseed )
208197 # construct covmat
209198 covmat = dataset_inputs_sampling_covmat
210199 covmat_sqrt = sqrt_covmat (covmat )
211- # Loading the data
212- pseudodatas = []
213- check_positive_masks = []
214- nonspecial_mult = []
215- special_mult = []
216- for cd in groups_dataset_inputs_loaded_cd_with_cuts :
217- # copy here to avoid mutating the central values.
218- pseudodata = cd .central_values .to_numpy ()
219200
220- pseudodatas .append (pseudodata )
221- # Separation of multiplicative errors. If sep_mult is True also the exp_covmat is produced
222- # without multiplicative errors
223- if sep_mult :
224- mult_errors = cd .multiplicative_errors
225- mult_uncorr_errors = mult_errors .loc [:, mult_errors .columns == "UNCORR" ].to_numpy ()
226- mult_corr_errors = mult_errors .loc [:, mult_errors .columns == "CORR" ].to_numpy ()
227- nonspecial_mult .append ((mult_uncorr_errors , mult_corr_errors ))
228- special_mult .append (
229- mult_errors .loc [:, ~ mult_errors .columns .isin (INTRA_DATASET_SYS_NAME )]
230- )
231- if "ASY" in cd .commondataproc or cd .commondataproc .endswith ("_POL" ):
232- check_positive_masks .append (np .zeros_like (pseudodata , dtype = bool ))
233- else :
234- check_positive_masks .append (np .ones_like (pseudodata , dtype = bool ))
235- # concatenating special multiplicative errors, pseudodatas and positive mask
236- if sep_mult :
237- special_mult_errors = pd .concat (special_mult , axis = 0 , sort = True ).fillna (0 ).to_numpy ()
238- all_pseudodata = np .concatenate (pseudodatas , axis = 0 )
239- full_mask = np .concatenate (check_positive_masks , axis = 0 )
201+ full_mask = (
202+ group_positivity_mask
203+ if group_positivity_mask is not None
204+ else np .zeros_like (central_values_array , dtype = bool )
205+ )
240206 # The inner while True loop is for ensuring a positive definite
241207 # pseudodata replica
242208 for _ in range (max_tries ):
243209 mult_shifts = []
244210 # Prepare the per-dataset multiplicative shifts
245- for mult_uncorr_errors , mult_corr_errors in nonspecial_mult :
246- # convert to from percent to fraction
247- mult_shift = (
248- 1 + mult_uncorr_errors * rng .normal (size = mult_uncorr_errors .shape ) / 100
249- ).prod (axis = 1 )
211+ if group_multiplicative_errors is not None :
212+ for mult_uncorr_errors , mult_corr_errors in group_multiplicative_errors [
213+ "nonspecial_mult"
214+ ]:
215+ # convert to from percent to fraction
216+ mult_shift = (
217+ 1 + mult_uncorr_errors * rng .normal (size = mult_uncorr_errors .shape ) / 100
218+ ).prod (axis = 1 )
250219
251- mult_shift *= (
252- 1 + mult_corr_errors * rng .normal (size = (1 , mult_corr_errors .shape [1 ])) / 100
253- ).prod (axis = 1 )
220+ mult_shift *= (
221+ 1 + mult_corr_errors * rng .normal (size = (1 , mult_corr_errors .shape [1 ])) / 100
222+ ).prod (axis = 1 )
254223
255- mult_shifts .append (mult_shift )
224+ mult_shifts .append (mult_shift )
256225
257226 # If sep_mult is true then the multiplicative shifts were not included in the covmat
258227 shifts = covmat_sqrt @ rng .normal (size = covmat .shape [1 ])
259228 mult_part = 1.0
260229 if sep_mult :
230+ special_mult_errors = group_multiplicative_errors ["special_mult" ]
261231 special_mult = (
262232 1 + special_mult_errors * rng .normal (size = (1 , special_mult_errors .shape [1 ])) / 100
263233 ).prod (axis = 1 )
264234 mult_part = np .concatenate (mult_shifts , axis = 0 ) * special_mult
265235 # Shifting pseudodata
266- shifted_pseudodata = (all_pseudodata + shifts ) * mult_part
236+ shifted_pseudodata = (central_values_array + shifts ) * mult_part
267237 # positivity control
268- if np .all (shifted_pseudodata [full_mask ] >= 0 ) or not resample_negative_pseudodata :
238+ if np .all (shifted_pseudodata [full_mask ] >= 0 ):
269239 return shifted_pseudodata
270240
271- dfail = " " .join (i .setname for i in groups_dataset_inputs_loaded_cd_with_cuts )
272- log .error (f"Error generating replicas for the group: { dfail } " )
273- raise ReplicaGenerationError (f"No valid replica found after { max_tries } attempts" )
241+ # Find which dataset index corresponds to the negative points, and print it out for debugging purposes
242+ negative_mask = shifted_pseudodata < 0 & full_mask
243+ negative_indices = np .where (negative_mask )[0 ]
244+
245+ raise ReplicaGenerationError (
246+ f"No valid replica found after { max_tries } attempts. "
247+ f"Negative global indices: { negative_indices .tolist ()} "
248+ )
249+
250+
251+ def central_values_array (groups_dataset_inputs_loaded_cd_with_cuts ):
252+ """Function that takes in a list of :py:class:`nnpdf_data.coredata.CommonData`
253+ and returns the central values concatenated in a single array.
254+ """
255+ central_values = []
256+ for cd in groups_dataset_inputs_loaded_cd_with_cuts :
257+ central_values .append (cd .central_values .to_numpy ())
258+ return np .concatenate (central_values , axis = 0 )
259+
260+
261+ def group_multiplicative_errors (groups_dataset_inputs_loaded_cd_with_cuts , sep_mult = False ):
262+ """Function that takes in a list of :py:class:`nnpdf_data.coredata.CommonData`
263+ and returns the multiplicative uncertainties contribution to the pseudodata replica.
264+ """
265+ if not sep_mult :
266+ return None
267+
268+ nonspecial_mult = []
269+ special_mult = []
270+ special_mult_errors = []
271+ for cd in groups_dataset_inputs_loaded_cd_with_cuts :
272+ if sep_mult :
273+ mult_errors = cd .multiplicative_errors
274+ mult_uncorr_errors = mult_errors .loc [:, mult_errors .columns == "UNCORR" ].to_numpy ()
275+ mult_corr_errors = mult_errors .loc [:, mult_errors .columns == "CORR" ].to_numpy ()
276+ nonspecial_mult .append ((mult_uncorr_errors , mult_corr_errors ))
277+ special_mult .append (
278+ mult_errors .loc [:, ~ mult_errors .columns .isin (INTRA_DATASET_SYS_NAME )]
279+ )
280+
281+ # concatenating special multiplicative errors
282+ if sep_mult :
283+ special_mult_errors = pd .concat (special_mult , axis = 0 , sort = True ).fillna (0 ).to_numpy ()
284+
285+ multiplicative_errors = {
286+ "nonspecial_mult" : nonspecial_mult ,
287+ "special_mult" : special_mult_errors ,
288+ }
289+
290+ return multiplicative_errors
291+
292+
293+ def group_positivity_mask (
294+ groups_dataset_inputs_loaded_cd_with_cuts , resample_negative_pseudodata = False
295+ ):
296+ """Function that takes in a list of :py:class:`nnpdf_data.coredata.CommonData`
297+ and returns a boolean mask indicating which data points should be positive.
298+ """
299+ if not resample_negative_pseudodata :
300+ return None
301+ check_positive_masks = []
302+ for cd in groups_dataset_inputs_loaded_cd_with_cuts :
303+ if "ASY" in cd .commondataproc or cd .commondataproc .endswith ("_POL" ):
304+ check_positive_masks .append (np .zeros_like (cd .central_values .to_numpy (), dtype = bool ))
305+ else :
306+ check_positive_masks .append (np .ones_like (cd .central_values .to_numpy (), dtype = bool ))
307+ full_mask = np .concatenate (check_positive_masks , axis = 0 )
308+ return full_mask
274309
275310
276311def indexed_make_replica (groups_index , make_replica ):
@@ -397,8 +432,11 @@ def make_level1_data(data, level0_commondata_wc, filterseed, data_index, sep_mul
397432 )
398433
399434 # ================== generation of Level1 data ======================#
435+ central_vals = central_values_array (level0_commondata_wc )
436+ group_mult_errs = group_multiplicative_errors (level0_commondata_wc , sep_mult = sep_mult )
437+ group_pos_mask = group_positivity_mask (level0_commondata_wc )
400438 level1_data = make_replica (
401- level0_commondata_wc , filterseed , covmat , sep_mult = sep_mult , genrep = True
439+ central_vals , filterseed , covmat , group_multiplicative_errors = group_mult_errs , group_positivity_mask = group_pos_mask , sep_mult = sep_mult , genrep = True ,
402440 )
403441
404442 indexed_level1_data = indexed_make_replica (data_index , level1_data )
0 commit comments