1+ """
This module implements the filter for the ATLAS measurement of the
cross-section and charge asymmetry of W bosons at 8 TeV.
Systematic uncertainties are implemented starting from the breakdown
available on HepData.
6+ """
7+
8+ import pathlib
9+ import numpy as np
10+ import pandas as pd
11+ import yaml
12+
from nnpdf_data.filter_utils.utils import prettify_float
from nnpdf_data.filter_utils.utils import symmetrize_errors as se

# Register a custom float representer so that floats are dumped with a
# prettified (rounded) representation in all generated yaml files.
yaml.add_representer(float, prettify_float)
18+
19+
def load_yaml(table_id: int, version: int = 1) -> dict:
    """Load the HEP data table in yaml format.

    Parameters
    ----------
    table_id: int
        table ID number
    version: int
        HEPData record version number

    Returns
    -------
    dict:
        dictionary containing the table contents
    """
    # Tables are expected under ./rawdata relative to the working directory.
    filename = f"HEPData-ins1729240-v{version}-Table_{table_id}"
    table = pathlib.Path(f"./rawdata/{filename}.yaml")
    return yaml.safe_load(table.read_text())
38+
39+
def get_kinematics(hepdata: dict) -> list:
    """
    Returns the kinematics in the form of a list of dictionaries.

    Parameters
    ----------
    hepdata: dict
        HepData yaml content loaded as dictionary

    Returns
    -------
    list:
        List of kinematic dictionaries, one per |eta| bin
    """
    return [
        {
            'abs_eta': {
                'min': None,
                # Midpoint of the pseudorapidity bin.
                'mid': 0.5 * (bin_edges['low'] + bin_edges['high']),
                'max': None,
            },
            # Fixed W-boson mass squared in GeV^2 (m_W = 80.398 GeV).
            'm_W2': {'min': None, 'mid': 6463.838404, 'max': None},
            # Centre-of-mass energy in GeV.
            'sqrts': {'min': None, 'mid': 8000.0, 'max': None},
        }
        for bin_edges in hepdata["independent_variables"][0]['values']
    ]
69+
70+
def get_data_values(hepdata: dict) -> list:
    """
    Returns the central data values in the form of a list.

    Parameters
    ----------
    hepdata: dict
        HepData yaml content loaded as dictionary

    Returns
    -------
    list:
        List of central data values
    """
    # Multiply by 1000 to convert the cross sections from pb to fb.
    entries = hepdata['dependent_variables'][0]['values']
    return [entry['value'] * 1000 for entry in entries]
93+
94+
def get_errors(hepdata: dict) -> tuple:
    """
    Extract the uncertainties from hepdata and compute the shift of the central value
    in case of asymmetric uncertainties.

    Parameters
    ----------
    hepdata: dict
        HepData yaml file loaded as dictionary

    Returns
    -------
    tuple:
        (central_values, df_errors) where central_values is the list of
        (possibly shifted) central values converted to fb, and df_errors is a
        pandas DataFrame with one row per bin and one column per error source
    """
    central_values = []
    df_errors = pd.DataFrame()

    # Note: `bin_data` avoids shadowing the builtin `bin`.
    for i, bin_data in enumerate(hepdata["dependent_variables"][0]["values"]):
        error_sources = []
        shift_cv = 0
        error_names = []

        for source in bin_data["errors"]:
            error_names.append(source["label"])

            if "asymerror" in source:
                # Symmetrize the asymmetric error; the symmetrization also
                # shifts the central value by se_delta.
                delta_min = source["asymerror"]["minus"]
                delta_plus = source["asymerror"]["plus"]
                se_delta, se_sigma = se(delta_plus, delta_min)
                error_sources.append(se_sigma)
                shift_cv += se_delta
            elif "symerror" in source:
                error_sources.append(source["symerror"])

        df_bin = pd.DataFrame([error_sources], columns=error_names, index=[f"bin {i}"])
        df_errors = pd.concat([df_errors, df_bin])
        cv_i = bin_data["value"] + shift_cv
        central_values.append(cv_i * 1000)  # Convert to fb

    return central_values, df_errors
137+
138+
def format_uncertainties(uncs: dict) -> list:
    """Format the uncertainties to be dumped into the yaml file.

    Parameters
    ----------
    uncs: dict
        Dictionary containing the various sources of uncertainties; values are
        DataFrames indexed by "bin <i>" labels (possibly preceded by metadata
        rows such as "treatment"/"type", which are skipped here)

    Returns
    -------
    list:
        List of dictionaries whose elements are the various errors, one
        dictionary per bin
    """
    combined_errors = []
    # Only rows labelled "bin <i>" are data rows; metadata rows are excluded.
    n_bins = uncs["systematics"].index.str.startswith("bin").sum()

    for i in range(n_bins):
        bin_label = f"bin {i}"
        errors = {}
        if "statistics" in uncs:
            errors["stat"] = float(uncs["statistics"].loc[bin_label].values.item())

        # Systematic sources are numbered starting from 1.
        for j, unc in enumerate(uncs["systematics"].loc[bin_label].values, start=1):
            errors[f"sys_corr_{j}"] = float(unc)

        combined_errors.append(errors)

    return combined_errors
166+
167+
def dump_commondata(kinematics: list, data: np.ndarray, errors: dict) -> None:
    """Function that generates and writes the commondata files.

    Parameters
    ----------
    kinematics: list
        List containing the kinematic values
    data: np.ndarray
        Array containing the central values
    errors: dict
        Dictionary containing the different errors
    """
    # Build the error definitions; the statistical entry comes first if present.
    error_definition = {}
    if "statistics" in errors:
        stat = errors["statistics"]
        error_definition["stat"] = {
            "description": "Uncorrelated statistical uncertainties",
            "treatment": stat.loc["treatment"].iloc[0],
            "type": stat.loc["type"].iloc[0],
        }

    systematics = errors["systematics"]
    for idx, column in enumerate(systematics.columns):
        error_definition[f"sys_corr_{idx + 1}"] = {
            "description": column,
            "treatment": systematics.loc["treatment"].iloc[idx],
            "type": systematics.loc["type"].iloc[idx],
        }

    errors_formatted = format_uncertainties(errors)

    # Write the three commondata files.
    with open("data.yaml", "w") as file:
        yaml.dump({"data_central": data.tolist()}, file, sort_keys=False)

    with open("kinematics.yaml", "w") as file:
        yaml.dump({"bins": kinematics}, file, sort_keys=False)

    with open("uncertainties.yaml", "w") as file:
        yaml.dump(
            {"definitions": error_definition, "bins": errors_formatted},
            file,
            sort_keys=False,
        )
213+
214+
def main_filter() -> None:
    """
    Main function that reads the HepData yaml files and generates the commondata files.
    Combines both W+ and W- data into single files.
    """
    # Load both W+ and W- data files (Tables 5a and 5b of the HEPData record).
    hepdata_file_wplus = "rawdata/HEPData-ins1729240-v1-Table_5a.yaml"
    hepdata_file_wminus = "rawdata/HEPData-ins1729240-v1-Table_5b.yaml"

    with open(hepdata_file_wplus, 'r') as file:
        yaml_content_wplus = yaml.safe_load(file)

    with open(hepdata_file_wminus, 'r') as file:
        yaml_content_wminus = yaml.safe_load(file)

    # Process W+ data
    kinematics_wplus = get_kinematics(yaml_content_wplus)
    central_values_wplus, uncertainties_wplus = get_errors(yaml_content_wplus)

    # Process W- data
    kinematics_wminus = get_kinematics(yaml_content_wminus)
    central_values_wminus, uncertainties_wminus = get_errors(yaml_content_wminus)

    # Combine W+ and W- data
    kinematics_all = kinematics_wplus + kinematics_wminus
    central_values_all = np.concatenate([central_values_wplus, central_values_wminus])

    n_wplus = len(central_values_wplus)
    n_wminus = len(central_values_wminus)

    # Re-index before concatenating so that the combined bins are labelled
    # consecutively ("bin 0" ... "bin n_wplus + n_wminus - 1").
    uncertainties_wplus.index = [f"bin {i}" for i in range(n_wplus)]
    uncertainties_wminus.index = [f"bin {i}" for i in range(n_wplus, n_wplus + n_wminus)]
    uncertainties_all = pd.concat([uncertainties_wplus, uncertainties_wminus])

    # First column is the statistical uncertainty (additive); every
    # systematic source is treated as multiplicative.
    n_columns = len(uncertainties_all.columns)
    treatment_list = ["ADD"] + ["MULT"] * (n_columns - 1)

    # All systematic sources are correlated across bins except the MC
    # statistical uncertainties, which are uncorrelated. (The original
    # per-source branches all mapped to CORR, so they collapse to this
    # single check; the prefix guard preserves the original precedence.)
    corr_prefixes = ("sys,Modelling", "sys,MJ", "sys,Muon", "sys,MET", "sys,Soft", "sys,PDF")
    type_list = ["UNCORR"]  # stat is uncorrelated
    for col in uncertainties_all.columns[1:]:
        if "MC stat" in col and not col.startswith(corr_prefixes):
            type_list.append("UNCORR")
        else:
            type_list.append("CORR")

    sys_types = {
        "treatment": treatment_list,
        "type": type_list,
    }

    # Prepend the treatment/type metadata rows to the per-bin uncertainties.
    sys_types_df = pd.DataFrame(sys_types, index=uncertainties_all.columns).T
    df_errors = pd.concat([sys_types_df, uncertainties_all])

    # Split into statistics (first column) and systematics (the rest).
    errors = {
        "statistics": df_errors.iloc[:, [0]],
        "systematics": df_errors.iloc[:, 1:],
    }

    # Dump the commondata files
    dump_commondata(kinematics_all, central_values_all, errors)

    print("Generated data.yaml, kinematics.yaml, and uncertainties.yaml")
    print(f"Total bins: {len(central_values_all)} (W+: {n_wplus}, W-: {n_wminus})")
310+
311+
if __name__ == "__main__" :
    # Entry point: build the combined W+/W- commondata files from the
    # HEPData tables under ./rawdata.
    main_filter ()
0 commit comments