Skip to content

Commit 626c4ca

Browse files
committed
add filter script, metadata, rawdata ...
1 parent 41e6eec commit 626c4ca

File tree

8 files changed

+3102
-0
lines changed

8 files changed

+3102
-0
lines changed
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
"""Count the number of datapoints in each rawdata table."""

import glob
import os

import yaml

# Directory holding the downloaded HepData tables.
directory = "./rawdata"

# Glob pattern for the files
pattern = os.path.join(directory, "HEPData-ins1729240-v1-Table_*.yaml")

total_count = 0

# sorted() makes the per-file report deterministic (glob order is not).
for filepath in sorted(glob.glob(pattern)):
    with open(filepath, 'r') as f:
        data = yaml.safe_load(f)

    try:
        # TypeError covers an empty/blank file, where safe_load returns None.
        values = data["independent_variables"][0]["values"]
    except (KeyError, IndexError, TypeError):
        print(f"Warning: Skipping {filepath}, unexpected structure")
    else:
        count = len(values)
        total_count += count
        print(f"{os.path.basename(filepath)}: {count} values")

print(f"\nTotal number of values across all files: {total_count}")
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# Central data values, one entry per kinematic bin.
# NOTE(review): values appear to be cross sections converted pb -> fb
# (scaled by 1000) by the accompanying filter script — confirm against
# the HepData source tables.
data_central:
- 630000
- 635000
- 642000
- 638000
- 643000
- 655000
- 661000
- 662000
- 662000
- 658000
- 642000
- 494000
- 490000
- 486000
- 474000
- 465000
- 456000
- 440000
- 427000
- 410000
- 389000
- 368000
Lines changed: 314 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,314 @@
1+
"""
2+
This file contains the piece of code needed to implement the ATLAS Measurement
3+
of the cross-section and charge asymmetry of W bosons at 8TeV.
4+
Systematic uncertainties are implemented starting from the breakdown
5+
available on HepData.
6+
"""
7+
8+
import pathlib
9+
import numpy as np
10+
import pandas as pd
11+
import yaml
12+
13+
from nnpdf_data.filter_utils.utils import prettify_float
14+
15+
yaml.add_representer(float, prettify_float)
16+
17+
from nnpdf_data.filter_utils.utils import symmetrize_errors as se
18+
19+
20+
def load_yaml(table_id: int, version: int = 1) -> dict:
    """Load the HEP data table in yaml format.

    Parameters
    ----------
    table_id: int
        table ID number
    version: int
        version number

    Returns
    -------
    dict:
        dictionary containing the table contents
    """
    filename = f"HEPData-ins1729240-v{version}-Table_{table_id}"
    # Bug fix: the path previously ignored `filename` and pointed at a
    # non-existent hard-coded file, so no table could ever be loaded.
    table = pathlib.Path(f"./rawdata/{filename}.yaml")
    return yaml.safe_load(table.read_text())
38+
39+
40+
def get_kinematics(hepdata: dict) -> list:
    """
    Returns the kinematics in the form of a list of dictionaries.

    Parameters
    ----------
    hepdata: dict
        HepData yaml content loaded as dictionary

    Returns
    -------
    list:
        List of kinematic dictionaries, one per |eta| bin, each carrying
        the bin midpoint together with the fixed m_W^2 and sqrt(s) values
    """
    eta_bins = hepdata["independent_variables"][0]['values']

    # Build one kinematic dictionary per bin; only |eta| varies bin-by-bin.
    return [
        {
            'abs_eta': {
                'min': None,
                'mid': 0.5 * (edges['low'] + edges['high']),
                'max': None,
            },
            'm_W2': {'min': None, 'mid': 6463.838404, 'max': None},
            'sqrts': {'min': None, 'mid': 8000.0, 'max': None},
        }
        for edges in eta_bins
    ]
69+
70+
71+
def get_data_values(hepdata: dict) -> list:
    """
    Returns the central data values in the form of a list.

    Parameters
    ----------
    hepdata: dict
        HepData yaml content loaded as dictionary

    Returns
    -------
    list:
        List of central data values
    """
    # Extract each central value, converting units (pb to fb).
    return [entry['value'] * 1000 for entry in hepdata['dependent_variables'][0]['values']]
93+
94+
95+
def get_errors(hepdata: dict) -> tuple:
    """
    Extract the uncertainties from hepdata and compute the shift of the central value
    in case of asymmetric uncertainties.

    Parameters
    ----------
    hepdata: dict
        HepData yaml file loaded as dictionary

    Returns
    -------
    tuple:
        (central_values, df_errors) where central_values is a list of shifted
        central values in fb and df_errors is a pandas DataFrame with one row
        per bin ("bin i") and one column per uncertainty source
    """
    central_values = []
    df_errors = pd.DataFrame()

    # `bin_entry` avoids shadowing the builtin `bin`.
    for i, bin_entry in enumerate(hepdata["dependent_variables"][0]["values"]):
        error_sources = []
        shift_cv = 0
        error_names = []

        for source in bin_entry["errors"]:
            error_names.append(source["label"])

            if "asymerror" in source:
                # Symmetrize the asymmetric error; the symmetrization also
                # induces a shift of the central value.
                delta_min = source["asymerror"]["minus"]
                delta_plus = source["asymerror"]["plus"]
                se_delta, se_sigma = se(delta_plus, delta_min)
                error_sources.append(se_sigma)
                shift_cv += se_delta
            elif "symerror" in source:
                se_sigma = source["symerror"]
                error_sources.append(se_sigma)

        df_bin = pd.DataFrame([error_sources], columns=error_names, index=[f"bin {i}"])
        df_errors = pd.concat([df_errors, df_bin])
        cv_i = bin_entry["value"] + shift_cv
        central_values.append(cv_i * 1000)  # Convert to fb

    return central_values, df_errors
137+
138+
139+
def format_uncertainties(uncs: dict) -> list:
    """Format the uncertainties to be dumped into the yaml file.

    Parameters
    ----------
    uncs: dict
        Dictionary containing the various sources of uncertainties

    Returns
    -------
    list:
        List of dictionaries whose elements are the various errors
    """
    formatted = []
    has_stat = "statistics" in uncs
    sys_df = uncs["systematics"]
    # Only rows labelled "bin ..." hold per-bin values; any other rows carry
    # metadata and are excluded from the bin count.
    n_bins = sys_df.index.str.startswith("bin").sum()

    for bin_idx in range(n_bins):
        label = f"bin {bin_idx}"
        entry = {}
        if has_stat:
            entry["stat"] = float(uncs["statistics"].loc[label].values.item())

        for col_idx, sys_value in enumerate(sys_df.loc[label].values, start=1):
            entry[f"sys_corr_{col_idx}"] = float(sys_value)

        formatted.append(entry)

    return formatted
166+
167+
168+
def dump_commondata(kinematics: list, data: np.ndarray, errors: dict) -> None:
    """Function that generates and writes the commondata files.

    Writes data.yaml, kinematics.yaml and uncertainties.yaml into the
    current working directory.

    Parameters
    ----------
    kinematics: list
        List containing the kinematic values
    data: np.ndarray
        Array containing the central values
    errors: dict
        Dictionary containing the different errors
    """
    # Describe the statistical error first (when present), then one entry
    # per systematic column; the "treatment"/"type" metadata rows of the
    # DataFrames provide the corresponding fields.
    error_definition = {}
    if "statistics" in errors:
        stat_df = errors["statistics"]
        error_definition["stat"] = {
            "description": "Uncorrelated statistical uncertainties",
            "treatment": stat_df.loc["treatment"].iloc[0],
            "type": stat_df.loc["type"].iloc[0],
        }

    sys_df = errors["systematics"]
    for idx in range(sys_df.shape[1]):
        error_definition[f"sys_corr_{idx + 1}"] = {
            "description": sys_df.columns[idx],
            "treatment": sys_df.loc["treatment"].iloc[idx],
            "type": sys_df.loc["type"].iloc[idx],
        }

    errors_formatted = format_uncertainties(errors)

    with open("data.yaml", "w") as file:
        yaml.dump({"data_central": data.tolist()}, file, sort_keys=False)

    with open("kinematics.yaml", "w") as file:
        yaml.dump({"bins": kinematics}, file, sort_keys=False)

    with open("uncertainties.yaml", "w") as file:
        yaml.dump(
            {"definitions": error_definition, "bins": errors_formatted},
            file,
            sort_keys=False
        )
213+
214+
215+
def main_filter() -> None:
    """
    Main function that reads the HepData yaml files and generates the commondata files.
    Combines both W+ and W- data into single files.

    Reads Table 5a (W+) and Table 5b (W-) from ./rawdata, symmetrizes the
    uncertainties, tags every source with its treatment and correlation
    type, and writes data.yaml, kinematics.yaml and uncertainties.yaml.
    """
    # Load both W+ and W- data files
    hepdata_file_wplus = "rawdata/HEPData-ins1729240-v1-Table_5a.yaml"
    hepdata_file_wminus = "rawdata/HEPData-ins1729240-v1-Table_5b.yaml"

    with open(hepdata_file_wplus, 'r') as file:
        yaml_content_wplus = yaml.safe_load(file)

    with open(hepdata_file_wminus, 'r') as file:
        yaml_content_wminus = yaml.safe_load(file)

    # Process W+ data
    kinematics_wplus = get_kinematics(yaml_content_wplus)
    central_values_wplus, uncertainties_wplus = get_errors(yaml_content_wplus)

    # Process W- data
    kinematics_wminus = get_kinematics(yaml_content_wminus)
    central_values_wminus, uncertainties_wminus = get_errors(yaml_content_wminus)

    # Combine W+ and W- data
    kinematics_all = kinematics_wplus + kinematics_wminus
    central_values_all = np.concatenate([central_values_wplus, central_values_wminus])

    n_wplus = len(central_values_wplus)
    n_wminus = len(central_values_wminus)

    # Re-index so the W- bins continue the W+ numbering before concatenating
    uncertainties_wplus.index = [f"bin {i}" for i in range(n_wplus)]
    uncertainties_wminus.index = [f"bin {i}" for i in range(n_wplus, n_wplus + n_wminus)]

    uncertainties_all = pd.concat([uncertainties_wplus, uncertainties_wminus])

    # First column is the statistical error: additive and uncorrelated.
    # All remaining sources are treated as multiplicative systematics.
    treatment_list = ["ADD"]
    type_list = ["UNCORR"]

    # Correlation type: every systematic is CORR except the MC statistical
    # components, which are UNCORR.  The original if/elif chain appended
    # "CORR" in every branch (including the final else) except the
    # `"MC stat" in col` one, so it collapses to this single condition.
    # The prefix guard preserves the original branch precedence, where a
    # handful of prefixed labels resolved to CORR before "MC stat" was
    # ever tested.
    corr_first_prefixes = ("sys,MJ", "sys,Muon", "sys,MET", "sys,Soft", "sys,PDF")
    for col in uncertainties_all.columns[1:]:
        treatment_list.append("MULT")
        if "MC stat" in col and not col.startswith(corr_first_prefixes):
            type_list.append("UNCORR")
        else:
            type_list.append("CORR")

    sys_types = {
        "treatment": treatment_list,
        "type": type_list,
    }

    sys_types_df = pd.DataFrame(sys_types, index=uncertainties_all.columns).T
    df_errors = pd.concat([sys_types_df, uncertainties_all])

    # Split into statistics (first column) and systematics (the rest)
    errors = {
        "statistics": df_errors.iloc[:, [0]],
        "systematics": df_errors.iloc[:, 1:]
    }

    # Dump the commondata files
    dump_commondata(kinematics_all, central_values_all, errors)

    print("Generated data.yaml, kinematics.yaml, and uncertainties.yaml")
    print(f"Total bins: {len(central_values_all)} (W+: {n_wplus}, W-: {n_wminus})")


if __name__ == "__main__":
    # Process combined W+ and W- data
    main_filter()

0 commit comments

Comments
 (0)