Skip to content
Open
123 changes: 123 additions & 0 deletions nnpdf_data/nnpdf_data/commondata/CMS_2JET_8TEV/data.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
data_central:
- 1304.0
- 638.1
- 329.4
- 176.1
- 95.42
- 53.14
- 30.04
- 17.36
- 10.07
- 5.904
- 3.53
- 2.101
- 1.257
- 0.7631
- 0.4598
- 0.2762
- 0.1661
- 0.1004
- 0.06007
- 0.03592
- 0.02109
- 0.01254
- 0.007197
- 0.004058
- 0.002391
- 0.001419
- 0.0007634
- 0.0003934
- 0.0002155
- 0.0001198
- 4.445e-05
- 518.7
- 239.1
- 118.0
- 59.7
- 30.5
- 15.88
- 8.356
- 4.508
- 2.401
- 1.303
- 0.6989
- 0.3753
- 0.2016
- 0.1091
- 0.05782
- 0.03049
- 0.01591
- 0.008247
- 0.004217
- 0.002188
- 0.001044
- 0.0004743
- 0.000229
- 9.465e-05
- 4.074e-05
- 2.321e-05
- 39.11
- 15.49
- 5.749
- 2.689
- 1.138
- 0.4543
- 0.1875
- 0.08166
- 0.03022
- 0.009612
- 0.004567
- 0.00146
- 0.0004647
- 0.0001141
- 790.8
- 376.0
- 176.4
- 90.12
- 46.08
- 23.09
- 11.96
- 6.188
- 3.148
- 1.634
- 0.8323
- 0.4201
- 0.2114
- 0.1044
- 0.0504
- 0.02377
- 0.01105
- 0.005018
- 0.00204
- 0.00086
- 0.0003546
- 0.0001093
- 3.493e-05
- 188.4
- 79.95
- 36.65
- 16.05
- 7.067
- 3.108
- 1.352
- 0.593
- 0.2592
- 0.1068
- 0.04124
- 0.01593
- 0.005859
- 0.002166
- 0.0006671
- 0.0001653
- 4.066e-05
- 126.0
- 47.86
- 18.92
- 7.102
- 2.474
- 0.8486
- 0.2511
- 0.06703
- 0.01889
- 0.002968
- 0.0006449
250 changes: 250 additions & 0 deletions nnpdf_data/nnpdf_data/commondata/CMS_2JET_8TEV/filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,250 @@
import yaml
import numpy as np
import copy
from nnpdf_data.filter_utils.utils import cormat_to_covmat, covmat_to_artunc

def read_metadata() -> tuple:
'''
takes the important information from the metadata file
'''
with open('metadata.yaml', 'r') as file:
info = yaml.safe_load(file)
obs_info = info['implemented_observables'][0]
return obs_info['tables'], obs_info['ndata']


def read_kinematics_and_centrals(tables: list) -> tuple:
'''
using the table numbers from metadata,
reads the bins and central values from rawdata
return two lists: one for kinematics, one for central values
'''
bins = list()
centrals = list()
for table_num in tables[:6]: # select just the tables with kinematic data
with open(f'rawdata/table_{table_num}.yaml', 'r') as file:
kins = yaml.safe_load(file)
# get ystar, yboost, sqrts bins:
current_yy_bin = dict()
for yy_dict in kins['dependent_variables'][0]['qualifiers']:
for k, v in yy_dict.items():
if k == 'name':
if v != 'SQRT(S)':
bin_name = v.lower().replace('oost', '')
current_yy_bin[bin_name] = dict()
if k == 'value':
if type(v) == str:
lower = float(v[:3])
upper = float(v[-3:])
middle = (lower+upper)/2
current_yy_bin[bin_name] = {'min': lower, 'mid': middle, 'max': upper}
# get the ptavg bins and combine them with yys bins
for ptavg_dict in kins['independent_variables'][0]['values']:
lower = ptavg_dict['low']
upper = ptavg_dict['high']
middle = (lower + upper)/2
current_p_bin = {'pTavg': {'min': lower, 'mid': middle, 'max': upper}}
copy_yy = copy.deepcopy(current_yy_bin)
current_pyys_bin = copy_yy | current_p_bin
bins.append(current_pyys_bin)
# get the central values
for vals in kins['dependent_variables'][0]['values']:
centrals.append(vals['value'])
return bins, centrals


def dump_kinematics_and_centrals(bins: list, centrals: list) -> None:
# dump the dictionaries into files
with open('kinematics.yaml', 'w') as file:
yaml.safe_dump({'bins': bins}, file, sort_keys=False)
with open('data.yaml', 'w') as file:
yaml.safe_dump({'data_central': centrals}, file, sort_keys=False)


def read_kinematics_lengths(tables: list) -> list:
ndata = list()
for table_id in tables:
with open(f'rawdata/table_{table_id}.yaml', 'r') as file:
working_dict = yaml.safe_load(file)
ndata.append(len(working_dict['independent_variables'][0]['values']))
return ndata


def read_correlation_matrix(table_no: int) -> list:
corr_mat = list()
with open(f'rawdata/table_{table_no}.yaml', 'r') as file:
temp_mat = yaml.safe_load(file)['dependent_variables'][0]['values']
corr_mat = [small_dict['value'] for small_dict in temp_mat]
return corr_mat


def read_rel_errors(tables: list) -> list:
errors = list()
for table_num in tables:
temp_list = list()
with open(f'rawdata/table_{table_num}.yaml', 'r') as file:
temp_list = yaml.safe_load(file)['dependent_variables'][0]['values']
errors_in_table = list()
for data_point in temp_list:
errors_in_table.append(data_point['errors'])
errors += errors_in_table
return errors


def make_errors_absolute(errors: list, centrals: list):
if len(errors) != len(centrals):
print('lengths dont match')
else:
abs_errors = list()
for i in range(len(errors)):
c_val = centrals[i]
extracted_errors_at_dp = {small_dict['label']: c_val*float(small_dict['symerror'][:-1])/100 for small_dict in errors[i]}
abs_errors.append(extracted_errors_at_dp)
return abs_errors


def generate_stat_art_unc(abs_errors, kin_lengths):
stat_errors = [item['stat'] for item in abs_errors]
split_indices = [kin_lengths[0]]+[0]*(len(kin_lengths)-1)
for i in range(1, len(kin_lengths)):
split_indices[i] = split_indices[i-1]+kin_lengths[i]
split_indices = [0]+split_indices
art_unc = list()
for i in range(1, len(split_indices)):
current_errors = stat_errors[split_indices[i-1]:split_indices[i]]
current_corr_mat = read_correlation_matrix(i+6)
current_ndata = kin_lengths[i-1]
if not (len(current_errors) == current_ndata and len(current_corr_mat) == current_ndata**2):
print('lengths not matching:')
else:
current_cov_mat = cormat_to_covmat(err_list=current_errors, cormat_list=current_corr_mat)
current_art_unc = covmat_to_artunc(ndata = current_ndata, covmat_list = current_cov_mat)
big_art_unc = []
for small_row in current_art_unc:
big_art_unc.append([0]*split_indices[i-1]+small_row+[0]*(122-split_indices[i]))
art_unc += big_art_unc
return art_unc


def aggregate_uncertainties(abs_errors, art_unc):
all_uncertainties = []
for i in range(len(abs_errors)):
current_dict = abs_errors[i]
current_dict.pop('stat')
art_unc_list = art_unc[i]
art_unc_dict = {f'art_unc_{j+1}': art_unc_list[j] for j in range(len(art_unc_list))}
total_dict = current_dict | art_unc_dict
all_uncertainties.append(total_dict)
return all_uncertainties


def dump_uncertainties(all_unc):
singular_art_unc_desc = {'description': 'artificial uncertainty originating correlated statistical uncertainties',
'treatment': 'ADD',
'type': 'CORR'}
all_art_unc_desc = {f'art_unc_{i+1}': singular_art_unc_desc.copy() for i in range(122)}
other_unc = {
'uncor': {'description': 'stems from residual effects of small inefficiencies in the jet identification',
'treatment': 'ADD',
'type': 'UNCORR'},
'jererr': {'description': 'jet energy resolution',
'treatment': 'MULT',
'type': 'CORR'},
'lumi': {'description': 'luminosity uncertainty',
'treatment': 'MULT',
'type': 'CMSLUMI19P7'},
'nongaussiantails': {'description': 'non-Gaussian tails in detector response to jets',
'treatment': 'MULT',
'type': 'CORR'},
'AbsoluteScale': {'description': 'absolute jet energy scale calibration',
'treatment': 'MULT',
'type': 'CORR'},
'AbsoluteStat': {'description': 'statistical uncertainty of absolute JES',
'treatment': 'MULT',
'type': 'CORR'},
'AbsoluteMPFBias': {'description': 'bias in MPF response method',
'treatment': 'MULT',
'type': 'CORR'},
'Fragmentation': {'description': 'fragmentation uncertainty',
'treatment': 'MULT',
'type': 'CORR'},
'SinglePionECAL': {'description': 'e-calorimeter response to single pions',
'treatment': 'MULT',
'type': 'CORR'},
'SinglePionHCAL': {'description': 'h-calorimeter response to single pions',
'treatment': 'MULT',
'type': 'CORR'},
'FlavorQCD': {'description': 'jet flavour composition uncertainty',
'treatment': 'MULT',
'type': 'CORR'},
'RelativeJEREC1': {'description': 'JER relative uncertainty',
'treatment': 'MULT',
'type': 'CORR'},
'RelativeJEREC2': {'description': 'JER relative uncertainty',
'treatment': 'MULT',
'type': 'CORR'},
'RelativeJERHF': {'description': 'JER relative uncertainty',
'treatment': 'MULT',
'type': 'CORR'},
'RelativePtBB': {'description': 'Relative JES vs pT',
'treatment': 'MULT',
'type': 'CORR'},
'RelativePtEC1': {'description': 'Relative JES vs pT',
'treatment': 'MULT',
'type': 'CORR'},
'RelativePtEC2': {'description': 'Relative JES vs pT',
'treatment': 'MULT',
'type': 'CORR'},
'RelativePtHF': {'description': 'Relative JES vs pT',
'treatment': 'MULT',
'type': 'CORR'},
'RelativeFSR': {'description': 'Final-state radiation modeling',
'treatment': 'MULT',
'type': 'CORR'},
'RelativeStatEC2': {'description': 'Relative JES statistical uncertainty',
'treatment': 'MULT',
'type': 'CORR'},
'RelativeStatHF': {'description': 'Relative JES statistical uncertainty',
'treatment': 'MULT',
'type': 'CORR'},
'RelativeStatFSR': {'description': 'Relative JES statistical uncertainty',
'treatment': 'MULT',
'type': 'CORR'},
'PileUpDataMC': {'description': 'Data–MC pileup mismatch',
'treatment': 'MULT',
'type': 'CORR'},
'PileUpPtRef': {'description': 'Pileup pT reference uncertainty',
'treatment': 'MULT',
'type': 'CORR'},
'PileUpPtBB': {'description': 'Pileup pT uncertainty',
'treatment': 'MULT',
'type': 'CORR'},
'PileUpPtEC1': {'description': 'Pileup pT uncertainty',
'treatment': 'MULT',
'type': 'CORR'},
'PileUpPtEC2': {'description': 'Pileup pT uncertainty',
'treatment': 'MULT',
'type': 'CORR'},
'PileUpPtHF': {'description': 'Pileup pT uncertainty',
'treatment': 'MULT',
'type': 'CORR'}
}
definitions = {'definitions': other_unc | all_art_unc_desc}
uncertainties_yaml = definitions | {'bins': all_unc}
with open('uncertainties.yaml', 'w') as file:
yaml.safe_dump(uncertainties_yaml, file, sort_keys = False)

def main_filter():
tables = read_metadata()[0][:6]
bins, centrals = read_kinematics_and_centrals(tables)
kin_lengths = read_kinematics_lengths(tables)
errors = read_rel_errors([1,2,3,4,5,6])
abs_errors = make_errors_absolute(errors, centrals)
art_unc = generate_stat_art_unc(abs_errors, kin_lengths)
all_unc = aggregate_uncertainties(abs_errors, art_unc)

dump_kinematics_and_centrals(bins, centrals)
dump_uncertainties(all_unc)

if __name__ == '__main__':
main_filter()
Loading
Loading