Skip to content

Commit 527329a

Browse files
authored
Prepare for higher dimensional observables (alisw#969)
* Establish lntheta-lnkt plots for D0 * Generalize for higher dimensions * Establish response matrices for higher dimensions * Generalize unfolding to higher dimensions * Add support for array columns * Generalise building of response matrix * Fix pylint * Generalize feeddown and mctruth to higher dimensions * Add additional observables * Fix pylint
1 parent d48c2d6 commit 527329a

File tree

5 files changed

+220
-139
lines changed

5 files changed

+220
-139
lines changed

machine_learning_hep/analysis/analyzer_jets.py

Lines changed: 30 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -77,8 +77,7 @@ def __init__(self, datap, case, typean, period):
7777

7878
self.observables = {
7979
'qa': ['zg', 'rg', 'nsd', 'zpar', 'dr', 'lntheta', 'lnkt', 'lntheta-lnkt'],
80-
'all': [var for var, spec in self.cfg('observables', {}).items()
81-
if '-' not in var and 'arraycols' not in spec],
80+
'all': [*self.cfg('observables', {})],
8281
}
8382

8483
self.bins_candpt = np.asarray(self.cfg('sel_an_binmin', []) + self.cfg('sel_an_binmax', [])[-1:], 'd')
@@ -780,7 +779,7 @@ def _analyze(self, method = 'sidesub'):
780779
self.logger.info("Signal extraction (method %s): obs. %s, %s, ipt %d",
781780
method, var, mcordata, ipt)
782781
if not self.cfg('hfjet', True):
783-
h = project_hist(h_in, axes_proj[1:], {})
782+
h = project_hist(h_in, list(range(1, get_dim(h_in))), {})
784783
elif method == 'sidesub':
785784
h = self._subtract_sideband(h_in, var, mcordata, ipt)
786785
elif method == 'sigextr':
@@ -789,8 +788,10 @@ def _analyze(self, method = 'sidesub'):
789788
self.logger.critical('invalid method %s', method)
790789
self._save_hist(h, f'h_ptjet{label}_{method}_noeff_{mcordata}_pt{ipt}.png')
791790
if mcordata == 'mc':
792-
h_proj = project_hist(h_in, axes_proj[1:], {})
793-
h_proj_lim = project_hist(h_in, axes_proj[1:], {0: (1, get_nbins(h_in, 0))})
791+
self.logger.info('projecting %s onto axes: %s', h_in, axes_proj[1:])
792+
h_proj = project_hist(h_in, list(range(1, get_dim(h_in))), {})
793+
h_proj_lim = project_hist(h_in, list(range(1, get_dim(h_in))),
794+
{0: (1, get_nbins(h_in, 0))})
794795
self._save_hist(h_proj, f'h_ptjet{label}_proj_noeff_{mcordata}_pt{ipt}.png')
795796
if h and h_proj:
796797
self.logger.debug('signal loss %s-%i: %g, fraction in under-/overflow: %g',
@@ -831,7 +832,7 @@ def _analyze(self, method = 'sidesub'):
831832
self._clip_neg(fh_sum_fdsub)
832833
self._save_hist(fh_sum_fdsub, f'h_ptjet{label}_{method}_{mcordata}.png')
833834

834-
if get_dim(fh_sum) > 1:
835+
if get_dim(fh_sum) == 2:
835836
axes = list(range(get_dim(fh_sum)))
836837
axis_ptjet = get_axis(fh_sum, 0)
837838
for iptjet in range(get_nbins(fh_sum, 0)):
@@ -861,8 +862,7 @@ def _analyze(self, method = 'sidesub'):
861862
continue
862863
axis_ptjet = get_axis(fh_sum_fdsub, 0)
863864
for j in range(get_nbins(fh_sum_fdsub, 0)):
864-
# TODO: generalize to higher dimensions
865-
hproj = project_hist(fh_sum_fdsub, [1], {0: [j+1, j+1]})
865+
hproj = project_hist(fh_sum_fdsub, list(range(1, get_dim(fh_sum_fdsub))), {0: [j+1, j+1]})
866866
range_ptjet = get_bin_limits(axis_ptjet, j + 1)
867867
self._save_hist(
868868
hproj, f'uf/h_{var}_{method}_{mcordata}_{string_range_ptjet(range_ptjet)}.png')
@@ -875,7 +875,7 @@ def _analyze(self, method = 'sidesub'):
875875
range_ptjet = get_bin_limits(axis_ptjet, j + 1)
876876
c = TCanvas()
877877
for i, h in enumerate(fh_unfolded):
878-
hproj = project_hist(h, [1], {0: (j+1, j+1)})
878+
hproj = project_hist(h, list(range(1, get_dim(h))), {0: (j+1, j+1)})
879879
empty = hproj.Integral() < 1.e-7
880880
if empty and i == 0:
881881
self.logger.error("Projection %s %s %s is empty.", var, mcordata,
@@ -889,7 +889,7 @@ def _analyze(self, method = 'sidesub'):
889889
self._save_hist(
890890
hproj,
891891
f'uf/h_{var}_{method}_unfolded_{mcordata}_' +
892-
f'{string_range_ptjet(range_ptjet)}_sel.png')
892+
f'{string_range_ptjet(range_ptjet)}_sel.png', "colz")
893893
# Save also the self-normalised version.
894894
if not empty:
895895
hproj_sel = hproj.Clone(f"{hproj.GetName()}_selfnorm")
@@ -977,6 +977,7 @@ def estimate_feeddown(self):
977977
df = pd.read_parquet(self.cfg('fd_parquet'))
978978
col_mapping = {'dr': 'delta_r_jet', 'zpar': 'z'} # TODO: check mapping
979979

980+
# TODO: generalize to higher dimensions
980981
for var in self.observables['all']:
981982
bins_ptjet = np.asarray(self.cfg('bins_ptjet'), 'd')
982983
# TODO: generalize or derive from histogram?
@@ -998,6 +999,7 @@ def estimate_feeddown(self):
998999
if f'{colname}' not in df:
9991000
if var is not None:
10001001
self.logger.error('No feeddown information for %s (%s), cannot estimate feeddown', var, colname)
1002+
print(df.info(), flush=True)
10011003
continue
10021004

10031005
# TODO: derive histogram
@@ -1028,6 +1030,10 @@ def estimate_feeddown(self):
10281030
rfile.Get(f'h_effkine_fd_det_nocuts_{var}'),
10291031
rfile.Get(f'h_effkine_fd_det_cut_{var}'))
10301032
h_response = rfile.Get(f'h_response_fd_{var}')
1033+
if not h_response:
1034+
self.logger.error("Could not find response matrix for fd estimation of %s", var)
1035+
rfile.ls()
1036+
continue
10311037
h_response_norm = norm_response(h_response, 3)
10321038
h3_fd_gen.Multiply(h_effkine_gen)
10331039
self._save_hist(project_hist(h3_fd_gen, [0, 2], {}), f'fd/h_ptjet-{var}_fdnew_gen_genkine.png')
@@ -1124,27 +1130,26 @@ def _build_effkine(self, h_nocuts, h_cuts):
11241130

11251131

11261132
def _build_response_matrix(self, h_response, h_eff = None, frac_flat = 0.):
1133+
dim = (get_dim(h_response) - 1) // 2
1134+
self.logger.info("Building %i-dim response matrix from %s", dim, h_response)
11271135
rm = ROOT.RooUnfoldResponse(
1128-
project_hist(h_response, [0, 1], {}), project_hist(h_response, [2, 3], {}))
1129-
h_gen = project_hist(h_response, [2, 3], {})
1130-
for hbin in itertools.product(
1131-
enumerate(list(get_axis(h_response, 0).GetXbins())[:-1], 1),
1132-
enumerate(list(get_axis(h_response, 1).GetXbins())[:-1], 1),
1133-
enumerate(list(get_axis(h_response, 2).GetXbins())[:-1], 1),
1134-
enumerate(list(get_axis(h_response, 3).GetXbins())[:-1], 1),
1135-
enumerate(list(get_axis(h_response, 4).GetXbins())[:-1], 1)):
1136+
project_hist(h_response, list(range(dim)), {}), project_hist(h_response, list(range(dim, 2 * dim)), {}))
1137+
h_gen = project_hist(h_response, list(range(dim, 2 * dim)), {})
1138+
1139+
x = (enumerate(list(get_axis(h_response, iaxis).GetXbins())[:-1], 1) for iaxis in range(2*dim+1))
1140+
for hbin in itertools.product(*x):
11361141
n = h_response.GetBinContent(
1137-
np.asarray([hbin[0][0], hbin[1][0], hbin[2][0], hbin[3][0], hbin[4][0]], 'i'))
1138-
eff = h_eff.GetBinContent(hbin[4][0]) if h_eff else 1.
1142+
np.asarray([hbin[i][0] for i in range(2*dim+1)], 'i'))
1143+
eff = h_eff.GetBinContent(hbin[2*dim][0]) if h_eff else 1.
11391144
if np.isclose(eff, 0.):
11401145
self.logger.error('efficiency 0 for %s', hbin[4])
11411146
continue
1142-
if (cnt_gen := h_gen.GetBinContent(hbin[2][0], hbin[3][0])) > 0.:
1147+
if (cnt_gen := h_gen.GetBinContent(*(hbin[i][0] for i in range(dim, 2*dim)))) > 0.:
11431148
fac = 1.
11441149
if frac_flat > 0.:
11451150
fac += frac_flat * (1. / cnt_gen - 1.)
11461151
for _ in range(int(n)):
1147-
rm.Fill(hbin[0][1], hbin[1][1], hbin[2][1], hbin[3][1], 1./eff * fac)
1152+
rm.Fill(*(hbin[iaxis][1] for iaxis in range(2*dim)), 1./eff * fac)
11481153
# rm.Mresponse().Print()
11491154
return rm
11501155

@@ -1165,7 +1170,7 @@ def _subtract_feeddown(self, hist, var, mcordata):
11651170

11661171
#region unfolding
11671172
def _unfold(self, hist, var, mcordata):
1168-
self.logger.debug('Unfolding for %s', var)
1173+
self.logger.info('Unfolding for %s', var)
11691174
suffix = '_frac' if mcordata == 'mc' else ''
11701175
with TFile(self.n_fileeff) as rfile:
11711176
h_response = rfile.Get(f'h_response_pr_{var}{suffix}')
@@ -1196,7 +1201,7 @@ def _unfold(self, hist, var, mcordata):
11961201
self._save_hist(h_effkine_gen, f'uf/h_effkine-ptjet-{var}_pr_gen_{mcordata}.png', 'text')
11971202

11981203
# TODO: move, has nothing to do with unfolding
1199-
if mcordata == 'mc':
1204+
if mcordata == 'mc' and get_dim(hist) <= 2:
12001205
h_mctruth_pr = rfile.Get(f'h_ptjet-pthf-{var}_pr_gen')
12011206
if h_mctruth_pr:
12021207
h_mctruth_pr = project_hist(h_mctruth_pr, [0, 2], {})
@@ -1219,7 +1224,7 @@ def _unfold(self, hist, var, mcordata):
12191224
self._save_hist(fh_unfolding_output, f'uf/h_ptjet-{var}_{mcordata}_unfoldeffcorr{n}.png', 'texte')
12201225
h_unfolding_output.append(fh_unfolding_output)
12211226

1222-
if mcordata == 'mc':
1227+
if mcordata == 'mc' and get_dim(hist) <= 2:
12231228
if h_mctruth_pr:
12241229
h_mcunfolded = fh_unfolding_output.Clone()
12251230
h_mcunfolded.Divide(h_mctruth_pr)

machine_learning_hep/data/data_run3/database_ml_parameters_D0Jet_pp.yml

Lines changed: 28 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -509,7 +509,7 @@ D0Jet_pp:
509509
multi:
510510
data:
511511
nprocessesparallel: 80
512-
maxfiles: [-1] #list of periods
512+
maxfiles: [1] #list of periods
513513
chunksizeunp: [100] #list of periods
514514
chunksizeskim: [100] #list of periods
515515
fracmerge: [.1] #list of periods
@@ -532,7 +532,7 @@ D0Jet_pp:
532532
mcreweights: [../Analyses] #list of periods
533533
mc:
534534
nprocessesparallel: 80
535-
maxfiles: [-1] #list of periods
535+
maxfiles: [1] #list of periods
536536
chunksizeunp: [100] #list of periods
537537
chunksizeskim: [1000] #list of periods
538538
fracmerge: [1.] #list of periods
@@ -721,17 +721,39 @@ D0Jet_pp:
721721
bins_det_fix: [10, 0., 1.]
722722
label: "#Delta#it{r}"
723723
lntheta:
724-
bins_gen_fix: [10, 0., 5.]
725-
bins_det_fix: [10, 0., 5.]
724+
bins_gen_fix: [8, 1., 5.]
725+
bins_det_fix: [8, 1., 5.]
726726
label: "#minusln(#it{#theta})"
727727
arraycols: [3]
728728
lnkt:
729-
bins_gen_fix: [10, -8., 2.]
730-
bins_det_fix: [10, -8., 2.]
729+
bins_gen_fix: [8, -4., 4.]
730+
bins_det_fix: [8, -4., 4.]
731731
label: "ln(#it{k}_{T}/(GeV/#it{c}))"
732732
arraycols: [3]
733733
lntheta-lnkt:
734734
arraycols: [3, 4]
735+
# new variables
736+
fEnergyMother:
737+
bins_gen_fix: [1, 0., 100.]
738+
bins_det_fix: [1, 0., 100.]
739+
arraycols: [3]
740+
# lntheta-lnkt-fEnergyMother:
741+
# arraycols: [3, 4, 5]
742+
fJetNConstituents:
743+
bins_gen_fix: [5, 0., 20.]
744+
bins_det_fix: [5, 0., 20.]
745+
zpar-fJetNConstituents: {}
746+
nsub21:
747+
# TODO: check for 1-track jets
748+
bins_gen_fix: [11, -1., 1.]
749+
bins_det_fix: [11, -1., 1.]
750+
eecweight:
751+
# TODO: adjust binning
752+
bins_gen_fix: [10, 0., 1.]
753+
bins_det_fix: [10, 0., 1.]
754+
arraycols: [3]
755+
fPairTheta-eecweight:
756+
arraycols: [3, 4]
735757

736758
data_selections:
737759
mcsig:

machine_learning_hep/processer.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,10 @@
2121
import re
2222
import sys
2323
import tempfile
24+
import traceback
2425
from copy import deepcopy
2526
from functools import reduce
27+
from typing import TypeVar
2628
from pandas.api.types import is_numeric_dtype
2729

2830
import numpy as np
@@ -294,7 +296,8 @@ def __init__(self, case, datap, run_param, mcordata, p_maxfiles, # pylint: disab
294296
# Flag if they should be used
295297
self.do_custom_analysis_cuts = datap["analysis"][self.typean].get("use_cuts", False)
296298

297-
def cfg(self, param, default = None):
299+
T = TypeVar("T")
300+
def cfg(self, param: str, default: T = None) -> T:
298301
return reduce(lambda d, key: d.get(key, default) if isinstance(d, dict) else default,
299302
param.split("."), self.datap['analysis'][self.typean])
300303

@@ -510,6 +513,7 @@ def applymodel(self, file_index):
510513
@staticmethod
511514
def callback(ex):
512515
get_logger().exception('Error callback: %s', ex)
516+
traceback.print_stack()
513517
raise ex
514518

515519
def parallelizer(self, function, argument_list, maxperchunk):

0 commit comments

Comments
 (0)