Skip to content

Commit 1a763c7

Browse files
committed
Update for inclusive jets
1 parent f5ad602 commit 1a763c7

File tree

4 files changed: +100 -68 lines changed

machine_learning_hep/data/data_run3/database_ml_parameters_Jet_pp.yml

Lines changed: 43 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -50,11 +50,12 @@ Jet_pp:
5050
#region dfs
5151
dfs:
5252
read:
53-
evtorig:
54-
index: fIndexCJetCO
55-
trees:
56-
O2cjetco: [fPosZ]
57-
filter: "abs(fPosZ) < 10."
53+
# evtorig:
54+
# index: fIndexCJetCO
55+
# level: data
56+
# trees:
57+
# O2cjetco: [fPosZ]
58+
# filter: "abs(fPosZ) < 10."
5859
collcnt:
5960
trees:
6061
O2collcount:
@@ -113,6 +114,9 @@ Jet_pp:
113114
fPairTheta,
114115
]
115116
filter: "abs(fJetEta) < (.9 - (fJetR / 100.))" # TODO: check jet eta cut
117+
extra:
118+
fPt: 5.
119+
fM: 1.86
116120

117121
colldet:
118122
level: det
@@ -154,6 +158,9 @@ Jet_pp:
154158
fPairTheta,
155159
]
156160
filter: "abs(fJetEta) < (.9 - (fJetR / 100.))" # TODO: check jet eta cut
161+
extra:
162+
fPt: 5.
163+
fM: 1.86
157164

158165
colldata:
159166
level: data
@@ -189,6 +196,9 @@ Jet_pp:
189196
fPairTheta,
190197
]
191198
filter: "abs(fJetEta) < (.9 - (fJetR / 100.))" # TODO: check jet eta cut
199+
extra:
200+
fPt: 5.
201+
fM: 1.86
192202

193203
merge:
194204
- { base: jetgen, ref: collgen }
@@ -205,16 +215,21 @@ Jet_pp:
205215
jetdata:
206216
level: data
207217
file: AnalysisResultsReco.parquet
208-
# evtorig:
209-
# level: all
210-
# file: AnalysisResultsEvtOrig.parquet
211-
# evt:
212-
# level: all
213-
# source: evtorig
218+
colldata:
219+
level: data
220+
file: AnalysisResultsEvtOrig.parquet
221+
collgen:
222+
level: mc
223+
file: AnalysisResultsEvtOrig.parquet
224+
# colldata_all:
225+
# level: data
214226
# file: AnalysisResultsEvt.parquet
215-
# collcnt:
216-
# level: all
217-
# file: AnalysisResultsCollCnt.parquet
227+
# collgen_all:
228+
# level: mc
229+
# file: AnalysisResultsEvt.parquet
230+
collcnt:
231+
level: all
232+
file: AnalysisResultsCollCnt.parquet
218233
# bccnt:
219234
# level: all
220235
# file: AnalysisResultsBcCnt.parquet
@@ -392,7 +407,7 @@ Jet_pp:
392407
namefile_reco: AnalysisResultsReco.parquet
393408
namefile_evt: AnalysisResultsEvt.parquet
394409
namefile_collcnt: AnalysisResultsCollCnt.parquet
395-
namefile_bccnt: AnalysisResultsBcCnt.parquet
410+
# namefile_bccnt: AnalysisResultsBcCnt.parquet
396411
namefile_evtvalroot: AnalysisResultsROOTEvtVal.root
397412
namefile_evtorig: AnalysisResultsEvtOrig.parquet
398413
namefile_gen: AnalysisResultsGen.parquet
@@ -528,11 +543,12 @@ Jet_pp:
528543
dir_general_plots: /data2/jklein/data/analysis_plots
529544

530545
jet_obs: &jet_default
546+
hfjet: false
531547
sel_an_binmin: [1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 16, 24] # hadron pt bins (sel_an_binmin bins)
532548
sel_an_binmax: [2, 3, 4, 5, 6, 7, 8, 10, 12, 16, 24, 48] # hadron pt bins (sel_an_binmin bins) # FIXME: move the last edge in sel_an_binmin
533549
bins_ptjet: [5, 7, 15, 30, 50, 70] # systematics, TODO: split rec and gen binning
534550
bins_ptjet_eff: [2, 5, 7, 15, 30, 50, 70, 90] # systematics, TODO: split rec and gen binning
535-
cand_collidx: fIndexHfD0CollBases
551+
# cand_collidx: fIndexHfD0CollBases
536552
counter_read_data: fReadCountsWithTVXAndZVertexAndSel8
537553
counter_read_mc: fReadCountsWithTVXAndZVertexAndSelMC
538554
counter_tvx: fReadCountsWithTVX
@@ -622,13 +638,13 @@ Jet_pp:
622638
lntheta-lnkt:
623639
arraycols: [3, 4]
624640

625-
data_selections:
626-
mcsig:
627-
level: mc
628-
query: "(isd0 & seld0) or (isd0bar & seld0bar)"
629-
mcrefl:
630-
level: mc
631-
query: "(isd0 & seld0bar) or (isd0bar & seld0)"
641+
# data_selections:
642+
# mcsig:
643+
# level: mc
644+
# query: "(isd0 & seld0) or (isd0bar & seld0bar)"
645+
# mcrefl:
646+
# level: mc
647+
# query: "(isd0 & seld0bar) or (isd0bar & seld0)"
632648

633649
corr_refl: true # systematics
634650
fit_levels: ["mcsig", "mcrefl", "mc", "data"]
@@ -945,10 +961,9 @@ Jet_pp:
945961
bin_width: 0.001 # bin width of the invariant mass histogram # systematics?
946962
n_rebin: 3 # number of mass bins to merge
947963
efficiency:
948-
extra_cols:
949-
["isd0", "isd0bar", "seld0", "seld0bar", "mlBkgScore"]
950-
filter_det: "(isd0 & seld0) or (isd0bar & seld0bar)"
951-
index_match: fIndexArrayD0CMCPJETOS_hf
964+
# extra_cols: ["isd0", "isd0bar", "seld0", "seld0bar"]
965+
# filter_det: "(isd0 & seld0) or (isd0bar & seld0bar)"
966+
index_match: fIndexArrayD0CMCPJETOS_geo
952967
correction_method: run3
953968

954969
unfolding_iterations: 8 # used, maximum iteration
@@ -1070,7 +1085,7 @@ Jet_pp:
10701085
variations_db: database_variations_Jet_pp_jet_obs.yml
10711086

10721087
# Additional cuts applied before mass histogram is filled
1073-
use_cuts: True # systematics
1088+
use_cuts: False # systematics
10741089
cuts: [
10751090
"mlBkgScore < 0.02",
10761091
"mlBkgScore < 0.02",

machine_learning_hep/processer.py

Lines changed: 9 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -111,8 +111,8 @@ def __init__(self, case, datap, run_param, mcordata, p_maxfiles, # pylint: disab
111111
self.n_reco = datap["files_names"]["namefile_reco"]
112112
self.n_evt = datap["files_names"]["namefile_evt"]
113113
self.n_collcnt = datap["files_names"]["namefile_collcnt"]
114-
self.n_bccnt = datap["files_names"]["namefile_bccnt"]
115-
self.n_evtorig = datap["files_names"]["namefile_evtorig"]
114+
self.n_bccnt = datap["files_names"].get("namefile_bccnt")
115+
self.n_evtorig = datap["files_names"].get("namefile_evtorig")
116116
self.n_evt_count_ml = datap["files_names"].get("namefile_evt_count", "evtcount.yaml")
117117
self.n_gen = datap["files_names"]["namefile_gen"]
118118
self.n_filemass = datap["files_names"]["histofilename"]
@@ -373,10 +373,15 @@ def dfuse(df_spec):
373373
dfappend(df_name, df)
374374

375375
for df_name, df_spec in self.df_read.items():
376-
if dfuse(df_spec) and not dfs[df_name].empty:
376+
if dfuse(df_spec):
377+
if dfs[df_name].empty:
378+
self.logger.warning("DF %s is empty", df_name)
379+
else:
380+
self.logger.info("DF %s is filled", df_name)
377381
if 'extra' in df_spec:
378-
self.logger.debug(' %s -> extra', df_name)
382+
self.logger.info(' %s -> extra', df_name)
379383
for col_name, col_val in df_spec['extra'].items():
384+
self.logger.info(' %s -> %s', col_name, col_val)
380385
dfs[df_name][col_name] = dfs[df_name].eval(col_val)
381386
if 'extract_component' in df_spec:
382387
self.logger.debug(' %s -> extract_component', df_name)

machine_learning_hep/processer_jet.py

Lines changed: 46 additions & 36 deletions
Original file line number | Diff line number | Diff line change
@@ -124,19 +124,7 @@ def _calculate_variables(self, df, verify=False): # pylint: disable=invalid-name
124124
self.logger.info('calculating variables')
125125
if len(df) == 0:
126126
return df
127-
df['dr'] = np.sqrt((df.fJetEta - df.fEta)**2 + ((df.fJetPhi - df.fPhi + math.pi) % math.tau - math.pi)**2)
128-
df['jetPx'] = df.fJetPt * np.cos(df.fJetPhi)
129-
df['jetPy'] = df.fJetPt * np.sin(df.fJetPhi)
130-
df['jetPz'] = df.fJetPt * np.sinh(df.fJetEta)
131-
df['hfPx'] = df.fPt * np.cos(df.fPhi)
132-
df['hfPy'] = df.fPt * np.sin(df.fPhi)
133-
df['hfPz'] = df.fPt * np.sinh(df.fEta)
134-
df['zpar_num'] = df.jetPx * df.hfPx + df.jetPy * df.hfPy + df.jetPz * df.hfPz
135-
df['zpar_den'] = df.jetPx * df.jetPx + df.jetPy * df.jetPy + df.jetPz * df.jetPz
136-
df['zpar'] = df.zpar_num / df.zpar_den
137-
df[df['zpar'] >= 1.]['zpar'] = .999 # move 1 to last bin
138127
df['nsub21'] = df.fNSub2 / df.fNSub1
139-
140128
self.logger.debug('zg')
141129
df['zg_array'] = np.array(.5 - abs(df.fPtSubLeading / (df.fPtLeading + df.fPtSubLeading) - .5))
142130
zcut = self.cfg('zcut', .1)
@@ -150,6 +138,20 @@ def _calculate_variables(self, df, verify=False): # pylint: disable=invalid-name
150138
(lambda ar: np.log(ar.fPtSubLeading * np.sin(ar.fTheta))), axis=1)
151139
df['lntheta'] = df['fTheta'].apply(lambda x: -np.log(x))
152140
# df['lntheta'] = np.array(-np.log(df.fTheta))
141+
142+
if self.cfg('hfjet', True):
143+
df['dr'] = np.sqrt((df.fJetEta - df.fEta)**2 + ((df.fJetPhi - df.fPhi + math.pi) % math.tau - math.pi)**2)
144+
df['jetPx'] = df.fJetPt * np.cos(df.fJetPhi)
145+
df['jetPy'] = df.fJetPt * np.sin(df.fJetPhi)
146+
df['jetPz'] = df.fJetPt * np.sinh(df.fJetEta)
147+
df['hfPx'] = df.fPt * np.cos(df.fPhi)
148+
df['hfPy'] = df.fPt * np.sin(df.fPhi)
149+
df['hfPz'] = df.fPt * np.sinh(df.fEta)
150+
df['zpar_num'] = df.jetPx * df.hfPx + df.jetPy * df.hfPy + df.jetPz * df.hfPz
151+
df['zpar_den'] = df.jetPx * df.jetPx + df.jetPy * df.jetPy + df.jetPz * df.jetPz
152+
df['zpar'] = df.zpar_num / df.zpar_den
153+
df[df['zpar'] >= 1.]['zpar'] = .999 # move 1 to last bin
154+
153155
self.logger.debug('done')
154156
if verify:
155157
self._verify_variables(df)
@@ -172,18 +174,20 @@ def process_histomass_single(self, index):
172174
dfevtorig = read_df(self.l_evtorig[index])
173175
histonorm = TH1F("histonorm", "histonorm", 4, 0, 4)
174176
histonorm.SetBinContent(1, len(dfquery(dfevtorig, self.s_evtsel)))
175-
dfcollcnt = read_df(self.l_collcnt[index])
176-
ser_collcnt = dfcollcnt[self.cfg(f'counter_read_{self.mcordata}')]
177-
collcnt_read = functools.reduce(lambda x,y: float(x)+float(y), (ar[0] for ar in ser_collcnt))
178-
ser_collcnt = dfcollcnt[self.cfg('counter_tvx')]
179-
collcnt_tvx = functools.reduce(lambda x,y: float(x)+float(y), (ar[0] for ar in ser_collcnt))
180-
dfbccnt = read_df(self.l_bccnt[index])
181-
ser_bccnt = dfbccnt[self.cfg('counter_tvx')]
182-
bccnt_tvx = functools.reduce(lambda x,y: float(x)+float(y), (ar[0] for ar in ser_bccnt))
183-
self.logger.info('sampled %g collisions', collcnt_read)
184-
histonorm.SetBinContent(2, collcnt_read)
185-
histonorm.SetBinContent(3, collcnt_tvx)
186-
histonorm.SetBinContent(4, bccnt_tvx)
177+
if self.l_collcnt:
178+
dfcollcnt = read_df(self.l_collcnt[index])
179+
ser_collcnt = dfcollcnt[self.cfg(f'counter_read_{self.mcordata}')]
180+
collcnt_read = functools.reduce(lambda x,y: float(x)+float(y), (ar[0] for ar in ser_collcnt))
181+
self.logger.info('sampled %g collisions', collcnt_read)
182+
histonorm.SetBinContent(2, collcnt_read)
183+
ser_collcnt = dfcollcnt[self.cfg('counter_tvx')]
184+
collcnt_tvx = functools.reduce(lambda x,y: float(x)+float(y), (ar[0] for ar in ser_collcnt))
185+
histonorm.SetBinContent(3, collcnt_tvx)
186+
if self.l_bccnt:
187+
dfbccnt = read_df(self.l_bccnt[index])
188+
ser_bccnt = dfbccnt[self.cfg('counter_tvx')]
189+
bccnt_tvx = functools.reduce(lambda x,y: float(x)+float(y), (ar[0] for ar in ser_bccnt))
190+
histonorm.SetBinContent(4, bccnt_tvx)
187191
get_axis(histonorm, 0).SetBinLabel(1, 'N_{evt}')
188192
get_axis(histonorm, 0).SetBinLabel(2, 'N_{coll}')
189193
get_axis(histonorm, 0).SetBinLabel(3, 'N_{coll}^{TVX}')
@@ -314,23 +318,26 @@ def process_efficiency_single(self, index):
314318
hist.SetName(hist.GetName() + '_frac')
315319

316320
with TFile.Open(self.l_histoeff[index], "recreate") as rfile:
317-
# TODO: avoid hard-coding values here (check if restriction is needed at all)
318-
cols = ['ismcprompt', 'ismcsignal', 'ismcfd', 'fPt', 'fEta', 'fPhi', 'fJetPt', 'fJetEta', 'fJetPhi',
319-
'fPtLeading', 'fPtSubLeading', 'fTheta', 'fNSub2DR', 'fNSub1', 'fNSub2']
321+
# # TODO: avoid hard-coding values here (check if restriction is needed at all)
322+
# cols = ['ismcprompt', 'ismcsignal', 'ismcfd', 'fPt', 'fEta', 'fPhi', 'fJetPt', 'fJetEta', 'fJetPhi',
323+
# 'fPtLeading', 'fPtSubLeading', 'fTheta', 'fNSub2DR', 'fNSub1', 'fNSub2']
320324

321325
# read generator level
322-
dfgen_orig = pd.concat(read_df(self.mptfiles_gensk[bin][index], columns=cols)
326+
dfgen_orig = pd.concat(read_df(self.mptfiles_gensk[bin][index])
323327
for bin in self.active_bins_skim)
324328
df = self._calculate_variables(dfgen_orig)
325329
df = df.rename(lambda name: name + '_gen', axis=1)
326-
dfgen = {'pr': df.loc[(df.ismcsignal_gen == 1) & (df.ismcprompt_gen == 1)],
327-
'np': df.loc[(df.ismcsignal_gen == 1) & (df.ismcfd_gen == 1)]}
330+
if self.cfg('hfjet', True):
331+
dfgen = {'pr': df.loc[(df.ismcsignal_gen == 1) & (df.ismcprompt_gen == 1)],
332+
'np': df.loc[(df.ismcsignal_gen == 1) & (df.ismcfd_gen == 1)]}
333+
else:
334+
dfgen = {'pr': df, 'np': df}
328335

329336
# read detector level
330-
cols.extend(self.cfg('efficiency.extra_cols', []))
331-
if idx := self.cfg('efficiency.index_match'):
332-
cols.append(idx)
333-
df = pd.concat(read_df(self.mptfiles_recosk[bin][index], columns=cols)
337+
# cols.extend(self.cfg('efficiency.extra_cols', []))
338+
# if idx := self.cfg('efficiency.index_match'):
339+
# cols.append(idx)
340+
df = pd.concat(read_df(self.mptfiles_recosk[bin][index])
334341
for bin in self.active_bins_skim)
335342

336343
# Custom skimming cuts
@@ -342,8 +349,11 @@ def process_efficiency_single(self, index):
342349
else:
343350
self.logger.warning('No matching criterion specified, cannot match det and gen')
344351
df = self._calculate_variables(df)
345-
dfdet = {'pr': df.loc[(df.ismcsignal == 1) & (df.ismcprompt == 1)],
346-
'np': df.loc[(df.ismcsignal == 1) & (df.ismcfd == 1)]}
352+
if self.cfg('hfjet', True):
353+
dfdet = {'pr': df.loc[(df.ismcsignal == 1) & (df.ismcprompt == 1)],
354+
'np': df.loc[(df.ismcsignal == 1) & (df.ismcfd == 1)]}
355+
else:
356+
dfdet = {'pr': df, 'np': df}
347357

348358
dfmatch = {cat: pd.merge(dfdet[cat], dfgen[cat], left_on=['df', 'idx_match'], right_index=True)
349359
for cat in cats if 'idx_match' in dfdet[cat]}

machine_learning_hep/utilities_files.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -131,6 +131,8 @@ def createlist(prefolder: str, mylistfolder: list[str], namefile: str):
131131
"""
132132
Appends base foldername + filename in list
133133
"""
134+
if not namefile:
135+
return []
134136
listfiles = appendfiletolist(mylistfolder, namefile)
135137
listfiles = appendmainfoldertolist(prefolder, listfiles)
136138
return listfiles

0 commit comments

Comments
 (0)