Skip to content

Commit 806f2a2

Browse files
benedikt-voelkel (Benedikt Volkel)
and co-authors authored
Analysis and efficiency update (#706)
* add possibility to take efficiencies from a chosen multiplicity bin of another analysis * start development of concise meta data object to not hide all information of pT, mutliplicity and ML working point in a file or histogram name (see MLHEPMetaInfo in root.py) * add an option to add missing keys in modify_dictionary in do_variations.py Co-authored-by: Benedikt Volkel <[email protected]>
1 parent 2d7f193 commit 806f2a2

File tree

6 files changed

+139
-12
lines changed

6 files changed

+139
-12
lines changed

machine_learning_hep/analysis/README.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,11 @@ In order to place additional cuts before a mass histogram is filled, those have
2121
2222
The cuts can then be accessed in `processer_<type>.process_histomass_single`. The database flag `use_cuts` is translated into the member `self.do_custom_analysis_cuts`, which should be checked to be `True` in order not to circumvent its purpose. Then, there is a helper function in `Processer`, so if you have a dataframe corresponding to a certain pT bin, you can just do
2323

24+
## Using efficiencies from another analysis
25+
26+
To use the efficiencies from another analysis for a certain multiplicity bin, one can use the fields `path_eff` and `mult_bin_eff` when using the analyzer class `AnalyzerDhadrons_mult`. When using this feature, both fields have to contain a list as long as the number of multiplicity bins. The entries of the first list are the paths of the efficiency files to be used, and the entries of the second list are integers referring to the i-th multiplicity bin histogram inside the corresponding file. `null` entries can be used to fall back to the efficiencies of this very analysis for that multiplicity bin (which is of course the default when neither of the lists is present).
27+
28+
2429
```python
2530
if self.do_custom_analysis_cuts:
2631
df = self.apply_cuts_ptbin(df, ipt)

machine_learning_hep/analysis/analyzerdhadrons_mult.py

Lines changed: 34 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,25 @@ def __init__(self, datap, case, typean, period):
177177
None)
178178
self.path_for_crossmb = datap["analysis"][self.typean].get("path_for_crossmb", None)
179179

180+
# Take efficiencies from another analysis.
181+
self.path_file_eff = datap["analysis"][self.typean].get("path_eff", None)
182+
self.mult_bin_eff = datap["analysis"][self.typean].get("mult_bin_eff", None)
183+
184+
if (self.path_file_eff and not self.mult_bin_eff) or \
185+
(not self.path_file_eff and self.mult_bin_eff):
186+
# That is incoherent
187+
self.logger.fatal("Either both or none of the lists \"path_eff\" and \"mult_bin_eff\"" \
188+
"must be specified")
189+
190+
if not self.path_file_eff:
191+
self.path_file_eff = [None] * self.p_nbin2
192+
self.mult_bin_eff = [None] * self.p_nbin2
193+
194+
if len(self.path_file_eff) != self.p_nbin2 or len(self.mult_bin_eff) != self.p_nbin2:
195+
self.logger.fatal("Efficiencies are requested to be taken from another analysis. " \
196+
"Make sure lists \"path_eff\" and \"mult_bin_eff\" have the same " \
197+
"length as the number of those bins (%i).", self.p_nbin2)
198+
180199
# Fitting
181200
self.fitter = None
182201
self.p_performval = datap["analysis"].get("event_cand_validation", None)
@@ -433,23 +452,32 @@ def makenormyields(self):
433452
self.loadstyle()
434453
#self.test_aliphysics()
435454
#filedataval = TFile.Open(self.f_evtnorm)
436-
fileouteff = "%s/efficiencies%s%s.root" % \
437-
(self.d_resultsallpmc, self.case, self.typean)
438455
yield_filename = self.make_file_path(self.d_resultsallpdata, self.yields_filename, "root",
439456
None, [self.case, self.typean])
440457
gROOT.LoadMacro("HFPtSpectrum.C")
441458
from ROOT import HFPtSpectrum, HFPtSpectrum2, HFPtSpectrumRescaled
442459
histonorm = TH1F("histonorm", "histonorm", self.p_nbin2, 0, self.p_nbin2)
443460
for imult in range(self.p_nbin2):
461+
# Choose where efficiencies to take from. Either this mult. bin, another mult. bin
462+
# in this analysis or another mult. bin from another analysis specified explicitly
463+
# by the user.
464+
fileouteff = "{self.d_resultsallpmc}/efficiencies{self.case}{self.typean}.root" \
465+
if not self.path_file_eff[imult] else self.path_file_eff[imult]
466+
if not os.path.exists(fileouteff):
467+
self.logger.fatal("Efficiency file %s could not be found", fileouteff)
444468
bineff = -1
445-
if self.p_bineff is None:
469+
if self.mult_bin_eff[imult] is not None:
470+
bineff = self.mult_bin_eff[imult]
471+
print(f"Use efficiency from bin {bineff} from file {fileouteff}")
472+
elif self.p_bineff is None:
446473
bineff = imult
447474
print("Using efficiency for each var2 bin")
448475
else:
449476
bineff = self.p_bineff
450-
print("Using efficiency always from bin=", bineff)
451-
namehistoeffprompt = "eff_mult%d" % bineff
452-
namehistoefffeed = "eff_fd_mult%d" % bineff
477+
print(f"Using efficiency always from bin={bineff}")
478+
479+
namehistoeffprompt = f"eff_mult{bineff}"
480+
namehistoefffeed = f"eff_fd_mult{bineff}"
453481
nameyield = "hyields%d" % imult
454482
fileoutcrossmult = "%s/finalcross%s%smult%d.root" % \
455483
(self.d_resultsallpdata, self.case, self.typean, imult)

machine_learning_hep/config.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ def update_config(database: dict, run_config: dict, database_overwrite=None): #
5050
# To be implemented
5151
if database_overwrite:
5252
logger.info("Updating database fields with custom user input")
53-
modify_dictionary(database, database_overwrite)
53+
modify_dictionary(database, database_overwrite, True)
5454

5555
# If not an ML analysis...
5656
if not database["doml"]:

machine_learning_hep/do_variations.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -157,14 +157,19 @@ def format_varlabel(varlabel: list, index: int, n_var: int):
157157
'''Format the label of a variation in a variation group.'''
158158
return "%s: %d" % (varlabel[0], index) if len(varlabel) != n_var else varlabel[index]
159159

160-
def modify_dictionary(dic: dict, diff: dict):
161-
'''Modify the dic dictionary using the diff dictionary.'''
160+
def modify_dictionary(dic: dict, diff: dict, add_not_present=False):
161+
'''Modify the dic dictionary using the diff dictionary.
162+
163+
Add additional keys if add_not_present is True
164+
'''
162165
for key, value in diff.items():
163-
if key in dic: # Do not add keys that are not already in the original dictionary.
166+
if key in dic:
164167
if isinstance(value, dict):
165-
modify_dictionary(dic[key], value)
168+
modify_dictionary(dic[key], value, add_not_present)
166169
else:
167170
dic[key] = format_value(dic[key], value)
171+
elif add_not_present:
172+
dic[key] = value
168173
else:
169174
msg_warn("Key %s was not found and will be ignored." % key)
170175

machine_learning_hep/processerdhadrons_mult.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
seldf_singlevar_inclusive, openfile
2929
from machine_learning_hep.utilities import mergerootfiles
3030
from machine_learning_hep.utilities import get_timestamp_string
31+
from machine_learning_hep.root import create_meta_info, write_meta_info
3132
#from machine_learning_hep.globalfitter import fitter
3233
from machine_learning_hep.processer import Processer
3334
from machine_learning_hep.bitwise import filter_bit_df, tag_bit_df
@@ -237,6 +238,12 @@ def process_histomass_single(self, index):
237238
(self.v_var_binning, self.lpt_finbinmin[ipt],
238239
self.lpt_finbinmax[ipt], self.lpt_probcutfin[bin_id],
239240
self.v_var2_binning, self.lvar2_binmin[ibin2], self.lvar2_binmax[ibin2])
241+
curr_dir = myfile.mkdir(f"bin1_{ipt}_bin2_{ibin2}")
242+
meta_info = create_meta_info(self.v_var_binning, self.lpt_finbinmin[ipt],
243+
self.lpt_finbinmax[ipt], self.v_var2_binning,
244+
self.lvar2_binmin[ibin2], self.lvar2_binmax[ibin2],
245+
self.lpt_probcutfin[bin_id])
246+
write_meta_info(curr_dir, meta_info)
240247
h_invmass = TH1F("hmass" + suffix, "", self.p_num_bins,
241248
self.p_mass_fit_lim[0], self.p_mass_fit_lim[1])
242249
h_invmass_weight = TH1F("h_invmass_weight" + suffix, "", self.p_num_bins,

machine_learning_hep/root.py

Lines changed: 83 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,92 @@
1919
import array
2020
import ast
2121
import numpy as np
22-
from ROOT import TNtuple, TFile # pylint: disable=import-error,no-name-in-module
22+
from ROOT import gROOT, TNtuple, TFile # pylint: disable=import-error,no-name-in-module
2323
from machine_learning_hep.logger import get_logger
2424

2525

26+
META_INFO = "struct MLHEPMetaInfo { \
27+
Float_t firstLow; \
28+
Float_t firstUp; \
29+
Float_t secondLow; \
30+
Float_t secondUp; \
31+
Float_t MLWorkingPoint; \
32+
std::string firstBinName; \
33+
std::string secondBinName; \
34+
};"
35+
gROOT.ProcessLine(META_INFO)
36+
37+
META_INFO_NAME = "MLHEPMetaInfo"
38+
39+
from ROOT import MLHEPMetaInfo # pylint: disable=wrong-import-position, import-error, no-name-in-module, ungrouped-imports
40+
41+
def create_meta_info(first_name, first_low, first_up, second_name, second_low, second_up, ml_wp):
42+
"""Fill MLHEPMetaInfo struct
43+
44+
Custom MLHEP ROOT struct to store meta info
45+
46+
Args:
47+
first_name: str
48+
name of first binning variable
49+
first_low: float
50+
low bin value of first variable
51+
first_up: float
52+
up bin value of first variable
53+
second_name: str
54+
name of second binning variable
55+
second_low: float
56+
low bin value of second variable
57+
second_up: float
58+
up bin value of second variable
59+
ml_wp: float
60+
ML working point used to cut
61+
62+
Returns:
63+
MLHEPMetaInfo
64+
"""
65+
66+
meta_info = MLHEPMetaInfo()
67+
meta_info.firstBinName = first_name
68+
meta_info.firstLow = first_low
69+
meta_info.firstUp = first_up
70+
meta_info.secondBinName = second_name
71+
meta_info.secondLow = second_low
72+
meta_info.secondUp = second_up
73+
meta_info.MLWorkingPoint = ml_wp
74+
return meta_info
75+
76+
77+
def write_meta_info(root_dir, meta_info):
78+
"""Write MLHEPMetaInfo to ROOT directory
79+
80+
Args:
81+
root_dir: inheriting from TDirectory
82+
ROOT directory where to write
83+
meta_info: MLHEPMetaInfo
84+
the meta info to be written
85+
"""
86+
root_dir.WriteObject(meta_info, META_INFO_NAME)
87+
88+
89+
def read_meta_info(root_dir, fail_not_found=True):
90+
"""Read MLHEPMetaInfo
91+
92+
Args:
93+
root_dir: inheriting from TDirectory
94+
ROOT directory where to read from
95+
fail_not_found: bool
96+
if True fail if not found
97+
Returns:
98+
MLHEPMetaInfo
99+
"""
100+
101+
102+
meta_info = root_dir.Get(META_INFO_NAME)
103+
if not meta_info and fail_not_found:
104+
get_logger().fatal("Cannot find %s in directory %s", META_INFO_NAME, root_dir.GetName())
105+
return meta_info
106+
107+
26108
def read_ntuple(ntuple, variables):
27109
"""
28110
Return a numpy array with the values from TNtuple.

0 commit comments

Comments
 (0)